From: Teemu Murtola <teemu.murtola@gmail.com>
Date: Sun, 24 Jun 2012 17:00:12 +0000 (-0400)
Subject: Merge remote-tracking branch 'gerrit/release-4-6'
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=79e6e1a447486bd1a1c0b133843ed829827aeef4;p=alexxy%2Fgromacs.git

Merge remote-tracking branch 'gerrit/release-4-6'

Moved files added to include/ to src/gromacs/legacyheaders/ and
files added to src/gmxlib/ to src/gromacs/gmxlib/.

Conflicts:
    CMakeLists.txt
    src/config.h.cmakein

Conflicts for files removed from master:
    cmake/gmxTestInlineASM.cmake (resurrected)
    cmake/TestInlineASM_gcc_x86.c (no conflict, resurrected)
    src/gmxlib/CMakeLists.txt
        (changes applied to src/gromacs/CMakeLists.txt and
        src/gromacs/gmxlib/CMakeLists.txt)

Conflicts for files removed from release-4-6:
    src/gromacs/gmxlib/nonbonded/nb_kernel_ia32_sse/*
    src/gromacs/gmxlib/nonbonded/nb_kernel_ia32_sse2/*
    src/gromacs/gmxlib/nonbonded/nb_kernel_ia64_double/*
    src/gromacs/gmxlib/nonbonded/nb_kernel_ia64_single/*
    src/gromacs/gmxlib/nonbonded/nb_kernel_ppc_altivec/*
    src/gromacs/gmxlib/nonbonded/nb_kernel_x86_64_sse/*
    src/gromacs/gmxlib/nonbonded/nb_kernel_x86_64_sse2/*
    src/gromacs/legacyheaders/gmx_sse2_double.h
    src/gromacs/legacyheaders/gmx_sse2_single.h

Change-Id: I6202a89a802a8fd3e1788df95eac408cf60f3456
---

79e6e1a447486bd1a1c0b133843ed829827aeef4
diff --cc CMakeLists.txt
index d0e7869bb2,f1ed2488e2..ce7ee32f9c
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@@ -482,14 -513,19 +503,21 @@@ find_package(Doxygen
  ########################################################################
  
  add_definitions( -DHAVE_CONFIG_H )
 +include_directories(${CMAKE_SOURCE_DIR}/src)
 +# Required for config.h, maybe should only be set in src/CMakeLists.txt
  include_directories(${CMAKE_BINARY_DIR}/src)
 -include_directories(${CMAKE_BINARY_DIR}/include)
 -include_directories(${CMAKE_SOURCE_DIR}/include)
 +# Required for now to make old code compile
 +include_directories(${CMAKE_SOURCE_DIR}/src/gromacs/legacyheaders)
  
- include(gmxCheckBuildUserTime)
- gmx_check_build_user_time(BUILD_TIME BUILD_USER BUILD_MACHINE)
+ include(gmxTestInlineASM)
+ gmx_test_inline_asm_gcc_x86(GMX_X86_GCC_INLINE_ASM)
+ 
+ include(gmxSetBuildInformation)
+ gmx_set_build_information()
+ if(BUILD_CPU_FEATURES MATCHES "rdtscp")
+     # The time-stamp counter (cycle counter) headers do not include config.h
+     add_definitions(-DHAVE_RDTSCP)
+ endif(BUILD_CPU_FEATURES MATCHES "rdtscp")
  
  include(gmxTestFloatFormat)
  gmx_test_float_format(GMX_FLOAT_FORMAT_IEEE754 
@@@ -514,22 -553,6 +542,10 @@@ gmx_test_isfinite(HAVE_ISFINITE
  gmx_test__isfinite(HAVE__ISFINITE)
  gmx_test__finite(HAVE__FINITE)
  
 +include(gmxTestCXX11)
 +gmx_test_cxx11(GMX_CXX11 CXX11_FLAG)
 +set(GROMACS_CXX_FLAGS "${CXX11_FLAG} ${GROMACS_CXX_FLAGS}")
 +
- # turn on SSE if supported with reasonable defaults.
- if (${GMX_ACCELERATION} STREQUAL "auto" AND NOT GMX_OPENMM)
-   if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(i.86|x86|x64|x86_64|AMD64|amd64)" OR CYGWIN)
- 
-     set(GMX_ACCELERATION "SSE" CACHE STRING "Accelerated kernels. Pick one of: auto, none, SSE, BlueGene, Power6, ia64, altivec, fortran" FORCE)
-     
-     # Determine the assembler/compiler to use
-   else()
-     set(GMX_ACCELERATION "none" CACHE STRING "Accelerated kernels. Pick one of: auto, none, SSE, BlueGene, Power6, ia64, altivec, fortran" FORCE)
-   endif()
- endif (${GMX_ACCELERATION} STREQUAL "auto" AND NOT GMX_OPENMM)
- 
  include(gmxTestXDR)
  gmx_test_xdr(GMX_SYSTEM_XDR)
  if(NOT GMX_SYSTEM_XDR)
diff --cc cmake/gmxDetectAcceleration.cmake
index 0000000000,dbe18b2428..9f64b5fe70
mode 000000,100644..100644
--- a/cmake/gmxDetectAcceleration.cmake
+++ b/cmake/gmxDetectAcceleration.cmake
@@@ -1,0 -1,50 +1,50 @@@
+ # - Detect the build CPU and suggest a matching acceleration option
+ #
+ # GMX_DETECT_ACCELERATION(GMX_SUGGESTED_ACCELERATION)
+ #
+ # Try to detect CPU information and suggest an acceleration option
+ # (such as SSE/AVX) that fits the current CPU.
+ #
+ # GMX_SUGGESTED_ACCELERATION
+ #
+ 
+ # we rely on inline asm support for GNU!
+ include(gmxTestInlineASM)
+ 
+ macro(gmx_detect_acceleration GMX_SUGGESTED_ACCELERATION)
+     IF(NOT DEFINED ${GMX_SUGGESTED_ACCELERATION})
+ 
+     gmx_test_inline_asm_gcc_x86(GMX_X86_GCC_INLINE_ASM)
+ 
+     if(GMX_X86_GCC_INLINE_ASM)
+         set(GCC_INLINE_ASM_DEFINE "-DGMX_X86_GCC_INLINE_ASM")
+     else(GMX_X86_GCC_INLINE_ASM)
+         set(GCC_INLINE_ASM_DEFINE "")
+     endif(GMX_X86_GCC_INLINE_ASM)
+ 
+     message(STATUS "Detecting best acceleration for this CPU")
+ 
+     # Get CPU acceleration information
+     try_run(GMX_DETECTCPU_RUN_ACC GMX_DETECTCPU_COMPILED
+             ${CMAKE_BINARY_DIR}
 -            ${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_detectcpu.c
 -            COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/include -DGMX_DETECTCPU_STANDALONE"
++            ${CMAKE_SOURCE_DIR}/src/gromacs/gmxlib/gmx_detectcpu.c
++            COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/src/gromacs/legacyheaders/ -DGMX_DETECTCPU_STANDALONE"
+             RUN_OUTPUT_VARIABLE OUTPUT_TMP
+             COMPILE_OUTPUT_VARIABLE GMX_DETECTCPU_COMPILE_OUTPUT 
+             ARGS "-acceleration")
+ 
+     if(NOT GMX_DETECTCPU_COMPILED)
+         message(WARNING "Cannot compile CPU detection code, which means no optimization.")
+         message(STATUS "Compile output: ${GMX_DETECTCPU_COMPILE_OUTPUT}")
+         set(OUTPUT_TMP "None")
+     endif(NOT GMX_DETECTCPU_COMPILED)
+ 
+     string(STRIP "@OUTPUT_TMP@" OUTPUT_ACC)
+ 
+     message(STATUS "Detecting best acceleration for this CPU - @OUTPUT_ACC@")
+ 
+     set(${GMX_SUGGESTED_ACCELERATION}    "@OUTPUT_ACC@" CACHE INTERNAL "Gromacs CPU Acceleration")
+ 
+     ENDIF(NOT DEFINED ${GMX_SUGGESTED_ACCELERATION})
+ endmacro(gmx_detect_acceleration GMX_SUGGESTED_ACCELERATION)
+ 
diff --cc cmake/gmxSetBuildInformation.cmake
index 0000000000,36c62f556b..b05f9bc442
mode 000000,100644..100644
--- a/cmake/gmxSetBuildInformation.cmake
+++ b/cmake/gmxSetBuildInformation.cmake
@@@ -1,0 -1,108 +1,108 @@@
+ 
+ # - Record the build user, host, date/time and CPU information
+ #
+ # gmx_set_build_information()
+ #
+ # The macro variables will be set to the user/host/cpu used for configuration,
+ # or anonymous/unknown if it cannot be detected (windows)
+ #
+ # BUILD_TIME
+ # BUILD_USER
+ # BUILD_HOST
+ # BUILD_CPU_VENDOR
+ # BUILD_CPU_BRAND
+ # BUILD_CPU_FAMILY
+ # BUILD_CPU_MODEL
+ # BUILD_CPU_STEPPING
+ # BUILD_CPU_FEATURES
+ #
+ 
+ # we rely on inline asm support for GNU!
+ include(gmxTestInlineASM)
+ 
+ macro(gmx_set_build_information)
+     IF(NOT DEFINED BUILD_USER)
+ 
+     gmx_test_inline_asm_gcc_x86(GMX_X86_GCC_INLINE_ASM)
+ 
+     if(GMX_X86_GCC_INLINE_ASM)
+         set(GCC_INLINE_ASM_DEFINE "-DGMX_X86_GCC_INLINE_ASM")
+     else(GMX_X86_GCC_INLINE_ASM)
+         set(GCC_INLINE_ASM_DEFINE "")
+     endif(GMX_X86_GCC_INLINE_ASM)
+ 
+     message(STATUS "Setting build user/date/host/cpu information")
+     if(CMAKE_HOST_UNIX)
+         execute_process( COMMAND date     OUTPUT_VARIABLE TMP_TIME    OUTPUT_STRIP_TRAILING_WHITESPACE)
+         execute_process( COMMAND whoami   OUTPUT_VARIABLE TMP_USER       OUTPUT_STRIP_TRAILING_WHITESPACE)
+         execute_process( COMMAND hostname OUTPUT_VARIABLE TMP_HOSTNAME   OUTPUT_STRIP_TRAILING_WHITESPACE)
+         set(BUILD_USER    "@TMP_USER@\@@TMP_HOSTNAME@ [CMAKE]" CACHE INTERNAL "Build user")
+         set(BUILD_TIME    "@TMP_TIME@" CACHE INTERNAL "Build date & time")
+         execute_process( COMMAND uname -srm OUTPUT_VARIABLE TMP_HOST OUTPUT_STRIP_TRAILING_WHITESPACE)
+         set(BUILD_HOST    "@TMP_HOST@" CACHE INTERNAL "Build host & architecture")
+         message(STATUS "Setting build user & time - OK")
+     else(CMAKE_HOST_UNIX)
+         set(BUILD_USER    "Anonymous@unknown [CMAKE]" CACHE INTERNAL "Build user")
+         set(BUILD_TIME    "Unknown date" CACHE INTERNAL "Build date & time")
+         set(BUILD_HOST    "@CMAKE_HOST_SYSTEM@ @CMAKE_HOST_SYSTEM_PROCESSOR@" CACHE INTERNAL "Build host & architecture")
+         message(STATUS "Setting build user & time - not on Unix, using anonymous")
+     endif(CMAKE_HOST_UNIX)
+ 
+     if(NOT CMAKE_CROSSCOMPILING)
+         # Get CPU acceleration information
+         try_run(GMX_DETECTCPU_RUN_VENDOR GMX_DETECTCPU_COMPILED
+             ${CMAKE_BINARY_DIR}
 -            ${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_detectcpu.c
 -            COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/include -DGMX_DETECTCPU_STANDALONE"
++            ${CMAKE_SOURCE_DIR}/src/gromacs/gmxlib/gmx_detectcpu.c
++            COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/src/gromacs/legacyheaders/ -DGMX_DETECTCPU_STANDALONE"
+             RUN_OUTPUT_VARIABLE OUTPUT_CPU_VENDOR ARGS "-vendor")
+         try_run(GMX_DETECTCPU_RUN_BRAND GMX_DETECTCPU_COMPILED
+             ${CMAKE_BINARY_DIR}
 -            ${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_detectcpu.c
 -            COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/include -DGMX_DETECTCPU_STANDALONE"
++            ${CMAKE_SOURCE_DIR}/src/gromacs/gmxlib/gmx_detectcpu.c
++            COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/src/gromacs/legacyheaders/ -DGMX_DETECTCPU_STANDALONE"
+             RUN_OUTPUT_VARIABLE OUTPUT_CPU_BRAND ARGS "-brand")
+         try_run(GMX_DETECTCPU_RUN_FAMILY GMX_DETECTCPU_COMPILED
+             ${CMAKE_BINARY_DIR}
 -            ${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_detectcpu.c
 -            COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/include -DGMX_DETECTCPU_STANDALONE"
++            ${CMAKE_SOURCE_DIR}/src/gromacs/gmxlib/gmx_detectcpu.c
++            COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/src/gromacs/legacyheaders/ -DGMX_DETECTCPU_STANDALONE"
+             RUN_OUTPUT_VARIABLE OUTPUT_CPU_FAMILY ARGS "-family")
+         try_run(GMX_DETECTCPU_RUN_MODEL GMX_DETECTCPU_COMPILED
+             ${CMAKE_BINARY_DIR}
 -            ${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_detectcpu.c
 -            COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/include -DGMX_DETECTCPU_STANDALONE"
++            ${CMAKE_SOURCE_DIR}/src/gromacs/gmxlib/gmx_detectcpu.c
++            COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/src/gromacs/legacyheaders/ -DGMX_DETECTCPU_STANDALONE"
+             RUN_OUTPUT_VARIABLE OUTPUT_CPU_MODEL ARGS "-model")
+        try_run(GMX_DETECTCPU_RUN_STEPPING GMX_DETECTCPU_COMPILED
+             ${CMAKE_BINARY_DIR}
 -            ${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_detectcpu.c
 -            COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/include -DGMX_DETECTCPU_STANDALONE"
++            ${CMAKE_SOURCE_DIR}/src/gromacs/gmxlib/gmx_detectcpu.c
++            COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/src/gromacs/legacyheaders/ -DGMX_DETECTCPU_STANDALONE"
+             RUN_OUTPUT_VARIABLE OUTPUT_CPU_STEPPING ARGS "-stepping")
+         try_run(GMX_DETECTCPU_RUN_FEATURES GMX_DETECTCPU_COMPILED
+             ${CMAKE_BINARY_DIR}
 -            ${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_detectcpu.c
 -            COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/include -DGMX_DETECTCPU_STANDALONE"
++            ${CMAKE_SOURCE_DIR}/src/gromacs/gmxlib/gmx_detectcpu.c
++            COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/src/gromacs/legacyheaders/ -DGMX_DETECTCPU_STANDALONE"
+             RUN_OUTPUT_VARIABLE OUTPUT_CPU_FEATURES ARGS "-features")
+ 
+         string(STRIP "@OUTPUT_CPU_VENDOR@" OUTPUT_CPU_VENDOR)
+         string(STRIP "@OUTPUT_CPU_BRAND@" OUTPUT_CPU_BRAND)
+         string(STRIP "@OUTPUT_CPU_FEATURES@" OUTPUT_CPU_FEATURES)
+ 
+         set(BUILD_CPU_VENDOR   "@OUTPUT_CPU_VENDOR@"   CACHE INTERNAL "Build CPU vendor")
+         set(BUILD_CPU_BRAND    "@OUTPUT_CPU_BRAND@"    CACHE INTERNAL "Build CPU brand")
+         set(BUILD_CPU_FAMILY   "@OUTPUT_CPU_FAMILY@"   CACHE INTERNAL "Build CPU family")
+         set(BUILD_CPU_MODEL    "@OUTPUT_CPU_MODEL@"    CACHE INTERNAL "Build CPU model")
+         set(BUILD_CPU_STEPPING "@OUTPUT_CPU_STEPPING@" CACHE INTERNAL "Build CPU stepping")
+         set(BUILD_CPU_FEATURES "@OUTPUT_CPU_FEATURES@" CACHE INTERNAL "Build CPU features")
+ 
+     else(NOT CMAKE_CROSSCOMPILING)
+         
+         set(BUILD_CPU_VENDOR   "Unknown, cross-compiled"   CACHE INTERNAL "Build CPU vendor")
+         set(BUILD_CPU_BRAND    "Unknown, cross-compiled"    CACHE INTERNAL "Build CPU brand")
+         set(BUILD_CPU_FAMILY   "0"   CACHE INTERNAL "Build CPU family")
+         set(BUILD_CPU_MODEL    "0"    CACHE INTERNAL "Build CPU model")
+         set(BUILD_CPU_STEPPING "0" CACHE INTERNAL "Build CPU stepping")
+         set(BUILD_CPU_FEATURES "" CACHE INTERNAL "Build CPU features")
+ 
+     endif(NOT CMAKE_CROSSCOMPILING)
+ 
+     ENDIF(NOT DEFINED BUILD_USER)
+ endmacro(gmx_set_build_information)
+ 
diff --cc src/config.h.cmakein
index 54e89b3412,0472b8412d..8fb7c625a2
--- a/src/config.h.cmakein
+++ b/src/config.h.cmakein
@@@ -20,30 -20,39 +20,48 @@@
  #cmakedefine USE_VERSION_H
  
  /* Default location of data files */
- #cmakedefine GMXLIBDIR "@GMXLIBDIR@"
+ #define GMXLIBDIR "@GMXLIBDIR@"
  
  /* Hardware and OS version for build host */
- #cmakedefine BUILD_MACHINE "@BUILD_MACHINE@"
+ #define BUILD_HOST "@BUILD_HOST@"
+ 
+ /* CPU information for build host */
+ #define BUILD_CPU_VENDOR "@BUILD_CPU_VENDOR@"
+ 
+ #define BUILD_CPU_BRAND "@BUILD_CPU_BRAND@"
+ 
+ #define BUILD_CPU_FAMILY @BUILD_CPU_FAMILY@
+ 
+ #define BUILD_CPU_MODEL @BUILD_CPU_MODEL@
+ 
+ #define BUILD_CPU_STEPPING @BUILD_CPU_STEPPING@
+ 
+ #define BUILD_CPU_FEATURES "@BUILD_CPU_FEATURES@"
+ 
+ /* Compiler and CFLAGS from build */
+ #define BUILD_COMPILER "@BUILD_COMPILER@"
+ 
+ #define BUILD_CFLAGS   "@BUILD_CFLAGS@"
  
  /* Date and time for build */
- #cmakedefine BUILD_TIME "@BUILD_TIME@"
+ #define BUILD_TIME "@BUILD_TIME@"
  
  /* User doing build */
- #cmakedefine BUILD_USER "@BUILD_USER@"
+ #define BUILD_USER "@BUILD_USER@"
  
 +/* Binary suffix for the created binaries */
 +#define GMX_BINARY_SUFFIX "@GMX_BINARY_SUFFIX@"
 +
 +/* Source directory for the build */
 +#cmakedefine CMAKE_SOURCE_DIR "@CMAKE_SOURCE_DIR@"
 +
 +/* Binary directory for the build */
 +#cmakedefine CMAKE_BINARY_DIR "@CMAKE_BINARY_DIR@"
 +
- /* Turn off water-water neighborlist optimization only */
+ /* Turn off water-water neighborlist optimization only - not used right now */
  #cmakedefine DISABLE_WATERWATER_NLIST
  
- /* Turn off all water neighborlist optimization */
+ /* Turn off all water neighborlist optimization - not used right now */
  #cmakedefine DISABLE_WATER_NLIST
  
  /* Fortran support */
@@@ -170,27 -185,18 +191,9 @@@
  /* Define for sysconf() */
  #cmakedefine HAVE_SYSCONF
  
- /* Single-precision SSE instructions on X86_64 */
- #cmakedefine GMX_X86_64_SSE
- 
- /* Double-precision SSE2 instructions on X86_64 */
- #cmakedefine GMX_X86_64_SSE2
- 
- /* Support for SSE intrinsics */
- #cmakedefine GMX_SSE
- 
- /* Support for SSE2 intrinsics */
- #cmakedefine GMX_SSE2
- 
- /* Support for SSE3 intrinsics */
- #cmakedefine GMX_SSE3
- 
- /* Support for SSE4.1 intrinsics */
- #cmakedefine GMX_SSE4_1
 -/* Define for GetSystemInfo() */
 -#cmakedefine HAVE_SYSTEM_INFO
--
- /* Define to 1 if you have the <altivec.h> header file. */
- #cmakedefine HAVE_ALTIVEC_H
+ /* Enable x86 gcc inline assembly */
+ #cmakedefine GMX_X86_GCC_INLINE_ASM
  
 -/* Define to 1 if the system has the type gmx_bool. */
 -#cmakedefine HAVE_BOOL
 -
  /* Define to 1 if fseeko (and presumably ftello) exists and is declared. */
  #cmakedefine HAVE_FSEEKO
  
@@@ -269,6 -326,18 +272,9 @@@
  /* Define to 1 if you have the <sys/time.h> header file. */
  #cmakedefine HAVE_SYS_TIME_H
  
 -/* Define to 1 if you have the <rpc/rpc.h> header file. */
 -#cmakedefine HAVE_RPC_RPC_H
 -
 -/* Define to 1 if you have the <rpc/xdr.h> header file. */
 -#cmakedefine HAVE_RPC_XDR_H
 -
+ /* Define to 1 if you have the <x86intrin.h> header file */
+ #cmakedefine HAVE_X86INTRIN_H
+ 
  /* Define for sched.h (this is for thread_mpi)*/
  #define HAVE_SCHED_H
  
diff --cc src/gromacs/CMakeLists.txt
index 9951d6d083,0000000000..2f473f7f41
mode 100644,000000..100644
--- a/src/gromacs/CMakeLists.txt
+++ b/src/gromacs/CMakeLists.txt
@@@ -1,88 -1,0 +1,61 @@@
 +set(LIBGROMACS_SOURCES)
 +
 +add_subdirectory(legacyheaders)
 +add_subdirectory(gmxlib)
 +add_subdirectory(mdlib)
 +add_subdirectory(gmxpreprocess)
 +add_subdirectory(analysisdata)
 +add_subdirectory(commandline)
 +add_subdirectory(linearalgebra)
 +add_subdirectory(onlinehelp)
 +add_subdirectory(options)
 +add_subdirectory(selection)
 +add_subdirectory(trajectoryanalysis)
 +add_subdirectory(utility)
 +
 +file(GLOB LIBGROMACS_HEADERS *.h)
 +install(FILES ${LIBGROMACS_HEADERS} DESTINATION ${INCL_INSTALL_DIR}/gromacs
 +        COMPONENT development)
 +
- # only fiddle with assembly kernels if we're not doing OpenMM build
- if(GMX_SSEKERNEL_ASM_SRC AND NOT GMX_OPENMM)
- if(GMX_ASM_USEASM_NASM)
-   enable_language(ASM_NASM)
-   # if NASM is used, we need a special build command for windows...
-   FOREACH(SRC ${GMX_SSEKERNEL_ASM_SRC})
-     GET_FILENAME_COMPONENT(FILE_BASE ${SRC} NAME_WE)
-     SET(OBJ ${CMAKE_CURRENT_BINARY_DIR}/${FILE_BASE}${CMAKE_C_OUTPUT_EXTENSION})
- 
-     ADD_CUSTOM_COMMAND(OUTPUT ${OBJ}
-                        MAIN_DEPENDENCY ${SRC}
-                        COMMAND ${CMAKE_ASM_NASM_COMPILER} -f ${CMAKE_ASM_NASM_OBJECT_FORMAT} -o ${OBJ} ${SRC})
- 
-     SET(ALL_ASM_OBJS ${ALL_ASM_OBJS} ${OBJ})
-   ENDFOREACH(SRC ${GMX_SSEKERNEL_ASM_SRC})
-   set(GMX_SSEKERNEL_ASM_SRC ${ALL_ASM_OBJS})
- else(GMX_ASM_USEASM_NASM)
-   enable_language(ASM-ATT)
-   SET(CMAKE_ASM-ATT_COMPILER ${CMAKE_C_COMPILER})
-   if(GMX_IA32_ASM)
-     set_source_files_properties(${GMX_SSEKERNEL_ASM_SRC} PROPERTIES COMPILE_FLAGS "-c -m32")
-   else()
-     set_source_files_properties(${GMX_SSEKERNEL_ASM_SRC} PROPERTIES COMPILE_FLAGS "-c -m64")
-   endif()
- endif(GMX_ASM_USEASM_NASM)
- endif(GMX_SSEKERNEL_ASM_SRC AND NOT GMX_OPENMM)
- 
- list(APPEND LIBGROMACS_SOURCES ${GMXLIB_SOURCES} ${GMX_SSEKERNEL_ASM_SRC} ${MDLIB_SOURCES})
++list(APPEND LIBGROMACS_SOURCES ${GMXLIB_SOURCES} ${MDLIB_SOURCES})
 +
 +# Regenerate version.c on every build; only done when version info is generated.
 +# Runs from this directory: the pre-move src/gmxlib path no longer holds these sources.
 +if (USE_VERSION_H)
 +    add_custom_target(gmx_version ALL
 +            COMMAND ${CMAKE_COMMAND}
 +                -D GIT_EXECUTABLE="${GIT_EXECUTABLE}"
 +                -D GIT_VERSION="${GIT_VERSION}"
 +                -D PROJECT_VERSION="${PROJECT_VERSION}"
 +                -D PROJECT_SOURCE_DIR="${PROJECT_SOURCE_DIR}"
 +                -D VERSION_C_CMAKEIN="${CMAKE_CURRENT_SOURCE_DIR}/version.c.cmakein"
 +                -D VERSION_C_OUT="${CMAKE_CURRENT_BINARY_DIR}/version.c"
 +                -P ${CMAKE_SOURCE_DIR}/cmake/gmxGenerateVersionInfo.cmake
 +            WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
 +            DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/version.c.cmakein
 +            COMMENT "Generating version information")
 +    list(APPEND LIBGROMACS_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/version.c) # auto-generated
 +    set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/version.c
 +                                PROPERTIES GENERATED true)
 +endif (USE_VERSION_H)
 +
 +add_library(libgromacs ${LIBGROMACS_SOURCES})
 +if (USE_VERSION_H)
 +    add_dependencies(libgromacs gmx_version)
 +endif (USE_VERSION_H)
 +target_link_libraries(libgromacs
 +                      ${GMX_EXTRA_LIBRARIES} ${FFT_LIBRARIES} ${XML_LIBRARIES}
 +                      ${THREAD_LIB})
 +set_target_properties(libgromacs PROPERTIES
 +                      OUTPUT_NAME "gromacs${GMX_LIBS_SUFFIX}"
 +                      SOVERSION ${SOVERSION}
 +                      INSTALL_NAME_DIR "${LIB_INSTALL_DIR}")
 +
 +install(TARGETS libgromacs DESTINATION ${LIB_INSTALL_DIR} COMPONENT libraries)
 +
 +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/libgromacs.pc.cmakein
 +               ${CMAKE_CURRENT_BINARY_DIR}/libgromacs.pc @ONLY)
 +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libgromacs.pc
 +        DESTINATION ${LIB_INSTALL_DIR}/pkgconfig
 +        RENAME "libgromacs${GMX_LIBS_SUFFIX}.pc"
 +        COMPONENT development)
diff --cc src/gromacs/gmxlib/CMakeLists.txt
index c84de7ca5e,0000000000..1ef1466914
mode 100644,000000..100644
--- a/src/gromacs/gmxlib/CMakeLists.txt
+++ b/src/gromacs/gmxlib/CMakeLists.txt
@@@ -1,80 -1,0 +1,33 @@@
 +include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 +
++add_subdirectory(nonbonded)
++
 +# The nonbonded directory contains subdirectories that are only
 +# conditionally built, so we cannot use a GLOB_RECURSE here.
- file(GLOB GMXLIB_SOURCES *.c *.cpp
-      statistics/*.c nonbonded/*.c nonbonded/nb_kernel_c/*.c
-      nonbonded/nb_kernel_adress_c/*.c)
- 
- if(GMX_DOUBLE)
-   set(SSETYPE sse2)
- else()
-   set(SSETYPE sse)
- endif()
- 
- if(GMX_IA32_ASM)
-   file(GLOB GMX_SSEKERNEL_C_SRC   nonbonded/nb_kernel_ia32_${SSETYPE}/*.c)
-   if(GMX_ASM_USEASM_NASM)
-     file(GLOB GMX_SSEKERNEL_ASM_SRC nonbonded/nb_kernel_ia32_${SSETYPE}/*intel_syntax*.s)    
-   else()
-     file(GLOB GMX_SSEKERNEL_ASM_SRC nonbonded/nb_kernel_ia32_${SSETYPE}/*${SSETYPE}.s nonbonded/nb_kernel_ia32_${SSETYPE}/*asm.s)
-   endif()
- endif(GMX_IA32_ASM)
- 
- if(GMX_X86_64_ASM)
-   file(GLOB GMX_SSEKERNEL_C_SRC   nonbonded/nb_kernel_x86_64_${SSETYPE}/*.c)
-   if(GMX_ASM_USEASM_NASM)
-     file(GLOB GMX_SSEKERNEL_ASM_SRC nonbonded/nb_kernel_x86_64_${SSETYPE}/*intel_syntax*.s)
-   else()
-     file(GLOB GMX_SSEKERNEL_ASM_SRC nonbonded/nb_kernel_x86_64_${SSETYPE}/*${SSETYPE}.s nonbonded/nb_kernel_x86_64_${SSETYPE}/*asm.s)
-   endif()
- endif(GMX_X86_64_ASM)
- 
- if(GMX_FORTRAN)
-   if (GMX_DOUBLE)
-     file(GLOB FORTRAN_SOURCES nonbonded/nb_kernel_f77_double/*.[cf])
-   else(GMX_DOUBLE)
-     file(GLOB FORTRAN_SOURCES nonbonded/nb_kernel_f77_single/*.[cf])
-   endif(GMX_DOUBLE)
- endif(GMX_FORTRAN)
- 
- if(GMX_POWER6)
-   file(GLOB FORTRAN_SOURCES nonbonded/nb_kernel_power6/*.[cF])
- endif(GMX_POWER6)
- 
- if(GMX_BLUEGENE)
-   file(GLOB GMX_BLUEGENE_C_SRC nonbonded/nb_kernel_bluegene/*.c)
- endif(GMX_BLUEGENE)
- 
- if(GMX_PPC_ALTIVEC)
-   file(GLOB GMX_PPC_ALTIVEC_SRC nonbonded/nb_kernel_ppc_altivec/*.c)
- endif(GMX_PPC_ALTIVEC)
++file(GLOB GMXLIB_SOURCES *.c *.cpp statistics/*.c)
 +
 +# This would be the standard way to include thread_mpi, but we want libgmx
 +# to link the functions directly
 +#if(GMX_THREAD_MPI)
 +#    add_subdirectory(thread_mpi)
 +#endif(GMX_THREAD_MPI)
 +#target_link_libraries(gmx ${GMX_EXTRA_LIBRARIES} ${THREAD_MPI_LIB})
 +
 +# Files called xxx_test.c are test drivers with a main() function for module xxx.c,
 +# so they should not be included in the library
 +file(GLOB_RECURSE NOT_GMXLIB_SOURCES *_test.c *\#*)
 +list(REMOVE_ITEM GMXLIB_SOURCES ${NOT_GMXLIB_SOURCES})  
 +
 +if(GMX_USE_PLUGINS)
 +  set(GMXLIB_SOURCES ${GMXLIB_SOURCES} ${CMAKE_SOURCE_DIR}/src/external/vmd_molfile/vmddlopen.c)
 +else()
 +  list(REMOVE_ITEM GMXLIB_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/vmdio.c)
 +endif()
 +
 +# An ugly hack to get absolute paths...
 +file(GLOB THREAD_MPI_SOURCES ${THREAD_MPI_SRC})
 +file(GLOB THREAD_MPI_CXX_SOURCES ${THREAD_MPI_CXX_SRC})
 +
- set(GMX_SSEKERNEL_ASM_SRC ${GMX_SSEKERNEL_ASM_SRC} PARENT_SCOPE)
 +set(GMXLIB_SOURCES ${GMXLIB_SOURCES}
-     ${GMX_SSEKERNEL_C_SRC} ${FORTRAN_SOURCES}
-     ${GMX_BLUEGENE_C_SRC} ${GMX_PPC_ALTIVEC_SRC} ${THREAD_MPI_SOURCES}
-     ${THREAD_MPI_CXX_SOURCES}
++    ${THREAD_MPI_SOURCES} ${THREAD_MPI_CXX_SOURCES} ${NONBONDED_SOURCES}
 +    PARENT_SCOPE)
diff --cc src/gromacs/gmxlib/checkpoint.c
index 5d9beaa2ef,0000000000..c1792367b0
mode 100644,000000..100644
--- a/src/gromacs/gmxlib/checkpoint.c
+++ b/src/gromacs/gmxlib/checkpoint.c
@@@ -1,2398 -1,0 +1,2404 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + * This file is part of Gromacs        Copyright (c) 1991-2008
 + * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gnomes, ROck Monsters And Chili Sauce
 + */
 +
 +/* The source code in this file should be thread-safe. 
 + Please keep it that way. */
 +
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +#include "gromacs/utility/gmx_header_config.h"
 +
 +#include <string.h>
 +#include <time.h>
 +
 +#ifdef HAVE_SYS_TIME_H
 +#include <sys/time.h>
 +#endif
 +
 +#ifdef HAVE_UNISTD_H
 +#include <unistd.h>
 +#endif
 +
 +#ifdef GMX_NATIVE_WINDOWS
 +/* _chsize_s */
 +#include <io.h>
 +#include <sys/locking.h>
 +#endif
 +
 +
 +#include "filenm.h"
 +#include "names.h"
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "gmxfio.h"
 +#include "xdrf.h"
 +#include "statutil.h"
 +#include "txtdump.h"
 +#include "vec.h"
 +#include "mdrun.h"
 +#include "network.h"
 +#include "gmx_random.h"
 +#include "checkpoint.h"
 +#include "futil.h"
 +#include "string2.h"
 +#include <fcntl.h>
 +
 +
 +#ifdef GMX_FAHCORE
 +#include "corewrap.h"
 +#endif
 +
 +
 +/* Portable version of ctime_r implemented in src/gromacs/gmxlib/string2.c, but we do not want it declared in public installed headers */
 +char *
 +gmx_ctime_r(const time_t *clock,char *buf, int n);
 +
 +
 +#define CPT_MAGIC1 171817
 +#define CPT_MAGIC2 171819
 +#define CPTSTRLEN 1024
 +
 +#ifdef GMX_DOUBLE
 +#define GMX_CPT_BUILD_DP 1
 +#else
 +#define GMX_CPT_BUILD_DP 0
 +#endif
 +
 +/* cpt_version should normally only be changed
 + * when the header or footer format changes.
 + * The state data format itself is backward and forward compatible.
 + * But old code can not read a new entry that is present in the file
 + * (but can read a new format when new entries are not present).
 + */
 +static const int cpt_version = 13;
 +
 +
 +const char *est_names[estNR]=
 +{
 +    "FE-lambda",
 +    "box", "box-rel", "box-v", "pres_prev",
 +    "nosehoover-xi", "thermostat-integral",
 +    "x", "v", "SDx", "CGp", "LD-rng", "LD-rng-i",
 +    "disre_initf", "disre_rm3tav",
 +    "orire_initf", "orire_Dtav",
 +    "svir_prev", "nosehoover-vxi", "v_eta", "vol0", "nhpres_xi", "nhpres_vxi", "fvir_prev","fep_state", "MC-rng", "MC-rng-i"
 +};
 +
 +enum { eeksEKIN_N, eeksEKINH, eeksDEKINDL, eeksMVCOS, eeksEKINF, eeksEKINO, eeksEKINSCALEF, eeksEKINSCALEH, eeksVSCALE, eeksEKINTOTAL, eeksNR };
 +
 +const char *eeks_names[eeksNR]=
 +{
 +    "Ekin_n", "Ekinh", "dEkindlambda", "mv_cos",
 +    "Ekinf", "Ekinh_old", "EkinScaleF_NHC", "EkinScaleH_NHC","Vscale_NHC","Ekin_Total"
 +};
 +
 +enum { eenhENERGY_N, eenhENERGY_AVER, eenhENERGY_SUM, eenhENERGY_NSUM,
 +       eenhENERGY_SUM_SIM, eenhENERGY_NSUM_SIM,
 +       eenhENERGY_NSTEPS, eenhENERGY_NSTEPS_SIM, 
 +       eenhENERGY_DELTA_H_NN,
 +       eenhENERGY_DELTA_H_LIST, 
 +       eenhENERGY_DELTA_H_STARTTIME, 
 +       eenhENERGY_DELTA_H_STARTLAMBDA, 
 +       eenhNR };
 +
 +const char *eenh_names[eenhNR]=
 +{
 +    "energy_n", "energy_aver", "energy_sum", "energy_nsum",
 +    "energy_sum_sim", "energy_nsum_sim",
 +    "energy_nsteps", "energy_nsteps_sim", 
 +    "energy_delta_h_nn",
 +    "energy_delta_h_list", 
 +    "energy_delta_h_start_time", 
 +    "energy_delta_h_start_lambda"
 +};
 +
 +/* free energy history variables -- need to be preserved over checkpoint */
 +enum { edfhBEQUIL,edfhNATLAMBDA,edfhWLHISTO,edfhWLDELTA,edfhSUMWEIGHTS,edfhSUMDG,edfhSUMMINVAR,edfhSUMVAR,
 +       edfhACCUMP,edfhACCUMM,edfhACCUMP2,edfhACCUMM2,edfhTIJ,edfhTIJEMP,edfhNR };
 +/* free energy history variable names  */
 +const char *edfh_names[edfhNR]=
 +{
 +    "bEquilibrated","N_at_state", "Wang-Landau_Histogram", "Wang-Landau-delta", "Weights", "Free Energies", "minvar","variance",
 +    "accumulated_plus", "accumulated_minus", "accumulated_plus_2",  "accumulated_minus_2", "Tij", "Tij_empirical"
 +};
 +
 +#ifdef GMX_NATIVE_WINDOWS
 +/* Truncate filename to size bytes: -1 if it cannot be opened, else the _chsize_s() result. */
 +static int
 +gmx_wintruncate(const char *filename, __int64 size)
 +{
 +#ifdef GMX_FAHCORE
 +    /*we do this elsewhere*/
 +    return 0;
 +#else
 +    FILE *fp=fopen(filename,"rb+");
 +    int   rc;
 +
 +    if(fp==NULL)
 +    {
 +        return -1;
 +    }
 +    rc = _chsize_s( fileno(fp), size);
 +    fclose(fp); /* release the stream; it was previously leaked on every call */
 +    return rc;
 +#endif
 +}
 +#endif
 +
 +
 +enum { ecprREAL, ecprRVEC, ecprMATRIX };
 +
 +enum { cptpEST, cptpEEKS, cptpEENH, cptpEDFH };
 +/* enums for the different components of checkpoint variables, replacing the hard coded ones.
 +   cptpEST - state variables.
 +   cptpEEKS - Kinetic energy state variables.
 +   cptpEENH - Energy history state variables.
 +   cptpEDFH - free energy history variables.
 +*/
 +
 +
 +static const char *st_names(int cptp,int ecpt)
 +{
 +    switch (cptp)
 +    {
 +    case cptpEST: return est_names [ecpt]; break;
 +    case cptpEEKS: return eeks_names[ecpt]; break;
 +    case cptpEENH: return eenh_names[ecpt]; break;
 +    case cptpEDFH: return edfh_names[ecpt]; break;
 +    }
 +
 +    return NULL;
 +}
 +
 +static void cp_warning(FILE *fp)
 +{
 +    fprintf(fp,"\nWARNING: Checkpoint file is corrupted or truncated\n\n");
 +}
 +
 +static void cp_error()
 +{
 +    gmx_fatal(FARGS,"Checkpoint file corrupted/truncated, or maybe you are out of disk space?");
 +}
 +
 +static void do_cpt_string_err(XDR *xd,gmx_bool bRead,const char *desc,char **s,FILE *list)
 +{
 +    bool_t res=0;
 +    
 +    if (bRead)
 +    {
 +        snew(*s,CPTSTRLEN);
 +    }
 +    res = xdr_string(xd,s,CPTSTRLEN);
 +    if (res == 0)
 +    {
 +        cp_error();
 +    }
 +    if (list)
 +    {
 +        fprintf(list,"%s = %s\n",desc,*s);
 +        sfree(*s);
 +    }
 +}
 +
 +static int do_cpt_int(XDR *xd,const char *desc,int *i,FILE *list)
 +{
 +    bool_t res=0;
 +    
 +    res = xdr_int(xd,i);
 +    if (res == 0)
 +    {
 +        return -1;
 +    }
 +    if (list)
 +    {
 +        fprintf(list,"%s = %d\n",desc,*i);
 +    }
 +    return 0;
 +}
 +
 +static int do_cpt_u_chars(XDR *xd,const char *desc,int n,unsigned char *i,FILE *list)
 +{
 +    bool_t res=1;
 +    int j;
 +    if (list)
 +    {
 +        fprintf(list,"%s = ",desc);
 +    }
 +    for (j=0; j<n && res; j++)
 +    {
 +        res &= xdr_u_char(xd,&i[j]);
 +        if (list)
 +        {
 +            fprintf(list,"%02x",i[j]);
 +        }
 +    }
 +    if (list)
 +    {
 +        fprintf(list,"\n");
 +    }
 +    if (res == 0)
 +    {
 +        return -1;
 +    }
 +
 +    return 0;
 +}
 +
 +static void do_cpt_int_err(XDR *xd,const char *desc,int *i,FILE *list)
 +{
 +    if (do_cpt_int(xd,desc,i,list) < 0)
 +    {
 +        cp_error();
 +    }
 +}
 +
 +static void do_cpt_step_err(XDR *xd,const char *desc,gmx_large_int_t *i,FILE *list)
 +{
 +    bool_t res=0;
 +    char   buf[STEPSTRSIZE];
 +
 +    res = xdr_gmx_large_int(xd,i,"reading checkpoint file");
 +    if (res == 0)
 +    {
 +        cp_error();
 +    }
 +    if (list)
 +    {
 +        fprintf(list,"%s = %s\n",desc,gmx_step_str(*i,buf));
 +    }
 +}
 +
 +static void do_cpt_double_err(XDR *xd,const char *desc,double *f,FILE *list)
 +{
 +    bool_t res=0;
 +    
 +    res = xdr_double(xd,f);
 +    if (res == 0)
 +    {
 +        cp_error();
 +    }
 +    if (list)
 +    {
 +        fprintf(list,"%s = %f\n",desc,*f);
 +    }
 +}
 +
 +/* If nval >= 0, nval is used; on read this should match the passed value.
 + * If nval < 0, *nptr is used; on read the value is stored in *nptr
 + */
 +static int do_cpte_reals_low(XDR *xd,int cptp,int ecpt,int sflags,
 +                             int nval,int *nptr,real **v,
 +                             FILE *list,int erealtype)
 +{
 +    bool_t res=0;
 +#ifndef GMX_DOUBLE
 +    int  dtc=xdr_datatype_float; 
 +#else
 +    int  dtc=xdr_datatype_double;
 +#endif
 +    real *vp,*va=NULL;
 +    float  *vf;
 +    double *vd;
 +    int  nf,dt,i;
 +    
 +    if (list == NULL)
 +    {
 +        if (nval >= 0)
 +        {
 +            nf = nval;
 +        }
 +        else
 +        {
 +        if (nptr == NULL)
 +        {
 +            gmx_incons("*ntpr=NULL in do_cpte_reals_low");
 +        }
 +        nf = *nptr;
 +        }
 +    }
 +    res = xdr_int(xd,&nf);
 +    if (res == 0)
 +    {
 +        return -1;
 +    }
 +    if (list == NULL)
 +    {
 +        if (nval >= 0)
 +        {
 +            if (nf != nval)
 +            {
 +                gmx_fatal(FARGS,"Count mismatch for state entry %s, code count is %d, file count is %d\n",st_names(cptp,ecpt),nval,nf);
 +            }
 +        }
 +        else
 +        {
 +            *nptr = nf;
 +        }
 +    }
 +    dt = dtc;
 +    res = xdr_int(xd,&dt);
 +    if (res == 0)
 +    {
 +        return -1;
 +    }
 +    if (dt != dtc)
 +    {
 +        fprintf(stderr,"Precision mismatch for state entry %s, code precision is %s, file precision is %s\n",
 +                st_names(cptp,ecpt),xdr_datatype_names[dtc],
 +                xdr_datatype_names[dt]);
 +    }
 +    if (list || !(sflags & (1<<ecpt)))
 +    {
 +        snew(va,nf);
 +        vp = va;
 +    }
 +    else
 +    {
 +        if (*v == NULL)
 +        {
 +            snew(*v,nf);
 +        }
 +        vp = *v;
 +    }
 +    if (dt == xdr_datatype_float)
 +    {
 +        if (dtc == xdr_datatype_float)
 +        {
 +            vf = (float *)vp;
 +        }
 +        else
 +        {
 +            snew(vf,nf);
 +        }
 +        res = xdr_vector(xd,(char *)vf,nf,
 +                         (unsigned int)sizeof(float),(xdrproc_t)xdr_float);
 +        if (res == 0)
 +        {
 +            return -1;
 +        }
 +        if (dtc != xdr_datatype_float)
 +        {
 +            for(i=0; i<nf; i++)
 +            {
 +                vp[i] = vf[i];
 +            }
 +            sfree(vf);
 +        }
 +    }
 +    else
 +    {
 +        if (dtc == xdr_datatype_double)
 +        {
 +            vd = (double *)vp;
 +        }
 +        else
 +        {
 +            snew(vd,nf);
 +        }
 +        res = xdr_vector(xd,(char *)vd,nf,
 +                         (unsigned int)sizeof(double),(xdrproc_t)xdr_double);
 +        if (res == 0)
 +        {
 +            return -1;
 +        }
 +        if (dtc != xdr_datatype_double)
 +        {
 +            for(i=0; i<nf; i++)
 +            {
 +                vp[i] = vd[i];
 +            }
 +            sfree(vd);
 +        }
 +    }
 +    
 +    if (list)
 +    {
 +        switch (erealtype)
 +        {
 +        case ecprREAL:
 +            pr_reals(list,0,st_names(cptp,ecpt),vp,nf);
 +            break;
 +        case ecprRVEC:
 +            pr_rvecs(list,0,st_names(cptp,ecpt),(rvec *)vp,nf/3);
 +            break;
 +        default:
 +            gmx_incons("Unknown checkpoint real type");
 +        }
 +    }
 +    if (va)
 +    {
 +        sfree(va);
 +    }
 +
 +    return 0;
 +}
 +
 +
 +/* Wrapper around do_cpte_reals_low() for a plain real array (ecprREAL)
 + * whose length n is known to the caller.
 + * n is passed as the expected count, so the count found in the stream has
 + * to match it: do_cpte_reals_low() generates a fatal error when this is not
 + * the case (the check is skipped when list is non-NULL).
 + */
 +static int do_cpte_reals(XDR *xd,int cptp,int ecpt,int sflags,
 +                         int n,real **v,FILE *list)
 +{
 +    return do_cpte_reals_low(xd,cptp,ecpt,sflags,n,NULL,v,list,ecprREAL);
 +}
 +
 +/* This function does the same as do_cpte_reals, except that the count is
 + * passed by pointer: nval = -1 makes do_cpte_reals_low() start from *n and,
 + * when list is NULL, store the count obtained from the stream in *n
 + * instead of comparing the two.
 + */
 +static int do_cpte_n_reals(XDR *xd,int cptp,int ecpt,int sflags,
 +                           int *n,real **v,FILE *list)
 +{
 +    return do_cpte_reals_low(xd,cptp,ecpt,sflags,-1,n,v,list,ecprREAL);
 +}
 +
 +/* Transfer a single real: do_cpte_reals_low() with a fixed count of 1.
 + * The address of the parameter r is passed as the array pointer, so the
 + * value is transferred in place at r.
 + * Returns the status of do_cpte_reals_low().
 + */
 +static int do_cpte_real(XDR *xd,int cptp,int ecpt,int sflags,
 +                        real *r,FILE *list)
 +{
 +    return do_cpte_reals_low(xd,cptp,ecpt,sflags,1,NULL,&r,list,ecprREAL);
 +}
 +
 +/* Transfer an array of ints through xd, preceded by its count and the XDR
 + * datatype code.
 + * When list is NULL and v is non-NULL the count in the stream has to equal
 + * n; a datatype code other than int is rejected in every mode.  Both
 + * mismatches go through gmx_fatal().
 + * The values go to/from *v (allocated here when *v is NULL) if list is NULL,
 + * v is non-NULL and bit ecpt is set in sflags; otherwise a scratch buffer is
 + * used and freed before returning.  With list non-NULL the values are
 + * printed there.
 + * Returns 0 on success and -1 when an XDR call fails.
 + */
 +static int do_cpte_ints(XDR *xd,int cptp,int ecpt,int sflags,
 +                        int n,int **v,FILE *list)
 +{
 +    bool_t res=0;
 +    int  dtc=xdr_datatype_int;
 +    int *vp,*va=NULL;
 +    int  nf,dt;
 +
 +    nf = n;
 +    res = xdr_int(xd,&nf);
 +    if (res == 0)
 +    {
 +        return -1;
 +    }
 +    if (list == NULL && v != NULL && nf != n)
 +    {
 +        gmx_fatal(FARGS,"Count mismatch for state entry %s, code count is %d, file count is %d\n",st_names(cptp,ecpt),n,nf);
 +    }
 +    dt = dtc;
 +    res = xdr_int(xd,&dt);
 +    if (res == 0)
 +    {
 +        return -1;
 +    }
 +    if (dt != dtc)
 +    {
 +        gmx_fatal(FARGS,"Type mismatch for state entry %s, code type is %s, file type is %s\n",
 +                  st_names(cptp,ecpt),xdr_datatype_names[dtc],
 +                  xdr_datatype_names[dt]);
 +    }
 +    if (list || !(sflags & (1<<ecpt)) || v == NULL)
 +    {
 +        /* Nothing to store into: consume the data via a scratch buffer */
 +        snew(va,nf);
 +        vp = va;
 +    }
 +    else
 +    {
 +        if (*v == NULL)
 +        {
 +            snew(*v,nf);
 +        }
 +        vp = *v;
 +    }
 +    res = xdr_vector(xd,(char *)vp,nf,
 +                     (unsigned int)sizeof(int),(xdrproc_t)xdr_int);
 +    if (res == 0)
 +    {
 +        /* Do not leak the scratch buffer when the vector transfer failed */
 +        if (va)
 +        {
 +            sfree(va);
 +        }
 +        return -1;
 +    }
 +    if (list)
 +    {
 +        pr_ivec(list,0,st_names(cptp,ecpt),vp,nf,TRUE);
 +    }
 +    if (va)
 +    {
 +        sfree(va);
 +    }
 +
 +    return 0;
 +}
 +
 +/* Transfer a single int: do_cpte_ints() with a fixed count of 1, using a
 + * local pointer to i as the array pointer.
 + */
 +static int do_cpte_int(XDR *xd,int cptp,int ecpt,int sflags,
 +                       int *i,FILE *list)
 +{
 +    int *single = i;
 +
 +    return do_cpte_ints(xd,cptp,ecpt,sflags,1,&single,list);
 +}
 +
 +/* Transfer an array of doubles through xd, preceded by its count and the
 + * XDR datatype code.
 + * When list is NULL the count in the stream has to equal n; a datatype code
 + * other than double is rejected in every mode.  Both mismatches go through
 + * gmx_fatal().
 + * The values go to/from *v (allocated here when *v is NULL) if list is NULL
 + * and bit ecpt is set in sflags; otherwise a scratch buffer is used and
 + * freed before returning.  With list non-NULL the values are printed there.
 + * Returns 0 on success and -1 when an XDR call fails.
 + */
 +static int do_cpte_doubles(XDR *xd,int cptp,int ecpt,int sflags,
 +                           int n,double **v,FILE *list)
 +{
 +    bool_t res=0;
 +    int  dtc=xdr_datatype_double;
 +    double *vp,*va=NULL;
 +    int  nf,dt;
 +
 +    nf = n;
 +    res = xdr_int(xd,&nf);
 +    if (res == 0)
 +    {
 +        return -1;
 +    }
 +    if (list == NULL && nf != n)
 +    {
 +        gmx_fatal(FARGS,"Count mismatch for state entry %s, code count is %d, file count is %d\n",st_names(cptp,ecpt),n,nf);
 +    }
 +    dt = dtc;
 +    res = xdr_int(xd,&dt);
 +    if (res == 0)
 +    {
 +        return -1;
 +    }
 +    if (dt != dtc)
 +    {
 +        gmx_fatal(FARGS,"Precision mismatch for state entry %s, code precision is %s, file precision is %s\n",
 +                  st_names(cptp,ecpt),xdr_datatype_names[dtc],
 +                  xdr_datatype_names[dt]);
 +    }
 +    if (list || !(sflags & (1<<ecpt)))
 +    {
 +        /* Nothing to store into: consume the data via a scratch buffer */
 +        snew(va,nf);
 +        vp = va;
 +    }
 +    else
 +    {
 +        if (*v == NULL)
 +        {
 +            snew(*v,nf);
 +        }
 +        vp = *v;
 +    }
 +    res = xdr_vector(xd,(char *)vp,nf,
 +                     (unsigned int)sizeof(double),(xdrproc_t)xdr_double);
 +    if (res == 0)
 +    {
 +        /* Do not leak the scratch buffer when the vector transfer failed */
 +        if (va)
 +        {
 +            sfree(va);
 +        }
 +        return -1;
 +    }
 +    if (list)
 +    {
 +        pr_doubles(list,0,st_names(cptp,ecpt),vp,nf);
 +    }
 +    if (va)
 +    {
 +        sfree(va);
 +    }
 +
 +    return 0;
 +}
 +
 +/* Transfer a single double: do_cpte_doubles() with a fixed count of 1,
 + * using a local pointer to r as the array pointer.
 + */
 +static int do_cpte_double(XDR *xd,int cptp,int ecpt,int sflags,
 +                          double *r,FILE *list)
 +{
 +    double *single = r;
 +
 +    return do_cpte_doubles(xd,cptp,ecpt,sflags,1,&single,list);
 +}
 +
 +
 +/* Transfer n rvecs as one flat array of n*DIM reals through
 + * do_cpte_reals_low(); ecprRVEC selects rvec formatting when listing.
 + * Returns the status of do_cpte_reals_low().
 + */
 +static int do_cpte_rvecs(XDR *xd,int cptp,int ecpt,int sflags,
 +                         int n,rvec **v,FILE *list)
 +{
 +    return do_cpte_reals_low(xd,cptp,ecpt,sflags,
 +                             n*DIM,NULL,(real **)v,list,ecprRVEC);
 +}
 +
 +/* Transfer one matrix as DIM*DIM reals through do_cpte_reals_low().
 + * The callee is always called with list = NULL; printing is done here with
 + * pr_rvecs() so that the matrix is shown as DIM rows.
 + * Returns the status of do_cpte_reals_low().
 + */
 +static int do_cpte_matrix(XDR *xd,int cptp,int ecpt,int sflags,
 +                          matrix v,FILE *list)
 +{
 +    real *vr;
 +    int  ret;   /* status code from the callee, not a real value */
 +
 +    vr = (real *)&(v[0][0]);
 +    ret = do_cpte_reals_low(xd,cptp,ecpt,sflags,
 +                            DIM*DIM,NULL,&vr,NULL,ecprMATRIX);
 +
 +    if (list && ret == 0)
 +    {
 +        pr_rvecs(list,0,st_names(cptp,ecpt),v,DIM);
 +    }
 +
 +    return ret;
 +}
 +
 +
 +/* Transfer an n x n array of reals stored as n row pointers v[0..n-1];
 + * each row goes through do_cpte_reals_low() as n reals with list = NULL.
 + * When list is non-NULL, every successfully transferred row is printed as
 + * "<entry name>[row]".
 + * When v is NULL a temporary row-pointer array is used so that the data are
 + * still consumed from the stream; it is freed again before returning.
 + * Returns 0 when all rows succeeded, otherwise the status of the last row
 + * that failed.
 + */
 +static int do_cpte_nmatrix(XDR *xd,int cptp,int ecpt,int sflags,
 +                           int n, real **v,FILE *list)
 +{
 +    int   i;
 +    int   ret,reti;
 +    real **vlocal=NULL;
 +    char  name[CPTSTRLEN];
 +
 +    ret = 0;
 +    if (v==NULL)
 +    {
 +        snew(vlocal,n);
 +        v = vlocal;
 +    }
 +    for (i=0;i<n;i++)
 +    {
 +        reti = do_cpte_reals_low(xd,cptp,ecpt,sflags,n,NULL,&(v[i]),NULL,ecprREAL);
 +        if (list && reti == 0)
 +        {
 +            sprintf(name,"%s[%d]",st_names(cptp,ecpt),i);
 +            pr_reals(list,0,name,v[i],n);
 +        }
 +        if (reti != 0)
 +        {
 +            /* Remember the failure instead of silently dropping it */
 +            ret = reti;
 +        }
 +    }
 +    if (vlocal != NULL)
 +    {
 +        /* The rows were only needed to consume the stream */
 +        for (i=0;i<n;i++)
 +        {
 +            if (vlocal[i] != NULL)
 +            {
 +                sfree(vlocal[i]);
 +            }
 +        }
 +        sfree(vlocal);
 +    }
 +    return ret;
 +}
 +
 +/* Transfer an array of n matrices through xd.
 + * The matrix count is transferred first with xdr_int(); when list is NULL
 + * it has to equal n, otherwise gmx_fatal() is called.  The matrices are then
 + * copied into one flat array of nf*DIM*DIM reals, passed through
 + * do_cpte_reals_low() (which is always called with list = NULL) and copied
 + * back.
 + * The matrices live in *v (allocated here when *v is NULL) if list is NULL
 + * and bit ecpt is set in sflags; otherwise a temporary array is used and
 + * freed before returning.  With list non-NULL each matrix is printed with
 + * pr_rvecs() after a successful transfer.
 + * Returns -1 when the count transfer fails, otherwise the status of
 + * do_cpte_reals_low().
 + */
 +static int do_cpte_matrices(XDR *xd,int cptp,int ecpt,int sflags,
 +                            int n,matrix **v,FILE *list)
 +{
 +    bool_t res=0;
 +    matrix *vp,*va=NULL;
 +    real *vr;
 +    int  nf,i,j,k;
 +    int  ret;
 +
 +    nf = n;
 +    res = xdr_int(xd,&nf);
 +    if (res == 0)
 +    {
 +        return -1;
 +    }
 +    if (list == NULL && nf != n)
 +    {
 +        gmx_fatal(FARGS,"Count mismatch for state entry %s, code count is %d, file count is %d\n",st_names(cptp,ecpt),n,nf);
 +    }
 +    if (list || !(sflags & (1<<ecpt)))
 +    {
 +        snew(va,nf);
 +        vp = va;
 +    }
 +    else
 +    {
 +        if (*v == NULL)
 +        {
 +            snew(*v,nf);
 +        }
 +        vp = *v;
 +    }
 +    /* Pack the matrices into one flat buffer, matrix by matrix, row by row */
 +    snew(vr,nf*DIM*DIM);
 +    for(i=0; i<nf; i++)
 +    {
 +        for(j=0; j<DIM; j++)
 +        {
 +            for(k=0; k<DIM; k++)
 +            {
 +                vr[(i*DIM+j)*DIM+k] = vp[i][j][k];
 +            }
 +        }
 +    }
 +    ret = do_cpte_reals_low(xd,cptp,ecpt,sflags,
 +                            nf*DIM*DIM,NULL,&vr,NULL,ecprMATRIX);
 +    /* Unpack again; this is done regardless of the value of ret */
 +    for(i=0; i<nf; i++)
 +    {
 +        for(j=0; j<DIM; j++)
 +        {
 +            for(k=0; k<DIM; k++)
 +            {
 +                vp[i][j][k] = vr[(i*DIM+j)*DIM+k];
 +            }
 +        }
 +    }
 +    sfree(vr);
 +    
 +    if (list && ret == 0)
 +    {
 +        for(i=0; i<nf; i++)
 +        {
 +            pr_rvecs(list,0,st_names(cptp,ecpt),vp[i],DIM);
 +        }
 +    }
 +    if (va)
 +    {
 +        sfree(va);
 +    }
 +    
 +    return ret;
 +}
 +
 +/* Transfer the checkpoint header through the XDR stream xd.
 + * The fields go through xd in the fixed order coded below.  Fields that
 + * are only present from a certain file version on are transferred when
 + * *file_version is high enough and set to a default value otherwise.
 + * gmx_fatal() is called when the leading magic number cannot be transferred
 + * or does not equal CPT_MAGIC1, and when *file_version is larger than
 + * cpt_version; all other transfers use the *_err helpers.
 + * When list is non-NULL the helpers print each field there.
 + */
 +static void do_cpt_header(XDR *xd,gmx_bool bRead,int *file_version,
-                           char **version,char **btime,char **buser,char **bmach,
++                          char **version,char **btime,char **buser,char **bhost,
 +                          int *double_prec,
 +                          char **fprog,char **ftime,
 +                          int *eIntegrator,int *simulation_part,
 +                          gmx_large_int_t *step,double *t,
 +                          int *nnodes,int *dd_nc,int *npme,
 +                          int *natoms,int *ngtc, int *nnhpres, int *nhchainlength,
 +                          int *nlambda, int *flags_state,
 +                          int *flags_eks,int *flags_enh, int *flags_dfh,
 +                          FILE *list)
 +{
 +    bool_t res=0;
 +    int  magic;
 +    int  idum=0;
 +    int  i;
 +    char *fhost;
 +
 +    /* When reading, start from a value that differs from the magic number
 +     * (presumably so that an untouched variable cannot pass the check below).
 +     */
 +    if (bRead)
 +    {
 +        magic = -1;
 +    }
 +    else
 +    {
 +        magic = CPT_MAGIC1;
 +    }
 +    res = xdr_int(xd,&magic);
 +    if (res == 0)
 +    {
 +        gmx_fatal(FARGS,"The checkpoint file is empty/corrupted, or maybe you are out of disk space?");
 +    }
 +    if (magic != CPT_MAGIC1)
 +    {
 +        gmx_fatal(FARGS,"Start of file magic number mismatch, checkpoint file has %d, should be %d\n"
 +                  "The checkpoint file is corrupted or not a checkpoint file",
 +                  magic,CPT_MAGIC1);
 +    }
 +    /* When writing, fhost gets the host name, or "unknown" when
 +     * gethostname() is not available or fails.
 +     */
 +    if (!bRead)
 +    {
 +        snew(fhost,255);
 +#ifdef HAVE_UNISTD_H
 +        if (gethostname(fhost,255) != 0)
 +        {
 +            sprintf(fhost,"unknown");
 +        }
 +#else
 +        sprintf(fhost,"unknown");
 +#endif  
 +    }
 +    do_cpt_string_err(xd,bRead,"GROMACS version"           ,version,list);
 +    do_cpt_string_err(xd,bRead,"GROMACS build time"        ,btime,list);
 +    do_cpt_string_err(xd,bRead,"GROMACS build user"        ,buser,list);
-     do_cpt_string_err(xd,bRead,"GROMACS build machine"     ,bmach,list);
++    do_cpt_string_err(xd,bRead,"GROMACS build host"        ,bhost,list);
 +    do_cpt_string_err(xd,bRead,"generating program"        ,fprog,list);
 +    do_cpt_string_err(xd,bRead,"generation time"           ,ftime,list);
 +    /* Preset to the version of this code before the transfer */
 +    *file_version = cpt_version;
 +    do_cpt_int_err(xd,"checkpoint file version",file_version,list);
 +    if (*file_version > cpt_version)
 +    {
 +        gmx_fatal(FARGS,"Attempting to read a checkpoint file of version %d with code of version %d\n",*file_version,cpt_version);
 +    }
 +    if (*file_version >= 13)
 +    {
 +        do_cpt_int_err(xd,"GROMACS double precision",double_prec,list);
 +    }
 +    else
 +    {
 +        *double_prec = -1;
 +    }
 +    if (*file_version >= 12)
 +    {
 +        do_cpt_string_err(xd,bRead,"generating host"           ,&fhost,list);
 +        /* fhost is only freed here when not listing */
 +        if (list == NULL)
 +        {
 +            sfree(fhost);
 +        }
 +    }
 +    do_cpt_int_err(xd,"#atoms"            ,natoms     ,list);
 +    do_cpt_int_err(xd,"#T-coupling groups",ngtc       ,list);
 +    if (*file_version >= 10) 
 +    {
 +        do_cpt_int_err(xd,"#Nose-Hoover T-chains",nhchainlength,list);
 +    }
 +    else
 +    {
 +        *nhchainlength = 1;
 +    }
 +    if (*file_version >= 11)
 +    {
 +        do_cpt_int_err(xd,"#Nose-Hoover T-chains for barostat ",nnhpres,list);
 +    }
 +    else
 +    {
 +        *nnhpres = 0;
 +    }
 +    if (*file_version >= 12)
 +    {
 +        do_cpt_int_err(xd,"# of total lambda states ",nlambda,list);
 +    }
 +    else
 +    {
 +        *nlambda = 0;
 +    }
 +    do_cpt_int_err(xd,"integrator"        ,eIntegrator,list);
 +	if (*file_version >= 3)
 +	{
 +		do_cpt_int_err(xd,"simulation part #", simulation_part,list);
 +	}
 +	else
 +	{
 +		*simulation_part = 1;
 +	}
 +    /* Before file version 5 the step is transferred as a plain int */
 +    if (*file_version >= 5)
 +    {
 +        do_cpt_step_err(xd,"step"         ,step       ,list);
 +    }
 +    else
 +    {
 +        do_cpt_int_err(xd,"step"          ,&idum      ,list);
 +        *step = idum;
 +    }
 +    do_cpt_double_err(xd,"t"              ,t          ,list);
 +    do_cpt_int_err(xd,"#PP-nodes"         ,nnodes     ,list);
 +    /* With dd_nc == NULL the three grid sizes go to/from the dummy idum */
 +    idum = 1;
 +    do_cpt_int_err(xd,"dd_nc[x]",dd_nc ? &(dd_nc[0]) : &idum,list);
 +    do_cpt_int_err(xd,"dd_nc[y]",dd_nc ? &(dd_nc[1]) : &idum,list);
 +    do_cpt_int_err(xd,"dd_nc[z]",dd_nc ? &(dd_nc[2]) : &idum,list);
 +    do_cpt_int_err(xd,"#PME-only nodes",npme,list);
 +    do_cpt_int_err(xd,"state flags",flags_state,list);
 +	if (*file_version >= 4)
 +    {
 +        do_cpt_int_err(xd,"ekin data flags",flags_eks,list);
 +        do_cpt_int_err(xd,"energy history flags",flags_enh,list);
 +    }
 +    else
 +    {
 +        /* Before version 4 the energy history flags are taken from the bits
 +         * of *flags_state above estORIRE_DTAV; the three bits directly above
 +         * estORIRE_DTAV are then cleared from *flags_state.
 +         */
 +        *flags_eks  = 0;
 +        *flags_enh   = (*flags_state >> (estORIRE_DTAV+1));
 +        *flags_state = (*flags_state & ~((1<<(estORIRE_DTAV+1)) |
 +                                         (1<<(estORIRE_DTAV+2)) |
 +                                         (1<<(estORIRE_DTAV+3))));
 +    }
 +	if (*file_version >= 12)
 +    {
 +        do_cpt_int_err(xd,"df history flags",flags_dfh,list);
 +    } else {
 +        *flags_dfh = 0;
 +    }
 +}
 +
 +/* Transfer the closing magic number CPT_MAGIC2 for file versions >= 2;
 + * older versions have no footer and always yield 0.
 + * cp_error() is called when xdr_int() fails.
 + * Returns 0 when the value equals CPT_MAGIC2 after the transfer, else -1.
 + */
 +static int do_cpt_footer(XDR *xd,gmx_bool bRead,int file_version)
 +{
 +    int marker = CPT_MAGIC2;
 +
 +    if (file_version < 2)
 +    {
 +        return 0;
 +    }
 +
 +    if (!xdr_int(xd,&marker))
 +    {
 +        cp_error();
 +    }
 +
 +    return (marker == CPT_MAGIC2) ? 0 : -1;
 +}
 +
 +/* Transfer the t_state entries whose bit is set in fflags, in the order of
 + * the est* enum values, stopping at the first entry that returns non-zero.
 + * state->flags is handed to the helpers as sflags; they use it to decide
 + * whether an entry is stored in state or only passed through a temporary
 + * buffer.  An entry index without a case below is reported via gmx_fatal().
 + * Returns the status of the last helper called (0 when all succeeded).
 + */
 +static int do_cpt_state(XDR *xd,gmx_bool bRead,
 +                        int fflags,t_state *state,
 +                        gmx_bool bReadRNG,FILE *list)
 +{
 +    int  sflags;
 +    int  **rng_p,**rngi_p;
 +    int  i;
 +    int  ret;
 +    int  nnht,nnhtp;
 +
 +    ret = 0;
 +    
 +    /* Element counts of the Nose-Hoover chain arrays */
 +    nnht = state->nhchainlength*state->ngtc;
 +    nnhtp = state->nhchainlength*state->nnhpres;
 +
 +    if (bReadRNG)
 +    {
 +        rng_p  = (int **)&state->ld_rng;
 +        rngi_p = &state->ld_rngi;
 +    }
 +    else
 +    {
 +        /* Do not read the RNG data: with a NULL pointer do_cpte_ints()
 +         * passes the values through a temporary buffer only.
 +         */
 +        rng_p  = NULL;
 +        rngi_p = NULL;
 +    }
 +    /* We want the MC_RNG the same across all the nodes for now -- lambda MC is global */
 +
 +    sflags = state->flags;
 +    for(i=0; (i<estNR && ret == 0); i++)
 +    {
 +        if (fflags & (1<<i))
 +        {
 +            switch (i)
 +            {
 +            case estLAMBDA:  ret = do_cpte_reals(xd,cptpEST,i,sflags,efptNR,&(state->lambda),list); break;
 +            case estFEPSTATE: ret = do_cpte_int (xd,cptpEST,i,sflags,&state->fep_state,list); break;
 +            case estBOX:     ret = do_cpte_matrix(xd,cptpEST,i,sflags,state->box,list); break;
 +            case estBOX_REL: ret = do_cpte_matrix(xd,cptpEST,i,sflags,state->box_rel,list); break;
 +            case estBOXV:    ret = do_cpte_matrix(xd,cptpEST,i,sflags,state->boxv,list); break;
 +            case estPRES_PREV: ret = do_cpte_matrix(xd,cptpEST,i,sflags,state->pres_prev,list); break;
 +            case estSVIR_PREV:  ret = do_cpte_matrix(xd,cptpEST,i,sflags,state->svir_prev,list); break;
 +            case estFVIR_PREV:  ret = do_cpte_matrix(xd,cptpEST,i,sflags,state->fvir_prev,list); break;
 +            case estNH_XI:   ret = do_cpte_doubles(xd,cptpEST,i,sflags,nnht,&state->nosehoover_xi,list); break;
 +            case estNH_VXI:  ret = do_cpte_doubles(xd,cptpEST,i,sflags,nnht,&state->nosehoover_vxi,list); break;
 +            case estNHPRES_XI:   ret = do_cpte_doubles(xd,cptpEST,i,sflags,nnhtp,&state->nhpres_xi,list); break;
 +            case estNHPRES_VXI:  ret = do_cpte_doubles(xd,cptpEST,i,sflags,nnhtp,&state->nhpres_vxi,list); break;
 +            case estTC_INT:  ret = do_cpte_doubles(xd,cptpEST,i,sflags,state->ngtc,&state->therm_integral,list); break;
 +            case estVETA:    ret = do_cpte_real(xd,cptpEST,i,sflags,&state->veta,list); break;
 +            case estVOL0:    ret = do_cpte_real(xd,cptpEST,i,sflags,&state->vol0,list); break;
 +            case estX:       ret = do_cpte_rvecs(xd,cptpEST,i,sflags,state->natoms,&state->x,list); break;
 +            case estV:       ret = do_cpte_rvecs(xd,cptpEST,i,sflags,state->natoms,&state->v,list); break;
 +            case estSDX:     ret = do_cpte_rvecs(xd,cptpEST,i,sflags,state->natoms,&state->sd_X,list); break;
 +            case estLD_RNG:  ret = do_cpte_ints(xd,cptpEST,i,sflags,state->nrng,rng_p,list); break;
 +            case estLD_RNGI: ret = do_cpte_ints(xd,cptpEST,i,sflags,state->nrngi,rngi_p,list); break;
 +            case estMC_RNG:  ret = do_cpte_ints(xd,cptpEST,i,sflags,state->nmcrng,(int **)&state->mc_rng,list); break;
 +            case estMC_RNGI: ret = do_cpte_ints(xd,cptpEST,i,sflags,1,&state->mc_rngi,list); break;
 +            case estDISRE_INITF:  ret = do_cpte_real (xd,cptpEST,i,sflags,&state->hist.disre_initf,list); break;
 +            case estDISRE_RM3TAV: ret = do_cpte_reals(xd,cptpEST,i,sflags,state->hist.ndisrepairs,&state->hist.disre_rm3tav,list); break;
 +            case estORIRE_INITF:  ret = do_cpte_real (xd,cptpEST,i,sflags,&state->hist.orire_initf,list); break;
 +            case estORIRE_DTAV:   ret = do_cpte_reals(xd,cptpEST,i,sflags,state->hist.norire_Dtav,&state->hist.orire_Dtav,list); break;
 +            default:
 +                gmx_fatal(FARGS,"Unknown state entry %d\n"
 +                          "You are probably reading a new checkpoint file with old code",i);
 +            }
 +        }
 +    }
 +    
 +    return ret;
 +}
 +
 +/* Transfer the ekinstate_t entries whose bit is set in fflags, in the order
 + * of the eeks* enum values, stopping at the first entry that returns
 + * non-zero.  fflags is also what the helpers receive as sflags.
 + * Every helper is called with the part id cptpEEKS and the entry index i,
 + * which the helpers use for the sflags bit test and for the entry name in
 + * messages and listings.
 + * An entry index without a case below is reported via gmx_fatal().
 + * Returns the status of the last helper called (0 when all succeeded).
 + */
 +static int do_cpt_ekinstate(XDR *xd,gmx_bool bRead,
 +                            int fflags,ekinstate_t *ekins,
 +                            FILE *list)
 +{
 +    int  i;
 +    int  ret;
 +
 +    ret = 0;
 +
 +    for(i=0; (i<eeksNR && ret == 0); i++)
 +    {
 +        if (fflags & (1<<i))
 +        {
 +            switch (i)
 +            {
 +
 +            case eeksEKIN_N:     ret = do_cpte_int(xd,cptpEEKS,i,fflags,&ekins->ekin_n,list); break;
 +            case eeksEKINH :     ret = do_cpte_matrices(xd,cptpEEKS,i,fflags,ekins->ekin_n,&ekins->ekinh,list); break;
 +            case eeksEKINF:      ret = do_cpte_matrices(xd,cptpEEKS,i,fflags,ekins->ekin_n,&ekins->ekinf,list); break;
 +            case eeksEKINO:      ret = do_cpte_matrices(xd,cptpEEKS,i,fflags,ekins->ekin_n,&ekins->ekinh_old,list); break;
 +            case eeksEKINTOTAL:  ret = do_cpte_matrix(xd,cptpEEKS,i,fflags,ekins->ekin_total,list); break;
 +            case eeksEKINSCALEF: ret = do_cpte_doubles(xd,cptpEEKS,i,fflags,ekins->ekin_n,&ekins->ekinscalef_nhc,list); break;
 +            case eeksVSCALE:     ret = do_cpte_doubles(xd,cptpEEKS,i,fflags,ekins->ekin_n,&ekins->vscale_nhc,list); break;
 +            case eeksEKINSCALEH: ret = do_cpte_doubles(xd,cptpEEKS,i,fflags,ekins->ekin_n,&ekins->ekinscaleh_nhc,list); break;
 +            case eeksDEKINDL :   ret = do_cpte_real(xd,cptpEEKS,i,fflags,&ekins->dekindl,list); break;
 +            case eeksMVCOS:      ret = do_cpte_real(xd,cptpEEKS,i,fflags,&ekins->mvcos,list); break;
 +            default:
 +                gmx_fatal(FARGS,"Unknown ekin data state entry %d\n"
 +                          "You are probably reading a new checkpoint file with old code",i);
 +            }
 +        }
 +    }
 +
 +    return ret;
 +}
 +
 +
 +/* Transfer the energy history entries whose bit is set in fflags, in the
 + * order of the eenh* enum values, stopping at the first helper that returns
 + * non-zero.  fflags is also what the helpers receive as sflags.
 + * With bRead set the counters in enerhist are reset first and enerhist->dht
 + * is allocated when the eenhENERGY_DELTA_H_NN bit is set; the per-list
 + * arrays are allocated once the number of lists has been transferred.
 + * After the loop, sums and step counts that are missing from fflags are
 + * filled in from their older counterparts (sum -> sum_sim, nsum -> nsteps,
 + * nsum_sim -> nsteps_sim).
 + * An entry index without a case below is reported via gmx_fatal().
 + * Returns the status of the last helper called (0 when all succeeded).
 + */
 +static int do_cpt_enerhist(XDR *xd,gmx_bool bRead,
 +                           int fflags,energyhistory_t *enerhist,
 +                           FILE *list)
 +{
 +    int  i;
 +    int  j;
 +    int  ret;
 +
 +    ret = 0;
 +
 +    if (bRead)
 +    {
 +        enerhist->nsteps     = 0;
 +        enerhist->nsum       = 0;
 +        enerhist->nsteps_sim = 0;
 +        enerhist->nsum_sim   = 0;
 +        enerhist->dht        = NULL;
 +
 +        if (fflags & (1<< eenhENERGY_DELTA_H_NN) )
 +        {
 +            snew(enerhist->dht,1);
 +            enerhist->dht->ndh = NULL;
 +            enerhist->dht->dh = NULL;
 +            enerhist->dht->start_lambda_set=FALSE;
 +        }
 +    }
 +
 +    for(i=0; (i<eenhNR && ret == 0); i++)
 +    {
 +        if (fflags & (1<<i))
 +        {
 +            switch (i)
 +            {
 +            case eenhENERGY_N:     ret = do_cpte_int(xd,cptpEENH,i,fflags,&enerhist->nener,list); break;
 +            case eenhENERGY_AVER:  ret = do_cpte_doubles(xd,cptpEENH,i,fflags,enerhist->nener,&enerhist->ener_ave,list); break;
 +            case eenhENERGY_SUM:   ret = do_cpte_doubles(xd,cptpEENH,i,fflags,enerhist->nener,&enerhist->ener_sum,list); break;
 +            case eenhENERGY_NSUM:  do_cpt_step_err(xd,eenh_names[i],&enerhist->nsum,list); break;
 +            case eenhENERGY_SUM_SIM: ret = do_cpte_doubles(xd,cptpEENH,i,fflags,enerhist->nener,&enerhist->ener_sum_sim,list); break;
 +            case eenhENERGY_NSUM_SIM:   do_cpt_step_err(xd,eenh_names[i],&enerhist->nsum_sim,list); break;
 +            case eenhENERGY_NSTEPS:     do_cpt_step_err(xd,eenh_names[i],&enerhist->nsteps,list); break;
 +            case eenhENERGY_NSTEPS_SIM: do_cpt_step_err(xd,eenh_names[i],&enerhist->nsteps_sim,list); break;
 +            case eenhENERGY_DELTA_H_NN: do_cpt_int_err(xd,eenh_names[i], &(enerhist->dht->nndh), list);
 +                if (bRead) /* now allocate memory for it */
 +                {
 +                    snew(enerhist->dht->dh, enerhist->dht->nndh);
 +                    snew(enerhist->dht->ndh, enerhist->dht->nndh);
 +                    for(j=0;j<enerhist->dht->nndh;j++)
 +                    {
 +                        enerhist->dht->ndh[j] = 0;
 +                        enerhist->dht->dh[j] = NULL;
 +                    }
 +                }
 +                break;
 +            case eenhENERGY_DELTA_H_LIST:
 +                /* Stop at the first failing list so that a later success
 +                 * cannot overwrite the error status.
 +                 */
 +                for(j=0; (j<enerhist->dht->nndh && ret == 0); j++)
 +                {
 +                    ret=do_cpte_n_reals(xd, cptpEENH, i, fflags, &enerhist->dht->ndh[j], &(enerhist->dht->dh[j]), list);
 +                }
 +                break;
 +            case eenhENERGY_DELTA_H_STARTTIME:
 +                ret=do_cpte_double(xd, cptpEENH, i, fflags, &(enerhist->dht->start_time), list); break;
 +            case eenhENERGY_DELTA_H_STARTLAMBDA:
 +                ret=do_cpte_double(xd, cptpEENH, i, fflags, &(enerhist->dht->start_lambda), list); break;
 +            default:
 +                gmx_fatal(FARGS,"Unknown energy history entry %d\n"
 +                          "You are probably reading a new checkpoint file with old code",i);
 +            }
 +        }
 +    }
 +
 +    /* fflags is a by-value parameter: the updates below are local to this
 +     * function and do not reach the caller.
 +     */
 +    if ((fflags & (1<<eenhENERGY_SUM)) && !(fflags & (1<<eenhENERGY_SUM_SIM)))
 +    {
 +        /* Assume we have an old file format and copy sum to sum_sim */
 +        srenew(enerhist->ener_sum_sim,enerhist->nener);
 +        for(i=0; i<enerhist->nener; i++)
 +        {
 +            enerhist->ener_sum_sim[i] = enerhist->ener_sum[i];
 +        }
 +        fflags |= (1<<eenhENERGY_SUM_SIM);
 +    }
 +
 +    if ( (fflags & (1<<eenhENERGY_NSUM)) &&
 +        !(fflags & (1<<eenhENERGY_NSTEPS)))
 +    {
 +        /* Assume we have an old file format and copy nsum to nsteps */
 +        enerhist->nsteps = enerhist->nsum;
 +        fflags |= (1<<eenhENERGY_NSTEPS);
 +    }
 +    if ( (fflags & (1<<eenhENERGY_NSUM_SIM)) &&
 +        !(fflags & (1<<eenhENERGY_NSTEPS_SIM)))
 +    {
 +        /* Assume we have an old file format and copy nsum_sim to nsteps_sim */
 +        enerhist->nsteps_sim = enerhist->nsum_sim;
 +        fflags |= (1<<eenhENERGY_NSTEPS_SIM);
 +    }
 +
 +    return ret;
 +}
 +
 +/* Transfer the df_history_t entries whose bit is set in fflags, in the
 + * order of the edfh* enum values, stopping at the first helper that returns
 + * non-zero.  fflags is also what the helpers receive as sflags.
 + * dfhist->nlambda is used as the length of every per-lambda array and as
 + * the dimension of the square arrays handled by do_cpte_nmatrix().
 + * An entry index without a case below is reported via gmx_fatal().
 + * Returns the status of the last helper called (0 when all succeeded).
 + */
 +static int do_cpt_df_hist(XDR *xd,gmx_bool bRead,int fflags,df_history_t *dfhist,FILE *list)
 +{
 +    int  i,nlambda;
 +    int  ret;
 +
 +    nlambda = dfhist->nlambda;
 +    ret = 0;
 +
 +    for(i=0; (i<edfhNR && ret == 0); i++)
 +    {
 +        if (fflags & (1<<i))
 +        {
 +            switch (i)
 +            {
 +            case edfhBEQUIL:       ret = do_cpte_int(xd,cptpEDFH,i,fflags,&dfhist->bEquil,list); break;
 +            case edfhNATLAMBDA:    ret = do_cpte_ints(xd,cptpEDFH,i,fflags,nlambda,&dfhist->n_at_lam,list); break;
 +            case edfhWLHISTO:      ret = do_cpte_reals(xd,cptpEDFH,i,fflags,nlambda,&dfhist->wl_histo,list); break;
 +            case edfhWLDELTA:      ret = do_cpte_real(xd,cptpEDFH,i,fflags,&dfhist->wl_delta,list); break;
 +            case edfhSUMWEIGHTS:   ret = do_cpte_reals(xd,cptpEDFH,i,fflags,nlambda,&dfhist->sum_weights,list); break;
 +            case edfhSUMDG:        ret = do_cpte_reals(xd,cptpEDFH,i,fflags,nlambda,&dfhist->sum_dg,list); break;
 +            case edfhSUMMINVAR:    ret = do_cpte_reals(xd,cptpEDFH,i,fflags,nlambda,&dfhist->sum_minvar,list); break;
 +            case edfhSUMVAR:       ret = do_cpte_reals(xd,cptpEDFH,i,fflags,nlambda,&dfhist->sum_variance,list); break;
 +            case edfhACCUMP:       ret = do_cpte_nmatrix(xd,cptpEDFH,i,fflags,nlambda,dfhist->accum_p,list); break;
 +            case edfhACCUMM:       ret = do_cpte_nmatrix(xd,cptpEDFH,i,fflags,nlambda,dfhist->accum_m,list); break;
 +            case edfhACCUMP2:      ret = do_cpte_nmatrix(xd,cptpEDFH,i,fflags,nlambda,dfhist->accum_p2,list); break;
 +            case edfhACCUMM2:      ret = do_cpte_nmatrix(xd,cptpEDFH,i,fflags,nlambda,dfhist->accum_m2,list); break;
 +            case edfhTIJ:          ret = do_cpte_nmatrix(xd,cptpEDFH,i,fflags,nlambda,dfhist->Tij,list); break;
 +            case edfhTIJEMP:       ret = do_cpte_nmatrix(xd,cptpEDFH,i,fflags,nlambda,dfhist->Tij_empirical,list); break;
 +
 +            default:
 +                gmx_fatal(FARGS,"Unknown df history entry %d\n"
 +                          "You are probably reading a new checkpoint file with old code",i);
 +            }
 +        }
 +    }
 +
 +    return ret;
 +}
 +
 +/* Transfer the list of output files: the number of files, then per file
 + * its name, its offset as two ints (high and low part) and, from file
 + * version 8 on, the checksum size and 16 checksum bytes.
 + * With bRead set, *p_outputfiles is allocated here with *nfiles entries and
 + * filled in; otherwise the entries of *p_outputfiles are passed to the
 + * stream.  For file versions before 8 chksum_size is set to -1.
 + * Returns 0 on success and -1 as soon as one of the int or checksum
 + * transfers fails; a failing file name transfer is handled inside
 + * do_cpt_string_err().
 + */
 +static int do_cpt_files(XDR *xd, gmx_bool bRead, 
 +                        gmx_file_position_t **p_outputfiles, int *nfiles, 
 +                        FILE *list, int file_version)
 +{
 +    int    i,j;
 +    gmx_off_t  offset;
 +    gmx_off_t  mask = 0xFFFFFFFFL;
 +    int    offset_high,offset_low;
 +    char   *buf;
 +    gmx_file_position_t *outputfiles;
 +
 +    if (do_cpt_int(xd,"number of output files",nfiles,list) != 0)
 +    {
 +        return -1;
 +    }
 +
 +    if(bRead)
 +    {
 +        snew(*p_outputfiles,*nfiles);
 +    }
 +
 +    outputfiles = *p_outputfiles;
 +
 +    for(i=0;i<*nfiles;i++)
 +    {
 +        /* 64-bit XDR numbers are not portable, so it is stored as separate high/low fractions */
 +        if(bRead)
 +        {
 +            /* NOTE(review): the sfree(buf) below is guarded by list==NULL,
 +             * which suggests do_cpt_string_err() already frees the string
 +             * when list is non-NULL; if so, the strncpy() reads freed
 +             * memory in list mode -- confirm against do_cpt_string_err().
 +             */
 +            do_cpt_string_err(xd,bRead,"output filename",&buf,list);
 +            strncpy(outputfiles[i].filename,buf,CPTSTRLEN-1);
 +            if(list==NULL)
 +            {
 +                sfree(buf);			
 +            }
 +
 +            if (do_cpt_int(xd,"file_offset_high",&offset_high,list) != 0)
 +            {
 +                return -1;
 +            }
 +            if (do_cpt_int(xd,"file_offset_low",&offset_low,list) != 0)
 +            {
 +                return -1;
 +            }
 +            /* Recombine the halves; with a 4-byte gmx_off_t only the low
 +             * part is used.
 +             */
 +#if (SIZEOF_GMX_OFF_T > 4)
 +            outputfiles[i].offset = ( ((gmx_off_t) offset_high) << 32 ) | ( (gmx_off_t) offset_low & mask );
 +#else
 +            outputfiles[i].offset = offset_low;
 +#endif
 +        }
 +        else
 +        {
 +            buf = outputfiles[i].filename;
 +            do_cpt_string_err(xd,bRead,"output filename",&buf,list);
 +            /* writing */
 +            offset      = outputfiles[i].offset;
 +            /* An offset of -1 is passed on as -1 in both halves */
 +            if (offset == -1)
 +            {
 +                offset_low  = -1;
 +                offset_high = -1;
 +            }
 +            else
 +            {
 +#if (SIZEOF_GMX_OFF_T > 4)
 +                offset_low  = (int) (offset & mask);
 +                offset_high = (int) ((offset >> 32) & mask);
 +#else
 +                offset_low  = offset;
 +                offset_high = 0;
 +#endif
 +            }
 +            if (do_cpt_int(xd,"file_offset_high",&offset_high,list) != 0)
 +            {
 +                return -1;
 +            }
 +            if (do_cpt_int(xd,"file_offset_low",&offset_low,list) != 0)
 +            {
 +                return -1;
 +            }
 +        }
 +        if (file_version >= 8)
 +        {
 +            if (do_cpt_int(xd,"file_checksum_size",&(outputfiles[i].chksum_size),
 +                           list) != 0)
 +            {
 +                return -1;
 +            }
 +            if (do_cpt_u_chars(xd,"file_checksum",16,outputfiles[i].chksum,list) != 0)
 +            {
 +                return -1;
 +            }
 +        } 
 +        else 
 +        {
 +            outputfiles[i].chksum_size = -1;
 +        }
 +    }
 +    return 0;
 +}
 +
 +
 +/* Write the complete simulation state to the checkpoint file fn.
 + *
 + * The data is first written to a temporary file whose name is fn with
 + * "_step<step>" inserted in front of the extension.  After that all open
 + * output files are fsync'ed.  Unless bNumberAndKeep is set or the fsync
 + * failed, an already existing fn is saved under the "_prev" name and the
 + * temporary file is renamed to fn.
 + *
 + * fn              name of the checkpoint file to produce
 + * bNumberAndKeep  when TRUE the step-numbered file is left in place
 + * fplog           log file, may be NULL
 + * cr              communication record, supplies the node counts
 + * eIntegrator, simulation_part, step, t
 + *                 values stored in the checkpoint header
 + * bExpanded       selects whether dfhist entries are flagged for writing
 + * elamstats       selects which of the dfhist entries are flagged
 + * state           the state to write
 + *
 + * Write, close and rename failures are reported through gmx_file(); an
 + * fsync failure is reported through gmx_warning() instead when the
 + * environment variable named by GMX_IGNORE_FSYNC_FAILURE_ENV is set.
 + */
 +void write_checkpoint(const char *fn,gmx_bool bNumberAndKeep,
 +                      FILE *fplog,t_commrec *cr,
 +                      int eIntegrator,int simulation_part,
 +                      gmx_bool bExpanded, int elamstats,
 +                      gmx_large_int_t step,double t,t_state *state)
 +{
 +    t_fileio *fp;
 +    int  file_version;
 +    char *version;
 +    char *btime;
 +    char *buser;
-     char *bmach;
++    char *bhost;
 +    int  double_prec;
 +    char *fprog;
 +    char *fntemp; /* the temporary checkpoint file name */
 +    char *fnprev; /* the name under which the previous checkpoint is kept */
 +    size_t base_len; /* length of fn without the '.' and the extension */
 +    time_t now;
 +    char timebuf[STRLEN];
 +    int  nppnodes,npmenodes,flag_64bit;
 +    char buf[1024],suffix[5+STEPSTRSIZE],sbuf[STEPSTRSIZE];
 +    gmx_file_position_t *outputfiles;
 +    int  noutputfiles;
 +    char *ftime;
 +    int  flags_eks,flags_enh,flags_dfh,i;
 +    t_fileio *ret;
 +
 +    if (PAR(cr))
 +    {
 +        if (DOMAINDECOMP(cr))
 +        {
 +            nppnodes  = cr->dd->nnodes;
 +            npmenodes = cr->npmenodes;
 +        }
 +        else
 +        {
 +            nppnodes  = cr->nnodes;
 +            npmenodes = 0;
 +        }
 +    }
 +    else
 +    {
 +        nppnodes  = 1;
 +        npmenodes = 0;
 +    }
 +
 +    /* The same split point is needed for the temporary and the "_prev"
 +     * file name, so compute it only once.
 +     */
 +    base_len = strlen(fn) - strlen(ftp2ext(fn2ftp(fn))) - 1;
 +
 +    /* make the new temporary filename */
 +    snew(fntemp, strlen(fn)+5+STEPSTRSIZE);
 +    strcpy(fntemp,fn);
 +    fntemp[base_len] = '\0';
 +    sprintf(suffix,"_%s%s","step",gmx_step_str(step,sbuf));
 +    strcat(fntemp,suffix);
 +    strcat(fntemp,fn+base_len);
 +
 +    time(&now);
 +    gmx_ctime_r(&now,timebuf,STRLEN);
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog,"Writing checkpoint, step %s at %s\n\n",
 +                gmx_step_str(step,buf),timebuf);
 +    }
 +
 +    /* Get offsets for open files */
 +    gmx_fio_get_output_file_positions(&outputfiles, &noutputfiles);
 +
 +    fp = gmx_fio_open(fntemp,"w");
 +
 +    /* Select the kinetic energy entries, only when they are up to date */
 +    if (state->ekinstate.bUpToDate)
 +    {
 +        flags_eks =
 +            ((1<<eeksEKIN_N) | (1<<eeksEKINH) | (1<<eeksEKINF) |
 +             (1<<eeksEKINO) | (1<<eeksEKINSCALEF) | (1<<eeksEKINSCALEH) |
 +             (1<<eeksVSCALE) | (1<<eeksDEKINDL) | (1<<eeksMVCOS));
 +    }
 +    else
 +    {
 +        flags_eks = 0;
 +    }
 +
 +    /* Select the energy history entries that actually hold data */
 +    flags_enh = 0;
 +    if (state->enerhist.nsum > 0 || state->enerhist.nsum_sim > 0)
 +    {
 +        flags_enh |= (1<<eenhENERGY_N);
 +        if (state->enerhist.nsum > 0)
 +        {
 +            flags_enh |= ((1<<eenhENERGY_AVER) | (1<<eenhENERGY_SUM) |
 +                          (1<<eenhENERGY_NSTEPS) | (1<<eenhENERGY_NSUM));
 +        }
 +        if (state->enerhist.nsum_sim > 0)
 +        {
 +            flags_enh |= ((1<<eenhENERGY_SUM_SIM) | (1<<eenhENERGY_NSTEPS_SIM) |
 +                          (1<<eenhENERGY_NSUM_SIM));
 +        }
 +        if (state->enerhist.dht)
 +        {
 +            flags_enh |= ( (1<< eenhENERGY_DELTA_H_NN) |
 +                           (1<< eenhENERGY_DELTA_H_LIST) |
 +                           (1<< eenhENERGY_DELTA_H_STARTTIME) |
 +                           (1<< eenhENERGY_DELTA_H_STARTLAMBDA) );
 +        }
 +    }
 +
 +    /* Select the expanded ensemble history entries */
 +    if (bExpanded)
 +    {
 +        flags_dfh = ((1<<edfhBEQUIL) | (1<<edfhNATLAMBDA) | (1<<edfhSUMWEIGHTS) |  (1<<edfhSUMDG)  |
 +                     (1<<edfhTIJ) | (1<<edfhTIJEMP));
 +        if (EWL(elamstats))
 +        {
 +            flags_dfh |= ((1<<edfhWLDELTA) | (1<<edfhWLHISTO));
 +        }
 +        if ((elamstats == elamstatsMINVAR) || (elamstats == elamstatsBARKER) || (elamstats == elamstatsMETROPOLIS))
 +        {
 +            flags_dfh |= ((1<<edfhACCUMP) | (1<<edfhACCUMM) | (1<<edfhACCUMP2) | (1<<edfhACCUMM2)
 +                          | (1<<edfhSUMMINVAR) | (1<<edfhSUMVAR));
 +        }
 +    } else {
 +        flags_dfh = 0;
 +    }
++    
++    /* We can check many more things now (CPU, acceleration, etc), but
++     * it is highly unlikely to have two separate builds with exactly
++     * the same version, user, time, and build host!
++     */
 +
 +    version = gmx_strdup(VERSION);
 +    btime   = gmx_strdup(BUILD_TIME);
 +    buser   = gmx_strdup(BUILD_USER);
-     bmach   = gmx_strdup(BUILD_MACHINE);
++    bhost   = gmx_strdup(BUILD_HOST);
++
 +    double_prec = GMX_CPT_BUILD_DP;
 +    fprog   = gmx_strdup(Program());
 +
 +    ftime   = &(timebuf[0]);
 +
 +    do_cpt_header(gmx_fio_getxdr(fp),FALSE,&file_version,
-                   &version,&btime,&buser,&bmach,&double_prec,&fprog,&ftime,
++                  &version,&btime,&buser,&bhost,&double_prec,&fprog,&ftime,
 +                  &eIntegrator,&simulation_part,&step,&t,&nppnodes,
 +                  DOMAINDECOMP(cr) ? cr->dd->nc : NULL,&npmenodes,
 +                  &state->natoms,&state->ngtc,&state->nnhpres,
 +                  &state->nhchainlength,&(state->dfhist.nlambda),&state->flags,&flags_eks,&flags_enh,&flags_dfh,
 +                  NULL);
 +
 +    sfree(version);
 +    sfree(btime);
 +    sfree(buser);
-     sfree(bmach);
++    sfree(bhost);
 +    sfree(fprog);
 +
 +    if((do_cpt_state(gmx_fio_getxdr(fp),FALSE,state->flags,state,TRUE,NULL) < 0)        ||
 +       (do_cpt_ekinstate(gmx_fio_getxdr(fp),FALSE,flags_eks,&state->ekinstate,NULL) < 0)||
 +       (do_cpt_enerhist(gmx_fio_getxdr(fp),FALSE,flags_enh,&state->enerhist,NULL) < 0)  ||
 +       (do_cpt_df_hist(gmx_fio_getxdr(fp),FALSE,flags_dfh,&state->dfhist,NULL) < 0)  ||
 +       (do_cpt_files(gmx_fio_getxdr(fp),FALSE,&outputfiles,&noutputfiles,NULL,
 +                     file_version) < 0))
 +    {
 +        gmx_file("Cannot read/write checkpoint; corrupt file, or maybe you are out of disk space?");
 +    }
 +
 +    do_cpt_footer(gmx_fio_getxdr(fp),FALSE,file_version);
 +
 +    /* we really, REALLY, want to make sure to physically write the checkpoint,
 +       and all the files it depends on, out to disk. Because we've
 +       opened the checkpoint with gmx_fio_open(), it's in our list
 +       of open files.  */
 +    ret=gmx_fio_all_output_fsync();
 +
 +    if (ret)
 +    {
 +        /* Separate buffer, so the outer buf is not shadowed */
 +        char fsync_msg[STRLEN];
 +        sprintf(fsync_msg,
 +                "Cannot fsync '%s'; maybe you are out of disk space?",
 +                gmx_fio_getname(ret));
 +
 +        if (getenv(GMX_IGNORE_FSYNC_FAILURE_ENV)==NULL)
 +        {
 +            gmx_file(fsync_msg);
 +        }
 +        else
 +        {
 +            gmx_warning(fsync_msg);
 +        }
 +    }
 +
 +    if( gmx_fio_close(fp) != 0)
 +    {
 +        gmx_file("Cannot read/write checkpoint; corrupt file, or maybe you are out of disk space?");
 +    }
 +
 +    /* we don't move the checkpoint if the user specified they didn't want it,
 +       or if the fsyncs failed */
 +    if (!bNumberAndKeep && !ret)
 +    {
 +        if (gmx_fexist(fn))
 +        {
 +            /* Rename the previous checkpoint file.
 +             * The name is allocated to size, as fn can be longer than
 +             * any fixed size buffer.
 +             */
 +            snew(fnprev, strlen(fn)+strlen("_prev")+1);
 +            strcpy(fnprev,fn);
 +            fnprev[base_len] = '\0';
 +            strcat(fnprev,"_prev");
 +            strcat(fnprev,fn+base_len);
 +#ifndef GMX_FAHCORE
 +            /* we copy here so that if something goes wrong between now and
 +             * the rename below, there's always a state.cpt.
 +             * If renames are atomic (such as in POSIX systems),
 +             * this copying should be unnecessary.
 +             */
 +            gmx_file_copy(fn, fnprev, FALSE);
 +            /* We don't really care if this fails:
 +             * there's already a new checkpoint.
 +             */
 +#else
 +            gmx_file_rename(fn, fnprev);
 +#endif
 +            sfree(fnprev);
 +        }
 +        if (gmx_file_rename(fntemp, fn) != 0)
 +        {
 +            gmx_file("Cannot rename checkpoint file; maybe you are out of disk space?");
 +        }
 +    }
 +
 +    sfree(outputfiles);
 +    sfree(fntemp);
 +
 +#ifdef GMX_FAHCORE
 +    /*code for alternate checkpointing scheme.  moved from top of loop over
 +      steps */
 +    fcRequestCheckPoint();
 +    if ( fcCheckPointParallel( cr->nodeid, NULL,0) == 0 ) {
 +        /* step is a gmx_large_int_t, so it can not be printed with %d */
 +        gmx_fatal( 3,__FILE__,__LINE__, "Checkpoint error on step %s\n", gmx_step_str(step,sbuf) );
 +    }
 +#endif /* end GMX_FAHCORE block */
 +}
 +
 +/* Print a table to fplog with one row for every state entry that is
 + * flagged in the simulation flags (sflags) or in the checkpoint file
 + * flags (fflags), telling for each of the two whether the entry is present.
 + */
 +static void print_flag_mismatch(FILE *fplog,int sflags,int fflags)
 +{
 +    const char *row_fmt     = "  %24s    %11s    %11s\n";
 +    const char *present     = "  present  ";
 +    const char *not_present = "not present";
 +    int         entry,bit;
 +
 +    fprintf(fplog,"\nState entry mismatch between the simulation and the checkpoint file\n");
 +    fprintf(fplog,"Entries which are not present in the checkpoint file will not be updated\n");
 +    fprintf(fplog,row_fmt,"","simulation","checkpoint");
 +
 +    for(entry=0; entry<estNR; entry++)
 +    {
 +        bit = (1<<entry);
 +        if (((sflags | fflags) & bit) == 0)
 +        {
 +            /* Entry is used on neither side, nothing to report */
 +            continue;
 +        }
 +        fprintf(fplog,row_fmt,
 +                est_names[entry],
 +                (sflags & bit) ? present : not_present,
 +                (fflags & bit) ? present : not_present);
 +    }
 +}
 +
 +/* Compare the integer p of the current program with the value f from the
 + * checkpoint file.  When they differ both are printed, labelled with type,
 + * to fplog (or to stderr when fplog is NULL) and *mm is set to TRUE.
 + * *mm is left untouched when the values agree.
 + */
 +static void check_int(FILE *fplog,const char *type,int p,int f,gmx_bool *mm)
 +{
 +    FILE *out;
 +
 +    if (p == f)
 +    {
 +        return;
 +    }
 +
 +    out = fplog;
 +    if (out == NULL)
 +    {
 +        out = stderr;
 +    }
 +
 +    fprintf(out,"  %s mismatch,\n",type);
 +    fprintf(out,"    current program: %d\n",p);
 +    fprintf(out,"    checkpoint file: %d\n",f);
 +    fprintf(out,"\n");
 +    *mm = TRUE;
 +}
 +
 +/* Compare the string p of the current program with the string f from the
 + * checkpoint file.  When they differ both are printed, labelled with type,
 + * to fplog (or to stderr when fplog is NULL) and *mm is set to TRUE.
 + * *mm is left untouched when the strings are equal.
 + */
 +static void check_string(FILE *fplog,const char *type,const char *p,
 +                         const char *f,gmx_bool *mm)
 +{
 +    FILE *out;
 +
 +    if (strcmp(p,f) == 0)
 +    {
 +        return;
 +    }
 +
 +    out = fplog;
 +    if (out == NULL)
 +    {
 +        out = stderr;
 +    }
 +
 +    fprintf(out,"  %s mismatch,\n",type);
 +    fprintf(out,"    current program: %s\n",p);
 +    fprintf(out,"    checkpoint file: %s\n",f);
 +    fprintf(out,"\n");
 +    *mm = TRUE;
 +}
 +
 +/* Compare the build information and the parallel setup stored in the
 + * checkpoint file with those of the running program.
 + *
 + * Every difference is reported through check_string()/check_int().
 + * When at least one is found, a note is printed to stderr and,
 + * when fplog is not NULL, to the log file as well.
 + *
 + * Note that dd_nc is overwritten with 1,1,1 when bPartDecomp is set.
 + */
 +static void check_match(FILE *fplog,
 +                        char *version,
-                         char *btime,char *buser,char *bmach,int double_prec,
++                        char *btime,char *buser,char *bhost,int double_prec,
 +                        char *fprog,
 +                        t_commrec *cr,gmx_bool bPartDecomp,int npp_f,int npme_f,
 +                        ivec dd_nc,ivec dd_nc_f)
 +{
 +    gmx_bool mm = FALSE;
 +    int      npp;
 +
 +    /* The binary that wrote the file versus this one */
 +    check_string(fplog,"Version"      ,VERSION      ,version,&mm);
 +    check_string(fplog,"Build time"   ,BUILD_TIME   ,btime  ,&mm);
 +    check_string(fplog,"Build user"   ,BUILD_USER   ,buser  ,&mm);
-     check_string(fplog,"Build machine",BUILD_MACHINE,bmach  ,&mm);
++    check_string(fplog,"Build host"   ,BUILD_HOST   ,bhost  ,&mm);
 +    check_int   (fplog,"Double prec." ,GMX_CPT_BUILD_DP,double_prec,&mm);
 +    check_string(fplog,"Program name" ,Program()    ,fprog  ,&mm);
 +
 +    /* The parallel setup */
 +    check_int   (fplog,"#nodes"       ,cr->nnodes   ,npp_f+npme_f ,&mm);
 +    if (bPartDecomp)
 +    {
 +        dd_nc[XX] = 1;
 +        dd_nc[YY] = 1;
 +        dd_nc[ZZ] = 1;
 +    }
 +    if (cr->nnodes > 1)
 +    {
 +        check_int (fplog,"#PME-nodes"  ,cr->npmenodes,npme_f     ,&mm);
 +
 +        /* A negative PME node count is not subtracted */
 +        npp = cr->nnodes - ((cr->npmenodes >= 0) ? cr->npmenodes : 0);
 +
 +        /* The grid is only compared for an equal number of PP nodes */
 +        if (npp == npp_f)
 +        {
 +            check_int (fplog,"#DD-cells[x]",dd_nc[XX]    ,dd_nc_f[XX],&mm);
 +            check_int (fplog,"#DD-cells[y]",dd_nc[YY]    ,dd_nc_f[YY],&mm);
 +            check_int (fplog,"#DD-cells[z]",dd_nc[ZZ]    ,dd_nc_f[ZZ],&mm);
 +        }
 +    }
 +
 +    if (!mm)
 +    {
 +        return;
 +    }
 +
 +    fprintf(stderr,
 +            "Gromacs binary or parallel settings not identical to previous run.\n"
 +            "Continuation is exact, but is not guaranteed to be binary identical%s.\n\n",
 +            fplog ? ",\n see the log file for details" : "");
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog,
 +                "Gromacs binary or parallel settings not identical to previous run.\n"
 +                "Continuation is exact, but is not guaranteed to be binary identical.\n\n");
 +    }
 +}
 +
 +/* Read the checkpoint file fn into state and validate it against the
 + * current run.
 + *
 + * fn                 checkpoint file to read
 + * pfplog             log file handle; replaced by a handle to the log file
 + *                    listed in the checkpoint when appending
 + * cr                 communication record; cr->npmenodes can get updated
 + * bPartDecomp        whether particle decomposition is used
 + * dd_nc              domain decomposition grid; entries that are 0 are taken
 + *                    from the file when the PP node counts match
 + * eIntegrator        integrator of the current run
 + * init_fep_state     set to state->fep_state as read from the file
 + * step, t            step and time stored in the header
 + * state              receives the state, ekinstate, enerhist and dfhist
 + * bReadRNG           set to FALSE when the stored random state can not be
 + *                    used because the number of SD/BD nodes changed
 + * bReadEkin          set according to the kinetic energy flags in the file
 + * simulation_part    simulation part stored in the header
 + * bAppendOutputFiles when TRUE the output files listed in the checkpoint are
 + *                    checksum checked, the log file is locked and the files
 + *                    are truncated to the stored offsets
 + * bForceAppend       continue when file locking is not supported
 + *
 + * Mismatches in atom, T-coupling group, NH-pressure variable or lambda
 + * state counts are reported with gmx_fatal(), as are read failures
 + * (cp_error()) and all failures during the preparation for appending.
 + */
 +static void read_checkpoint(const char *fn,FILE **pfplog,
 +                            t_commrec *cr,gmx_bool bPartDecomp,ivec dd_nc,
 +                            int eIntegrator, int *init_fep_state, gmx_large_int_t *step,double *t,
 +                            t_state *state,gmx_bool *bReadRNG,gmx_bool *bReadEkin,
 +                            int *simulation_part,
 +                            gmx_bool bAppendOutputFiles,gmx_bool bForceAppend)
 +{
 +    t_fileio *fp;
 +    int  i,j,rc;
 +    int  file_version;
-     char *version,*btime,*buser,*bmach,*fprog,*ftime;
++    char *version,*btime,*buser,*bhost,*fprog,*ftime;
 +    int  double_prec;
 +    char filename[STRLEN],buf[STEPSTRSIZE];
 +    int  nppnodes,eIntegrator_f,nppnodes_f,npmenodes_f;
 +    ivec dd_nc_f;
 +    int  natoms,ngtc,nnhpres,nhchainlength,nlambda,fflags,flags_eks,flags_enh,flags_dfh;
 +    int  d;
 +    int  ret;
 +    gmx_file_position_t *outputfiles;
 +    int  nfiles;
 +    t_fileio *chksum_file;
 +    FILE* fplog = *pfplog;
 +    unsigned char digest[16];
 +#ifndef GMX_NATIVE_WINDOWS
 +    struct flock fl;  /* don't initialize here: the struct order is OS
 +                         dependent! */
 +#endif
 +
 +    const char *int_warn=
 +              "WARNING: The checkpoint file was generated with integrator %s,\n"
 +              "         while the simulation uses integrator %s\n\n";
 +    const char *sd_note=
 +        "NOTE: The checkpoint file was for %d nodes doing SD or BD,\n"
 +        "      while the simulation uses %d SD or BD nodes,\n"
 +        "      continuation will be exact, except for the random state\n\n";
 +
 +#ifndef GMX_NATIVE_WINDOWS
 +    fl.l_type=F_WRLCK;
 +    fl.l_whence=SEEK_SET;
 +    fl.l_start=0;
 +    fl.l_len=0;
 +    fl.l_pid=0;
 +#endif
 +
 +    if (PARTDECOMP(cr))
 +    {
 +        gmx_fatal(FARGS,
 +                  "read_checkpoint not (yet) supported with particle decomposition");
 +    }
 +
 +    fp = gmx_fio_open(fn,"r");
 +    do_cpt_header(gmx_fio_getxdr(fp),TRUE,&file_version,
-                   &version,&btime,&buser,&bmach,&double_prec,&fprog,&ftime,
++                  &version,&btime,&buser,&bhost,&double_prec,&fprog,&ftime,
 +                  &eIntegrator_f,simulation_part,step,t,
 +                  &nppnodes_f,dd_nc_f,&npmenodes_f,
 +                  &natoms,&ngtc,&nnhpres,&nhchainlength,&nlambda,
 +                  &fflags,&flags_eks,&flags_enh,&flags_dfh,NULL);
 +
 +    if (bAppendOutputFiles &&
 +        file_version >= 13 && double_prec != GMX_CPT_BUILD_DP)
 +    {
 +        gmx_fatal(FARGS,"Output file appending requested, but the code and checkpoint file precision (single/double) don't match");
 +    }
 +
 +    if (cr == NULL || MASTER(cr))
 +    {
 +        fprintf(stderr,"\nReading checkpoint file %s generated: %s\n\n",
 +                fn,ftime);
 +    }
 +
 +    /* This will not be written if we do appending, since fplog is still NULL then */
 +    if (fplog)
 +    {
 +        fprintf(fplog,"\n");
 +        fprintf(fplog,"Reading checkpoint file %s\n",fn);
 +        fprintf(fplog,"  file generated by:     %s\n",fprog);
 +        fprintf(fplog,"  file generated at:     %s\n",ftime);
 +        fprintf(fplog,"  GROMACS build time:    %s\n",btime);
 +        fprintf(fplog,"  GROMACS build user:    %s\n",buser);
-         fprintf(fplog,"  GROMACS build machine: %s\n",bmach);  
++        fprintf(fplog,"  GROMACS build host:    %s\n",bhost);
 +        fprintf(fplog,"  GROMACS double prec.:  %d\n",double_prec);
 +        fprintf(fplog,"  simulation part #:     %d\n",*simulation_part);
 +        fprintf(fplog,"  step:                  %s\n",gmx_step_str(*step,buf));
 +        fprintf(fplog,"  time:                  %f\n",*t);
 +        fprintf(fplog,"\n");
 +    }
 +
 +    /* The system in the file has to have the same size as the current one */
 +    if (natoms != state->natoms)
 +    {
 +        gmx_fatal(FARGS,"Checkpoint file is for a system of %d atoms, while the current system consists of %d atoms",natoms,state->natoms);
 +    }
 +    if (ngtc != state->ngtc)
 +    {
 +        gmx_fatal(FARGS,"Checkpoint file is for a system of %d T-coupling groups, while the current system consists of %d T-coupling groups",ngtc,state->ngtc);
 +    }
 +    if (nnhpres != state->nnhpres)
 +    {
 +        gmx_fatal(FARGS,"Checkpoint file is for a system of %d NH-pressure-coupling variables, while the current system consists of %d NH-pressure-coupling variables",nnhpres,state->nnhpres);
 +    }
 +
 +    if (nlambda != state->dfhist.nlambda)
 +    {
 +        gmx_fatal(FARGS,"Checkpoint file is for a system with %d lambda states, while the current system consists of %d lambda states",nlambda,state->dfhist.nlambda);
 +    }
 +
 +    init_gtc_state(state,state->ngtc,state->nnhpres,nhchainlength); /* need to keep this here to keep the tpr format working */
 +    /* write over whatever was read; we use the number of Nose-Hoover chains from the checkpoint */
 +
 +    if (eIntegrator_f != eIntegrator)
 +    {
 +        if (MASTER(cr))
 +        {
 +            fprintf(stderr,int_warn,EI(eIntegrator_f),EI(eIntegrator));
 +        }
 +        if(bAppendOutputFiles)
 +        {
 +            gmx_fatal(FARGS,
 +                      "Output file appending requested, but input/checkpoint integrators do not match.\n"
 +                      "Stopping the run to prevent you from ruining all your data...\n"
 +                      "If you _really_ know what you are doing, try with the -noappend option.\n");
 +        }
 +        if (fplog)
 +        {
 +            fprintf(fplog,int_warn,EI(eIntegrator_f),EI(eIntegrator));
 +        }
 +    }
 +
 +    /* Determine the number of PP nodes and fill in settings that were
 +     * not chosen yet from the values stored in the checkpoint file.
 +     */
 +    if (!PAR(cr))
 +    {
 +        nppnodes = 1;
 +        cr->npmenodes = 0;
 +    }
 +    else if (bPartDecomp)
 +    {
 +        nppnodes = cr->nnodes;
 +        cr->npmenodes = 0;
 +    }
 +    else if (cr->nnodes == nppnodes_f + npmenodes_f)
 +    {
 +        if (cr->npmenodes < 0)
 +        {
 +            cr->npmenodes = npmenodes_f;
 +        }
 +        nppnodes = cr->nnodes - cr->npmenodes;
 +        if (nppnodes == nppnodes_f)
 +        {
 +            for(d=0; d<DIM; d++)
 +            {
 +                if (dd_nc[d] == 0)
 +                {
 +                    dd_nc[d] = dd_nc_f[d];
 +                }
 +            }
 +        }
 +    }
 +    else
 +    {
 +        /* The number of PP nodes has not been set yet */
 +        nppnodes = -1;
 +    }
 +
 +    if ((EI_SD(eIntegrator) || eIntegrator == eiBD) && nppnodes > 0)
 +    {
 +        /* Correct the RNG state size for the number of PP nodes.
 +         * Such assignments should all be moved to one central function.
 +         */
 +        state->nrng  = nppnodes*gmx_rng_n();
 +        state->nrngi = nppnodes;
 +    }
 +
 +    *bReadRNG = TRUE;
 +    if (fflags != state->flags)
 +    {
 +
 +        if (MASTER(cr))
 +        {
 +            if(bAppendOutputFiles)
 +            {
 +                gmx_fatal(FARGS,
 +                          "Output file appending requested, but input and checkpoint states are not identical.\n"
 +                          "Stopping the run to prevent you from ruining all your data...\n"
 +                          "You can try with the -noappend option, and get more info in the log file.\n");
 +            }
 +
 +            if (getenv("GMX_ALLOW_CPT_MISMATCH") == NULL)
 +            {
 +                gmx_fatal(FARGS,"You seem to have switched ensemble, integrator, T and/or P-coupling algorithm between the cpt and tpr file. The recommended way of doing this is passing the cpt file to grompp (with option -t) instead of to mdrun. If you know what you are doing, you can override this error by setting the env.var. GMX_ALLOW_CPT_MISMATCH");
 +            }
 +            else
 +            {
 +                fprintf(stderr,
 +                        "WARNING: The checkpoint state entries do not match the simulation,\n"
 +                        "         see the log file for details\n\n");
 +            }
 +        }
 +
 +        if(fplog)
 +        {
 +            print_flag_mismatch(fplog,state->flags,fflags);
 +        }
 +    }
 +    else
 +    {
 +        if ((EI_SD(eIntegrator) || eIntegrator == eiBD) &&
 +            nppnodes != nppnodes_f)
 +        {
 +            *bReadRNG = FALSE;
 +            if (MASTER(cr))
 +            {
 +                fprintf(stderr,sd_note,nppnodes_f,nppnodes);
 +            }
 +            if (fplog)
 +            {
 +                fprintf(fplog ,sd_note,nppnodes_f,nppnodes);
 +            }
 +        }
 +        if (MASTER(cr))
 +        {
-             check_match(fplog,version,btime,buser,bmach,double_prec,fprog,
++            check_match(fplog,version,btime,buser,bhost,double_prec,fprog,
 +                        cr,bPartDecomp,nppnodes_f,npmenodes_f,dd_nc,dd_nc_f);
 +        }
 +    }
 +    ret = do_cpt_state(gmx_fio_getxdr(fp),TRUE,fflags,state,*bReadRNG,NULL);
 +    *init_fep_state = state->fep_state;  /* there should be a better way to do this than setting it here.
 +                                            Investigate for 5.0. */
 +    if (ret)
 +    {
 +        cp_error();
 +    }
 +    ret = do_cpt_ekinstate(gmx_fio_getxdr(fp),TRUE,
 +                           flags_eks,&state->ekinstate,NULL);
 +    if (ret)
 +    {
 +        cp_error();
 +    }
 +    *bReadEkin = ((flags_eks & (1<<eeksEKINH)) || (flags_eks & (1<<eeksEKINF)) || (flags_eks & (1<<eeksEKINO)) ||
 +                  ((flags_eks & (1<<eeksEKINSCALEF)) | (flags_eks & (1<<eeksEKINSCALEH)) | (flags_eks & (1<<eeksVSCALE))));
 +
 +    ret = do_cpt_enerhist(gmx_fio_getxdr(fp),TRUE,
 +                          flags_enh,&state->enerhist,NULL);
 +    if (ret)
 +    {
 +        cp_error();
 +    }
 +
 +    if (file_version < 6)
 +    {
 +        const char *warn="Reading checkpoint file in old format, assuming that the run that generated this file started at step 0, if this is not the case the averages stored in the energy file will be incorrect.";
 +
 +        fprintf(stderr,"\nWARNING: %s\n\n",warn);
 +        if (fplog)
 +        {
 +            fprintf(fplog,"\nWARNING: %s\n\n",warn);
 +        }
 +        state->enerhist.nsum     = *step;
 +        state->enerhist.nsum_sim = *step;
 +    }
 +
 +    ret = do_cpt_df_hist(gmx_fio_getxdr(fp),TRUE,
 +                         flags_dfh,&state->dfhist,NULL);
 +    if (ret)
 +    {
 +        cp_error();
 +    }
 +
 +    ret = do_cpt_files(gmx_fio_getxdr(fp),TRUE,&outputfiles,&nfiles,NULL,file_version);
 +    if (ret)
 +    {
 +        cp_error();
 +    }
 +
 +    ret = do_cpt_footer(gmx_fio_getxdr(fp),TRUE,file_version);
 +    if (ret)
 +    {
 +        cp_error();
 +    }
 +    if( gmx_fio_close(fp) != 0)
 +    {
 +        gmx_file("Cannot read/write checkpoint; corrupt file, or maybe you are out of disk space?");
 +    }
 +
 +    /* Release all strings handed back by do_cpt_header(); version
 +     * was previously not freed here.
 +     */
 +    sfree(version);
 +    sfree(fprog);
 +    sfree(ftime);
 +    sfree(btime);
 +    sfree(buser);
-     sfree(bmach);
++    sfree(bhost);
 +
 +    /* If the user wants to append to output files,
 +     * we use the file pointer positions of the output files stored
 +     * in the checkpoint file and truncate the files such that any frames
 +     * written after the checkpoint time are removed.
 +     * All files are md5sum checked such that we can be sure that
 +     * we do not truncate other (maybe important) files.
 +     */
 +    if (bAppendOutputFiles)
 +    {
 +        if (fn2ftp(outputfiles[0].filename)!=efLOG)
 +        {
 +            /* make sure first file is log file so that it is OK to use it for
 +             * locking
 +             */
 +            gmx_fatal(FARGS,"The first output file should always be the log "
 +                      "file but instead is: %s. Cannot do appending because of this condition.", outputfiles[0].filename);
 +        }
 +        for(i=0;i<nfiles;i++)
 +        {
 +            if (outputfiles[i].offset < 0)
 +            {
 +                gmx_fatal(FARGS,"The original run wrote a file called '%s' which "
 +                    "is larger than 2 GB, but mdrun did not support large file"
 +                    " offsets. Can not append. Run mdrun with -noappend",
 +                    outputfiles[i].filename);
 +            }
 +#ifdef GMX_FAHCORE
 +            chksum_file=gmx_fio_open(outputfiles[i].filename,"a");
 +
 +#else
 +            chksum_file=gmx_fio_open(outputfiles[i].filename,"r+");
 +
 +            /* lock log file */
 +            if (i==0)
 +            {
 +                /* Note that there are systems where the lock operation
 +                 * will succeed, but a second process can also lock the file.
 +                 * We should probably try to detect this.
 +                 */
 +#ifndef GMX_NATIVE_WINDOWS
 +                if (fcntl(fileno(gmx_fio_getfp(chksum_file)), F_SETLK, &fl)
 +                    ==-1)
 +#else
 +                if (_locking(fileno(gmx_fio_getfp(chksum_file)), _LK_NBLCK, LONG_MAX)==-1)
 +#endif
 +                {
 +                    if (errno == ENOSYS)
 +                    {
 +                        if (!bForceAppend)
 +                        {
 +                            gmx_fatal(FARGS,"File locking is not supported on this system. Use -noappend or specify -append explicitly to append anyhow.");
 +                        }
 +                        else
 +                        {
 +                            fprintf(stderr,"\nNOTE: File locking is not supported on this system, will not lock %s\n\n",outputfiles[i].filename);
 +                            if (fplog)
 +                            {
 +                                fprintf(fplog,"\nNOTE: File locking not supported on this system, will not lock %s\n\n",outputfiles[i].filename);
 +                            }
 +                        }
 +                    }
 +                    else if (errno == EACCES || errno == EAGAIN)
 +                    {
 +                        gmx_fatal(FARGS,"Failed to lock: %s. Already running "
 +                                  "simulation?", outputfiles[i].filename);
 +                    }
 +                    else
 +                    {
 +                        gmx_fatal(FARGS,"Failed to lock: %s. %s.",
 +                                  outputfiles[i].filename, strerror(errno));
 +                    }
 +                }
 +            }
 +
 +            /* compute md5 chksum */
 +            if (outputfiles[i].chksum_size != -1)
 +            {
 +                if (gmx_fio_get_file_md5(chksum_file,outputfiles[i].offset,
 +                                     digest) != outputfiles[i].chksum_size)  /*at the end of the call the file position is at the end of the file*/
 +                {
 +                    gmx_fatal(FARGS,"Can't read %d bytes of '%s' to compute checksum. The file has been replaced or its contents have been modified. Cannot do appending because of this condition.",
 +                              outputfiles[i].chksum_size,
 +                              outputfiles[i].filename);
 +                }
 +            }
 +            if (i==0)  /*log file needs to be seeked in case we need to truncate (other files are truncated below)*/
 +            {
 +                if (gmx_fio_seek(chksum_file,outputfiles[i].offset))
 +                {
 +                    gmx_fatal(FARGS,"Seek error! Failed to truncate log-file: %s.", strerror(errno));
 +                }
 +            }
 +#endif
 +
 +            if (i==0) /*open log file here - so that lock is never lifted
 +                        after chksum is calculated */
 +            {
 +                *pfplog = gmx_fio_getfp(chksum_file);
 +            }
 +            else
 +            {
 +                gmx_fio_close(chksum_file);
 +            }
 +#ifndef GMX_FAHCORE
 +            /* compare md5 chksum */
 +            if (outputfiles[i].chksum_size != -1 &&
 +                memcmp(digest,outputfiles[i].chksum,16)!=0)
 +            {
 +                if (debug)
 +                {
 +                    fprintf(debug,"chksum for %s: ",outputfiles[i].filename);
 +                    for (j=0; j<16; j++)
 +                    {
 +                        fprintf(debug,"%02x",digest[j]);
 +                    }
 +                    fprintf(debug,"\n");
 +                }
 +                gmx_fatal(FARGS,"Checksum wrong for '%s'. The file has been replaced or its contents have been modified. Cannot do appending because of this condition.",
 +                          outputfiles[i].filename);
 +            }
 +#endif
 +
 +
 +            if (i!=0) /*log file is already seeked to correct position */
 +            {
 +#ifdef GMX_NATIVE_WINDOWS
 +                rc = gmx_wintruncate(outputfiles[i].filename,outputfiles[i].offset);
 +#else
 +                rc = truncate(outputfiles[i].filename,outputfiles[i].offset);
 +#endif
 +                if(rc!=0)
 +                {
 +                    gmx_fatal(FARGS,"Truncation of file %s failed. Cannot do appending because of this failure.",outputfiles[i].filename);
 +                }
 +            }
 +        }
 +    }
 +
 +    sfree(outputfiles);
 +}
 +
 +
 +/* Load the checkpoint file fn on the simulation master node and update
 + * the input record ir for a continuation run.
 + *
 + * In a parallel run cr->npmenodes, dd_nc, the checkpoint step, *bReadRNG
 + * and *bReadEkin are broadcast afterwards.  ir is marked as a continuation,
 + * a non-negative ir->nsteps is adjusted for the difference between
 + * ir->init_step and the checkpoint step, ir->init_step is set to the
 + * checkpoint step and ir->simulation_part is incremented by one.
 + */
 +void load_checkpoint(const char *fn,FILE **fplog,
 +                     t_commrec *cr,gmx_bool bPartDecomp,ivec dd_nc,
 +                     t_inputrec *ir,t_state *state,
 +                     gmx_bool *bReadRNG,gmx_bool *bReadEkin,
 +                     gmx_bool bAppend,gmx_bool bForceAppend)
 +{
 +    gmx_large_int_t cpt_step;
 +    double          cpt_time;
 +
 +    if (SIMMASTER(cr))
 +    {
 +        /* Read the state from the checkpoint file */
 +        read_checkpoint(fn,fplog,
 +                        cr,bPartDecomp,dd_nc,
 +                        ir->eI,&(ir->fepvals->init_fep_state),
 +                        &cpt_step,&cpt_time,
 +                        state,bReadRNG,bReadEkin,
 +                        &ir->simulation_part,bAppend,bForceAppend);
 +    }
 +
 +    if (PAR(cr))
 +    {
 +        /* Hand the values set by read_checkpoint to the other nodes */
 +        gmx_bcast(sizeof(cr->npmenodes),&cr->npmenodes,cr);
 +        gmx_bcast(DIM*sizeof(dd_nc[0]),dd_nc,cr);
 +        gmx_bcast(sizeof(cpt_step),&cpt_step,cr);
 +        gmx_bcast(sizeof(*bReadRNG),bReadRNG,cr);
 +        gmx_bcast(sizeof(*bReadEkin),bReadEkin,cr);
 +    }
 +
 +    ir->bContinuation = TRUE;
 +    if (ir->nsteps >= 0)
 +    {
 +        /* A negative nsteps is left untouched */
 +        ir->nsteps = ir->nsteps + (ir->init_step - cpt_step);
 +    }
 +    ir->init_step = cpt_step;
 +    ir->simulation_part++;
 +}
 +
 +/* Read a complete checkpoint from the already opened file fp.
 + *
 + * The header values are stored in *simulation_part, *step, *t and in the
 + * size and flag fields of state; after that the state, ekinstate, enerhist
 + * and dfhist entries of state are read.  bReadRNG is passed on to
 + * do_cpt_state().
 + *
 + * When outputfiles is not NULL, the output file list and its length are
 + * returned in *outputfiles and *nfiles; otherwise the list is read and
 + * discarded.  Any read failure is reported through cp_error().
 + */
 +static void read_checkpoint_data(t_fileio *fp,int *simulation_part,
 +                                 gmx_large_int_t *step,double *t,t_state *state,
 +                                 gmx_bool bReadRNG,
 +                                 int *nfiles,gmx_file_position_t **outputfiles)
 +{
 +    int  file_version;
-     char *version,*btime,*buser,*bmach,*fprog,*ftime;
++    char *version,*btime,*buser,*bhost,*fprog,*ftime;
 +    int  double_prec;
 +    int  eIntegrator;
 +    int  nppnodes,npme;
 +    ivec dd_nc;
 +    int  flags_eks,flags_enh,flags_dfh;
 +    int  nfiles_loc;
 +    gmx_file_position_t *files_loc=NULL;
 +    int  ret;
 +
 +    do_cpt_header(gmx_fio_getxdr(fp),TRUE,&file_version,
-                   &version,&btime,&buser,&bmach,&double_prec,&fprog,&ftime,
++                  &version,&btime,&buser,&bhost,&double_prec,&fprog,&ftime,
 +                  &eIntegrator,simulation_part,step,t,&nppnodes,dd_nc,&npme,
 +                  &state->natoms,&state->ngtc,&state->nnhpres,&state->nhchainlength,
 +                  &(state->dfhist.nlambda),&state->flags,&flags_eks,&flags_enh,&flags_dfh,NULL);
 +    ret =
 +        do_cpt_state(gmx_fio_getxdr(fp),TRUE,state->flags,state,bReadRNG,NULL);
 +    if (ret)
 +    {
 +        cp_error();
 +    }
 +    ret = do_cpt_ekinstate(gmx_fio_getxdr(fp),TRUE,
 +                           flags_eks,&state->ekinstate,NULL);
 +    if (ret)
 +    {
 +        cp_error();
 +    }
 +    ret = do_cpt_enerhist(gmx_fio_getxdr(fp),TRUE,
 +                          flags_enh,&state->enerhist,NULL);
 +    if (ret)
 +    {
 +        cp_error();
 +    }
 +    ret = do_cpt_df_hist(gmx_fio_getxdr(fp),TRUE,
 +                          flags_dfh,&state->dfhist,NULL);
 +    if (ret)
 +    {
 +        cp_error();
 +    }
 +
 +    /* Read into local storage when the caller does not want the list */
 +    ret = do_cpt_files(gmx_fio_getxdr(fp),TRUE,
 +                       outputfiles != NULL ? outputfiles : &files_loc,
 +                       outputfiles != NULL ? nfiles : &nfiles_loc,
 +                       NULL,file_version);
 +    if (files_loc != NULL)
 +    {
 +        sfree(files_loc);
 +    }
 +
 +    if (ret)
 +    {
 +        cp_error();
 +    }
 +
 +    ret = do_cpt_footer(gmx_fio_getxdr(fp),TRUE,file_version);
 +    if (ret)
 +    {
 +        cp_error();
 +    }
 +
 +    /* Release all strings handed back by do_cpt_header(); version
 +     * was previously not freed here.
 +     */
 +    sfree(version);
 +    sfree(fprog);
 +    sfree(ftime);
 +    sfree(btime);
 +    sfree(buser);
-     sfree(bmach);
++    sfree(bhost);
 +}
 +
 +/* Open the checkpoint file fn, read the simulation part, step, time and
 + * state from it (the RNG read flag passed on is FALSE, the output file
 + * list is not returned) and close the file again.
 + * A failing close is reported through gmx_file().
 + */
 +void
 +read_checkpoint_state(const char *fn,int *simulation_part,
 +                      gmx_large_int_t *step,double *t,t_state *state)
 +{
 +    t_fileio *cpt;
 +    int       close_rc;
 +
 +    cpt = gmx_fio_open(fn,"r");
 +    read_checkpoint_data(cpt,simulation_part,step,t,state,FALSE,NULL,NULL);
 +
 +    close_rc = gmx_fio_close(cpt);
 +    if (close_rc != 0)
 +    {
 +        gmx_file("Cannot read/write checkpoint; corrupt file, or maybe you are out of disk space?");
 +    }
 +}
 +
 +/* Read a checkpoint from the opened file fp and present its contents as
 + * the trajectory frame fr.
 + *
 + * Step, time, lambda and fep_state are always set.  Coordinates,
 + * velocities and box are set when the matching state flag is present;
 + * the coordinate and velocity arrays are handed over to fr instead of
 + * being copied.  Title, atoms and forces are marked as absent.
 + */
 +void read_checkpoint_trxframe(t_fileio *fp,t_trxframe *fr)
 +{
 +    t_state         cpt_state;
 +    int             cpt_part;
 +    gmx_large_int_t cpt_step;
 +    double          cpt_time;
 +
 +    init_state(&cpt_state,0,0,0,0,0);
 +
 +    read_checkpoint_data(fp,&cpt_part,&cpt_step,&cpt_time,&cpt_state,
 +                         FALSE,NULL,NULL);
 +
 +    /* Frame entries that are not filled from the checkpoint */
 +    fr->bTitle = FALSE;
 +    fr->bAtoms = FALSE;
 +    fr->bF     = FALSE;
 +
 +    /* Frame entries that are always filled */
 +    fr->natoms    = cpt_state.natoms;
 +    fr->bStep     = TRUE;
 +    fr->step      = gmx_large_int_to_int(cpt_step,
 +                                    "conversion of checkpoint to trajectory");
 +    fr->bTime     = TRUE;
 +    fr->time      = cpt_time;
 +    fr->bLambda   = TRUE;
 +    fr->lambda    = cpt_state.lambda[efptFEP];
 +    fr->fep_state = cpt_state.fep_state;
 +
 +    /* Frame entries that depend on the state flags */
 +    fr->bX = (cpt_state.flags & (1<<estX));
 +    if (fr->bX)
 +    {
 +        /* Hand the array over; the state pointer is cleared */
 +        fr->x       = cpt_state.x;
 +        cpt_state.x = NULL;
 +    }
 +    fr->bV = (cpt_state.flags & (1<<estV));
 +    if (fr->bV)
 +    {
 +        fr->v       = cpt_state.v;
 +        cpt_state.v = NULL;
 +    }
 +    fr->bBox = (cpt_state.flags & (1<<estBOX));
 +    if (fr->bBox)
 +    {
 +        copy_mat(cpt_state.box,fr->box);
 +    }
 +
 +    done_state(&cpt_state);
 +}
 +
 +/* Print the contents of the checkpoint file named fn to out.
 + *
 + * The sections are read in file order with their do_cpt_*() routines, each
 + * receiving out as listing stream: header, state, kinetic-energy state,
 + * energy history, free-energy history, output file list and footer.
 + * A non-zero return from the state or ekinstate section calls cp_error();
 + * a non-zero return from any later section skips the remaining sections
 + * and calls cp_warning(out).
 + */
 +void list_checkpoint(const char *fn,FILE *out)
 +{
 +    t_fileio *fp;
 +    int  file_version;
-     char *version,*btime,*buser,*bmach,*fprog,*ftime;
++    char *version,*btime,*buser,*bhost,*fprog,*ftime;
 +    int  double_prec;
 +    int  eIntegrator,simulation_part,nppnodes,npme;
 +    gmx_large_int_t step;
 +    double t;
 +    ivec dd_nc;
 +    t_state state;
 +    int  flags_eks,flags_enh,flags_dfh;
 +    int  ret;
 +    /* Stays NULL when the file section is never reached */
 +    gmx_file_position_t *outputfiles = NULL;
 +    int  nfiles = 0;
 +
 +    init_state(&state,-1,-1,-1,-1,0);
 +
 +    fp = gmx_fio_open(fn,"r");
 +    do_cpt_header(gmx_fio_getxdr(fp),TRUE,&file_version,
-                   &version,&btime,&buser,&bmach,&double_prec,&fprog,&ftime,
++                  &version,&btime,&buser,&bhost,&double_prec,&fprog,&ftime,
 +                  &eIntegrator,&simulation_part,&step,&t,&nppnodes,dd_nc,&npme,
 +                  &state.natoms,&state.ngtc,&state.nnhpres,&state.nhchainlength,
 +                  &(state.dfhist.nlambda),&state.flags,
 +                  &flags_eks,&flags_enh,&flags_dfh,out);
 +    ret = do_cpt_state(gmx_fio_getxdr(fp),TRUE,state.flags,&state,TRUE,out);
 +    if (ret)
 +    {
 +        cp_error();
 +    }
 +    ret = do_cpt_ekinstate(gmx_fio_getxdr(fp),TRUE,
 +                           flags_eks,&state.ekinstate,out);
 +    if (ret)
 +    {
 +        cp_error();
 +    }
 +    ret = do_cpt_enerhist(gmx_fio_getxdr(fp),TRUE,
 +                          flags_enh,&state.enerhist,out);
 +
 +    if (ret == 0)
 +    {
 +        init_df_history(&state.dfhist,state.dfhist.nlambda,0); /* reinitialize state with correct sizes */
 +        ret = do_cpt_df_hist(gmx_fio_getxdr(fp),TRUE,
 +                             flags_dfh,&state.dfhist,out);
 +    }
 +    if (ret == 0)
 +    {
 +        /* Keep the result: a failure here must skip the footer and be
 +         * reported below, as for the other sections.
 +         */
 +        ret = do_cpt_files(gmx_fio_getxdr(fp),TRUE,&outputfiles,&nfiles,out,file_version);
 +    }
 +
 +    if (ret == 0)
 +    {
 +        ret = do_cpt_footer(gmx_fio_getxdr(fp),TRUE,file_version);
 +    }
 +
 +    if (ret)
 +    {
 +        cp_warning(out);
 +    }
 +    if( gmx_fio_close(fp) != 0)
 +    {
 +        gmx_file("Cannot read/write checkpoint; corrupt file, or maybe you are out of disk space?");
 +    }
 +
 +    /* Release what the readers allocated; read_checkpoint_data() frees the
 +     * file list and these same five header strings after its own calls.
 +     * NOTE(review): version is presumably allocated by do_cpt_header() as
 +     * well, but no sfree(version) is visible here - confirm before adding.
 +     */
 +    if (outputfiles != NULL)
 +    {
 +        sfree(outputfiles);
 +    }
 +    sfree(fprog);
 +    sfree(ftime);
 +    sfree(btime);
 +    sfree(buser);
 +    sfree(bhost);
 +
 +    done_state(&state);
 +}
 +
 +
 +/* Tell whether the output file name fnm_cp stored in a checkpoint file
 + * is one of the output file names of mdrun and exists.
 + *
 + * fnm_cp is compared with the first file name (fns[0]) of every entry of
 + * fnm[0..nfile-1] for which is_output() holds; gmx_fexist(fnm_cp) is only
 + * consulted when such an entry was found.
 + */
 +static gmx_bool exist_output_file(const char *fnm_cp,int nfile,const t_filenm fnm[])
 +{
 +    int      idx;
 +    gmx_bool bListed;
 +
 +    bListed = FALSE;
 +    for(idx=0; idx<nfile && !bListed; idx++)
 +    {
 +        bListed = (is_output(&fnm[idx]) &&
 +                   strcmp(fnm_cp,fnm[idx].fns[0]) == 0);
 +    }
 +
 +    return (bListed && gmx_fexist(fnm_cp));
 +}
 +
 +/* Read the simulation part number from checkpoint file filename and decide
 + * whether output files can be appended to.
 + *
 + * On the simulation master:
 + *  - when filename does not exist or cannot be opened, *simulation_part is
 + *    set to 0;
 + *  - otherwise the checkpoint is read and, when bAppendReq is set, every
 + *    output file listed in the checkpoint is looked up in fnm[0..nfile-1]
 + *    and on disk. Appending is selected only when all of them are found;
 + *    finding only some of them is a fatal error, finding none leaves
 + *    appending switched off.
 + *  - when appending, the first listed file has to be the log file, and
 + *    *bAddPart is set to whether part_suffix (as given, or upper-cased)
 + *    occurs in its name.
 + * With PAR(cr), *simulation_part, and when relevant the append decision and
 + * *bAddPart, are broadcast to the other nodes.
 + * If cpt_step is not NULL it receives the step read from the checkpoint
 + * (0 when nothing was read; the step itself is not broadcast).
 + *
 + * Returns whether output files will be appended to.
 + *
 + * This routine cannot print tons of data, since it is called before the log file is opened.
 + */
 +gmx_bool read_checkpoint_simulation_part(const char *filename, int *simulation_part,
 +                                     gmx_large_int_t *cpt_step,t_commrec *cr,
 +                                     gmx_bool bAppendReq,
 +                                     int nfile,const t_filenm fnm[],
 +                                     const char *part_suffix,gmx_bool *bAddPart)
 +{
 +    t_fileio *fp;
 +    gmx_large_int_t step=0;
 +    double t;
 +    t_state state;
 +    int  nfiles;
 +    gmx_file_position_t *outputfiles;
 +    int  nexist,f;
 +    gmx_bool bAppend;
 +    char *fn,suf_up[STRLEN];
 +    const char *log_ext;
 +    size_t fn_len,ext_len;
 +
 +    bAppend = FALSE;
 +
 +    if (SIMMASTER(cr)) {
 +        if(!gmx_fexist(filename) || (!(fp = gmx_fio_open(filename,"r")) ))
 +        {
 +            *simulation_part = 0;
 +        }
 +        else 
 +        {
 +            init_state(&state,0,0,0,0,0);
 +
 +            read_checkpoint_data(fp,simulation_part,&step,&t,&state,FALSE,
 +                                 &nfiles,&outputfiles);
 +            if( gmx_fio_close(fp) != 0)
 +            {
 +                gmx_file("Cannot read/write checkpoint; corrupt file, or maybe you are out of disk space?");
 +            }
 +            done_state(&state);
 +
 +            if (bAppendReq)
 +            {
 +                nexist = 0;
 +                for(f=0; f<nfiles; f++)
 +                {
 +                    if (exist_output_file(outputfiles[f].filename,nfile,fnm))
 +                    {
 +                        nexist++;
 +                    }
 +                }
 +                if (nexist == nfiles)
 +                {
 +                    bAppend = bAppendReq;
 +                }
 +                else if (nexist > 0)
 +                {
 +                    fprintf(stderr,
 +                            "Output file appending has been requested,\n"
 +                            "but some output files listed in the checkpoint file %s\n"
 +                            "are not present or are named differently by the current program:\n",
 +                            filename);
 +                    fprintf(stderr,"output files present:");
 +                    for(f=0; f<nfiles; f++)
 +                    {
 +                        if (exist_output_file(outputfiles[f].filename,
 +                                              nfile,fnm))
 +                        {
 +                            fprintf(stderr," %s",outputfiles[f].filename);
 +                        }
 +                    }
 +                    fprintf(stderr,"\n");
 +                    fprintf(stderr,"output files not present or named differently:");
 +                    for(f=0; f<nfiles; f++)
 +                    {
 +                        if (!exist_output_file(outputfiles[f].filename,
 +                                               nfile,fnm))
 +                        {
 +                            fprintf(stderr," %s",outputfiles[f].filename);
 +                        }
 +                    }
 +                    fprintf(stderr,"\n");
 +                    
 +                    gmx_fatal(FARGS,"File appending requested, but only %d of the %d output files are present",nexist,nfiles);
 +                }
 +            }
 +            
 +            if (bAppend)
 +            {
 +                if (nfiles == 0)
 +                {
 +                    gmx_fatal(FARGS,"File appending requested, but no output file information is stored in the checkpoint file");
 +                }
 +                fn = outputfiles[0].filename;
 +                /* The first file must carry the log extension. Compare
 +                 * exactly as many trailing characters as the extension has
 +                 * and fail on a mismatch; the former test compared four
 +                 * characters for equality and so could never trigger.
 +                 */
 +                log_ext = ftp2ext(efLOG);
 +                fn_len  = strlen(fn);
 +                ext_len = strlen(log_ext);
 +                if (fn_len < 4 || fn_len < ext_len ||
 +                    gmx_strcasecmp(fn+fn_len-ext_len,log_ext) != 0)
 +                {
 +                    gmx_fatal(FARGS,"File appending requested, but the log file is not the first file listed in the checkpoint file");
 +                }
 +                /* Set bAddPart to whether the suffix string '.part' is present
 +                 * in the log file name.
 +                 */
 +                /* Bounded copy: suf_up holds at most STRLEN-1 characters */
 +                strncpy(suf_up,part_suffix,STRLEN-1);
 +                suf_up[STRLEN-1] = '\0';
 +                upstring(suf_up);
 +                *bAddPart = (strstr(fn,part_suffix) != NULL ||
 +                             strstr(fn,suf_up) != NULL);
 +            }
 +
 +            sfree(outputfiles);
 +        }
 +    }
 +    if (PAR(cr))
 +    {
 +        gmx_bcast(sizeof(*simulation_part),simulation_part,cr);
 +
 +        /* The append decision only matters when a checkpoint was read */
 +        if (*simulation_part > 0 && bAppendReq)
 +        {
 +            gmx_bcast(sizeof(bAppend),&bAppend,cr);
 +            gmx_bcast(sizeof(*bAddPart),bAddPart,cr);
 +        }
 +    }
 +    if (NULL != cpt_step)
 +    {
 +        *cpt_step = step;
 +    }
 +
 +    return bAppend;
 +}
diff --cc src/gromacs/gmxlib/gmx_detectcpu.c
index 0000000000,d58d34bd29..d58d34bd29
mode 000000,100644..100644
--- a/src/gromacs/gmxlib/gmx_detectcpu.c
+++ b/src/gromacs/gmxlib/gmx_detectcpu.c
diff --cc src/gromacs/gmxlib/main.c
index c1c9179807,0000000000..06a5a8e5cc
mode 100644,000000..100644
--- a/src/gromacs/gmxlib/main.c
+++ b/src/gromacs/gmxlib/main.c
@@@ -1,554 -1,0 +1,560 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROningen Mixture of Alchemy and Childrens' Stories
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +#include "gromacs/utility/gmx_header_config.h"
 +
 +#include <stdio.h>
 +#include <stdlib.h>
 +#include <string.h>
 +#include <limits.h>
 +#include <time.h>
 +
 +#ifdef HAVE_SYS_TIME_H
 +#include <sys/time.h>
 +#endif
 +
 +#include "smalloc.h"
 +#include "gmx_fatal.h"
 +#include "network.h"
 +#include "main.h"
 +#include "macros.h"
 +#include "futil.h"
 +#include "filenm.h"
 +#include "mdrun.h"
 +#include "gmxfio.h"
 +#include "string2.h"
 +
 +#ifdef GMX_THREAD_MPI
 +#include "thread_mpi.h"
 +#endif
 +
 +/* The source code in this file should be thread-safe. 
 +         Please keep it that way. */
 +
 +
 +#ifdef HAVE_UNISTD_H
 +#include <unistd.h>
 +#endif
 +
 +#ifdef GMX_NATIVE_WINDOWS
 +#include <process.h>
 +#endif
 +
 +
 +/* Portable version of ctime_r implemented in src/gmxlib/string2.c, but we do not want it declared in public installed headers */
 +char *
 +gmx_ctime_r(const time_t *clock,char *buf, int n);
 +
 +
 +#define BUFSIZE	1024
 +
 +
 +/* Build a per-simulation and/or per-node variant of file name base in buf.
 + *
 + * The extension of base (as reported by ftp2ext(fn2ftp(base))) and the dot
 + * in front of it are cut off, the simulation index (bAppendSimId) and
 + * "_node<nodeid>" (bAppendNodeId) are appended, and an extension for file
 + * type ftp is added again: "tpr" for efTPX, "edr" for efEDR, ftp2ext(ftp)
 + * otherwise. gmx_mem() is called when bufsize is smaller than
 + * strlen(base)+10. On node 0 the resulting name is printed to stdout.
 + */
 +static void par_fn(char *base,int ftp,const t_commrec *cr,
 +                   gmx_bool bAppendSimId,gmx_bool bAppendNodeId,
 +                   char buf[],int bufsize)
 +{
 +  const char *ext;
 +
 +  if ((strlen(base)+10) > (size_t)bufsize)
 +  {
 +     gmx_mem("Character buffer too small!");
 +  }
 +
 +  /* Start from the base name without its extension and the dot */
 +  strcpy(buf,base);
 +  buf[strlen(base) - strlen(ftp2ext(fn2ftp(base))) - 1] = '\0';
 +
 +  if (bAppendSimId)
 +  {
 +    sprintf(buf+strlen(buf),"%d",cr->ms->sim);
 +  }
 +  if (bAppendNodeId)
 +  {
 +    sprintf(buf+strlen(buf),"_node%d",cr->nodeid);
 +  }
 +
 +  /* Pick the extension to put back */
 +  if (ftp == efTPX)
 +  {
 +    ext = "tpr";
 +  }
 +  else if (ftp == efEDR)
 +  {
 +    ext = "edr";
 +  }
 +  else
 +  {
 +    ext = ftp2ext(ftp);
 +  }
 +  strcat(buf,".");
 +  strcat(buf,ext);
 +
 +  if (cr->nodeid != 0)
 +  {
 +    return;
 +  }
 +  printf("node %d par_fn '%s'\n",cr->nodeid,buf);
 +  if (fn2ftp(buf) == efLOG)
 +  {
 +    printf("log\n");
 +  }
 +}
 +
 +/* Check that the integer val is the same in all simulations of ms.
 + *
 + * Every simulation stores val in its own slot of a buffer of ms->nsim
 + * entries, the buffer is summed over the simulations with gmx_sumi_sim()
 + * and neighbouring entries are compared. Progress and, on a mismatch, the
 + * value of each subsystem are written to log when log is not NULL.
 + * A NULL ms or a mismatch ends in gmx_fatal().
 + */
 +void check_multi_int(FILE *log,const gmx_multisim_t *ms,int val,
 +                     const char *name)
 +{
 +  int      *vals,s;
 +  gmx_bool bAllEqual;
 +
 +  if (log != NULL)
 +  {
 +      fprintf(log,"Multi-checking %s ... ",name);
 +  }
 +
 +  if (ms == NULL)
 +  {
 +      gmx_fatal(FARGS,
 +                "check_multi_int called with a NULL communication pointer");
 +  }
 +
 +  /* presumably snew() zeroes the buffer, so the sum yields one value per
 +   * simulation - confirm against smalloc.h
 +   */
 +  snew(vals,ms->nsim);
 +  vals[ms->sim] = val;
 +  gmx_sumi_sim(ms->nsim,vals,ms);
 +
 +  /* Stop at the first pair of neighbours that differs */
 +  bAllEqual = TRUE;
 +  for(s=1; s<ms->nsim && bAllEqual; s++)
 +  {
 +      bAllEqual = (vals[s-1] == vals[s]);
 +  }
 +
 +  if (!bAllEqual)
 +  {
 +      if (log != NULL)
 +      {
 +          fprintf(log,"\n%s is not equal for all subsystems\n",name);
 +          for(s=0; s<ms->nsim; s++)
 +          {
 +              fprintf(log,"  subsystem %d: %d\n",s,vals[s]);
 +          }
 +      }
 +      gmx_fatal(FARGS,"The %d subsystems are not compatible\n",ms->nsim);
 +  }
 +  else if (log != NULL)
 +  {
 +      fprintf(log,"OK\n");
 +  }
 +
 +  sfree(vals);
 +}
 +
 +/* Check that the large integer val is the same in all simulations of ms.
 + *
 + * Works like check_multi_int(), but for gmx_large_int_t values: every
 + * simulation stores val in its own slot of a buffer of ms->nsim entries,
 + * the buffer is summed with gmx_sumli_sim() and neighbouring entries are
 + * compared. Progress and, on a mismatch, the value of each subsystem are
 + * written to log when log is not NULL.
 + * A NULL ms or a mismatch ends in gmx_fatal().
 + */
 +void check_multi_large_int(FILE *log,const gmx_multisim_t *ms,
 +                           gmx_large_int_t val, const char *name)
 +{
 +  gmx_large_int_t  *ibuf;
 +  int p;
 +  gmx_bool bCompatible;
 +
 +  if (NULL != log)
 +      fprintf(log,"Multi-checking %s ... ",name);
 +  
 +  if (ms == NULL)
 +    gmx_fatal(FARGS,
 +	      "check_multi_large_int called with a NULL communication pointer");
 +
 +  snew(ibuf,ms->nsim);
 +  ibuf[ms->sim] = val;
 +  gmx_sumli_sim(ms->nsim,ibuf,ms);
 +  
 +  bCompatible = TRUE;
 +  for(p=1; p<ms->nsim; p++)
 +    bCompatible = bCompatible && (ibuf[p-1] == ibuf[p]);
 +  
 +  if (bCompatible) 
 +  {
 +      if (NULL != log)
 +          fprintf(log,"OK\n");
 +  }
 +  else 
 +  {
 +      if (NULL != log)
 +      {
 +          char strbuf[255];
 +
 +          fprintf(log,"\n%s is not equal for all subsystems\n",name);
 +          /* The conversion for gmx_large_int_t is only available as a
 +           * string, so first make the format string; it is the same for
 +           * every subsystem and therefore built once.
 +           */
 +          snprintf(strbuf, 255, "  subsystem %%d: %s\n", 
 +                   gmx_large_int_pfmt);
 +          for(p=0; p<ms->nsim; p++)
 +          {
 +              fprintf(log,strbuf,p,ibuf[p]);
 +          }
 +      }
 +      gmx_fatal(FARGS,"The %d subsystems are not compatible\n",ms->nsim);
 +  }
 +  
 +  sfree(ibuf);
 +}
 +
 +
 +/* Open the log file and write its opening lines.
 + *
 + * lognm        log file name
 + * cr           communication record; nodeid and nnodes are written to the log
 + * bMasterOnly  when FALSE, non-master nodes open a log of their own whose
 + *              name is derived from lognm with par_fn()
 + * Flags        only MD_APPENDFILES is looked at here
 + * fplog        in: the already open log file when appending on the master;
 + *              out: the file that was written to
 + *
 + * When appending, the master (or any node with bMasterOnly) keeps using the
 + * incoming *fplog and a restart notice is written first. In all cases the
 + * time, host name, pid, node information and the build information are
 + * written, and the file is registered with gmx_fatal_set_log_file().
 + */
 +void gmx_log_open(const char *lognm,const t_commrec *cr,gmx_bool bMasterOnly, 
 +                   unsigned long Flags, FILE** fplog)
 +{
 +    int  len,pid;
 +    char buf[256],host[256];
 +    time_t t;
 +    char timebuf[STRLEN];
 +    FILE *fp=*fplog;
 +    char *tmpnm;
 +
 +    gmx_bool bAppend = Flags & MD_APPENDFILES;	
 +  
 +    debug_gmx();
 +  
 +    /* Communicate the filename for logfile */
 +    if (cr->nnodes > 1 && !bMasterOnly
 +#ifdef GMX_THREAD_MPI
 +        /* With thread MPI the non-master log files are opened later
 +         * when the files names are already known on all nodes.
 +         */
 +        && FALSE
 +#endif
 +        )
 +    {
 +        if (MASTER(cr))
 +        {
 +            len = strlen(lognm) + 1;
 +        }
 +        /* Non-master nodes receive the length before allocating */
 +        gmx_bcast(sizeof(len),&len,cr);
 +        if (!MASTER(cr))
 +        {
 +            snew(tmpnm,len+8);
 +        }
 +        else
 +        {
 +            tmpnm=gmx_strdup(lognm);
 +        }
 +        gmx_bcast(len*sizeof(*tmpnm),tmpnm,cr);
 +    }
 +    else
 +    {
 +        tmpnm=gmx_strdup(lognm);
 +    }
 +  
 +    debug_gmx();
 +
 +    if (!bMasterOnly && !MASTER(cr))
 +    {
 +        /* Since log always ends with '.log' let's use this info */
 +        par_fn(tmpnm,efLOG,cr,FALSE,!bMasterOnly,buf,255);
 +        fp = gmx_fio_fopen(buf, bAppend ? "a+" : "w+" );
 +    }
 +    else if (!bAppend)
 +    {
 +        /* Not appending, so a fresh file is always created here */
 +        fp = gmx_fio_fopen(tmpnm, "w+" );
 +    }
 +
 +    sfree(tmpnm);
 +
 +    gmx_fatal_set_log_file(fp);
 +  
 +    /* Get some machine parameters */
 +#ifdef HAVE_UNISTD_H
 +    if (gethostname(host,255) != 0)
 +    {
 +        sprintf(host,"unknown");
 +    }
 +    /* gethostname() need not terminate a name it had to truncate */
 +    host[sizeof(host)-1] = '\0';
 +#else
 +    sprintf(host,"unknown");
 +#endif  
 +
 +    time(&t);
 +
 +#ifndef NO_GETPID
 +#   ifdef GMX_NATIVE_WINDOWS
 +    pid = _getpid();
 +#   else
 +    pid = getpid();
 +#   endif
 +#else
 +	pid = 0;
 +#endif
 +
 +    if (bAppend)
 +    {
 +        fprintf(fp,
 +                "\n"
 +                "\n"
 +                "-----------------------------------------------------------\n"
 +                "Restarting from checkpoint, appending to previous log file.\n"
 +                "\n"
 +            );
 +    }
 +	
 +    gmx_ctime_r(&t,timebuf,STRLEN);
 +
 +    fprintf(fp,
 +            "Log file opened on %s"
 +            "Host: %s  pid: %d  nodeid: %d  nnodes:  %d\n",
 +            timebuf,host,pid,cr->nodeid,cr->nnodes);
- 
- #if (defined BUILD_MACHINE && defined BUILD_TIME && defined BUILD_USER) 
 +    fprintf(fp,
-             "The Gromacs distribution was built %s by\n"
-             "%s (%s)\n\n\n",BUILD_TIME,BUILD_USER,BUILD_MACHINE);
- #endif
++            "Built %s by %s\n"
++            "Build os/architecture: %s\n"
++            "Build CPU Vendor: %s  Brand: %s\n"
++            "Build CPU Family: %d  Model: %d  Stepping: %d\n"
++            "Build CPU Features: %s\n"
++            "Compiler: %s\n"
++            "CFLAGS: %s\n\n",
++            BUILD_TIME,BUILD_USER,BUILD_HOST,
++            BUILD_CPU_VENDOR,BUILD_CPU_BRAND,
++            BUILD_CPU_FAMILY,BUILD_CPU_MODEL,BUILD_CPU_STEPPING,
++            BUILD_CPU_FEATURES,BUILD_COMPILER,BUILD_CFLAGS);
 +
 +    fflush(fp);
 +    debug_gmx();
 +
 +    *fplog = fp;
 +}
 +
 +/* Close log file fp; a NULL fp is ignored.
 + *
 + * The file is unregistered from the fatal error handling with
 + * gmx_fatal_set_log_file(NULL) before it is closed.
 + */
 +void gmx_log_close(FILE *fp)
 +{
 +  if (fp == NULL)
 +  {
 +    return;
 +  }
 +
 +  gmx_fatal_set_log_file(NULL);
 +  gmx_fio_fclose(fp);
 +}
 +
 +/* Distribute the command line arguments of the master node.
 + *
 + * The argument count is broadcast when PAR(cr) holds. Non-master nodes
 + * allocate room for *argc+1 argument pointers and, per argument, a buffer of
 + * the length announced by the master; the master sends length and contents
 + * (including the terminating '\0') of each argument with gmx_bcast().
 + * Every node prints its node id and argument count to stderr.
 + */
 +static void comm_args(const t_commrec *cr,int *argc,char ***argv)
 +{
 +  int arg,nchar;
 +
 +  if (PAR(cr))
 +  {
 +    gmx_bcast(sizeof(*argc),argc,cr);
 +  }
 +
 +  if (!MASTER(cr))
 +  {
 +    snew(*argv,*argc+1);
 +  }
 +  fprintf(stderr,"NODEID=%d argc=%d\n",cr->nodeid,*argc);
 +
 +  arg = 0;
 +  while (arg < *argc)
 +  {
 +    /* Only the master knows the length; the others receive it */
 +    if (MASTER(cr))
 +    {
 +      nchar = strlen((*argv)[arg])+1;
 +    }
 +    gmx_bcast(sizeof(nchar),&nchar,cr);
 +    if (!MASTER(cr))
 +    {
 +      snew((*argv)[arg],nchar);
 +    }
 +    gmx_bcast(nchar*sizeof(char),(*argv)[arg],cr);
 +    arg++;
 +  }
 +  debug_gmx();
 +}
 +
 +/* Split the nodes in cr over nsim simulations.
 + *
 + * cr         communication record; cr->ms is allocated and filled, and
 + *            nnodes and sim_nodeid (with MPI also nodeid and the
 + *            communicators) are changed to describe the local simulation
 + * nsim       number of simulations; must be at least 1 and divide the
 + *            number of nodes
 + * multidirs  when not NULL, the directory to change to for each simulation
 + * nfile,fnm  the file names of the program
 + * bParFn     when multidirs is NULL and this is set, output, tpx, cpt and
 + *            -rerun file names get the simulation index inserted
 + *
 + * Simulation i gets the nodes i*nnodpersim up to (i+1)*nnodpersim-1, where
 + * nnodpersim is the number of nodes divided by nsim. Invalid input ends in
 + * gmx_fatal().
 + */
 +void init_multisystem(t_commrec *cr,int nsim, char **multidirs,
 +                      int nfile, const t_filenm fnm[],gmx_bool bParFn)
 +{
 +    gmx_multisim_t *ms;
 +    int  nnodes,nnodpersim,sim,i,ftp;
 +    char buf[256];
 +#ifdef GMX_MPI
 +    MPI_Group mpi_group_world;
 +#endif  
 +    int *rank;
 +
 +    /* nsim is used as a divisor below */
 +    if (nsim < 1)
 +    {
 +        gmx_fatal(FARGS,"The number of simulations (%d) should be at least 1",nsim);
 +    }
 +
 +#ifndef GMX_MPI
 +    if (nsim > 1)
 +    {
 +        gmx_fatal(FARGS,"This binary is compiled without MPI support, can not do multiple simulations.");
 +    }
 +#endif
 +
 +    nnodes  = cr->nnodes;
 +    if (nnodes % nsim != 0)
 +    {
 +        gmx_fatal(FARGS,"The number of nodes (%d) is not a multiple of the number of simulations (%d)",nnodes,nsim);
 +    }
 +
 +    nnodpersim = nnodes/nsim;
 +    sim = cr->nodeid/nnodpersim;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"We have %d simulations, %d nodes per simulation, local simulation is %d\n",nsim,nnodpersim,sim);
 +    }
 +
 +    snew(ms,1);
 +    cr->ms = ms;
 +    ms->nsim = nsim;
 +    ms->sim  = sim;
 +#ifdef GMX_MPI
 +    /* Create a communicator for the master nodes */
 +    snew(rank,ms->nsim);
 +    for(i=0; i<ms->nsim; i++)
 +    {
 +        /* The first node of every simulation is its master */
 +        rank[i] = i*nnodpersim;
 +    }
 +    MPI_Comm_group(MPI_COMM_WORLD,&mpi_group_world);
 +    MPI_Group_incl(mpi_group_world,nsim,rank,&ms->mpi_group_masters);
 +    sfree(rank);
 +    MPI_Comm_create(MPI_COMM_WORLD,ms->mpi_group_masters,
 +                    &ms->mpi_comm_masters);
 +
 +#if !defined(GMX_THREAD_MPI) && !defined(MPI_IN_PLACE_EXISTS)
 +    /* initialize the MPI_IN_PLACE replacement buffers */
 +    snew(ms->mpb, 1);
 +    ms->mpb->ibuf=NULL;
 +    ms->mpb->libuf=NULL;
 +    ms->mpb->fbuf=NULL;
 +    ms->mpb->dbuf=NULL;
 +    ms->mpb->ibuf_alloc=0;
 +    ms->mpb->libuf_alloc=0;
 +    ms->mpb->fbuf_alloc=0;
 +    ms->mpb->dbuf_alloc=0;
 +#endif
 +
 +#endif
 +
 +    /* Reduce the intra-simulation communication */
 +    cr->sim_nodeid = cr->nodeid % nnodpersim;
 +    cr->nnodes = nnodpersim;
 +#ifdef GMX_MPI
 +    MPI_Comm_split(MPI_COMM_WORLD,sim,cr->sim_nodeid,&cr->mpi_comm_mysim);
 +    cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
 +    cr->nodeid = cr->sim_nodeid;
 +#endif
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"This is simulation %d",cr->ms->sim);
 +        if (PAR(cr))
 +        {
 +            fprintf(debug,", local number of nodes %d, local nodeid %d",
 +                    cr->nnodes,cr->sim_nodeid);
 +        }
 +        fprintf(debug,"\n\n");
 +    }
 +
 +    if (multidirs)
 +    {
 +        if (debug)
 +        {
 +            fprintf(debug,"Changing to directory %s\n",multidirs[cr->ms->sim]);
 +        }
 +        gmx_chdir(multidirs[cr->ms->sim]);
 +    }
 +    else if (bParFn)
 +    {
 +        /* Patch output and tpx, cpt and rerun input file names */
 +        for(i=0; (i<nfile); i++)
 +        {
 +            /* Because of possible multiple extensions per type we must look 
 +             * at the actual file name 
 +             */
 +            if (is_output(&fnm[i]) ||
 +                fnm[i].ftp == efTPX || fnm[i].ftp == efCPT ||
 +                strcmp(fnm[i].opt,"-rerun") == 0)
 +            {
 +                ftp = fn2ftp(fnm[i].fns[0]);
 +                par_fn(fnm[i].fns[0],ftp,cr,TRUE,FALSE,buf,255);
 +                /* Replace the stored name by the patched one */
 +                sfree(fnm[i].fns[0]);
 +                fnm[i].fns[0] = gmx_strdup(buf);
 +            }
 +        }
 +    }
 +}
 +
 +/* Allocate and initialize the communication record of this process.
 + *
 + * With real MPI (GMX_MPI without GMX_THREAD_MPI) gmx_setup() supplies the
 + * node id and the number of nodes, both communicators are set to
 + * MPI_COMM_WORLD and, when running in parallel, the command line arguments
 + * are communicated with comm_args(). Otherwise a single node with id 0 is
 + * set up and the communicators are NULL.
 + * The node gets both the PP and the PME duty.
 + *
 + * Returns the newly allocated record.
 + */
 +t_commrec *init_par(int *argc,char ***argv_ptr)
 +{
 +    t_commrec *cr;
 +    char      **argv;
 +
 +    snew(cr,1);
 +
 +    argv = *argv_ptr;
 +
 +#if defined GMX_MPI && !defined GMX_THREAD_MPI
 +    cr->sim_nodeid = gmx_setup(argc,argv,&cr->nnodes);
 +
 +    if (!PAR(cr) && (cr->sim_nodeid != 0))
 +    {
 +        gmx_comm("(!PAR(cr) && (cr->sim_nodeid != 0))");
 +    }
 +
 +    cr->mpi_comm_mysim   = MPI_COMM_WORLD;
 +    cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
 +#else
 +    /* These should never be accessed */
 +    cr->mpi_comm_mysim   = NULL;
 +    cr->mpi_comm_mygroup = NULL;
 +    cr->nnodes           = 1;
 +    cr->sim_nodeid       = 0;
 +#endif
 +
 +    cr->nodeid = cr->sim_nodeid;
 +
 +    cr->duty = (DUTY_PP | DUTY_PME);
 +
 +    /* Communicate arguments if parallel */
 +#ifndef GMX_THREAD_MPI
 +    if (PAR(cr))
 +    {
 +        comm_args(cr,argc,argv_ptr);
 +    }
 +#endif /* GMX_THREAD_MPI */
 +
 +#ifdef GMX_MPI
 +#if !defined(GMX_THREAD_MPI) && !defined(MPI_IN_PLACE_EXISTS)
 +    /* initialize the MPI_IN_PLACE replacement buffers */
 +    snew(cr->mpb, 1);
 +    cr->mpb->ibuf=NULL;
 +    cr->mpb->libuf=NULL;
 +    cr->mpb->fbuf=NULL;
 +    cr->mpb->dbuf=NULL;
 +    cr->mpb->ibuf_alloc=0;
 +    cr->mpb->libuf_alloc=0;
 +    cr->mpb->fbuf_alloc=0;
 +    cr->mpb->dbuf_alloc=0;
 +#endif
 +#endif
 +
 +    return cr;
 +}
 +
 +/* Create the communication record of a thread from the record cro.
 + *
 + * Without GMX_THREAD_MPI this returns NULL. With it, a new record is
 + * allocated as a copy of *cro, so that settings such as the number of PME
 + * nodes are inherited, after which the thread-specific node id, node count,
 + * communicators and duty are filled in. gmx_comm() is called when MPI has
 + * not been initialized.
 + */
 +t_commrec *init_par_threads(const t_commrec *cro)
 +{
 +#ifndef GMX_THREAD_MPI
 +    return NULL;
 +#else
 +    t_commrec *thread_cr;
 +    int       bMpiReady;
 +
 +    /* Start from a full copy of the original record */
 +    snew(thread_cr,1);
 +    *thread_cr = *cro;
 +
 +    MPI_Initialized(&bMpiReady);
 +    if (!bMpiReady)
 +    {
 +        gmx_comm("Initializing threads without comm");
 +    }
 +
 +    /* once threads will be used together with MPI, we'll
 +       fill the cr structure with distinct data here. This might even work: */
 +    thread_cr->sim_nodeid       = gmx_setup(0,NULL, &thread_cr->nnodes);
 +    thread_cr->nodeid           = thread_cr->sim_nodeid;
 +
 +    thread_cr->mpi_comm_mysim   = MPI_COMM_WORLD;
 +    thread_cr->mpi_comm_mygroup = thread_cr->mpi_comm_mysim;
 +
 +    thread_cr->duty             = (DUTY_PP | DUTY_PME);
 +
 +    return thread_cr;
 +#endif
 +}
diff --cc src/gromacs/gmxlib/nonbonded/CMakeLists.txt
index 0000000000,2c3a658eb3..2c3a658eb3
mode 000000,100644..100644
--- a/src/gromacs/gmxlib/nonbonded/CMakeLists.txt
+++ b/src/gromacs/gmxlib/nonbonded/CMakeLists.txt
diff --cc src/gromacs/legacyheaders/gmx_detectcpu.h
index 0000000000,fc001c2335..fc001c2335
mode 000000,100644..100644
--- a/src/gromacs/legacyheaders/gmx_detectcpu.h
+++ b/src/gromacs/legacyheaders/gmx_detectcpu.h
diff --cc src/gromacs/legacyheaders/gmx_math_x86_avx_128_fma_double.h
index 0000000000,3189a407e8..3189a407e8
mode 000000,100644..100644
--- a/src/gromacs/legacyheaders/gmx_math_x86_avx_128_fma_double.h
+++ b/src/gromacs/legacyheaders/gmx_math_x86_avx_128_fma_double.h
diff --cc src/gromacs/legacyheaders/gmx_math_x86_avx_128_fma_single.h
index 0000000000,d9c61a46db..d9c61a46db
mode 000000,100644..100644
--- a/src/gromacs/legacyheaders/gmx_math_x86_avx_128_fma_single.h
+++ b/src/gromacs/legacyheaders/gmx_math_x86_avx_128_fma_single.h
diff --cc src/gromacs/legacyheaders/gmx_math_x86_avx_256_double.h
index 0000000000,aa6f4d7fb5..aa6f4d7fb5
mode 000000,100644..100644
--- a/src/gromacs/legacyheaders/gmx_math_x86_avx_256_double.h
+++ b/src/gromacs/legacyheaders/gmx_math_x86_avx_256_double.h
diff --cc src/gromacs/legacyheaders/gmx_math_x86_avx_256_single.h
index 0000000000,0eb653934c..0eb653934c
mode 000000,100644..100644
--- a/src/gromacs/legacyheaders/gmx_math_x86_avx_256_single.h
+++ b/src/gromacs/legacyheaders/gmx_math_x86_avx_256_single.h
diff --cc src/gromacs/legacyheaders/gmx_math_x86_sse2_double.h
index 0000000000,303eb7e3bc..303eb7e3bc
mode 000000,100644..100644
--- a/src/gromacs/legacyheaders/gmx_math_x86_sse2_double.h
+++ b/src/gromacs/legacyheaders/gmx_math_x86_sse2_double.h
diff --cc src/gromacs/legacyheaders/gmx_math_x86_sse2_single.h
index 0000000000,6086e2a169..6086e2a169
mode 000000,100644..100644
--- a/src/gromacs/legacyheaders/gmx_math_x86_sse2_single.h
+++ b/src/gromacs/legacyheaders/gmx_math_x86_sse2_single.h
diff --cc src/gromacs/legacyheaders/gmx_math_x86_sse4_1_double.h
index 0000000000,37a9cac29b..37a9cac29b
mode 000000,100644..100644
--- a/src/gromacs/legacyheaders/gmx_math_x86_sse4_1_double.h
+++ b/src/gromacs/legacyheaders/gmx_math_x86_sse4_1_double.h
diff --cc src/gromacs/legacyheaders/gmx_math_x86_sse4_1_single.h
index 0000000000,3a430edba6..3a430edba6
mode 000000,100644..100644
--- a/src/gromacs/legacyheaders/gmx_math_x86_sse4_1_single.h
+++ b/src/gromacs/legacyheaders/gmx_math_x86_sse4_1_single.h
diff --cc src/gromacs/legacyheaders/gmx_x86_avx_128_fma.h
index 0000000000,260a317147..260a317147
mode 000000,100644..100644
--- a/src/gromacs/legacyheaders/gmx_x86_avx_128_fma.h
+++ b/src/gromacs/legacyheaders/gmx_x86_avx_128_fma.h
diff --cc src/gromacs/legacyheaders/gmx_x86_avx_256.h
index 0000000000,90317a5328..90317a5328
mode 000000,100644..100644
--- a/src/gromacs/legacyheaders/gmx_x86_avx_256.h
+++ b/src/gromacs/legacyheaders/gmx_x86_avx_256.h
diff --cc src/gromacs/legacyheaders/gmx_x86_sse2.h
index 0000000000,80f0a7e54e..80f0a7e54e
mode 000000,100644..100644
--- a/src/gromacs/legacyheaders/gmx_x86_sse2.h
+++ b/src/gromacs/legacyheaders/gmx_x86_sse2.h
diff --cc src/gromacs/legacyheaders/gmx_x86_sse4_1.h
index 0000000000,75e61e1bd9..75e61e1bd9
mode 000000,100644..100644
--- a/src/gromacs/legacyheaders/gmx_x86_sse4_1.h
+++ b/src/gromacs/legacyheaders/gmx_x86_sse4_1.h
diff --cc src/gromacs/mdlib/pme.c
index d37d5b7519,0000000000..8993d5d219
mode 100644,000000..100644
--- a/src/gromacs/mdlib/pme.c
+++ b/src/gromacs/mdlib/pme.c
@@@ -1,4353 -1,0 +1,4356 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +/* IMPORTANT FOR DEVELOPERS:
 + *
 + * Triclinic pme stuff isn't entirely trivial, and we've experienced
 + * some bugs during development (many of them due to me). To avoid
 + * this in the future, please check the following things if you make
 + * changes in this file:
 + *
 + * 1. You should obtain identical (at least to the PME precision)
 + *    energies, forces, and virial for
 + *    a rectangular box and a triclinic one where the z (or y) axis is
 + *    tilted a whole box side. For instance you could use these boxes:
 + *
 + *    rectangular       triclinic
 + *     2  0  0           2  0  0
 + *     0  2  0           0  2  0
 + *     0  0  6           2  2  6
 + *
 + * 2. You should check the energy conservation in a triclinic box.
 + *
 + * It might seem like overkill, but better safe than sorry.
 + * /Erik 001109
 + */
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#endif
 +#ifdef GMX_THREAD_MPI
 +#include "tmpi.h"
 +#endif
 +
 +#ifdef GMX_OPENMP
 +#include <omp.h>
 +#endif
 +
 +#include <stdio.h>
 +#include <string.h>
 +#include <math.h>
 +#include <assert.h>
 +#include "typedefs.h"
 +#include "txtdump.h"
 +#include "vec.h"
 +#include "gmxcomplex.h"
 +#include "smalloc.h"
 +#include "futil.h"
 +#include "coulomb.h"
 +#include "gmx_fatal.h"
 +#include "pme.h"
 +#include "network.h"
 +#include "physics.h"
 +#include "nrnb.h"
 +#include "copyrite.h"
 +#include "gmx_wallcycle.h"
 +#include "gmx_parallel_3dfft.h"
 +#include "pdbio.h"
 +#include "gmx_cyclecounter.h"
 +#include "macros.h"
 +
- #if ( !defined(GMX_DOUBLE) && ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_X86_64_SSE2) ) )
- #include "gmx_sse2_single.h"
++/* Single precision, with SSE2 or higher available */
++#if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE)
++
++#include "gmx_x86_sse2.h"
++#include "gmx_math_x86_sse2_single.h"
 +
 +#define PME_SSE
 +/* Some old AMD processors could have problems with unaligned loads+stores */
 +#ifndef GMX_FAHCORE
 +#define PME_SSE_UNALIGNED
 +#endif
 +#endif
 +
 +/* NOTE(review): DFT_TOL is not referenced in the part of the file reviewed
 + * here; presumably a tolerance used further down in pme.c - confirm before
 + * changing it.
 + */
 +#define DFT_TOL 1e-7
 +/* #define PRT_FORCE */
 +/* conditions for on the fly time-measurement */
 +/* #define TAKETIME (step > 1 && timesteps < 10) */
 +#define TAKETIME FALSE
 +
 +/* #define PME_TIME_THREADS */
 +
 +/* MPI datatype selected by the build precision: MPI_DOUBLE when GMX_DOUBLE
 + * is defined, MPI_FLOAT otherwise. Used for the charge exchange in
 + * pmeredist_pd().
 + */
 +#ifdef GMX_DOUBLE
 +#define mpi_type MPI_DOUBLE
 +#else
 +#define mpi_type MPI_FLOAT
 +#endif
 +
 +/* GMX_CACHE_SEP should be a multiple of 16 to preserve alignment */
 +#define GMX_CACHE_SEP 64
 +
 +/* We only define a maximum to be able to use local arrays without allocation.
 + * An order larger than 12 should never be needed, even for test cases.
 + * If needed it can be changed here.
 + */
 +#define PME_ORDER_MAX 12
 +
 +/* Internal datastructures */
 +/* Grid index ranges exchanged with one neighbor in one communication pulse
 + * (see gmx_sum_qgrid_dd): the first grid index and the number of indices
 + * to send, and the same pair for receiving.
 + */
 +typedef struct {
 +    int send_index0;
 +    int send_nindex;
 +    int recv_index0;
 +    int recv_nindex;
 +} pme_grid_comm_t;
 +
 +/* Grid overlap communication setup for one decomposition dimension
 + * (gmx_pme holds one per dimension, 0=x, 1=y).
 + */
 +typedef struct {
 +#ifdef GMX_MPI
 +    MPI_Comm mpi_comm;           /* Communicator used for the overlap exchange */
 +#endif
 +    int  nnodes,nodeid;
 +    int  *s2g0;
 +    int  *s2g1;
 +    int  noverlap_nodes;         /* Number of communication pulses */
 +    int  *send_id,*recv_id;      /* Per pulse: rank to send to / receive from in
 +                                  * the forward direction (roles are swapped
 +                                  * for the backward direction)
 +                                  */
 +    pme_grid_comm_t *comm_data;  /* Per pulse: index ranges to exchange */
 +    real *sendbuf;               /* Contiguous staging buffer for sending */
 +    real *recvbuf;               /* Contiguous staging buffer for receiving */
 +} pme_overlap_t;
 +
 +/* Particle indices produced by one thread in calc_interpolation_idx,
 + * sorted on the index of the thread that should spread each particle.
 + * make_thread_local_ind combines the lists of all threads.
 + */
 +typedef struct {
 +    int *n;     /* Cumulative counts of the number of particles per thread */
 +    int nalloc; /* Allocation size of i */
 +    int *i;     /* Particle indices ordered on thread index (n) */
 +} thread_plist_t;
 +
 +/* Spline data for a set of particles, one instance per thread in
 + * pme_atomcomm_t.
 + */
 +typedef struct {
 +    int  n;            /* Number of entries in use in ind */
 +    int  *ind;         /* Particle indices; initialized to the identity in
 +                        * pme_realloc_splinedata
 +                        */
 +    splinevec theta;   /* Per dimension: pme_order*nalloc values */
 +    splinevec dtheta;  /* Same size as theta; presumably the derivatives of
 +                        * theta - TODO confirm
 +                        */
 +} splinedata_t;
 +
 +/* Atom communication and per-atom work data for one decomposition
 + * dimension (gmx_pme holds one per decomposition index, plus one that is
 + * only used for gmx_pme_calc_energy).
 + */
 +typedef struct {
 +    int  dimind;            /* The index of the dimension, 0=x, 1=y */
 +    int  nslab;             /* The number of slabs atoms are divided over */
 +    int  nodeid;
 +#ifdef GMX_MPI
 +    MPI_Comm mpi_comm;
 +#endif
 +
 +    int  *node_dest;        /* The nodes to send x and q to with DD */
 +    int  *node_src;         /* The nodes to receive x and q from with DD */
 +    int  *buf_index;        /* Index for commnode into the buffers */
 +
 +    int  maxshift;
 +
 +    int  npd;
 +    int  pd_nalloc;
 +    int  *pd;               /* The slab index of each atom, see pme_calc_pidx */
 +    int  *count;            /* The number of atoms to send to each node */
 +    int  **count_thread;    /* Per thread atom counts per slab; summed into
 +                             * count_thread[0] by pme_calc_pidx_wrapper
 +                             */
 +    int  *rcount;           /* The number of atoms to receive */
 +
 +    int  n;                 /* The number of atoms held after redistribution */
 +    int  nalloc;            /* Allocation size of the per-atom arrays below */
 +    rvec *x;
 +    real *q;
 +    rvec *f;
 +    gmx_bool bSpread;       /* These coordinates are used for spreading */
 +    int  pme_order;
 +    ivec *idx;
 +    rvec *fractx;            /* Fractional coordinate relative to the
 +                              * lower cell boundary
 +                              */
 +    int  nthread;
 +    int  *thread_idx;        /* Which thread should spread which charge */
 +    thread_plist_t *thread_plist;
 +    splinedata_t *spline;
 +} pme_atomcomm_t;
 +
 +#define FLBS  3
 +#define FLBSZ 4
 +
 +/* A single spreading grid: either the full node grid or the part of it
 + * that one thread works on (see pmegrids_t).
 + */
 +typedef struct {
 +    ivec ci;     /* The spatial location of this grid       */
 +    ivec n;      /* The size of *grid, including order-1    */
 +    ivec offset; /* The grid offset from the full node grid */
 +    int  order;  /* PME spreading order                     */
 +    real *grid;  /* The grid local thread, size n           */
 +} pmegrid_t;
 +
 +/* The full node grid together with its decomposition into thread-local
 + * grids.
 + */
 +typedef struct {
 +    pmegrid_t grid;     /* The full node grid (non thread-local)            */
 +    int  nthread;       /* The number of threads operating on this grid     */
 +    ivec nc;            /* The local spatial decomposition over the threads */
 +    pmegrid_t *grid_th; /* Array of grids for each thread                   */
 +    int  **g2t;         /* The grid to thread index                         */
 +    ivec nthread_comm;  /* The number of threads to communicate with        */
 +} pmegrids_t;
 +
 +
 +/* Work data for spreading and gathering. Only carries data when PME_SSE is
 + * defined; otherwise it holds a placeholder member.
 + */
 +typedef struct {
 +#ifdef PME_SSE
 +    /* Masks for SSE aligned spreading and gathering */
 +    __m128 mask_SSE0[6],mask_SSE1[6];
 +#else
 +    int dummy; /* C89 requires that struct has at least one member */
 +#endif
 +} pme_spline_work_t;
 +
 +/* Scratch arrays and results for solve_pme; gmx_pme refers to these as
 + * thread local work data.
 + */
 +typedef struct {
 +    /* work data for solve_pme */
 +    int      nalloc;      /* presumably the allocation size of the arrays
 +                           * below - TODO confirm against the allocation code
 +                           */
 +    real *   mhx;
 +    real *   mhy;
 +    real *   mhz;
 +    real *   m2;
 +    real *   denom;
 +    real *   tmp1_alloc;
 +    real *   tmp1;
 +    real *   eterm;
 +    real *   m2inv;
 +
 +    real     energy;
 +    matrix   vir;
 +} pme_work_t;
 +
 +/* Complete state of one PME instance: node decomposition, spreading and
 + * FFT grids, FFT setups, atom communication data and work buffers.
 + */
 +typedef struct gmx_pme {
 +    int  ndecompdim;         /* The number of decomposition dimensions */
 +    int  nodeid;             /* Our nodeid in mpi->mpi_comm */
 +    int  nodeid_major;
 +    int  nodeid_minor;
 +    int  nnodes;             /* The number of nodes doing PME */
 +    int  nnodes_major;
 +    int  nnodes_minor;
 +
 +    /* NOTE(review): mpi_comm and mpi_comm_d are declared outside the
 +     * GMX_MPI guard, unlike rvec_mpi below; presumably MPI_Comm is also
 +     * defined for non-MPI builds - confirm.
 +     */
 +    MPI_Comm mpi_comm;
 +    MPI_Comm mpi_comm_d[2];  /* Indexed on dimension, 0=x, 1=y */
 +#ifdef GMX_MPI
 +    MPI_Datatype  rvec_mpi;  /* the pme vector's MPI type */
 +#endif
 +
 +    int  nthread;            /* The number of threads doing PME */
 +
 +    gmx_bool bPPnode;        /* Node also does particle-particle forces */
 +    gmx_bool bFEP;           /* Compute Free energy contribution */
 +    int nkx,nky,nkz;         /* Grid dimensions */
 +    gmx_bool bP3M;           /* Do P3M: optimize the influence function */
 +    int pme_order;
 +    real epsilon_r;
 +
 +    pmegrids_t pmegridA;  /* Grids on which we do spreading/interpolation, includes overlap */
 +    pmegrids_t pmegridB;
 +    /* The PME charge spreading grid sizes/strides, includes pme_order-1 */
 +    int     pmegrid_nx,pmegrid_ny,pmegrid_nz;
 +    /* pmegrid_nz might be larger than strictly necessary to ensure
 +     * memory alignment, pmegrid_nz_base gives the real base size.
 +     */
 +    int     pmegrid_nz_base;
 +    /* The local PME grid starting indices */
 +    int     pmegrid_start_ix,pmegrid_start_iy,pmegrid_start_iz;
 +
 +    /* Work data for spreading and gathering */
 +    pme_spline_work_t spline_work;
 +
 +    real *fftgridA;             /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
 +    real *fftgridB;             /* inside the interpolation grid, but separate for 2D PME decomp. */
 +    int   fftgrid_nx,fftgrid_ny,fftgrid_nz;
 +
 +    t_complex *cfftgridA;             /* Grids for complex FFT data */
 +    t_complex *cfftgridB;
 +    int   cfftgrid_nx,cfftgrid_ny,cfftgrid_nz;
 +
 +    gmx_parallel_3dfft_t  pfft_setupA;
 +    gmx_parallel_3dfft_t  pfft_setupB;
 +
 +    /* Lookup tables indexed with the (shifted) integer grid coordinate of a
 +     * particle: nnx/nny/nnz give the local grid index, fshx/fshy the
 +     * correction added to the fractional coordinate
 +     * (see calc_interpolation_idx).
 +     */
 +    int  *nnx,*nny,*nnz;
 +    real *fshx,*fshy,*fshz;
 +
 +    pme_atomcomm_t atc[2];  /* Indexed on decomposition index */
 +    matrix    recipbox;
 +    splinevec bsp_mod;
 +
 +    pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */
 +
 +    pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */
 +
 +    rvec *bufv;             /* Communication buffer */
 +    real *bufr;             /* Communication buffer */
 +    int  buf_nalloc;        /* The communication buffer size */
 +
 +    /* thread local work data for solve_pme */
 +    pme_work_t *work;
 +
 +    /* Work data for PME_redist */
 +    gmx_bool redist_init;
 +    int *    scounts;
 +    int *    rcounts;
 +    int *    sdispls;
 +    int *    rdispls;
 +    int *    sidx;
 +    int *    idxa;
 +    real *   redist_buf;
 +    int      redist_buf_nalloc;
 +
 +    /* Work data for sum_qgrid */
 +    real *   sum_qgrid_tmp;
 +    real *   sum_qgrid_dd_tmp;
 +} t_gmx_pme;
 +
 +
 +/* For particles start..end-1 of atc, compute the local grid index
 + * (atc->idx) and the fractional offset within the grid cell (atc->fractx).
 + *
 + * When atc has more than one thread, also store for each particle the
 + * index of the thread that should spread it (atc->thread_idx) and build,
 + * in atc->thread_plist[thread], the list of the particle indices of this
 + * range sorted on that thread index. On return the n array of that list
 + * holds the cumulative particle counts per thread.
 + */
 +static void calc_interpolation_idx(gmx_pme_t pme,pme_atomcomm_t *atc,
 +                                   int start,int end,int thread)
 +{
 +    int  i;
 +    int  *idxptr,tix,tiy,tiz;
 +    real *xptr,*fptr,tx,ty,tz;
 +    real rxx,ryx,ryy,rzx,rzy,rzz;
 +    int  nx,ny,nz;
 +    int  start_ix,start_iy,start_iz;
 +    int  *g2tx,*g2ty,*g2tz;
 +    gmx_bool bThreads;
 +    int  *thread_idx=NULL;
 +    thread_plist_t *tpl=NULL;
 +    int  *tpl_n=NULL;
 +    int  thread_i;
 +
 +    nx  = pme->nkx;
 +    ny  = pme->nky;
 +    nz  = pme->nkz;
 +
 +    start_ix = pme->pmegrid_start_ix;
 +    start_iy = pme->pmegrid_start_iy;
 +    start_iz = pme->pmegrid_start_iz;
 +
 +    /* Only the lower triangle of recipbox is read */
 +    rxx = pme->recipbox[XX][XX];
 +    ryx = pme->recipbox[YY][XX];
 +    ryy = pme->recipbox[YY][YY];
 +    rzx = pme->recipbox[ZZ][XX];
 +    rzy = pme->recipbox[ZZ][YY];
 +    rzz = pme->recipbox[ZZ][ZZ];
 +
 +    g2tx = pme->pmegridA.g2t[XX];
 +    g2ty = pme->pmegridA.g2t[YY];
 +    g2tz = pme->pmegridA.g2t[ZZ];
 +
 +    bThreads = (atc->nthread > 1);
 +    if (bThreads)
 +    {
 +        thread_idx = atc->thread_idx;
 +
 +        /* Each calling thread fills its own list, starting from zero counts */
 +        tpl   = &atc->thread_plist[thread];
 +        tpl_n = tpl->n;
 +        for(i=0; i<atc->nthread; i++)
 +        {
 +            tpl_n[i] = 0;
 +        }
 +    }
 +
 +    for(i=start; i<end; i++) {
 +        xptr   = atc->x[i];
 +        idxptr = atc->idx[i];
 +        fptr   = atc->fractx[i];
 +
 +        /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
 +        tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
 +        ty = ny * (                  xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
 +        tz = nz * (                                   xptr[ZZ] * rzz + 2.0 );
 +
 +        /* Truncation acts as floor here, since tx, ty, tz are positive */
 +        tix = (int)(tx);
 +        tiy = (int)(ty);
 +        tiz = (int)(tz);
 +
 +        /* Because decomposition only occurs in x and y,
 +         * we never have a fraction correction in z.
 +         */
 +        fptr[XX] = tx - tix + pme->fshx[tix];
 +        fptr[YY] = ty - tiy + pme->fshy[tiy];
 +        fptr[ZZ] = tz - tiz;
 +
 +        /* Map the shifted integer coordinate to the local grid index */
 +        idxptr[XX] = pme->nnx[tix];
 +        idxptr[YY] = pme->nny[tiy];
 +        idxptr[ZZ] = pme->nnz[tiz];
 +
 +#ifdef DEBUG
 +        range_check(idxptr[XX],0,pme->pmegrid_nx);
 +        range_check(idxptr[YY],0,pme->pmegrid_ny);
 +        range_check(idxptr[ZZ],0,pme->pmegrid_nz);
 +#endif
 +
 +        if (bThreads)
 +        {
 +            /* The spreading thread follows from the grid index in each
 +             * dimension; count the particles per spreading thread.
 +             */
 +            thread_i = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
 +            thread_idx[i] = thread_i;
 +            tpl_n[thread_i]++;
 +        }
 +    }
 +
 +    if (bThreads)
 +    {
 +        /* Make a list of particle indices sorted on thread */
 +
 +        /* Get the cumulative count */
 +        for(i=1; i<atc->nthread; i++)
 +        {
 +            tpl_n[i] += tpl_n[i-1];
 +        }
 +        /* The current implementation distributes particles equally
 +         * over the threads, so we could actually allocate for that
 +         * in pme_realloc_atomcomm_things.
 +         */
 +        if (tpl_n[atc->nthread-1] > tpl->nalloc)
 +        {
 +            tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]);
 +            srenew(tpl->i,tpl->nalloc);
 +        }
 +        /* Set tpl_n to the cumulative start */
 +        for(i=atc->nthread-1; i>=1; i--)
 +        {
 +            tpl_n[i] = tpl_n[i-1];
 +        }
 +        tpl_n[0] = 0;
 +
 +        /* Fill our thread local array with indices sorted on thread */
 +        for(i=start; i<end; i++)
 +        {
 +            /* The post-increment advances the start of this thread's
 +             * bucket, turning the starts back into cumulative counts.
 +             */
 +            tpl->i[tpl_n[atc->thread_idx[i]]++] = i;
 +        }
 +        /* Now tpl_n contains the cumulative count again */
 +    }
 +}
 +
 +/* Build the list of particles that thread 'thread' should handle.
 + *
 + * Every thread t has stored, in atc->thread_plist[t], particle indices
 + * sorted on target thread together with cumulative counts per target
 + * thread. Here the section belonging to 'thread' is taken from each of
 + * those lists, in order of t, and appended to spline->ind; spline->n is
 + * set to the total number of indices collected.
 + */
 +static void make_thread_local_ind(pme_atomcomm_t *atc,
 +                                  int thread,splinedata_t *spline)
 +{
 +    int  src,k,first,last,ncollected;
 +    thread_plist_t *list;
 +
 +    ncollected = 0;
 +    for(src=0; src<atc->nthread; src++)
 +    {
 +        list = &atc->thread_plist[src];
 +
 +        /* Our section of this list runs from the cumulative count of the
 +         * previous target thread (0 for the first) up to our own.
 +         */
 +        first = (thread > 0) ? list->n[thread-1] : 0;
 +        last  = list->n[thread];
 +
 +        k = first;
 +        while (k < last)
 +        {
 +            spline->ind[ncollected] = list->i[k];
 +            ncollected++;
 +            k++;
 +        }
 +    }
 +
 +    spline->n = ncollected;
 +}
 +
 +
 +/* Assign atoms start..end-1 to PME slabs along the decomposition
 + * dimension of atc (atc->dimind equal to 0 selects x, anything else y).
 + *
 + * The slab index of each atom is stored in atc->pd and the number of
 + * atoms per slab is counted in count, which is zeroed first.
 + * All slabs get the same width in fractional coordinates for load
 + * balancing reasons; the PME grid spacing is not used.
 + */
 +static void pme_calc_pidx(int start, int end,
 +                          matrix recipbox, rvec x[],
 +                          pme_atomcomm_t *atc, int *count)
 +{
 +    int  nslab,slab,a;
 +    real frac;
 +    real *pos;
 +    real rxx,ryx,rzx,ryy,rzy;
 +    int  *pd;
 +
 +    nslab = atc->nslab;
 +    pd    = atc->pd;
 +
 +    /* Start from empty slabs */
 +    for(slab=0; slab<nslab; slab++)
 +    {
 +        count[slab] = 0;
 +    }
 +
 +    if (atc->dimind != 0)
 +    {
 +        /* Slab index in the y-dimension */
 +        ryy = recipbox[YY][YY];
 +        rzy = recipbox[ZZ][YY];
 +        for(a=start; a<end; a++)
 +        {
 +            pos  = x[a];
 +            /* Fractional coordinate along the box vector, scaled to slabs */
 +            frac = nslab*(pos[YY]*ryy + pos[ZZ]*rzy);
 +            /* Shift by 2*nslab before truncating and wrapping into 0..nslab-1 */
 +            slab = (int)(frac + 2*nslab) % nslab;
 +            pd[a] = slab;
 +            count[slab]++;
 +        }
 +        return;
 +    }
 +
 +    /* Slab index in the x-dimension */
 +    rxx = recipbox[XX][XX];
 +    ryx = recipbox[YY][XX];
 +    rzx = recipbox[ZZ][XX];
 +    for(a=start; a<end; a++)
 +    {
 +        pos  = x[a];
 +        /* Fractional coordinate along the box vector, scaled to slabs */
 +        frac = nslab*(pos[XX]*rxx + pos[YY]*ryx + pos[ZZ]*rzx);
 +        /* Shift by 2*nslab before truncating and wrapping into 0..nslab-1 */
 +        slab = (int)(frac + 2*nslab) % nslab;
 +        pd[a] = slab;
 +        count[slab]++;
 +    }
 +}
 +
 +/* Compute the slab index of all natoms atoms using the atc->nthread
 + * threads of atc.
 + *
 + * Each thread runs pme_calc_pidx on an equal share of the atoms and
 + * counts into its own atc->count_thread[thread]; afterwards the counts of
 + * all threads are summed into atc->count_thread[0].
 + */
 +static void pme_calc_pidx_wrapper(int natoms, matrix recipbox, rvec x[],
 +                                  pme_atomcomm_t *atc)
 +{
 +    int nthread,thread,slab;
 +
 +    nthread = atc->nthread;
 +
 +#pragma omp parallel for num_threads(nthread) schedule(static)
 +    for(thread=0; thread<nthread; thread++)
 +    {
 +        /* Atom range [natoms*thread/nthread, natoms*(thread+1)/nthread) */
 +        pme_calc_pidx((natoms*thread)/nthread,
 +                      (natoms*(thread+1))/nthread,
 +                      recipbox,x,atc,atc->count_thread[thread]);
 +    }
 +
 +    /* Serial reduction per slab, since nslab is small */
 +    for(slab=0; slab<atc->nslab; slab++)
 +    {
 +        for(thread=1; thread<nthread; thread++)
 +        {
 +            atc->count_thread[0][slab] += atc->count_thread[thread][slab];
 +        }
 +    }
 +}
 +
 +/* Resize the arrays of spline to the current allocation size of atc:
 + * atc->nalloc particle indices and, for each dimension,
 + * atc->pme_order*atc->nalloc theta and dtheta values.
 + * All indices are (re)set to the identity mapping.
 + */
 +static void pme_realloc_splinedata(splinedata_t *spline, pme_atomcomm_t *atc)
 +{
 +    int d,idx;
 +    int natom_alloc,nspline_alloc;
 +
 +    natom_alloc   = atc->nalloc;
 +    nspline_alloc = atc->pme_order*natom_alloc;
 +
 +    srenew(spline->ind,natom_alloc);
 +    /* An identity index makes the data usable without threads */
 +    idx = 0;
 +    while (idx < natom_alloc)
 +    {
 +        spline->ind[idx] = idx;
 +        idx++;
 +    }
 +
 +    for(d=0; d<DIM; d++)
 +    {
 +        srenew(spline->theta[d],nspline_alloc);
 +        srenew(spline->dtheta[d],nspline_alloc);
 +    }
 +}
 +
 +/* Make sure the per-atom arrays of atc can hold atc->n atoms.
 + *
 + * Nothing happens when the current allocation is non-empty and large
 + * enough. Otherwise atc->nalloc is increased (to at least one element)
 + * and:
 + *  - with more than one slab, x, q and f are resized and the newly added
 + *    part of f is cleared;
 + *  - when the coordinates are used for spreading, fractx, idx, the spline
 + *    data of every thread and, with more than one thread, thread_idx are
 + *    resized.
 + */
 +static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
 +{
 +    int nalloc_old,i;
 +
 +    /* We have to avoid a NULL pointer for atc->x to avoid
 +     * possible fatal errors in MPI routines.
 +     */
 +    if (atc->n > atc->nalloc || atc->nalloc == 0)
 +    {
 +        nalloc_old = atc->nalloc;
 +        atc->nalloc = over_alloc_dd(max(atc->n,1));
 +
 +        if (atc->nslab > 1) {
 +            srenew(atc->x,atc->nalloc);
 +            srenew(atc->q,atc->nalloc);
 +            srenew(atc->f,atc->nalloc);
 +            /* Only the newly added force entries are cleared */
 +            for(i=nalloc_old; i<atc->nalloc; i++)
 +            {
 +                clear_rvec(atc->f[i]);
 +            }
 +        }
 +        if (atc->bSpread) {
 +            srenew(atc->fractx,atc->nalloc);
 +            srenew(atc->idx   ,atc->nalloc);
 +
 +            /* thread_idx is only used with more than one thread */
 +            if (atc->nthread > 1)
 +            {
 +                srenew(atc->thread_idx,atc->nalloc);
 +            }
 +
 +            for(i=0; i<atc->nthread; i++)
 +            {
 +                pme_realloc_splinedata(&atc->spline[i],atc);
 +            }
 +        }
 +    }
 +}
 +
 +/* Redistribute particle data for the PME calculation over the slabs of
 + * atc with MPI all-to-all communication, using atc->pd as the destination
 + * slab of each of the n local particles.
 + *
 + * forw && bXF : the send/receive counts and displacements are (re)computed,
 + *               atc->n is set to the number of particles to receive, the
 + *               atc arrays are resized and the coordinates in x_f are
 + *               sent to atc->x.
 + * forw        : the charges are sent to atc->q, using the counts and
 + *               displacements stored in pme by the last call with bXF set.
 + * !forw       : the forces in atc->f are sent back and added to x_f.
 + *
 + * The work arrays in pme are allocated on the first call. Without GMX_MPI
 + * only this allocation and the buffer resizing take place.
 + */
 +static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
 +                         int n, gmx_bool bXF, rvec *x_f, real *charge,
 +                         pme_atomcomm_t *atc)
 +/* Redistribute particle data for PME calculation */
 +/* domain decomposition by x coordinate           */
 +{
 +    int i, ii;
 +
 +    if(FALSE == pme->redist_init) {
 +        snew(pme->scounts,atc->nslab);
 +        snew(pme->rcounts,atc->nslab);
 +        snew(pme->sdispls,atc->nslab);
 +        snew(pme->rdispls,atc->nslab);
 +        snew(pme->sidx,atc->nslab);
 +        pme->redist_init = TRUE;
 +    }
 +    /* The buffer holds DIM reals per particle, enough for rvec data */
 +    if (n > pme->redist_buf_nalloc) {
 +        pme->redist_buf_nalloc = over_alloc_dd(n);
 +        srenew(pme->redist_buf,pme->redist_buf_nalloc*DIM);
 +    }
 +
 +    pme->idxa = atc->pd;
 +
 +#ifdef GMX_MPI
 +    if (forw && bXF) {
 +        /* forward, redistribution from pp to pme */
 +
 +        /* Calculate send counts and exchange them with other nodes */
 +        for(i=0; (i<atc->nslab); i++) pme->scounts[i]=0;
 +        for(i=0; (i<n); i++) pme->scounts[pme->idxa[i]]++;
 +        MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);
 +
 +        /* Calculate send and receive displacements and index into send
 +           buffer */
 +        pme->sdispls[0]=0;
 +        pme->rdispls[0]=0;
 +        pme->sidx[0]=0;
 +        for(i=1; i<atc->nslab; i++) {
 +            pme->sdispls[i]=pme->sdispls[i-1]+pme->scounts[i-1];
 +            pme->rdispls[i]=pme->rdispls[i-1]+pme->rcounts[i-1];
 +            pme->sidx[i]=pme->sdispls[i];
 +        }
 +        /* Total # of particles to be received */
 +        atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];
 +
 +        pme_realloc_atomcomm_things(atc);
 +
 +        /* Copy particle coordinates into send buffer and exchange*/
 +        for(i=0; (i<n); i++) {
 +            ii=DIM*pme->sidx[pme->idxa[i]];
 +            pme->sidx[pme->idxa[i]]++;
 +            pme->redist_buf[ii+XX]=x_f[i][XX];
 +            pme->redist_buf[ii+YY]=x_f[i][YY];
 +            pme->redist_buf[ii+ZZ]=x_f[i][ZZ];
 +        }
 +        MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
 +                      pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
 +                      pme->rvec_mpi, atc->mpi_comm);
 +    }
 +    if (forw) {
 +        /* Copy charge into send buffer and exchange*/
 +        /* sidx was advanced while packing; restart from the displacements */
 +        for(i=0; i<atc->nslab; i++) pme->sidx[i]=pme->sdispls[i];
 +        for(i=0; (i<n); i++) {
 +            ii=pme->sidx[pme->idxa[i]];
 +            pme->sidx[pme->idxa[i]]++;
 +            pme->redist_buf[ii]=charge[i];
 +        }
 +        MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, mpi_type,
 +                      atc->q, pme->rcounts, pme->rdispls, mpi_type,
 +                      atc->mpi_comm);
 +    }
 +    else { /* backward, redistribution from pme to pp */
 +        /* Send and receive roles of the counts are swapped here */
 +        MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
 +                      pme->redist_buf, pme->scounts, pme->sdispls,
 +                      pme->rvec_mpi, atc->mpi_comm);
 +
 +        /* Copy data from receive buffer */
 +        for(i=0; i<atc->nslab; i++)
 +            pme->sidx[i] = pme->sdispls[i];
 +        for(i=0; (i<n); i++) {
 +            ii = DIM*pme->sidx[pme->idxa[i]];
 +            x_f[i][XX] += pme->redist_buf[ii+XX];
 +            x_f[i][YY] += pme->redist_buf[ii+YY];
 +            x_f[i][ZZ] += pme->redist_buf[ii+ZZ];
 +            pme->sidx[pme->idxa[i]]++;
 +        }
 +    }
 +#endif
 +}
 +
 +/* Exchange raw bytes with the neighbors of communication pulse 'shift'.
 + *
 + * In the forward direction nbyte_s bytes from buf_s go to
 + * atc->node_dest[shift] and nbyte_r bytes are received into buf_r from
 + * atc->node_src[shift]; with bBackward set the two nodes swap roles.
 + * A side with a byte count of zero (or less) is skipped, so the call
 + * results in a combined send+receive, a plain send, a plain receive or
 + * nothing at all. 'shift' also serves as the message tag.
 + * Without GMX_MPI this function does nothing.
 + */
 +static void pme_dd_sendrecv(pme_atomcomm_t *atc,
 +                            gmx_bool bBackward,int shift,
 +                            void *buf_s,int nbyte_s,
 +                            void *buf_r,int nbyte_r)
 +{
 +#ifdef GMX_MPI
 +    int        rank_to,rank_from;
 +    gmx_bool   bSend,bRecv;
 +    MPI_Status status;
 +
 +    if (bBackward)
 +    {
 +        rank_to   = atc->node_src[shift];
 +        rank_from = atc->node_dest[shift];
 +    }
 +    else
 +    {
 +        rank_to   = atc->node_dest[shift];
 +        rank_from = atc->node_src[shift];
 +    }
 +
 +    bSend = (nbyte_s > 0);
 +    bRecv = (nbyte_r > 0);
 +
 +    if (bSend && bRecv)
 +    {
 +        MPI_Sendrecv(buf_s,nbyte_s,MPI_BYTE,
 +                     rank_to,shift,
 +                     buf_r,nbyte_r,MPI_BYTE,
 +                     rank_from,shift,
 +                     atc->mpi_comm,&status);
 +    }
 +    else if (bSend)
 +    {
 +        MPI_Send(buf_s,nbyte_s,MPI_BYTE,
 +                 rank_to,shift,
 +                 atc->mpi_comm);
 +    }
 +    else if (bRecv)
 +    {
 +        MPI_Recv(buf_r,nbyte_r,MPI_BYTE,
 +                 rank_from,shift,
 +                 atc->mpi_comm,&status);
 +    }
 +#endif
 +}
 +
 +/* Redistribute the charges and, when bX is set, also the coordinates of
 + * the n local particles over the PME nodes along the dimension of atc,
 + * using atc->pd as the destination slab of each particle.
 + *
 + * Particles that stay on this node are copied directly into atc->x and
 + * atc->q; the others are packed per destination node into pme->bufv and
 + * pme->bufr and exchanged with pme_dd_sendrecv, one pulse per
 + * communication partner.
 + *
 + * With bX set the function first checks that every particle has a
 + * destination among the communication partners (fatal error otherwise),
 + * exchanges the particle counts, sets atc->n and resizes the buffers.
 + * With bX unset the counts and allocations of an earlier call with bX set
 + * are reused.
 + */
 +static void dd_pmeredist_x_q(gmx_pme_t pme,
 +                             int n, gmx_bool bX, rvec *x, real *charge,
 +                             pme_atomcomm_t *atc)
 +{
 +    int *commnode,*buf_index;
 +    int nnodes_comm,i,nsend,local_pos,buf_pos,node,scount,rcount;
 +
 +    commnode  = atc->node_dest;
 +    buf_index = atc->buf_index;
 +
 +    nnodes_comm = min(2*atc->maxshift,atc->nslab-1);
 +
 +    /* Determine the start in the send buffers for each destination node
 +     * and the total number of particles to send.
 +     */
 +    nsend = 0;
 +    for(i=0; i<nnodes_comm; i++) {
 +        buf_index[commnode[i]] = nsend;
 +        nsend += atc->count[commnode[i]];
 +    }
 +    if (bX) {
 +        /* Local plus sent particles should account for all n particles */
 +        if (atc->count[atc->nodeid] + nsend != n)
 +            gmx_fatal(FARGS,"%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
 +                      "This usually means that your system is not well equilibrated.",
 +                      n - (atc->count[atc->nodeid] + nsend),
 +                      pme->nodeid,'x'+atc->dimind);
 +
 +        if (nsend > pme->buf_nalloc) {
 +            pme->buf_nalloc = over_alloc_dd(nsend);
 +            srenew(pme->bufv,pme->buf_nalloc);
 +            srenew(pme->bufr,pme->buf_nalloc);
 +        }
 +
 +        /* The new local count: what stays here plus what we receive */
 +        atc->n = atc->count[atc->nodeid];
 +        for(i=0; i<nnodes_comm; i++) {
 +            scount = atc->count[commnode[i]];
 +            /* Communicate the count */
 +            if (debug)
 +                fprintf(debug,"dimind %d PME node %d send to node %d: %d\n",
 +                        atc->dimind,atc->nodeid,commnode[i],scount);
 +            pme_dd_sendrecv(atc,FALSE,i,
 +                            &scount,sizeof(int),
 +                            &atc->rcount[i],sizeof(int));
 +            atc->n += atc->rcount[i];
 +        }
 +
 +        pme_realloc_atomcomm_things(atc);
 +    }
 +
 +    local_pos = 0;
 +    for(i=0; i<n; i++) {
 +        node = atc->pd[i];
 +        if (node == atc->nodeid) {
 +            /* Copy direct to the receive buffer */
 +            if (bX) {
 +                copy_rvec(x[i],atc->x[local_pos]);
 +            }
 +            atc->q[local_pos] = charge[i];
 +            local_pos++;
 +        } else {
 +            /* Copy to the send buffer */
 +            if (bX) {
 +                copy_rvec(x[i],pme->bufv[buf_index[node]]);
 +            }
 +            pme->bufr[buf_index[node]] = charge[i];
 +            buf_index[node]++;
 +        }
 +    }
 +
 +    /* Received particles are stored after the local ones: local_pos now
 +     * equals the number of particles that stayed on this node.
 +     */
 +    buf_pos = 0;
 +    for(i=0; i<nnodes_comm; i++) {
 +        scount = atc->count[commnode[i]];
 +        rcount = atc->rcount[i];
 +        if (scount > 0 || rcount > 0) {
 +            if (bX) {
 +                /* Communicate the coordinates */
 +                pme_dd_sendrecv(atc,FALSE,i,
 +                                pme->bufv[buf_pos],scount*sizeof(rvec),
 +                                atc->x[local_pos],rcount*sizeof(rvec));
 +            }
 +            /* Communicate the charges */
 +            pme_dd_sendrecv(atc,FALSE,i,
 +                            pme->bufr+buf_pos,scount*sizeof(real),
 +                            atc->q+local_pos,rcount*sizeof(real));
 +            buf_pos   += scount;
 +            local_pos += atc->rcount[i];
 +        }
 +    }
 +}
 +
 +/* Return the forces computed on the PME nodes to the n particles of the
 + * caller; the reverse of the redistribution done by dd_pmeredist_x_q.
 + *
 + * First, for each communication pulse, the forces of the particles that
 + * were received from a neighbor are sent back from atc->f, while the
 + * forces of the particles that were sent away are received in pme->bufv.
 + * Then, following atc->pd, the force of every particle is taken from
 + * atc->f (particle stayed on this node) or from pme->bufv and either
 + * added to f (bAddF set) or copied into f.
 + */
 +static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
 +                           int n, rvec *f,
 +                           gmx_bool bAddF)
 +{
 +    int  *commnode,*buf_index;
 +    int  nnodes_comm,pulse,a,dest;
 +    int  nlocal,nbuf,nsend,nrecv;
 +    real *src;
 +
 +    commnode  = atc->node_dest;
 +    buf_index = atc->buf_index;
 +
 +    nnodes_comm = min(2*atc->maxshift,atc->nslab-1);
 +
 +    /* In atc->f the forces of received particles follow the local ones */
 +    nlocal = atc->count[atc->nodeid];
 +    nbuf   = 0;
 +    for(pulse=0; pulse<nnodes_comm; pulse++)
 +    {
 +        /* Counts are mirrored with respect to the forward communication */
 +        nsend = atc->rcount[pulse];
 +        nrecv = atc->count[commnode[pulse]];
 +        if (nsend > 0 || nrecv > 0)
 +        {
 +            /* Communicate the forces */
 +            pme_dd_sendrecv(atc,TRUE,pulse,
 +                            atc->f[nlocal],nsend*sizeof(rvec),
 +                            pme->bufv[nbuf],nrecv*sizeof(rvec));
 +            nlocal += nsend;
 +        }
 +        /* Remember where the forces from this node start in the buffer */
 +        buf_index[commnode[pulse]] = nbuf;
 +        nbuf += nrecv;
 +    }
 +
 +    nlocal = 0;
 +    for(a=0; a<n; a++)
 +    {
 +        dest = atc->pd[a];
 +        if (dest == atc->nodeid)
 +        {
 +            /* Take the force from the local force array */
 +            src = atc->f[nlocal];
 +            nlocal++;
 +        }
 +        else
 +        {
 +            /* Take the force from the receive buffer */
 +            src = pme->bufv[buf_index[dest]];
 +            buf_index[dest]++;
 +        }
 +
 +        if (bAddF)
 +        {
 +            rvec_inc(f[a],src);
 +        }
 +        else
 +        {
 +            copy_rvec(src,f[a]);
 +        }
 +    }
 +}
 +
 +#ifdef GMX_MPI
 +static void
 +gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
 +{
 +    pme_overlap_t *overlap;
 +    int send_index0,send_nindex;
 +    int recv_index0,recv_nindex;
 +    MPI_Status stat;
 +    int i,j,k,ix,iy,iz,icnt;
 +    int ipulse,send_id,recv_id,datasize;
 +    real *p;
 +    real *sendptr,*recvptr;
 +
 +    /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
 +    overlap = &pme->overlap[1];
 +
 +    for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
 +    {
 +        /* Since we have already (un)wrapped the overlap in the z-dimension,
 +         * we only have to communicate 0 to nkz (not pmegrid_nz).
 +         */
 +        if (direction==GMX_SUM_QGRID_FORWARD)
 +        {
 +            send_id = overlap->send_id[ipulse];
 +            recv_id = overlap->recv_id[ipulse];
 +            send_index0   = overlap->comm_data[ipulse].send_index0;
 +            send_nindex   = overlap->comm_data[ipulse].send_nindex;
 +            recv_index0   = overlap->comm_data[ipulse].recv_index0;
 +            recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
 +        }
 +        else
 +        {
 +            send_id = overlap->recv_id[ipulse];
 +            recv_id = overlap->send_id[ipulse];
 +            send_index0   = overlap->comm_data[ipulse].recv_index0;
 +            send_nindex   = overlap->comm_data[ipulse].recv_nindex;
 +            recv_index0   = overlap->comm_data[ipulse].send_index0;
 +            recv_nindex   = overlap->comm_data[ipulse].send_nindex;
 +        }
 +
 +        /* Copy data to contiguous send buffer */
 +        if (debug)
 +        {
 +            fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
 +                    pme->nodeid,overlap->nodeid,send_id,
 +                    pme->pmegrid_start_iy,
 +                    send_index0-pme->pmegrid_start_iy,
 +                    send_index0-pme->pmegrid_start_iy+send_nindex);
 +        }
 +        icnt = 0;
 +        for(i=0;i<pme->pmegrid_nx;i++)
 +        {
 +            ix = i;
 +            for(j=0;j<send_nindex;j++)
 +            {
 +                iy = j + send_index0 - pme->pmegrid_start_iy;
 +                for(k=0;k<pme->nkz;k++)
 +                {
 +                    iz = k;
 +                    overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
 +                }
 +            }
 +        }
 +
 +        datasize      = pme->pmegrid_nx * pme->nkz;
 +
 +        MPI_Sendrecv(overlap->sendbuf,send_nindex*datasize,GMX_MPI_REAL,
 +                     send_id,ipulse,
 +                     overlap->recvbuf,recv_nindex*datasize,GMX_MPI_REAL,
 +                     recv_id,ipulse,
 +                     overlap->mpi_comm,&stat);
 +
 +        /* Get data from contiguous recv buffer */
 +        if (debug)
 +        {
 +            fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
 +                    pme->nodeid,overlap->nodeid,recv_id,
 +                    pme->pmegrid_start_iy,
 +                    recv_index0-pme->pmegrid_start_iy,
 +                    recv_index0-pme->pmegrid_start_iy+recv_nindex);
 +        }
 +        icnt = 0;
 +        for(i=0;i<pme->pmegrid_nx;i++)
 +        {
 +            ix = i;
 +            for(j=0;j<recv_nindex;j++)
 +            {
 +                iy = j + recv_index0 - pme->pmegrid_start_iy;
 +                for(k=0;k<pme->nkz;k++)
 +                {
 +                    iz = k;
 +                    if(direction==GMX_SUM_QGRID_FORWARD)
 +                    {
 +                        grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
 +                    }
 +                    else
 +                    {
 +                        grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz]  = overlap->recvbuf[icnt++];
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    /* Major dimension is easier, no copying required,
 +     * but we might have to sum to separate array.
 +     * Since we don't copy, we have to communicate up to pmegrid_nz,
 +     * not nkz as for the minor direction.
 +     */
 +    overlap = &pme->overlap[0];
 +
 +    for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
 +    {
 +        if(direction==GMX_SUM_QGRID_FORWARD)
 +        {
 +            send_id = overlap->send_id[ipulse];
 +            recv_id = overlap->recv_id[ipulse];
 +            send_index0   = overlap->comm_data[ipulse].send_index0;
 +            send_nindex   = overlap->comm_data[ipulse].send_nindex;
 +            recv_index0   = overlap->comm_data[ipulse].recv_index0;
 +            recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
 +            recvptr   = overlap->recvbuf;
 +        }
 +        else
 +        {
 +            send_id = overlap->recv_id[ipulse];
 +            recv_id = overlap->send_id[ipulse];
 +            send_index0   = overlap->comm_data[ipulse].recv_index0;
 +            send_nindex   = overlap->comm_data[ipulse].recv_nindex;
 +            recv_index0   = overlap->comm_data[ipulse].send_index0;
 +            recv_nindex   = overlap->comm_data[ipulse].send_nindex;
 +            recvptr   = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
 +        }
 +
 +        sendptr       = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
 +        datasize      = pme->pmegrid_ny * pme->pmegrid_nz;
 +
 +        if (debug)
 +        {
 +            fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
 +                    pme->nodeid,overlap->nodeid,send_id,
 +                    pme->pmegrid_start_ix,
 +                    send_index0-pme->pmegrid_start_ix,
 +                    send_index0-pme->pmegrid_start_ix+send_nindex);
 +            fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
 +                    pme->nodeid,overlap->nodeid,recv_id,
 +                    pme->pmegrid_start_ix,
 +                    recv_index0-pme->pmegrid_start_ix,
 +                    recv_index0-pme->pmegrid_start_ix+recv_nindex);
 +        }
 +
 +        MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
 +                     send_id,ipulse,
 +                     recvptr,recv_nindex*datasize,GMX_MPI_REAL,
 +                     recv_id,ipulse,
 +                     overlap->mpi_comm,&stat);
 +
 +        /* ADD data from contiguous recv buffer */
 +        if(direction==GMX_SUM_QGRID_FORWARD)
 +        {
 +            p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
 +            for(i=0;i<recv_nindex*datasize;i++)
 +            {
 +                p[i] += overlap->recvbuf[i];
 +            }
 +        }
 +    }
 +}
 +#endif
 +
 +
 +/* Copy the local PME grid into the FFT grid.
 + *
 + * The local FFT grid limits are queried from pme->pfft_setupA. For every
 + * point (ix,iy,iz) with ix < local_fft_ndata[XX] etc. the value is copied
 + * from pmegrid to fftgrid; both grids are indexed from 0 but each uses its
 + * own y/z strides (local_pme_size for pmegrid, local_fft_size for fftgrid).
 + *
 + * With DEBUG_PME defined, every non-zero value is additionally written to
 + * pmegrid<nodeid>.pdb and pmegrid<nodeid>.txt.
 + *
 + * Always returns 0.
 + */
 +static int
 +copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
 +{
 +    ivec    local_fft_ndata,local_fft_offset,local_fft_size;
 +    ivec    local_pme_size;
 +    /* NOTE(review): i is not used in the visible code */
 +    int     i,ix,iy,iz;
 +    int     pmeidx,fftidx;
 +
 +    /* Dimensions should be identical for A/B grid, so we just use A here */
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
 +                                   local_fft_ndata,
 +                                   local_fft_offset,
 +                                   local_fft_size);
 +
 +    /* Allocated dimensions of the PME grid; only the y and z sizes are
 +     * needed below, as strides for the linear index.
 +     */
 +    local_pme_size[0] = pme->pmegrid_nx;
 +    local_pme_size[1] = pme->pmegrid_ny;
 +    local_pme_size[2] = pme->pmegrid_nz;
 +
 +    /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
 +     the offset is identical, and the PME grid always has more data (due to overlap)
 +     */
 +    {
 +#ifdef DEBUG_PME
 +        /* Debug output files: fp receives PDB-formatted records,
 +         * fp2 receives one text line per non-zero grid point.
 +         */
 +        FILE *fp,*fp2;
 +        char fn[STRLEN],format[STRLEN];
 +        real val;
 +        sprintf(fn,"pmegrid%d.pdb",pme->nodeid);
 +        fp = ffopen(fn,"w");
 +        sprintf(fn,"pmegrid%d.txt",pme->nodeid);
 +        fp2 = ffopen(fn,"w");
 +     sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
 +#endif
 +
 +    for(ix=0;ix<local_fft_ndata[XX];ix++)
 +    {
 +        for(iy=0;iy<local_fft_ndata[YY];iy++)
 +        {
 +            for(iz=0;iz<local_fft_ndata[ZZ];iz++)
 +            {
 +                /* Same (ix,iy,iz) point, linearized with each grid's own strides */
 +                pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
 +                fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
 +                fftgrid[fftidx] = pmegrid[pmeidx];
 +#ifdef DEBUG_PME
 +                /* Dump non-zero points; the value is scaled by 100 for the PDB record */
 +                val = 100*pmegrid[pmeidx];
 +                if (pmegrid[pmeidx] != 0)
 +                fprintf(fp,format,"ATOM",pmeidx,"CA","GLY",' ',pmeidx,' ',
 +                        5.0*ix,5.0*iy,5.0*iz,1.0,val);
 +                if (pmegrid[pmeidx] != 0)
 +                    fprintf(fp2,"%-12s  %5d  %5d  %5d  %12.5e\n",
 +                            "qgrid",
 +                            pme->pmegrid_start_ix + ix,
 +                            pme->pmegrid_start_iy + iy,
 +                            pme->pmegrid_start_iz + iz,
 +                            pmegrid[pmeidx]);
 +#endif
 +            }
 +        }
 +    }
 +#ifdef DEBUG_PME
 +    ffclose(fp);
 +    ffclose(fp2);
 +#endif
 +    }
 +    return 0;
 +}
 +
 +
 +/* Start a timed region: return the current value of gmx_cycles_read().
 + * The result is meant to be passed to omp_cyc_end().
 + * The parameter list is (void) so the function has a proper prototype
 + * and calls with stray arguments are rejected by the compiler.
 + */
 +static gmx_cycles_t omp_cyc_start(void)
 +{
 +    return gmx_cycles_read();
 +}
 +
 +/* End a timed region: return the difference between the current value of
 + * gmx_cycles_read() and the start value c.
 + */
 +static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
 +{
 +    gmx_cycles_t now;
 +
 +    now = gmx_cycles_read();
 +
 +    return now - c;
 +}
 +
 +
 +/* Copy this thread's share of the FFT grid back into the PME grid.
 + *
 + * The (x,y) lines of the local FFT grid are numbered consecutively and
 + * divided evenly over nthread threads; thread 'thread' copies lines
 + * [thread*nline/nthread, (thread+1)*nline/nthread). Each line holds
 + * the first fft_ndata[ZZ] elements along z. Source and destination use
 + * their own y/z strides. Always returns 0.
 + *
 + * With PME_TIME_THREADS defined, the accumulated cycle count is printed
 + * every 20 calls.
 + */
 +static int
 +copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid,
 +                        int nthread,int thread)
 +{
 +    ivec        fft_ndata,fft_offset,fft_size;
 +    int         pme_ny,pme_nz;
 +    int         nline,line,line_begin,line_end;
 +    int         ix,iy,iz;
 +    const real *src;
 +    real       *dst;
 +#ifdef PME_TIME_THREADS
 +    gmx_cycles_t c1;
 +    static double cs1=0;
 +    static int cnt=0;
 +
 +    c1 = omp_cyc_start();
 +#endif
 +
 +    /* The A and B grids share their dimensions, so setup A is queried */
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
 +                                   fft_ndata,fft_offset,fft_size);
 +
 +    /* Strides of the PME grid, which is at least as large as the FFT grid
 +     * and shares its lower-left corner.
 +     */
 +    pme_ny = pme->pmegrid_ny;
 +    pme_nz = pme->pmegrid_nz;
 +
 +    /* Range of (x,y) lines handled by this thread */
 +    nline      = fft_ndata[XX]*fft_ndata[YY];
 +    line_begin = (thread*nline)/nthread;
 +    line_end   = ((thread + 1)*nline)/nthread;
 +
 +    for(line=line_begin; line<line_end; line++)
 +    {
 +        ix = line/fft_ndata[YY];
 +        iy = line - ix*fft_ndata[YY];
 +
 +        src = fftgrid + (ix*fft_size[YY] + iy)*fft_size[ZZ];
 +        dst = pmegrid + (ix*pme_ny + iy)*pme_nz;
 +
 +        for(iz=0; iz<fft_ndata[ZZ]; iz++)
 +        {
 +            dst[iz] = src[iz];
 +        }
 +    }
 +
 +#ifdef PME_TIME_THREADS
 +    c1 = omp_cyc_end(c1);
 +    cs1 += (double)c1;
 +    cnt++;
 +    if (cnt % 20 == 0)
 +    {
 +        printf("copy %.2f\n",cs1*1e-9);
 +    }
 +#endif
 +
 +    return 0;
 +}
 +
 +
 +/* Fold the periodic overlap region of the PME grid back onto its start.
 + *
 + * The grid carries pme_order-1 extra lines beyond nkx/nky/nkz. These are
 + * added onto the first lines of the grid, first along z, then along y
 + * (only when nnodes_minor == 1) and finally along x (only when
 + * nnodes_major == 1). The order of these three passes matters, since
 + * later passes read values that earlier passes have already summed.
 + */
 +static void
 +wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
 +{
 +    int   nkx,nky,nkz;
 +    int   stride_x,stride_y;
 +    int   halo,ylimit;
 +    int   ix,iy,iz;
 +    real *dst,*src;
 +
 +    nkx = pme->nkx;
 +    nky = pme->nky;
 +    nkz = pme->nkz;
 +
 +    /* Linear index of (ix,iy,iz) is ix*stride_x + iy*stride_y + iz */
 +    stride_y = pme->pmegrid_nz;
 +    stride_x = pme->pmegrid_ny*pme->pmegrid_nz;
 +
 +    halo = pme->pme_order - 1;
 +
 +    /* Pass 1: sum the overlap in z, for all x and y lines */
 +    for(ix=0; ix<pme->pmegrid_nx; ix++)
 +    {
 +        for(iy=0; iy<pme->pmegrid_ny; iy++)
 +        {
 +            dst = pmegrid + ix*stride_x + iy*stride_y;
 +            for(iz=0; iz<halo; iz++)
 +            {
 +                dst[iz] += dst[nkz+iz];
 +            }
 +        }
 +    }
 +
 +    /* Pass 2: sum the overlap in y, only without decomposition along y */
 +    if (pme->nnodes_minor == 1)
 +    {
 +        for(ix=0; ix<pme->pmegrid_nx; ix++)
 +        {
 +            for(iy=0; iy<halo; iy++)
 +            {
 +                dst = pmegrid + ix*stride_x + iy*stride_y;
 +                src = dst + nky*stride_y;
 +                for(iz=0; iz<nkz; iz++)
 +                {
 +                    dst[iz] += src[iz];
 +                }
 +            }
 +        }
 +    }
 +
 +    /* Pass 3: sum the overlap in x, only without decomposition along x */
 +    if (pme->nnodes_major == 1)
 +    {
 +        /* With y already wrapped only nky lines are relevant,
 +         * otherwise the full allocated y range is processed.
 +         */
 +        if (pme->nnodes_minor == 1)
 +        {
 +            ylimit = nky;
 +        }
 +        else
 +        {
 +            ylimit = pme->pmegrid_ny;
 +        }
 +
 +        for(ix=0; ix<halo; ix++)
 +        {
 +            for(iy=0; iy<ylimit; iy++)
 +            {
 +                dst = pmegrid + ix*stride_x + iy*stride_y;
 +                src = dst + nkx*stride_x;
 +                for(iz=0; iz<nkz; iz++)
 +                {
 +                    dst[iz] += src[iz];
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +
 +/* Replicate the start of the PME grid into its periodic overlap region.
 + *
 + * This is the counterpart of wrap_periodic_pmegrid(): instead of summing,
 + * the first pme_order-1 lines are copied to the lines beyond nkx/nky/nkz.
 + * The passes run in the opposite order: x first (only when
 + * nnodes_major == 1), then y (only when nnodes_minor == 1), then z.
 + * The y and z passes are parallelized over ix with OpenMP using
 + * pme->nthread threads.
 + */
 +static void
 +unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
 +{
 +    /* NOTE(review): pnx is assigned but not used in the visible code */
 +    int     nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix;
 +
 +    /* Number of grid points of the full grid */
 +    nx = pme->nkx;
 +    ny = pme->nky;
 +    nz = pme->nkz;
 +
 +    /* Allocated dimensions of the local grid; pny and pnz act as strides */
 +    pnx = pme->pmegrid_nx;
 +    pny = pme->pmegrid_ny;
 +    pnz = pme->pmegrid_nz;
 +
 +    /* Number of extra lines per dimension */
 +    overlap = pme->pme_order - 1;
 +
 +    if (pme->nnodes_major == 1)
 +    {
 +        /* With a single node along y only ny lines need copying here,
 +         * since the y pass below fills in the y overlap afterwards.
 +         */
 +        ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
 +
 +        for(ix=0; ix<overlap; ix++)
 +        {
 +            int iy,iz;
 +
 +            for(iy=0; iy<ny_x; iy++)
 +            {
 +                for(iz=0; iz<nz; iz++)
 +                {
 +                    pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
 +                        pmegrid[(ix*pny+iy)*pnz+iz];
 +                }
 +            }
 +        }
 +    }
 +
 +    if (pme->nnodes_minor == 1)
 +    {
 +#pragma omp parallel for num_threads(pme->nthread) schedule(static)
 +       for(ix=0; ix<pme->pmegrid_nx; ix++)
 +       {
 +           /* Declared inside the loop body so each thread has its own copy */
 +           int iy,iz;
 +
 +           for(iy=0; iy<overlap; iy++)
 +           {
 +               for(iz=0; iz<nz; iz++)
 +               {
 +                   pmegrid[(ix*pny+ny+iy)*pnz+iz] =
 +                       pmegrid[(ix*pny+iy)*pnz+iz];
 +               }
 +           }
 +       }
 +    }
 +
 +    /* Copy periodic overlap in z */
 +#pragma omp parallel for num_threads(pme->nthread) schedule(static)
 +    for(ix=0; ix<pme->pmegrid_nx; ix++)
 +    {
 +        /* Declared inside the loop body so each thread has its own copy */
 +        int iy,iz;
 +
 +        for(iy=0; iy<pme->pmegrid_ny; iy++)
 +        {
 +            for(iz=0; iz<overlap; iz++)
 +            {
 +                pmegrid[(ix*pny+iy)*pnz+nz+iz] =
 +                    pmegrid[(ix*pny+iy)*pnz+iz];
 +            }
 +        }
 +    }
 +}
 +
 +/* Zero the grid blocks around flag cell (fx,fy,fz) that are still unflagged.
 + *
 + * The grid (y/z strides ny,nz) is partitioned into blocks of
 + * FLBS x FLBS x FLBSZ points; flag[] holds one entry per block, laid out
 + * with dimensions fs[]. For each block in the range starting at (fx,fy,fz)
 + * and extending 2+(order-2)/FLBS blocks in x and y and 2+(order-2)/FLBSZ
 + * blocks in z, a block whose flag is 0 is zeroed and its flag set to 1.
 + */
 +static void clear_grid(int nx,int ny,int nz,real *grid,
 +                       ivec fs,int *flag,
 +                       int fx,int fy,int fz,
 +                       int order)
 +{
 +    const int nblock_xy = 2 + (order - 2)/FLBS;
 +    const int nblock_z  = 2 + (order - 2)/FLBSZ;
 +    int       bx,by,bz,cell;
 +    int       dx,dy,dz;
 +    real     *block;
 +
 +    for(bx=fx; bx<fx+nblock_xy; bx++)
 +    {
 +        for(by=fy; by<fy+nblock_xy; by++)
 +        {
 +            for(bz=fz; bz<fz+nblock_z; bz++)
 +            {
 +                cell = (bx*fs[YY] + by)*fs[ZZ] + bz;
 +                if (flag[cell] != 0)
 +                {
 +                    /* This block has been cleared before */
 +                    continue;
 +                }
 +
 +                /* First grid element of block (bx,by,bz) */
 +                block = grid + ((bx*FLBS)*ny + by*FLBS)*nz + bz*FLBSZ;
 +
 +                for(dx=0; dx<FLBS; dx++)
 +                {
 +                    for(dy=0; dy<FLBS; dy++)
 +                    {
 +                        for(dz=0; dz<FLBSZ; dz++)
 +                        {
 +                            block[(dx*ny + dy)*nz + dz] = 0;
 +                        }
 +                    }
 +                }
 +
 +                flag[cell] = 1;
 +            }
 +        }
 +    }
 +}
 +
 +/* Spread one charge onto the grid with order x order x order B-spline weights.
 + *
 + * The macro expands in place and relies on these names in the enclosing
 + * scope: loop counters ithx,ithy,ithz; grid base indices i0,j0,k0; grid
 + * strides pny,pnz; the charge qn; the spline weights thx,thy,thz; the
 + * scratch variables index_x,index_xy,index_xyz,valx,valxy; and the target
 + * array grid. Each grid point (i0+ithx, j0+ithy, k0+ithz) is incremented by
 + * qn*thx[ithx]*thy[ithy]*thz[ithz].
 + *
 + * This has to be a macro to enable full compiler optimization with xlC (and probably others too)
 + */
 +#define DO_BSPLINE(order)                            \
 +for(ithx=0; (ithx<order); ithx++)                    \
 +{                                                    \
 +    index_x = (i0+ithx)*pny*pnz;                     \
 +    valx    = qn*thx[ithx];                          \
 +                                                     \
 +    for(ithy=0; (ithy<order); ithy++)                \
 +    {                                                \
 +        valxy    = valx*thy[ithy];                   \
 +        index_xy = index_x+(j0+ithy)*pnz;            \
 +                                                     \
 +        for(ithz=0; (ithz<order); ithz++)            \
 +        {                                            \
 +            index_xyz        = index_xy+(k0+ithz);   \
 +            grid[index_xyz] += valxy*thz[ithz];      \
 +        }                                            \
 +    }                                                \
 +}
 +
 +
 +/* Spread the charges listed in spline onto one (thread-local) PME grid.
 + *
 + * The whole grid (n[XX]*n[YY]*n[ZZ] elements) is zeroed first. Then, for
 + * every entry nn of spline->ind with a non-zero charge atc->q[n], the charge
 + * is distributed over order^3 grid points starting at atc->idx[n] minus the
 + * grid offset, weighted with the spline coefficients spline->theta[d][nn*order..].
 + *
 + * For orders 4 and 5 the kernel comes from pme_sse_single.h when PME_SSE is
 + * defined; otherwise, and for all other orders, DO_BSPLINE is used.
 + */
 +static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
 +                                     pme_atomcomm_t *atc, splinedata_t *spline,
 +                                     pme_spline_work_t *work)
 +{
 +
 +    /* spread charges from home atoms to local grid */
 +    /* NOTE(review): ol, b, localsize and bndsize are not used in the visible
 +     * code, and neither is the work parameter; the textually included
 +     * pme_sse_single.h may reference locals of this function - confirm
 +     * before removing anything.
 +     */
 +    real     *grid;
 +    pme_overlap_t *ol;
 +    int      b,i,nn,n,ithx,ithy,ithz,i0,j0,k0;
 +    int *    idxptr;
 +    int      order,norder,index_x,index_xy,index_xyz;
 +    real     valx,valxy,qn;
 +    real     *thx,*thy,*thz;
 +    int      localsize, bndsize;
 +    int      pnx,pny,pnz,ndatatot;
 +    int      offx,offy,offz;
 +
 +    /* Allocated grid dimensions; pny and pnz are the index strides */
 +    pnx = pmegrid->n[XX];
 +    pny = pmegrid->n[YY];
 +    pnz = pmegrid->n[ZZ];
 +
 +    /* Offset subtracted from the atom grid indices below */
 +    offx = pmegrid->offset[XX];
 +    offy = pmegrid->offset[YY];
 +    offz = pmegrid->offset[ZZ];
 +
 +    /* Clear the complete grid before accumulating */
 +    ndatatot = pnx*pny*pnz;
 +    grid = pmegrid->grid;
 +    for(i=0;i<ndatatot;i++)
 +    {
 +        grid[i] = 0;
 +    }
 +
 +    order = pmegrid->order;
 +
 +    for(nn=0; nn<spline->n; nn++)
 +    {
 +        /* n indexes the atom arrays, nn indexes the spline data */
 +        n  = spline->ind[nn];
 +        qn = atc->q[n];
 +
 +        /* Uncharged atoms do not contribute */
 +        if (qn != 0)
 +        {
 +            idxptr = atc->idx[n];
 +            norder = nn*order;
 +
 +            /* Grid indices relative to this grid's origin */
 +            i0   = idxptr[XX] - offx;
 +            j0   = idxptr[YY] - offy;
 +            k0   = idxptr[ZZ] - offz;
 +
 +            /* 'order' spline weights per dimension for this entry */
 +            thx = spline->theta[XX] + norder;
 +            thy = spline->theta[YY] + norder;
 +            thz = spline->theta[ZZ] + norder;
 +
 +            /* Orders 4 and 5 get a compile-time constant order (or an SSE
 +             * kernel selected through the macros defined before the include).
 +             */
 +            switch (order) {
 +            case 4:
 +#ifdef PME_SSE
 +#ifdef PME_SSE_UNALIGNED
 +#define PME_SPREAD_SSE_ORDER4
 +#else
 +#define PME_SPREAD_SSE_ALIGNED
 +#define PME_ORDER 4
 +#endif
 +#include "pme_sse_single.h"
 +#else
 +                DO_BSPLINE(4);
 +#endif
 +                break;
 +            case 5:
 +#ifdef PME_SSE
 +#define PME_SPREAD_SSE_ALIGNED
 +#define PME_ORDER 5
 +#include "pme_sse_single.h"
 +#else
 +                DO_BSPLINE(5);
 +#endif
 +                break;
 +            default:
 +                DO_BSPLINE(order);
 +                break;
 +            }
 +        }
 +    }
 +}
 +
 +/* Pad the z-size of a PME grid for the SSE kernels.
 + *
 + * With PME_SSE defined, *pmegrid_nz is rounded up to a multiple of 4 for
 + * pme_order 5 and, unless PME_SSE_UNALIGNED is defined, also for
 + * pme_order 4. In all other cases *pmegrid_nz is left unchanged.
 + */
 +static void set_grid_alignment(int *pmegrid_nz,int pme_order)
 +{
 +#ifdef PME_SSE
 +    int bRoundUp;
 +
 +    bRoundUp = (pme_order == 5);
 +#ifndef PME_SSE_UNALIGNED
 +    if (pme_order == 4)
 +    {
 +        bRoundUp = 1;
 +    }
 +#endif
 +
 +    if (bRoundUp)
 +    {
 +        /* Adding 3 and clearing the two lowest bits rounds up to a multiple of 4 */
 +        int nz_padded = *pmegrid_nz + 3;
 +
 +        *pmegrid_nz = nz_padded & ~3;
 +    }
 +#endif
 +}
 +
 +/* Pad the allocation size of a PME grid for the aligned SSE kernels.
 + *
 + * Only active with PME_SSE defined and PME_SSE_UNALIGNED not defined:
 + * for pme_order 4, four extra elements are added to *gridsize so that
 + * aligned operations do not go beyond the allocated grid size.
 + * For pme_order 5 the z-size alignment of the grid already ensures
 + * that the grid size is not exceeded.
 + */
 +static void set_gridsize_alignment(int *gridsize,int pme_order)
 +{
 +#if defined PME_SSE && !defined PME_SSE_UNALIGNED
 +    if (pme_order == 4)
 +    {
 +        *gridsize = *gridsize + 4;
 +    }
 +#endif
 +}
 +
 +/* Initialize one PME (sub)grid.
 + *
 + * cx,cy,cz       cell index of this grid, stored in grid->ci
 + * x0,y0,z0       offset of the grid, stored in grid->offset
 + * x1,y1,z1       end of the range covered; the size per dimension becomes
 + *                (end - start) + pme_order - 1
 + * set_alignment  when TRUE the z-size is padded by set_grid_alignment();
 + *                when FALSE the z-size must already be aligned, otherwise
 + *                gmx_incons() is called
 + * pme_order      interpolation order, stored in grid->order
 + * ptr            storage to use for the grid; when NULL an aligned array
 + *                is allocated here
 + */
 +static void pmegrid_init(pmegrid_t *grid,
 +                         int cx, int cy, int cz,
 +                         int x0, int y0, int z0,
 +                         int x1, int y1, int z1,
 +                         gmx_bool set_alignment,
 +                         int pme_order,
 +                         real *ptr)
 +{
 +    int nz_aligned,nelem;
 +
 +    grid->order = pme_order;
 +
 +    grid->ci[XX]     = cx;
 +    grid->offset[XX] = x0;
 +    grid->n[XX]      = x1 - x0 + pme_order - 1;
 +
 +    grid->ci[YY]     = cy;
 +    grid->offset[YY] = y0;
 +    grid->n[YY]      = y1 - y0 + pme_order - 1;
 +
 +    grid->ci[ZZ]     = cz;
 +    grid->offset[ZZ] = z0;
 +    grid->n[ZZ]      = z1 - z0 + pme_order - 1;
 +
 +    /* Determine what the aligned z-size would be */
 +    nz_aligned = grid->n[ZZ];
 +    set_grid_alignment(&nz_aligned,pme_order);
 +
 +    if (!set_alignment)
 +    {
 +        /* The caller promised an aligned size: verify it */
 +        if (nz_aligned != grid->n[ZZ])
 +        {
 +            gmx_incons("pmegrid_init call with an unaligned z size");
 +        }
 +    }
 +    else
 +    {
 +        grid->n[ZZ] = nz_aligned;
 +    }
 +
 +    if (ptr != NULL)
 +    {
 +        /* Use the storage provided by the caller */
 +        grid->grid = ptr;
 +    }
 +    else
 +    {
 +        nelem = grid->n[XX]*grid->n[YY]*grid->n[ZZ];
 +        set_gridsize_alignment(&nelem,pme_order);
 +        snew_aligned(grid->grid,nelem,16);
 +    }
 +}
 +
 +/* Integer division of enumerator by denominator, rounded up
 + * (for non-negative enumerator and positive denominator).
 + */
 +static int div_round_up(int enumerator,int denominator)
 +{
 +    int biased;
 +
 +    /* Adding denominator-1 makes the truncating division round up */
 +    biased = enumerator + denominator - 1;
 +
 +    return biased/denominator;
 +}
 +
 +/* Choose how to divide a grid of n[] points over nthread threads.
 + *
 + * n        number of grid points per dimension
 + * ovl      number of overlap lines added to each subgrid per dimension
 + * nthread  total number of threads; the result satisfies
 + *          nsub[XX]*nsub[YY]*nsub[ZZ] == nthread
 + * nsub     output: number of subgrids along each dimension
 + *
 + * All factorizations nsx*nsy*nsz == nthread are tried and the one with the
 + * fewest grid points per thread (including overlap) is selected; ties are
 + * broken in favor of fewer cuts along z and then along y.
 + *
 + * The environment variable GMX_PME_THREAD_DIVISION ("nx ny nz") overrides
 + * the automatic choice. Values are taken in order, so a string with fewer
 + * than three integers only replaces the leading entries. A division with a
 + * non-positive count or with a product different from nthread is a fatal
 + * error.
 + */
 +static void make_subgrid_division(const ivec n,int ovl,int nthread,
 +                                  ivec nsub)
 +{
 +    int gsize_opt,gsize;
 +    int nsx,nsy,nsz;
 +    char *env;
 +
 +    /* -1 marks that no candidate division has been stored yet */
 +    gsize_opt = -1;
 +    for(nsx=1; nsx<=nthread; nsx++)
 +    {
 +        if (nthread % nsx == 0)
 +        {
 +            for(nsy=1; nsy<=nthread; nsy++)
 +            {
 +                if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0)
 +                {
 +                    nsz = nthread/(nsx*nsy);
 +
 +                    /* Determine the number of grid points per thread */
 +                    gsize =
 +                        (div_round_up(n[XX],nsx) + ovl)*
 +                        (div_round_up(n[YY],nsy) + ovl)*
 +                        (div_round_up(n[ZZ],nsz) + ovl);
 +
 +                    /* Minimize the number of grids points per thread
 +                     * and, secondarily, the number of cuts in minor dimensions.
 +                     */
 +                    if (gsize_opt == -1 ||
 +                        gsize < gsize_opt ||
 +                        (gsize == gsize_opt &&
 +                         (nsz < nsub[ZZ] || (nsz == nsub[ZZ] && nsy < nsub[YY]))))
 +                    {
 +                        nsub[XX] = nsx;
 +                        nsub[YY] = nsy;
 +                        nsub[ZZ] = nsz;
 +                        gsize_opt = gsize;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    env = getenv("GMX_PME_THREAD_DIVISION");
 +    if (env != NULL)
 +    {
 +        sscanf(env,"%d %d %d",&nsub[XX],&nsub[YY],&nsub[ZZ]);
 +    }
 +
 +    /* The product test alone would accept e.g. "-2 -2 1" for 4 threads,
 +     * so each count is also required to be at least 1.
 +     */
 +    if (nsub[XX] < 1 || nsub[YY] < 1 || nsub[ZZ] < 1 ||
 +        nsub[XX]*nsub[YY]*nsub[ZZ] != nthread)
 +    {
 +        gmx_fatal(FARGS,"PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)",nsub[XX],nsub[YY],nsub[ZZ],nthread);
 +    }
 +}
 +
 +/* Set up the full local PME grid, the per-thread subgrids and the tables
 + * that map grid lines to threads.
 + *
 + * nx,ny,nz    allocated size of the local grid, including pme_order-1
 + *             overlap lines per dimension
 + * nz_base     z-size used instead of nz-(pme_order-1) when choosing the
 + *             thread division
 + * pme_order   interpolation order
 + * nthread     number of threads; subgrids are only created for nthread > 1
 + * overlap_x,
 + * overlap_y   number of lines used as communication range in x and y when
 + *             computing nthread_comm
 + */
 +static void pmegrids_init(pmegrids_t *grids,
 +                          int nx,int ny,int nz,int nz_base,
 +                          int pme_order,
 +                          int nthread,
 +                          int overlap_x,
 +                          int overlap_y)
 +{
 +    /* NOTE(review): g0 and g1 are not used in the visible code */
 +    ivec n,n_base,g0,g1;
 +    int t,x,y,z,d,i,tfac;
 +    int max_comm_lines;
 +
 +    /* Grid size without the overlap lines */
 +    n[XX] = nx - (pme_order - 1);
 +    n[YY] = ny - (pme_order - 1);
 +    n[ZZ] = nz - (pme_order - 1);
 +
 +    copy_ivec(n,n_base);
 +    n_base[ZZ] = nz_base;
 +
 +    /* The full grid: allocated here (ptr is NULL), and since set_alignment
 +     * is FALSE the z-size has to be aligned already.
 +     */
 +    pmegrid_init(&grids->grid,0,0,0,0,0,0,n[XX],n[YY],n[ZZ],FALSE,pme_order,
 +                 NULL);
 +
 +    grids->nthread = nthread;
 +
 +    make_subgrid_division(n_base,pme_order-1,grids->nthread,grids->nc);
 +
 +    if (grids->nthread > 1)
 +    {
 +        ivec nst;
 +        int gridsize;
 +        real *grid_all;
 +
 +        /* Maximum size of a thread grid in each dimension */
 +        for(d=0; d<DIM; d++)
 +        {
 +            nst[d] = div_round_up(n[d],grids->nc[d]) + pme_order - 1;
 +        }
 +        set_grid_alignment(&nst[ZZ],pme_order);
 +
 +        if (debug)
 +        {
 +            fprintf(debug,"pmegrid thread local division: %d x %d x %d\n",
 +                    grids->nc[XX],grids->nc[YY],grids->nc[ZZ]);
 +            fprintf(debug,"pmegrid %d %d %d max thread pmegrid %d %d %d\n",
 +                    nx,ny,nz,
 +                    nst[XX],nst[YY],nst[ZZ]);
 +        }
 +
 +        snew(grids->grid_th,grids->nthread);
 +        t = 0;
 +        gridsize = nst[XX]*nst[YY]*nst[ZZ];
 +        set_gridsize_alignment(&gridsize,pme_order);
 +        /* One allocation holds all thread grids, with GMX_CACHE_SEP
 +         * elements before, between and after them. The base pointer
 +         * grid_all is not stored; thread 0's grid starts at
 +         * grid_all+GMX_CACHE_SEP.
 +         */
 +        snew_aligned(grid_all,
 +                     grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
 +                     16);
 +
 +        /* Thread index t runs with z fastest, then y, then x */
 +        for(x=0; x<grids->nc[XX]; x++)
 +        {
 +            for(y=0; y<grids->nc[YY]; y++)
 +            {
 +                for(z=0; z<grids->nc[ZZ]; z++)
 +                {
 +                    pmegrid_init(&grids->grid_th[t],
 +                                 x,y,z,
 +                                 (n[XX]*(x  ))/grids->nc[XX],
 +                                 (n[YY]*(y  ))/grids->nc[YY],
 +                                 (n[ZZ]*(z  ))/grids->nc[ZZ],
 +                                 (n[XX]*(x+1))/grids->nc[XX],
 +                                 (n[YY]*(y+1))/grids->nc[YY],
 +                                 (n[ZZ]*(z+1))/grids->nc[ZZ],
 +                                 TRUE,
 +                                 pme_order,
 +                                 grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP));
 +                    t++;
 +                }
 +            }
 +        }
 +    }
 +
 +    /* g2t[d][i] is the contribution of grid line i along dimension d to the
 +     * thread index; tfac is the product of nc[] over the dimensions after d,
 +     * matching the z-fastest thread ordering used above.
 +     */
 +    snew(grids->g2t,DIM);
 +    tfac = 1;
 +    for(d=DIM-1; d>=0; d--)
 +    {
 +        snew(grids->g2t[d],n[d]);
 +        t = 0;
 +        for(i=0; i<n[d]; i++)
 +        {
 +            /* The second check should match the parameters
 +             * of the pmegrid_init call above.
 +             */
 +            while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d])
 +            {
 +                t++;
 +            }
 +            grids->g2t[d][i] = t*tfac;
 +        }
 +
 +        tfac *= grids->nc[d];
 +
 +        /* d only takes the values XX, YY and ZZ in this loop,
 +         * so max_comm_lines is always assigned.
 +         */
 +        switch (d)
 +        {
 +        case XX: max_comm_lines = overlap_x;     break;
 +        case YY: max_comm_lines = overlap_y;     break;
 +        case ZZ: max_comm_lines = pme_order - 1; break;
 +        }
 +        /* Smallest number of thread cells whose combined extent
 +         * reaches max_comm_lines grid lines.
 +         */
 +        grids->nthread_comm[d] = 0;
 +        while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines)
 +        {
 +            grids->nthread_comm[d]++;
 +        }
 +        if (debug != NULL)
 +        {
 +            fprintf(debug,"pmegrid thread grid communication range in %c: %d\n",
 +                    'x'+d,grids->nthread_comm[d]);
 +        }
 +        /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
 +         * work, but this is not a problematic restriction.
 +         */
 +        if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
 +        {
 +            gmx_fatal(FARGS,"Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME",grids->nthread);
 +        }
 +    }
 +}
 +
 +
 +/* Release the grid memory allocated by pmegrids_init().
 + *
 + * Nothing is done when the full grid has not been allocated.
 + *
 + * The full grid is allocated with snew_aligned() and therefore has to be
 + * released with sfree_aligned(). The thread grids are only allocated for
 + * nthread > 1 and all live in one snew_aligned() block; the individual
 + * grid_th[t].grid pointers point into that block and must not be freed
 + * themselves. The block starts GMX_CACHE_SEP elements before the grid of
 + * thread 0, as laid out in pmegrids_init().
 + *
 + * NOTE(review): the g2t tables allocated in pmegrids_init() are not
 + * released here.
 + */
 +static void pmegrids_destroy(pmegrids_t *grids)
 +{
 +    if (grids->grid.grid != NULL)
 +    {
 +        sfree_aligned(grids->grid.grid);
 +
 +        if (grids->nthread > 1)
 +        {
 +            /* Recover the base of the shared allocation from thread 0 */
 +            sfree_aligned(grids->grid_th[0].grid - GMX_CACHE_SEP);
 +            sfree(grids->grid_th);
 +        }
 +    }
 +}
 +
 +
 +/* Make sure the work arrays can hold at least nkx elements.
 + *
 + * Nothing happens when the current allocation is large enough.
 + * Otherwise mhx, mhy, mhz, m2 and m2inv are resized with srenew(), while
 + * denom, tmp1 and eterm are freed and allocated anew with 16-byte
 + * alignment and 3 extra elements at the end, since the SSE code operates
 + * on 4 elements at a time.
 + */
 +static void realloc_work(pme_work_t *work,int nkx)
 +{
 +    if (nkx <= work->nalloc)
 +    {
 +        return;
 +    }
 +
 +    work->nalloc = nkx;
 +
 +    /* Plain arrays */
 +    srenew(work->mhx  ,work->nalloc);
 +    srenew(work->mhy  ,work->nalloc);
 +    srenew(work->mhz  ,work->nalloc);
 +    srenew(work->m2   ,work->nalloc);
 +    srenew(work->m2inv,work->nalloc);
 +
 +    /* Aligned arrays for SSE: release the old ones, then allocate anew */
 +    sfree_aligned(work->denom);
 +    snew_aligned(work->denom,work->nalloc+3,16);
 +
 +    sfree_aligned(work->tmp1);
 +    snew_aligned(work->tmp1 ,work->nalloc+3,16);
 +
 +    sfree_aligned(work->eterm);
 +    snew_aligned(work->eterm,work->nalloc+3,16);
 +}
 +
 +
 +/* Release all arrays of a work structure.
 + * The arrays that realloc_work() allocates with snew_aligned()
 + * (denom, tmp1, eterm) are released with sfree_aligned(), the others
 + * with sfree().
 + */
 +static void free_work(pme_work_t *work)
 +{
 +    /* Arrays from the plain allocator */
 +    sfree(work->mhx);
 +    sfree(work->mhy);
 +    sfree(work->mhz);
 +    sfree(work->m2);
 +    sfree(work->m2inv);
 +
 +    /* Arrays from the aligned allocator */
 +    sfree_aligned(work->denom);
 +    sfree_aligned(work->tmp1);
 +    sfree_aligned(work->eterm);
 +}
 +
 +
 +/* calc_exponentials computes e[kx] = f*exp(r[kx])/d[kx].
 + * Two variants exist, selected by PME_SSE; note that they differ in the
 + * index range they process and in whether the input arrays are modified.
 + */
 +#ifdef PME_SSE
 +    /* Calculate exponentials through SSE in float precision */
 +/* SSE variant.
 + * Processes 4 elements at a time starting at index 0 (the start argument
 + * is not used) up to the first multiple of 4 that is >= end, using aligned
 + * loads and stores. The arrays therefore have to be 16-byte aligned and
 + * padded to a multiple of 4 elements. The inputs d_aligned and r_aligned
 + * are not modified.
 + */
 +inline static void calc_exponentials(int start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
 +{
 +    {
 +        const __m128 two = _mm_set_ps(2.0f,2.0f,2.0f,2.0f);
 +        __m128 f_sse;
 +        __m128 lu;
 +        __m128 tmp_d1,d_inv,tmp_r,tmp_e;
 +        int kx;
 +        /* Broadcast the prefactor to all 4 lanes */
 +        f_sse = _mm_load1_ps(&f);
 +        for(kx=0; kx<end; kx+=4)
 +        {
 +            tmp_d1   = _mm_load_ps(d_aligned+kx);
 +            /* Reciprocal estimate, refined by one Newton-Raphson
 +             * iteration: lu*(2 - lu*d)
 +             */
 +            lu       = _mm_rcp_ps(tmp_d1);
 +            d_inv    = _mm_mul_ps(lu,_mm_sub_ps(two,_mm_mul_ps(lu,tmp_d1)));
 +            tmp_r    = _mm_load_ps(r_aligned+kx);
 +            tmp_r    = gmx_mm_exp_ps(tmp_r);
 +            tmp_e    = _mm_mul_ps(f_sse,d_inv);
 +            tmp_e    = _mm_mul_ps(tmp_e,tmp_r);
 +            _mm_store_ps(e_aligned+kx,tmp_e);
 +        }
 +    }
 +}
 +#else
 +/* Plain C variant.
 + * Processes the index range [start,end). Unlike the SSE variant, the
 + * inputs are overwritten: d with 1/d and r with exp(r).
 + */
 +inline static void calc_exponentials(int start, int end, real f, real *d, real *r, real *e)
 +{
 +    int kx;
 +    /* Three separate simple loops instead of one combined loop */
 +    for(kx=start; kx<end; kx++)
 +    {
 +        d[kx] = 1.0/d[kx];
 +    }
 +    for(kx=start; kx<end; kx++)
 +    {
 +        r[kx] = exp(r[kx]);
 +    }
 +    for(kx=start; kx<end; kx++)
 +    {
 +        e[kx] = f*r[kx]*d[kx];
 +    }
 +}
 +#endif
 +
 +
 +static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
 +                         real ewaldcoeff,real vol,
 +                         gmx_bool bEnerVir,
 +                         int nthread,int thread)
 +{
 +    /* do recip sum over local cells in grid */
 +    /* y major, z middle, x minor or continuous */
 +    t_complex *p0;
 +    int     kx,ky,kz,maxkx,maxky,maxkz;
 +    int     nx,ny,nz,iyz0,iyz1,iyz,iy,iz,kxstart,kxend;
 +    real    mx,my,mz;
 +    real    factor=M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
 +    real    ets2,struct2,vfactor,ets2vf;
 +    real    d1,d2,energy=0;
 +    real    by,bz;
 +    real    virxx=0,virxy=0,virxz=0,viryy=0,viryz=0,virzz=0;
 +    real    rxx,ryx,ryy,rzx,rzy,rzz;
 +    pme_work_t *work;
 +    real    *mhx,*mhy,*mhz,*m2,*denom,*tmp1,*eterm,*m2inv;
 +    real    mhxk,mhyk,mhzk,m2k;
 +    real    corner_fac;
 +    ivec    complex_order;
 +    ivec    local_ndata,local_offset,local_size;
 +    real    elfac;
 +
 +    elfac = ONE_4PI_EPS0/pme->epsilon_r;
 +
 +    nx = pme->nkx;
 +    ny = pme->nky;
 +    nz = pme->nkz;
 +
 +    /* Dimensions should be identical for A/B grid, so we just use A here */
 +    gmx_parallel_3dfft_complex_limits(pme->pfft_setupA,
 +                                      complex_order,
 +                                      local_ndata,
 +                                      local_offset,
 +                                      local_size);
 +
 +    rxx = pme->recipbox[XX][XX];
 +    ryx = pme->recipbox[YY][XX];
 +    ryy = pme->recipbox[YY][YY];
 +    rzx = pme->recipbox[ZZ][XX];
 +    rzy = pme->recipbox[ZZ][YY];
 +    rzz = pme->recipbox[ZZ][ZZ];
 +
 +    maxkx = (nx+1)/2;
 +    maxky = (ny+1)/2;
 +    maxkz = nz/2+1;
 +
 +    work = &pme->work[thread];
 +    mhx   = work->mhx;
 +    mhy   = work->mhy;
 +    mhz   = work->mhz;
 +    m2    = work->m2;
 +    denom = work->denom;
 +    tmp1  = work->tmp1;
 +    eterm = work->eterm;
 +    m2inv = work->m2inv;
 +
 +    iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread   /nthread;
 +    iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
 +
 +    for(iyz=iyz0; iyz<iyz1; iyz++)
 +    {
 +        iy = iyz/local_ndata[ZZ];
 +        iz = iyz - iy*local_ndata[ZZ];
 +
 +        ky = iy + local_offset[YY];
 +
 +        if (ky < maxky)
 +        {
 +            my = ky;
 +        }
 +        else
 +        {
 +            my = (ky - ny);
 +        }
 +
 +        by = M_PI*vol*pme->bsp_mod[YY][ky];
 +
 +        kz = iz + local_offset[ZZ];
 +
 +        mz = kz;
 +
 +        bz = pme->bsp_mod[ZZ][kz];
 +
 +        /* 0.5 correction for corner points */
 +        corner_fac = 1;
 +        if (kz == 0 || kz == (nz+1)/2)
 +        {
 +            corner_fac = 0.5;
 +        }
 +
 +        p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
 +
 +        /* We should skip the k-space point (0,0,0) */
 +        if (local_offset[XX] > 0 || ky > 0 || kz > 0)
 +        {
 +            kxstart = local_offset[XX];
 +        }
 +        else
 +        {
 +            kxstart = local_offset[XX] + 1;
 +            p0++;
 +        }
 +        kxend = local_offset[XX] + local_ndata[XX];
 +
 +        if (bEnerVir)
 +        {
 +            /* More expensive inner loop, especially because of the storage
 +             * of the mh elements in array's.
 +             * Because x is the minor grid index, all mh elements
 +             * depend on kx for triclinic unit cells.
 +             */
 +
 +                /* Two explicit loops to avoid a conditional inside the loop */
 +            for(kx=kxstart; kx<maxkx; kx++)
 +            {
 +                mx = kx;
 +
 +                mhxk      = mx * rxx;
 +                mhyk      = mx * ryx + my * ryy;
 +                mhzk      = mx * rzx + my * rzy + mz * rzz;
 +                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
 +                mhx[kx]   = mhxk;
 +                mhy[kx]   = mhyk;
 +                mhz[kx]   = mhzk;
 +                m2[kx]    = m2k;
 +                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
 +                tmp1[kx]  = -factor*m2k;
 +            }
 +
 +            for(kx=maxkx; kx<kxend; kx++)
 +            {
 +                mx = (kx - nx);
 +
 +                mhxk      = mx * rxx;
 +                mhyk      = mx * ryx + my * ryy;
 +                mhzk      = mx * rzx + my * rzy + mz * rzz;
 +                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
 +                mhx[kx]   = mhxk;
 +                mhy[kx]   = mhyk;
 +                mhz[kx]   = mhzk;
 +                m2[kx]    = m2k;
 +                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
 +                tmp1[kx]  = -factor*m2k;
 +            }
 +
 +            for(kx=kxstart; kx<kxend; kx++)
 +            {
 +                m2inv[kx] = 1.0/m2[kx];
 +            }
 +
 +            calc_exponentials(kxstart,kxend,elfac,denom,tmp1,eterm);
 +
 +            for(kx=kxstart; kx<kxend; kx++,p0++)
 +            {
 +                d1      = p0->re;
 +                d2      = p0->im;
 +
 +                p0->re  = d1*eterm[kx];
 +                p0->im  = d2*eterm[kx];
 +
 +                struct2 = 2.0*(d1*d1+d2*d2);
 +
 +                tmp1[kx] = eterm[kx]*struct2;
 +            }
 +
 +            for(kx=kxstart; kx<kxend; kx++)
 +            {
 +                ets2     = corner_fac*tmp1[kx];
 +                vfactor  = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
 +                energy  += ets2;
 +
 +                ets2vf   = ets2*vfactor;
 +                virxx   += ets2vf*mhx[kx]*mhx[kx] - ets2;
 +                virxy   += ets2vf*mhx[kx]*mhy[kx];
 +                virxz   += ets2vf*mhx[kx]*mhz[kx];
 +                viryy   += ets2vf*mhy[kx]*mhy[kx] - ets2;
 +                viryz   += ets2vf*mhy[kx]*mhz[kx];
 +                virzz   += ets2vf*mhz[kx]*mhz[kx] - ets2;
 +            }
 +        }
 +        else
 +        {
 +            /* We don't need to calculate the energy and the virial.
 +             * In this case the triclinic overhead is small.
 +             */
 +
 +            /* Two explicit loops to avoid a conditional inside the loop */
 +
 +            for(kx=kxstart; kx<maxkx; kx++)
 +            {
 +                mx = kx;
 +
 +                mhxk      = mx * rxx;
 +                mhyk      = mx * ryx + my * ryy;
 +                mhzk      = mx * rzx + my * rzy + mz * rzz;
 +                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
 +                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
 +                tmp1[kx]  = -factor*m2k;
 +            }
 +
 +            for(kx=maxkx; kx<kxend; kx++)
 +            {
 +                mx = (kx - nx);
 +
 +                mhxk      = mx * rxx;
 +                mhyk      = mx * ryx + my * ryy;
 +                mhzk      = mx * rzx + my * rzy + mz * rzz;
 +                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
 +                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
 +                tmp1[kx]  = -factor*m2k;
 +            }
 +
 +            calc_exponentials(kxstart,kxend,elfac,denom,tmp1,eterm);
 +
 +            for(kx=kxstart; kx<kxend; kx++,p0++)
 +            {
 +                d1      = p0->re;
 +                d2      = p0->im;
 +
 +                p0->re  = d1*eterm[kx];
 +                p0->im  = d2*eterm[kx];
 +            }
 +        }
 +    }
 +
 +    if (bEnerVir)
 +    {
 +        /* Update virial with local values.
 +         * The virial is symmetric by definition.
 +         * this virial seems ok for isotropic scaling, but I'm
 +         * experiencing problems on semiisotropic membranes.
 +         * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
 +         */
 +        work->vir[XX][XX] = 0.25*virxx;
 +        work->vir[YY][YY] = 0.25*viryy;
 +        work->vir[ZZ][ZZ] = 0.25*virzz;
 +        work->vir[XX][YY] = work->vir[YY][XX] = 0.25*virxy;
 +        work->vir[XX][ZZ] = work->vir[ZZ][XX] = 0.25*virxz;
 +        work->vir[YY][ZZ] = work->vir[ZZ][YY] = 0.25*viryz;
 +
 +        /* This energy should be corrected for a charged system */
 +        work->energy = 0.5*energy;
 +    }
 +
 +    /* Return the loop count */
 +    return local_ndata[YY]*local_ndata[XX];
 +}
 +
 +/* Combine the per-thread PME output stored in pme->work[0..nthread-1].
 + * The summed energy is written to *mesh_energy and the summed virial
 + * to vir. The per-thread data must be complete, so this function
 + * should only be called after thread synchronization.
 + */
 +static void get_pme_ener_vir(const gmx_pme_t pme,int nthread,
 +                             real *mesh_energy,matrix vir)
 +{
 +    int t;
 +
 +    /* Start from the output of the first thread ... */
 +    *mesh_energy = pme->work[0].energy;
 +    copy_mat(pme->work[0].vir,vir);
 +
 +    /* ... and accumulate the output of the remaining threads */
 +    t = 1;
 +    while (t < nthread)
 +    {
 +        *mesh_energy += pme->work[t].energy;
 +        m_add(vir,pme->work[t].vir,vir);
 +        t++;
 +    }
 +}
 +
 +/* Accumulate the force components fx, fy and fz for one atom by looping
 + * over order*order*order grid points, starting at grid index (i0,j0,k0).
 + *
 + * The macro expands in place and relies on these names being declared
 + * in the enclosing scope:
 + *   ithx, ithy, ithz        loop counters
 + *   index_x, index_xy       linear grid offsets (row strides pny*pnz and pnz)
 + *   tx, ty, dx, dy, gval,
 + *   fxy1, fz1               temporaries
 + *   thx, thy, thz           spline weights for this atom
 + *   dthx, dthy, dthz        the matching dtheta entries for this atom
 + *   grid, pny, pnz          the grid and its y/z dimensions
 + *   fx, fy, fz              accumulators, NOT reset by the macro
 + *
 + * Passing a literal for order gives all three loops a constant trip count.
 + */
 +#define DO_FSPLINE(order)                      \
 +for(ithx=0; (ithx<order); ithx++)              \
 +{                                              \
 +    index_x = (i0+ithx)*pny*pnz;               \
 +    tx      = thx[ithx];                       \
 +    dx      = dthx[ithx];                      \
 +                                               \
 +    for(ithy=0; (ithy<order); ithy++)          \
 +    {                                          \
 +        index_xy = index_x+(j0+ithy)*pnz;      \
 +        ty       = thy[ithy];                  \
 +        dy       = dthy[ithy];                 \
 +        fxy1     = fz1 = 0;                    \
 +                                               \
 +        for(ithz=0; (ithz<order); ithz++)      \
 +        {                                      \
 +            gval  = grid[index_xy+(k0+ithz)];  \
 +            fxy1 += thz[ithz]*gval;            \
 +            fz1  += dthz[ithz]*gval;           \
 +        }                                      \
 +        fx += dx*ty*fxy1;                      \
 +        fy += tx*dy*fxy1;                      \
 +        fz += tx*ty*fz1;                       \
 +    }                                          \
 +}
 +
 +
 +/* Interpolate forces from grid onto the atoms listed in spline->ind.
 + *
 + * For every listed atom n with a non-zero scaled charge scale*atc->q[n],
 + * the contributions of order^3 grid points are gathered into fx, fy, fz,
 + * and the result is added to atc->f[n]. In that final step the gathered
 + * values are multiplied by the grid sizes nkx/nky/nkz and combined with
 + * the xx, yx, yy, zx, zy and zz elements of pme->recipbox.
 + *
 + * When bClearF is set, atc->f[n] is zeroed first for every listed atom,
 + * including atoms with zero charge; otherwise the force is accumulated
 + * onto the existing value.
 + *
 + * Orders 4 and 5 have dedicated code paths: with PME_SSE defined they use
 + * the code in pme_sse_single.h, which is included in the middle of the
 + * function body and therefore sees the local variables declared here;
 + * otherwise DO_FSPLINE is expanded with a literal order.
 + */
 +static void gather_f_bsplines(gmx_pme_t pme,real *grid,
 +                              gmx_bool bClearF,pme_atomcomm_t *atc,
 +                              splinedata_t *spline,
 +                              real scale)
 +{
 +    /* sum forces for local particles */
 +    int     nn,n,ithx,ithy,ithz,i0,j0,k0;
 +    int     index_x,index_xy;
 +    int     nx,ny,nz,pnx,pny,pnz;
 +    int *   idxptr;
 +    real    tx,ty,dx,dy,qn;
 +    real    fx,fy,fz,gval;
 +    real    fxy1,fz1;
 +    real    *thx,*thy,*thz,*dthx,*dthy,*dthz;
 +    int     norder;
 +    real    rxx,ryx,ryy,rzx,rzy,rzz;
 +    int     order;
 +
 +    pme_spline_work_t *work;
 +
 +    /* NOTE(review): work is not referenced in the code visible here;
 +     * presumably it is used by the code included from pme_sse_single.h.
 +     */
 +    work = &pme->spline_work;
 +
 +    order = pme->pme_order;
 +    /* These spline pointers are re-assigned per atom inside the loop below */
 +    thx   = spline->theta[XX];
 +    thy   = spline->theta[YY];
 +    thz   = spline->theta[ZZ];
 +    dthx  = spline->dtheta[XX];
 +    dthy  = spline->dtheta[YY];
 +    dthz  = spline->dtheta[ZZ];
 +    nx    = pme->nkx;
 +    ny    = pme->nky;
 +    nz    = pme->nkz;
 +    pnx   = pme->pmegrid_nx;
 +    pny   = pme->pmegrid_ny;
 +    pnz   = pme->pmegrid_nz;
 +
 +    rxx   = pme->recipbox[XX][XX];
 +    ryx   = pme->recipbox[YY][XX];
 +    ryy   = pme->recipbox[YY][YY];
 +    rzx   = pme->recipbox[ZZ][XX];
 +    rzy   = pme->recipbox[ZZ][YY];
 +    rzz   = pme->recipbox[ZZ][ZZ];
 +
 +    for(nn=0; nn<spline->n; nn++)
 +    {
 +        /* nn indexes the spline data, n is the atom index in atc */
 +        n  = spline->ind[nn];
 +        qn = scale*atc->q[n];
 +
 +        if (bClearF)
 +        {
 +            atc->f[n][XX] = 0;
 +            atc->f[n][YY] = 0;
 +            atc->f[n][ZZ] = 0;
 +        }
 +        if (qn != 0)
 +        {
 +            fx     = 0;
 +            fy     = 0;
 +            fz     = 0;
 +            idxptr = atc->idx[n];
 +            /* Offset of this atom's order coefficients in the spline arrays */
 +            norder = nn*order;
 +
 +            i0   = idxptr[XX];
 +            j0   = idxptr[YY];
 +            k0   = idxptr[ZZ];
 +
 +            /* Pointer arithmetic alert, next six statements */
 +            thx  = spline->theta[XX] + norder;
 +            thy  = spline->theta[YY] + norder;
 +            thz  = spline->theta[ZZ] + norder;
 +            dthx = spline->dtheta[XX] + norder;
 +            dthy = spline->dtheta[YY] + norder;
 +            dthz = spline->dtheta[ZZ] + norder;
 +
 +            /* Gather fx, fy, fz; orders 4 and 5 get specialized code */
 +            switch (order) {
 +            case 4:
 +#ifdef PME_SSE
 +#ifdef PME_SSE_UNALIGNED
 +#define PME_GATHER_F_SSE_ORDER4
 +#else
 +#define PME_GATHER_F_SSE_ALIGNED
 +#define PME_ORDER 4
 +#endif
 +#include "pme_sse_single.h"
 +#else
 +                DO_FSPLINE(4);
 +#endif
 +                break;
 +            case 5:
 +#ifdef PME_SSE
 +#define PME_GATHER_F_SSE_ALIGNED
 +#define PME_ORDER 5
 +#include "pme_sse_single.h"
 +#else
 +                DO_FSPLINE(5);
 +#endif
 +                break;
 +            default:
 +                DO_FSPLINE(order);
 +                break;
 +            }
 +
 +            /* Scale by the grid sizes and combine with the recipbox elements */
 +            atc->f[n][XX] += -qn*( fx*nx*rxx );
 +            atc->f[n][YY] += -qn*( fx*nx*ryx + fy*ny*ryy );
 +            atc->f[n][ZZ] += -qn*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
 +        }
 +    }
 +    /* Since the energy and not forces are interpolated
 +     * the net force might not be exactly zero.
 +     * This can be solved by also interpolating F, but
 +     * that comes at a cost.
 +     * A better hack is to remove the net force every
 +     * step, but that must be done at a higher level
 +     * since this routine doesn't see all atoms if running
 +     * in parallel. Don't know how important it is?  EL 990726
 +     */
 +}
 +
 +
 +/* Interpolate grid at the position of every atom in atc that has a
 + * non-zero charge, using the spline weights in atc->spline[0], and
 + * return the sum over those atoms of charge times interpolated value.
 + */
 +static real gather_energy_bsplines(gmx_pme_t pme,real *grid,
 +                                   pme_atomcomm_t *atc)
 +{
 +    splinedata_t *spline;
 +    int     n,ix,iy,iz;
 +    int     i0,j0,k0;
 +    int     offset_x,offset_xy;
 +    int     first;
 +    int     order;
 +    int *   cell;
 +    real    energy,pot,tx,ty,qn,gval;
 +    real    *thx,*thy,*thz;
 +
 +    spline = &atc->spline[0];
 +    order  = pme->pme_order;
 +
 +    energy = 0;
 +    for(n=0; n<atc->n; n++)
 +    {
 +        qn = atc->q[n];
 +        if (qn == 0)
 +        {
 +            /* Uncharged atoms do not contribute */
 +            continue;
 +        }
 +
 +        /* Grid index of the first point this atom interacts with */
 +        cell = atc->idx[n];
 +        i0   = cell[XX];
 +        j0   = cell[YY];
 +        k0   = cell[ZZ];
 +
 +        /* Start of the order spline weights of atom n, per dimension */
 +        first = n*order;
 +        thx   = spline->theta[XX] + first;
 +        thy   = spline->theta[YY] + first;
 +        thz   = spline->theta[ZZ] + first;
 +
 +        pot = 0;
 +        for(ix=0; ix<order; ix++)
 +        {
 +            offset_x = (i0+ix)*pme->pmegrid_ny*pme->pmegrid_nz;
 +            tx       = thx[ix];
 +
 +            for(iy=0; iy<order; iy++)
 +            {
 +                offset_xy = offset_x+(j0+iy)*pme->pmegrid_nz;
 +                ty        = thy[iy];
 +
 +                for(iz=0; iz<order; iz++)
 +                {
 +                    gval  = grid[offset_xy+(k0+iz)];
 +                    pot  += tx*ty*thz[iz]*gval;
 +                }
 +            }
 +        }
 +
 +        energy += pot*qn;
 +    }
 +
 +    return energy;
 +}
 +
 +/* Macro to force loop unrolling by fixing order.
 + * This gives a significant performance gain.
 + *
 + * For each of the DIM dimensions j the macro takes the fractional offset
 + * dr = xptr[j], builds order spline weights in data[] with the recursion
 + * below, and stores them in theta[j][i*order .. i*order+order-1].
 + * The values in ddata[], computed as differences of the weights of the
 + * next-lower order just before the final recursion step, are stored in
 + * dtheta[j] at the same positions.
 + *
 + * The macro expands in place and uses xptr, i, theta and dtheta from the
 + * enclosing scope. The scratch arrays hold PME_ORDER_MAX elements, so
 + * order must not exceed PME_ORDER_MAX.
 + */
 +#define CALC_SPLINE(order)                     \
 +{                                              \
 +    int j,k,l;                                 \
 +    real dr,div;                               \
 +    real data[PME_ORDER_MAX];                  \
 +    real ddata[PME_ORDER_MAX];                 \
 +                                               \
 +    for(j=0; (j<DIM); j++)                     \
 +    {                                          \
 +        dr  = xptr[j];                         \
 +                                               \
 +        /* dr is relative offset from lower cell limit */ \
 +        data[order-1] = 0;                     \
 +        data[1] = dr;                          \
 +        data[0] = 1 - dr;                      \
 +                                               \
 +        for(k=3; (k<order); k++)               \
 +        {                                      \
 +            div = 1.0/(k - 1.0);               \
 +            data[k-1] = div*dr*data[k-2];      \
 +            for(l=1; (l<(k-1)); l++)           \
 +            {                                  \
 +                data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
 +                                   data[k-l-1]);                \
 +            }                                  \
 +            data[0] = div*(1-dr)*data[0];      \
 +        }                                      \
 +        /* differentiate */                    \
 +        ddata[0] = -data[0];                   \
 +        for(k=1; (k<order); k++)               \
 +        {                                      \
 +            ddata[k] = data[k-1] - data[k];    \
 +        }                                      \
 +                                               \
 +        div = 1.0/(order - 1);                 \
 +        data[order-1] = div*dr*data[order-2];  \
 +        for(l=1; (l<(order-1)); l++)           \
 +        {                                      \
 +            data[order-l-1] = div*((dr+l)*data[order-l-2]+    \
 +                               (order-l-dr)*data[order-l-1]); \
 +        }                                      \
 +        data[0] = div*(1 - dr)*data[0];        \
 +                                               \
 +        for(k=0; k<order; k++)                 \
 +        {                                      \
 +            theta[j][i*order+k]  = data[k];    \
 +            dtheta[j][i*order+k] = ddata[k];   \
 +        }                                      \
 +    }                                          \
 +}
 +
 +/* Fill theta and dtheta with the spline data of nr atoms.
 + *
 + * Entry i of the output (elements i*order .. i*order+order-1 in each
 + * dimension) is computed from fractx[ind[i]]. The entry is only computed
 + * when bFreeEnergy is set or charge[ind[i]] is non-zero; skipped entries
 + * are left untouched.
 + *
 + * The work is done by CALC_SPLINE, which reads the locals i and xptr and
 + * the parameters theta and dtheta by name, so these names must not change.
 + */
 +void make_bsplines(splinevec theta,splinevec dtheta,int order,
 +                   rvec fractx[],int nr,int ind[],real charge[],
 +                   gmx_bool bFreeEnergy)
 +{
 +    /* construct splines for local atoms */
 +    int  i,ii;
 +    real *xptr;
 +
 +    for(i=0; i<nr; i++)
 +    {
 +        /* With free energy we do not use the charge check.
 +         * In most cases this will be more efficient than calling make_bsplines
 +         * twice, since usually more than half the particles have charges.
 +         */
 +        ii = ind[i];
 +        if (bFreeEnergy || charge[ii] != 0.0) {
 +            xptr = fractx[ii];
 +            /* Literal orders let CALC_SPLINE expand with constant loop bounds */
 +            switch(order) {
 +            case 4:  CALC_SPLINE(4);     break;
 +            case 5:  CALC_SPLINE(5);     break;
 +            default: CALC_SPLINE(order); break;
 +            }
 +        }
 +    }
 +}
 +
 +
 +/* Compute the squared modulus of the discrete Fourier transform of
 + * data[0..ndata-1] and store it in mod[0..ndata-1]:
 + *   mod[i] = |sum_j data[j]*exp(2*pi*I*i*j/ndata)|^2
 + *
 + * Entries smaller than 1e-7 are afterwards replaced by the average of
 + * their two neighbours. The transform is periodic in i with period ndata,
 + * so the neighbours are taken with wrap-around; this keeps the accesses
 + * for i=0 and i=ndata-1 inside the array.
 + */
 +void make_dft_mod(real *mod,real *data,int ndata)
 +{
 +  int i,j;
 +  int iprev,inext;
 +  real sc,ss,arg;
 +
 +  for(i=0;i<ndata;i++) {
 +    sc=ss=0;
 +    for(j=0;j<ndata;j++) {
 +      arg=(2.0*M_PI*i*j)/ndata;
 +      sc+=data[j]*cos(arg);
 +      ss+=data[j]*sin(arg);
 +    }
 +    mod[i]=sc*sc+ss*ss;
 +  }
 +  for(i=0;i<ndata;i++) {
 +    if(mod[i]<1e-7) {
 +      /* Periodic neighbour indices, identical to i-1 and i+1 in the interior */
 +      iprev=(i+ndata-1)%ndata;
 +      inext=(i+1)%ndata;
 +      mod[i]=(mod[iprev]+mod[inext])*0.5;
 +    }
 +  }
 +}
 +
 +
 +/* Compute the B-spline moduli for the three grid dimensions.
 + *
 + * The order spline values are generated with the same recursion as in
 + * CALC_SPLINE, evaluated with a zero fractional offset. They are placed
 + * at indices 1..order of a zero-filled array, and make_dft_mod is called
 + * on that array for each dimension with sizes nx, ny and nz, writing to
 + * bsp_mod[XX], bsp_mod[YY] and bsp_mod[ZZ].
 + *
 + * Note that index order of the work array is written, so the largest of
 + * nx, ny and nz must be larger than order.
 + */
 +static void make_bspline_moduli(splinevec bsp_mod,
 +                                int nx,int ny,int nz,int order)
 +{
 +  int nmax=max(nx,max(ny,nz));
 +  real *data,*bsp_data;
 +  int i,k,l;
 +  real div;
 +
 +  snew(data,order);
 +  snew(bsp_data,nmax);
 +
 +  data[order-1]=0;
 +  data[1]=0;
 +  data[0]=1;
 +
 +  /* Recursion up to order-1 */
 +  for(k=3;k<order;k++) {
 +    div=1.0/(k-1.0);
 +    data[k-1]=0;
 +    for(l=1;l<(k-1);l++)
 +      data[k-l-1]=div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
 +    data[0]=div*data[0];
 +  }
 +  /* The derivative values are not needed for the moduli,
 +   * so only the final recursion step for the values is done here.
 +   */
 +  div=1.0/(order-1);
 +  data[order-1]=0;
 +  for(l=1;l<(order-1);l++)
 +    data[order-l-1]=div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
 +  data[0]=div*data[0];
 +
 +  for(i=0;i<nmax;i++)
 +    bsp_data[i]=0;
 +  for(i=1;i<=order;i++)
 +    bsp_data[i]=data[i-1];
 +
 +  make_dft_mod(bsp_mod[XX],bsp_data,nx);
 +  make_dft_mod(bsp_mod[YY],bsp_data,ny);
 +  make_dft_mod(bsp_mod[ZZ],bsp_data,nz);
 +
 +  sfree(data);
 +  sfree(bsp_data);
 +}
 +
 +
 +/* Return the P3M optimal influence function, a polynomial in z*z,
 + * for interpolation orders 2 up to and including 8.
 + * For any other order 0 is returned.
 + *
 + * The formula and most constants can be found in:
 + * Ballenegger et al., JCTC 8, 936 (2012)
 + */
 +static double do_p3m_influence(double z, int order)
 +{
 +    double z2,z4;
 +    double infl;
 +
 +    z2 = z*z;
 +    z4 = z2*z2;
 +
 +    if (order == 2)
 +    {
 +        infl = 1.0 - 2.0*z2/3.0;
 +    }
 +    else if (order == 3)
 +    {
 +        infl = 1.0 - z2 + 2.0*z4/15.0;
 +    }
 +    else if (order == 4)
 +    {
 +        infl = 1.0 - 4.0*z2/3.0 + 2.0*z4/5.0 + 4.0*z2*z4/315.0;
 +    }
 +    else if (order == 5)
 +    {
 +        infl = 1.0 - 5.0*z2/3.0 + 7.0*z4/9.0 - 17.0*z2*z4/189.0 + 2.0*z4*z4/2835.0;
 +    }
 +    else if (order == 6)
 +    {
 +        infl = 1.0 - 2.0*z2 + 19.0*z4/15.0 - 256.0*z2*z4/945.0 + 62.0*z4*z4/4725.0 + 4.0*z2*z4*z4/155925.0;
 +    }
 +    else if (order == 7)
 +    {
 +        infl = 1.0 - 7.0*z2/3.0 + 28.0*z4/15.0 - 16.0*z2*z4/27.0 + 26.0*z4*z4/405.0 - 2.0*z2*z4*z4/1485.0 + 4.0*z4*z4*z4/6081075.0;
 +    }
 +    else if (order == 8)
 +    {
 +        infl = 1.0 - 8.0*z2/3.0 + 116.0*z4/45.0 - 344.0*z2*z4/315.0 + 914.0*z4*z4/4725.0 - 248.0*z4*z4*z2/22275.0 + 21844.0*z4*z4*z4/212837625.0 - 8.0*z4*z4*z4*z2/638512875.0;
 +    }
 +    else
 +    {
 +        /* Unsupported order */
 +        infl = 0.0;
 +    }
 +
 +    return infl;
 +}
 +
 +/* Return the P3M B-spline modulus for the signed, non-zero wave index i,
 + * where zarg is pi divided by the grid size.
 + */
 +static double p3m_bspline_modulus(double zarg,int i,int order)
 +{
 +    double zai,sinzai,infl;
 +
 +    zai    = zarg*i;
 +    sinzai = sin(zai);
 +    infl   = do_p3m_influence(sinzai,order);
 +
 +    return infl*infl*pow(sinzai/zai,-2.0*order);
 +}
 +
 +/* Calculate the P3M B-spline moduli for one dimension of size n.
 + * Negative wave indices i are stored at bsp_mod[n+i], index 0 gets
 + * the value 1 and positive indices are stored at bsp_mod[i].
 + * Orders above 8 are a fatal error.
 + */
 +static void make_p3m_bspline_moduli_dim(real *bsp_mod,int n,int order)
 +{
 +    double zarg;
 +    int    maxk,i;
 +
 +    if (order > 8)
 +    {
 +        gmx_fatal(FARGS,"The current P3M code only supports orders up to 8");
 +    }
 +
 +    maxk = (n + 1)/2;
 +    zarg = M_PI/n;
 +
 +    /* The order of the three assignments below matters for small or
 +     * odd n, where the index ranges overlap: later writes win.
 +     */
 +    i = -maxk;
 +    while (i < 0)
 +    {
 +        bsp_mod[n+i] = p3m_bspline_modulus(zarg,i,order);
 +        i++;
 +    }
 +
 +    bsp_mod[0] = 1.0;
 +
 +    i = 1;
 +    while (i < maxk)
 +    {
 +        bsp_mod[i] = p3m_bspline_modulus(zarg,i,order);
 +        i++;
 +    }
 +}
 +
 +/* Calculate the P3M B-spline moduli for all three grid dimensions:
 + * bsp_mod[XX], bsp_mod[YY] and bsp_mod[ZZ] are filled for grid sizes
 + * nx, ny and nz respectively, all with the same interpolation order.
 + */
 +static void make_p3m_bspline_moduli(splinevec bsp_mod,
 +                                    int nx,int ny,int nz,int order)
 +{
 +    int dim[3],size[3];
 +    int d;
 +
 +    dim[0] = XX;  size[0] = nx;
 +    dim[1] = YY;  size[1] = ny;
 +    dim[2] = ZZ;  size[2] = nz;
 +
 +    for(d=0; d<3; d++)
 +    {
 +        make_p3m_bspline_moduli_dim(bsp_mod[dim[d]],size[d],order);
 +    }
 +}
 +
 +
 +/* Fill atc->node_dest and atc->node_src with the communication partners
 + * of this node, ordered by increasing distance in slab index.
 + * For each distance the forward partner is listed first and the backward
 + * partner second; at most atc->nslab-1 entries are written.
 + */
 +static void setup_coordinate_communication(pme_atomcomm_t *atc)
 +{
 +    int nslab,npartner,maxpartner,dist;
 +    int ahead,behind;
 +
 +    nslab      = atc->nslab;
 +    maxpartner = nslab - 1;
 +
 +    npartner = 0;
 +    for(dist=1; dist<=nslab/2; dist++)
 +    {
 +        ahead  = (atc->nodeid + dist) % nslab;
 +        behind = (atc->nodeid - dist + nslab) % nslab;
 +
 +        /* Send forward, receive from behind */
 +        if (npartner < maxpartner)
 +        {
 +            atc->node_dest[npartner] = ahead;
 +            atc->node_src[npartner]  = behind;
 +            npartner++;
 +        }
 +        /* Send backward, receive from ahead; skipped when the list is full */
 +        if (npartner < maxpartner)
 +        {
 +            atc->node_dest[npartner] = behind;
 +            atc->node_src[npartner]  = ahead;
 +            npartner++;
 +        }
 +    }
 +}
 +
 +/* Free the PME data structure *pmedata and set *pmedata to NULL.
 + *
 + * A message is written to log when log is not NULL. The grid index
 + * tables, the A grids with their FFT setup, the B grids (only when
 + * allocated) and the per-thread work data are released before the
 + * structure itself. Always returns 0.
 + */
 +int gmx_pme_destroy(FILE *log,gmx_pme_t *pmedata)
 +{
 +    int thread;
 +
 +    if(NULL != log)
 +    {
 +        fprintf(log,"Destroying PME data structures.\n");
 +    }
 +
 +    sfree((*pmedata)->nnx);
 +    sfree((*pmedata)->nny);
 +    sfree((*pmedata)->nnz);
 +
 +    pmegrids_destroy(&(*pmedata)->pmegridA);
 +
 +    sfree((*pmedata)->fftgridA);
 +    sfree((*pmedata)->cfftgridA);
 +    gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupA);
 +
 +    /* The B grids are released only when the B grid pointer is set */
 +    if ((*pmedata)->pmegridB.grid.grid != NULL)
 +    {
 +        pmegrids_destroy(&(*pmedata)->pmegridB);
 +        sfree((*pmedata)->fftgridB);
 +        sfree((*pmedata)->cfftgridB);
 +        gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB);
 +    }
 +    /* Release the contents of each thread's work data, then the array */
 +    for(thread=0; thread<(*pmedata)->nthread; thread++)
 +    {
 +        free_work(&(*pmedata)->work[thread]);
 +    }
 +    sfree((*pmedata)->work);
 +
 +    /* Clear the caller's handle so it cannot be used after the free */
 +    sfree(*pmedata);
 +    *pmedata = NULL;
 +
 +  return 0;
 +}
 +
 +/* Return n rounded up to a multiple of f, using integer division:
 + * the number of blocks of size f needed to hold n, times f.
 + */
 +static int mult_up(int n,int f)
 +{
 +    int nblock;
 +
 +    nblock = (n + f - 1)/f;
 +
 +    return nblock*f;
 +}
 +
 +
 +/* Return a weighted measure of the extra grid work caused by rounding
 + * the grid sizes up to multiples of the number of PME nodes in the major
 + * and minor dimension.
 + *
 + * n1, n2 and n3 are grid volumes with two of the three dimensions rounded
 + * up with mult_up; they are weighted 1, 1 and 3 and divided by 6 times
 + * the actual grid volume nkx*nky*nkz.
 + *
 + * All products are evaluated in double precision, since the volume of a
 + * large grid (and 6 times that volume) can exceed the range of int.
 + */
 +static double pme_load_imbalance(gmx_pme_t pme)
 +{
 +    int    nma,nmi;
 +    double n1,n2,n3;
 +
 +    nma = pme->nnodes_major;
 +    nmi = pme->nnodes_minor;
 +
 +    /* The leading cast makes each whole product a double computation */
 +    n1 = (double)mult_up(pme->nkx,nma)*mult_up(pme->nky,nmi)*pme->nkz;
 +    n2 = (double)mult_up(pme->nkx,nma)*mult_up(pme->nkz,nmi)*pme->nky;
 +    n3 = (double)mult_up(pme->nky,nma)*mult_up(pme->nkz,nmi)*pme->nkx;
 +
 +    /* pme_solve is roughly double the cost of an fft */
 +
 +    return (n1 + n2 + 3*n3)/(6.0*pme->nkx*pme->nky*pme->nkz);
 +}
 +
 +/* Initialize the atom communication data atc for decomposition index
 + * dimind.
 + *
 + * Without MPI, or with a single PME node, atc describes one slab with
 + * node id 0. Otherwise the slab count and node id are taken from the
 + * communicator pme->mpi_comm_d[dimind]. With more than one slab the
 + * send/receive partner lists and the count buffers are allocated;
 + * with more than one thread the per-thread particle lists are allocated.
 + * The cr argument is not used.
 + */
 +static void init_atomcomm(gmx_pme_t pme,pme_atomcomm_t *atc, t_commrec *cr,
 +                          int dimind,gmx_bool bSpread)
 +{
 +    int thread;
 +
 +    /* Single-slab defaults, overridden below when running in parallel */
 +    atc->dimind    = dimind;
 +    atc->nslab     = 1;
 +    atc->nodeid    = 0;
 +    atc->pd_nalloc = 0;
 +#ifdef GMX_MPI
 +    if (pme->nnodes > 1)
 +    {
 +        atc->mpi_comm = pme->mpi_comm_d[dimind];
 +        MPI_Comm_size(atc->mpi_comm,&atc->nslab);
 +        MPI_Comm_rank(atc->mpi_comm,&atc->nodeid);
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug,"For PME atom communication in dimind %d: nslab %d rank %d\n",atc->dimind,atc->nslab,atc->nodeid);
 +    }
 +#endif
 +
 +    atc->bSpread   = bSpread;
 +    atc->pme_order = pme->pme_order;
 +
 +    if (atc->nslab > 1)
 +    {
 +        /* The partner lists and the setup call are not required
 +         * for particle decomposition.
 +         */
 +        snew(atc->node_dest,atc->nslab);
 +        snew(atc->node_src,atc->nslab);
 +        setup_coordinate_communication(atc);
 +
 +        /* One count array per thread; count aliases the one of thread 0 */
 +        snew(atc->count_thread,pme->nthread);
 +        thread = 0;
 +        while (thread < pme->nthread)
 +        {
 +            snew(atc->count_thread[thread],atc->nslab);
 +            thread++;
 +        }
 +        atc->count = atc->count_thread[0];
 +        snew(atc->rcount,atc->nslab);
 +        snew(atc->buf_index,atc->nslab);
 +    }
 +
 +    atc->nthread = pme->nthread;
 +    snew(atc->spline,atc->nthread);
 +    if (atc->nthread > 1)
 +    {
 +        snew(atc->thread_plist,atc->nthread);
 +        for(thread=0; thread<atc->nthread; thread++)
 +        {
 +            /* Allocate GMX_CACHE_SEP extra elements on both sides and
 +             * let n point past the leading ones.
 +             */
 +            snew(atc->thread_plist[thread].n,atc->nthread+2*GMX_CACHE_SEP);
 +            atc->thread_plist[thread].n += GMX_CACHE_SEP;
 +        }
 +    }
 +}
 +
 +/* Set up the grid overlap communication ol for one decomposition
 + * dimension with ndata grid points divided over nnodes nodes.
 + *
 + * The function computes the slab boundaries s2g0/s2g1 for all nodes,
 + * determines with how many neighboring nodes this node (nodeid) has to
 + * exchange overlapping grid data, fills the send/receive node ids and
 + * index ranges for each of those neighbors, and allocates send and
 + * receive buffers of norder*commplainsize elements.
 + * With GMX_MPI defined, comm is stored in ol as the communicator.
 + */
 +static void
 +init_overlap_comm(pme_overlap_t *  ol,
 +                  int              norder,
 +#ifdef GMX_MPI
 +                  MPI_Comm         comm,
 +#endif
 +                  int              nnodes,
 +                  int              nodeid,
 +                  int              ndata,
 +                  int              commplainsize)
 +{
 +    int lbnd,rbnd,maxlr,b,i;
 +    int exten;
 +    int nn,nk;
 +    pme_grid_comm_t *pgc;
 +    gmx_bool bCont;
 +    int fft_start,fft_end,send_index1,recv_index1;
 +
 +#ifdef GMX_MPI
 +    ol->mpi_comm = comm;
 +#endif
 +
 +    ol->nnodes = nnodes;
 +    ol->nodeid = nodeid;
 +
 +    /* Linear translation of the PME grid won't affect reciprocal space
 +     * calculations, so to optimize we only interpolate "upwards",
 +     * which also means we only have to consider overlap in one direction.
 +     * I.e., particles on this node might also be spread to grid indices
 +     * that belong to higher nodes (modulo nnodes)
 +     */
 +
 +    /* s2g0 has one extra entry, which is set to ndata below */
 +    snew(ol->s2g0,ol->nnodes+1);
 +    snew(ol->s2g1,ol->nnodes);
 +    if (debug) { fprintf(debug,"PME slab boundaries:"); }
 +    for(i=0; i<nnodes; i++)
 +    {
 +        /* s2g0 is the local interpolation grid start.
 +         * s2g1 is the local interpolation grid end.
 +         * Because grid overlap communication only goes forward,
 +         * the grid slabs for the FFTs should be rounded down.
 +         */
 +        ol->s2g0[i] = ( i   *ndata + 0       )/nnodes;
 +        ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
 +
 +        if (debug)
 +        {
 +            fprintf(debug,"  %3d %3d",ol->s2g0[i],ol->s2g1[i]);
 +        }
 +    }
 +    ol->s2g0[nnodes] = ndata;
 +    if (debug) { fprintf(debug,"\n"); }
 +
 +    /* Determine with how many nodes we need to communicate the grid overlap.
 +     * b is increased until no node's interpolation grid end extends beyond
 +     * the start of the slab b nodes further on (taking periodicity into
 +     * account), or until b reaches nnodes.
 +     */
 +    b = 0;
 +    do
 +    {
 +        b++;
 +        bCont = FALSE;
 +        for(i=0; i<nnodes; i++)
 +        {
 +            if ((i+b <  nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
 +                (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
 +            {
 +                bCont = TRUE;
 +            }
 +        }
 +    }
 +    while (bCont && b < nnodes);
 +    ol->noverlap_nodes = b - 1;
 +
 +    /* We send to the nodes after us and receive from the nodes before us */
 +    snew(ol->send_id,ol->noverlap_nodes);
 +    snew(ol->recv_id,ol->noverlap_nodes);
 +    for(b=0; b<ol->noverlap_nodes; b++)
 +    {
 +        ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
 +        ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
 +    }
 +    snew(ol->comm_data, ol->noverlap_nodes);
 +
 +    for(b=0; b<ol->noverlap_nodes; b++)
 +    {
 +        pgc = &ol->comm_data[b];
 +        /* Send: the part of our interpolation grid that falls in
 +         * the slab of the receiving node.
 +         */
 +        fft_start        = ol->s2g0[ol->send_id[b]];
 +        fft_end          = ol->s2g0[ol->send_id[b]+1];
 +        if (ol->send_id[b] < nodeid)
 +        {
 +            /* The receiving slab is reached through the periodic boundary */
 +            fft_start += ndata;
 +            fft_end   += ndata;
 +        }
 +        send_index1      = ol->s2g1[nodeid];
 +        send_index1      = min(send_index1,fft_end);
 +        pgc->send_index0 = fft_start;
 +        pgc->send_nindex = max(0,send_index1 - pgc->send_index0);
 +
 +        /* We always start receiving to the first index of our slab */
 +        fft_start        = ol->s2g0[ol->nodeid];
 +        fft_end          = ol->s2g0[ol->nodeid+1];
 +        recv_index1      = ol->s2g1[ol->recv_id[b]];
 +        if (ol->recv_id[b] > nodeid)
 +        {
 +            /* The sending slab reaches us through the periodic boundary */
 +            recv_index1 -= ndata;
 +        }
 +        recv_index1      = min(recv_index1,fft_end);
 +        pgc->recv_index0 = fft_start;
 +        pgc->recv_nindex = max(0,recv_index1 - pgc->recv_index0);
 +    }
 +
 +    /* For a non-divisible grid we need pme_order instead of pme_order-1 */
 +    snew(ol->sendbuf,norder*commplainsize);
 +    snew(ol->recvbuf,norder*commplainsize);
 +}
 +
 +/* Allocate and fill two lookup tables of 5*n entries each, returned
 + * through *global_to_local and *fraction_shift.
 + *
 + * Entry i of the first table is the grid index i relative to local_start,
 + * modulo n. Entry i of the second table is the shift to apply to the
 + * fraction, which is zero unless the index was corrected:
 + * when local_range < n, an index that ends up just below the local grid
 + * (n-1) is moved to 0 with shift -1, and one just above it (local_range)
 + * is moved to local_range-1 with shift 1.
 + */
 +static void
 +make_gridindex5_to_localindex(int n,int local_start,int local_range,
 +                              int **global_to_local,
 +                              real **fraction_shift)
 +{
 +    int i,local;
 +    int * gtl;
 +    real * fsh;
 +    real shift;
 +
 +    snew(gtl,5*n);
 +    snew(fsh,5*n);
 +    for(i=0; i<5*n; i++)
 +    {
 +        /* Determine the global to local grid index */
 +        local = (i - local_start + n) % n;
 +        /* For coordinates that fall within the local grid the fraction
 +         * is correct and does not need to be shifted.
 +         */
 +        shift = 0;
 +
 +        /* Due to rounding issues i could be 1 beyond the lower or
 +         * upper boundary of the local grid. Correct the index for this.
 +         * If we shift the index, we need to shift the fraction by
 +         * the same amount in the other direction to not affect
 +         * the weights.
 +         * Note that due to this shifting the weights at the end of
 +         * the spline might change, but that will only involve values
 +         * between zero and values close to the precision of a real,
 +         * which is anyhow the accuracy of the whole mesh calculation.
 +         *
 +         * With local_range=0 we should not change i=local_start,
 +         * hence the second condition.
 +         */
 +        if (local_range < n && i % n != local_start)
 +        {
 +            if (local == n-1)
 +            {
 +                local = 0;
 +                shift = -1;
 +            }
 +            else if (local == local_range)
 +            {
 +                local = local_range - 1;
 +                shift = 1;
 +            }
 +        }
 +
 +        gtl[i] = local;
 +        fsh[i] = shift;
 +    }
 +
 +    *global_to_local = gtl;
 +    *fraction_shift  = fsh;
 +}
 +
 +/* Fill the SSE masks in work for every offset from 0 up to and
 + * including 8-order. For each offset, a window of order consecutive
 + * lanes starting at that offset is selected out of 8 lanes; the first
 + * four lanes go to mask_SSE0 and the last four to mask_SSE1.
 + * Does nothing when PME_SSE is not defined.
 + */
 +static void sse_mask_init(pme_spline_work_t *work,int order)
 +{
 +#ifdef PME_SSE
 +    float  lane[8];
 +    __m128 zero_SSE;
 +    int    of,i,nof;
 +
 +    zero_SSE = _mm_setzero_ps();
 +    nof      = 8 - (order - 1);
 +
 +    for(of=0; of<nof; of++)
 +    {
 +        /* 1 for the lanes inside the window, 0 for the others */
 +        for(i=0; i<8; i++)
 +        {
 +            if (i >= of && i < of+order)
 +            {
 +                lane[i] = 1;
 +            }
 +            else
 +            {
 +                lane[i] = 0;
 +            }
 +        }
 +        /* The comparison turns the 1/0 values into lane masks */
 +        work->mask_SSE0[of] = _mm_cmpgt_ps(_mm_loadu_ps(lane),zero_SSE);
 +        work->mask_SSE1[of] = _mm_cmpgt_ps(_mm_loadu_ps(lane+4),zero_SSE);
 +    }
 +#endif
 +}
 +
 +/* Make the grid size *nk in dimension dim divisible by nnodes.
 + *
 + * When *nk is already a multiple of nnodes nothing happens. Otherwise
 + * *nk is increased to the next multiple of nnodes and, when fplog is
 + * not NULL, a note about this is written to fplog. It is a fatal error
 + * when the new size would be 1.5 times the old size or more.
 + */
 +static void
 +gmx_pme_check_grid_restrictions(FILE *fplog,char dim,int nnodes,int *nk)
 +{
 +    int nk_old,nk_new;
 +
 +    nk_old = *nk;
 +    if (nk_old % nnodes == 0)
 +    {
 +        /* Already divisible, nothing to adjust */
 +        return;
 +    }
 +
 +    /* The next multiple of nnodes above the current size */
 +    nk_new = nnodes*(nk_old/nnodes + 1);
 +
 +    if (2*nk_new >= 3*nk_old)
 +    {
 +        gmx_fatal(FARGS,"The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). The grid size would have to be increased by more than 50%% to make the grid divisible. Change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).",
 +                  dim,nk_old,dim,nnodes,dim);
 +    }
 +
 +    if (fplog != NULL)
 +    {
 +        fprintf(fplog,"\nNOTE: The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). Increasing the PME grid size in dim %c to %d. This will increase the accuracy and will not decrease the performance significantly on this number of nodes. For optimal performance change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).\n\n",
 +                dim,nk_old,dim,nnodes,dim,nk_new,dim);
 +    }
 +
 +    *nk = nk_new;
 +}
 +
 +int gmx_pme_init(gmx_pme_t *         pmedata,
 +                 t_commrec *         cr,
 +                 int                 nnodes_major,
 +                 int                 nnodes_minor,
 +                 t_inputrec *        ir,
 +                 int                 homenr,
 +                 gmx_bool            bFreeEnergy,
 +                 gmx_bool            bReproducible,
 +                 int                 nthread)
 +{
 +    gmx_pme_t pme=NULL;
 +
 +    pme_atomcomm_t *atc;
 +    ivec ndata;
 +
 +    if (debug)
 +        fprintf(debug,"Creating PME data structures.\n");
 +    snew(pme,1);
 +
 +    pme->redist_init         = FALSE;
 +    pme->sum_qgrid_tmp       = NULL;
 +    pme->sum_qgrid_dd_tmp    = NULL;
 +    pme->buf_nalloc          = 0;
 +    pme->redist_buf_nalloc   = 0;
 +
 +    pme->nnodes              = 1;
 +    pme->bPPnode             = TRUE;
 +
 +    pme->nnodes_major        = nnodes_major;
 +    pme->nnodes_minor        = nnodes_minor;
 +
 +#ifdef GMX_MPI
 +    if (nnodes_major*nnodes_minor > 1)
 +    {
 +        pme->mpi_comm = cr->mpi_comm_mygroup;
 +
 +        MPI_Comm_rank(pme->mpi_comm,&pme->nodeid);
 +        MPI_Comm_size(pme->mpi_comm,&pme->nnodes);
 +        if (pme->nnodes != nnodes_major*nnodes_minor)
 +        {
 +            gmx_incons("PME node count mismatch");
 +        }
 +    }
 +    else
 +    {
 +        pme->mpi_comm = MPI_COMM_NULL;
 +    }
 +#endif
 +
 +    if (pme->nnodes == 1)
 +    {
 +        pme->ndecompdim = 0;
 +        pme->nodeid_major = 0;
 +        pme->nodeid_minor = 0;
 +#ifdef GMX_MPI
 +        pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL;
 +#endif
 +    }
 +    else
 +    {
 +        if (nnodes_minor == 1)
 +        {
 +#ifdef GMX_MPI
 +            pme->mpi_comm_d[0] = pme->mpi_comm;
 +            pme->mpi_comm_d[1] = MPI_COMM_NULL;
 +#endif
 +            pme->ndecompdim = 1;
 +            pme->nodeid_major = pme->nodeid;
 +            pme->nodeid_minor = 0;
 +
 +        }
 +        else if (nnodes_major == 1)
 +        {
 +#ifdef GMX_MPI
 +            pme->mpi_comm_d[0] = MPI_COMM_NULL;
 +            pme->mpi_comm_d[1] = pme->mpi_comm;
 +#endif
 +            pme->ndecompdim = 1;
 +            pme->nodeid_major = 0;
 +            pme->nodeid_minor = pme->nodeid;
 +        }
 +        else
 +        {
 +            if (pme->nnodes % nnodes_major != 0)
 +            {
 +                gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
 +            }
 +            pme->ndecompdim = 2;
 +
 +#ifdef GMX_MPI
 +            MPI_Comm_split(pme->mpi_comm,pme->nodeid % nnodes_minor,
 +                           pme->nodeid,&pme->mpi_comm_d[0]);  /* My communicator along major dimension */
 +            MPI_Comm_split(pme->mpi_comm,pme->nodeid/nnodes_minor,
 +                           pme->nodeid,&pme->mpi_comm_d[1]);  /* My communicator along minor dimension */
 +
 +            MPI_Comm_rank(pme->mpi_comm_d[0],&pme->nodeid_major);
 +            MPI_Comm_size(pme->mpi_comm_d[0],&pme->nnodes_major);
 +            MPI_Comm_rank(pme->mpi_comm_d[1],&pme->nodeid_minor);
 +            MPI_Comm_size(pme->mpi_comm_d[1],&pme->nnodes_minor);
 +#endif
 +        }
 +        pme->bPPnode = (cr->duty & DUTY_PP);
 +    }
 +
 +    pme->nthread = nthread;
 +
 +    if (ir->ePBC == epbcSCREW)
 +    {
 +        gmx_fatal(FARGS,"pme does not (yet) work with pbc = screw");
 +    }
 +
 +    pme->bFEP        = ((ir->efep != efepNO) && bFreeEnergy);
 +    pme->nkx         = ir->nkx;
 +    pme->nky         = ir->nky;
 +    pme->nkz         = ir->nkz;
 +    pme->bP3M        = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL);
 +    pme->pme_order   = ir->pme_order;
 +    pme->epsilon_r   = ir->epsilon_r;
 +
 +    if (pme->pme_order > PME_ORDER_MAX)
 +    {
 +        gmx_fatal(FARGS,"pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
 +                  pme->pme_order,PME_ORDER_MAX);
 +    }
 +
 +    /* Currently pme.c supports only the fft5d FFT code.
 +     * Therefore the grid always needs to be divisible by nnodes.
 +     * When the old 1D code is also supported again, change this check.
 +     *
 +     * This check should be done before calling gmx_pme_init
 +     * and fplog should be passed iso stderr.
 +     *
 +    if (pme->ndecompdim >= 2)
 +    */
 +    if (pme->ndecompdim >= 1)
 +    {
 +        /*
 +        gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
 +                                        'x',nnodes_major,&pme->nkx);
 +        gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
 +                                        'y',nnodes_minor,&pme->nky);
 +        */
 +    }
 +
 +    if (pme->nkx <= pme->pme_order*(pme->nnodes_major > 1 ? 2 : 1) ||
 +        pme->nky <= pme->pme_order*(pme->nnodes_minor > 1 ? 2 : 1) ||
 +        pme->nkz <= pme->pme_order)
 +    {
 +        gmx_fatal(FARGS,"The pme grid dimensions need to be larger than pme_order (%d) and in parallel larger than 2*pme_ordern for x and/or y",pme->pme_order);
 +    }
 +
 +    if (pme->nnodes > 1) {
 +        double imbal;
 +
 +#ifdef GMX_MPI
 +        MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
 +        MPI_Type_commit(&(pme->rvec_mpi));
 +#endif
 +
 +        /* Note that the charge spreading and force gathering, which usually
 +         * takes about the same amount of time as FFT+solve_pme,
 +         * is always fully load balanced
 +         * (unless the charge distribution is inhomogeneous).
 +         */
 +
 +        imbal = pme_load_imbalance(pme);
 +        if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
 +        {
 +            fprintf(stderr,
 +                    "\n"
 +                    "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
 +                    "      For optimal PME load balancing\n"
 +                    "      PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
 +                    "      and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
 +                    "\n",
 +                    (int)((imbal-1)*100 + 0.5),
 +                    pme->nkx,pme->nky,pme->nnodes_major,
 +                    pme->nky,pme->nkz,pme->nnodes_minor);
 +        }
 +    }
 +
 +    /* For non-divisible grid we need pme_order iso pme_order-1 */
 +    /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
 +     * y is always copied through a buffer: we don't need padding in z,
 +     * but we do need the overlap in x because of the communication order.
 +     */
 +    init_overlap_comm(&pme->overlap[0],pme->pme_order,
 +#ifdef GMX_MPI
 +                      pme->mpi_comm_d[0],
 +#endif
 +                      pme->nnodes_major,pme->nodeid_major,
 +                      pme->nkx,
 +                      (div_round_up(pme->nky,pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
 +
 +    init_overlap_comm(&pme->overlap[1],pme->pme_order,
 +#ifdef GMX_MPI
 +                      pme->mpi_comm_d[1],
 +#endif
 +                      pme->nnodes_minor,pme->nodeid_minor,
 +                      pme->nky,
 +                      (div_round_up(pme->nkx,pme->nnodes_major)+pme->pme_order)*pme->nkz);
 +
 +    /* Check for a limitation of the (current) sum_fftgrid_dd code */
 +    if (pme->nthread > 1 &&
 +        (pme->overlap[0].noverlap_nodes > 1 ||
 +         pme->overlap[1].noverlap_nodes > 1))
 +    {
 +        gmx_fatal(FARGS,"With threads the number of grid lines per node along x and or y should be pme_order (%d) or more or exactly pme_order-1",pme->pme_order);
 +    }
 +
 +    snew(pme->bsp_mod[XX],pme->nkx);
 +    snew(pme->bsp_mod[YY],pme->nky);
 +    snew(pme->bsp_mod[ZZ],pme->nkz);
 +
 +    /* The required size of the interpolation grid, including overlap.
 +     * The allocated size (pmegrid_n?) might be slightly larger.
 +     */
 +    pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
 +                      pme->overlap[0].s2g0[pme->nodeid_major];
 +    pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
 +                      pme->overlap[1].s2g0[pme->nodeid_minor];
 +    pme->pmegrid_nz_base = pme->nkz;
 +    pme->pmegrid_nz = pme->pmegrid_nz_base + pme->pme_order - 1;
 +    set_grid_alignment(&pme->pmegrid_nz,pme->pme_order);
 +
 +    pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
 +    pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
 +    pme->pmegrid_start_iz = 0;
 +
 +    make_gridindex5_to_localindex(pme->nkx,
 +                                  pme->pmegrid_start_ix,
 +                                  pme->pmegrid_nx - (pme->pme_order-1),
 +                                  &pme->nnx,&pme->fshx);
 +    make_gridindex5_to_localindex(pme->nky,
 +                                  pme->pmegrid_start_iy,
 +                                  pme->pmegrid_ny - (pme->pme_order-1),
 +                                  &pme->nny,&pme->fshy);
 +    make_gridindex5_to_localindex(pme->nkz,
 +                                  pme->pmegrid_start_iz,
 +                                  pme->pmegrid_nz_base,
 +                                  &pme->nnz,&pme->fshz);
 +
 +    pmegrids_init(&pme->pmegridA,
 +                  pme->pmegrid_nx,pme->pmegrid_ny,pme->pmegrid_nz,
 +                  pme->pmegrid_nz_base,
 +                  pme->pme_order,
 +                  pme->nthread,
 +                  pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
 +                  pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
 +
 +    sse_mask_init(&pme->spline_work,pme->pme_order);
 +
 +    ndata[0] = pme->nkx;
 +    ndata[1] = pme->nky;
 +    ndata[2] = pme->nkz;
 +
 +    /* This routine will allocate the grid data to fit the FFTs */
 +    gmx_parallel_3dfft_init(&pme->pfft_setupA,ndata,
 +                            &pme->fftgridA,&pme->cfftgridA,
 +                            pme->mpi_comm_d,
 +                            pme->overlap[0].s2g0,pme->overlap[1].s2g0,
 +                            bReproducible,pme->nthread);
 +
 +    if (bFreeEnergy)
 +    {
 +        pmegrids_init(&pme->pmegridB,
 +                      pme->pmegrid_nx,pme->pmegrid_ny,pme->pmegrid_nz,
 +                      pme->pmegrid_nz_base,
 +                      pme->pme_order,
 +                      pme->nthread,
 +                      pme->nkx % pme->nnodes_major != 0,
 +                      pme->nky % pme->nnodes_minor != 0);
 +
 +        gmx_parallel_3dfft_init(&pme->pfft_setupB,ndata,
 +                                &pme->fftgridB,&pme->cfftgridB,
 +                                pme->mpi_comm_d,
 +                                pme->overlap[0].s2g0,pme->overlap[1].s2g0,
 +                                bReproducible,pme->nthread);
 +    }
 +    else
 +    {
 +        pme->pmegridB.grid.grid = NULL;
 +        pme->fftgridB           = NULL;
 +        pme->cfftgridB          = NULL;
 +    }
 +
 +    if (!pme->bP3M)
 +    {
 +        /* Use plain SPME B-spline interpolation */
 +        make_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
 +    }
 +    else
 +    {
 +        /* Use the P3M grid-optimized influence function */
 +        make_p3m_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
 +    }
 +
 +    /* Use atc[0] for spreading */
 +    init_atomcomm(pme,&pme->atc[0],cr,nnodes_major > 1 ? 0 : 1,TRUE);
 +    if (pme->ndecompdim >= 2)
 +    {
 +        init_atomcomm(pme,&pme->atc[1],cr,1,FALSE);
 +    }
 +
 +    if (pme->nnodes == 1) {
 +        pme->atc[0].n = homenr;
 +        pme_realloc_atomcomm_things(&pme->atc[0]);
 +    }
 +
 +    {
 +        int thread;
 +
 +        /* Use fft5d, order after FFT is y major, z, x minor */
 +
 +        snew(pme->work,pme->nthread);
 +        for(thread=0; thread<pme->nthread; thread++)
 +        {
 +            realloc_work(&pme->work[thread],pme->nkx);
 +        }
 +    }
 +
 +    *pmedata = pme;
 +
 +    return 0;
 +}
 +
 +
 +/* Copy the non-overlapping part of the thread-local grid of thread
 + * 'thread' directly into the node-local FFT grid 'fftgrid'.
 + * Since every thread does this for its own block, this also
 + * initializes the full FFT grid.
 + */
 +static void copy_local_grid(gmx_pme_t pme,
 +                            pmegrids_t *pmegrids,int thread,real *fftgrid)
 +{
 +    ivec fft_ndata,fft_offset,fft_size;
 +    ivec ncopy;
 +    pmegrid_t *src;
 +    real *src_data;
 +    int  dst_ny,dst_nz;
 +    int  src_ny,src_nz;
 +    int  off_x,off_y,off_z;
 +    int  dim,ix,iy,iz,idst,isrc;
 +
 +    /* fft_size supplies the strides used to index fftgrid,
 +     * fft_ndata bounds the region we are allowed to fill.
 +     */
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
 +                                   fft_ndata,
 +                                   fft_offset,
 +                                   fft_size);
 +    dst_ny = fft_size[YY];
 +    dst_nz = fft_size[ZZ];
 +
 +    src = &pmegrids->grid_th[thread];
 +
 +    src_ny = src->n[YY];
 +    src_nz = src->n[ZZ];
 +
 +    /* Per dimension: drop the order-1 overlap points of the thread grid
 +     * and do not run past the end of the local FFT grid.
 +     */
 +    for(dim=0; dim<DIM; dim++)
 +    {
 +        ncopy[dim] = min(src->n[dim] - (src->order - 1),
 +                         fft_ndata[dim] - src->offset[dim]);
 +    }
 +
 +    off_x = src->offset[XX];
 +    off_y = src->offset[YY];
 +    off_z = src->offset[ZZ];
 +
 +    src_data = src->grid;
 +    for(ix=0; ix<ncopy[XX]; ix++)
 +    {
 +        for(iy=0; iy<ncopy[YY]; iy++)
 +        {
 +            /* Start of this z-row in the FFT grid and in the thread grid */
 +            idst = ((off_x + ix)*dst_ny + (off_y + iy))*dst_nz + off_z;
 +            isrc = (ix*src_ny + iy)*src_nz;
 +            for(iz=0; iz<ncopy[ZZ]; iz++)
 +            {
 +                fftgrid[idst+iz] = src_data[isrc+iz];
 +            }
 +        }
 +    }
 +}
 +
 +/* Debug helper: for the first communication pulse along the major
 + * dimension, print for every (x,y) column of 'sendbuf' the number of
 + * non-zero entries along z. The first output line lists the y indices.
 + */
 +static void print_sendbuf(gmx_pme_t pme,real *sendbuf)
 +{
 +    ivec fft_ndata,fft_offset,fft_size;
 +    int  nslab,ny,nz;
 +    int  ix,iy,iz,pos,nonzero;
 +
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
 +                                   fft_ndata,
 +                                   fft_offset,
 +                                   fft_size);
 +    ny = fft_ndata[YY];
 +    nz = fft_ndata[ZZ];
 +
 +    /* Number of x-slabs sent in the first pulse of the major dimension */
 +    nslab = pme->overlap[0].comm_data[0].send_nindex;
 +
 +    /* Header line with the y indices */
 +    for(iy=0; iy<ny; iy++)
 +    {
 +        printf(" %2d",iy);
 +    }
 +    printf("\n");
 +
 +    /* The buffer is walked linearly: x slowest, z fastest */
 +    pos = 0;
 +    for(ix=0; ix<nslab; ix++)
 +    {
 +        for(iy=0; iy<ny; iy++)
 +        {
 +            nonzero = 0;
 +            for(iz=0; iz<nz; iz++,pos++)
 +            {
 +                if (sendbuf[pos] != 0)
 +                {
 +                    nonzero++;
 +                }
 +            }
 +            printf(" %2d",nonzero);
 +        }
 +        printf("\n");
 +    }
 +}
 +
 +/* Reduce the overlapping parts of the thread-local grids for the grid
 + * region that belongs to thread 'thread'.
 + *
 + * Contributions of other threads that stay on this node are summed
 + * directly into 'fftgrid'. Contributions that wrap around the lower
 + * edge in x and/or y while that dimension is decomposed over more than
 + * one node (bCommX/bCommY below) are instead stored in the communication
 + * buffers 'commbuf_x' and 'commbuf_y'.
 + * The first write to a communication buffer region assigns (and thereby
 + * initializes) it, later writes accumulate.
 + */
 +static void
 +reduce_threadgrid_overlap(gmx_pme_t pme,
 +                          const pmegrids_t *pmegrids,int thread,
 +                          real *fftgrid,real *commbuf_x,real *commbuf_y)
 +{
 +    ivec local_fft_ndata,local_fft_offset,local_fft_size;
 +    int  fft_nx,fft_ny,fft_nz;
 +    int  fft_my,fft_mz;
 +    int  buf_my=-1;
 +    int  nsx,nsy,nsz;
 +    ivec ne;
 +    int  offx,offy,offz,x,y,z,i0,i0t;
 +    int  sx,sy,sz,fx,fy,fz,tx1,ty1,tz1,ox,oy,oz;
 +    gmx_bool bClearBufX,bClearBufY,bClearBufXY,bClearBuf;
 +    gmx_bool bCommX,bCommY;
 +    int  d;
 +    int  thread_f;
 +    const pmegrid_t *pmegrid,*pmegrid_g,*pmegrid_f;
 +    const real *grid_th;
 +    real *commbuf=NULL;
 +
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
 +                                   local_fft_ndata,
 +                                   local_fft_offset,
 +                                   local_fft_size);
 +    /* fft_n? bound the loops and index the comm buffers,
 +     * fft_m? are the strides used to index fftgrid.
 +     */
 +    fft_nx = local_fft_ndata[XX];
 +    fft_ny = local_fft_ndata[YY];
 +    fft_nz = local_fft_ndata[ZZ];
 +
 +    fft_my = local_fft_size[YY];
 +    fft_mz = local_fft_size[ZZ];
 +
 +    /* This routine is called when all threads have finished spreading.
 +     * Here each thread sums grid contributions calculated by other threads
 +     * to the thread local grid volume.
 +     * To minimize the number of grid copying operations,
 +     * this routine sums immediately from the pmegrid to the fftgrid.
 +     */
 +
 +    /* Determine which part of the full node grid we should operate on,
 +     * this is our thread local part of the full grid.
 +     */
 +    pmegrid = &pmegrids->grid_th[thread];
 +
 +    /* ne is the exclusive end of our region in node-local grid indices */
 +    for(d=0; d<DIM; d++)
 +    {
 +        ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
 +                    local_fft_ndata[d]);
 +    }
 +
 +    offx = pmegrid->offset[XX];
 +    offy = pmegrid->offset[YY];
 +    offz = pmegrid->offset[ZZ];
 +
 +
 +    /* Each communication buffer region is assigned on first use */
 +    bClearBufX  = TRUE;
 +    bClearBufY  = TRUE;
 +    bClearBufXY = TRUE;
 +
 +    /* Now loop over all the thread data blocks that contribute
 +     * to the grid region we (our thread) are operating on.
 +     */
 +    /* Note that fft_nx/y is equal to the number of grid points
 +     * between the first point of our node grid and the one of the next node.
 +     */
 +    for(sx=0; sx>=-pmegrids->nthread_comm[XX]; sx--)
 +    {
 +        fx = pmegrid->ci[XX] + sx;
 +        ox = 0;
 +        bCommX = FALSE;
 +        if (fx < 0) {
 +            /* Wrap around to the last thread cell in x; the data then
 +             * belongs to another node when x is decomposed over nodes.
 +             */
 +            fx += pmegrids->nc[XX];
 +            ox -= fft_nx;
 +            bCommX = (pme->nnodes_major > 1);
 +        }
 +        pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
 +        ox += pmegrid_g->offset[XX];
 +        if (!bCommX)
 +        {
 +            tx1 = min(ox + pmegrid_g->n[XX],ne[XX]);
 +        }
 +        else
 +        {
 +            tx1 = min(ox + pmegrid_g->n[XX],pme->pme_order);
 +        }
 +
 +        for(sy=0; sy>=-pmegrids->nthread_comm[YY]; sy--)
 +        {
 +            fy = pmegrid->ci[YY] + sy;
 +            oy = 0;
 +            bCommY = FALSE;
 +            if (fy < 0) {
 +                /* Same wrap-around handling as for x above */
 +                fy += pmegrids->nc[YY];
 +                oy -= fft_ny;
 +                bCommY = (pme->nnodes_minor > 1);
 +            }
 +            pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
 +            oy += pmegrid_g->offset[YY];
 +            if (!bCommY)
 +            {
 +                ty1 = min(oy + pmegrid_g->n[YY],ne[YY]);
 +            }
 +            else
 +            {
 +                ty1 = min(oy + pmegrid_g->n[YY],pme->pme_order);
 +            }
 +
 +            for(sz=0; sz>=-pmegrids->nthread_comm[ZZ]; sz--)
 +            {
 +                fz = pmegrid->ci[ZZ] + sz;
 +                oz = 0;
 +                if (fz < 0)
 +                {
 +                    /* z wraps within the node: no communication flag */
 +                    fz += pmegrids->nc[ZZ];
 +                    oz -= fft_nz;
 +                }
 +                pmegrid_g = &pmegrids->grid_th[fz];
 +                oz += pmegrid_g->offset[ZZ];
 +                tz1 = min(oz + pmegrid_g->n[ZZ],ne[ZZ]);
 +
 +                if (sx == 0 && sy == 0 && sz == 0)
 +                {
 +                    /* We have already added our local contribution
 +                     * before calling this routine, so skip it here.
 +                     */
 +                    continue;
 +                }
 +
 +                /* Linear index of the contributing thread grid */
 +                thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz;
 +
 +                pmegrid_f = &pmegrids->grid_th[thread_f];
 +
 +                grid_th = pmegrid_f->grid;
 +
 +                nsx = pmegrid_f->n[XX];
 +                nsy = pmegrid_f->n[YY];
 +                nsz = pmegrid_f->n[ZZ];
 +
 +#ifdef DEBUG_PME_REDUCE
 +                printf("n%d t%d add %d  %2d %2d %2d  %2d %2d %2d  %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
 +                       pme->nodeid,thread,thread_f,
 +                       pme->pmegrid_start_ix,
 +                       pme->pmegrid_start_iy,
 +                       pme->pmegrid_start_iz,
 +                       sx,sy,sz,
 +                       offx-ox,tx1-ox,offx,tx1,
 +                       offy-oy,ty1-oy,offy,ty1,
 +                       offz-oz,tz1-oz,offz,tz1);
 +#endif
 +
 +                if (!(bCommX || bCommY))
 +                {
 +                    /* Copy from the thread local grid to the node grid */
 +                    for(x=offx; x<tx1; x++)
 +                    {
 +                        for(y=offy; y<ty1; y++)
 +                        {
 +                            /* i0t includes -oz so that both arrays
 +                             * can be indexed with the same z below.
 +                             */
 +                            i0  = (x*fft_my + y)*fft_mz;
 +                            i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
 +                            for(z=offz; z<tz1; z++)
 +                            {
 +                                fftgrid[i0+z] += grid_th[i0t+z];
 +                            }
 +                        }
 +                    }
 +                }
 +                else
 +                {
 +                    /* The order of this conditional decides
 +                     * where the corner volume gets stored with x+y decomp.
 +                     */
 +                    if (bCommY)
 +                    {
 +                        commbuf = commbuf_y;
 +                        buf_my  = ty1 - offy;
 +                        if (bCommX)
 +                        {
 +                            /* We index commbuf modulo the local grid size */
 +                            commbuf += buf_my*fft_nx*fft_nz;
 +
 +                            bClearBuf  = bClearBufXY;
 +                            bClearBufXY = FALSE;
 +                        }
 +                        else
 +                        {
 +                            bClearBuf  = bClearBufY;
 +                            bClearBufY = FALSE;
 +                        }
 +                    }
 +                    else
 +                    {
 +                        commbuf = commbuf_x;
 +                        buf_my  = fft_ny;
 +                        bClearBuf  = bClearBufX;
 +                        bClearBufX = FALSE;
 +                    }
 +
 +                    /* Copy to the communication buffer */
 +                    for(x=offx; x<tx1; x++)
 +                    {
 +                        for(y=offy; y<ty1; y++)
 +                        {
 +                            i0  = (x*buf_my + y)*fft_nz;
 +                            i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
 +
 +                            if (bClearBuf)
 +                            {
 +                                /* First access of commbuf, initialize it */
 +                                for(z=offz; z<tz1; z++)
 +                                {
 +                                    commbuf[i0+z]  = grid_th[i0t+z];
 +                                }
 +                            }
 +                            else
 +                            {
 +                                for(z=offz; z<tz1; z++)
 +                                {
 +                                    commbuf[i0+z] += grid_th[i0t+z];
 +                                }
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +
 +/* Communicate and sum the overlapping parts of the FFT grid between
 + * neighboring PME nodes, using the send buffers of pme->overlap[].
 + *
 + * The minor dimension (overlap[1]) is handled first; when the major
 + * dimension is decomposed as well, the extra x-slabs received in that
 + * step are accumulated into the send buffer of overlap[0], which is
 + * then exchanged in the second step.
 + * Without GMX_MPI no data is exchanged; only the summation loops run.
 + */
 +static void sum_fftgrid_dd(gmx_pme_t pme,real *fftgrid)
 +{
 +    ivec local_fft_ndata,local_fft_offset,local_fft_size;
 +    pme_overlap_t *overlap;
 +    int  send_nindex;
 +    int  recv_index0,recv_nindex;
 +#ifdef GMX_MPI
 +    MPI_Status stat;
 +#endif
 +    int  ipulse,send_id,recv_id,datasize,gridsize,size_yx;
 +    real *sendptr,*recvptr;
 +    int  x,y,z,indg,indb;
 +
 +    /* Note that this routine is only used for forward communication.
 +     * Since the force gathering, unlike the charge spreading,
 +     * can be trivially parallelized over the particles,
 +     * the backwards process is much simpler and can use the "old"
 +     * communication setup.
 +     */
 +
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
 +                                   local_fft_ndata,
 +                                   local_fft_offset,
 +                                   local_fft_size);
 +
 +    /* Currently supports only a single communication pulse */
 +
 +/* for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++) */
 +    if (pme->nnodes_minor > 1)
 +    {
 +        /* Minor dimension */
 +        overlap = &pme->overlap[1];
 +
 +        /* With decomposition along the major dimension as well,
 +         * size_yx extra x-slabs are appended to the exchanged data.
 +         */
 +        if (pme->nnodes_major > 1)
 +        {
 +             size_yx = pme->overlap[0].comm_data[0].send_nindex;
 +        }
 +        else
 +        {
 +            size_yx = 0;
 +        }
 +        /* Number of reals per exchanged y-index */
 +        datasize = (local_fft_ndata[XX]+size_yx)*local_fft_ndata[ZZ];
 +
 +        ipulse = 0;
 +
 +        send_id = overlap->send_id[ipulse];
 +        recv_id = overlap->recv_id[ipulse];
 +        send_nindex   = overlap->comm_data[ipulse].send_nindex;
 +        /* recv_index0   = overlap->comm_data[ipulse].recv_index0; */
 +        recv_index0 = 0;
 +        recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
 +
 +        sendptr = overlap->sendbuf;
 +        recvptr = overlap->recvbuf;
 +
 +        /*
 +        printf("node %d comm %2d x %2d x %2d\n",pme->nodeid,
 +               local_fft_ndata[XX]+size_yx,send_nindex,local_fft_ndata[ZZ]);
 +        printf("node %d send %f, %f\n",pme->nodeid,
 +               sendptr[0],sendptr[send_nindex*datasize-1]);
 +        */
 +
 +#ifdef GMX_MPI
 +        MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
 +                     send_id,ipulse,
 +                     recvptr,recv_nindex*datasize,GMX_MPI_REAL,
 +                     recv_id,ipulse,
 +                     overlap->mpi_comm,&stat);
 +#endif
 +
 +        /* Add the received data to the first recv_nindex y-lines
 +         * of the local FFT grid.
 +         */
 +        for(x=0; x<local_fft_ndata[XX]; x++)
 +        {
 +            for(y=0; y<recv_nindex; y++)
 +            {
 +                indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
 +                indb = (x*recv_nindex        + y)*local_fft_ndata[ZZ];
 +                for(z=0; z<local_fft_ndata[ZZ]; z++)
 +                {
 +                    fftgrid[indg+z] += recvptr[indb+z];
 +                }
 +            }
 +        }
 +        if (pme->nnodes_major > 1)
 +        {
 +            /* The x-slabs received beyond local_fft_ndata[XX] are added
 +             * to the send buffer of the major dimension, so they get
 +             * forwarded by the exchange below.
 +             */
 +            sendptr = pme->overlap[0].sendbuf;
 +            for(x=0; x<size_yx; x++)
 +            {
 +                for(y=0; y<recv_nindex; y++)
 +                {
 +                    indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
 +                    indb = ((local_fft_ndata[XX] + x)*recv_nindex +y)*local_fft_ndata[ZZ];
 +                    for(z=0; z<local_fft_ndata[ZZ]; z++)
 +                    {
 +                        sendptr[indg+z] += recvptr[indb+z];
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    /* for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++) */
 +    if (pme->nnodes_major > 1)
 +    {
 +        /* Major dimension */
 +        overlap = &pme->overlap[0];
 +
 +        /* Number of reals per exchanged x-slab */
 +        datasize = local_fft_ndata[YY]*local_fft_ndata[ZZ];
 +        gridsize = local_fft_size[YY] *local_fft_size[ZZ];
 +
 +        ipulse = 0;
 +
 +        send_id = overlap->send_id[ipulse];
 +        recv_id = overlap->recv_id[ipulse];
 +        send_nindex   = overlap->comm_data[ipulse].send_nindex;
 +        /* recv_index0   = overlap->comm_data[ipulse].recv_index0; */
 +        recv_index0 = 0;
 +        recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
 +
 +        sendptr = overlap->sendbuf;
 +        recvptr = overlap->recvbuf;
 +
 +        if (debug != NULL)
 +        {
 +            fprintf(debug,"PME fftgrid comm %2d x %2d x %2d\n",
 +                   send_nindex,local_fft_ndata[YY],local_fft_ndata[ZZ]);
 +        }
 +
 +#ifdef GMX_MPI
 +        MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
 +                     send_id,ipulse,
 +                     recvptr,recv_nindex*datasize,GMX_MPI_REAL,
 +                     recv_id,ipulse,
 +                     overlap->mpi_comm,&stat);
 +#endif
 +
 +        /* Add the received data to the first recv_nindex x-slabs
 +         * of the local FFT grid.
 +         */
 +        for(x=0; x<recv_nindex; x++)
 +        {
 +            for(y=0; y<local_fft_ndata[YY]; y++)
 +            {
 +                indg = (x*local_fft_size[YY]  + y)*local_fft_size[ZZ];
 +                indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
 +                for(z=0; z<local_fft_ndata[ZZ]; z++)
 +                {
 +                    fftgrid[indg+z] += recvptr[indb+z];
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +
 +/* Compute the interpolation indices and B-splines for the atoms in 'atc'
 + * and/or spread their charges on the grid(s).
 + *
 + * pme          PME data; supplies the thread count, order and overlap buffers
 + * atc          atom communication data with the atoms to treat
 + * grids        thread grids to spread on; may be NULL when bSpread is FALSE
 + *              (gmx_pme_calc_energy calls it this way)
 + * bCalcSplines when TRUE, compute the grid indices and the B-splines
 + * bSpread      when TRUE, spread the charges; requires grids != NULL
 + * fftgrid      node-local FFT grid that receives the thread grid data
 + *              when more than one thread grid is used
 + *
 + * With multiple thread grids the overlap between the thread grids is
 + * reduced afterwards and, with more than one PME node, the overlap
 + * with neighboring nodes is communicated.
 + */
 +static void spread_on_grid(gmx_pme_t pme,
 +                           pme_atomcomm_t *atc,pmegrids_t *grids,
 +                           gmx_bool bCalcSplines,gmx_bool bSpread,
 +                           real *fftgrid)
 +{
 +    int nthread,thread;
 +#ifdef PME_TIME_THREADS
 +    gmx_cycles_t c1,c2,c3,ct1a,ct1b,ct1c;
 +    static double cs1=0,cs2=0,cs3=0;
 +    static double cs1a[6]={0,0,0,0,0,0};
 +    static int cnt=0;
 +#endif
 +
 +    nthread = pme->nthread;
 +    assert(nthread>0);
 +    /* Spreading dereferences grids below, so it can not be NULL then */
 +    assert(!bSpread || grids != NULL);
 +
 +#ifdef PME_TIME_THREADS
 +    c1 = omp_cyc_start();
 +#endif
 +    if (bCalcSplines)
 +    {
 +#pragma omp parallel for num_threads(nthread) schedule(static)
 +        for(thread=0; thread<nthread; thread++)
 +        {
 +            int start,end;
 +
 +            /* Each thread handles a contiguous range of atoms */
 +            start = atc->n* thread   /nthread;
 +            end   = atc->n*(thread+1)/nthread;
 +
 +            /* Compute fftgrid index for all atoms,
 +             * with help of some extra variables.
 +             */
 +            calc_interpolation_idx(pme,atc,start,end,thread);
 +        }
 +    }
 +#ifdef PME_TIME_THREADS
 +    c1 = omp_cyc_end(c1);
 +    cs1 += (double)c1;
 +#endif
 +
 +#ifdef PME_TIME_THREADS
 +    c2 = omp_cyc_start();
 +#endif
 +#pragma omp parallel for num_threads(nthread) schedule(static)
 +    for(thread=0; thread<nthread; thread++)
 +    {
 +        splinedata_t *spline;
 +        pmegrid_t *grid;
 +
 +        /* make local bsplines  */
 +        if (grids == NULL || grids->nthread == 1)
 +        {
 +            spline = &atc->spline[0];
 +
 +            spline->n = atc->n;
 +
 +            /* Do not form a member address through a NULL grids pointer;
 +             * without grids there is nothing to spread on.
 +             */
 +            grid = (grids != NULL) ? &grids->grid : NULL;
 +        }
 +        else
 +        {
 +            spline = &atc->spline[thread];
 +
 +            make_thread_local_ind(atc,thread,spline);
 +
 +            grid = &grids->grid_th[thread];
 +        }
 +
 +        if (bCalcSplines)
 +        {
 +            make_bsplines(spline->theta,spline->dtheta,pme->pme_order,
 +                          atc->fractx,spline->n,spline->ind,atc->q,pme->bFEP);
 +        }
 +
 +        if (bSpread)
 +        {
 +            /* put local atoms on grid. */
 +#ifdef PME_TIME_SPREAD
 +            ct1a = omp_cyc_start();
 +#endif
 +            spread_q_bsplines_thread(grid,atc,spline,&pme->spline_work);
 +
 +            if (grids->nthread > 1)
 +            {
 +                /* Put the non-overlapping part straight into the fftgrid */
 +                copy_local_grid(pme,grids,thread,fftgrid);
 +            }
 +#ifdef PME_TIME_SPREAD
 +            ct1a = omp_cyc_end(ct1a);
 +            cs1a[thread] += (double)ct1a;
 +#endif
 +        }
 +    }
 +#ifdef PME_TIME_THREADS
 +    c2 = omp_cyc_end(c2);
 +    cs2 += (double)c2;
 +#endif
 +
 +    if (bSpread && grids->nthread > 1)
 +    {
 +#ifdef PME_TIME_THREADS
 +        c3 = omp_cyc_start();
 +#endif
 +#pragma omp parallel for num_threads(grids->nthread) schedule(static)
 +        for(thread=0; thread<grids->nthread; thread++)
 +        {
 +            /* Sum the overlap of the other thread grids into our part */
 +            reduce_threadgrid_overlap(pme,grids,thread,
 +                                      fftgrid,
 +                                      pme->overlap[0].sendbuf,
 +                                      pme->overlap[1].sendbuf);
 +#ifdef PRINT_PME_SENDBUF
 +            print_sendbuf(pme,pme->overlap[0].sendbuf);
 +#endif
 +        }
 +#ifdef PME_TIME_THREADS
 +        c3 = omp_cyc_end(c3);
 +        cs3 += (double)c3;
 +#endif
 +
 +        if (pme->nnodes > 1)
 +        {
 +            /* Communicate the overlapping part of the fftgrid */
 +            sum_fftgrid_dd(pme,fftgrid);
 +        }
 +    }
 +
 +#ifdef PME_TIME_THREADS
 +    cnt++;
 +    if (cnt % 20 == 0)
 +    {
 +        printf("idx %.2f spread %.2f red %.2f",
 +               cs1*1e-9,cs2*1e-9,cs3*1e-9);
 +#ifdef PME_TIME_SPREAD
 +        for(thread=0; thread<nthread; thread++)
 +            printf(" %.2f",cs1a[thread]*1e-9);
 +#endif
 +        printf("\n");
 +    }
 +#endif
 +}
 +
 +
 +/* Write an nx*ny*nz block of grid 'g' to 'fp', one grid point per line:
 + * the three indices, shifted by (sx,sy,sz), followed by the value.
 + * 'my' and 'mz' are the y and z strides used to index 'g'.
 + */
 +static void dump_grid(FILE *fp,
 +                      int sx,int sy,int sz,int nx,int ny,int nz,
 +                      int my,int mz,const real *g)
 +{
 +    int ix,iy,iz;
 +    int row;
 +
 +    for(ix=0; ix<nx; ix++)
 +    {
 +        for(iy=0; iy<ny; iy++)
 +        {
 +            /* Index of the first element of this z-row */
 +            row = (ix*my + iy)*mz;
 +            for(iz=0; iz<nz; iz++)
 +            {
 +                fprintf(fp,"%2d %2d %2d %6.3f\n",
 +                        sx+ix,sy+iy,sz+iz,g[row + iz]);
 +            }
 +        }
 +    }
 +}
 +
 +/* Debug helper: print the local part of 'fftgrid' to stderr,
 + * labeled with the grid indices starting at pmegrid_start_i{x,y,z}.
 + */
 +static void dump_local_fftgrid(gmx_pme_t pme,const real *fftgrid)
 +{
 +    ivec fft_ndata,fft_offset,fft_size;
 +    int  nx,ny,nz;
 +
 +    /* Only the allocated sizes are needed here, as strides for dump_grid */
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
 +                                   fft_ndata,
 +                                   fft_offset,
 +                                   fft_size);
 +
 +    /* Extent of the pme grid without the pme_order-1 extra points */
 +    nx = pme->pmegrid_nx-pme->pme_order+1;
 +    ny = pme->pmegrid_ny-pme->pme_order+1;
 +    nz = pme->pmegrid_nz-pme->pme_order+1;
 +
 +    dump_grid(stderr,
 +              pme->pmegrid_start_ix,
 +              pme->pmegrid_start_iy,
 +              pme->pmegrid_start_iz,
 +              nx,ny,nz,
 +              fft_size[YY],
 +              fft_size[ZZ],
 +              fftgrid);
 +}
 +
 +
 +/* Calculate the PME grid energy *V for n charges q at positions x,
 + * gathered from the A-charges grid that is currently stored in pme.
 + *
 + * Only supported without PME parallelization and without free-energy
 + * perturbation; both cases trigger gmx_incons().
 + * The caller keeps ownership of x and q; pme->atc_energy only stores
 + * the pointers.
 + */
 +void gmx_pme_calc_energy(gmx_pme_t pme,int n,rvec *x,real *q,real *V)
 +{
 +    pme_atomcomm_t *atc;
 +    pmegrids_t *grid;
 +
 +    if (pme->nnodes > 1)
 +    {
 +        gmx_incons("gmx_pme_calc_energy called in parallel");
 +    }
 +    /* bFEP is a boolean, so the former test "bFEP > 1" could never fire */
 +    if (pme->bFEP)
 +    {
 +        gmx_incons("gmx_pme_calc_energy with free energy");
 +    }
 +
 +    /* Set up a single-thread, single-slab atom communication struct */
 +    atc = &pme->atc_energy;
 +    atc->nthread   = 1;
 +    if (atc->spline == NULL)
 +    {
 +        snew(atc->spline,atc->nthread);
 +    }
 +    atc->nslab     = 1;
 +    atc->bSpread   = TRUE;
 +    atc->pme_order = pme->pme_order;
 +    atc->n         = n;
 +    pme_realloc_atomcomm_things(atc);
 +    atc->x         = x;
 +    atc->q         = q;
 +
 +    /* We only use the A-charges grid */
 +    grid = &pme->pmegridA;
 +
 +    /* Only compute the splines (bSpread=FALSE): no grids are passed */
 +    spread_on_grid(pme,atc,NULL,TRUE,FALSE,pme->fftgridA);
 +
 +    *V = gather_energy_bsplines(pme,grid->grid.grid,atc);
 +}
 +
 +
 +/* Restart the performance accounting on a PME-only node:
 + * the cycle counters and flop counters are cleared and the step range
 + * in 'ir' is shifted by 'step_rel', the number of steps done so far.
 + */
 +static void reset_pmeonly_counters(t_commrec *cr,gmx_wallcycle_t wcycle,
 +        t_nrnb *nrnb,t_inputrec *ir, gmx_large_int_t step_rel)
 +{
 +    /* Close the running total before wiping all cycle counters */
 +    wallcycle_stop(wcycle,ewcRUN);
 +    wallcycle_reset_all(wcycle);
 +
 +    init_nrnb(nrnb);
 +
 +    /* Make the remaining run start at the current step */
 +    ir->init_step = ir->init_step + step_rel;
 +    ir->nsteps    = ir->nsteps    - step_rel;
 +
 +    wallcycle_start(wcycle,ewcRUN);
 +}
 +
 +
 +/* Main loop of a PME-only node.
 + *
 + * Repeatedly receives charges and coordinates from the PP nodes,
 + * runs gmx_pme_do() on them and sends back the forces, virial, energy,
 + * dV/dlambda and the cycle count of the mesh calculation.
 + * The loop ends when gmx_pme_recv_q_x() returns -1 for the atom count.
 + * Always returns 0.
 + */
 +int gmx_pmeonly(gmx_pme_t pme,
 +                t_commrec *cr,    t_nrnb *nrnb,
 +                gmx_wallcycle_t wcycle,
 +                real ewaldcoeff,  gmx_bool bGatherOnly,
 +                t_inputrec *ir)
 +{
 +    gmx_pme_pp_t    pme_pp;
 +    rvec            *x_pp=NULL,*f_pp=NULL;
 +    real            *chargeA=NULL,*chargeB=NULL;
 +    matrix          box,vir;
 +    real            lambda=0;
 +    real            energy,dvdlambda;
 +    int             maxshift_x=0,maxshift_y=0;
 +    int             natoms;
 +    int             pme_flags;
 +    int             ncalls;
 +    float           cycles;
 +    gmx_bool        bEnerVir;
 +    gmx_large_int_t step,step_rel;
 +
 +    pme_pp = gmx_pme_pp_init(cr);
 +
 +    init_nrnb(nrnb);
 +
 +    ncalls = 0;
 +    /* Quasi-loop over time steps, left through the break below */
 +    for(;;)
 +    {
 +        /* Domain decomposition */
 +        natoms = gmx_pme_recv_q_x(pme_pp,
 +                                  &chargeA,&chargeB,box,&x_pp,&f_pp,
 +                                  &maxshift_x,&maxshift_y,
 +                                  &pme->bFEP,&lambda,
 +                                  &bEnerVir,
 +                                  &step);
 +
 +        if (natoms == -1)
 +        {
 +            /* We should stop */
 +            break;
 +        }
 +
 +        step_rel = step - ir->init_step;
 +
 +        /* The run timer is started at the first step we actually do */
 +        if (ncalls == 0)
 +        {
 +            wallcycle_start(wcycle,ewcRUN);
 +        }
 +
 +        wallcycle_start(wcycle,ewcPMEMESH);
 +
 +        /* Energy and virial are only computed when requested */
 +        pme_flags = GMX_PME_DO_ALL_F;
 +        if (bEnerVir)
 +        {
 +            pme_flags = pme_flags | GMX_PME_CALC_ENER_VIR;
 +        }
 +
 +        dvdlambda = 0;
 +        clear_mat(vir);
 +        gmx_pme_do(pme,0,natoms,x_pp,f_pp,chargeA,chargeB,box,
 +                   cr,maxshift_x,maxshift_y,nrnb,wcycle,vir,ewaldcoeff,
 +                   &energy,lambda,&dvdlambda,
 +                   pme_flags);
 +
 +        cycles = wallcycle_stop(wcycle,ewcPMEMESH);
 +
 +        gmx_pme_send_force_vir_ener(pme_pp,
 +                                    f_pp,vir,energy,dvdlambda,
 +                                    cycles);
 +
 +        ncalls++;
 +
 +        if (step_rel == wcycle_get_reset_counters(wcycle))
 +        {
 +            /* Reset all the counters related to performance over the run */
 +            reset_pmeonly_counters(cr,wcycle,nrnb,ir,step_rel);
 +            wcycle_set_reset_counters(wcycle, 0);
 +        }
 +    }
 +
 +    return 0;
 +}
 +/* Run the PME mesh calculation for the local atoms.
 + *
 + * The work is done once for the A charges and, when pme->bFEP is set, a
 + * second time for the B charges.  Which stages run is selected by flags:
 + *   GMX_PME_SPREAD_Q      spread the charges on the grid
 + *   GMX_PME_SOLVE         forward 3D-FFT and solve in k-space
 + *   GMX_PME_CALC_F        inverse 3D-FFT and gather forces on the atoms
 + *   GMX_PME_CALC_ENER_VIR collect energy and virial after the solve
 + * With more than one PME node (pme->nnodes > 1) coordinates and charges
 + * are redistributed over the nodes first and the forces are sent back at
 + * the end.
 + *
 + * start, homenr  offset and number of the local atoms; start is applied
 + *                to chargeA/chargeB and, in the multi-node path, to x, f
 + * x, f           coordinates (input) and forces (output)
 + * box            simulation box; its inverse is stored in pme->recipbox
 + * maxshift_x/y   copied to atc->maxshift depending on atc->dimind
 + * vir            the PME virial is added to this matrix
 + * energy         set to the mesh energy, or to 0 when
 + *                GMX_PME_CALC_ENER_VIR is not requested
 + * lambda         A/B mixing parameter, only used with pme->bFEP
 + * dvdlambda      incremented by energy_B - energy_A with pme->bFEP
 + *
 + * Always returns 0.
 + */
 +int gmx_pme_do(gmx_pme_t pme,
 +               int start,       int homenr,
 +               rvec x[],        rvec f[],
 +               real *chargeA,   real *chargeB,
 +               matrix box, t_commrec *cr,
 +               int  maxshift_x, int maxshift_y,
 +               t_nrnb *nrnb,    gmx_wallcycle_t wcycle,
 +               matrix vir,      real ewaldcoeff,
 +               real *energy,    real lambda,
 +               real *dvdlambda, int flags)
 +{
 +    int     q,d,i,j,ntot,npme;
 +    int     nx,ny,nz;
 +    int     n_d,local_ny;
 +    pme_atomcomm_t *atc=NULL;
 +    pmegrids_t *pmegrid=NULL;
 +    real    *grid=NULL;
 +    real    *ptr;
 +    rvec    *x_d,*f_d;
 +    real    *charge=NULL,*q_d;
 +    real    energy_AB[2]; /* indexed by q: [0] A charges, [1] B charges */
 +    matrix  vir_AB[2];    /* indexed by q like energy_AB */
 +    gmx_bool bClearF;
 +    gmx_parallel_3dfft_t pfft_setup;
 +    real *  fftgrid;
 +    t_complex * cfftgrid;
 +    int     thread;
 +    const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
 +    const gmx_bool bCalcF = flags & GMX_PME_CALC_F;
 +
 +    assert(pme->nnodes > 0);
 +    assert(pme->nnodes == 1 || pme->ndecompdim > 0);
 +
 +    if (pme->nnodes > 1) {
 +        atc = &pme->atc[0];
 +        atc->npd = homenr;
 +        if (atc->npd > atc->pd_nalloc) {
 +            atc->pd_nalloc = over_alloc_dd(atc->npd);
 +            srenew(atc->pd,atc->pd_nalloc);
 +        }
 +        atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
 +    }
 +    else
 +    {
 +        /* This could be necessary for TPI */
 +        pme->atc[0].n = homenr;
 +    }
 +
 +    for(q=0; q<(pme->bFEP ? 2 : 1); q++) { /* q=0: A charges, q=1: B charges */
 +        if (q == 0) {
 +            pmegrid = &pme->pmegridA;
 +            fftgrid = pme->fftgridA;
 +            cfftgrid = pme->cfftgridA;
 +            pfft_setup = pme->pfft_setupA;
 +            charge = chargeA+start;
 +        } else {
 +            pmegrid = &pme->pmegridB;
 +            fftgrid = pme->fftgridB;
 +            cfftgrid = pme->cfftgridB;
 +            pfft_setup = pme->pfft_setupB;
 +            charge = chargeB+start;
 +        }
 +        grid = pmegrid->grid.grid;
 +        /* Unpack structure */
 +        if (debug) {
 +            fprintf(debug,"PME: nnodes = %d, nodeid = %d\n",
 +                    cr->nnodes,cr->nodeid);
 +            fprintf(debug,"Grid = %p\n",(void*)grid);
 +            if (grid == NULL) /* NOTE: only checked when debug output is on */
 +                gmx_fatal(FARGS,"No grid!");
 +        }
 +        where();
 +
 +        m_inv_ur0(box,pme->recipbox);
 +
 +        if (pme->nnodes == 1) {
 +            atc = &pme->atc[0];
 +            if (DOMAINDECOMP(cr)) {
 +                atc->n = homenr;
 +                pme_realloc_atomcomm_things(atc);
 +            }
 +            atc->x = x; /* NOTE(review): x and f are used without the start offset here, unlike charge - confirm start is 0 on this path */
 +            atc->q = charge;
 +            atc->f = f;
 +        } else {
 +            wallcycle_start(wcycle,ewcPME_REDISTXF);
 +            for(d=pme->ndecompdim-1; d>=0; d--)
 +            {
 +                if (d == pme->ndecompdim-1)
 +                {
 +                    n_d = homenr;
 +                    x_d = x + start;
 +                    q_d = charge;
 +                }
 +                else
 +                {
 +                    n_d = pme->atc[d+1].n;
 +                    x_d = atc->x;
 +                    q_d = atc->q;
 +                }
 +                atc = &pme->atc[d];
 +                atc->npd = n_d;
 +                if (atc->npd > atc->pd_nalloc) {
 +                    atc->pd_nalloc = over_alloc_dd(atc->npd);
 +                    srenew(atc->pd,atc->pd_nalloc);
 +                }
 +                atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
 +                pme_calc_pidx_wrapper(n_d,pme->recipbox,x_d,atc);
 +                where();
 +
 +                /* Redistribute x (only once) and qA or qB */
 +                if (DOMAINDECOMP(cr)) {
 +                    dd_pmeredist_x_q(pme, n_d, q==0, x_d, q_d, atc);
 +                } else {
 +                    pmeredist_pd(pme, TRUE, n_d, q==0, x_d, q_d, atc);
 +                }
 +            }
 +            where();
 +
 +            wallcycle_stop(wcycle,ewcPME_REDISTXF);
 +        }
 +
 +        if (debug)
 +            fprintf(debug,"Node= %6d, pme local particles=%6d\n",
 +                    cr->nodeid,atc->n);
 +
 +        if (flags & GMX_PME_SPREAD_Q)
 +        {
 +            wallcycle_start(wcycle,ewcPME_SPREADGATHER);
 +
 +            /* Spread the charges on a grid */
 +            spread_on_grid(pme,&pme->atc[0],pmegrid,q==0,TRUE,fftgrid);
 +
 +            if (q == 0)
 +            {
 +                inc_nrnb(nrnb,eNR_WEIGHTS,DIM*atc->n);
 +            }
 +            inc_nrnb(nrnb,eNR_SPREADQBSP,
 +                     pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
 +
 +            if (pme->nthread == 1)
 +            {
 +                wrap_periodic_pmegrid(pme,grid);
 +
 +                /* sum contributions to local grid from other nodes */
 +#ifdef GMX_MPI
 +                if (pme->nnodes > 1)
 +                {
 +                    gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_FORWARD);
 +                    where();
 +                }
 +#endif
 +
 +                copy_pmegrid_to_fftgrid(pme,grid,fftgrid);
 +            }
 +
 +            wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
 +
 +            /*
 +            dump_local_fftgrid(pme,fftgrid);
 +            exit(0);
 +            */
 +        }
 +
 +        /* Here we start a large thread parallel region.
 +         * Timing and flop accounting are done by thread 0 only.
 +         */
 +#pragma omp parallel for num_threads(pme->nthread) schedule(static)
 +        for(thread=0; thread<pme->nthread; thread++)
 +        {
 +            if (flags & GMX_PME_SOLVE)
 +            {
 +                int loop_count;
 +
 +                /* do 3d-fft */
 +                if (thread == 0)
 +                {
 +                    wallcycle_start(wcycle,ewcPME_FFT);
 +                }
 +                gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_REAL_TO_COMPLEX,
 +                                           fftgrid,cfftgrid,thread,wcycle);
 +                if (thread == 0)
 +                {
 +                    wallcycle_stop(wcycle,ewcPME_FFT);
 +                }
 +                where();
 +
 +                /* solve in k-space for our local cells */
 +                if (thread == 0)
 +                {
 +                    wallcycle_start(wcycle,ewcPME_SOLVE);
 +                }
 +                loop_count =
 +                    solve_pme_yzx(pme,cfftgrid,ewaldcoeff,
 +                                  box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
 +                                  bCalcEnerVir,
 +                                  pme->nthread,thread);
 +                if (thread == 0)
 +                {
 +                    wallcycle_stop(wcycle,ewcPME_SOLVE);
 +                    where();
 +                    inc_nrnb(nrnb,eNR_SOLVEPME,loop_count);
 +                }
 +            }
 +
 +            if (bCalcF)
 +            {
 +                /* do 3d-invfft */
 +                if (thread == 0)
 +                {
 +                    where();
 +                    wallcycle_start(wcycle,ewcPME_FFT);
 +                }
 +                gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_COMPLEX_TO_REAL,
 +                                           cfftgrid,fftgrid,thread,wcycle);
 +                if (thread == 0)
 +                {
 +                    wallcycle_stop(wcycle,ewcPME_FFT);
 +
 +                    where();
 +
 +                    if (pme->nodeid == 0)
 +                    {
 +                        ntot = pme->nkx*pme->nky*pme->nkz;
 +                        npme  = ntot*log((real)ntot)/log(2.0);
 +                        inc_nrnb(nrnb,eNR_FFT,2*npme);
 +                    }
 +
 +                    wallcycle_start(wcycle,ewcPME_SPREADGATHER); /* stopped after the force gather below */
 +                }
 +
 +                copy_fftgrid_to_pmegrid(pme,fftgrid,grid,pme->nthread,thread);
 +            }
 +        }
 +        /* End of thread parallel section.
 +         * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
 +         */
 +
 +        if (bCalcF)
 +        {
 +            /* distribute local grid to all nodes */
 +#ifdef GMX_MPI
 +            if (pme->nnodes > 1) {
 +                gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_BACKWARD);
 +            }
 +#endif
 +            where();
 +
 +            unwrap_periodic_pmegrid(pme,grid);
 +
 +            /* interpolate forces for our local atoms */
 +
 +            where();
 +
 +            /* If we are running without parallelization,
 +             * atc->f is the actual force array, not a buffer,
 +             * therefore we should not clear it.
 +             */
 +            bClearF = (q == 0 && PAR(cr));
 +#pragma omp parallel for num_threads(pme->nthread) schedule(static)
 +            for(thread=0; thread<pme->nthread; thread++)
 +            {
 +                gather_f_bsplines(pme,grid,bClearF,atc,
 +                                  &atc->spline[thread],
 +                                  pme->bFEP ? (q==0 ? 1.0-lambda : lambda) : 1.0);
 +            }
 +
 +            where();
 +
 +            inc_nrnb(nrnb,eNR_GATHERFBSP,
 +                     pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
 +            wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
 +        }
 +
 +        if (bCalcEnerVir)
 +        {
 +            /* This should only be called on the master thread
 +             * and after the threads have synchronized.
 +             */
 +            get_pme_ener_vir(pme,pme->nthread,&energy_AB[q],vir_AB[q]);
 +        }
 +    } /* of q-loop */
 +
 +    if (bCalcF && pme->nnodes > 1) {
 +        wallcycle_start(wcycle,ewcPME_REDISTXF);
 +        for(d=0; d<pme->ndecompdim; d++)
 +        {
 +            atc = &pme->atc[d];
 +            if (d == pme->ndecompdim - 1)
 +            {
 +                n_d = homenr;
 +                f_d = f + start;
 +            }
 +            else
 +            {
 +                n_d = pme->atc[d+1].n;
 +                f_d = pme->atc[d+1].f;
 +            }
 +            if (DOMAINDECOMP(cr)) {
 +                dd_pmeredist_f(pme,atc,n_d,f_d,
 +                               d==pme->ndecompdim-1 && pme->bPPnode);
 +            } else {
 +                pmeredist_pd(pme, FALSE, n_d, TRUE, f_d, NULL, atc);
 +            }
 +        }
 +
 +        wallcycle_stop(wcycle,ewcPME_REDISTXF);
 +    }
 +    where();
 +
 +    if (bCalcEnerVir)
 +    {
 +        if (!pme->bFEP) {
 +            *energy = energy_AB[0];
 +            m_add(vir,vir_AB[0],vir);
 +        } else {
 +            /* Linear interpolation between the A and B results */
 +            *energy = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1];
 +            *dvdlambda += energy_AB[1] - energy_AB[0];
 +            for(i=0; i<DIM; i++)
 +            {
 +                for(j=0; j<DIM; j++)
 +                {
 +                    vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] + 
 +                        lambda*vir_AB[1][i][j];
 +                }
 +            }
 +        }
 +    }
 +    else
 +    {
 +        *energy = 0;
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"PME mesh energy: %g\n",*energy);
 +    }
 +
 +    return 0;
 +}
diff --cc src/programs/mdrun/md.c
index 4a346186f7,0000000000..3786eee68c
mode 100644,000000..100644
--- a/src/programs/mdrun/md.c
+++ b/src/programs/mdrun/md.c
@@@ -1,1989 -1,0 +1,2000 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "sysstuff.h"
 +#include "vec.h"
 +#include "statutil.h"
 +#include "vcm.h"
 +#include "mdebin.h"
 +#include "nrnb.h"
 +#include "calcmu.h"
 +#include "index.h"
 +#include "vsite.h"
 +#include "update.h"
 +#include "ns.h"
 +#include "trnio.h"
 +#include "xtcio.h"
 +#include "mdrun.h"
 +#include "confio.h"
 +#include "network.h"
 +#include "pull.h"
 +#include "xvgr.h"
 +#include "physics.h"
 +#include "names.h"
 +#include "xmdrun.h"
 +#include "ionize.h"
 +#include "disre.h"
 +#include "orires.h"
 +#include "pme.h"
 +#include "mdatoms.h"
 +#include "repl_ex.h"
 +#include "qmmm.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "topsort.h"
 +#include "coulomb.h"
 +#include "constr.h"
 +#include "shellfc.h"
 +#include "compute_io.h"
 +#include "mvdata.h"
 +#include "checkpoint.h"
 +#include "mtop_util.h"
 +#include "sighandler.h"
 +#include "txtdump.h"
 +#include "string2.h"
 +#include "membed.h"
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#endif
 +#ifdef GMX_THREAD_MPI
 +#include "tmpi.h"
 +#endif
 +
 +#ifdef GMX_FAHCORE
 +#include "corewrap.h"
 +#endif
 +
 +double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[],
 +             const output_env_t oenv, gmx_bool bVerbose,gmx_bool bCompact,
 +             int nstglobalcomm,
 +             gmx_vsite_t *vsite,gmx_constr_t constr,
 +             int stepout,t_inputrec *ir,
 +             gmx_mtop_t *top_global,
 +             t_fcdata *fcd,
 +             t_state *state_global,
 +             t_mdatoms *mdatoms,
 +             t_nrnb *nrnb,gmx_wallcycle_t wcycle,
 +             gmx_edsam_t ed,t_forcerec *fr,
 +             int repl_ex_nst,int repl_ex_nex,int repl_ex_seed,gmx_membed_t membed,
 +             real cpt_period,real max_hours,
 +             const char *deviceOptions,
 +             unsigned long Flags,
 +             gmx_runtime_t *runtime)
 +{
 +    gmx_mdoutf_t *outf;
 +    gmx_large_int_t step,step_rel;
 +    double     run_time;
 +    double     t,t0,lam0[efptNR];
 +    gmx_bool   bGStatEveryStep,bGStat,bNstEner,bCalcEnerPres,bEnergyHere;
 +    gmx_bool   bNS,bNStList,bSimAnn,bStopCM,bRerunMD,bNotLastFrame=FALSE,
 +               bFirstStep,bStateFromCP,bStateFromTPX,bInitStep,bLastStep,
 +               bBornRadii,bStartingFromCpt;
 +    gmx_bool   bDoDHDL=FALSE,bDoFEP=FALSE,bDoExpanded=FALSE;
 +    gmx_bool   do_ene,do_log,do_verbose,bRerunWarnNoV=TRUE,
 +               bForceUpdate=FALSE,bCPT;
 +    int        mdof_flags;
 +    gmx_bool   bMasterState;
 +    int        force_flags,cglo_flags;
 +    tensor     force_vir,shake_vir,total_vir,tmp_vir,pres;
 +    int        i,m;
 +    t_trxstatus *status;
 +    rvec       mu_tot;
 +    t_vcm      *vcm;
 +    t_state    *bufstate=NULL;   
 +    matrix     *scale_tot,pcoupl_mu,M,ebox;
 +    gmx_nlheur_t nlh;
 +    t_trxframe rerun_fr;
 +    gmx_repl_ex_t repl_ex=NULL;
 +    int        nchkpt=1;
 +    gmx_localtop_t *top;	
 +    t_mdebin *mdebin=NULL;
 +    df_history_t df_history;
 +    t_state    *state=NULL;
 +    rvec       *f_global=NULL;
 +    int        n_xtc=-1;
 +    rvec       *x_xtc=NULL;
 +    gmx_enerdata_t *enerd;
 +    rvec       *f=NULL;
 +    gmx_global_stat_t gstat;
 +    gmx_update_t upd=NULL;
 +    t_graph    *graph=NULL;
 +    globsig_t   gs;
 +    gmx_rng_t mcrng=NULL;
 +    gmx_bool        bFFscan;
 +    gmx_groups_t *groups;
 +    gmx_ekindata_t *ekind, *ekind_save;
 +    gmx_shellfc_t shellfc;
 +    int         count,nconverged=0;
 +    real        timestep=0;
 +    double      tcount=0;
 +    gmx_bool        bIonize=FALSE;
 +    gmx_bool        bTCR=FALSE,bConverged=TRUE,bOK,bSumEkinhOld,bExchanged;
 +    gmx_bool        bAppend;
 +    gmx_bool        bResetCountersHalfMaxH=FALSE;
 +    gmx_bool        bVV,bIterations,bFirstIterate,bTemp,bPres,bTrotter;
 +    real        mu_aver=0,dvdl;
 +    int         a0,a1,gnx=0,ii;
 +    atom_id     *grpindex=NULL;
 +    char        *grpname;
 +    t_coupl_rec *tcr=NULL;
 +    rvec        *xcopy=NULL,*vcopy=NULL,*cbuf=NULL;
 +    matrix      boxcopy={{0}},lastbox;
 +	tensor      tmpvir;
 +	real        fom,oldfom,veta_save,pcurr,scalevir,tracevir;
 +	real        vetanew = 0;
 +    int         lamnew=0;
 +    /* for FEP */
 +    int         fep_state=0;
 +    int         nstfep;
 +    real        rate;
 +    double      cycles;
 +	real        saved_conserved_quantity = 0;
 +    real        last_ekin = 0;
 +	int         iter_i;
 +	t_extmass   MassQ;
 +    int         **trotter_seq; 
 +    char        sbuf[STEPSTRSIZE],sbuf2[STEPSTRSIZE];
 +    int         handled_stop_condition=gmx_stop_cond_none; /* compare to get_stop_condition*/
 +    gmx_iterate_t iterate;
 +    gmx_large_int_t multisim_nsteps=-1; /* number of steps to do  before first multisim 
 +                                          simulation stops. If equal to zero, don't
 +                                          communicate any more between multisims.*/
++
++    if(MASTER(cr))
++    {
++        fprintf(stderr,
++                "\n* WARNING * WARNING * WARNING * WARNING * WARNING * WARNING *\n"
++                "We have just committed the new CPU detection code in this branch,\n"
++                "and will commit new SSE/AVX kernels in a few days. However, this\n"
++                "means that currently only the NxN kernels are accelerated!\n"
++                "In the mean time, you might want to avoid production runs in 4.6.\n\n");
++    }
++
 +#ifdef GMX_FAHCORE
 +    /* Temporary addition for FAHCORE checkpointing */
 +    int chkpt_ret;
 +#endif
 +
 +    /* Check for special mdrun options */
 +    bRerunMD = (Flags & MD_RERUN);
 +    bIonize  = (Flags & MD_IONIZE);
 +    bFFscan  = (Flags & MD_FFSCAN);
 +    bAppend  = (Flags & MD_APPENDFILES);
 +    if (Flags & MD_RESETCOUNTERSHALFWAY)
 +    {
 +        if (ir->nsteps > 0)
 +        {
 +            /* Signal to reset the counters half the simulation steps. */
 +            wcycle_set_reset_counters(wcycle,ir->nsteps/2);
 +        }
 +        /* Signal to reset the counters halfway the simulation time. */
 +        bResetCountersHalfMaxH = (max_hours > 0);
 +    }
 +
 +    /* md-vv uses averaged full step velocities for T-control 
 +       md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
 +       md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
 +    bVV = EI_VV(ir->eI);
 +    if (bVV) /* to store the initial velocities while computing virial */
 +    {
 +        snew(cbuf,top_global->natoms);
 +    }
 +    /* all the iteratative cases - only if there are constraints */ 
 +    bIterations = ((IR_NPH_TROTTER(ir) || IR_NPT_TROTTER(ir)) && (constr) && (!bRerunMD));
 +    bTrotter = (bVV && (IR_NPT_TROTTER(ir) || IR_NPH_TROTTER(ir) || IR_NVT_TROTTER(ir)));
 +    
 +    if (bRerunMD)
 +    {
 +        /* Since we don't know if the frames read are related in any way,
 +         * rebuild the neighborlist at every step.
 +         */
 +        ir->nstlist       = 1;
 +        ir->nstcalcenergy = 1;
 +        nstglobalcomm     = 1;
 +    }
 +
 +    check_ir_old_tpx_versions(cr,fplog,ir,top_global);
 +
 +    nstglobalcomm = check_nstglobalcomm(fplog,cr,nstglobalcomm,ir);
 +    bGStatEveryStep = (nstglobalcomm == 1);
 +
 +    if (!bGStatEveryStep && ir->nstlist == -1 && fplog != NULL)
 +    {
 +        fprintf(fplog,
 +                "To reduce the energy communication with nstlist = -1\n"
 +                "the neighbor list validity should not be checked at every step,\n"
 +                "this means that exact integration is not guaranteed.\n"
 +                "The neighbor list validity is checked after:\n"
 +                "  <n.list life time> - 2*std.dev.(n.list life time)  steps.\n"
 +                "In most cases this will result in exact integration.\n"
 +                "This reduces the energy communication by a factor of 2 to 3.\n"
 +                "If you want less energy communication, set nstlist > 3.\n\n");
 +    }
 +
 +    if (bRerunMD || bFFscan)
 +    {
 +        ir->nstxtcout = 0;
 +    }
 +    groups = &top_global->groups;
 +
 +    /* Initial values */
 +    init_md(fplog,cr,ir,oenv,&t,&t0,state_global->lambda,
 +            &(state_global->fep_state),lam0,
 +            nrnb,top_global,&upd,
 +            nfile,fnm,&outf,&mdebin,
 +            force_vir,shake_vir,mu_tot,&bSimAnn,&vcm,state_global,Flags);
 +
 +    clear_mat(total_vir);
 +    clear_mat(pres);
 +    /* Energy terms and groups */
 +    snew(enerd,1);
 +    init_enerdata(top_global->groups.grps[egcENER].nr,ir->fepvals->n_lambda,
 +                  enerd);
 +    if (DOMAINDECOMP(cr))
 +    {
 +        f = NULL;
 +    }
 +    else
 +    {
 +        snew(f,top_global->natoms);
 +    }
 +
 +    /* lambda Monte carlo random number generator  */
 +    if (ir->bExpanded)
 +    {
 +        mcrng = gmx_rng_init(ir->expandedvals->lmc_seed);
 +    }
 +    /* copy the state into df_history */
 +    copy_df_history(&df_history,&state_global->dfhist);
 +
 +    /* Kinetic energy data */
 +    snew(ekind,1);
 +    init_ekindata(fplog,top_global,&(ir->opts),ekind);
 +    /* needed for iteration of constraints */
 +    snew(ekind_save,1);
 +    init_ekindata(fplog,top_global,&(ir->opts),ekind_save);
 +    /* Copy the cos acceleration to the groups struct */    
 +    ekind->cosacc.cos_accel = ir->cos_accel;
 +
 +    gstat = global_stat_init(ir);
 +    debug_gmx();
 +
 +    /* Check for polarizable models and flexible constraints */
 +    shellfc = init_shell_flexcon(fplog,
 +                                 top_global,n_flexible_constraints(constr),
 +                                 (ir->bContinuation || 
 +                                  (DOMAINDECOMP(cr) && !MASTER(cr))) ?
 +                                 NULL : state_global->x);
 +
 +    if (DEFORM(*ir))
 +    {
 +#ifdef GMX_THREAD_MPI
 +        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
 +#endif
 +        set_deform_reference_box(upd,
 +                                 deform_init_init_step_tpx,
 +                                 deform_init_box_tpx);
 +#ifdef GMX_THREAD_MPI
 +        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
 +#endif
 +    }
 +
 +    {
 +        double io = compute_io(ir,top_global->natoms,groups,mdebin->ebin->nener,1);
 +        if ((io > 2000) && MASTER(cr))
 +            fprintf(stderr,
 +                    "\nWARNING: This run will generate roughly %.0f Mb of data\n\n",
 +                    io);
 +    }
 +
 +    if (DOMAINDECOMP(cr)) {
 +        top = dd_init_local_top(top_global);
 +
 +        snew(state,1);
 +        dd_init_local_state(cr->dd,state_global,state);
 +
 +        if (DDMASTER(cr->dd) && ir->nstfout) {
 +            snew(f_global,state_global->natoms);
 +        }
 +    } else {
 +        if (PAR(cr)) {
 +            /* Initialize the particle decomposition and split the topology */
 +            top = split_system(fplog,top_global,ir,cr);
 +
 +            pd_cg_range(cr,&fr->cg0,&fr->hcg);
 +            pd_at_range(cr,&a0,&a1);
 +        } else {
 +            top = gmx_mtop_generate_local_top(top_global,ir);
 +
 +            a0 = 0;
 +            a1 = top_global->natoms;
 +        }
 +
 +        state = partdec_init_local_state(cr,state_global);
 +        f_global = f;
 +
 +        atoms2md(top_global,ir,0,NULL,a0,a1-a0,mdatoms);
 +
 +        if (vsite) {
 +            set_vsite_top(vsite,top,mdatoms,cr);
 +        }
 +
 +        if (ir->ePBC != epbcNONE && !ir->bPeriodicMols) {
 +            graph = mk_graph(fplog,&(top->idef),0,top_global->natoms,FALSE,FALSE);
 +        }
 +
 +        if (shellfc) {
 +            make_local_shells(cr,mdatoms,shellfc);
 +        }
 +
 +        if (ir->pull && PAR(cr)) {
 +            dd_make_local_pull_groups(NULL,ir->pull,mdatoms);
 +        }
 +    }
 +
 +    if (DOMAINDECOMP(cr))
 +    {
 +        /* Distribute the charge groups over the nodes from the master node */
 +        dd_partition_system(fplog,ir->init_step,cr,TRUE,1,
 +                            state_global,top_global,ir,
 +                            state,&f,mdatoms,top,fr,
 +                            vsite,shellfc,constr,
 +                            nrnb,wcycle,FALSE);
 +    }
 +
 +    update_mdatoms(mdatoms,state->lambda[efptMASS]);
 +
 +    if (opt2bSet("-cpi",nfile,fnm))
 +    {
 +        bStateFromCP = gmx_fexist_master(opt2fn_master("-cpi",nfile,fnm,cr),cr);
 +    }
 +    else
 +    {
 +        bStateFromCP = FALSE;
 +    }
 +
 +    if (MASTER(cr))
 +    {
 +        if (bStateFromCP)
 +        {
 +            /* Update mdebin with energy history if appending to output files */
 +            if ( Flags & MD_APPENDFILES )
 +            {
 +                restore_energyhistory_from_state(mdebin,&state_global->enerhist);
 +            }
 +            else
 +            {
 +                /* We might have read an energy history from checkpoint,
 +                 * free the allocated memory and reset the counts.
 +                 */
 +                done_energyhistory(&state_global->enerhist);
 +                init_energyhistory(&state_global->enerhist);
 +            }
 +        }
 +        /* Set the initial energy history in state by updating once */
 +        update_energyhistory(&state_global->enerhist,mdebin);
 +    }	
 +
 +    if ((state->flags & (1<<estLD_RNG)) && (Flags & MD_READ_RNG)) 
 +    {
 +        /* Set the random state if we read a checkpoint file */
 +        set_stochd_state(upd,state);
 +    }
 +
 +    if (state->flags & (1<<estMC_RNG))
 +    {
 +        set_mc_state(mcrng,state);
 +    }
 +
 +    /* Initialize constraints */
 +    if (constr) {
 +        if (!DOMAINDECOMP(cr))
 +            set_constraints(constr,top,ir,mdatoms,cr);
 +    }
 +
 +    /* Check whether we have to GCT stuff */
 +    bTCR = ftp2bSet(efGCT,nfile,fnm);
 +    if (bTCR) {
 +        if (MASTER(cr)) {
 +            fprintf(stderr,"Will do General Coupling Theory!\n");
 +        }
 +        gnx = top_global->mols.nr;
 +        snew(grpindex,gnx);
 +        for(i=0; (i<gnx); i++) {
 +            grpindex[i] = i;
 +        }
 +    }
 +
 +    if (repl_ex_nst > 0)
 +    {
 +        /* We need to be sure replica exchange can only occur
 +         * when the energies are current */
 +        check_nst_param(fplog,cr,"nstcalcenergy",ir->nstcalcenergy,
 +                        "repl_ex_nst",&repl_ex_nst);
 +        /* This check needs to happen before inter-simulation
 +         * signals are initialized, too */
 +    }
 +    if (repl_ex_nst > 0 && MASTER(cr))
 +    {
 +        repl_ex = init_replica_exchange(fplog,cr->ms,state_global,ir,
 +                                        repl_ex_nst,repl_ex_nex,repl_ex_seed); 
 +    }
 +    if (!ir->bContinuation && !bRerunMD)
 +    {
 +        if (mdatoms->cFREEZE && (state->flags & (1<<estV)))
 +        {
 +            /* Set the velocities of frozen particles to zero */
 +            for(i=mdatoms->start; i<mdatoms->start+mdatoms->homenr; i++)
 +            {
 +                for(m=0; m<DIM; m++)
 +                {
 +                    if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
 +                    {
 +                        state->v[i][m] = 0;
 +                    }
 +                }
 +            }
 +        }
 +
 +        if (constr)
 +        {
 +            /* Constrain the initial coordinates and velocities */
 +            do_constrain_first(fplog,constr,ir,mdatoms,state,f,
 +                               graph,cr,nrnb,fr,top,shake_vir);
 +        }
 +        if (vsite)
 +        {
 +            /* Construct the virtual sites for the initial configuration */
 +            construct_vsites(fplog,vsite,state->x,nrnb,ir->delta_t,NULL,
 +                             top->idef.iparams,top->idef.il,
 +                             fr->ePBC,fr->bMolPBC,graph,cr,state->box);
 +        }
 +    }
 +
 +    debug_gmx();
 +
 +    /* set free energy calculation frequency as the minimum of nstdhdl, nstexpanded, and nstrepl_ex_nst*/
 +    nstfep = ir->fepvals->nstdhdl;
 +    if (ir->bExpanded && (nstfep > ir->expandedvals->nstexpanded))
 +    {
 +        nstfep = ir->expandedvals->nstexpanded;
 +    }
 +    if (repl_ex_nst > 0 && repl_ex_nst > nstfep)
 +    {
 +        nstfep = repl_ex_nst;
 +    }
 +
 +    /* I'm assuming we need global communication the first time! MRS */
 +    cglo_flags = (CGLO_TEMPERATURE | CGLO_GSTAT
 +                  | ((ir->comm_mode != ecmNO) ? CGLO_STOPCM:0)
 +                  | (bVV ? CGLO_PRESSURE:0)
 +                  | (bVV ? CGLO_CONSTRAINT:0)
 +                  | (bRerunMD ? CGLO_RERUNMD:0)
 +                  | ((Flags & MD_READ_EKIN) ? CGLO_READEKIN:0));
 +    
 +    bSumEkinhOld = FALSE;
 +    compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
 +                    NULL,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
 +                    constr,NULL,FALSE,state->box,
 +                    top_global,&pcurr,top_global->natoms,&bSumEkinhOld,cglo_flags);
 +    if (ir->eI == eiVVAK) {
 +        /* a second call to get the half step temperature initialized as well */ 
 +        /* we do the same call as above, but turn the pressure off -- internally to 
 +           compute_globals, this is recognized as a velocity verlet half-step 
 +           kinetic energy calculation.  This minimizes excess variables, but 
 +           perhaps loses some logic?*/
 +        
 +        compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
 +                        NULL,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
 +                        constr,NULL,FALSE,state->box,
 +                        top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
 +                        cglo_flags &~ (CGLO_STOPCM | CGLO_PRESSURE));
 +    }
 +    
 +    /* Calculate the initial half step temperature, and save the ekinh_old */
 +    if (!(Flags & MD_STARTFROMCPT)) 
 +    {
 +        for(i=0; (i<ir->opts.ngtc); i++) 
 +        {
 +            copy_mat(ekind->tcstat[i].ekinh,ekind->tcstat[i].ekinh_old);
 +        } 
 +    }
 +    if (ir->eI != eiVV) 
 +    {
 +        enerd->term[F_TEMP] *= 2; /* result of averages being done over previous and current step,
 +                                     and there is no previous step */
 +    }
 +    
 +    /* if using an iterative algorithm, we need to create a working directory for the state. */
 +    if (bIterations) 
 +    {
 +            bufstate = init_bufstate(state);
 +    }
 +    if (bFFscan) 
 +    {
 +        snew(xcopy,state->natoms);
 +        snew(vcopy,state->natoms);
 +        copy_rvecn(state->x,xcopy,0,state->natoms);
 +        copy_rvecn(state->v,vcopy,0,state->natoms);
 +        copy_mat(state->box,boxcopy);
 +    } 
 +    
 +    /* need to make an initiation call to get the Trotter variables set, as well as other constants for non-trotter
 +       temperature control */
 +    trotter_seq = init_npt_vars(ir,state,&MassQ,bTrotter);
 +    
 +    if (MASTER(cr))
 +    {
 +        if (constr && !ir->bContinuation && ir->eConstrAlg == econtLINCS)
 +        {
 +            fprintf(fplog,
 +                    "RMS relative constraint deviation after constraining: %.2e\n",
 +                    constr_rmsd(constr,FALSE));
 +        }
 +        if (EI_STATE_VELOCITY(ir->eI))
 +        {
 +            fprintf(fplog,"Initial temperature: %g K\n",enerd->term[F_TEMP]);
 +        }
 +        if (bRerunMD)
 +        {
 +            fprintf(stderr,"starting md rerun '%s', reading coordinates from"
 +                    " input trajectory '%s'\n\n",
 +                    *(top_global->name),opt2fn("-rerun",nfile,fnm));
 +            if (bVerbose)
 +            {
 +                fprintf(stderr,"Calculated time to finish depends on nsteps from "
 +                        "run input file,\nwhich may not correspond to the time "
 +                        "needed to process input trajectory.\n\n");
 +            }
 +        }
 +        else
 +        {
 +            char tbuf[20];
 +            fprintf(stderr,"starting mdrun '%s'\n",
 +                    *(top_global->name));
 +            if (ir->nsteps >= 0)
 +            {
 +                sprintf(tbuf,"%8.1f",(ir->init_step+ir->nsteps)*ir->delta_t);
 +            }
 +            else
 +            {
 +                sprintf(tbuf,"%s","infinite");
 +            }
 +            if (ir->init_step > 0)
 +            {
 +                fprintf(stderr,"%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
 +                        gmx_step_str(ir->init_step+ir->nsteps,sbuf),tbuf,
 +                        gmx_step_str(ir->init_step,sbuf2),
 +                        ir->init_step*ir->delta_t);
 +            }
 +            else
 +            {
 +                fprintf(stderr,"%s steps, %s ps.\n",
 +                        gmx_step_str(ir->nsteps,sbuf),tbuf);
 +            }
 +        }
 +        fprintf(fplog,"\n");
 +    }
 +
 +    /* Set and write start time */
 +    runtime_start(runtime);
 +    print_date_and_time(fplog,cr->nodeid,"Started mdrun",runtime);
 +    wallcycle_start(wcycle,ewcRUN);
 +    if (fplog)
 +    {
 +        fprintf(fplog,"\n");
 +    }
 +
 +    /* safest point to do file checkpointing is here.  More general point would be immediately before integrator call */
 +#ifdef GMX_FAHCORE
 +    chkpt_ret=fcCheckPointParallel( cr->nodeid,
 +                                    NULL,0);
 +    if ( chkpt_ret == 0 ) 
 +        gmx_fatal( 3,__FILE__,__LINE__, "Checkpoint error on step %d\n", 0 );
 +#endif
 +
 +    debug_gmx();
 +    /***********************************************************
 +     *
 +     *             Loop over MD steps 
 +     *
 +     ************************************************************/
 +
 +    /* if rerunMD then read coordinates and velocities from input trajectory */
 +    if (bRerunMD)
 +    {
 +        if (getenv("GMX_FORCE_UPDATE"))
 +        {
 +            bForceUpdate = TRUE;
 +        }
 +
 +        rerun_fr.natoms = 0;
 +        if (MASTER(cr))
 +        {
 +            bNotLastFrame = read_first_frame(oenv,&status,
 +                                             opt2fn("-rerun",nfile,fnm),
 +                                             &rerun_fr,TRX_NEED_X | TRX_READ_V);
 +            if (rerun_fr.natoms != top_global->natoms)
 +            {
 +                gmx_fatal(FARGS,
 +                          "Number of atoms in trajectory (%d) does not match the "
 +                          "run input file (%d)\n",
 +                          rerun_fr.natoms,top_global->natoms);
 +            }
 +            if (ir->ePBC != epbcNONE)
 +            {
 +                if (!rerun_fr.bBox)
 +                {
 +                    gmx_fatal(FARGS,"Rerun trajectory frame step %d time %f does not contain a box, while pbc is used",rerun_fr.step,rerun_fr.time);
 +                }
 +                if (max_cutoff2(ir->ePBC,rerun_fr.box) < sqr(fr->rlistlong))
 +                {
 +                    gmx_fatal(FARGS,"Rerun trajectory frame step %d time %f has too small box dimensions",rerun_fr.step,rerun_fr.time);
 +                }
 +            }
 +        }
 +
 +        if (PAR(cr))
 +        {
 +            rerun_parallel_comm(cr,&rerun_fr,&bNotLastFrame);
 +        }
 +
 +        if (ir->ePBC != epbcNONE)
 +        {
 +            /* Set the shift vectors.
 +             * Necessary here when we have a static box different from the tpr box.
 +             */
 +            calc_shifts(rerun_fr.box,fr->shift_vec);
 +        }
 +    }
 +
 +    /* loop over MD steps or if rerunMD to end of input trajectory */
 +    bFirstStep = TRUE;
 +    /* Skip the first Nose-Hoover integration when we get the state from tpx */
 +    bStateFromTPX = !bStateFromCP;
 +    bInitStep = bFirstStep && (bStateFromTPX || bVV);
 +    bStartingFromCpt = (Flags & MD_STARTFROMCPT) && bInitStep;
 +    bLastStep    = FALSE;
 +    bSumEkinhOld = FALSE;
 +    bExchanged   = FALSE;
 +
 +    init_global_signals(&gs,cr,ir,repl_ex_nst);
 +
 +    step = ir->init_step;
 +    step_rel = 0;
 +
 +    if (ir->nstlist == -1)
 +    {
 +        init_nlistheuristics(&nlh,bGStatEveryStep,step);
 +    }
 +
 +    if (MULTISIM(cr) && (repl_ex_nst <=0 ))
 +    {
 +        /* check how many steps are left in other sims */
 +        multisim_nsteps=get_multisim_nsteps(cr, ir->nsteps);
 +    }
 +
 +
 +    /* and stop now if we should */
 +    bLastStep = (bRerunMD || (ir->nsteps >= 0 && step_rel > ir->nsteps) ||
 +                 ((multisim_nsteps >= 0) && (step_rel >= multisim_nsteps )));
 +    while (!bLastStep || (bRerunMD && bNotLastFrame)) {
 +
 +        wallcycle_start(wcycle,ewcSTEP);
 +
 +        if (bRerunMD) {
 +            if (rerun_fr.bStep) {
 +                step = rerun_fr.step;
 +                step_rel = step - ir->init_step;
 +            }
 +            if (rerun_fr.bTime) {
 +                t = rerun_fr.time;
 +            }
 +            else
 +            {
 +                t = step;
 +            }
 +        } 
 +        else 
 +        {
 +            bLastStep = (step_rel == ir->nsteps);
 +            t = t0 + step*ir->delta_t;
 +        }
 +
 +        if (ir->efep != efepNO || ir->bSimTemp)
 +        {
 +            /* find and set the current lambdas.  If rerunning, we either read in a state, or a lambda value,
 +               requiring different logic. */
 +            
 +            set_current_lambdas(step,ir->fepvals,bRerunMD,&rerun_fr,state_global,state,lam0);
 +            bDoDHDL = do_per_step(step,ir->fepvals->nstdhdl);
 +            bDoFEP  = (do_per_step(step,nstfep) && (ir->efep != efepNO));
 +            bDoExpanded  = (do_per_step(step,ir->expandedvals->nstexpanded) && (ir->bExpanded) && (step > 0));
 +        }
 +
 +        if (bSimAnn) 
 +        {
 +            update_annealing_target_temp(&(ir->opts),t);
 +        }
 +
 +        if (bRerunMD)
 +        {
 +            if (!(DOMAINDECOMP(cr) && !MASTER(cr)))
 +            {
 +                for(i=0; i<state_global->natoms; i++)
 +                {
 +                    copy_rvec(rerun_fr.x[i],state_global->x[i]);
 +                }
 +                if (rerun_fr.bV)
 +                {
 +                    for(i=0; i<state_global->natoms; i++)
 +                    {
 +                        copy_rvec(rerun_fr.v[i],state_global->v[i]);
 +                    }
 +                }
 +                else
 +                {
 +                    for(i=0; i<state_global->natoms; i++)
 +                    {
 +                        clear_rvec(state_global->v[i]);
 +                    }
 +                    if (bRerunWarnNoV)
 +                    {
 +                        fprintf(stderr,"\nWARNING: Some frames do not contain velocities.\n"
 +                                "         Ekin, temperature and pressure are incorrect,\n"
 +                                "         the virial will be incorrect when constraints are present.\n"
 +                                "\n");
 +                        bRerunWarnNoV = FALSE;
 +                    }
 +                }
 +            }
 +            copy_mat(rerun_fr.box,state_global->box);
 +            copy_mat(state_global->box,state->box);
 +
 +            if (vsite && (Flags & MD_RERUN_VSITE))
 +            {
 +                if (DOMAINDECOMP(cr))
 +                {
 +                    gmx_fatal(FARGS,"Vsite recalculation with -rerun is not implemented for domain decomposition, use particle decomposition");
 +                }
 +                if (graph)
 +                {
 +                    /* Following is necessary because the graph may get out of sync
 +                     * with the coordinates if we only have every N'th coordinate set
 +                     */
 +                    mk_mshift(fplog,graph,fr->ePBC,state->box,state->x);
 +                    shift_self(graph,state->box,state->x);
 +                }
 +                construct_vsites(fplog,vsite,state->x,nrnb,ir->delta_t,state->v,
 +                                 top->idef.iparams,top->idef.il,
 +                                 fr->ePBC,fr->bMolPBC,graph,cr,state->box);
 +                if (graph)
 +                {
 +                    unshift_self(graph,state->box,state->x);
 +                }
 +            }
 +        }
 +
 +        /* Stop Center of Mass motion */
 +        bStopCM = (ir->comm_mode != ecmNO && do_per_step(step,ir->nstcomm));
 +
 +        /* Copy back starting coordinates in case we're doing a forcefield scan */
 +        if (bFFscan)
 +        {
 +            for(ii=0; (ii<state->natoms); ii++)
 +            {
 +                copy_rvec(xcopy[ii],state->x[ii]);
 +                copy_rvec(vcopy[ii],state->v[ii]);
 +            }
 +            copy_mat(boxcopy,state->box);
 +        }
 +
 +        if (bRerunMD)
 +        {
 +            /* for rerun MD do Neighbour Searching on every step, unless nstlist is 0 (then only on the first step) */
 +            bNS = (bFirstStep || ir->nstlist != 0);
 +            bNStList = bNS;
 +        }
 +        else
 +        {
 +            /* Determine whether or not to do Neighbour Searching and LR */
 +            bNStList = (ir->nstlist > 0  && step % ir->nstlist == 0);
 +            
 +            bNS = (bFirstStep || bExchanged || bNStList || bDoFEP ||
 +                   (ir->nstlist == -1 && nlh.nabnsb > 0));
 +
 +            if (bNS && ir->nstlist == -1)
 +            {
 +                set_nlistheuristics(&nlh,bFirstStep || bExchanged || bDoFEP, step);
 +            }
 +        } 
 +
 +        /* check whether we should stop because another simulation has 
 +           stopped. */
 +        if (MULTISIM(cr))
 +        {
 +            if ( (multisim_nsteps >= 0) &&  (step_rel >= multisim_nsteps)  &&  
 +                 (multisim_nsteps != ir->nsteps) )  
 +            {
 +                if (bNS)
 +                {
 +                    if (MASTER(cr))
 +                    {
 +                        fprintf(stderr, 
 +                                "Stopping simulation %d because another one has finished\n",
 +                                cr->ms->sim);
 +                    }
 +                    bLastStep=TRUE;
 +                    gs.sig[eglsCHKPT] = 1;
 +                }
 +            }
 +        }
 +
 +        /* < 0 means stop at next step, > 0 means stop at next NS step */
 +        if ( (gs.set[eglsSTOPCOND] < 0 ) ||
 +             ( (gs.set[eglsSTOPCOND] > 0 ) && ( bNS || ir->nstlist==0)) )
 +        {
 +            bLastStep = TRUE;
 +        }
 +
 +        /* Determine whether or not to update the Born radii if doing GB */
 +        bBornRadii=bFirstStep;
 +        if (ir->implicit_solvent && (step % ir->nstgbradii==0))
 +        {
 +            bBornRadii=TRUE;
 +        }
 +        
 +        do_log = do_per_step(step,ir->nstlog) || bFirstStep || bLastStep;
 +        do_verbose = bVerbose &&
 +                  (step % stepout == 0 || bFirstStep || bLastStep);
 +
 +        if (bNS && !(bFirstStep && ir->bContinuation && !bRerunMD))
 +        {
 +            if (bRerunMD)
 +            {
 +                bMasterState = TRUE;
 +            }
 +            else
 +            {
 +                bMasterState = FALSE;
 +                /* Correct the new box if it is too skewed */
 +                if (DYNAMIC_BOX(*ir))
 +                {
 +                    if (correct_box(fplog,step,state->box,graph))
 +                    {
 +                        bMasterState = TRUE;
 +                    }
 +                }
 +                if (DOMAINDECOMP(cr) && bMasterState)
 +                {
 +                    dd_collect_state(cr->dd,state,state_global);
 +                }
 +            }
 +
 +            if (DOMAINDECOMP(cr))
 +            {
 +                /* Repartition the domain decomposition */
 +                wallcycle_start(wcycle,ewcDOMDEC);
 +                dd_partition_system(fplog,step,cr,
 +                                    bMasterState,nstglobalcomm,
 +                                    state_global,top_global,ir,
 +                                    state,&f,mdatoms,top,fr,
 +                                    vsite,shellfc,constr,
 +                                    nrnb,wcycle,do_verbose);
 +                wallcycle_stop(wcycle,ewcDOMDEC);
 +                /* If using an iterative integrator, reallocate space to match the decomposition */
 +            }
 +        }
 +
 +        if (MASTER(cr) && do_log && !bFFscan)
 +        {
 +            print_ebin_header(fplog,step,t,state->lambda[efptFEP]); /* can we improve the information printed here? */
 +        }
 +
 +        if (ir->efep != efepNO)
 +        {
 +            update_mdatoms(mdatoms,state->lambda[efptMASS]);
 +        }
 +
 +        if (bRerunMD && rerun_fr.bV)
 +        {
 +            
 +            /* We need the kinetic energy at minus the half step for determining
 +             * the full step kinetic energy and possibly for T-coupling.*/
 +            /* This may not be quite working correctly yet . . . . */
 +            compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
 +                            wcycle,enerd,NULL,NULL,NULL,NULL,mu_tot,
 +                            constr,NULL,FALSE,state->box,
 +                            top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
 +                            CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
 +        }
 +        clear_mat(force_vir);
 +        
 +        /* Ionize the atoms if necessary */
 +        if (bIonize)
 +        {
 +            ionize(fplog,oenv,mdatoms,top_global,t,ir,state->x,state->v,
 +                   mdatoms->start,mdatoms->start+mdatoms->homenr,state->box,cr);
 +        }
 +        
 +        /* Update force field in ffscan program */
 +        if (bFFscan)
 +        {
 +            if (update_forcefield(fplog,
 +                                  nfile,fnm,fr,
 +                                  mdatoms->nr,state->x,state->box))
 +            {
 +                gmx_finalize_par();
 +
 +                exit(0);
 +            }
 +        }
 +
 +        /* We write a checkpoint at this MD step when:
 +         * either at an NS step when we signalled through gs,
 +         * or at the last step (but not when we do not want confout),
 +         * but never at the first step or with rerun.
 +         */
 +        bCPT = (((gs.set[eglsCHKPT] && (bNS || ir->nstlist == 0)) ||
 +                 (bLastStep && (Flags & MD_CONFOUT))) &&
 +                step > ir->init_step && !bRerunMD);
 +        if (bCPT)
 +        {
 +            gs.set[eglsCHKPT] = 0;
 +        }
 +
 +        /* Determine the energy and pressure:
 +         * at nstcalcenergy steps and at energy output steps (set below).
 +         */
 +
 +        if (EI_VV(ir->eI) && (!bInitStep)) {  /* for vv, the first half actually corresponds to the last step */
 +            bNstEner = do_per_step(step-1,ir->nstcalcenergy);
 +        } else {
 +            bNstEner = do_per_step(step,ir->nstcalcenergy);
 +        }
 +        bCalcEnerPres =
 +            (bNstEner ||
 +             (ir->epc > epcNO && do_per_step(step,ir->nstpcouple)));
 +
 +        /* Do we need global communication ? */
 +        bGStat = (bCalcEnerPres || bStopCM ||
 +                  do_per_step(step,nstglobalcomm) ||
 +                  (ir->nstlist == -1 && !bRerunMD && step >= nlh.step_nscheck));
 +
 +        do_ene = (do_per_step(step,ir->nstenergy) || bLastStep);
 +
 +        if (do_ene || do_log)
 +        {
 +            bCalcEnerPres = TRUE;
 +            bGStat        = TRUE;
 +        }
 +        
 +        /* these CGLO_ options remain the same throughout the iteration */
 +        cglo_flags = ((bRerunMD ? CGLO_RERUNMD : 0) |
 +                      (bGStat ? CGLO_GSTAT : 0)
 +            );
 +        
 +        force_flags = (GMX_FORCE_STATECHANGED |
 +                       ((DYNAMIC_BOX(*ir) || bRerunMD) ? GMX_FORCE_DYNAMICBOX : 0) |
 +                       GMX_FORCE_ALLFORCES |
 +                       (bNStList ? GMX_FORCE_DOLR : 0) |
 +                       GMX_FORCE_SEPLRF |
 +                       (bCalcEnerPres ? GMX_FORCE_VIRIAL : 0) |
 +                       (bDoFEP ? GMX_FORCE_DHDL : 0)
 +            );
 +        
 +        if (shellfc)
 +        {
 +            /* Now is the time to relax the shells */
 +            count=relax_shell_flexcon(fplog,cr,bVerbose,bFFscan ? step+1 : step,
 +                                      ir,bNS,force_flags,
 +                                      bStopCM,top,top_global,
 +                                      constr,enerd,fcd,
 +                                      state,f,force_vir,mdatoms,
 +                                      nrnb,wcycle,graph,groups,
 +                                      shellfc,fr,bBornRadii,t,mu_tot,
 +                                      state->natoms,&bConverged,vsite,
 +                                      outf->fp_field);
 +            tcount+=count;
 +
 +            if (bConverged)
 +            {
 +                nconverged++;
 +            }
 +        }
 +        else
 +        {
 +            /* The coordinates (x) are shifted (to get whole molecules)
 +             * in do_force.
 +             * This is parallelized as well, and does communication too. 
 +             * Check comments in sim_util.c
 +             */
 +            do_force(fplog,cr,ir,step,nrnb,wcycle,top,top_global,groups,
 +                     state->box,state->x,&state->hist,
 +                     f,force_vir,mdatoms,enerd,fcd,
 +                     state->lambda,graph,
 +                     fr,vsite,mu_tot,t,outf->fp_field,ed,bBornRadii,
 +                     (bNS ? GMX_FORCE_NS : 0) | force_flags);
 +        }
 +        
 +        if (bTCR)
 +        {
 +            mu_aver = calc_mu_aver(cr,state->x,mdatoms->chargeA,
 +                                   mu_tot,&top_global->mols,mdatoms,gnx,grpindex);
 +        }
 +        
 +        if (bTCR && bFirstStep)
 +        {
 +            tcr=init_coupling(fplog,nfile,fnm,cr,fr,mdatoms,&(top->idef));
 +            fprintf(fplog,"Done init_coupling\n"); 
 +            fflush(fplog);
 +        }
 +        
 +        if (bVV && !bStartingFromCpt && !bRerunMD)
 +        /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
 +        {
 +            if (ir->eI==eiVV && bInitStep) 
 +            {
 +                /* if using velocity verlet with full time step Ekin,
 +                 * take the first half step only to compute the 
 +                 * virial for the first step. From there,
 +                 * revert back to the initial coordinates
 +                 * so that the input is actually the initial step.
 +                 */
 +                copy_rvecn(state->v,cbuf,0,state->natoms); /* should make this better for parallelizing? */
 +            } else {
 +                /* this is for NHC in the Ekin(t+dt/2) version of vv */
 +                trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ1);            
 +            }
 +
 +            update_coords(fplog,step,ir,mdatoms,state,
 +                          f,fr->bTwinRange && bNStList,fr->f_twin,fcd,
 +                          ekind,M,wcycle,upd,bInitStep,etrtVELOCITY1,
 +                          cr,nrnb,constr,&top->idef);
 +            
 +            if (bIterations)
 +            {
 +                gmx_iterate_init(&iterate,bIterations && !bInitStep);
 +            }
 +            /* for iterations, we save these vectors, as we will be self-consistently iterating
 +               the calculations */
 +
 +            /*#### UPDATE EXTENDED VARIABLES IN TROTTER FORMULATION */
 +            
 +            /* save the state */
 +            if (bIterations && iterate.bIterate) { 
 +                copy_coupling_state(state,bufstate,ekind,ekind_save,&(ir->opts));
 +            }
 +            
 +            bFirstIterate = TRUE;
 +            while (bFirstIterate || (bIterations && iterate.bIterate))
 +            {
 +                if (bIterations && iterate.bIterate) 
 +                {
 +                    copy_coupling_state(bufstate,state,ekind_save,ekind,&(ir->opts));
 +                    if (bFirstIterate && bTrotter) 
 +                    {
 +                        /* The first time through, we need a decent first estimate
 +                           of veta(t+dt) to compute the constraints.  Do
 +                           this by computing the box volume part of the
 +                           trotter integration at this time. Nothing else
 +                           should be changed by this routine here.  If
 +                           !(first time), we start with the previous value
 +                           of veta.  */
 +                        
 +                        veta_save = state->veta;
 +                        trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ0);
 +                        vetanew = state->veta;
 +                        state->veta = veta_save;
 +                    } 
 +                } 
 +                
 +                bOK = TRUE;
 +                if ( !bRerunMD || rerun_fr.bV || bForceUpdate) {  /* Why is rerun_fr.bV here?  Unclear. */
 +                    dvdl = 0;
 +                    
 +                    update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,state,graph,f,
 +                                       &top->idef,shake_vir,NULL,
 +                                       cr,nrnb,wcycle,upd,constr,
 +                                       bInitStep,TRUE,bCalcEnerPres,vetanew);
 +                    
 +                    if (!bOK && !bFFscan)
 +                    {
 +                        gmx_fatal(FARGS,"Constraint error: Shake, Lincs or Settle could not solve the constrains");
 +                    }
 +                    
 +                } 
 +                else if (graph)
 +                { /* Need to unshift here if a do_force has been
 +                     called in the previous step */
 +                    unshift_self(graph,state->box,state->x);
 +                }
 +
 +                
 +                /* if VV, compute the pressure and constraints */
 +                /* For VV2, we strictly only need this if using pressure
 +                 * control, but we really would like to have accurate pressures
 +                 * printed out.
 +                 * Think about ways around this in the future?
 +                 * For now, keep this choice in comments.
 +                 */
 +                /* bPres = (ir->eI==eiVV || IR_NPT_TROTTER(ir)); */
 +                /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && IR_NPT_TROTTER(ir)));*/
 +                bPres = TRUE;
 +                bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK));
 +                if (bNstEner && ir->eI==eiVVAK)  /*MRS:  7/9/2010 -- this still doesn't fix it?*/
 +                {
 +                    bSumEkinhOld = TRUE;
 +                }
 +                compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
 +                                wcycle,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
 +                                constr,NULL,FALSE,state->box,
 +                                top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
 +                                cglo_flags 
 +                                | CGLO_ENERGY 
 +                                | (bStopCM ? CGLO_STOPCM : 0)
 +                                | (bTemp ? CGLO_TEMPERATURE:0) 
 +                                | (bPres ? CGLO_PRESSURE : 0) 
 +                                | (bPres ? CGLO_CONSTRAINT : 0)
 +                                | ((bIterations && iterate.bIterate) ? CGLO_ITERATE : 0)  
 +                                | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
 +                                | CGLO_SCALEEKIN 
 +                    );
 +                /* explanation of above: 
 +                   a) We compute Ekin at the full time step
 +                   if 1) we are using the AveVel Ekin, and it's not the
 +                   initial step, or 2) if we are using AveEkin, but need the full
 +                   time step kinetic energy for the pressure (always true now, since we want accurate statistics).
 +                   b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in 
 +                   EkinAveVel because it's needed for the pressure */
 +                
 +                /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
 +                if (!bInitStep) 
 +                {
 +                    if (bTrotter)
 +                    {
 +                        trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ2);
 +                    } 
 +                    else 
 +                    {
 +                        update_tcouple(fplog,step,ir,state,ekind,wcycle,upd,&MassQ,mdatoms);
 +                    }
 +                }
 +                
 +                if (bIterations &&
 +                    done_iterating(cr,fplog,step,&iterate,bFirstIterate,
 +                                   state->veta,&vetanew)) 
 +                {
 +                    break;
 +                }
 +                bFirstIterate = FALSE;
 +            }
 +
 +            if (bTrotter && !bInitStep) {
 +                enerd->term[F_DVDL_BONDED] += dvdl;        /* only add after iterations */
 +                copy_mat(shake_vir,state->svir_prev);
 +                copy_mat(force_vir,state->fvir_prev);
 +                if (IR_NVT_TROTTER(ir) && ir->eI==eiVV) {
 +                    /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
 +                    enerd->term[F_TEMP] = sum_ekin(&(ir->opts),ekind,NULL,(ir->eI==eiVV),FALSE,FALSE);
 +                    enerd->term[F_EKIN] = trace(ekind->ekin);
 +                }
 +            }
 +            /* if it's the initial step, we performed this first step just to get the constraint virial */
 +            if (bInitStep && ir->eI==eiVV) {
 +                copy_rvecn(cbuf,state->v,0,state->natoms);
 +            }
 +            
 +            if (fr->bSepDVDL && fplog && do_log) 
 +            {
 +                fprintf(fplog,sepdvdlformat,"Constraint",0.0,dvdl);
 +            }
 +            enerd->term[F_DVDL_BONDED] += dvdl;
 +        }
 +
 +        /* MRS -- now done iterating -- compute the conserved quantity */
 +        if (bVV) {
 +            saved_conserved_quantity = compute_conserved_from_auxiliary(ir,state,&MassQ);
 +            if (ir->eI==eiVV) 
 +            {
 +                last_ekin = enerd->term[F_EKIN];
 +            }
 +            if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres)) 
 +            {
 +                saved_conserved_quantity -= enerd->term[F_DISPCORR];
 +            }
 +            /* sum up the foreign energy and dhdl terms for vv.  currently done every step so that dhdl is correct in the .edr */
 +            sum_dhdl(enerd,state->lambda,ir->fepvals);
 +        }
 +        
 +        /* ########  END FIRST UPDATE STEP  ############## */
 +        /* ########  If doing VV, we now have v(dt) ###### */
 +        if (bDoExpanded) {
 +            /* perform extended ensemble sampling in lambda - we don't
 +               actually move to the new state before outputting
 +               statistics, but if performing simulated tempering, we
 +               do update the velocities and the tau_t. */
 +
 +            lamnew = ExpandedEnsembleDynamics(fplog,ir,enerd,state,&MassQ,&df_history,step,mcrng,state->v,mdatoms);
 +        }
 +        /* ################## START TRAJECTORY OUTPUT ################# */
 +        
 +        /* Now we have the energies and forces corresponding to the 
 +         * coordinates at time t. We must output all of this before
 +         * the update.
 +         * for RerunMD t is read from input trajectory
 +         */
 +        mdof_flags = 0;
 +        if (do_per_step(step,ir->nstxout)) { mdof_flags |= MDOF_X; }
 +        if (do_per_step(step,ir->nstvout)) { mdof_flags |= MDOF_V; }
 +        if (do_per_step(step,ir->nstfout)) { mdof_flags |= MDOF_F; }
 +        if (do_per_step(step,ir->nstxtcout)) { mdof_flags |= MDOF_XTC; }
 +        if (bCPT) { mdof_flags |= MDOF_CPT; };
 +
 +#if defined(GMX_FAHCORE) || defined(GMX_WRITELASTSTEP)
 +        if (bLastStep)
 +        {
 +            /* Enforce writing positions and velocities at end of run */
 +            mdof_flags |= (MDOF_X | MDOF_V);
 +        }
 +#endif
 +#ifdef GMX_FAHCORE
 +        if (MASTER(cr))
 +            fcReportProgress( ir->nsteps, step );
 +
 +        /* sync bCPT and fc record-keeping */
 +        if (bCPT && MASTER(cr))
 +            fcRequestCheckPoint();
 +#endif
 +        
 +        if (mdof_flags != 0)
 +        {
 +            wallcycle_start(wcycle,ewcTRAJ);
 +            if (bCPT)
 +            {
 +                if (state->flags & (1<<estLD_RNG))
 +                {
 +                    get_stochd_state(upd,state);
 +                }
 +                if (state->flags  & (1<<estMC_RNG))
 +                {
 +                    get_mc_state(mcrng,state);
 +                }
 +                if (MASTER(cr))
 +                {
 +                    if (bSumEkinhOld)
 +                    {
 +                        state_global->ekinstate.bUpToDate = FALSE;
 +                    }
 +                    else
 +                    {
 +                        update_ekinstate(&state_global->ekinstate,ekind);
 +                        state_global->ekinstate.bUpToDate = TRUE;
 +                    }
 +                    update_energyhistory(&state_global->enerhist,mdebin);
 +                    if (ir->efep!=efepNO || ir->bSimTemp) 
 +                    {
 +                        state_global->fep_state = state->fep_state; /* MRS: seems kludgy. The code should be
 +                                                                       structured so this isn't necessary.
 +                                                                       Note this reassignment is only necessary
 +                                                                       for single threads.*/
 +                        copy_df_history(&state_global->dfhist,&df_history);
 +                    }
 +                }
 +            }
 +            write_traj(fplog,cr,outf,mdof_flags,top_global,
 +                       step,t,state,state_global,f,f_global,&n_xtc,&x_xtc);
 +            if (bCPT)
 +            {
 +                nchkpt++;
 +                bCPT = FALSE;
 +            }
 +            debug_gmx();
 +            if (bLastStep && step_rel == ir->nsteps &&
 +                (Flags & MD_CONFOUT) && MASTER(cr) &&
 +                !bRerunMD && !bFFscan)
 +            {
 +                /* x and v have been collected in write_traj,
 +                 * because a checkpoint file will always be written
 +                 * at the last step.
 +                 */
 +                fprintf(stderr,"\nWriting final coordinates.\n");
 +                if (ir->ePBC != epbcNONE && !ir->bPeriodicMols &&
 +                    DOMAINDECOMP(cr))
 +                {
 +                    /* Make molecules whole only for confout writing */
 +                    do_pbc_mtop(fplog,ir->ePBC,state->box,top_global,state_global->x);
 +                }
 +                write_sto_conf_mtop(ftp2fn(efSTO,nfile,fnm),
 +                                    *top_global->name,top_global,
 +                                    state_global->x,state_global->v,
 +                                    ir->ePBC,state->box);
 +                debug_gmx();
 +            }
 +            wallcycle_stop(wcycle,ewcTRAJ);
 +        }
 +        
 +        /* kludge -- virial is lost with restart for NPT control. Must restart */
 +        if (bStartingFromCpt && bVV) 
 +        {
 +            copy_mat(state->svir_prev,shake_vir);
 +            copy_mat(state->fvir_prev,force_vir);
 +        }
 +        /*  ################## END TRAJECTORY OUTPUT ################ */
 +        
 +        /* Determine the pressure:
 +         * always when we want exact averages in the energy file,
 +         * at ns steps when we have pressure coupling,
 +         * otherwise only at energy output steps (set below).
 +         */
 +
 +        
 +        bNstEner = (bGStatEveryStep || do_per_step(step,ir->nstcalcenergy));
 +        bCalcEnerPres = bNstEner;
 +
 +        /* Do we need global communication ? */
 +        bGStat = (bGStatEveryStep || bStopCM || bNS ||
 +                  (ir->nstlist == -1 && !bRerunMD && step >= nlh.step_nscheck));
 +
 +        do_ene = (do_per_step(step,ir->nstenergy) || bLastStep);
 +
 +        if (do_ene || do_log)
 +        {
 +            bCalcEnerPres = TRUE;
 +            bGStat        = TRUE;
 +        }
 +
 +        /* Determine the wallclock run time up till now */
 +        run_time = gmx_gettime() - (double)runtime->real;
 +        /* Check whether everything is still all right */
 +        if (((int)gmx_get_stop_condition() > handled_stop_condition)
 +#ifdef GMX_THREAD_MPI
 +            && MASTER(cr)
 +#endif
 +            )
 +        {
 +            /* this just makes gs.sig compatible with the hack
 +               of sending signals around by MPI_Reduce together with
 +               other floats */
 +            if ( gmx_get_stop_condition() == gmx_stop_cond_next_ns )
 +                gs.sig[eglsSTOPCOND]=1;
 +            if ( gmx_get_stop_condition() == gmx_stop_cond_next )
 +                gs.sig[eglsSTOPCOND]=-1;
 +            /* < 0 means stop at next step, > 0 means stop at next NS step */
 +            if (fplog)
 +            {
 +                fprintf(fplog,
 +                        "\n\nReceived the %s signal, stopping at the next %sstep\n\n",
 +                        gmx_get_signal_name(),
 +                        gs.sig[eglsSTOPCOND]==1 ? "NS " : "");
 +                fflush(fplog);
 +            }
 +            fprintf(stderr,
 +                    "\n\nReceived the %s signal, stopping at the next %sstep\n\n",
 +                    gmx_get_signal_name(),
 +                    gs.sig[eglsSTOPCOND]==1 ? "NS " : "");
 +            fflush(stderr);
 +            handled_stop_condition=(int)gmx_get_stop_condition();
 +        }
 +        else if (MASTER(cr) && (bNS || ir->nstlist <= 0) &&
 +                 (max_hours > 0 && run_time > max_hours*60.0*60.0*0.99) &&
 +                 gs.sig[eglsSTOPCOND] == 0 && gs.set[eglsSTOPCOND] == 0)
 +        {
 +            /* Signal to terminate the run */
 +            gs.sig[eglsSTOPCOND] = 1;
 +            if (fplog)
 +            {
 +                fprintf(fplog,"\nStep %s: Run time exceeded %.3f hours, will terminate the run\n",gmx_step_str(step,sbuf),max_hours*0.99);
 +            }
 +            fprintf(stderr, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n",gmx_step_str(step,sbuf),max_hours*0.99);
 +        }
 +
 +        if (bResetCountersHalfMaxH && MASTER(cr) &&
 +            run_time > max_hours*60.0*60.0*0.495)
 +        {
 +            gs.sig[eglsRESETCOUNTERS] = 1;
 +        }
 +
 +        if (ir->nstlist == -1 && !bRerunMD)
 +        {
 +            /* When bGStatEveryStep=FALSE, global_stat is only called
 +             * when we check the atom displacements, not at NS steps.
 +             * This means that also the bonded interaction count check is not
 +             * performed immediately after NS. Therefore a few MD steps could
 +             * be performed with missing interactions.
 +             * But wrong energies are never written to file,
 +             * since energies are only written after global_stat
 +             * has been called.
 +             */
 +            if (step >= nlh.step_nscheck)
 +            {
 +                nlh.nabnsb = natoms_beyond_ns_buffer(ir,fr,&top->cgs,
 +                                                     nlh.scale_tot,state->x);
 +            }
 +            else
 +            {
 +                /* This is not necessarily true,
 +                 * but step_nscheck is determined quite conservatively.
 +                 */
 +                nlh.nabnsb = 0;
 +            }
 +        }
 +
 +        /* In parallel we only have to check for checkpointing in steps
 +         * where we do global communication,
 +         *  otherwise the other nodes don't know.
 +         */
 +        if (MASTER(cr) && ((bGStat || !PAR(cr)) &&
 +                           cpt_period >= 0 &&
 +                           (cpt_period == 0 || 
 +                            run_time >= nchkpt*cpt_period*60.0)) &&
 +            gs.set[eglsCHKPT] == 0)
 +        {
 +            gs.sig[eglsCHKPT] = 1;
 +        }
 +
 +
 +        /* at the start of step, randomize the velocities */
 +        if (ETC_ANDERSEN(ir->etc) && EI_VV(ir->eI))
 +        {
 +            gmx_bool bDoAndersenConstr;
 +            bDoAndersenConstr = update_randomize_velocities(ir,step,mdatoms,state,upd,&top->idef,constr);
 +            /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
 +            if (bDoAndersenConstr)
 +            {
 +                update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,state,graph,f,
 +                                   &top->idef,tmp_vir,NULL,
 +                                   cr,nrnb,wcycle,upd,constr,
 +                                   bInitStep,TRUE,FALSE,vetanew);
 +            }
 +        }
 +
 +        if (bIterations)
 +        {
 +            gmx_iterate_init(&iterate,bIterations);
 +        }
 +    
 +        /* for iterations, we save these vectors, as we will be redoing the calculations */
 +        if (bIterations && iterate.bIterate) 
 +        {
 +            copy_coupling_state(state,bufstate,ekind,ekind_save,&(ir->opts));
 +        }
 +        bFirstIterate = TRUE;
 +        while (bFirstIterate || (bIterations && iterate.bIterate))
 +        {
 +            /* We now restore these vectors to redo the calculation with improved extended variables */    
 +            if (bIterations) 
 +            { 
 +                copy_coupling_state(bufstate,state,ekind_save,ekind,&(ir->opts));
 +            }
 +
 +            /* We make the decision to break or not -after- the calculation of Ekin and Pressure,
 +               so scroll down for that logic */
 +            
 +            /* #########   START SECOND UPDATE STEP ################# */
 +            /* Box is changed in update() when we do pressure coupling,
 +             * but we should still use the old box for energy corrections and when
 +             * writing it to the energy file, so it matches the trajectory files for
 +             * the same timestep above. Make a copy in a separate array.
 +             */
 +            copy_mat(state->box,lastbox);
 +
 +            bOK = TRUE;
 +            if (!(bRerunMD && !rerun_fr.bV && !bForceUpdate))
 +            {
 +                wallcycle_start(wcycle,ewcUPDATE);
 +                dvdl = 0;
 +                /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
 +                if (bTrotter) 
 +                {
 +                    if (bIterations && iterate.bIterate) 
 +                    {
 +                        if (bFirstIterate) 
 +                        {
 +                            scalevir = 1;
 +                        }
 +                        else 
 +                        {
 +                            /* we use a new value of scalevir to converge the iterations faster */
 +                            scalevir = tracevir/trace(shake_vir);
 +                        }
 +                        msmul(shake_vir,scalevir,shake_vir); 
 +                        m_add(force_vir,shake_vir,total_vir);
 +                        clear_mat(shake_vir);
 +                    }
 +                    trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ3);
 +                /* We can only do Berendsen coupling after we have summed
 +                 * the kinetic energy or virial. Since this happens
 +                 * in global_state after update, we should only do it at
 +                 * step % nstlist = 1 with bGStatEveryStep=FALSE.
 +                 */
 +                }
 +                else 
 +                {
 +                    update_tcouple(fplog,step,ir,state,ekind,wcycle,upd,&MassQ,mdatoms);
 +                    update_pcouple(fplog,step,ir,state,pcoupl_mu,M,wcycle,
 +                                   upd,bInitStep);
 +                }
 +
 +                if (bVV)
 +                {
 +                    /* velocity half-step update */
 +                    update_coords(fplog,step,ir,mdatoms,state,f,
 +                                  fr->bTwinRange && bNStList,fr->f_twin,fcd,
 +                                  ekind,M,wcycle,upd,FALSE,etrtVELOCITY2,
 +                                  cr,nrnb,constr,&top->idef);
 +                }
 +
 +                /* Above, initialize just copies ekinh into ekin,
 +                 * it doesn't copy position (for VV),
 +                 * and entire integrator for MD.
 +                 */
 +                
 +                if (ir->eI==eiVVAK) 
 +                {
 +                    copy_rvecn(state->x,cbuf,0,state->natoms);
 +                }
 +                
 +                update_coords(fplog,step,ir,mdatoms,state,f,fr->bTwinRange && bNStList,fr->f_twin,fcd,
 +                              ekind,M,wcycle,upd,bInitStep,etrtPOSITION,cr,nrnb,constr,&top->idef);
 +                wallcycle_stop(wcycle,ewcUPDATE);
 +
 +                update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,state,graph,f,
 +                                   &top->idef,shake_vir,force_vir,
 +                                   cr,nrnb,wcycle,upd,constr,
 +                                   bInitStep,FALSE,bCalcEnerPres,state->veta);  
 +                
 +                if (ir->eI==eiVVAK)
 +                {
 +                    /* erase F_EKIN and F_TEMP here? */
 +                    /* just compute the kinetic energy at the half step to perform a trotter step */
 +                    compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
 +                                    wcycle,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
 +                                    constr,NULL,FALSE,lastbox,
 +                                    top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
 +                                    cglo_flags | CGLO_TEMPERATURE
 +                        );
 +                    wallcycle_start(wcycle,ewcUPDATE);
 +                    trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ4);            
 +                    /* now we know the scaling, we can compute the positions again */
 +                    copy_rvecn(cbuf,state->x,0,state->natoms);
 +
 +                    update_coords(fplog,step,ir,mdatoms,state,f,fr->bTwinRange && bNStList,fr->f_twin,fcd,
 +                                  ekind,M,wcycle,upd,bInitStep,etrtPOSITION,cr,nrnb,constr,&top->idef);
 +                    wallcycle_stop(wcycle,ewcUPDATE);
 +
 +                    /* do we need an extra constraint here? just need to copy out of state->v to upd->xp? */
 +                    /* are the small terms in the shake_vir here due
 +                     * to numerical errors, or are they important
 +                     * physically? I'm thinking they are just errors, but not completely sure. 
 +                     * For now, will call without actually constraining, constr=NULL*/
 +                    update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,state,graph,f,
 +                                       &top->idef,tmp_vir,force_vir,
 +                                       cr,nrnb,wcycle,upd,NULL,
 +                                       bInitStep,FALSE,bCalcEnerPres,
 +                                       state->veta);  
 +                }
 +                if (!bOK && !bFFscan) 
 +                {
 +                    gmx_fatal(FARGS,"Constraint error: Shake, Lincs or Settle could not solve the constrains");
 +                }
 +                
 +                if (fr->bSepDVDL && fplog && do_log) 
 +                {
 +                    fprintf(fplog,sepdvdlformat,"Constraint dV/dl",0.0,dvdl);
 +                }
 +                enerd->term[F_DVDL_BONDED] += dvdl;
 +            } 
 +            else if (graph) 
 +            {
 +                /* Need to unshift here */
 +                unshift_self(graph,state->box,state->x);
 +            }
 +
 +            if (vsite != NULL) 
 +            {
 +                wallcycle_start(wcycle,ewcVSITECONSTR);
 +                if (graph != NULL) 
 +                {
 +                    shift_self(graph,state->box,state->x);
 +                }
 +                construct_vsites(fplog,vsite,state->x,nrnb,ir->delta_t,state->v,
 +                                 top->idef.iparams,top->idef.il,
 +                                 fr->ePBC,fr->bMolPBC,graph,cr,state->box);
 +                
 +                if (graph != NULL) 
 +                {
 +                    unshift_self(graph,state->box,state->x);
 +                }
 +                wallcycle_stop(wcycle,ewcVSITECONSTR);
 +            }
 +            
 +            /* ############## IF NOT VV, Calculate globals HERE, also iterate constraints ############ */
 +            if (ir->nstlist == -1 && bFirstIterate)
 +            {
 +                gs.sig[eglsNABNSB] = nlh.nabnsb;
 +            }
 +            bEnergyHere = (!EI_VV(ir->eI) || (EI_VV(ir->eI) && bRerunMD)); /* this is not quite working for vv and rerun! fails for running rerun on multiple threads. This is caught in runner.c. */
 +            compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
 +                            wcycle,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
 +                            constr,
 +                            bFirstIterate ? &gs : NULL, 
 +                            (step_rel % gs.nstms == 0) && 
 +                                (multisim_nsteps<0 || (step_rel<multisim_nsteps)),
 +                            lastbox,
 +                            top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
 +                            cglo_flags 
 +                            | (!EI_VV(ir->eI) ? CGLO_ENERGY : 0) 
 +                            | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
 +                            | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0) 
 +                            | (bEnergyHere || bRerunMD ? CGLO_PRESSURE : 0) 
 +                            | (bIterations && iterate.bIterate ? CGLO_ITERATE : 0) 
 +                            | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
 +                            | CGLO_CONSTRAINT
 +                );
 +            if (ir->nstlist == -1 && bFirstIterate)
 +            {
 +                nlh.nabnsb = gs.set[eglsNABNSB];
 +                gs.set[eglsNABNSB] = 0;
 +            }
 +            /* bIterate is set to keep it from eliminating the old ekin kinetic energy terms */
 +            /* #############  END CALC EKIN AND PRESSURE ################# */
 +        
 +            /* Note: this is OK, but there are some numerical precision issues with using the convergence of
 +               the virial that should probably be addressed eventually. state->veta has better properties,
 +               but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
 +               generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
 +
 +            if (bIterations && 
 +                done_iterating(cr,fplog,step,&iterate,bFirstIterate,
 +                               trace(shake_vir),&tracevir)) 
 +            {
 +                break;
 +            }
 +            bFirstIterate = FALSE;
 +        }
 +
 +        /* only add constraint dvdl after constraints */
 +        enerd->term[F_DVDL_BONDED] += dvdl;
 +        if (!bVV)
 +        {
 +            /* sum up the foreign energy and dhdl terms for md and sd. currently done every step so that dhdl is correct in the .edr */
 +            sum_dhdl(enerd,state->lambda,ir->fepvals);
 +        }
 +        update_box(fplog,step,ir,mdatoms,state,graph,f,
 +                   ir->nstlist==-1 ? &nlh.scale_tot : NULL,pcoupl_mu,nrnb,wcycle,upd,bInitStep,FALSE);
 +        
 +        /* ################# END UPDATE STEP 2 ################# */
 +        /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
 +    
 +        /* The coordinates (x) were unshifted in update */
 +        if (bFFscan && (shellfc==NULL || bConverged))
 +        {
 +            if (print_forcefield(fplog,enerd->term,mdatoms->homenr,
 +                                 f,NULL,xcopy,
 +                                 &(top_global->mols),mdatoms->massT,pres))
 +            {
 +                gmx_finalize_par();
 +
 +                fprintf(stderr,"\n");
 +                exit(0);
 +            }
 +        }
 +        if (!bGStat)
 +        {
 +            /* We will not sum ekinh_old,                                                            
 +             * so signal that we still have to do it.                                                
 +             */
 +            bSumEkinhOld = TRUE;
 +        }
 +        
 +        if (bTCR)
 +        {
 +            /* Only do GCT when the relaxation of shells (minimization) has converged,
 +             * otherwise we might be coupling to bogus energies. 
 +             * In parallel we must always do this, because the other sims might
 +             * update the FF.
 +             */
 +
 +            /* Since this is called with the new coordinates state->x, I assume
 +             * we want the new box state->box too. / EL 20040121
 +             */
 +            do_coupling(fplog,oenv,nfile,fnm,tcr,t,step,enerd->term,fr,
 +                        ir,MASTER(cr),
 +                        mdatoms,&(top->idef),mu_aver,
 +                        top_global->mols.nr,cr,
 +                        state->box,total_vir,pres,
 +                        mu_tot,state->x,f,bConverged);
 +            debug_gmx();
 +        }
 +
 +        /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
 +
 +        /* use the directly determined last velocity, not actually the averaged half steps */
 +        if (bTrotter && ir->eI==eiVV) 
 +        {
 +            enerd->term[F_EKIN] = last_ekin;
 +        }
 +        enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
 +        
 +        if (bVV)
 +        {
 +            enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
 +        }
 +        else 
 +        {
 +            enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + compute_conserved_from_auxiliary(ir,state,&MassQ);
 +        }
 +        /* Check for excessively large energies */
 +        if (bIonize) 
 +        {
 +#ifdef GMX_DOUBLE
 +            real etot_max = 1e200;
 +#else
 +            real etot_max = 1e30;
 +#endif
 +            if (fabs(enerd->term[F_ETOT]) > etot_max) 
 +            {
 +                fprintf(stderr,"Energy too large (%g), giving up\n",
 +                        enerd->term[F_ETOT]);
 +            }
 +        }
 +        /* #########  END PREPARING EDR OUTPUT  ###########  */
 +        
 +        /* Time for performance */
 +        if (((step % stepout) == 0) || bLastStep) 
 +        {
 +            runtime_upd_proc(runtime);
 +        }
 +        
 +        /* Output stuff */
 +        if (MASTER(cr))
 +        {
 +            gmx_bool do_dr,do_or;
 +            
 +            if (fplog && do_log && bDoExpanded)
 +            {
 +                /* only needed if doing expanded ensemble */
 +                PrintFreeEnergyInfoToFile(fplog,ir->fepvals,ir->expandedvals,ir->bSimTemp?ir->simtempvals:NULL,
 +                                          &df_history,state->fep_state,ir->nstlog,step);
 +            }
 +            if (!(bStartingFromCpt && (EI_VV(ir->eI)))) 
 +            {
 +                if (bNstEner)
 +                {
 +                    upd_mdebin(mdebin,bDoDHDL,TRUE,
 +                               t,mdatoms->tmass,enerd,state,
 +                               ir->fepvals,ir->expandedvals,lastbox,
 +                               shake_vir,force_vir,total_vir,pres,
 +                               ekind,mu_tot,constr);
 +                }
 +                else
 +                {
 +                    upd_mdebin_step(mdebin);
 +                }
 +                
 +                do_dr  = do_per_step(step,ir->nstdisreout);
 +                do_or  = do_per_step(step,ir->nstorireout);
 +                
 +                print_ebin(outf->fp_ene,do_ene,do_dr,do_or,do_log?fplog:NULL,
 +                           step,t,
 +                           eprNORMAL,bCompact,mdebin,fcd,groups,&(ir->opts));
 +            }
 +            if (ir->ePull != epullNO)
 +            {
 +                pull_print_output(ir->pull,step,t);
 +            }
 +            
 +            if (do_per_step(step,ir->nstlog))
 +            {
 +                if(fflush(fplog) != 0)
 +                {
 +                    gmx_fatal(FARGS,"Cannot flush logfile - maybe you are out of disk space?");
 +                }
 +            }
 +        }
 +        if (bDoExpanded)
 +        {
 +            /* Have to do this part after outputting the logfile and the edr file */
 +            state->fep_state = lamnew;
 +            for (i=0;i<efptNR;i++)
 +            {
 +                state->lambda[i] = ir->fepvals->all_lambda[i][lamnew];
 +            }
 +        }
 +        /* Remaining runtime */
 +        if (MULTIMASTER(cr) && (do_verbose || gmx_got_usr_signal() ))
 +        {
 +            if (shellfc) 
 +            {
 +                fprintf(stderr,"\n");
 +            }
 +            print_time(stderr,runtime,step,ir,cr);
 +        }
 +
 +        /* Replica exchange */
 +        bExchanged = FALSE;
 +        if ((repl_ex_nst > 0) && (step > 0) && !bLastStep &&
 +            do_per_step(step,repl_ex_nst)) 
 +        {
 +            bExchanged = replica_exchange(fplog,cr,repl_ex,
 +                                          state_global,enerd,
 +                                          state,step,t);
 +
 +            if (bExchanged && DOMAINDECOMP(cr)) 
 +            {
 +                dd_partition_system(fplog,step,cr,TRUE,1,
 +                                    state_global,top_global,ir,
 +                                    state,&f,mdatoms,top,fr,
 +                                    vsite,shellfc,constr,
 +                                    nrnb,wcycle,FALSE);
 +            }
 +        }
 +        
 +        bFirstStep = FALSE;
 +        bInitStep = FALSE;
 +        bStartingFromCpt = FALSE;
 +
 +        /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
 +        /* With all integrators, except VV, we need to retain the pressure
 +         * at the current step for coupling at the next step.
 +         */
 +        if ((state->flags & (1<<estPRES_PREV)) &&
 +            (bGStatEveryStep ||
 +             (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
 +        {
 +            /* Store the pressure in t_state for pressure coupling
 +             * at the next MD step.
 +             */
 +            copy_mat(pres,state->pres_prev);
 +        }
 +        
 +        /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
 +
 +        if ( (membed!=NULL) && (!bLastStep) )
 +        {
 +            rescale_membed(step_rel,membed,state_global->x);
 +        }
 +
 +        if (bRerunMD) 
 +        {
 +            if (MASTER(cr))
 +            {
 +                /* read next frame from input trajectory */
 +                bNotLastFrame = read_next_frame(oenv,status,&rerun_fr);
 +            }
 +
 +            if (PAR(cr))
 +            {
 +                rerun_parallel_comm(cr,&rerun_fr,&bNotLastFrame);
 +            }
 +        }
 +        
 +        if (!bRerunMD || !rerun_fr.bStep)
 +        {
 +            /* increase the MD step number */
 +            step++;
 +            step_rel++;
 +        }
 +        
 +        cycles = wallcycle_stop(wcycle,ewcSTEP);
 +        if (DOMAINDECOMP(cr) && wcycle)
 +        {
 +            dd_cycles_add(cr->dd,cycles,ddCyclStep);
 +        }
 +        
 +        if (step_rel == wcycle_get_reset_counters(wcycle) ||
 +            gs.set[eglsRESETCOUNTERS] != 0)
 +        {
 +            /* Reset all the counters related to performance over the run */
 +            reset_all_counters(fplog,cr,step,&step_rel,ir,wcycle,nrnb,runtime);
 +            wcycle_set_reset_counters(wcycle,-1);
 +            /* Correct max_hours for the elapsed time */
 +            max_hours -= run_time/(60.0*60.0);
 +            bResetCountersHalfMaxH = FALSE;
 +            gs.set[eglsRESETCOUNTERS] = 0;
 +        }
 +
 +    }
 +    /* End of main MD loop */
 +    debug_gmx();
 +    
 +    /* Stop the time */
 +    runtime_end(runtime);
 +    
 +    if (bRerunMD && MASTER(cr))
 +    {
 +        close_trj(status);
 +    }
 +    
 +    if (!(cr->duty & DUTY_PME))
 +    {
 +        /* Tell the PME only node to finish */
 +        gmx_pme_finish(cr);
 +    }
 +    
 +    if (MASTER(cr))
 +    {
 +        if (ir->nstcalcenergy > 0 && !bRerunMD) 
 +        {
 +            print_ebin(outf->fp_ene,FALSE,FALSE,FALSE,fplog,step,t,
 +                       eprAVER,FALSE,mdebin,fcd,groups,&(ir->opts));
 +        }
 +    }
 +
 +    done_mdoutf(outf);
 +
 +    debug_gmx();
 +
 +    if (ir->nstlist == -1 && nlh.nns > 0 && fplog)
 +    {
 +        fprintf(fplog,"Average neighborlist lifetime: %.1f steps, std.dev.: %.1f steps\n",nlh.s1/nlh.nns,sqrt(nlh.s2/nlh.nns - sqr(nlh.s1/nlh.nns)));
 +        fprintf(fplog,"Average number of atoms that crossed the half buffer length: %.1f\n\n",nlh.ab/nlh.nns);
 +    }
 +    
 +    if (shellfc && fplog)
 +    {
 +        fprintf(fplog,"Fraction of iterations that converged:           %.2f %%\n",
 +                (nconverged*100.0)/step_rel);
 +        fprintf(fplog,"Average number of force evaluations per MD step: %.2f\n\n",
 +                tcount/step_rel);
 +    }
 +    
 +    if (repl_ex_nst > 0 && MASTER(cr))
 +    {
 +        print_replica_exchange_statistics(fplog,repl_ex);
 +    }
 +    
 +    runtime->nsteps_done = step_rel;
 +    
 +    return 0;
 +}
diff --cc src/programs/mdrun/openmm_wrapper.cpp
index a9ee28f19e,0000000000..70d63a33d4
mode 100644,000000..100644
--- a/src/programs/mdrun/openmm_wrapper.cpp
+++ b/src/programs/mdrun/openmm_wrapper.cpp
@@@ -1,1510 -1,0 +1,1510 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2010, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +
 +/*
 + * Note, that parts of this source code originate from the Simtk release 
 + * of OpenMM accelerated Gromacs, for more details see: 
 + * https://simtk.org/project/xml/downloads.xml?group_id=161#package_id600
 + */
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <types/simple.h>
 +#include <cmath>
 +#include <set>
 +#include <iostream>
 +#include <sstream>
 +#include <fstream>
 +#include <map>
 +#include <vector>
 +#include <cctype>
 +#include <algorithm>
 +
 +using namespace std;
 +
 +#include "OpenMM.h"
 +
 +#include "gmx_fatal.h"
 +#include "typedefs.h"
 +#include "mdrun.h"
 +#include "physics.h"
 +#include "string2.h"
 +#include "gmx_gpu_utils.h"
 +#include "mtop_util.h"
 +
 +#include "openmm_wrapper.h"
 +
 +using namespace OpenMM;
 +
 +/*! \cond */
 +/* Expands to TWO comma-separated printf-style arguments: the format string of
 + * the memory-test failure message, followed by (str) which fills its leading
 + * "%s". It is used in this file as the trailing arguments of
 + * gmx_fatal(FARGS, ...), with the "Pre"/"Post" label passed as str. */
 +#define MEM_ERR_MSG(str) \
 +    "The %s-simulation GPU memory test detected errors. As memory errors would cause incorrect " \
 +    "simulation results, gromacs has aborted execution.\n Make sure that your GPU's memory is not " \
 +    "overclocked and that the device is properly cooled.\n", (str)
 +/*! \endcond */
 +
 +/* Absolute tolerance used when comparing Lennard-Jones parameters against the
 + * values predicted by the combination rules below. */
 +#define COMBRULE_CHK_TOL            1e-6
 +/* Combined sigma of a pair: arithmetic mean of the two per-type sigmas. */
 +#define COMBRULE_SIGMA(sig1, sig2)  (((sig1) + (sig2))/2)
 +/* Combined epsilon of a pair: geometric mean of the two per-type epsilons. */
 +#define COMBRULE_EPS(eps1, eps2)    (sqrt((eps1) * (eps2)))
 +
 +/*!
 + * \brief Convert a string to a value of type \p T using stream extraction.
 + *
 + * The whole string has to be consumed by the conversion: input with trailing
 + * non-whitespace characters (e.g. "15abc") is rejected instead of being
 + * silently truncated to the leading number.
 + *
 + * \param[out] t    Destination variable to convert to.
 + * \param[in]  s    String to convert from.
 + * \param[in]  f    Basefield format flag that takes any of the following I/O
 + *                  manipulators: dec, hex, oct.
 + * \returns         true if \p s was converted completely, false otherwise
 + *                  (in which case the content of \p t must not be relied on).
 + */
 +template <class T>
 +static gmx_bool from_string(T& t, const string& s, ios_base& (*f)(ios_base&))
 +{
 +    istringstream iss(s);
 +    if ((iss >> f >> t).fail())
 +    {
 +        return false;
 +    }
 +    /* Skip optional trailing whitespace; anything left after that is garbage.
 +     * eof() is already set when the extraction above consumed the last
 +     * character, and stays set regardless of what the ws extraction does. */
 +    iss >> ws;
 +    return iss.eof();
 +}
 +
 +/*!
 + * \brief Tokenize a string around a delimiter character.
 + *
 + * Empty tokens (produced by leading, trailing or consecutive delimiters)
 + * are dropped.
 + *
 + * \param[in] s      String to split.
 + * \param[in] delim  Delimiter character.
 + * \returns          Vector of the non-empty tokens found in \p s, in order.
 + */
 +static vector<string> split(const string &s, char delim)
 +{
 +    vector<string>    tokens;
 +    string::size_type begin = 0;
 +
 +    while (begin < s.length())
 +    {
 +        string::size_type end = s.find(delim, begin);
 +        if (end == string::npos)
 +        {
 +            end = s.length();
 +        }
 +        if (end > begin)
 +        {
 +            tokens.push_back(s.substr(begin, end - begin));
 +        }
 +        begin = end + 1;
 +    }
 +    return tokens;
 +}
 +
 +/*!
 + * \brief Split a string of the form "option=value" into "option" and "value" strings.
 + * This string corresponds to one option and the associated value from the option list
 + * in the mdrun -device argument.
 + *
 + * The split happens at the first '=' character. If \p s contains no '=',
 + * both \p opt and \p val are left untouched, so callers should initialize
 + * them before the call.
 + *
 + * \param[in]  s    A string containing an "option=value" pair that needs to be split up.
 + * \param[out] opt  The name of the option (text before the first '=').
 + * \param[out] val  Value of the option (text after the first '='; empty for "option=").
 + */
 +static void splitOptionValue(const string &s, string &opt, string &val)
 +{
 +    const size_t eqPos = s.find('=');
 +    if (eqPos == string::npos)
 +    {
 +        return;
 +    }
 +    opt = s.substr(0, eqPos);
 +    /* eqPos indexes a character of s, so eqPos+1 <= s.length() and substr()
 +     * cannot throw; it yields an empty string when '=' is the last character.
 +     * (The former "eqPos != s.length()" guard was always true.) */
 +    val = s.substr(eqPos + 1);
 +}
 +
 +/*!
 + * \brief Test two strings for equality ignoring case.
 + * This function is a wrapper around the gromacs function gmx_strncasecmp().
 + * \param[in] s1 String.
 + * \param[in] s2 String.
 + * \returns      A boolean: non-zero if \p s1 and \p s2 are identical apart from
 + *               character case, zero otherwise. Note that, unlike strncasecmp(),
 + *               this is NOT a three-way comparison result.
 + */
 +static gmx_bool isStringEqNCase(const string &s1, const string &s2)
 +{
 +    /* Compare up to the length of the longer string so that a string never
 +     * compares equal to one of its proper prefixes. */
 +    return (gmx_strncasecmp(s1.c_str(), s2.c_str(), max(s1.length(), s2.length())) == 0);
 +}
 +
 +/*!
 + * \brief Convert string to upper case.
 + *
 + * The conversion is done character by character with toupper(), i.e. it
 + * follows the current C locale.
 + *
 + * \param[in]  s    String to convert to uppercase.
 + * \returns         A copy of the given string converted to uppercase.
 + */
 +static string toUpper(const string &s)
 +{
 +    string stmp(s);
 +    for (string::size_type i = 0; i < stmp.length(); ++i)
 +    {
 +        /* toupper() requires a value representable as unsigned char (or EOF);
 +         * passing a negative plain char is undefined behaviour. */
 +        stmp[i] = static_cast<char>(toupper(static_cast<unsigned char>(stmp[i])));
 +    }
 +    return stmp;
 +}
 +
 +/*!
 +  \name Sizes of the constant device option arrays GmxOpenMMPlatformOptions#platforms,
 +  GmxOpenMMPlatformOptions#memtests, GmxOpenMMPlatformOptions#deviceid,
 +  GmxOpenMMPlatformOptions#force_dev and GmxOpenMMPlatformOptions#check_combrule.
 +  Each value must match the number of initializers of the corresponding array. */
 +/*! @{ */
 +#define SIZEOF_PLATFORMS    2  // "CUDA", "Reference"
 +#define SIZEOF_MEMTESTS     3 
 +#define SIZEOF_DEVICEIDS    1 
 +#define SIZEOF_FORCE_DEV    2 
 +
 +#define SIZEOF_CHECK_COMBRULE 2
 +/*! @} */
 +
 +/*! Possible platform options in the mdrun -device option.
 + *  The order of the names matches the values of the #devOpt enumerators. */
 +static const char *devOptStrings[] = { "platform", "deviceid", "memtest", "force-device", "check-combrule" }; 
 +
 +/*! Enumerated platform options in the mdrun -device option; each value is the
 + *  position of the corresponding name in #devOptStrings. */
 +enum devOpt
 +{
 +    PLATFORM       = 0,
 +    DEVICEID       = 1,
 +    MEMTEST        = 2,
 +    FORCE_DEVICE   = 3,
 +    CHECK_COMBRULE = 4  /* was missing although "check-combrule" is listed in devOptStrings */
 +};
 +
 +/*!
 + * \brief Class to extract and manage the platform options in the mdrun -device option.
 + *
 + * Options are kept as (name, value) string pairs; the member functions
 + * declared here take the option name as a string.
 + */
 +class GmxOpenMMPlatformOptions
 +{
 +public:
 +    /*! Parses the option list given as a C string. */
 +    GmxOpenMMPlatformOptions(const char *opt);
 +    /*! Empties the option map (the map destructor would do the same). */
 +    ~GmxOpenMMPlatformOptions() { options.clear(); }
 +    /*! Returns the value stored for the option named \p opt. */
 +    string getOptionValue(const string &opt);
 +    /*! Removes the option named \p opt. */
 +    void remOption(const string &opt);
 +    /*! Prints the stored options (debugging aid). */
 +    void print();
 +private:
 +    /*! Stores \p val as the value of the option named \p opt. */
 +    void setOption(const string &opt, const string &val);
 +
 +    map<string, string> options; /*!< Data structure to store the option (name, value) pairs. */
 +
 +    static const char * const platforms[SIZEOF_PLATFORMS];  /*!< Available OpenMM platforms; size #SIZEOF_PLATFORMS */
 +    static const char * const memtests[SIZEOF_MEMTESTS];    /*!< Available types of memory tests; any integer >= 15
 +                                                                 is also valid; size #SIZEOF_MEMTESTS */
 +    static const char * const deviceid[SIZEOF_DEVICEIDS];   /*!< Possible values for the deviceid option;
 +                                                                 any integer is also valid; size #SIZEOF_DEVICEIDS */
 +    static const char * const force_dev[SIZEOF_FORCE_DEV];  /*!< Possible values for the force-device option;
 +                                                                 size #SIZEOF_FORCE_DEV */
 +    static const char * const check_combrule[SIZEOF_CHECK_COMBRULE]; /*!< XXX temporary debug feature to
 +                                                                      turn off the combination rule check;
 +                                                                      size #SIZEOF_CHECK_COMBRULE */
 +};
 +
 +/* Tables of option values. The constructor of GmxOpenMMPlatformOptions uses
 + * element [0] of each table as the default value of the respective option. */
 +const char * const GmxOpenMMPlatformOptions::platforms[SIZEOF_PLATFORMS]
 +                    = {"CUDA", "Reference"};
 +                    /* NOTE(review): an "OpenCL" entry appears to have been considered but is not enabled */
 +const char * const GmxOpenMMPlatformOptions::memtests[SIZEOF_MEMTESTS]
 +                    = { "15", "full", "off" };
 +const char * const GmxOpenMMPlatformOptions::deviceid[SIZEOF_DEVICEIDS]
 +                    = { "0" };
 +const char * const GmxOpenMMPlatformOptions::force_dev[SIZEOF_FORCE_DEV]
 +                    = { "no", "yes" };
 +/* XXX temporary debug feature: "no" turns off the combination rule check */
 +const char * const GmxOpenMMPlatformOptions::check_combrule[SIZEOF_CHECK_COMBRULE] 
 +                    = { "yes", "no" };
 +
 +/*!
 + * \brief Constructor.
 + * Takes the option list, parses it, checks the options and their values for validity.
 + * When certain options are not provided by the user, as default value the first item
 + * of the respective constant array is taken (GmxOpenMMPlatformOptions#platforms,
 + * GmxOpenMMPlatformOptions#memtests, GmxOpenMMPlatformOptions#deviceid,
 + * GmxOpenMMPlatformOptions#force_dev, GmxOpenMMPlatformOptions#check_combrule).
 + *
 + * The list is a comma-separated sequence of "option=value" pairs; whitespace
 + * is ignored and option names are matched case-insensitively. Any unknown
 + * option or invalid value terminates the program through gmx_fatal().
 + *
 + * \param[in] optionString  Option list part of the mdrun -device parameter;
 + *                          NULL is treated like an empty list (all defaults).
 + */
 +GmxOpenMMPlatformOptions::GmxOpenMMPlatformOptions(const char *optionString)
 +{
 +    // set default values
 +    setOption("platform",       platforms[0]);
 +    setOption("memtest",        memtests[0]);
 +    setOption("deviceid",       deviceid[0]);
 +    setOption("force-device",   force_dev[0]);
 +    setOption("check-combrule", check_combrule[0]);
 +
 +    // constructing a std::string from a NULL pointer is undefined behaviour,
 +    // so map a missing option string to the empty list
 +    string optList(optionString != NULL ? optionString : "");
 +
 +    // remove all whitespaces
 +    optList.erase(remove_if(optList.begin(), optList.end(), ::isspace), optList.end());
 +    // tokenize around ","-s
 +    vector<string> tokens = split(optList, ',');
 +
 +    for (vector<string>::iterator it = tokens.begin(); it != tokens.end(); ++it)
 +    {
 +        // both stay empty when the token contains no '=', which then ends up
 +        // in the "invalid option" error at the bottom of the loop
 +        string opt = "", val = "";
 +        splitOptionValue(*it, opt, val);
 +
 +        if (isStringEqNCase(opt, "platform"))
 +        {
 +            /* no check, this will fail if platform does not exist when we try to set it */
 +            setOption(opt, val);
 +            continue;
 +        }
 +
 +        if (isStringEqNCase(opt, "memtest"))
 +        {
 +            /* the value has to be an integer >= 15 (seconds) or "full" or "off" */
 +            if (!isStringEqNCase(val, "full") && !isStringEqNCase(val, "off"))
 +            {
 +                int secs;
 +                if (!from_string<int>(secs, val, std::dec))
 +                {
 +                    gmx_fatal(FARGS, "Invalid value for option memtest option: \"%s\"!", val.c_str());
 +                }
 +                if (secs < 15)
 +                {
 +                    gmx_fatal(FARGS, "Incorrect value for memtest option (%d). "
 +                            "Memtest needs to run for at least 15s!", secs);
 +                }
 +            }
 +            setOption(opt, val);
 +            continue;
 +        }
 +
 +        if (isStringEqNCase(opt, "deviceid"))
 +        {
 +            /* only checked for being an integer; the parsed number itself is not stored */
 +            int id;
 +            if (!from_string<int>(id, val, std::dec) )
 +            {
 +                gmx_fatal(FARGS, "Invalid device id: \"%s\"!", val.c_str());
 +            }
 +            setOption(opt, val);
 +            continue;
 +        }
 +
 +        if (isStringEqNCase(opt, "force-device"))
 +        {
 +            /* yes/no switch */
 +            if (!isStringEqNCase(val, "yes") && !isStringEqNCase(val, "no"))
 +            {
 +                gmx_fatal(FARGS, "Invalid OpenMM force option: \"%s\"!", val.c_str());
 +            }
 +            setOption(opt, val);
 +            continue;
 +        }
 +
 +        if (isStringEqNCase(opt, "check-combrule"))
 +        {
 +            /* yes/no switch; the message used to be a copy of the force-device one */
 +            if (!isStringEqNCase(val, "yes") && !isStringEqNCase(val, "no"))
 +            {
 +                gmx_fatal(FARGS, "Invalid OpenMM check-combrule option: \"%s\"!", val.c_str());
 +            }
 +            setOption(opt, val);
 +            continue;
 +        }
 +
 +
 +        // if we got till here something went wrong
 +        gmx_fatal(FARGS, "Invalid OpenMM platform option: \"%s\"!", (*it).c_str());
 +    }
 +}
 +
 +
 +/*!
 + * \brief Getter function.
 + * \param[in] opt   Name of the option; it is converted to upper case before
 + *                  the lookup, so the name is case-insensitive.
 + * \returns         The value associated to the option, or an empty string if
 + *                  the option is not set.
 + */
 +string GmxOpenMMPlatformOptions::getOptionValue(const string &opt)
 +{
 +    map<string, string>::const_iterator it = options.find(toUpper(opt));
 +    if (it == options.end())
 +    {
 +        /* "return NULL" would construct a std::string from a null pointer,
 +         * which is undefined behaviour; report "not set" as an empty string. */
 +        return string();
 +    }
 +    return it->second;
 +}
 +
 +/*!
 + * \brief Setter function - private, only used from the constructor.
 + *
 + * The option name is stored in upper case; an already existing value for the
 + * same name is overwritten.
 + *
 + * \param[in] opt   Name of the option.
 + * \param[in] val   Value for the option.
 + */
 +void GmxOpenMMPlatformOptions::setOption(const string &opt, const string &val)
 +{
 +    const string key = toUpper(opt);
 +    options[key] = val;
 +}
 +
 +/*!
 + * \brief Removes an option together with its value from the map structure.
 + * Nothing happens if the option does not exist.
 + * \param[in] opt   Name of the option (looked up in upper case).
 + */
 +void GmxOpenMMPlatformOptions::remOption(const string &opt)
 +{
 +    map<string, string>::iterator pos = options.find(toUpper(opt));
 +    if (pos != options.end())
 +    {
 +        options.erase(pos);
 +    }
 +}
 +
 +/*!
 + * \brief Print the option-value pairs to standard output (debugging function).
 + *
 + * All five options handled by the constructor are listed, including
 + * check-combrule.
 + */
 +void GmxOpenMMPlatformOptions::print()
 +{
 +    cout << ">> Platform options: " << endl 
 +         << ">> platform       = " << getOptionValue("platform") << endl
 +         << ">> deviceID       = " << getOptionValue("deviceid") << endl
 +         << ">> memtest        = " << getOptionValue("memtest") << endl
 +         << ">> force-device   = " << getOptionValue("force-device") << endl
 +         << ">> check-combrule = " << getOptionValue("check-combrule") << endl;
 +}
 +
 +/*!
 + * \brief Container for the OpenMM related data structures that form the bridge
 + *        between the Gromacs data structures and the OpenMM library. It is only
 + *        passed through the API functions as void* to disable direct access.
 + *
 + * All members are plain public data; the class itself defines no constructor
 + * or destructor.
 + */
 +class OpenMMData
 +{
 +public:
 +    System* system;      //!< The system to simulate.
 +    Context* context;   //!< The OpenMM context in which the simulation is carried out.
 +    Integrator* integrator; //!< The integrator used in the simulation.
 +    gmx_bool removeCM;          //!< If true, remove center of mass motion, false otherwise.
 +    GmxOpenMMPlatformOptions *platformOpt; //!< Platform options.
 +};
 +
 +/*!
 + *  \brief Runs memtest on the GPU that has already been initialized by OpenMM.
 + *
 + *  The kind of test is selected by the "memtest" platform option:
 + *  "off" skips the test (with a warning), "full" runs the full test and an
 + *  integer value runs a timed test of that many seconds. A value that is none
 + *  of these, a negative duration, or a memory error found by the test
 + *  terminates the program through gmx_fatal().
 + *
 + *  \param[in] fplog    Pointer to gromacs log file.
 + *  \param[in] devId    Device id of the GPU to run the test on.
 + *                      Note: as OpenMM previously creates the context, for now this is always -1.
 + *  \param[in] pre_post Contains either "Pre" or "Post" just to be able to differentiate in
 + *                      stdout messages/log between memtest carried out before and after simulation.
 + *  \param[in] opt      Pointer to platform options object.
 + */
 +static void runMemtest(FILE* fplog, int devId, const char* pre_post, GmxOpenMMPlatformOptions *opt)
 +{
 +    char        strout_buf[STRLEN];
 +    int         which_test = 0;
 +    int         res = 0;
 +    string      s = opt->getOptionValue("memtest");
 +    const char  *test_type = s.c_str();
 +
 +    /* map the option value to: 0 = off, 1 = quick, 2 = full, other = seconds */
 +    if (!gmx_strcasecmp(test_type, "off"))
 +    {
 +        which_test = 0;
 +    }
 +    else if (!gmx_strcasecmp(test_type, "full"))
 +    {
 +        which_test = 2;
 +    }
 +    else if (!from_string<int>(which_test, s, std::dec))
 +    {
 +        /* the conversion result used to be ignored, leaving which_test
 +         * uninitialized for a non-numeric value */
 +        gmx_fatal(FARGS, "Invalid value for option memtest option: \"%s\"!", test_type);
 +    }
 +
 +    if (which_test < 0) 
 +    {
 +        gmx_fatal(FARGS, "Amount of seconds for memetest is negative (%d). ", which_test);
 +    }
 +
 +    switch (which_test)
 +    {
 +        case 0: /* no memtest */
 +            sprintf(strout_buf, "%s-simulation GPU memtest skipped. Note, that faulty memory can cause "
 +                "incorrect results!", pre_post);
 +            fprintf(fplog, "%s\n", strout_buf);
 +            /* never pass a composed message as the format string itself */
 +            gmx_warning("%s", strout_buf);
 +            break; /* case 0 */
 +
 +        case 1: /* quick memtest */
 +            fprintf(fplog,  "%s-simulation %s GPU memtest in progress...\n", pre_post, test_type);
 +            fprintf(stdout, "\n%s-simulation %s GPU memtest in progress...", pre_post, test_type);
 +            fflush(fplog);
 +            fflush(stdout);
 +            res = do_quick_memtest(devId);
 +            break; /* case 1 */
 +
 +        case 2: /* full memtest */
 +            fprintf(fplog,  "%s-simulation %s memtest in progress...\n", pre_post, test_type);
 +            fprintf(stdout, "\n%s-simulation %s memtest in progress...", pre_post, test_type);
 +            fflush(fplog);
 +            fflush(stdout);
 +            res = do_full_memtest(devId);
 +            break; /* case 2 */
 +
 +        default: /* timed memtest */
 +            fprintf(fplog,  "%s-simulation ~%ds memtest in progress...\n", pre_post, which_test);
 +            fprintf(stdout, "\n%s-simulation ~%ds memtest in progress...", pre_post, which_test);
 +            fflush(fplog);
 +            fflush(stdout);
 +            res = do_timed_memtest(devId, which_test);
 +    }
 +
 +    /* report the outcome of any test that was actually run */
 +    if (which_test != 0)
 +    {
 +        if (res != 0)
 +        {
 +            gmx_fatal(FARGS, MEM_ERR_MSG(pre_post));
 +        }
 +        else
 +        {
 +            fprintf(fplog,  "Memory test completed without errors.\n");
 +            fflush(fplog);
 +            fprintf(stdout, "done, no errors detected\n");
 +            fflush(stdout);
 +        }
 +    }
 +}
 +
 +/*!
 + * \brief Convert the Lennard-Jones parameters c12 and c6 to sigma and epsilon.
 + *
 + * For c12 > 0 and c6 > 0 the result is epsilon = c6^2/(4*c12) and
 + * sigma = (c12/c6)^(1/6). For c12 = c6 = 0 the result is epsilon = 0 and
 + * sigma = 1. Every other combination is reported through gmx_fatal().
 + *
 + * \param[in]  c12      Repulsion parameter.
 + * \param[in]  c6       Dispersion parameter.
 + * \param[out] sigma    Resulting sigma.
 + * \param[out] epsilon  Resulting epsilon.
 + */
 +static void convert_c_12_6(double c12, double c6, double *sigma, double *epsilon)
 +{
 +    const bool bothPositive = (c12 > 0 && c6 > 0);
 +    const bool bothZero     = (c12 == 0 && c6 == 0);
 +
 +    if (bothPositive)
 +    {
 +        *epsilon    = (c6*c6)/(4.0*c12);
 +        *sigma      = pow(c12/c6, 1.0/6.0);
 +        return;
 +    }
 +
 +    if (bothZero)
 +    {
 +        *epsilon    = 0.0;
 +        *sigma      = 1.0;
 +        return;
 +    }
 +
 +    gmx_fatal(FARGS,"OpenMM only supports c6 > 0 and c12 > 0 or c6 = c12 = 0.");
 +}
 +
 +/*!
 + * \brief Does gromacs option checking.
 + *
 + * Checks the gromacs mdp options for features unsupported in OpenMM, in which case
 + * it interrupts the execution through gmx_fatal(). It also warns the user about
 + * peculiarities of the OpenMM implementations. Unless the "check-combrule"
 + * platform option is set to something other than "yes", it finally verifies that
 + * the Lennard-Jones parameters obey the combination rules hardcoded in OpenMM.
 + * \param[in] fplog         Gromacs log file pointer.
 + * \param[in] opt           Platform options; only "check-combrule" is read here.
 + * \param[in] ir            Gromacs input parameters, see ::t_inputrec
 + * \param[in] top           Gromacs node local topology, \see gmx_localtop_t
 + * \param[in] fr            \see ::t_forcerec
 + * \param[in] state         Gromacs systems state, \see ::t_state
 + */
 +static void checkGmxOptions(FILE* fplog, GmxOpenMMPlatformOptions *opt,
 +                            t_inputrec *ir, gmx_localtop_t *top,
 +                            t_forcerec *fr, t_state *state)
 +{
 +    int     i, j, natoms;
 +    double  c6, c12;
 +    double  sigma_ij=0, sigma_ji=0, sigma_ii=0, sigma_jj=0, sigma_comb;
 +    double  eps_ij=0, eps_ji=0, eps_ii=0, eps_jj=0, eps_comb;
 +
 +    /* Abort if unsupported critical options are present */
 +
 +    /* Integrator */
 +    if (ir->eI ==  eiMD)
 +    {
 +        gmx_warning( "OpenMM does not support leap-frog, will use velocity-verlet integrator.");
 +    }
 +
 +    if (    (ir->eI !=  eiMD)   &&
 +            (ir->eI !=  eiVV)   &&
 +            (ir->eI !=  eiVVAK) &&
 +            (ir->eI !=  eiSD1)  &&
 +            (ir->eI !=  eiSD2)  &&
 +            (ir->eI !=  eiBD) )
 +    {
 +        gmx_fatal(FARGS, "OpenMM supports only the following integrators: md/md-vv/md-vv-avek, sd/sd1, and bd.");
 +    }
 +
 +    /* Electrostatics */
 +    if (   !(ir->coulombtype == eelPME   ||
 +             EEL_RF(ir->coulombtype)     ||
 +             ir->coulombtype == eelRF    ||
 +             ir->coulombtype == eelEWALD ||
 +             // no-cutoff
 +             (ir->coulombtype == eelCUT && ir->rcoulomb == 0 &&  ir->rvdw == 0) ||
 +             // we could have cut-off combined with GBSA (openmm will use RF)
 +             ir->implicit_solvent == eisGBSA)   )
 +    {
 +        gmx_fatal(FARGS,"OpenMM supports only the following methods for electrostatics: "
 +                "NoCutoff (i.e. rcoulomb = rvdw = 0 ),Reaction-Field, Ewald or PME.");
 +    }
 +
 +    if (EEL_RF(ir->coulombtype) && ir->epsilon_rf != 0)
 +    {
 +        // openmm has epsilon_rf=inf hard-coded
 +        gmx_warning("OpenMM will use a Reaction-Field epsilon of infinity instead of %g.",ir->epsilon_rf);
 +    }
 +
 +    /* Temperature coupling: only a warning, and only for the non-stochastic integrators */
 +    if (ir->etc != etcNO &&
 +        ir->eI  != eiSD1 &&
 +        ir->eI  != eiSD2 &&
 +        ir->eI  != eiBD )
 +    {
 +        gmx_warning("OpenMM supports only Andersen thermostat with the md/md-vv/md-vv-avek integrators.");
 +    }
 +
 +    /* Implicit solvent */
 +    if (ir->implicit_solvent == eisGBSA &&
 +        ir->gb_algorithm != egbOBC  )
 +    {
 +        gmx_warning("OpenMM does not support the specified algorithm for Generalized Born, will use OBC instead.");
 +    }
 +
 +    if (ir->opts.ngtc > 1)
 +        gmx_fatal(FARGS,"OpenMM does not support multiple temperature coupling groups.");
 +
 +    if (ir->epc != epcNO)
 +        gmx_warning("OpenMM supports only Monte Carlo barostat for pressure coupling.");
 +
 +    if (ir->opts.annealing[0])
 +        gmx_fatal(FARGS,"OpenMM does not support simulated annealing.");
 +    
 +    /* Constraints: warn when constraints are present and the algorithm is not SHAKE */
 +    if (top->idef.il[F_CONSTR].nr > 0 && ir->eConstrAlg != econtSHAKE)
 +        gmx_warning("OpenMM provides contraints as a combination "
 +                    "of SHAKE, SETTLE and CCMA. Accuracy is based on the SHAKE tolerance set "
 +                    "by the \"shake_tol\" option.");
 +
 +    if (ir->nwall != 0)
 +        gmx_fatal(FARGS,"OpenMM does not support walls.");
 +
 +    if (ir->ePull != epullNO)
 +        gmx_fatal(FARGS,"OpenMM does not support pulling.");
 +
 +    /* check for interaction types: any non-empty interaction list below F_EPOT
 +       that is not in the white-list of the condition is fatal */
 +    for (i = 0; i < F_EPOT; i++)
 +    {
 +        if (!(i == F_CONSTR ||
 +            i == F_SETTLE   ||
 +            i == F_BONDS    ||            
 +            i == F_HARMONIC ||
 +            i == F_UREY_BRADLEY ||
 +            i == F_ANGLES   ||
 +            i == F_PDIHS    ||
 +            i == F_RBDIHS   ||
 +            i == F_PIDIHS   ||
 +            i == F_IDIHS    ||
 +            i == F_LJ14     ||
 +            i == F_GB12     || /* The GB parameters are hardcoded both in */
 +            i == F_GB13     || /* Gromacs and OpenMM */
 +            i == F_GB14   ) &&
 +            top->idef.il[i].nr > 0)
 +        {
 +            gmx_fatal(FARGS, "OpenMM does not support (some) of the provided interaction " 
 +                    "type(s) (%s) ", interaction_function[i].longname);
 +        }
 +    }
 +
 +    if (ir->efep != efepNO)
 +        gmx_fatal(FARGS,"OpenMM does not support free energy calculations.");
 +
 +    if (ir->opts.ngacc > 1)
 +        gmx_fatal(FARGS,"OpenMM does not support non-equilibrium MD (accelerated groups).");
 +
 +    if (IR_ELEC_FIELD(*ir))
 +        gmx_fatal(FARGS,"OpenMM does not support electric fields.");
 +
 +    if (ir->bQMMM)
 +        gmx_fatal(FARGS,"OpenMM does not support QMMM calculations.");
 +
 +    if (ir->rcoulomb != ir->rvdw)
 +        gmx_fatal(FARGS,"OpenMM uses a single cutoff for both Coulomb "
 +                  "and VdW interactions. Please set rcoulomb equal to rvdw.");
 +    
 +    if (EEL_FULL(ir->coulombtype))
 +    {
 +        if (ir->ewald_geometry == eewg3DC)
 +            gmx_fatal(FARGS,"OpenMM supports only Ewald 3D geometry.");
 +        if (ir->epsilon_surface != 0)
 +            gmx_fatal(FARGS,"OpenMM does not support dipole correction in Ewald summation.");
 +    }
 +
 +    if (TRICLINIC(state->box))        
 +    {
 +        gmx_fatal(FARGS,"OpenMM does not support triclinic unit cells.");
 +    }
 +
 +    /* XXX this is just debugging code to disable the combination rule check */
 +    if ( isStringEqNCase(opt->getOptionValue("check-combrule"), "yes") )
 +    {
 +    /* As OpenMM by default uses hardcoded combination rules 
 +       sigma_ij = (sigma_i + sigma_j)/2, eps_ij = sqrt(eps_i * eps_j)
 +       we need to check whether the force field params obey this 
 +       and if not, we can't use this force field so we exit 
 +       grace-fatal-fully. */
 +    real *nbfp = fr->nbfp;
 +    /* NOTE: despite its name, natoms holds fr->ntype here and is used as the
 +       dimension argument of the C6()/C12() accessors */
 +    natoms = fr->ntype;
 +    if (debug) 
 +    {   
 +        fprintf(debug, ">> Atom parameters: <<\n%10s%5s %5s %5s %5s COMB\n", 
 +                "", "i-j", "j-i", "i-i", "j-j");
 +    }
 +    /* loop over all i-j pairs with j < i and verify if 
 +       sigma_ij = sigma_ji = sigma_comb and eps_ij = eps_ji = eps_comb */
 +    for (i = 0; i < natoms; i++)
 +    {
 +        /* i-i */
 +        c12 = C12(nbfp, natoms, i, i);
 +        c6  = C6(nbfp,  natoms, i, i);
 +        convert_c_12_6(c12, c6, &sigma_ii, &eps_ii);
 +
 +        for (j = 0; j < i; j++)
 +        {
 +            /* i-j */
 +            c12 = C12(nbfp, natoms, i, j);
 +            c6  = C6(nbfp,  natoms, i, j);
 +            convert_c_12_6(c12, c6, &sigma_ij, &eps_ij);
 +            /* j-i */
 +            c12 = C12(nbfp, natoms, j, i);
 +            c6  = C6(nbfp,  natoms, j, i);
 +            convert_c_12_6(c12, c6, &sigma_ji, &eps_ji);
 +            /* j-j */
 +            c12 = C12(nbfp, natoms, j, j);
 +            c6  = C6(nbfp,  natoms, j, j);
 +            convert_c_12_6(c12, c6, &sigma_jj, &eps_jj);
 +            /* OpenMM hardcoded combination rules */
 +            sigma_comb = COMBRULE_SIGMA(sigma_ii, sigma_jj);
 +            eps_comb = COMBRULE_EPS(eps_ii, eps_jj);
 +  
 +            if (debug)
 +            {
 +                fprintf(debug, "i=%-3d j=%-3d", i, j);
 +                fprintf(debug, "%-11s", "sigma");
 +                fprintf(debug, "%5.3f %5.3f %5.3f %5.3f %5.3f\n",  
 +                        sigma_ij, sigma_ji, sigma_ii, sigma_jj, sigma_comb);
 +                fprintf(debug, "%11s%-11s", "", "epsilon");
 +                fprintf(debug, "%5.3f %5.3f %5.3f %5.3f %5.3f\n", 
 +                        eps_ij, eps_ji, eps_ii, eps_jj, eps_comb);
 +            }
 +
 +            /* check the values against the rule used by omm; pairs whose
 +               eps_ij or eps_ji is within the tolerance of zero are not checked */
 +            if((fabs(eps_ij) > COMBRULE_CHK_TOL && 
 +                fabs(eps_ji) > COMBRULE_CHK_TOL) &&
 +               (fabs(sigma_comb - sigma_ij) > COMBRULE_CHK_TOL ||
 +               fabs(sigma_comb - sigma_ji) > COMBRULE_CHK_TOL ||
 +               fabs(eps_comb - eps_ij) > COMBRULE_CHK_TOL ||
 +               fabs(eps_comb - eps_ji) > COMBRULE_CHK_TOL ))
 +            {
 +                gmx_fatal(FARGS,
 +                        "The combination rules of the used force-field do not "
 +                        "match the one supported by OpenMM:  "
 +                        "sigma_ij = (sigma_i + sigma_j)/2, eps_ij = sqrt(eps_i * eps_j). "
 +                        "Switch to a force-field that uses these rules in order to "
 +                        "simulate this system using OpenMM.\n");                        
 +            }
 +        }
 +    }
 +    if (debug) { fprintf(debug, ">><<\n\n"); }
 +
 +    /* if we got here, log that everything is fine */
 +    if (debug)
 +    {
 +        fprintf(debug, ">> The combination rule of the used force matches the one used by OpenMM.\n");
 +    }
 +    /* NOTE(review): fplog is written to without a NULL check - confirm callers always pass a valid file */
 +    fprintf(fplog, "The combination rule of the used force field matches the one used by OpenMM.\n");   
 +
 +    } /* if (are we checking the combination rules) ... */
 +}
 +
 +
 +/*!
 + * \brief Initialize OpenMM, run sanity/consistency checks, and return a pointer to 
 + * the OpenMMData.
 + * 
 + * Various gromacs data structures are passed that contain the parameters, state and 
 + * other properties of the system to simulate. These serve as input for initializing 
 + * OpenMM. Besides, a set of miscellaneous actions is taken:
 + *  - OpenMM plugins are loaded;
 + *  - platform options in \p platformOptStr are parsed and checked; 
 + *  - Gromacs parameters are checked for OpenMM support and consistency;
 + *  - after the OpenMM is initialized memtest executed in the same GPU context.
 + * 
 + * \param[in] fplog             Gromacs log file handler.
 + * \param[in] platformOptStr    Platform option string. 
 + * \param[in] ir                The Gromacs input parameters, see ::t_inputrec
 + * \param[in] top_global        Gromacs system topology, \see ::gmx_mtop_t
 + * \param[in] top               Gromacs node local topology, \see gmx_localtop_t
 + * \param[in] mdatoms           Gromacs atom parameters, \see ::t_mdatoms
 + * \param[in] fr                \see ::t_forcerec
 + * \param[in] state             Gromacs systems state, \see ::t_state
 + * \returns                     Pointer to the OpenMMData, passed back as void*.
 + * 
 + */
 +void* openmm_init(FILE *fplog, const char *platformOptStr,
 +                  t_inputrec *ir,
 +                  gmx_mtop_t *top_global, gmx_localtop_t *top,
 +                  t_mdatoms *mdatoms, t_forcerec *fr, t_state *state)
 +{
 +
 +    char warn_buf[STRLEN];
 +    static gmx_bool hasLoadedPlugins = false;
 +    string usedPluginDir;
 +    int devId;
 +
 +    try
 +    {
 +        if (!hasLoadedPlugins)
 +        {
 +            vector<string> loadedPlugins;
 +            /*  Look for OpenMM plugins at various locations (listed in order of priority):
 +                - on the path in OPENMM_PLUGIN_DIR environment variable if this is specified
 +                - on the path in the OPENMM_PLUGIN_DIR macro that is set by the build script
 +                - at the default location assumed by OpenMM
 +            */
 +            /* env var */
 +            char *pluginDir = getenv("OPENMM_PLUGIN_DIR");
 +            trim(pluginDir);
 +            /* no env var or empty */
 +            if (pluginDir != NULL && *pluginDir != '\0')
 +            {
 +                loadedPlugins = Platform::loadPluginsFromDirectory(pluginDir);
 +                if (!loadedPlugins.empty())
 +                {
 +                    hasLoadedPlugins = true;
 +                    usedPluginDir = pluginDir;
 +                }
 +                else
 +                {
 +                    gmx_fatal(FARGS, "The directory provided in the OPENMM_PLUGIN_DIR environment variable "
 +                              "(%s) does not contain valid OpenMM plugins. Check your OpenMM installation!", 
 +                              pluginDir);
 +                }
 +            }
 +
 +            /* macro set at build time  */
 +#ifdef OPENMM_PLUGIN_DIR
 +            if (!hasLoadedPlugins)
 +            {
 +                loadedPlugins = Platform::loadPluginsFromDirectory(OPENMM_PLUGIN_DIR);
 +                if (!loadedPlugins.empty())
 +                {
 +                    hasLoadedPlugins = true;
 +                    usedPluginDir = OPENMM_PLUGIN_DIR;
 +                }
 +            }
 +#endif
 +            /* default loocation */
 +            if (!hasLoadedPlugins)
 +            {
 +                loadedPlugins = Platform::loadPluginsFromDirectory(Platform::getDefaultPluginsDirectory());
 +                if (!loadedPlugins.empty())
 +                {
 +                    hasLoadedPlugins = true;
 +                    usedPluginDir = Platform::getDefaultPluginsDirectory();
 +                }
 +            }
 +
 +            /* if there are still no plugins loaded there won't be any */
 +            if (!hasLoadedPlugins)
 +            {
 +                gmx_fatal(FARGS, "No OpenMM plugins were found! You can provide the"
 +                          " plugin directory in the OPENMM_PLUGIN_DIR environment variable.", pluginDir);
 +            }
 +
 +            fprintf(fplog, "\nOpenMM plugins loaded from directory %s:\t", usedPluginDir.c_str());
 +            for (int i = 0; i < (int)loadedPlugins.size(); i++)
 +            {
 +                fprintf(fplog, "%s, ", loadedPlugins[i].c_str());
 +            }
 +            fprintf(fplog, "\n");
 +        }
 +
 +        /* parse option string */
 +        GmxOpenMMPlatformOptions *opt = new GmxOpenMMPlatformOptions(platformOptStr);
 +        devId = atoi(opt->getOptionValue("deviceid").c_str());
 +
 +        if (debug)
 +        {
 +            opt->print();
 +        }
 +
 +        /* check whether the Gromacs options are compatible with OpenMM */
 +        checkGmxOptions(fplog, opt, ir, top, fr, state);
 +
 +        /* Create the system. */
 +        const t_idef& idef = top->idef;
 +        const int numAtoms = top_global->natoms;
 +        const int numConstraints = idef.il[F_CONSTR].nr/3;
 +        const int numSettle = idef.il[F_SETTLE].nr/2;
 +        const int numBonds = idef.il[F_BONDS].nr/3;
 +        const int numHarmonic = idef.il[F_HARMONIC].nr/3;
 +        const int numUB = idef.il[F_UREY_BRADLEY].nr/4;
 +        const int numAngles = idef.il[F_ANGLES].nr/4;
 +        const int numPeriodic = idef.il[F_PDIHS].nr/5;
 +        const int numPeriodicImproper = idef.il[F_PIDIHS].nr/5;
 +        const int numRB = idef.il[F_RBDIHS].nr/5;
 +        const int numImproperDih = idef.il[F_IDIHS].nr/5;
 +        const int num14 = idef.il[F_LJ14].nr/3;
 +        System* sys = new System();
 +        if (ir->nstcomm > 0)
 +            sys->addForce(new CMMotionRemover(ir->nstcomm));
 +
 +        /* Set bonded force field terms. */
 +
 +		/* 
 +		 * CUDA platform currently doesn't support more than one
 +		 * instance of a force object, so we pack all forces that
 +		 * use the same form into one.
 +		*/
 +
 +        const int* bondAtoms = (int*) idef.il[F_BONDS].iatoms;
 +        HarmonicBondForce* bondForce = new HarmonicBondForce();
 +        sys->addForce(bondForce);
 +        int offset = 0;
 +        for (int i = 0; i < numBonds; ++i)
 +        {
 +            int type = bondAtoms[offset++];
 +            int atom1 = bondAtoms[offset++];
 +            int atom2 = bondAtoms[offset++];
 +            bondForce->addBond(atom1, atom2,
 +                               idef.iparams[type].harmonic.rA, idef.iparams[type].harmonic.krA);
 +        }
 +
 +        const int* harmonicAtoms = (int*) idef.il[F_HARMONIC].iatoms;
 +        offset = 0;
 +        for (int i = 0; i < numHarmonic; ++i)
 +        {
 +            int type = harmonicAtoms[offset++];
 +            int atom1 = harmonicAtoms[offset++];
 +            int atom2 = harmonicAtoms[offset++];
 +            bondForce->addBond(atom1, atom2,
 +                               idef.iparams[type].harmonic.rA, idef.iparams[type].harmonic.krA);
 +        }
 +
 +		/* Set the angle force field terms */
 +        const int* angleAtoms = (int*) idef.il[F_ANGLES].iatoms;
 +        HarmonicAngleForce* angleForce = new HarmonicAngleForce();
 +        sys->addForce(angleForce);
 +        offset = 0;
 +        for (int i = 0; i < numAngles; ++i)
 +        {
 +            int type = angleAtoms[offset++];
 +            int atom1 = angleAtoms[offset++];
 +            int atom2 = angleAtoms[offset++];
 +            int atom3 = angleAtoms[offset++];
 +            angleForce->addAngle(atom1, atom2, atom3, 
 +                    idef.iparams[type].harmonic.rA*M_PI/180.0, idef.iparams[type].harmonic.krA);
 +        }
 +
 +        /* Urey-Bradley includes both the angle and bond potential for 1-3 interactions */
 +        const int* ubAtoms = (int*) idef.il[F_UREY_BRADLEY].iatoms;
 +		/* HarmonicBondForce* ubBondForce = new HarmonicBondForce(); */
 +		/*  HarmonicAngleForce* ubAngleForce = new HarmonicAngleForce(); */
 +        /* sys->addForce(ubBondForce); */
 +        /* sys->addForce(ubAngleForce); */
 +        offset = 0;
 +        for (int i = 0; i < numUB; ++i)
 +        {
 +            int type = ubAtoms[offset++];
 +            int atom1 = ubAtoms[offset++];
 +            int atom2 = ubAtoms[offset++];
 +            int atom3 = ubAtoms[offset++];
 +            /* ubBondForce->addBond(atom1, atom3, */
 +            bondForce->addBond(atom1, atom3,
-                                idef.iparams[type].u_b.r13, idef.iparams[type].u_b.kUB);
++                               idef.iparams[type].u_b.r13A, idef.iparams[type].u_b.kUBA);
 +            /* ubAngleForce->addAngle(atom1, atom2, atom3, */ 
 +            angleForce->addAngle(atom1, atom2, atom3, 
-                     idef.iparams[type].u_b.theta*M_PI/180.0, idef.iparams[type].u_b.ktheta);
++                    idef.iparams[type].u_b.thetaA*M_PI/180.0, idef.iparams[type].u_b.kthetaA);
 +        }
 +
 +		/* Set proper dihedral terms */
 +        const int* periodicAtoms = (int*) idef.il[F_PDIHS].iatoms;
 +        PeriodicTorsionForce* periodicForce = new PeriodicTorsionForce();
 +        sys->addForce(periodicForce);
 +        offset = 0;
 +        for (int i = 0; i < numPeriodic; ++i)
 +        {
 +            int type = periodicAtoms[offset++];
 +            int atom1 = periodicAtoms[offset++];
 +            int atom2 = periodicAtoms[offset++];
 +            int atom3 = periodicAtoms[offset++];
 +            int atom4 = periodicAtoms[offset++];
 +            periodicForce->addTorsion(atom1, atom2, atom3, atom4,
 +                                      idef.iparams[type].pdihs.mult,
 +                                      idef.iparams[type].pdihs.phiA*M_PI/180.0, 
 +                                      idef.iparams[type].pdihs.cpA);
 +        }
 +
 +		/* Set improper dihedral terms that are represented by a periodic function (as in AMBER FF) */
 +        const int* periodicImproperAtoms = (int*) idef.il[F_PIDIHS].iatoms;
 +        /* PeriodicTorsionForce* periodicImproperForce = new PeriodicTorsionForce(); */
 +        /* sys->addForce(periodicImproperForce); */
 +        offset = 0;
 +        for (int i = 0; i < numPeriodicImproper; ++i)
 +        {
 +            int type = periodicImproperAtoms[offset++];
 +            int atom1 = periodicImproperAtoms[offset++];
 +            int atom2 = periodicImproperAtoms[offset++];
 +            int atom3 = periodicImproperAtoms[offset++];
 +            int atom4 = periodicImproperAtoms[offset++];
 +            /* periodicImproperForce->addTorsion(atom1, atom2, atom3, atom4, */
 +            periodicForce->addTorsion(atom1, atom2, atom3, atom4,
 +                                      idef.iparams[type].pdihs.mult,
 +                                      idef.iparams[type].pdihs.phiA*M_PI/180.0,
 +                                      idef.iparams[type].pdihs.cpA);
 +        }
 +
 +        /* Ryckaert-Bellemans dihedrals */
 +        const int* rbAtoms = (int*) idef.il[F_RBDIHS].iatoms;
 +        RBTorsionForce* rbForce = new RBTorsionForce();
 +        sys->addForce(rbForce);
 +        offset = 0;
 +        for (int i = 0; i < numRB; ++i)
 +        {
 +            int type = rbAtoms[offset++];
 +            int atom1 = rbAtoms[offset++];
 +            int atom2 = rbAtoms[offset++];
 +            int atom3 = rbAtoms[offset++];
 +            int atom4 = rbAtoms[offset++];
 +            rbForce->addTorsion(atom1, atom2, atom3, atom4,
 +                                idef.iparams[type].rbdihs.rbcA[0], idef.iparams[type].rbdihs.rbcA[1],
 +                                idef.iparams[type].rbdihs.rbcA[2], idef.iparams[type].rbdihs.rbcA[3],
 +                                idef.iparams[type].rbdihs.rbcA[4], idef.iparams[type].rbdihs.rbcA[5]);
 +        }
 +
 +		/* Set improper dihedral terms (as in CHARMM FF) */
 +        const int* improperDihAtoms = (int*) idef.il[F_IDIHS].iatoms;
 +		CustomTorsionForce* improperDihForce = new CustomTorsionForce("2.0*k*asin(sin((theta-theta0)/2))^2");
 +        sys->addForce(improperDihForce);
 +		improperDihForce->addPerTorsionParameter("k");
 +		improperDihForce->addPerTorsionParameter("theta0");
 +		vector<double> improperDihParameters(2);
 +        offset = 0;
 +        for (int i = 0; i < numImproperDih; ++i)
 +        {
 +            int type = improperDihAtoms[offset++];
 +            int atom1 = improperDihAtoms[offset++];
 +            int atom2 = improperDihAtoms[offset++];
 +            int atom3 = improperDihAtoms[offset++];
 +            int atom4 = improperDihAtoms[offset++];
 +			improperDihParameters[0] = idef.iparams[type].harmonic.krA;
 +			improperDihParameters[1] = idef.iparams[type].harmonic.rA*M_PI/180.0;
 +            improperDihForce->addTorsion(atom1, atom2, atom3, atom4,
 +                                improperDihParameters);
 +        }
 +
 +        /* Set nonbonded parameters and masses. */
 +        int ntypes = fr->ntype;
 +        int* types = mdatoms->typeA;
 +        real* nbfp = fr->nbfp;
 +        real* charges = mdatoms->chargeA;
 +        real* masses = mdatoms->massT;
 +        NonbondedForce* nonbondedForce = new NonbondedForce();
 +        sys->addForce(nonbondedForce);
 +        
 +        switch (ir->ePBC)
 +        {
 +        case epbcNONE:
 +            if (ir->rcoulomb == 0)
 +            {
 +                nonbondedForce->setNonbondedMethod(NonbondedForce::NoCutoff);
 +            }
 +            else
 +            {
 +                nonbondedForce->setNonbondedMethod(NonbondedForce::CutoffNonPeriodic);
 +            }
 +            break;
 +        case epbcXYZ:
 +            switch (ir->coulombtype)
 +            {
 +            case eelCUT:
 +            case eelRF:
 +            case eelGRF:
 +            case eelRF_NEC:
 +            case eelRF_ZERO:
 +                nonbondedForce->setNonbondedMethod(NonbondedForce::CutoffPeriodic);
 +                break;
 +
 +            case eelEWALD:
 +                nonbondedForce->setNonbondedMethod(NonbondedForce::Ewald);
 +                break;
 +
 +            case eelPME:
 +                nonbondedForce->setNonbondedMethod(NonbondedForce::PME);
 +                break;
 +
 +            default:
 +                gmx_fatal(FARGS,"Internal error: you should not see this message, it means that the"
 +                          "electrosatics option check failed. Please report this error!");
 +            }        
 +            sys->setDefaultPeriodicBoxVectors(Vec3(state->box[0][0], 0, 0),
 +                                       Vec3(0, state->box[1][1], 0), Vec3(0, 0, state->box[2][2]));                    
 +            nonbondedForce->setCutoffDistance(ir->rcoulomb);
 +           
 +            break;
 +        default:            
 +            gmx_fatal(FARGS,"OpenMM supports only full periodic boundary conditions "
 +                              "(pbc = xyz), or none (pbc = no).");
 +        }
 +
 +
 +        /* Fix for PME and Ewald error tolerance 
 +         *
 +		 *  OpenMM uses approximate formulas to calculate the Ewald parameter:
 +		 *  alpha = (1.0/cutoff)*sqrt(-log(2.0*tolerance));
 +		 *  and the grid spacing for PME:
 +		 *  gridX = ceil(2*alpha*box[0][0]/3*(pow(tol, 0.2)))
 +		 *  gridY = ceil(2*alpha*box[1][1]/3*(pow(tol, 0.2)));
 +		 *  gridZ = ceil(2*alpha*box[2][2]/3*(pow(tol, 0.2)));
 +		 *
 +		 *  
 +		 *  If the default ewald_rtol=1e-5 is used we silently adjust the value to the 
 +		 *  OpenMM default of 5e-4 otherwise a warning is issued about the action taken. 
 +		 *
 +		*/
 +        double corr_ewald_rtol = 50.0 * ir->ewald_rtol;
 +        if ((ir->ePBC == epbcXYZ) && 
 +            (ir->coulombtype == eelEWALD || ir->coulombtype == eelPME))
 +        {
 +            if (debug)
 +            {
 +                fprintf(debug, ">> ewald_rtol = %e (corrected = %e) \n",
 +                    ir->ewald_rtol, corr_ewald_rtol);
 +            }
 +
 +            if (fabs(ir->ewald_rtol - 1e-5) > 1e-10)
 +            {
 +                gmx_warning("OpenMM uses the ewald_rtol parameter with approximate formulas "
 +                        "to calculate the alpha and grid spacing parameters of the Ewald "
 +                        "and PME methods. This tolerance need to be corrected in order to get "
 +                        "settings close to the ones used in GROMACS. Although the internal correction "
 +                        "should work for any reasonable value of ewald_rtol, using values other than "
 +                        "the default 1e-5 might cause incorrect behavior.");
 +
 +                if (corr_ewald_rtol > 1)
 +                {
 +                    gmx_fatal(FARGS, "The ewald_rtol accuracy term is >1 after the "
 +                            "adjustment for OpenMM (%e)", corr_ewald_rtol);
 +                }
 +            }
 +            nonbondedForce->setEwaldErrorTolerance(corr_ewald_rtol);
 +        }
 +
 +        for (int i = 0; i < numAtoms; ++i)
 +        {
 +            double c12 = nbfp[types[i]*2*ntypes+types[i]*2+1];
 +            double c6 = nbfp[types[i]*2*ntypes+types[i]*2];
 +            double sigma=0.0, epsilon=0.0;
 +            convert_c_12_6(c12, c6, &sigma, &epsilon);
 +            nonbondedForce->addParticle(charges[i], sigma, epsilon);
 +            sys->addParticle(masses[i]);
 +        }
 +
 +        // Build a table of all exclusions.
 +        vector<set<int> > exclusions(numAtoms);
 +        for (int i = 0; i < numAtoms; i++)
 +        {
 +            int start = top->excls.index[i];
 +            int end = top->excls.index[i+1];
 +            for (int j = start; j < end; j++)
 +                exclusions[i].insert(top->excls.a[j]);
 +        }
 +
 +        // Record the 1-4 interactions, and remove them from the list of exclusions.
 +        const int* nb14Atoms = (int*) idef.il[F_LJ14].iatoms;
 +        offset = 0;
 +        for (int i = 0; i < num14; ++i)
 +        {
 +            int type = nb14Atoms[offset++];
 +            int atom1 = nb14Atoms[offset++];
 +            int atom2 = nb14Atoms[offset++];
 +            double sigma=0, epsilon=0;
 +            convert_c_12_6(idef.iparams[type].lj14.c12A, 
 +                    idef.iparams[type].lj14.c6A,
 +                    &sigma, &epsilon);
 +            nonbondedForce->addException(atom1, atom2,
 +                                         fr->fudgeQQ*charges[atom1]*charges[atom2], sigma, epsilon);
 +            exclusions[atom1].erase(atom2);
 +            exclusions[atom2].erase(atom1);
 +        }
 +
 +        // Record exclusions.
 +        for (int i = 0; i < numAtoms; i++)
 +        {
 +            for (set<int>::const_iterator iter = exclusions[i].begin(); iter != exclusions[i].end(); ++iter)
 +            {
 +                if (i < *iter)
 +                {
 +                    nonbondedForce->addException(i, *iter, 0.0, 1.0, 0.0);
 +                }
 +            }
 +        }
 +
 +        // Add GBSA if needed.
 +        if (ir->implicit_solvent == eisGBSA)
 +        {
 +            gmx_warning("The OBC scale factors alpha, beta and gamma are hardcoded in OpenMM with the default Gromacs values.");
 +            t_atoms atoms       = gmx_mtop_global_atoms(top_global);
 +            GBSAOBCForce* gbsa  = new GBSAOBCForce();
 +
 +            sys->addForce(gbsa);
 +            gbsa->setSoluteDielectric(ir->epsilon_r);
 +            gbsa->setSolventDielectric(ir->gb_epsilon_solvent);
 +            gbsa->setCutoffDistance(nonbondedForce->getCutoffDistance());
 +            if (nonbondedForce->getNonbondedMethod() == NonbondedForce::NoCutoff)
 +                gbsa->setNonbondedMethod(GBSAOBCForce::NoCutoff);
 +            else if (nonbondedForce->getNonbondedMethod() == NonbondedForce::CutoffNonPeriodic)
 +                gbsa->setNonbondedMethod(GBSAOBCForce::CutoffNonPeriodic);
 +            else if (nonbondedForce->getNonbondedMethod() == NonbondedForce::CutoffPeriodic)
 +                gbsa->setNonbondedMethod(GBSAOBCForce::CutoffPeriodic);
 +            else
 +                gmx_fatal(FARGS,"OpenMM supports only Reaction-Field electrostatics with OBC/GBSA.");
 +
 +            for (int i = 0; i < numAtoms; ++i)
 +            {
 +                gbsa->addParticle(charges[i],
 +                                  top_global->atomtypes.gb_radius[atoms.atom[i].type],
 +                                  top_global->atomtypes.S_hct[atoms.atom[i].type]);
 +            }
 +            free_t_atoms(&atoms, FALSE);
 +        }
 +
 +        // Set constraints.
 +        const int* constraintAtoms = (int*) idef.il[F_CONSTR].iatoms;
 +        offset = 0;
 +        for (int i = 0; i < numConstraints; ++i)
 +        {
 +            int type = constraintAtoms[offset++];
 +            int atom1 = constraintAtoms[offset++];
 +            int atom2 = constraintAtoms[offset++];
 +            sys->addConstraint(atom1, atom2, idef.iparams[type].constr.dA);
 +        }
 +        const int* settleAtoms = (int*) idef.il[F_SETTLE].iatoms;
 +        offset = 0;
 +        for (int i = 0; i < numSettle; ++i)
 +        {
 +            int type = settleAtoms[offset++];
 +            int oxygen = settleAtoms[offset++];
 +            sys->addConstraint(oxygen, oxygen+1, idef.iparams[type].settle.doh);
 +            sys->addConstraint(oxygen, oxygen+2, idef.iparams[type].settle.doh);
 +            sys->addConstraint(oxygen+1, oxygen+2, idef.iparams[type].settle.dhh);
 +        }
 +
 +        // Create an integrator for simulating the system.
 +        double friction = (ir->opts.tau_t[0] == 0.0 ? 0.0 : 1.0/ir->opts.tau_t[0]);
 +        Integrator* integ;
 +        if (ir->eI == eiBD)
 +        {
 +            integ = new BrownianIntegrator(ir->opts.ref_t[0], friction, ir->delta_t);
 +            static_cast<BrownianIntegrator*>(integ)->setRandomNumberSeed(ir->ld_seed); 
 +        }
 +        else if (EI_SD(ir->eI))
 +        {
 +            integ = new LangevinIntegrator(ir->opts.ref_t[0], friction, ir->delta_t);
 +            static_cast<LangevinIntegrator*>(integ)->setRandomNumberSeed(ir->ld_seed); 
 +        }
 +        else 
 +        {
 +            integ = new VerletIntegrator(ir->delta_t);
 +            if ( ir->etc != etcNO)
 +            {
 +                AndersenThermostat* thermostat = new AndersenThermostat(ir->opts.ref_t[0], friction); 
 +                sys->addForce(thermostat);
 +            }           
 +        }
 +
 +		// Add pressure coupling
 +        if (ir->epc != epcNO)
 +		{
 +          // convert gromacs pressure tensor to a scalar
 +          double pressure = (ir->ref_p[0][0] + ir->ref_p[1][1] + ir->ref_p[2][2]) / 3.0;
 +          int frequency = int(ir->tau_p / ir->delta_t); // update frequency in time steps
 +          if (frequency < 1) frequency = 1;
 +          double temperature = ir->opts.ref_t[0]; // in kelvin
 +          sys->addForce(new MonteCarloBarostat(pressure, temperature, frequency));
 +		}
 +
 +        integ->setConstraintTolerance(ir->shake_tol);
 +
 +        // Create a context and initialize it.
 +        Context* context = NULL;
 +
 +        /*      
 +        OpenMM could automatically select the "best" GPU, however we're not
 +        going to let it do that for now, as the current algorithm is very rudimentary
 +        and we anyway support only CUDA.        
 +        if (platformOptStr == NULL || platformOptStr == "")
 +        {
 +            context = new Context(*sys, *integ);
 +        }
 +        else
 +        */        
 +        {
 +            /* which platform should we use */
 +            for (int i = 0; i < (int)Platform::getNumPlatforms() && context == NULL; i++)
 +            {
 +                if (isStringEqNCase(opt->getOptionValue("platform"), Platform::getPlatform(i).getName()))
 +                {
 +                    Platform& platform = Platform::getPlatform(i);
 +                    // set standard properties
 +                    platform.setPropertyDefaultValue("CudaDevice", opt->getOptionValue("deviceid"));
 +                    // TODO add extra properties
 +                    context = new Context(*sys, *integ, platform);
 +                }
 +            }
 +            if (context == NULL)
 +            {
 +                gmx_fatal(FARGS, "The requested platform \"%s\" could not be found.", 
 +                        opt->getOptionValue("platform").c_str());
 +            }
 +        }
 +
 +        Platform& platform = context->getPlatform();
 +        fprintf(fplog, "Gromacs will use the OpenMM platform: %s\n", platform.getName().c_str());
 +
 +        const vector<string>& properties = platform.getPropertyNames();
 +        if (debug)
 +        {
 +            for (int i = 0; i < (int)properties.size(); i++)
 +            {
 +                fprintf(debug, ">> %s: %s\n", properties[i].c_str(), 
 +                        platform.getPropertyValue(*context, properties[i]).c_str());
 +            }
 +        }
 +
 +        /* only for CUDA */
 +        if (isStringEqNCase(opt->getOptionValue("platform"), "CUDA"))
 +        {
 +            int tmp;
 +            if (!from_string<int>(tmp, platform.getPropertyValue(*context, "CudaDevice"), std::dec))
 +            {
 +                gmx_fatal(FARGS, "Internal error: couldn't determine the device selected by OpenMM");
 +
 +            }
 +
 +            /* For now this is just to double-check if OpenMM selected the GPU we wanted,
 +            but when we'll let OpenMM select the GPU automatically, it will query the deviceId.
 +            */            
 +            if (tmp != devId)
 +            {
 +                gmx_fatal(FARGS, "Internal error: OpenMM is using device #%d"
 +                        "while initialized for device #%d", tmp, devId);
 +            }        
 +            
 +            /* check GPU compatibility */
 +            char gpuname[STRLEN];
 +            devId = atoi(opt->getOptionValue("deviceid").c_str());
 +            if (!is_supported_cuda_gpu(-1, gpuname))
 +            {
 +                if (!gmx_strcasecmp(opt->getOptionValue("force-device").c_str(), "yes"))
 +                {
 +                    sprintf(warn_buf, "Non-supported GPU selected (#%d, %s), forced continuing."
 +                            "Note, that the simulation can be slow or it migth even crash.", 
 +                            devId, gpuname);
 +                    fprintf(fplog, "%s\n", warn_buf);
 +                    gmx_warning(warn_buf);
 +                }
 +                else
 +                {
 +                    gmx_fatal(FARGS, "The selected GPU (#%d, %s) is not supported by Gromacs! "
 +                              "Most probably you have a low-end GPU which would not perform well, " 
 +                              "or new hardware that has not been tested with the current release. "
 +                              "If you still want to try using the device, use the force-device=yes option.", 
 +                              devId, gpuname);
 +                }
 +            }
 +            else
 +            {
 +                fprintf(fplog, "Gromacs will run on the GPU #%d (%s).\n", devId, gpuname);
 +            }
 +        }
 +        
 +        /* only for CUDA */
 +        if (isStringEqNCase(opt->getOptionValue("platform"), "CUDA"))
 +        {
 +            /* pre-simulation memtest */
 +            runMemtest(fplog, -1, "Pre", opt);
 +        }
 +
 +        vector<Vec3> pos(numAtoms);
 +        vector<Vec3> vel(numAtoms);
 +        for (int i = 0; i < numAtoms; ++i)
 +        {
 +            pos[i] = Vec3(state->x[i][0], state->x[i][1], state->x[i][2]);
 +            vel[i] = Vec3(state->v[i][0], state->v[i][1], state->v[i][2]);
 +        }
 +        context->setPositions(pos);
 +        context->setVelocities(vel);
 +
 +        // Return a structure containing the system, integrator, and context.
 +        OpenMMData* data = new OpenMMData();
 +        data->system = sys;
 +        data->integrator = integ;
 +        data->context = context;
 +        data->removeCM = (ir->nstcomm > 0);
 +        data->platformOpt = opt;
 +        return data;
 +    }
 +    catch (std::exception& e)
 +    {
 +        gmx_fatal(FARGS, "OpenMM exception caught while initializating: %s", e.what());
 +    } 
 +    return NULL; /* just to avoid warnings */
 +}
 +
 +/*!
 + * \brief Advance the OpenMM simulation by a single integration step.
 + *
 + * Any std::exception raised while stepping is reported through gmx_fatal().
 + *
 + * \param[in] data  OpenMMData object created by openmm_init().
 + */
 +void openmm_take_one_step(void* data)
 +{
 +    OpenMMData* ommData = static_cast<OpenMMData*>(data);
 +    try
 +    {
 +        ommData->integrator->step(1);
 +    }
 +    catch (std::exception& e)
 +    {
 +        gmx_fatal(FARGS, "OpenMM exception caught while taking a step: %s", e.what());
 +    }
 +}
 +
 +/*!
 + * \brief Advance the OpenMM simulation by several integration steps.
 + *
 + * Any std::exception raised while stepping is reported through gmx_fatal().
 + *
 + * \param[in] data   OpenMMData object created by openmm_init().
 + * \param[in] nstep  Number of steps handed to the integrator's step() call.
 + */
 +void openmm_take_steps(void* data, int nstep)
 +{
 +    OpenMMData* ommData = static_cast<OpenMMData*>(data);
 +    try
 +    {
 +        ommData->integrator->step(nstep);
 +    }
 +    catch (std::exception& e)
 +    {
 +        gmx_fatal(FARGS, "OpenMM exception caught while taking a step: %s", e.what());
 +    }
 +}
 +
 +/*!
 + * \brief Clean up the data structures created for OpenMM.
 + *
 + * If the "platform" option matches "CUDA", runMemtest() is invoked with the
 + * "Post" label before anything is released. The context is then destroyed
 + * ahead of the integrator and system it was constructed from, followed by
 + * the platform options and the OpenMMData container itself.
 + * Passing a NULL \p data is a no-op.
 + *
 + * \param[in] fplog Log file pointer.
 + * \param[in] data  OpenMMData object created by openmm_init().
 + */
 +void openmm_cleanup(FILE* fplog, void* data)
 +{
 +    OpenMMData* d = static_cast<OpenMMData*>(data);
 +    if (d == NULL)
 +    {
 +        return;
 +    }
 +
 +    /* only for CUDA */
 +    if (isStringEqNCase(d->platformOpt->getOptionValue("platform"), "CUDA"))
 +    {
 +        /* post-simulation memtest */
 +        runMemtest(fplog, -1, "Post", d->platformOpt);
 +    }
 +
 +    /* The context was created from references to the system and the
 +     * integrator, so it must go first; deleting those two while the context
 +     * is still alive would leave it with dangling references. */
 +    delete d->context;
 +    delete d->integrator;
 +    delete d->system;
 +    delete d->platformOpt;
 +    delete d;
 +}
 +
 +/*!
 + * \brief Copy the current state information from OpenMM into the Gromacs data structures.
 + *
 + * Only the requested properties are fetched from the OpenMM context.
 + * Pulling this data is a bottleneck (it is copied from the GPU to the host),
 + * so callers should request it as rarely as possible. When nothing is
 + * requested the function returns immediately and leaves every output,
 + * including \p time, untouched.
 + *
 + * \param[in]   data          OpenMMData object created by openmm_init().
 + * \param[out]  state         State of the system: receives coordinates and/or velocities.
 + * \param[out]  time          Simulation time for which the state was created.
 + * \param[out]  f             Forces.
 + * \param[out]  enerd         Energies (potential, kinetic, total) and temperature.
 + * \param[in]   includePos    True if coordinates are requested.
 + * \param[in]   includeVel    True if velocities are requested.
 + * \param[in]   includeForce  True if forces are requested.
 + * \param[in]   includeEnergy True if energies are requested.
 + */
 +void openmm_copy_state(void *data,
 +                       t_state *state, double *time,
 +                       rvec f[], gmx_enerdata_t *enerd,
 +                       gmx_bool includePos, gmx_bool includeVel, gmx_bool includeForce, gmx_bool includeEnergy)
 +{
 +    /* Assemble the mask of State data types to ask OpenMM for. */
 +    int requested = 0;
 +    if (includePos)
 +    {
 +        requested |= State::Positions;
 +    }
 +    if (includeVel)
 +    {
 +        requested |= State::Velocities;
 +    }
 +    if (includeForce)
 +    {
 +        requested |= State::Forces;
 +    }
 +    if (includeEnergy)
 +    {
 +        requested |= State::Energy;
 +    }
 +    if (requested == 0)
 +    {
 +        return;
 +    }
 +
 +    OpenMMData* ommData = static_cast<OpenMMData*>(data);
 +    try
 +    {
 +        State snapshot = ommData->context->getState(requested);
 +        const int natoms = ommData->system->getNumParticles();
 +
 +        if (includePos)
 +        {
 +            const std::vector<Vec3>& positions = snapshot.getPositions();
 +            for (int a = 0; a < natoms; a++)
 +            {
 +                for (int dim = 0; dim < 3; dim++)
 +                {
 +                    state->x[a][dim] = positions[a][dim];
 +                }
 +            }
 +        }
 +
 +        if (includeVel)
 +        {
 +            const std::vector<Vec3>& velocities = snapshot.getVelocities();
 +            for (int a = 0; a < natoms; a++)
 +            {
 +                for (int dim = 0; dim < 3; dim++)
 +                {
 +                    state->v[a][dim] = velocities[a][dim];
 +                }
 +            }
 +        }
 +
 +        if (includeForce)
 +        {
 +            const std::vector<Vec3>& forces = snapshot.getForces();
 +            for (int a = 0; a < natoms; a++)
 +            {
 +                for (int dim = 0; dim < 3; dim++)
 +                {
 +                    f[a][dim] = forces[a][dim];
 +                }
 +            }
 +        }
 +
 +        if (includeEnergy)
 +        {
 +            /* Degrees of freedom: three per particle, minus one per
 +             * constraint, minus three more when center-of-mass motion
 +             * removal is active. */
 +            int dof = 3*natoms - ommData->system->getNumConstraints();
 +            if (ommData->removeCM)
 +            {
 +                dof -= 3;
 +            }
 +            enerd->term[F_EPOT] = snapshot.getPotentialEnergy();
 +            enerd->term[F_EKIN] = snapshot.getKineticEnergy();
 +            enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
 +            enerd->term[F_TEMP] = 2.0*enerd->term[F_EKIN]/dof/BOLTZ;
 +        }
 +
 +        *time = snapshot.getTime();
 +    }
 +    catch (std::exception& e)
 +    {
 +        gmx_fatal(FARGS, "OpenMM exception caught while retrieving state information: %s", e.what());
 +    }
 +}