Merge branch 'release-4-6'
author Teemu Murtola <teemu.murtola@gmail.com>
Fri, 20 Sep 2013 03:23:30 +0000 (06:23 +0300)
committer Teemu Murtola <teemu.murtola@gmail.com>
Fri, 20 Sep 2013 03:23:30 +0000 (06:23 +0300)
Merged up to the nbnxn kernel split (i.e., the current last commit in
release-4-6 is left unmerged for now).

Conflicts:
    CMakeLists.txt
        (master had changed indentation and added code in the suffixing
         block that was moved in release-4-6)
    src/gromacs/gmxlib/statutil.cpp
        (code changed in 4.6 removed from master)
    src/gromacs/mdlib/expanded.c (took both changes)
Extra changes:
    src/gromacs/mdlib/nbnxn_search.c (added back a cppcheck suppression)

Change-Id: Ib1d95e53d69714404e594de45442b41d48b5c584

26 files changed:
CMakeLists.txt
src/gromacs/gmxlib/gmx_cpuid.c
src/gromacs/legacyheaders/gmx_cpuid.h
src/gromacs/legacyheaders/gmx_simd_macros.h
src/gromacs/legacyheaders/gmx_simd_ref.h
src/gromacs/mdlib/domdec.c
src/gromacs/mdlib/expanded.c
src/gromacs/mdlib/gmx_wallcycle.c
src/gromacs/mdlib/minimize.c
src/gromacs/mdlib/nbnxn_internal.h
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn.c
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_inner.h
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_2xnn_outer.h
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn.c
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_inner.h
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_4xn_outer.h
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ref.h
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128d.h
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128s.h
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256d.h
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256s.h
src/gromacs/mdlib/nbnxn_search.c
src/gromacs/mdlib/nbnxn_search_simd_2xnn.h
src/gromacs/mdlib/nbnxn_search_simd_4xn.h
src/programs/mdrun/runner.c

diff --cc CMakeLists.txt
index 43750c1d132addf3363aa3f64bbd286dcb055d5d,f6a45a83e440bf2cca0aec973c70f6b3215a4f2b..f758a3c412bc3f2eced825451b4ce707962c5839
@@@ -775,16 -751,18 +741,16 @@@ elseif(${GMX_CPU_ACCELERATION} STREQUA
          GMX_TEST_CFLAG(MSVC_AVX_CFLAG "/arch:AVX" ACCELERATION_C_FLAGS)
      endif (NOT GNU_AVX_CFLAG AND GMX_NATIVE_WINDOWS)
      if (NOT GNU_AVX_CFLAG AND NOT MSVC_AVX_CFLAG)
-         message(WARNING "No C AVX flag found. Consider a newer compiler, or try SSE4.1 (lower performance).")
+         message(WARNING "No C AVX flag found. Consider a newer compiler, or try SSE4.1 (lower performance) by giving -DGMX_CPU_ACCELERATION=SSE4.1 to cmake.")
      endif (NOT GNU_AVX_CFLAG AND NOT MSVC_AVX_CFLAG)
  
 -    if (CMAKE_CXX_COMPILER_LOADED)
 -        GMX_TEST_CXXFLAG(GNU_AVX_CXXFLAG "-mavx" ACCELERATION_CXX_FLAGS)
 -        if (NOT GNU_AVX_CXXFLAG AND GMX_NATIVE_WINDOWS)
 -            GMX_TEST_CXXFLAG(MSVC_AVX_CXXFLAG "/arch:AVX" ACCELERATION_CXX_FLAGS)
 -        endif (NOT GNU_AVX_CXXFLAG AND GMX_NATIVE_WINDOWS)
 -        if (NOT GNU_AVX_CXXFLAG AND NOT MSVC_AVX_CXXFLAG)
 -            message(WARNING "No C++ AVX flag found. Consider a newer compiler, or try SSE4.1 (lower performance) by giving -DGMX_CPU_ACCELERATION=SSE4.1 to cmake.")
 -        endif (NOT GNU_AVX_CXXFLAG AND NOT MSVC_AVX_CXXFLAG)
 -    endif()
 +    GMX_TEST_CXXFLAG(GNU_AVX_CXXFLAG "-mavx" ACCELERATION_CXX_FLAGS)
 +    if (NOT GNU_AVX_CXXFLAG AND GMX_NATIVE_WINDOWS)
 +        GMX_TEST_CXXFLAG(MSVC_AVX_CXXFLAG "/arch:AVX" ACCELERATION_CXX_FLAGS)
 +    endif (NOT GNU_AVX_CXXFLAG AND GMX_NATIVE_WINDOWS)
 +    if (NOT GNU_AVX_CXXFLAG AND NOT MSVC_AVX_CXXFLAG)
-        message(WARNING "No C++ AVX flag found. Consider a newer compiler, or try SSE4.1 (lower performance).")
++        message(WARNING "No C++ AVX flag found. Consider a newer compiler, or try SSE4.1 (lower performance) by giving -DGMX_CPU_ACCELERATION=SSE4.1 to cmake.")
 +    endif (NOT GNU_AVX_CXXFLAG AND NOT MSVC_AVX_CXXFLAG)
  
      # Set the FMA4 flags (MSVC doesn't require any)
      if(${GMX_CPU_ACCELERATION} STREQUAL "AVX_128_FMA" AND NOT MSVC)
@@@ -1155,8 -1134,39 +1120,42 @@@ set(INCL_INSTALL_DIR ${GMX_INSTALL_PREF
  
  set(GMXLIBDIR        ${DATA_INSTALL_DIR}/top)
  
+ ########################################################################
+ # Set up binary and library suffixing
+ ########################################################################
+ set(GMX_BINARY_SUFFIX "" CACHE STRING "Suffix for GROMACS binaries (default: _d for double, _mpi for MPI, _mpi_d for MPI and double).")
+ set(GMX_LIBS_SUFFIX ""
+   CACHE STRING "Suffix for GROMACS libs (default: _d for double, _mpi for MPI, _mpi_d for MPI and double).")
+ if (GMX_DEFAULT_SUFFIX)
+   set(GMX_BINARY_SUFFIX "")
+   set(GMX_LIBS_SUFFIX "")
+   if (GMX_LIB_MPI)
+     set(GMX_BINARY_SUFFIX "_mpi")
+     set(GMX_LIBS_SUFFIX "_mpi")
+   endif()
+   if (GMX_DOUBLE)
+     set (GMX_BINARY_SUFFIX "${GMX_BINARY_SUFFIX}_d")
+     set (GMX_LIBS_SUFFIX "${GMX_LIBS_SUFFIX}_d")
+   endif(GMX_DOUBLE)
+   mark_as_advanced(FORCE GMX_BINARY_SUFFIX GMX_LIBS_SUFFIX)
+   if (NOT SUFFIX_QUIETLY)
+     message(STATUS "Using default binary suffix: \"${GMX_BINARY_SUFFIX}\"")
+     message(STATUS "Using default library suffix: \"${GMX_LIBS_SUFFIX}\"")
+   endif (NOT SUFFIX_QUIETLY)
+ else(GMX_DEFAULT_SUFFIX)
+   mark_as_advanced(CLEAR GMX_BINARY_SUFFIX GMX_LIBS_SUFFIX)
+   if (NOT SUFFIX_QUIETLY)
+     message(STATUS "Using manually set binary suffix: \"${GMX_BINARY_SUFFIX}\"")
+     message(STATUS "Using manually set library suffix: \"${GMX_LIBS_SUFFIX}\"")
+   endif (NOT SUFFIX_QUIETLY)
+ endif(GMX_DEFAULT_SUFFIX)
++if (GMX_BUILD_MDRUN_ONLY)
++    set(GMX_LIBS_SUFFIX "_mdrun${GMX_LIBS_SUFFIX}")
++endif ()
+ set(SUFFIX_QUIETLY TRUE CACHE INTERNAL "")
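+ # Example (illustrative, not part of the original change): with
+ # GMX_DEFAULT_SUFFIX, an MPI + double build gets the suffix "_mpi_d",
+ # so a binary such as mdrun is installed as mdrun_mpi_d.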
  ##################################################################
 -# Shared library settings - Darwin uses INSTALL_NAME_DIR instead!
 +# Shared library settings
  ##################################################################
  if(NOT CMAKE_SYSTEM_NAME STREQUAL "Darwin")
      set(CMAKE_SKIP_BUILD_RPATH  FALSE)
diff --cc src/gromacs/gmxlib/gmx_cpuid.c
index c3c5ff8cd790ee365cee7b3c67c4b0682760ebc9,0000000000000000000000000000000000000000..c97a48a95cd49afcccc31aa513e403912ce57d33
mode 100644,000000..100644
--- /dev/null
+++ b/src/gromacs/gmxlib/gmx_cpuid.c
@@@ -1,1184 -1,0 +1,1213 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + * This file is part of GROMACS.
 + * Copyright (c) 2012-
 + *
 + * Written by the Gromacs development team under coordination of
 + * David van der Spoel, Berk Hess, and Erik Lindahl.
 + *
 + * This library is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org
 + *
 + * And Hey:
 + * Gnomes, ROck Monsters And Chili Sauce
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#ifdef HAVE_SCHED_H
 +#define _GNU_SOURCE
 +#include <sched.h>
 +#endif
 +
 +#include <stdio.h>
 +#include <stdlib.h>
 +#include <string.h>
 +#include <ctype.h>
 +#ifdef _MSC_VER
 +/* MSVC definition for __cpuid() */
 +#include <intrin.h>
 +/* sysinfo functions */
 +#include <windows.h>
 +#endif
 +#ifdef HAVE_UNISTD_H
 +/* sysconf() definition */
 +#include <unistd.h>
 +#endif
 +
 +#include "gmx_cpuid.h"
 +
 +
 +
 +/* For convenience, and to enable configure-time invocation, we keep all architectures
 + * in a single file, but to avoid repeated ifdefs we set the overall architecture here.
 + */
 +#if defined (__i386__) || defined (__x86_64__) || defined (_M_IX86) || defined (_M_X64)
 +/* OK, it is x86, but can we execute cpuid? */
 +#if defined(GMX_X86_GCC_INLINE_ASM) || ( defined(_MSC_VER) && ( (_MSC_VER > 1500) || (_MSC_VER==1500 && _MSC_FULL_VER >= 150030729)))
 +#    define GMX_CPUID_X86
 +#endif
 +#endif
 +
 +/* Global constant character strings corresponding to our enumerated types */
 +const char *
 +gmx_cpuid_vendor_string[GMX_CPUID_NVENDORS] =
 +{
 +    "CannotDetect",
 +    "Unknown",
 +    "GenuineIntel",
 +    "AuthenticAMD",
 +    "Fujitsu",
 +    "IBM"
 +};
 +
++const char *
++gmx_cpuid_vendor_string_alternative[GMX_CPUID_NVENDORS] =
++{
++    "CannotDetect",
++    "Unknown",
++    "GenuineIntel",
++    "AuthenticAMD",
++    "Fujitsu",
++    "ibm" /* Used on BlueGene/Q */
++};
++
 +const char *
 +gmx_cpuid_feature_string[GMX_CPUID_NFEATURES] =
 +{
 +    "CannotDetect",
 +    "aes",
 +    "apic",
 +    "avx",
 +    "avx2",
 +    "clfsh",
 +    "cmov",
 +    "cx8",
 +    "cx16",
 +    "f16c",
 +    "fma",
 +    "fma4",
 +    "htt",
 +    "lahf_lm",
 +    "misalignsse",
 +    "mmx",
 +    "msr",
 +    "nonstop_tsc",
 +    "pcid",
 +    "pclmuldq",
 +    "pdcm",
 +    "pdpe1gb",
 +    "popcnt",
 +    "pse",
 +    "rdrnd",
 +    "rdtscp",
 +    "sse2",
 +    "sse3",
 +    "sse4a",
 +    "sse4.1",
 +    "sse4.2",
 +    "ssse3",
 +    "tdt",
 +    "x2apic",
 +    "xop"
 +};
 +
 +const char *
 +gmx_cpuid_acceleration_string[GMX_CPUID_NACCELERATIONS] =
 +{
 +    "CannotDetect",
 +    "None",
 +    "SSE2",
 +    "SSE4.1",
 +    "AVX_128_FMA",
 +    "AVX_256",
-     "Sparc64 HPC-ACE"
++    "Sparc64 HPC-ACE",
++    "IBM_QPX"
 +};
 +
 +/* Max length of brand string */
 +#define GMX_CPUID_BRAND_MAXLEN 256
 +
 +
 +/* Contents of the abstract datatype */
 +struct gmx_cpuid
 +{
 +    enum gmx_cpuid_vendor      vendor;
 +    char                       brand[GMX_CPUID_BRAND_MAXLEN];
 +    int                        family;
 +    int                        model;
 +    int                        stepping;
 +    /* Not using gmx_bool here, since this file must be possible to compile without simple.h */
 +    char                       feature[GMX_CPUID_NFEATURES];
 +
 +    /* Basic CPU topology information. For x86 this is a bit complicated since the topology differs between
 +     * operating systems and sometimes even settings. For most other architectures you can likely just check
 +     * the documentation and then write static information to these arrays rather than detecting on-the-fly.
 +     */
 +    int                        have_cpu_topology;
 +    int                        nproc;               /* total number of logical processors from OS */
 +    int                        npackages;
 +    int                        ncores_per_package;
 +    int                        nhwthreads_per_core;
 +    int *                      package_id;
 +    int *                      core_id;             /* Local core id in each package */
 +    int *                      hwthread_id;         /* Local hwthread id in each core */
 +    int *                      locality_order;      /* Processor indices sorted in locality order */
 +};
 +
 +
 +/* Simple routines to access the data structure. The initialization routine is
 + * further down since that needs to call other static routines in this file.
 + */
 +enum gmx_cpuid_vendor
 +gmx_cpuid_vendor            (gmx_cpuid_t                cpuid)
 +{
 +    return cpuid->vendor;
 +}
 +
 +
 +const char *
 +gmx_cpuid_brand             (gmx_cpuid_t                cpuid)
 +{
 +    return cpuid->brand;
 +}
 +
 +int
 +gmx_cpuid_family            (gmx_cpuid_t                cpuid)
 +{
 +    return cpuid->family;
 +}
 +
 +int
 +gmx_cpuid_model             (gmx_cpuid_t                cpuid)
 +{
 +    return cpuid->model;
 +}
 +
 +int
 +gmx_cpuid_stepping          (gmx_cpuid_t                cpuid)
 +{
 +    return cpuid->stepping;
 +}
 +
 +int
 +gmx_cpuid_feature           (gmx_cpuid_t                cpuid,
 +                             enum gmx_cpuid_feature     feature)
 +{
 +    return (cpuid->feature[feature] != 0);
 +}
 +
 +
 +
 +
 +/* What type of acceleration was compiled in, if any?
 + * This is set from CMake. Note that the SSE2 and SSE4_1 macros are set for
 + * AVX too, so it is important that they appear last in the list.
 + */
 +#ifdef GMX_X86_AVX_256
 +static const
 +enum gmx_cpuid_acceleration
 +    compiled_acc = GMX_CPUID_ACCELERATION_X86_AVX_256;
 +#elif defined GMX_X86_AVX_128_FMA
 +static const
 +enum gmx_cpuid_acceleration
 +    compiled_acc = GMX_CPUID_ACCELERATION_X86_AVX_128_FMA;
 +#elif defined GMX_X86_SSE4_1
 +static const
 +enum gmx_cpuid_acceleration
 +    compiled_acc = GMX_CPUID_ACCELERATION_X86_SSE4_1;
 +#elif defined GMX_X86_SSE2
 +static const
 +enum gmx_cpuid_acceleration
 +    compiled_acc = GMX_CPUID_ACCELERATION_X86_SSE2;
 +#elif defined GMX_CPU_ACCELERATION_SPARC64_HPC_ACE
 +static const
 +enum gmx_cpuid_acceleration
 +    compiled_acc = GMX_CPUID_ACCELERATION_SPARC64_HPC_ACE;
++#elif defined GMX_CPU_ACCELERATION_IBM_QPX
++static const
++enum gmx_cpuid_acceleration
++    compiled_acc = GMX_CPUID_ACCELERATION_IBM_QPX;
 +#else
 +static const
 +enum gmx_cpuid_acceleration
 +    compiled_acc = GMX_CPUID_ACCELERATION_NONE;
 +#endif
 +
 +
 +#ifdef GMX_CPUID_X86
 +
 +/* Execute CPUID on x86 class CPUs. level sets function to exec, and the
 + * contents of register output is returned. See Intel/AMD docs for details.
 + *
 + * This version supports extended information where we can also have an input
 + * value in the ecx register. This is ignored for most levels, but some of them
 + * (e.g. level 0xB on Intel) use it.
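 + *
 + * Example (illustrative): executing level 0x0 with ecxval 0 returns the
 + * highest supported standard level in eax, and the 12-character vendor
 + * string in ebx, edx, ecx (in that order), as used by cpuid_check_vendor()
 + * further down in this file.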
 + */
 +static int
 +execute_x86cpuid(unsigned int   level,
 +                 unsigned int   ecxval,
 +                 unsigned int * eax,
 +                 unsigned int * ebx,
 +                 unsigned int * ecx,
 +                 unsigned int * edx)
 +{
 +    int rc = 0;
 +
 +    /* Currently CPUID is only supported (1) if we can use an instruction on MSVC, or (2)
 +     * if the compiler handles GNU-style inline assembly.
 +     */
 +
 +#if (defined _MSC_VER)
 +    int CPUInfo[4];
 +
 +#if (_MSC_VER > 1500) || (_MSC_VER == 1500 && _MSC_FULL_VER >= 150030729)
 +    /* MSVC 9.0 SP1 or later */
 +    __cpuidex(CPUInfo, level, ecxval);
 +    rc = 0;
 +#else
 +    __cpuid(CPUInfo, level);
 +    /* Set an error code if the user wanted a non-zero ecxval, since we did not have cpuidex */
 +    rc = (ecxval > 0) ? -1 : 0;
 +#endif
 +    *eax = CPUInfo[0];
 +    *ebx = CPUInfo[1];
 +    *ecx = CPUInfo[2];
 +    *edx = CPUInfo[3];
 +
 +#elif (defined GMX_X86_GCC_INLINE_ASM)
 +    /* for now this means GMX_X86_GCC_INLINE_ASM should be defined,
 +     * but there might be more options added in the future.
 +     */
 +    *eax = level;
 +    *ecx = ecxval;
 +    *ebx = 0;
 +    *edx = 0;
 +#if defined(__i386__) && defined(__PIC__)
 +    /* Avoid clobbering the global offset table in 32-bit pic code (ebx register) */
 +    __asm__ __volatile__ ("xchgl %%ebx, %1  \n\t"
 +                          "cpuid            \n\t"
 +                          "xchgl %%ebx, %1  \n\t"
 +                          : "+a" (*eax), "+r" (*ebx), "+c" (*ecx), "+d" (*edx));
 +#else
 +    /* i386 without PIC, or x86-64. Things are easy and we can clobber any reg we want :-) */
 +    __asm__ __volatile__ ("cpuid            \n\t"
 +                          : "+a" (*eax), "+b" (*ebx), "+c" (*ecx), "+d" (*edx));
 +#endif
 +    rc = 0;
 +#else
 +    /* Death and horror!
 +     * Apparently this is an x86 platform where we don't know how to call cpuid.
 +     *
 +     * This is REALLY bad, since we will lose all Gromacs acceleration.
 +     */
 +    *eax = 0;
 +    *ebx = 0;
 +    *ecx = 0;
 +    *edx = 0;
 +
 +    rc = -1;
 +#endif
 +    return rc;
 +}
 +
 +
 +/* Identify CPU features common to Intel & AMD - mainly brand string,
 + * version and some features. Vendor has already been detected outside this.
 + */
 +static int
 +cpuid_check_common_x86(gmx_cpuid_t                cpuid)
 +{
 +    int                       fn, max_stdfn, max_extfn;
 +    unsigned int              eax, ebx, ecx, edx;
 +    char                      str[GMX_CPUID_BRAND_MAXLEN];
 +    char *                    p;
 +
 +    /* Find largest standard/extended function input value */
 +    execute_x86cpuid(0x0, 0, &eax, &ebx, &ecx, &edx);
 +    max_stdfn = eax;
 +    execute_x86cpuid(0x80000000, 0, &eax, &ebx, &ecx, &edx);
 +    max_extfn = eax;
 +
 +    p = str;
 +    if (max_extfn >= 0x80000005)
 +    {
 +        /* Get CPU brand string */
 +        for (fn = 0x80000002; fn < 0x80000005; fn++)
 +        {
 +            execute_x86cpuid(fn, 0, &eax, &ebx, &ecx, &edx);
 +            memcpy(p, &eax, 4);
 +            memcpy(p+4, &ebx, 4);
 +            memcpy(p+8, &ecx, 4);
 +            memcpy(p+12, &edx, 4);
 +            p += 16;
 +        }
 +        *p = '\0';
 +
 +        /* Remove empty initial space */
 +        p = str;
 +        while (isspace(*(p)))
 +        {
 +            p++;
 +        }
 +        strncpy(cpuid->brand, p, GMX_CPUID_BRAND_MAXLEN);
 +    }
 +    else
 +    {
 +        strncpy(cpuid->brand, "Unknown CPU brand", GMX_CPUID_BRAND_MAXLEN);
 +    }
 +
 +    /* Find basic CPU properties */
 +    if (max_stdfn >= 1)
 +    {
 +        execute_x86cpuid(0x1, 0, &eax, &ebx, &ecx, &edx);
 +
 +        cpuid->family   = ((eax & 0x0FF00000) >> 20) + ((eax & 0x00000F00) >> 8);
 +        /* Note that extended model should be shifted left 4, so only shift right 12 instead of 16. */
 +        cpuid->model    = ((eax & 0x000F0000) >> 12) + ((eax & 0x000000F0) >> 4);
 +        cpuid->stepping = (eax & 0x0000000F);
 +
 +        /* Feature flags common to AMD and intel */
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_SSE3]     = (ecx & (1 << 0))  != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_PCLMULDQ] = (ecx & (1 << 1))  != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_SSSE3]    = (ecx & (1 << 9))  != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_FMA]      = (ecx & (1 << 12)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_CX16]     = (ecx & (1 << 13)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_SSE4_1]   = (ecx & (1 << 19)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_SSE4_2]   = (ecx & (1 << 20)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_POPCNT]   = (ecx & (1 << 23)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_AES]      = (ecx & (1 << 25)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_AVX]      = (ecx & (1 << 28)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_F16C]     = (ecx & (1 << 29)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_RDRND]    = (ecx & (1 << 30)) != 0;
 +
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_PSE]      = (edx & (1 << 3))  != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_MSR]      = (edx & (1 << 5))  != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_CX8]      = (edx & (1 << 8))  != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_APIC]     = (edx & (1 << 9))  != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_CMOV]     = (edx & (1 << 15)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_CLFSH]    = (edx & (1 << 19)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_MMX]      = (edx & (1 << 23)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_SSE2]     = (edx & (1 << 26)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_HTT]      = (edx & (1 << 28)) != 0;
 +    }
 +    else
 +    {
 +        cpuid->family   = -1;
 +        cpuid->model    = -1;
 +        cpuid->stepping = -1;
 +    }
 +
 +    if (max_extfn >= 0x80000001)
 +    {
 +        execute_x86cpuid(0x80000001, 0, &eax, &ebx, &ecx, &edx);
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_LAHF_LM] = (ecx & (1 << 0))  != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_PDPE1GB] = (edx & (1 << 26)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_RDTSCP]  = (edx & (1 << 27)) != 0;
 +    }
 +
 +    if (max_extfn >= 0x80000007)
 +    {
 +        execute_x86cpuid(0x80000007, 0, &eax, &ebx, &ecx, &edx);
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_NONSTOP_TSC]  = (edx & (1 << 8))  != 0;
 +    }
 +    return 0;
 +}
 +
 +/* This routine returns the number of unique different elements found in the array,
 + * and renumbers these starting from 0. For example, the array {0,1,2,8,9,10,8,9,10,0,1,2}
 + * will be rewritten to {0,1,2,3,4,5,3,4,5,0,1,2}, and it returns 6 for the
 + * number of unique elements.
 + */
 +static int
 +cpuid_renumber_elements(int *data, int n)
 +{
 +    int *unique;
 +    int  i, j, nunique, found;
 +
 +    unique = malloc(sizeof(int)*n);
 +
 +    nunique = 0;
 +    for (i = 0; i < n; i++)
 +    {
 +        for (j = 0, found = 0; j < nunique && !found; j++)
 +        {
 +            found = (data[i] == unique[j]);
 +        }
 +        if (!found)
 +        {
 +            /* Insert in sorted order! */
 +            for (j = nunique++; j > 0 && unique[j-1] > data[i]; j--)
 +            {
 +                unique[j] = unique[j-1];
 +            }
 +            unique[j] = data[i];
 +        }
 +    }
 +    /* renumber */
 +    for (i = 0; i < n; i++)
 +    {
 +        for (j = 0; j < nunique; j++)
 +        {
 +            if (data[i] == unique[j])
 +            {
 +                data[i] = j;
 +            }
 +        }
 +    }
 +    return nunique;
 +}
 +
 +/* APIC IDs, or everything you wanted to know about your x86 cores but were afraid to ask...
 + *
 + * Raw APIC IDs are unfortunately somewhat dirty. For technical reasons they are assigned
 + * in power-of-2 chunks, and even then there are no guarantees about specific numbers - all
 + * we know is that the part for each thread/core/package is unique, and how many bits are
 + * reserved for that part.
 + * This routine does internal renumbering so we get continuous indices, and also
 + * decodes the actual number of packages, cores-per-package and hwthreads-per-core.
 + * Returns: 0 on success, non-zero on failure.
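 + *
 + * Worked example (illustrative): with hwthread_bits=1 and core_bits=2, an
 + * apic_id of 0b101101 decodes to hwthread_id 1 (lowest bit), core_id 2
 + * (next two bits) and package_id 5 (remaining high bits).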
 + */
 +static int
 +cpuid_x86_decode_apic_id(gmx_cpuid_t cpuid, int *apic_id, int core_bits, int hwthread_bits)
 +{
 +    int i, idx;
 +    int hwthread_mask, core_mask_after_shift;
 +
 +    cpuid->hwthread_id     = malloc(sizeof(int)*cpuid->nproc);
 +    cpuid->core_id         = malloc(sizeof(int)*cpuid->nproc);
 +    cpuid->package_id      = malloc(sizeof(int)*cpuid->nproc);
 +    cpuid->locality_order  = malloc(sizeof(int)*cpuid->nproc);
 +
 +    hwthread_mask         = (1 << hwthread_bits) - 1;
 +    core_mask_after_shift = (1 << core_bits) - 1;
 +
 +    for (i = 0; i < cpuid->nproc; i++)
 +    {
 +        cpuid->hwthread_id[i] = apic_id[i] & hwthread_mask;
 +        cpuid->core_id[i]     = (apic_id[i] >> hwthread_bits) & core_mask_after_shift;
 +        cpuid->package_id[i]  = apic_id[i] >> (core_bits + hwthread_bits);
 +    }
 +
 +    cpuid->npackages            = cpuid_renumber_elements(cpuid->package_id, cpuid->nproc);
 +    cpuid->ncores_per_package   = cpuid_renumber_elements(cpuid->core_id, cpuid->nproc);
 +    cpuid->nhwthreads_per_core  = cpuid_renumber_elements(cpuid->hwthread_id, cpuid->nproc);
 +
 +    /* now check for consistency */
 +    if ( (cpuid->npackages * cpuid->ncores_per_package *
 +          cpuid->nhwthreads_per_core) != cpuid->nproc )
 +    {
 +        /* the packages/cores-per-package/hwthreads-per-core counts are
 +           inconsistent. */
 +        return -1;
 +    }
 +
 +    /* Create a locality order array, i.e. first all resources in package0, which in turn
 +     * are sorted so we first have all resources in core0, where threads are sorted in order, etc.
 +     */
 +
 +    for (i = 0; i < cpuid->nproc; i++)
 +    {
 +        idx = (cpuid->package_id[i]*cpuid->ncores_per_package + cpuid->core_id[i])*cpuid->nhwthreads_per_core + cpuid->hwthread_id[i];
 +        cpuid->locality_order[idx] = i;
 +    }
 +    return 0;
 +}
 +
 +
 +/* Detection of AMD-specific CPU features */
 +static int
 +cpuid_check_amd_x86(gmx_cpuid_t                cpuid)
 +{
 +    int                       max_stdfn, max_extfn, ret;
 +    unsigned int              eax, ebx, ecx, edx;
 +    int                       hwthread_bits, core_bits;
 +    int *                     apic_id;
 +
 +    cpuid_check_common_x86(cpuid);
 +
 +    execute_x86cpuid(0x0, 0, &eax, &ebx, &ecx, &edx);
 +    max_stdfn = eax;
 +
 +    execute_x86cpuid(0x80000000, 0, &eax, &ebx, &ecx, &edx);
 +    max_extfn = eax;
 +
 +    if (max_extfn >= 0x80000001)
 +    {
 +        execute_x86cpuid(0x80000001, 0, &eax, &ebx, &ecx, &edx);
 +
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_SSE4A]       = (ecx & (1 << 6))  != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_MISALIGNSSE] = (ecx & (1 << 7))  != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_XOP]         = (ecx & (1 << 11)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_FMA4]        = (ecx & (1 << 16)) != 0;
 +    }
 +
 +    /* Query APIC information on AMD */
 +    if (max_extfn >= 0x80000008)
 +    {
 +#if (defined HAVE_SCHED_H && defined HAVE_SCHED_SETAFFINITY && defined HAVE_SYSCONF && defined __linux__)
 +        /* Linux */
 +        unsigned int   i;
 +        cpu_set_t      cpuset, save_cpuset;
 +        cpuid->nproc = sysconf(_SC_NPROCESSORS_ONLN);
 +        apic_id      = malloc(sizeof(int)*cpuid->nproc);
 +        sched_getaffinity(0, sizeof(cpu_set_t), &save_cpuset);
 +        /* Get APIC id from each core */
 +        CPU_ZERO(&cpuset);
 +        for (i = 0; i < cpuid->nproc; i++)
 +        {
 +            CPU_SET(i, &cpuset);
 +            sched_setaffinity(0, sizeof(cpu_set_t), &cpuset);
 +            execute_x86cpuid(0x1, 0, &eax, &ebx, &ecx, &edx);
 +            apic_id[i] = ebx >> 24;
 +            CPU_CLR(i, &cpuset);
 +        }
 +        /* Reset affinity to the value it had when calling this routine */
 +        sched_setaffinity(0, sizeof(cpu_set_t), &save_cpuset);
 +#define CPUID_HAVE_APIC
 +#elif defined GMX_NATIVE_WINDOWS
 +        /* Windows */
 +        DWORD_PTR     i;
 +        SYSTEM_INFO   sysinfo;
 +        unsigned int  save_affinity, affinity;
 +        GetSystemInfo( &sysinfo );
 +        cpuid->nproc  = sysinfo.dwNumberOfProcessors;
 +        apic_id       = malloc(sizeof(int)*cpuid->nproc);
 +        /* Get previous affinity mask */
 +        save_affinity = SetThreadAffinityMask(GetCurrentThread(), 1);
 +        for (i = 0; i < cpuid->nproc; i++)
 +        {
 +            SetThreadAffinityMask(GetCurrentThread(), (((DWORD_PTR)1)<<i));
 +            Sleep(0);
 +            execute_x86cpuid(0x1, 0, &eax, &ebx, &ecx, &edx);
 +            apic_id[i] = ebx >> 24;
 +        }
 +        SetThreadAffinityMask(GetCurrentThread(), save_affinity);
 +#define CPUID_HAVE_APIC
 +#endif
 +#ifdef CPUID_HAVE_APIC
 +        /* AMD does not support SMT yet - there are no hwthread bits in apic ID */
 +        hwthread_bits = 0;
 +        /* Get number of core bits in apic ID - try modern extended method first */
 +        execute_x86cpuid(0x80000008, 0, &eax, &ebx, &ecx, &edx);
 +        core_bits = (ecx >> 12) & 0xf;
 +        if (core_bits == 0)
 +        {
 +            /* Legacy method for old single/dual core AMD CPUs */
 +            int i = ecx & 0xF;
 +            for (core_bits = 0; (i>>core_bits) > 0; core_bits++)
 +            {
 +                ;
 +            }
 +        }
 +        ret = cpuid_x86_decode_apic_id(cpuid, apic_id, core_bits, 
 +                                       hwthread_bits);
 +        cpuid->have_cpu_topology = (ret == 0);
 +#endif
 +    }
 +    return 0;
 +}
 +
 +/* Detection of Intel-specific CPU features */
 +static int
 +cpuid_check_intel_x86(gmx_cpuid_t                cpuid)
 +{
 +    unsigned int              max_stdfn, max_extfn, ret;
 +    unsigned int              eax, ebx, ecx, edx;
 +    unsigned int              max_logical_cores, max_physical_cores;
 +    int                       hwthread_bits, core_bits;
 +    int *                     apic_id;
 +
 +    cpuid_check_common_x86(cpuid);
 +
 +    execute_x86cpuid(0x0, 0, &eax, &ebx, &ecx, &edx);
 +    max_stdfn = eax;
 +
 +    execute_x86cpuid(0x80000000, 0, &eax, &ebx, &ecx, &edx);
 +    max_extfn = eax;
 +
 +    if (max_stdfn >= 1)
 +    {
 +        execute_x86cpuid(0x1, 0, &eax, &ebx, &ecx, &edx);
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_PDCM]    = (ecx & (1 << 15)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_PCID]    = (ecx & (1 << 17)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_X2APIC]  = (ecx & (1 << 21)) != 0;
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_TDT]     = (ecx & (1 << 24)) != 0;
 +    }
 +
 +    if (max_stdfn >= 7)
 +    {
 +        execute_x86cpuid(0x7, 0, &eax, &ebx, &ecx, &edx);
 +        cpuid->feature[GMX_CPUID_FEATURE_X86_AVX2]    = (ebx & (1 << 5))  != 0;
 +    }
 +
 +    /* Check whether Hyper-Threading is enabled, not only supported */
 +    if (cpuid->feature[GMX_CPUID_FEATURE_X86_HTT] && max_stdfn >= 4)
 +    {
 +        execute_x86cpuid(0x1, 0, &eax, &ebx, &ecx, &edx);
 +        max_logical_cores  = (ebx >> 16) & 0x0FF;
 +        execute_x86cpuid(0x4, 0, &eax, &ebx, &ecx, &edx);
 +        max_physical_cores = ((eax >> 26) & 0x3F) + 1;
 +
 +        /* Clear HTT flag if we only have 1 logical core per physical */
 +        if (max_logical_cores/max_physical_cores < 2)
 +        {
 +            cpuid->feature[GMX_CPUID_FEATURE_X86_HTT] = 0;
 +        }
 +    }
 +
 +    if (max_stdfn >= 0xB)
 +    {
 +        /* Query x2 APIC information from cores */
 +#if (defined HAVE_SCHED_H && defined HAVE_SCHED_SETAFFINITY && defined HAVE_SYSCONF && defined __linux__)
 +        /* Linux */
 +        unsigned int   i;
 +        cpu_set_t      cpuset, save_cpuset;
 +        cpuid->nproc = sysconf(_SC_NPROCESSORS_ONLN);
 +        apic_id      = malloc(sizeof(int)*cpuid->nproc);
 +        sched_getaffinity(0, sizeof(cpu_set_t), &save_cpuset);
 +        /* Get x2APIC ID from each hardware thread */
 +        CPU_ZERO(&cpuset);
 +        for (i = 0; i < cpuid->nproc; i++)
 +        {
 +            CPU_SET(i, &cpuset);
 +            sched_setaffinity(0, sizeof(cpu_set_t), &cpuset);
 +            execute_x86cpuid(0xB, 0, &eax, &ebx, &ecx, &edx);
 +            apic_id[i] = edx;
 +            CPU_CLR(i, &cpuset);
 +        }
 +        /* Reset affinity to the value it had when calling this routine */
 +        sched_setaffinity(0, sizeof(cpu_set_t), &save_cpuset);
 +#define CPUID_HAVE_APIC
 +#elif defined GMX_NATIVE_WINDOWS
 +        /* Windows */
 +        DWORD_PTR     i;
 +        SYSTEM_INFO   sysinfo;
 +        unsigned int  save_affinity, affinity;
 +        GetSystemInfo( &sysinfo );
 +        cpuid->nproc  = sysinfo.dwNumberOfProcessors;
 +        apic_id       = malloc(sizeof(int)*cpuid->nproc);
 +        /* Get previous affinity mask */
 +        save_affinity = SetThreadAffinityMask(GetCurrentThread(), 1);
 +        for (i = 0; i < cpuid->nproc; i++)
 +        {
 +            SetThreadAffinityMask(GetCurrentThread(), (((DWORD_PTR)1)<<i));
 +            Sleep(0);
 +            execute_x86cpuid(0xB, 0, &eax, &ebx, &ecx, &edx);
 +            apic_id[i] = edx;
 +        }
 +        SetThreadAffinityMask(GetCurrentThread(), save_affinity);
 +#define CPUID_HAVE_APIC
 +#endif
 +#ifdef CPUID_HAVE_APIC
 +        execute_x86cpuid(0xB, 0, &eax, &ebx, &ecx, &edx);
 +        hwthread_bits    = eax & 0x1F;
 +        execute_x86cpuid(0xB, 1, &eax, &ebx, &ecx, &edx);
 +        core_bits        = (eax & 0x1F) - hwthread_bits;
 +        ret = cpuid_x86_decode_apic_id(cpuid, apic_id, core_bits, 
 +                                       hwthread_bits);
 +        cpuid->have_cpu_topology = (ret == 0);
 +#endif
 +    }
 +    return 0;
 +}
 +#endif /* GMX_CPUID_X86 */
 +
 +
 +
 +
 +/* Copies the text before the first colon in 'in' to 's' (at most maxlength
 + * chars, always null-terminated), with trailing whitespace removed; 's' is
 + * set to the empty string if there is no colon.
 + */
 +static void
 +chomp_substring_before_colon(const char *in, char *s, int maxlength)
 +{
 +    char *p;
 +    strncpy(s,in,maxlength);
 +    s[maxlength-1]='\0';
 +    p = strchr(s,':');
 +    if(p!=NULL)
 +    {
 +        *p='\0';
 +        /* Check the bounds before dereferencing while trimming trailing space */
 +        while(p>s && isspace(*(p-1)))
 +        {
 +            *(--p)='\0';
 +        }
 +    }
 +    else
 +    {
 +        *s='\0';
 +    }
 +}
 +
 +/* Copies the text after the first colon in 'in' to 's' (at most maxlength
 + * chars, always null-terminated), with leading and trailing whitespace
 + * removed; 's' is set to the empty string if there is no colon.
 + */
 +static void
 +chomp_substring_after_colon(const char *in, char *s, int maxlength)
 +{
 +    char *p;
 +    if( (p = strchr(in,':'))!=NULL)
 +    {
 +        p++;
 +        while(isspace(*p)) p++;
 +        strncpy(s,p,maxlength);
 +        s[maxlength-1]='\0';
 +        p = s+strlen(s);
 +        /* Check the bounds before dereferencing while trimming trailing space */
 +        while(p>s && isspace(*(p-1)))
 +        {
 +            *(--p)='\0';
 +        }
 +    }
 +    else
 +    {
 +        *s='\0';
 +    }
 +}
 +
 +/* Try to find the vendor of the current CPU, so we know what specific
 + * detection routine to call.
 + */
 +static enum gmx_cpuid_vendor
 +cpuid_check_vendor(void)
 +{
 +    enum gmx_cpuid_vendor      i, vendor;
 +    /* Register data used on x86 */
 +    unsigned int               eax, ebx, ecx, edx;
 +    char                       vendorstring[13];
 +    FILE *                     fp;
-     char                       buffer[255],buffer2[255];
++    char                       buffer[255],before_colon[255], after_colon[255];
 +
 +    /* Set default first */
 +    vendor = GMX_CPUID_VENDOR_UNKNOWN;
 +
 +#ifdef GMX_CPUID_X86
 +    execute_x86cpuid(0x0, 0, &eax, &ebx, &ecx, &edx);
 +
 +    memcpy(vendorstring, &ebx, 4);
 +    memcpy(vendorstring+4, &edx, 4);
 +    memcpy(vendorstring+8, &ecx, 4);
 +
 +    vendorstring[12] = '\0';
 +
 +    for (i = GMX_CPUID_VENDOR_UNKNOWN; i < GMX_CPUID_NVENDORS; i++)
 +    {
 +        if (!strncmp(vendorstring, gmx_cpuid_vendor_string[i], 12))
 +        {
 +            vendor = i;
 +        }
 +    }
 +#elif defined(__linux__) || defined(__linux)
 +    /* General Linux. Try to get CPU vendor from /proc/cpuinfo */
 +    if( (fp = fopen("/proc/cpuinfo","r")) != NULL)
 +    {
 +        while( (vendor == GMX_CPUID_VENDOR_UNKNOWN) && (fgets(buffer,sizeof(buffer),fp) != NULL))
 +        {
-             chomp_substring_before_colon(buffer,buffer2,sizeof(buffer2));
-             /* Intel/AMD use "vendor_id", IBM "vendor". Fujitsu "manufacture". Add others if you have them! */
-             if( !strcmp(buffer2,"vendor_id") || !strcmp(buffer2,"vendor") || !strcmp(buffer2,"manufacture") )
++            chomp_substring_before_colon(buffer,before_colon,sizeof(before_colon));
++            /* Intel/AMD use "vendor_id", IBM "vendor"(?) or "model". Fujitsu "manufacture". Add others if you have them! */
++            if( !strcmp(before_colon,"vendor_id")
++                || !strcmp(before_colon,"vendor")
++                || !strcmp(before_colon,"manufacture")
++                || !strcmp(before_colon,"model"))
 +            {
-                 chomp_substring_after_colon(buffer,buffer2,sizeof(buffer2));
++                chomp_substring_after_colon(buffer,after_colon,sizeof(after_colon));
 +                for(i=GMX_CPUID_VENDOR_UNKNOWN; i<GMX_CPUID_NVENDORS; i++)
 +                {
-                     /* Be liberal and accept if we find the vendor anywhere in string */
-                     if(strstr(buffer2,gmx_cpuid_vendor_string[i]))
++                    /* Be liberal and accept if we find the vendor
++                     * string (or alternative string) anywhere. Using
++                     * strcasestr() would be non-portable. */
++                    if(strstr(after_colon,gmx_cpuid_vendor_string[i])
++                       || strstr(after_colon,gmx_cpuid_vendor_string_alternative[i]))
 +                    {
 +                        vendor = i;
 +                    }
 +                }
 +            }
 +        }
 +        fclose(fp);
 +    }
 +#endif
 +
 +    return vendor;
 +}
 +
 +
 +
 +int
 +gmx_cpuid_topology(gmx_cpuid_t        cpuid,
 +                   int *              nprocessors,
 +                   int *              npackages,
 +                   int *              ncores_per_package,
 +                   int *              nhwthreads_per_core,
 +                   const int **       package_id,
 +                   const int **       core_id,
 +                   const int **       hwthread_id,
 +                   const int **       locality_order)
 +{
 +    int rc;
 +
 +    if (cpuid->have_cpu_topology)
 +    {
 +        *nprocessors          = cpuid->nproc;
 +        *npackages            = cpuid->npackages;
 +        *ncores_per_package   = cpuid->ncores_per_package;
 +        *nhwthreads_per_core  = cpuid->nhwthreads_per_core;
 +        *package_id           = cpuid->package_id;
 +        *core_id              = cpuid->core_id;
 +        *hwthread_id          = cpuid->hwthread_id;
 +        *locality_order       = cpuid->locality_order;
 +        rc                    = 0;
 +    }
 +    else
 +    {
 +        rc = -1;
 +    }
 +    return rc;
 +}
 +
 +
 +enum gmx_cpuid_x86_smt
 +gmx_cpuid_x86_smt(gmx_cpuid_t cpuid)
 +{
 +    enum gmx_cpuid_x86_smt rc;
 +
 +    if (cpuid->have_cpu_topology)
 +    {
 +        rc = (cpuid->nhwthreads_per_core > 1) ? GMX_CPUID_X86_SMT_ENABLED : GMX_CPUID_X86_SMT_DISABLED;
 +    }
 +    else if (cpuid->vendor == GMX_CPUID_VENDOR_AMD || gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_HTT) == 0)
 +    {
 +        rc = GMX_CPUID_X86_SMT_DISABLED;
 +    }
 +    else
 +    {
 +        rc = GMX_CPUID_X86_SMT_CANNOTDETECT;
 +    }
 +    return rc;
 +}
 +
 +
 +int
 +gmx_cpuid_init               (gmx_cpuid_t *              pcpuid)
 +{
 +    gmx_cpuid_t cpuid;
 +    int         i;
 +    FILE *      fp;
 +    char        buffer[255],buffer2[255];
 +    int         found_brand;
 +
 +    cpuid = malloc(sizeof(*cpuid));
 +
 +    *pcpuid = cpuid;
 +
 +    for (i = 0; i < GMX_CPUID_NFEATURES; i++)
 +    {
 +        cpuid->feature[i] = 0;
 +    }
 +
 +    cpuid->have_cpu_topology   = 0;
 +    cpuid->nproc               = 0;
 +    cpuid->npackages           = 0;
 +    cpuid->ncores_per_package  = 0;
 +    cpuid->nhwthreads_per_core = 0;
 +    cpuid->package_id          = NULL;
 +    cpuid->core_id             = NULL;
 +    cpuid->hwthread_id         = NULL;
 +    cpuid->locality_order      = NULL;
 +
 +    cpuid->vendor = cpuid_check_vendor();
 +
 +    switch (cpuid->vendor)
 +    {
 +#ifdef GMX_CPUID_X86
 +        case GMX_CPUID_VENDOR_INTEL:
 +            cpuid_check_intel_x86(cpuid);
 +            break;
 +        case GMX_CPUID_VENDOR_AMD:
 +            cpuid_check_amd_x86(cpuid);
 +            break;
 +#endif
 +        default:
 +            /* Default value */
 +            strncpy(cpuid->brand,"Unknown CPU brand",GMX_CPUID_BRAND_MAXLEN);
 +#if defined(__linux__) || defined(__linux)
 +            /* General Linux. Try to get CPU type from /proc/cpuinfo */
 +            if( (fp = fopen("/proc/cpuinfo","r")) != NULL)
 +            {
 +                found_brand = 0;
 +                while( (found_brand==0) && (fgets(buffer,sizeof(buffer),fp) !=NULL))
 +                {
 +                    chomp_substring_before_colon(buffer,buffer2,sizeof(buffer2));
 +                    /* Intel uses "model name", Fujitsu and IBM "cpu". */
 +                    if( !strcmp(buffer2,"model name") || !strcmp(buffer2,"cpu"))
 +                    {
 +                        chomp_substring_after_colon(buffer,cpuid->brand,GMX_CPUID_BRAND_MAXLEN);
 +                        found_brand = 1;
 +                    }
 +                }
 +                fclose(fp);
 +            }
 +#endif
 +            cpuid->family         = 0;
 +            cpuid->model          = 0;
 +            cpuid->stepping       = 0;
 +            
 +            for(i=0; i<GMX_CPUID_NFEATURES; i++)
 +            {
 +                cpuid->feature[i]=0;
 +            }
 +            cpuid->feature[GMX_CPUID_FEATURE_CANNOTDETECT] = 1;
 +            break;
 +    }
 +    return 0;
 +}
 +
 +
 +
 +void
 +gmx_cpuid_done               (gmx_cpuid_t              cpuid)
 +{
 +    free(cpuid);
 +}
 +
 +
 +int
 +gmx_cpuid_formatstring       (gmx_cpuid_t              cpuid,
 +                              char *                   str,
 +                              int                      n)
 +{
 +    int                     c;
 +    int                     i;
 +    enum gmx_cpuid_feature  feature;
 +
 +#ifdef _MSC_VER
 +    _snprintf(str, n,
 +              "Vendor: %s\n"
 +              "Brand:  %s\n"
 +              "Family: %2d  Model: %2d  Stepping: %2d\n"
 +              "Features:",
 +              gmx_cpuid_vendor_string[gmx_cpuid_vendor(cpuid)],
 +              gmx_cpuid_brand(cpuid),
 +              gmx_cpuid_family(cpuid), gmx_cpuid_model(cpuid), gmx_cpuid_stepping(cpuid));
 +#else
 +    snprintf(str, n,
 +             "Vendor: %s\n"
 +             "Brand:  %s\n"
 +             "Family: %2d  Model: %2d  Stepping: %2d\n"
 +             "Features:",
 +             gmx_cpuid_vendor_string[gmx_cpuid_vendor(cpuid)],
 +             gmx_cpuid_brand(cpuid),
 +             gmx_cpuid_family(cpuid), gmx_cpuid_model(cpuid), gmx_cpuid_stepping(cpuid));
 +#endif
 +
 +    str[n-1] = '\0';
 +    c        = strlen(str);
 +    n       -= c;
 +    str     += c;
 +
 +    for (feature = GMX_CPUID_FEATURE_CANNOTDETECT; feature < GMX_CPUID_NFEATURES; feature++)
 +    {
 +        if (gmx_cpuid_feature(cpuid, feature) == 1)
 +        {
 +#ifdef _MSC_VER
 +            _snprintf(str, n, " %s", gmx_cpuid_feature_string[feature]);
 +#else
 +            snprintf(str, n, " %s", gmx_cpuid_feature_string[feature]);
 +#endif
 +            str[n-1] = '\0';
 +            c        = strlen(str);
 +            n       -= c;
 +            str     += c;
 +        }
 +    }
 +#ifdef _MSC_VER
 +    _snprintf(str, n, "\n");
 +#else
 +    snprintf(str, n, "\n");
 +#endif
 +    str[n-1] = '\0';
 +
 +    return 0;
 +}
 +
 +
 +
 +enum gmx_cpuid_acceleration
 +gmx_cpuid_acceleration_suggest  (gmx_cpuid_t                 cpuid)
 +{
 +    enum gmx_cpuid_acceleration  tmpacc;
 +
 +    tmpacc = GMX_CPUID_ACCELERATION_NONE;
 +
 +    if (gmx_cpuid_vendor(cpuid) == GMX_CPUID_VENDOR_INTEL)
 +    {
 +        if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_AVX))
 +        {
 +            tmpacc = GMX_CPUID_ACCELERATION_X86_AVX_256;
 +        }
 +        else if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_SSE4_1))
 +        {
 +            tmpacc = GMX_CPUID_ACCELERATION_X86_SSE4_1;
 +        }
 +        else if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_SSE2))
 +        {
 +            tmpacc = GMX_CPUID_ACCELERATION_X86_SSE2;
 +        }
 +    }
 +    else if (gmx_cpuid_vendor(cpuid) == GMX_CPUID_VENDOR_AMD)
 +    {
 +        if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_AVX))
 +        {
 +            tmpacc = GMX_CPUID_ACCELERATION_X86_AVX_128_FMA;
 +        }
 +        else if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_SSE4_1))
 +        {
 +            tmpacc = GMX_CPUID_ACCELERATION_X86_SSE4_1;
 +        }
 +        else if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_SSE2))
 +        {
 +            tmpacc = GMX_CPUID_ACCELERATION_X86_SSE2;
 +        }
 +    }
 +    else if(gmx_cpuid_vendor(cpuid)==GMX_CPUID_VENDOR_FUJITSU)
 +    {
 +        if(strstr(gmx_cpuid_brand(cpuid),"SPARC64"))
 +        {
 +            tmpacc = GMX_CPUID_ACCELERATION_SPARC64_HPC_ACE;
 +        }
 +    }
++    else if(gmx_cpuid_vendor(cpuid)==GMX_CPUID_VENDOR_IBM)
++    {
++        if(strstr(gmx_cpuid_brand(cpuid),"A2"))
++        {
++            tmpacc = GMX_CPUID_ACCELERATION_IBM_QPX;
++        }
++    }
 +    return tmpacc;
 +}
 +
 +
 +
 +int
 +gmx_cpuid_acceleration_check(gmx_cpuid_t   cpuid,
 +                             FILE *        log)
 +{
 +    int                           rc;
 +    char                          str[1024];
 +    enum gmx_cpuid_acceleration   acc;
 +
 +    acc = gmx_cpuid_acceleration_suggest(cpuid);
 +
 +    rc = (acc != compiled_acc);
 +
 +    gmx_cpuid_formatstring(cpuid, str, 1023);
 +    str[1023] = '\0';
 +
 +    if (log != NULL)
 +    {
 +        fprintf(log,
 +                "\nDetecting CPU-specific acceleration.\nPresent hardware specification:\n"
 +                "%s"
 +                "Acceleration most likely to fit this hardware: %s\n"
 +                "Acceleration selected at GROMACS compile time: %s\n\n",
 +                str,
 +                gmx_cpuid_acceleration_string[acc],
 +                gmx_cpuid_acceleration_string[compiled_acc]);
 +    }
 +
 +    if (rc != 0)
 +    {
 +        if (log != NULL)
 +        {
 +            fprintf(log, "\nBinary not matching hardware - you might be losing performance.\n"
 +                    "Acceleration most likely to fit this hardware: %s\n"
 +                    "Acceleration selected at GROMACS compile time: %s\n\n",
 +                    gmx_cpuid_acceleration_string[acc],
 +                    gmx_cpuid_acceleration_string[compiled_acc]);
 +        }
 +        printf("Compiled acceleration: %s (Gromacs could use %s on this machine, which is better)\n",
 +               gmx_cpuid_acceleration_string[compiled_acc],
 +               gmx_cpuid_acceleration_string[acc]);
 +    }
 +    return rc;
 +}
 +
 +
 +#ifdef GMX_CPUID_STANDALONE
 +/* Stand-alone program to enable queries of CPU features from CMake.
 + * Note that you need to check inline ASM capabilities before compiling and set
 + * -DGMX_X86_GCC_INLINE_ASM for the cpuid instruction to work...
 + */
 +int
 +main(int argc, char **argv)
 +{
 +    gmx_cpuid_t                   cpuid;
 +    enum gmx_cpuid_acceleration   acc;
 +    int                           i, cnt;
 +
 +    if (argc < 2)
 +    {
 +        fprintf(stdout,
 +                "Usage:\n\n%s [flags]\n\n"
 +                "Available flags:\n"
 +                "-vendor        Print CPU vendor.\n"
 +                "-brand         Print CPU brand string.\n"
 +                "-family        Print CPU family version.\n"
 +                "-model         Print CPU model version.\n"
 +                "-stepping      Print CPU stepping version.\n"
 +                "-features      Print CPU feature flags.\n"
 +                "-acceleration  Print suggested GROMACS acceleration.\n",
 +                argv[0]);
 +        exit(0);
 +    }
 +
 +    gmx_cpuid_init(&cpuid);
 +
 +    if (!strncmp(argv[1], "-vendor", 3))
 +    {
 +        printf("%s\n", gmx_cpuid_vendor_string[cpuid->vendor]);
 +    }
 +    else if (!strncmp(argv[1], "-brand", 3))
 +    {
 +        printf("%s\n", cpuid->brand);
 +    }
 +    else if (!strncmp(argv[1], "-family", 3))
 +    {
 +        printf("%d\n", cpuid->family);
 +    }
 +    else if (!strncmp(argv[1], "-model", 3))
 +    {
 +        printf("%d\n", cpuid->model);
 +    }
 +    else if (!strncmp(argv[1], "-stepping", 3))
 +    {
 +        printf("%d\n", cpuid->stepping);
 +    }
 +    else if (!strncmp(argv[1], "-features", 3))
 +    {
 +        cnt = 0;
 +        for (i = 0; i < GMX_CPUID_NFEATURES; i++)
 +        {
 +            if (cpuid->feature[i] == 1)
 +            {
 +                if (cnt++ > 0)
 +                {
 +                    printf(" ");
 +                }
 +                printf("%s", gmx_cpuid_feature_string[i]);
 +            }
 +        }
 +        printf("\n");
 +    }
 +    else if (!strncmp(argv[1], "-acceleration", 3))
 +    {
 +        acc = gmx_cpuid_acceleration_suggest(cpuid);
 +        fprintf(stdout, "%s\n", gmx_cpuid_acceleration_string[acc]);
 +    }
 +
 +    gmx_cpuid_done(cpuid);
 +
 +
 +    return 0;
 +}
 +
 +#endif
diff --cc src/gromacs/legacyheaders/gmx_cpuid.h
index fdbf854dc4a697cab9a7be52d59bbfdd73abbfc2,0000000000000000000000000000000000000000..4aefc72a57f102877447e2ebb06cd2338f09c33a
mode 100644,000000..100644
--- /dev/null
+++ b/src/gromacs/legacyheaders/gmx_cpuid.h
@@@ -1,298 -1,0 +1,299 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + * This file is part of GROMACS.
 + * Copyright (c) 2012-
 + *
 + * Written by the Gromacs development team under coordination of
 + * David van der Spoel, Berk Hess, and Erik Lindahl.
 + *
 + * This library is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org
 + *
 + * And Hey:
 + * Gnomes, ROck Monsters And Chili Sauce
 + */
 +#ifndef GMX_CPUID_H_
 +#define GMX_CPUID_H_
 +
 +#include <stdio.h>
 +
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +#if 0
 +} /* fixes auto-indentation problems */
 +#endif
 +
 +
 +/* Currently identifiable CPU Vendors */
 +enum gmx_cpuid_vendor
 +{
 +    GMX_CPUID_VENDOR_CANNOTDETECT,   /* Should only be used if something fails */
 +    GMX_CPUID_VENDOR_UNKNOWN,
 +    GMX_CPUID_VENDOR_INTEL,
 +    GMX_CPUID_VENDOR_AMD,
 +    GMX_CPUID_VENDOR_FUJITSU,
 +    GMX_CPUID_VENDOR_IBM,
 +    GMX_CPUID_NVENDORS
 +};
 +
 +
 +/* CPU feature/property list, to be used as indices into the feature array of the
 + * gmx_cpuid_t data structure.
 + *
 + * To facilitate looking things up, we keep this list alphabetical.
 + * The list is NOT exhaustive - we have basically added stuff that might be
 + * useful in an application like Gromacs.
 + *
 + * AMD and Intel tend to share most architectural elements, and even if the
 + * flags might have to be detected in different ways (different cpuid registers),
 + * once the flag is present the functions should be identical. Unfortunately the
 + * trend right now (2012) seems to be that they are diverging. This means that
 + * we need to use specific flags to the compiler to maximize performance, and
 + * then the binaries might not be portable between Intel and AMD as they were
 + * before when we only needed to check for SSE and/or SSE2 support in Gromacs.
 + */
 +enum gmx_cpuid_feature
 +{
 +    GMX_CPUID_FEATURE_CANNOTDETECT,      /* Flag set if we could not detect on this CPU  */
 +    GMX_CPUID_FEATURE_X86_AES,           /* x86 advanced encryption standard accel.      */
 +    GMX_CPUID_FEATURE_X86_APIC,          /* APIC support                                 */
 +    GMX_CPUID_FEATURE_X86_AVX,           /* Advanced vector extensions                   */
 +    GMX_CPUID_FEATURE_X86_AVX2,          /* AVX2 including gather support (not used yet) */
 +    GMX_CPUID_FEATURE_X86_CLFSH,         /* Supports CLFLUSH instruction                 */
 +    GMX_CPUID_FEATURE_X86_CMOV,          /* Conditional move insn support                */
 +    GMX_CPUID_FEATURE_X86_CX8,           /* Supports CMPXCHG8B (8-byte compare-exchange) */
 +    GMX_CPUID_FEATURE_X86_CX16,          /* Supports CMPXCHG16B (16-byte compare-exchg)  */
 +    GMX_CPUID_FEATURE_X86_F16C,          /* Supports 16-bit FP conversion instructions   */
 +    GMX_CPUID_FEATURE_X86_FMA,           /* Fused-multiply add support (mainly for AVX)  */
 +    GMX_CPUID_FEATURE_X86_FMA4,          /* 4-operand FMA, only on AMD for now           */
 +    GMX_CPUID_FEATURE_X86_HTT,           /* Hyper-Threading supported                    */
 +    GMX_CPUID_FEATURE_X86_LAHF_LM,       /* LAHF/SAHF support in 64 bits                 */
 +    GMX_CPUID_FEATURE_X86_MISALIGNSSE,   /* Support for misaligned SSE data instructions */
 +    GMX_CPUID_FEATURE_X86_MMX,           /* MMX registers and instructions               */
 +    GMX_CPUID_FEATURE_X86_MSR,           /* Supports Intel model-specific-registers      */
 +    GMX_CPUID_FEATURE_X86_NONSTOP_TSC,   /* Invariant TSC (constant rate in ACPI states) */
 +    GMX_CPUID_FEATURE_X86_PCID,          /* Process context identifier support           */
 +    GMX_CPUID_FEATURE_X86_PCLMULDQ,      /* Carry-less 64-bit multiplication supported   */
 +    GMX_CPUID_FEATURE_X86_PDCM,          /* Perfmon and Debug Capability                 */
 +    GMX_CPUID_FEATURE_X86_PDPE1GB,       /* Support for 1GB pages                        */
 +    GMX_CPUID_FEATURE_X86_POPCNT,        /* Supports the POPCNT (population count) insn  */
 +    GMX_CPUID_FEATURE_X86_PSE,           /* Supports 4MB-pages (page size extension)     */
 +    GMX_CPUID_FEATURE_X86_RDRND,         /* RDRAND high-quality hardware random numbers  */
 +    GMX_CPUID_FEATURE_X86_RDTSCP,        /* Serializing rdtscp instruction available     */
 +    GMX_CPUID_FEATURE_X86_SSE2,          /* SSE 2                                        */
 +    GMX_CPUID_FEATURE_X86_SSE3,          /* SSE 3                                        */
 +    GMX_CPUID_FEATURE_X86_SSE4A,         /* SSE 4A                                       */
 +    GMX_CPUID_FEATURE_X86_SSE4_1,        /* SSE 4.1                                      */
 +    GMX_CPUID_FEATURE_X86_SSE4_2,        /* SSE 4.2                                      */
 +    GMX_CPUID_FEATURE_X86_SSSE3,         /* Supplemental SSE3                            */
 +    GMX_CPUID_FEATURE_X86_TDT,           /* TSC deadline timer                           */
 +    GMX_CPUID_FEATURE_X86_X2APIC,        /* Extended xAPIC Support                       */
 +    GMX_CPUID_FEATURE_X86_XOP,           /* AMD extended instructions, only AMD for now  */
 +    GMX_CPUID_NFEATURES
 +};
 +
 +
 +/* Currently supported acceleration instruction sets, intrinsics or other similar combinations
 + * in Gromacs. There is not always a 1-to-1 correspondence with feature flags; on some AMD
 + * hardware we prefer to use 128bit AVX instructions (although 256-bit ones could be executed),
 + * and we still haven't written the AVX2 kernels.
 + */
 +enum gmx_cpuid_acceleration
 +{
 +    GMX_CPUID_ACCELERATION_CANNOTDETECT,    /* Should only be used if something fails */
 +    GMX_CPUID_ACCELERATION_NONE,
 +    GMX_CPUID_ACCELERATION_X86_SSE2,
 +    GMX_CPUID_ACCELERATION_X86_SSE4_1,
 +    GMX_CPUID_ACCELERATION_X86_AVX_128_FMA,
 +    GMX_CPUID_ACCELERATION_X86_AVX_256,
 +    GMX_CPUID_ACCELERATION_SPARC64_HPC_ACE,
++    GMX_CPUID_ACCELERATION_IBM_QPX,
 +    GMX_CPUID_NACCELERATIONS
 +};
 +
 +/* Text strings corresponding to CPU vendors */
 +extern const char *
 +gmx_cpuid_vendor_string[GMX_CPUID_NVENDORS];
 +
 +/* Text strings for CPU feature indices */
 +extern const char *
 +gmx_cpuid_feature_string[GMX_CPUID_NFEATURES];
 +
 +/* Text strings for Gromacs acceleration/instruction sets */
 +extern const char *
 +gmx_cpuid_acceleration_string[GMX_CPUID_NACCELERATIONS];
 +
 +
 +/* Abstract data type with CPU detection information. Set by gmx_cpuid_init(). */
 +typedef struct gmx_cpuid *
 +    gmx_cpuid_t;
 +
 +
 +/* Fill the data structure by using CPU detection instructions.
 + * Return 0 on success, 1 if something bad happened.
 + */
 +int
 +gmx_cpuid_init              (gmx_cpuid_t *              cpuid);
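 +
 +/* Typical call sequence (illustrative sketch, not from the original
 + * sources):
 + *
 + *     gmx_cpuid_t cpuid;
 + *     gmx_cpuid_init(&cpuid);
 + *     if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_SSE4_1))
 + *     {
 + *         (use an SSE4.1 code path)
 + *     }
 + *     gmx_cpuid_done(cpuid);
 + */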
 +
 +
 +/* Return the vendor id as enumerated type. Use gmx_cpuid_vendor_string[]
 + * to get the corresponding text string.
 + */
 +enum gmx_cpuid_vendor
 +gmx_cpuid_vendor            (gmx_cpuid_t                cpuid);
 +
 +
 +/* Return a constant pointer to the processor brand string. */
 +const char *
 +gmx_cpuid_brand             (gmx_cpuid_t                cpuid);
 +
 +
 +/* Return processor family version. For a chip of version 1.2.3, this is 1 */
 +int
 +gmx_cpuid_family            (gmx_cpuid_t                cpuid);
 +
 +/* Return processor model version. For a chip of version 1.2.3, this is 2. */
 +int
 +gmx_cpuid_model             (gmx_cpuid_t                cpuid);
 +
 +/* Return processor stepping version. For a chip of version 1.2.3, this is 3. */
 +int
 +gmx_cpuid_stepping          (gmx_cpuid_t                cpuid);
 +
 +
 +/* Check whether a particular CPUID feature is set.
 + * Returns 0 if flag "feature" is not set, 1 if the flag is set. We cannot use
 + * gmx_bool here since this file must be possible to compile without simple.h.
 + */
 +int
 +gmx_cpuid_feature           (gmx_cpuid_t                cpuid,
 +                             enum gmx_cpuid_feature     feature);
 +
 +
 +/* Return pointers to cpu topology information.
 + *
 + * Important: CPU topology requires more OS support than most other
 + * functions in this file, including support for thread pinning to hardware.
 + * This means it will not work on some platforms, including e.g. Mac OS X.
 + * Thus, it is IMPERATIVE that you check the return value from this routine
 + * before doing anything with the information. It is only if the return
 + * value is zero that the data is valid.
 + *
 + * For the returned values we have:
 + * - nprocessors         Total number of logical processors reported by OS
 + * - npackages           Usually number of CPU sockets
 + * - ncores_per_package  Number of cores in each package
 + * - nhwthreads_per_core Number of hardware threads per core; 2 for hyperthreading.
 + * - package_id          Array with the package index for each logical cpu
 + * - core_id             Array with local core index for each logical cpu
 + * - hwthread_id         Array with local hwthread index for each logical cpu
 + * - locality_order      Array with logical cpu numbers, sorted in order
 + *                       of physical and logical locality in the system.
 + *
 + * All arrays are of length nprocessors.
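 + *
 + * Usage sketch (illustrative): only read the outputs when the call
 + * succeeds:
 + *
 + *     int nproc, npkg, ncores, nhwt;
 + *     const int *pkg, *core, *hwt, *order;
 + *     if (gmx_cpuid_topology(cpuid, &nproc, &npkg, &ncores, &nhwt,
 + *                            &pkg, &core, &hwt, &order) == 0)
 + *     {
 + *         (topology data is valid here)
 + *     }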
 + */
 +int
 +gmx_cpuid_topology(gmx_cpuid_t        cpuid,
 +                   int *              nprocessors,
 +                   int *              npackages,
 +                   int *              ncores_per_package,
 +                   int *              nhwthreads_per_core,
 +                   const int **       package_id,
 +                   const int **       core_id,
 +                   const int **       hwthread_id,
 +                   const int **       locality_order);
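
Because topology detection can fail on platforms without the required OS
support, callers must gate on the return value before touching any of the
output arguments. A minimal sketch of the intended calling pattern (the
variable names and the printing are illustrative, not part of the API):

    /* Hypothetical caller; assumes cpuid was set up with gmx_cpuid_init() */
    int        nproc, npkg, ncores, nhwt;
    const int *pkg_id, *core_id, *hwt_id, *order;

    if (gmx_cpuid_topology(cpuid, &nproc, &npkg, &ncores, &nhwt,
                           &pkg_id, &core_id, &hwt_id, &order) == 0)
    {
        /* Only here is the data valid */
        printf("%d logical CPUs in %d package(s)\n", nproc, npkg);
    }
    else
    {
        /* No topology support on this platform; fall back gracefully */
    }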
 +
 +/* Enumerated values for x86 SMT enabled-status. Note that this does not refer
 + * to Hyper-Threading support (that is the flag GMX_CPUID_FEATURE_X86_HTT), but
 + * to whether Hyper-Threading is currently _enabled_ (e.g. in the BIOS) and _used_.
 + */
 +enum gmx_cpuid_x86_smt
 +{
 +    GMX_CPUID_X86_SMT_CANNOTDETECT,
 +    GMX_CPUID_X86_SMT_DISABLED,
 +    GMX_CPUID_X86_SMT_ENABLED
 +};
 +
 +/* Returns the status of x86 SMT support. IMPORTANT: There are non-zero
 + * return values for this routine that still do not indicate supported and
 + * enabled SMT/Hyper-Threading. You need to carefully check the return value
 + * against the enumerated type values to see what you are getting.
 + *
 + * Long-term, this functionality will move to a new hardware topology detection
 + * layer, but that will require a lot of new code and a working interface to the
 + * hwloc library. Surprisingly, there is no simple way to find out whether
 + * Hyper-Threading is actually turned on without fully enumerating and checking
 + * all the cores, which we presently can only do on Linux. This means a couple
 + * of things:
 + *
 + * 1) If you want to know whether your CPU _supports_ Hyper-Threading in the
 + *    first place, check the GMX_CPUID_FEATURE_X86_HTT flag instead!
 + * 2) There are several scenarios where this routine will say that it cannot
 + *    detect whether SMT is enabled and used right now.
 + * 3) If you need support on non-Linux x86, you have to write it :-)
 + * 4) Don't invest too much effort, since this will be replaced with
 + *    full hardware topology detection in the future.
 + * 5) Don't worry if the detection does not work. It is not a catastrophe;
 + *    we merely get slightly better performance on x86 if we use
 + *    Hyper-Threading cores in direct space, but not in reciprocal space.
 + *
 + * Since this routine presently only supports Hyper-Threading we say X86_SMT
 + * in order not to give the impression we can detect any SMT. We haven't
 + * even tested the performance on other SMT implementations, so it is not
 + * obvious we shouldn't use SMT there.
 + *
 + * Note that you can get more complete topology information from
 + * gmx_cpuid_topology(), although that requires slightly more OS support.
 + */
 +enum gmx_cpuid_x86_smt
 +gmx_cpuid_x86_smt(gmx_cpuid_t cpuid);
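
Since GMX_CPUID_X86_SMT_DISABLED and GMX_CPUID_X86_SMT_CANNOTDETECT are both
non-"enabled" outcomes, compare the result against the enum rather than
treating it as a boolean. A hedged sketch of a caller:

    /* Illustrative only: treat anything but _ENABLED as "no SMT in use" */
    switch (gmx_cpuid_x86_smt(cpuid))
    {
        case GMX_CPUID_X86_SMT_ENABLED:
            /* OK to place extra threads on Hyper-Threading cores */
            break;
        case GMX_CPUID_X86_SMT_DISABLED:
        case GMX_CPUID_X86_SMT_CANNOTDETECT:
        default:
            /* Detection failed or SMT is off; make no SMT assumptions */
            break;
    }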
 +
 +
 +/* Formats a text string (up to n characters) from the data structure.
 + * The output will have max 80 chars between newline characters.
 + */
 +int
 +gmx_cpuid_formatstring      (gmx_cpuid_t                cpuid,
 +                             char *                     s,
 +                             int                        n);
 +
 +
 +/* Suggests a suitable Gromacs acceleration based on the support in the
 + * hardware.
 + */
 +enum gmx_cpuid_acceleration
 +gmx_cpuid_acceleration_suggest  (gmx_cpuid_t                    cpuid);
 +
 +
 +/* Check if this binary was compiled with the same acceleration as we
 + * would suggest for the current hardware. Always print stats to the log file
 + * if it is non-NULL, and print a warning to stdout if we don't have a match.
 + */
 +int
 +gmx_cpuid_acceleration_check    (gmx_cpuid_t                cpuid,
 +                                 FILE *                     log);
 +
 +
 +/* Release resources used by data structure. Note that the pointer to the
 + * CPU brand string will no longer be valid once this routine has been called.
 + */
 +void
 +gmx_cpuid_done              (gmx_cpuid_t                cpuid);
 +
 +
 +
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +
 +#endif /* GMX_CPUID_H_ */
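
Taken together, the lifecycle of the API above is init, query, done. A short
hypothetical example (error handling abbreviated; print_cpu_summary is an
illustrative caller, not part of the header):

    #include <stdio.h>
    #include "gmx_cpuid.h"

    int print_cpu_summary(void)
    {
        gmx_cpuid_t cpuid;
        char        buf[1024];

        if (gmx_cpuid_init(&cpuid) != 0)
        {
            return 1; /* detection failed */
        }
        printf("Vendor: %s\n", gmx_cpuid_vendor_string[gmx_cpuid_vendor(cpuid)]);
        printf("Brand:  %s\n", gmx_cpuid_brand(cpuid));
        if (gmx_cpuid_feature(cpuid, GMX_CPUID_FEATURE_X86_HTT))
        {
            printf("Hyper-Threading supported (not necessarily enabled)\n");
        }
        gmx_cpuid_formatstring(cpuid, buf, sizeof(buf));
        printf("%s\n", buf);
        gmx_cpuid_done(cpuid); /* brand string pointer is invalid after this */
        return 0;
    }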
index 9cd3767d5ed51f0ee6db9abf0496ed18a9417863,0000000000000000000000000000000000000000..37e94880ef1bcaa1fd1850ca8ded0870dc5fd3be
mode 100644,000000..100644
--- /dev/null
@@@ -1,587 -1,0 +1,536 @@@
- /* For topology exclusion pair checking we need: ((a & b) ? True : False)
-  * when we do a bit-wise and between a and b.
-  * When integer SIMD operations are present, we use gmx_checkbitmask_epi32(a, b)
-  * Otherwise we do all operations, except for the set1, in reals.
-  */
- #define GMX_SIMD_HAVE_CHECKBITMASK_EPI32
- #ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
- #define gmx_set1_epi32          gmx_simd_ref_set1_epi32
- #define gmx_load_si             gmx_simd_ref_load_si
- #define gmx_checkbitmask_epi32  gmx_simd_ref_checkbitmask_epi32
- #endif
- /* #define GMX_SIMD_HAVE_CHECKBITMASK_PR */
- #ifdef GMX_SIMD_HAVE_CHECKBITMASK_PR
- #define gmx_castsi_pr           gmx_simd_ref_castsi_pr
- /* As gmx_checkbitmask_epi32, but operates on reals. In double precision two
-  * identical 32-bit masks are set in one double and one or both can be used.
-  */
- #define gmx_checkbitmask_pr     gmx_simd_ref_checkbitmask_pr
- #endif
 +/*
 + * This file is part of the GROMACS molecular simulation package.
 + *
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + * Copyright (c) 2012, by the GROMACS development team, led by
 + * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 + * others, as listed in the AUTHORS file in the top-level source
 + * directory and at http://www.gromacs.org.
 + *
 + * GROMACS is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2.1
 + * of the License, or (at your option) any later version.
 + *
 + * GROMACS is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with GROMACS; if not, see
 + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 + *
 + * If you want to redistribute modifications to GROMACS, please
 + * consider that scientific software is very special. Version
 + * control is crucial - bugs must be traceable. We will be happy to
 + * consider code for inclusion in the official distribution, but
 + * derived work must not be called official GROMACS. Details are found
 + * in the README & COPYING files - if they are missing, get the
 + * official version at http://www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org.
 + */
 +
 +/* The macros in this file are intended to be used for writing
 + * architecture-independent SIMD intrinsics code.
 + * To support a new architecture, adding macros here should be (nearly)
 + * all that is needed.
 + */
 +
 +#ifdef _gmx_simd_macros_h_
 +#error "gmx_simd_macros.h included twice"
 +#else
 +#define _gmx_simd_macros_h_
 +
 +/* NOTE: SSE2 acceleration does not include floor or blendv */
 +
 +
 +/* Uncomment the next line, without other SIMD active, for testing plain-C */
 +/* #define GMX_SIMD_REFERENCE_PLAIN_C */
 +#ifdef GMX_SIMD_REFERENCE_PLAIN_C
 +/* Plain C SIMD reference implementation, also serves as documentation */
 +#define GMX_HAVE_SIMD_MACROS
 +
 +/* In general the reference SIMD supports any SIMD width, including 1.
 + * For the nbnxn 4xn kernels all widths (2, 4 and 8) are supported.
 + * The nbnxn 2xnn kernels are currently not supported.
 + */
 +#define GMX_SIMD_REF_WIDTH  4
 +
 +/* Include plain-C reference implementation, also serves as documentation */
 +#include "gmx_simd_ref.h"
 +
 +#define GMX_SIMD_WIDTH_HERE  GMX_SIMD_REF_WIDTH
 +
 +/* float/double SIMD register type */
 +#define gmx_mm_pr  gmx_simd_ref_pr
 +
 +/* boolean SIMD register type */
 +#define gmx_mm_pb  gmx_simd_ref_pb
 +
 +/* integer SIMD register type, only for table indexing and exclusion masks */
 +#define gmx_epi32  gmx_simd_ref_epi32
 +#define GMX_SIMD_EPI32_WIDTH  GMX_SIMD_REF_EPI32_WIDTH
 +
 +/* Load GMX_SIMD_WIDTH_HERE reals from memory starting at r */
 +#define gmx_load_pr       gmx_simd_ref_load_pr
 +/* Set all SIMD register elements to *r */
 +#define gmx_load1_pr      gmx_simd_ref_load1_pr
 +#define gmx_set1_pr       gmx_simd_ref_set1_pr
 +#define gmx_setzero_pr    gmx_simd_ref_setzero_pr
 +#define gmx_store_pr      gmx_simd_ref_store_pr
 +
 +#define gmx_add_pr        gmx_simd_ref_add_pr
 +#define gmx_sub_pr        gmx_simd_ref_sub_pr
 +#define gmx_mul_pr        gmx_simd_ref_mul_pr
 +/* For the FMA macros below, aim for c=d in code, so FMA3 uses 1 instruction */
 +#define gmx_madd_pr       gmx_simd_ref_madd_pr
 +#define gmx_nmsub_pr      gmx_simd_ref_nmsub_pr
 +
 +#define gmx_max_pr        gmx_simd_ref_max_pr
 +#define gmx_blendzero_pr  gmx_simd_ref_blendzero_pr
 +
 +#define gmx_round_pr      gmx_simd_ref_round_pr
 +
 +/* Not required, only used to speed up the nbnxn tabulated PME kernels */
 +#define GMX_SIMD_HAVE_FLOOR
 +#ifdef GMX_SIMD_HAVE_FLOOR
 +#define gmx_floor_pr      gmx_simd_ref_floor_pr
 +#endif
 +
 +/* Not required, only used when blendv is faster than comparison */
 +#define GMX_SIMD_HAVE_BLENDV
 +#ifdef GMX_SIMD_HAVE_BLENDV
 +#define gmx_blendv_pr     gmx_simd_ref_blendv_pr
 +#endif
 +
 +/* Copy the sign of a to b, assumes b >= 0 for efficiency */
 +#define gmx_cpsgn_nonneg_pr  gmx_simd_ref_cpsgn_nonneg_pr
 +
 +/* Very specific operation required in the non-bonded kernels */
 +#define gmx_masknot_add_pr   gmx_simd_ref_masknot_add_pr
 +
 +/* Comparison */
 +#define gmx_cmplt_pr      gmx_simd_ref_cmplt_pr
 +
 +/* Logical operations on SIMD booleans */
 +#define gmx_and_pb        gmx_simd_ref_and_pb
 +#define gmx_or_pb         gmx_simd_ref_or_pb
 +
 +/* Not required; gmx_anytrue_pb(x) returns whether any of the booleans in x is True.
 + * If this is not present, define GMX_SIMD_IS_TRUE(real x),
 + * which should return x==True, where True is True as represented in SIMD.
 + */
 +#define GMX_SIMD_HAVE_ANYTRUE
 +#ifdef GMX_SIMD_HAVE_ANYTRUE
 +#define gmx_anytrue_pb    gmx_simd_ref_anytrue_pb
 +#else
 +/* If we don't have gmx_anytrue_pb, we need to store gmx_mm_pb */
 +#define gmx_store_pb      gmx_simd_ref_store_pb
 +#endif
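
When GMX_SIMD_HAVE_ANYTRUE is not defined, an "any true" test has to be built
from gmx_store_pb and GMX_SIMD_IS_TRUE. A sketch of that fallback, assuming
booleans are stored as real-sized elements as in the reference SIMD (buffer
alignment requirements are glossed over here):

    /* Hypothetical fallback for gmx_anytrue_pb on the reference SIMD */
    static gmx_inline int any_true_fallback(gmx_mm_pb b)
    {
        real buf[GMX_SIMD_WIDTH_HERE];
        int  i;

        gmx_store_pb(buf, b);                    /* store the boolean register */
        for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
        {
            if (GMX_SIMD_IS_TRUE(buf[i]))
            {
                return 1;                        /* at least one element true  */
            }
        }
        return 0;
    }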
 +
- #define GMX_SIMD_HAVE_CHECKBITMASK_EPI32
- #define gmx_set1_epi32    _mm_set1_epi32
- #define gmx_load_si(i)    _mm_load_si128((__m128i *) (i))
- #define gmx_checkbitmask_epi32(m0, m1) gmx_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128()))
 +/* Conversions only used for PME table lookup */
 +#define gmx_cvttpr_epi32  gmx_simd_ref_cvttpr_epi32
 +#define gmx_cvtepi32_pr   gmx_simd_ref_cvtepi32_pr
 +
 +/* These two functions only need to be approximate; Newton-Raphson iteration
 + * is used for full accuracy in gmx_invsqrt_pr and gmx_inv_pr.
 + */
 +#define gmx_rsqrt_pr      gmx_simd_ref_rsqrt_pr
 +#define gmx_rcp_pr        gmx_simd_ref_rcp_pr
 +
 +/* sqrt+inv+sin+cos+acos+atan2 are used for bonded potentials, exp for PME */
 +#define GMX_SIMD_HAVE_EXP
 +#ifdef GMX_SIMD_HAVE_EXP
 +#define gmx_exp_pr        gmx_simd_ref_exp_pr
 +#endif
 +#define GMX_SIMD_HAVE_TRIGONOMETRIC
 +#ifdef GMX_SIMD_HAVE_TRIGONOMETRIC
 +#define gmx_sqrt_pr       gmx_simd_ref_sqrt_pr
 +#define gmx_sincos_pr     gmx_simd_ref_sincos_pr
 +#define gmx_acos_pr       gmx_simd_ref_acos_pr
 +#define gmx_atan2_pr      gmx_simd_ref_atan2_pr
 +#endif
 +
 +#endif /* GMX_SIMD_REFERENCE_PLAIN_C */
 +
 +
 +/* The same SIMD macros can be translated to SIMD intrinsics (and compiled
 + * into instructions) for different SIMD widths and float precisions.
 + *
 + * On x86: The gmx_ prefix is replaced by _mm_ or _mm256_ (SSE or AVX).
 + * The _pr suffix is replaced by _ps or _pd (for single or double precision).
 + * Compiler settings will decide if 128-bit intrinsics will
 + * be translated into SSE or AVX instructions.
 + */
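
To illustrate how the macros above compose into architecture-independent
code, here is a hedged sketch of y[i] += a*x[i] written only in terms of the
gmx_ names (simd_axpy is illustrative; arrays are assumed SIMD-aligned and n
a multiple of GMX_SIMD_WIDTH_HERE):

    static void simd_axpy(int n, real a, const real *x, real *y)
    {
        gmx_mm_pr a_S, x_S, y_S;
        int       i;

        a_S = gmx_set1_pr(a);                 /* broadcast the scalar a */
        for (i = 0; i < n; i += GMX_SIMD_WIDTH_HERE)
        {
            x_S = gmx_load_pr(x + i);         /* load one SIMD register */
            y_S = gmx_load_pr(y + i);
            y_S = gmx_madd_pr(a_S, x_S, y_S); /* y = a*x + y; maps to FMA where available */
            gmx_store_pr(y + i, y_S);
        }
    }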
 +
 +
 +#ifdef GMX_USE_HALF_WIDTH_SIMD_HERE
 +#if defined GMX_X86_AVX_256
 +/* We have half SIMD width support, continue */
 +#else
 +#error "half SIMD width intrinsics are not supported"
 +#endif
 +#endif
 +
 +
 +#ifdef GMX_X86_SSE2
 +/* This is for general x86 SIMD instruction sets that also support SSE2 */
 +#define GMX_HAVE_SIMD_MACROS
 +
 +/* Include the highest supported x86 SIMD intrinsics + math functions */
 +#ifdef GMX_X86_AVX_256
 +#include "gmx_x86_avx_256.h"
 +#ifdef GMX_DOUBLE
 +#include "gmx_math_x86_avx_256_double.h"
 +#else
 +#include "gmx_math_x86_avx_256_single.h"
 +#endif
 +#else
 +#ifdef GMX_X86_AVX_128_FMA
 +#include "gmx_x86_avx_128_fma.h"
 +#ifdef GMX_DOUBLE
 +#include "gmx_math_x86_avx_128_fma_double.h"
 +#else
 +#include "gmx_math_x86_avx_128_fma_single.h"
 +#endif
 +#else
 +#ifdef GMX_X86_SSE4_1
 +#include "gmx_x86_sse4_1.h"
 +#ifdef GMX_DOUBLE
 +#include "gmx_math_x86_sse4_1_double.h"
 +#else
 +#include "gmx_math_x86_sse4_1_single.h"
 +#endif
 +#else
 +#ifdef GMX_X86_SSE2
 +#include "gmx_x86_sse2.h"
 +#ifdef GMX_DOUBLE
 +#include "gmx_math_x86_sse2_double.h"
 +#else
 +#include "gmx_math_x86_sse2_single.h"
 +#endif
 +#else
 +#error No x86 acceleration defined
 +#endif
 +#endif
 +#endif
 +#endif
 +/* exp and trigonometric functions are included above */
 +#define GMX_SIMD_HAVE_EXP
 +#define GMX_SIMD_HAVE_TRIGONOMETRIC
 +
 +#if !defined GMX_X86_AVX_256 || defined GMX_USE_HALF_WIDTH_SIMD_HERE
 +
 +#ifndef GMX_DOUBLE
 +
 +#define GMX_SIMD_WIDTH_HERE  4
 +
 +#define gmx_mm_pr  __m128
 +
 +#define gmx_mm_pb  __m128
 +
 +#define gmx_epi32  __m128i
 +#define GMX_SIMD_EPI32_WIDTH  4
 +
 +#define gmx_load_pr       _mm_load_ps
 +#define gmx_load1_pr      _mm_load1_ps
 +#define gmx_set1_pr       _mm_set1_ps
 +#define gmx_setzero_pr    _mm_setzero_ps
 +#define gmx_store_pr      _mm_store_ps
 +
 +#define gmx_add_pr        _mm_add_ps
 +#define gmx_sub_pr        _mm_sub_ps
 +#define gmx_mul_pr        _mm_mul_ps
 +#ifdef GMX_X86_AVX_128_FMA
 +#define gmx_madd_pr(a, b, c)   _mm_macc_ps(a, b, c)
 +#define gmx_nmsub_pr(a, b, c)  _mm_nmacc_ps(a, b, c)
 +#else
 +#define gmx_madd_pr(a, b, c)   _mm_add_ps(c, _mm_mul_ps(a, b))
 +#define gmx_nmsub_pr(a, b, c)  _mm_sub_ps(c, _mm_mul_ps(a, b))
 +#endif
 +#define gmx_max_pr        _mm_max_ps
 +#define gmx_blendzero_pr  _mm_and_ps
 +
 +#define gmx_cmplt_pr      _mm_cmplt_ps
 +#define gmx_and_pb        _mm_and_ps
 +#define gmx_or_pb         _mm_or_ps
 +
 +#ifdef GMX_X86_SSE4_1
 +#define gmx_round_pr(x)   _mm_round_ps(x, 0x0)
 +#define GMX_SIMD_HAVE_FLOOR
 +#define gmx_floor_pr      _mm_floor_ps
 +#else
 +#define gmx_round_pr(x)   _mm_cvtepi32_ps(_mm_cvtps_epi32(x))
 +#endif
 +
 +#ifdef GMX_X86_SSE4_1
 +#define GMX_SIMD_HAVE_BLENDV
 +#define gmx_blendv_pr     _mm_blendv_ps
 +#endif
 +
 +static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
 +{
 +    /* The value -0.0 has only the sign-bit set */
 +    gmx_mm_pr sign_mask = _mm_set1_ps(-0.0);
 +    return _mm_or_ps(_mm_and_ps(a, sign_mask), b);
 +}
 +
 +static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm_add_ps(b, _mm_andnot_ps(a, c)); }
 +
 +#define GMX_SIMD_HAVE_ANYTRUE
 +#define gmx_anytrue_pb    _mm_movemask_ps
 +
- #define GMX_SIMD_HAVE_CHECKBITMASK_EPI32
- #define gmx_set1_epi32    _mm_set1_epi32
- #define gmx_load_si(i)    _mm_load_si128((__m128i *) (i))
- #define gmx_checkbitmask_epi32(m0, m1) gmx_mm_castsi128_pd(_mm_cmpeq_epi32(_mm_andnot_si128(m0, m1), _mm_setzero_si128()))
 +#define gmx_cvttpr_epi32  _mm_cvttps_epi32
 +#define gmx_cvtepi32_pr   _mm_cvtepi32_ps
 +
 +#define gmx_rsqrt_pr      _mm_rsqrt_ps
 +#define gmx_rcp_pr        _mm_rcp_ps
 +
 +#define gmx_exp_pr        gmx_mm_exp_ps
 +#define gmx_sqrt_pr       gmx_mm_sqrt_ps
 +#define gmx_sincos_pr     gmx_mm_sincos_ps
 +#define gmx_acos_pr       gmx_mm_acos_ps
 +#define gmx_atan2_pr      gmx_mm_atan2_ps
 +
 +#else /* ifndef GMX_DOUBLE */
 +
 +#define GMX_SIMD_WIDTH_HERE  2
 +
 +#define gmx_mm_pr  __m128d
 +
 +#define gmx_mm_pb  __m128d
 +
 +#define gmx_epi32  __m128i
 +#define GMX_SIMD_EPI32_WIDTH  4
 +
 +#define gmx_load_pr       _mm_load_pd
 +#define gmx_load1_pr      _mm_load1_pd
 +#define gmx_set1_pr       _mm_set1_pd
 +#define gmx_setzero_pr    _mm_setzero_pd
 +#define gmx_store_pr      _mm_store_pd
 +
 +#define gmx_add_pr        _mm_add_pd
 +#define gmx_sub_pr        _mm_sub_pd
 +#define gmx_mul_pr        _mm_mul_pd
 +#ifdef GMX_X86_AVX_128_FMA
 +#define gmx_madd_pr(a, b, c)   _mm_macc_pd(a, b, c)
 +#define gmx_nmsub_pr(a, b, c)  _mm_nmacc_pd(a, b, c)
 +#else
 +#define gmx_madd_pr(a, b, c)   _mm_add_pd(c, _mm_mul_pd(a, b))
 +#define gmx_nmsub_pr(a, b, c)  _mm_sub_pd(c, _mm_mul_pd(a, b))
 +#endif
 +#define gmx_max_pr        _mm_max_pd
 +#define gmx_blendzero_pr  _mm_and_pd
 +
 +#ifdef GMX_X86_SSE4_1
 +#define gmx_round_pr(x)   _mm_round_pd(x, 0x0)
 +#define GMX_SIMD_HAVE_FLOOR
 +#define gmx_floor_pr      _mm_floor_pd
 +#else
 +#define gmx_round_pr(x)   _mm_cvtepi32_pd(_mm_cvtpd_epi32(x))
 +/* gmx_floor_pr is not used in code for pre-SSE4_1 hardware */
 +#endif
 +
 +#ifdef GMX_X86_SSE4_1
 +#define GMX_SIMD_HAVE_BLENDV
 +#define gmx_blendv_pr     _mm_blendv_pd
 +#endif
 +
 +static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
 +{
 +    gmx_mm_pr sign_mask = _mm_set1_pd(-0.0);
 +    return _mm_or_pd(_mm_and_pd(a, sign_mask), b);
 +}
 +
 +static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm_add_pd(b, _mm_andnot_pd(a, c)); }
 +
 +#define gmx_cmplt_pr      _mm_cmplt_pd
 +
 +#define gmx_and_pb        _mm_and_pd
 +#define gmx_or_pb         _mm_or_pd
 +
 +#define GMX_SIMD_HAVE_ANYTRUE
 +#define gmx_anytrue_pb    _mm_movemask_pd
 +
- #define GMX_SIMD_HAVE_CHECKBITMASK_PR
- #define gmx_set1_epi32    _mm256_set1_epi32
- #define gmx_castsi_pr     _mm256_castsi256_ps
- /* With <= 16 bits used the cast and conversion should not be required,
-  * since only mantissa bits are set and that would give a non-zero float,
-  * but with the Intel compiler this does not work correctly.
-  */
- #define gmx_checkbitmask_pr(m0, m1) _mm256_cmp_ps(_mm256_cvtepi32_ps(_mm256_castps_si256(_mm256_and_ps(m0, m1))), _mm256_setzero_ps(), 0x0c)
 +#define gmx_cvttpr_epi32  _mm_cvttpd_epi32
 +#define gmx_cvtepi32_pr   _mm_cvtepi32_pd
 +
 +#define gmx_rsqrt_pr(r)   _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(r)))
 +#define gmx_rcp_pr(r)     _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(r)))
 +
 +#define gmx_exp_pr        gmx_mm_exp_pd
 +#define gmx_sqrt_pr       gmx_mm_sqrt_pd
 +#define gmx_sincos_pr     gmx_mm_sincos_pd
 +#define gmx_acos_pr       gmx_mm_acos_pd
 +#define gmx_atan2_pr      gmx_mm_atan2_pd
 +
 +#endif /* ifndef GMX_DOUBLE */
 +
 +#else
 +/* We have GMX_X86_AVX_256 and not GMX_USE_HALF_WIDTH_SIMD_HERE,
 + * so we use 256-bit SIMD.
 + */
 +
 +#ifndef GMX_DOUBLE
 +
 +#define GMX_SIMD_WIDTH_HERE  8
 +
 +#define gmx_mm_pr  __m256
 +
 +#define gmx_mm_pb  __m256
 +
 +#define gmx_epi32  __m256i
 +#define GMX_SIMD_EPI32_WIDTH  8
 +
 +#define gmx_load_pr       _mm256_load_ps
 +#define gmx_load1_pr(x)   _mm256_set1_ps((x)[0])
 +#define gmx_set1_pr       _mm256_set1_ps
 +#define gmx_setzero_pr    _mm256_setzero_ps
 +#define gmx_store_pr      _mm256_store_ps
 +
 +#define gmx_add_pr        _mm256_add_ps
 +#define gmx_sub_pr        _mm256_sub_ps
 +#define gmx_mul_pr        _mm256_mul_ps
 +#define gmx_madd_pr(a, b, c)   _mm256_add_ps(c, _mm256_mul_ps(a, b))
 +#define gmx_nmsub_pr(a, b, c)  _mm256_sub_ps(c, _mm256_mul_ps(a, b))
 +#define gmx_max_pr        _mm256_max_ps
 +#define gmx_blendzero_pr  _mm256_and_ps
 +
 +#define gmx_round_pr(x)   _mm256_round_ps(x, 0x0)
 +#define GMX_SIMD_HAVE_FLOOR
 +#define gmx_floor_pr      _mm256_floor_ps
 +
 +#define GMX_SIMD_HAVE_BLENDV
 +#define gmx_blendv_pr     _mm256_blendv_ps
 +
 +static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
 +{
 +    gmx_mm_pr sign_mask = _mm256_set1_ps(-0.0);
 +    return _mm256_or_ps(_mm256_and_ps(a, sign_mask), b);
 +}
 +
 +static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm256_add_ps(b, _mm256_andnot_ps(a, c)); }
 +
 +/* Less-than (we use ordered, non-signaling, but that's not required) */
 +#define gmx_cmplt_pr(x, y) _mm256_cmp_ps(x, y, 0x11)
 +#define gmx_and_pb        _mm256_and_ps
 +#define gmx_or_pb         _mm256_or_ps
 +
 +#define GMX_SIMD_HAVE_ANYTRUE
 +#define gmx_anytrue_pb    _mm256_movemask_ps
 +
- #else
 +#define gmx_cvttpr_epi32  _mm256_cvttps_epi32
 +
 +#define gmx_rsqrt_pr      _mm256_rsqrt_ps
 +#define gmx_rcp_pr        _mm256_rcp_ps
 +
 +#define gmx_exp_pr        gmx_mm256_exp_ps
 +#define gmx_sqrt_pr       gmx_mm256_sqrt_ps
 +#define gmx_sincos_pr     gmx_mm256_sincos_ps
 +#define gmx_acos_pr       gmx_mm256_acos_ps
 +#define gmx_atan2_pr      gmx_mm256_atan2_ps
 +
- #define GMX_SIMD_HAVE_CHECKBITMASK_PR
- #define gmx_set1_epi32    _mm256_set1_epi32
- #define gmx_castsi_pr     _mm256_castsi256_pd
- /* With <= 16 bits used the cast and conversion should not be required,
-  * since only mantissa bits are set and that would give a non-zero float,
-  * but with the Intel compiler this does not work correctly.
-  * Because AVX does not have int->double conversion, we convert via float.
-  */
- #define gmx_checkbitmask_pr(m0, m1) _mm256_cmp_pd(_mm256_castps_pd(_mm256_cvtepi32_ps(_mm256_castpd_si256(_mm256_and_pd(m0, m1)))), _mm256_setzero_pd(), 0x0c)
++#else /* ifndef GMX_DOUBLE */
 +
 +#define GMX_SIMD_WIDTH_HERE  4
 +
 +#define gmx_mm_pr  __m256d
 +
 +#define gmx_mm_pb  __m256d
 +
 +/* We use 128-bit integer registers because of missing 256-bit operations */
 +#define gmx_epi32  __m128i
 +#define GMX_SIMD_EPI32_WIDTH  4
 +
 +#define gmx_load_pr       _mm256_load_pd
 +#define gmx_load1_pr(x)   _mm256_set1_pd((x)[0])
 +#define gmx_set1_pr       _mm256_set1_pd
 +#define gmx_setzero_pr    _mm256_setzero_pd
 +#define gmx_store_pr      _mm256_store_pd
 +
 +#define gmx_add_pr        _mm256_add_pd
 +#define gmx_sub_pr        _mm256_sub_pd
 +#define gmx_mul_pr        _mm256_mul_pd
 +#define gmx_madd_pr(a, b, c)   _mm256_add_pd(c, _mm256_mul_pd(a, b))
 +#define gmx_nmsub_pr(a, b, c)  _mm256_sub_pd(c, _mm256_mul_pd(a, b))
 +#define gmx_max_pr        _mm256_max_pd
 +#define gmx_blendzero_pr  _mm256_and_pd
 +
 +#define gmx_round_pr(x)   _mm256_round_pd(x, 0x0)
 +#define GMX_SIMD_HAVE_FLOOR
 +#define gmx_floor_pr      _mm256_floor_pd
 +
 +#define GMX_SIMD_HAVE_BLENDV
 +#define gmx_blendv_pr     _mm256_blendv_pd
 +
 +static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
 +{
 +    gmx_mm_pr sign_mask = _mm256_set1_pd(-0.0);
 +    return _mm256_or_pd(_mm256_and_pd(a, sign_mask), b);
 +}
 +
 +static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm256_add_pd(b, _mm256_andnot_pd(a, c)); }
 +
 +/* Less-than (we use ordered, non-signaling, but that's not required) */
 +#define gmx_cmplt_pr(x, y) _mm256_cmp_pd(x, y, 0x11)
 +
 +#define gmx_and_pb        _mm256_and_pd
 +#define gmx_or_pb         _mm256_or_pd
 +
 +#define GMX_SIMD_HAVE_ANYTRUE
 +#define gmx_anytrue_pb    _mm256_movemask_pd
 +
- #endif /* GMX_DOUBLE */
 +#define gmx_cvttpr_epi32  _mm256_cvttpd_epi32
 +
 +#define gmx_rsqrt_pr(r)   _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(r)))
 +#define gmx_rcp_pr(r)     _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(r)))
 +
 +#define gmx_exp_pr        gmx_mm256_exp_pd
 +#define gmx_sqrt_pr       gmx_mm256_sqrt_pd
 +#define gmx_sincos_pr     gmx_mm256_sincos_pd
 +#define gmx_acos_pr       gmx_mm256_acos_pd
 +#define gmx_atan2_pr      gmx_mm256_atan2_pd
 +
++#endif /* ifndef GMX_DOUBLE */
 +
 +#endif /* 128- or 256-bit x86 SIMD */
 +
 +#endif /* GMX_X86_SSE2 */
 +
 +
 +#ifdef GMX_HAVE_SIMD_MACROS
 +/* Generic functions to extract a SIMD aligned pointer from a pointer x.
 + * x should have at least GMX_SIMD_WIDTH_HERE elements extra compared
 + * to how many you want to use, to avoid indexing outside the aligned region.
 + */
 +
 +static gmx_inline real *
 +gmx_simd_align_real(const real *x)
 +{
 +    return (real *)(((size_t)((x)+GMX_SIMD_WIDTH_HERE)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(real)-1))));
 +}
 +
 +static gmx_inline int *
 +gmx_simd_align_int(const int *x)
 +{
 +    return (int  *)(((size_t)((x)+GMX_SIMD_WIDTH_HERE)) & (~((size_t)(GMX_SIMD_WIDTH_HERE*sizeof(int )-1))));
 +}
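
These functions round x plus one SIMD width down to the previous
SIMD-register-sized byte boundary, which is why the extra elements are
needed. A usage sketch (snew/sfree from smalloc.h assumed available; raw and
x_al are illustrative names):

    /* Over-allocate by the SIMD width, then align */
    real *raw, *x_al;

    snew(raw, n + GMX_SIMD_WIDTH_HERE); /* slack for alignment */
    x_al = gmx_simd_align_real(raw);
    /* ... use x_al[0..n-1] with gmx_load_pr/gmx_store_pr ... */
    sfree(raw);                         /* free the original pointer, not the aligned one */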
 +
 +
 +/* Include the math functions which only need the above macros,
 + * generally these are the ones that don't need masking operations.
 + */
 +#ifdef GMX_DOUBLE
 +#include "gmx_simd_math_double.h"
 +#else
 +#include "gmx_simd_math_single.h"
 +#endif
 +
 +#endif /* GMX_HAVE_SIMD_MACROS */
 +
 +#endif /* _gmx_simd_macros_h_ */
index 1e9324a55ab465684963692fa6f33c7c272aefc8,0000000000000000000000000000000000000000..e2329ddc150d8927886329e2ce2064bd38670dd9
mode 100644,000000..100644
--- /dev/null
@@@ -1,9724 -1,0 +1,9724 @@@
-                                 10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol);
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + * This file is part of Gromacs        Copyright (c) 1991-2008
 + * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org
 + *
 + * And Hey:
 + * Gnomes, ROck Monsters And Chili Sauce
 + */
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <stdio.h>
 +#include <time.h>
 +#include <math.h>
 +#include <string.h>
 +#include <stdlib.h>
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "gmx_fatal.h"
 +#include "gmx_fatal_collective.h"
 +#include "vec.h"
 +#include "domdec.h"
 +#include "domdec_network.h"
 +#include "nrnb.h"
 +#include "pbc.h"
 +#include "chargegroup.h"
 +#include "constr.h"
 +#include "mdatoms.h"
 +#include "names.h"
 +#include "pdbio.h"
 +#include "futil.h"
 +#include "force.h"
 +#include "pme.h"
 +#include "pull.h"
 +#include "pull_rotation.h"
 +#include "gmx_wallcycle.h"
 +#include "mdrun.h"
 +#include "nsgrid.h"
 +#include "shellfc.h"
 +#include "mtop_util.h"
 +#include "gmxfio.h"
 +#include "gmx_ga2la.h"
 +#include "gmx_sort.h"
 +#include "macros.h"
 +#include "nbnxn_search.h"
 +#include "bondf.h"
 +#include "gmx_omp_nthreads.h"
 +
 +#include "gromacs/utility/gmxmpi.h"
 +
 +#define DDRANK(dd, rank)    (rank)
 +#define DDMASTERRANK(dd)   (dd->masterrank)
 +
 +typedef struct gmx_domdec_master
 +{
 +    /* The cell boundaries */
 +    real **cell_x;
 +    /* The global charge group division */
 +    int   *ncg;    /* Number of home charge groups for each node */
 +    int   *index;  /* Index of nnodes+1 into cg */
 +    int   *cg;     /* Global charge group index */
 +    int   *nat;    /* Number of home atoms for each node. */
 +    int   *ibuf;   /* Buffer for communication */
 +    rvec  *vbuf;   /* Buffer for state scattering and gathering */
 +} gmx_domdec_master_t;
 +
 +typedef struct
 +{
 +    /* The numbers of charge groups to send and receive for each cell
 +     * that requires communication; the last entry contains the total
 +     * number of atoms that need to be communicated.
 +     */
 +    int  nsend[DD_MAXIZONE+2];
 +    int  nrecv[DD_MAXIZONE+2];
 +    /* The charge groups to send */
 +    int *index;
 +    int  nalloc;
 +    /* The atom range for non-in-place communication */
 +    int  cell2at0[DD_MAXIZONE];
 +    int  cell2at1[DD_MAXIZONE];
 +} gmx_domdec_ind_t;
 +
 +typedef struct
 +{
 +    int               np;       /* Number of grid pulses in this dimension */
 +    int               np_dlb;   /* For dlb, for use with edlbAUTO          */
 +    gmx_domdec_ind_t *ind;      /* The indices to communicate, size np     */
 +    int               np_nalloc;
 +    gmx_bool          bInPlace; /* Can we communicate in place?            */
 +} gmx_domdec_comm_dim_t;
 +
 +typedef struct
 +{
 +    gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
 +    real     *cell_f;      /* State var.: cell boundaries, box relative      */
 +    real     *old_cell_f;  /* Temp. var.: old cell size                      */
 +    real     *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
 +    real     *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
 +    real     *bound_min;   /* Temp. var.: lower limit for cell boundary      */
 +    real     *bound_max;   /* Temp. var.: upper limit for cell boundary      */
 +    gmx_bool  bLimited;    /* State var.: is DLB limited in this dim and row */
 +    real     *buf_ncd;     /* Temp. var.                                     */
 +} gmx_domdec_root_t;
 +
 +#define DD_NLOAD_MAX 9
 +
 +/* Here floats are accurate enough, since these variables
 + * only influence the load balancing, not the actual MD results.
 + */
 +typedef struct
 +{
 +    int    nload;
 +    float *load;
 +    float  sum;
 +    float  max;
 +    float  sum_m;
 +    float  cvol_min;
 +    float  mdf;
 +    float  pme;
 +    int    flags;
 +} gmx_domdec_load_t;
 +
 +typedef struct
 +{
 +    int  nsc;
 +    int  ind_gl;
 +    int  ind;
 +} gmx_cgsort_t;
 +
 +typedef struct
 +{
 +    gmx_cgsort_t *sort;
 +    gmx_cgsort_t *sort2;
 +    int           sort_nalloc;
 +    gmx_cgsort_t *sort_new;
 +    int           sort_new_nalloc;
 +    int          *ibuf;
 +    int           ibuf_nalloc;
 +} gmx_domdec_sort_t;
 +
 +typedef struct
 +{
 +    rvec *v;
 +    int   nalloc;
 +} vec_rvec_t;
 +
 +/* This enum determines the order of the coordinates.
 + * ddnatHOME and ddnatZONE should be first and second;
 + * the others can be ordered as wanted.
 + */
 +enum {
 +    ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR
 +};
 +
 +enum {
 +    edlbAUTO, edlbNO, edlbYES, edlbNR
 +};
 +const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
 +
 +typedef struct
 +{
 +    int      dim;       /* The dimension                                          */
 +    gmx_bool dim_match; /* Tells if DD and PME dims match                         */
 +    int      nslab;     /* The number of PME slabs in this dimension              */
 +    real    *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
 +    int     *pp_min;    /* The minimum pp node location, size nslab               */
 +    int     *pp_max;    /* The maximum pp node location, size nslab              */
 +    int      maxshift;  /* The maximum shift for coordinate redistribution in PME */
 +} gmx_ddpme_t;
 +
 +typedef struct
 +{
 +    real min0;    /* The minimum bottom of this zone                        */
 +    real max1;    /* The maximum top of this zone                           */
 +    real min1;    /* The minimum top of this zone                           */
 +    real mch0;    /* The maximum bottom communication height for this zone  */
 +    real mch1;    /* The maximum top communication height for this zone     */
 +    real p1_0;    /* The bottom value of the first cell in this zone        */
 +    real p1_1;    /* The top value of the first cell in this zone           */
 +} gmx_ddzone_t;
 +
 +typedef struct
 +{
 +    gmx_domdec_ind_t ind;
 +    int             *ibuf;
 +    int              ibuf_nalloc;
 +    vec_rvec_t       vbuf;
 +    int              nsend;
 +    int              nat;
 +    int              nsend_zone;
 +} dd_comm_setup_work_t;
 +
 +typedef struct gmx_domdec_comm
 +{
 +    /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
 +     * unless stated otherwise.
 +     */
 +
 +    /* The number of decomposition dimensions for PME, 0: no PME */
 +    int         npmedecompdim;
 +    /* The number of nodes doing PME (PP/PME or only PME) */
 +    int         npmenodes;
 +    int         npmenodes_x;
 +    int         npmenodes_y;
 +    /* The communication setup including the PME only nodes */
 +    gmx_bool    bCartesianPP_PME;
 +    ivec        ntot;
 +    int         cartpmedim;
 +    int        *pmenodes;          /* size npmenodes                         */
 +    int        *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
 +                                    * but without bCartesianPP_PME           */
 +    gmx_ddpme_t ddpme[2];
 +
 +    /* The DD particle-particle nodes only */
 +    gmx_bool bCartesianPP;
 +    int     *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
 +
 +    /* The global charge groups */
 +    t_block cgs_gl;
 +
 +    /* Should we sort the cgs */
 +    int                nstSortCG;
 +    gmx_domdec_sort_t *sort;
 +
 +    /* Are there charge groups? */
 +    gmx_bool bCGs;
 +
 +    /* Are there bonded and multi-body interactions between charge groups? */
 +    gmx_bool bInterCGBondeds;
 +    gmx_bool bInterCGMultiBody;
 +
 +    /* Data for the optional bonded interaction atom communication range */
 +    gmx_bool  bBondComm;
 +    t_blocka *cglink;
 +    char     *bLocalCG;
 +
 +    /* The DLB option */
 +    int      eDLB;
 +    /* Are we actually using DLB? */
 +    gmx_bool bDynLoadBal;
 +
 +    /* Cell sizes for static load balancing, first index cartesian */
 +    real **slb_frac;
 +
 +    /* The width of the communicated boundaries */
 +    real     cutoff_mbody;
 +    real     cutoff;
 +    /* The minimum cell size (including triclinic correction) */
 +    rvec     cellsize_min;
 +    /* For dlb, for use with edlbAUTO */
 +    rvec     cellsize_min_dlb;
 +    /* The lower limit for the DD cell size with DLB */
 +    real     cellsize_limit;
 +    /* Effectively no NB cut-off limit with DLB for systems without PBC? */
 +    gmx_bool bVacDLBNoLimit;
 +
 +    /* With PME load balancing we set limits on DLB */
 +    gmx_bool bPMELoadBalDLBLimits;
 +    /* DLB needs to take into account that we want to allow this maximum
 +     * cut-off (for PME load balancing); this could limit cell boundaries.
 +     */
 +    real PMELoadBal_max_cutoff;
 +
 +    /* tric_dir is only stored here because dd_get_ns_ranges needs it */
 +    ivec tric_dir;
 +    /* box0 and box_size are required with dims without pbc and -gcom */
 +    rvec box0;
 +    rvec box_size;
 +
 +    /* The cell boundaries */
 +    rvec cell_x0;
 +    rvec cell_x1;
 +
 +    /* The old location of the cell boundaries, to check cg displacements */
 +    rvec old_cell_x0;
 +    rvec old_cell_x1;
 +
 +    /* The communication setup and charge group boundaries for the zones */
 +    gmx_domdec_zones_t zones;
 +
 +    /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
 +     * cell boundaries of neighboring cells for dynamic load balancing.
 +     */
 +    gmx_ddzone_t zone_d1[2];
 +    gmx_ddzone_t zone_d2[2][2];
 +
 +    /* The coordinate/force communication setup and indices */
 +    gmx_domdec_comm_dim_t cd[DIM];
 +    /* The maximum number of cells to communicate with in one dimension */
 +    int                   maxpulse;
 +
 +    /* Which cg distribution is stored on the master node */
 +    int master_cg_ddp_count;
 +
 +    /* The number of cg's received from the direct neighbors */
 +    int  zone_ncg1[DD_MAXZONE];
 +
 +    /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
 +    int  nat[ddnatNR];
 +
 +    /* Array for signalling if atoms have moved to another domain */
 +    int  *moved;
 +    int   moved_nalloc;
 +
 +    /* Communication buffer for general use */
 +    int  *buf_int;
 +    int   nalloc_int;
 +
 +    /* Communication buffer for general use */
 +    vec_rvec_t vbuf;
 +
 +    /* Temporary storage for thread parallel communication setup */
 +    int                   nth;
 +    dd_comm_setup_work_t *dth;
 +
 +    /* Communication buffers only used with multiple grid pulses */
 +    int       *buf_int2;
 +    int        nalloc_int2;
 +    vec_rvec_t vbuf2;
 +
 +    /* Communication buffers for local redistribution */
 +    int  **cggl_flag;
 +    int    cggl_flag_nalloc[DIM*2];
 +    rvec **cgcm_state;
 +    int    cgcm_state_nalloc[DIM*2];
 +
 +    /* Cell sizes for dynamic load balancing */
 +    gmx_domdec_root_t **root;
 +    real               *cell_f_row;
 +    real                cell_f0[DIM];
 +    real                cell_f1[DIM];
 +    real                cell_f_max0[DIM];
 +    real                cell_f_min1[DIM];
 +
 +    /* Stuff for load communication */
 +    gmx_bool           bRecordLoad;
 +    gmx_domdec_load_t *load;
 +#ifdef GMX_MPI
 +    MPI_Comm          *mpi_comm_load;
 +#endif
 +
 +    /* Maximum DLB scaling per load balancing step in percent */
 +    int dlb_scale_lim;
 +
 +    /* Cycle counters */
 +    float  cycl[ddCyclNr];
 +    int    cycl_n[ddCyclNr];
 +    float  cycl_max[ddCyclNr];
 +    /* Flop counter (0=no, 1=yes, 2=with (eFlop-1)*5% noise) */
 +    int    eFlop;
 +    double flop;
 +    int    flop_n;
 +    /* How often we have had load measurements */
 +    int    n_load_have;
 +    /* How often we have collected the load measurements */
 +    int    n_load_collect;
 +
 +    /* Statistics */
 +    double sum_nat[ddnatNR-ddnatZONE];
 +    int    ndecomp;
 +    int    nload;
 +    double load_step;
 +    double load_sum;
 +    double load_max;
 +    ivec   load_lim;
 +    double load_mdf;
 +    double load_pme;
 +
 +    /* The last partition step */
 +    gmx_large_int_t partition_step;
 +
 +    /* Debugging */
 +    int  nstDDDump;
 +    int  nstDDDumpGrid;
 +    int  DD_debug;
 +} gmx_domdec_comm_t;
 +
 +/* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
 +#define DD_CGIBS 2
 +
 +/* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 +#define DD_FLAG_NRCG  65535
 +#define DD_FLAG_FW(d) (1<<(16+(d)*2))
 +#define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
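
The low 16 bits of each flag word hold the charge-group atom count (masked
with DD_FLAG_NRCG); the bits above that encode per-dimension forward and
backward moves. A hypothetical decode of one word (flag is an illustrative
variable, not from the code above):

    int nrcg = flag & DD_FLAG_NRCG;  /* number of atoms in the charge group */
    int d;

    for (d = 0; d < DIM; d++)
    {
        if (flag & DD_FLAG_FW(d))
        {
            /* charge group moves forward along DD dimension d */
        }
        else if (flag & DD_FLAG_BW(d))
        {
            /* charge group moves backward along DD dimension d */
        }
    }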
 +
 +/* Zone permutation required to obtain consecutive charge groups
 + * for neighbor searching.
 + */
 +static const int zone_perm[3][4] = { {0, 0, 0, 0}, {1, 0, 0, 0}, {3, 0, 1, 2} };
 +
 +/* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
 + * components see only j zones with that component 0.
 + */
 +
 +/* The DD zone order */
 +static const ivec dd_zo[DD_MAXZONE] =
 +{{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
 +
 +/* The 3D setup */
 +#define dd_z3n  8
 +#define dd_zp3n 4
 +static const ivec dd_zp3[dd_zp3n] = {{0, 0, 8}, {1, 3, 6}, {2, 5, 6}, {3, 5, 7}};
 +
 +/* The 2D setup */
 +#define dd_z2n  4
 +#define dd_zp2n 2
 +static const ivec dd_zp2[dd_zp2n] = {{0, 0, 4}, {1, 3, 4}};
 +
 +/* The 1D setup */
 +#define dd_z1n  2
 +#define dd_zp1n 1
 +static const ivec dd_zp1[dd_zp1n] = {{0, 0, 2}};
 +
 +/* Factors used to avoid problems due to rounding issues */
 +#define DD_CELL_MARGIN       1.0001
 +#define DD_CELL_MARGIN2      1.00005
 +/* Factor to account for pressure scaling during nstlist steps */
 +#define DD_PRES_SCALE_MARGIN 1.02
 +
 +/* Allowed performance loss before we DLB or warn */
 +#define DD_PERF_LOSS 0.05
 +
 +#define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
 +
 +/* Use separate MPI send and receive commands
 + * when nnodes <= GMX_DD_NNODES_SENDRECV.
 + * This saves memory (and some copying for small nnodes).
 + * For high parallelization scatter and gather calls are used.
 + */
 +#define GMX_DD_NNODES_SENDRECV 4
 +
 +
 +/*
 +   #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 +
 +   static void index2xyz(ivec nc,int ind,ivec xyz)
 +   {
 +   xyz[XX] = ind % nc[XX];
 +   xyz[YY] = (ind / nc[XX]) % nc[YY];
 +   xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 +   }
 + */
 +
 +/* This order is required to minimize the coordinate communication in PME
 + * which uses decomposition in the x direction.
 + */
 +#define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
 +
 +static void ddindex2xyz(ivec nc, int ind, ivec xyz)
 +{
 +    xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 +    xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 +    xyz[ZZ] = ind % nc[ZZ];
 +}
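
dd_index and ddindex2xyz are inverses. For example, with nc = {2, 3, 4} the
cell (1, 2, 3) maps to ((1*3 + 2)*4) + 3 = 23, and ddindex2xyz recovers
(1, 2, 3) from 23. A small hypothetical self-check (assert from <assert.h>):

    ivec nc = {2, 3, 4};
    ivec c  = {1, 2, 3};
    ivec back;
    int  ind = dd_index(nc, c);  /* ((1*3 + 2)*4) + 3 = 23 */

    ddindex2xyz(nc, ind, back);  /* back = {1, 2, 3} */
    assert(back[XX] == c[XX] && back[YY] == c[YY] && back[ZZ] == c[ZZ]);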
 +
 +static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
 +{
 +    int ddindex;
 +    int ddnodeid = -1;
 +
 +    ddindex = dd_index(dd->nc, c);
 +    if (dd->comm->bCartesianPP_PME)
 +    {
 +        ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 +    }
 +    else if (dd->comm->bCartesianPP)
 +    {
 +#ifdef GMX_MPI
 +        MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
 +#endif
 +    }
 +    else
 +    {
 +        ddnodeid = ddindex;
 +    }
 +
 +    return ddnodeid;
 +}
 +
 +static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox, t_inputrec *ir)
 +{
 +    return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
 +}
 +
 +int ddglatnr(gmx_domdec_t *dd, int i)
 +{
 +    int atnr;
 +
 +    if (dd == NULL)
 +    {
 +        atnr = i + 1;
 +    }
 +    else
 +    {
 +        if (i >= dd->comm->nat[ddnatNR-1])
 +        {
 +            gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]);
 +        }
 +        atnr = dd->gatindex[i] + 1;
 +    }
 +
 +    return atnr;
 +}
 +
 +t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 +{
 +    return &dd->comm->cgs_gl;
 +}
 +
 +static void vec_rvec_init(vec_rvec_t *v)
 +{
 +    v->nalloc = 0;
 +    v->v      = NULL;
 +}
 +
 +static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
 +{
 +    if (n > v->nalloc)
 +    {
 +        v->nalloc = over_alloc_dd(n);
 +        srenew(v->v, v->nalloc);
 +    }
 +}
 +
 +void dd_store_state(gmx_domdec_t *dd, t_state *state)
 +{
 +    int i;
 +
 +    if (state->ddp_count != dd->ddp_count)
 +    {
 +        gmx_incons("The state does not the domain decomposition state");
 +    }
 +
 +    state->ncg_gl = dd->ncg_home;
 +    if (state->ncg_gl > state->cg_gl_nalloc)
 +    {
 +        state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
 +        srenew(state->cg_gl, state->cg_gl_nalloc);
 +    }
 +    for (i = 0; i < state->ncg_gl; i++)
 +    {
 +        state->cg_gl[i] = dd->index_gl[i];
 +    }
 +
 +    state->ddp_count_cg_gl = dd->ddp_count;
 +}
 +
 +gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 +{
 +    return &dd->comm->zones;
 +}
 +
 +void dd_get_ns_ranges(gmx_domdec_t *dd, int icg,
 +                      int *jcg0, int *jcg1, ivec shift0, ivec shift1)
 +{
 +    gmx_domdec_zones_t *zones;
 +    int                 izone, d, dim;
 +
 +    zones = &dd->comm->zones;
 +
 +    izone = 0;
 +    while (icg >= zones->izone[izone].cg1)
 +    {
 +        izone++;
 +    }
 +
 +    if (izone == 0)
 +    {
 +        *jcg0 = icg;
 +    }
 +    else if (izone < zones->nizone)
 +    {
 +        *jcg0 = zones->izone[izone].jcg0;
 +    }
 +    else
 +    {
 +        gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
 +                  icg, izone, zones->nizone);
 +    }
 +
 +    *jcg1 = zones->izone[izone].jcg1;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim         = dd->dim[d];
 +        shift0[dim] = zones->izone[izone].shift0[dim];
 +        shift1[dim] = zones->izone[izone].shift1[dim];
 +        if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
 +        {
 +            /* A conservative approach, this can be optimized */
 +            shift0[dim] -= 1;
 +            shift1[dim] += 1;
 +        }
 +    }
 +}
 +
 +int dd_natoms_vsite(gmx_domdec_t *dd)
 +{
 +    return dd->comm->nat[ddnatVSITE];
 +}
 +
 +void dd_get_constraint_range(gmx_domdec_t *dd, int *at_start, int *at_end)
 +{
 +    *at_start = dd->comm->nat[ddnatCON-1];
 +    *at_end   = dd->comm->nat[ddnatCON];
 +}
 +
 +void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[])
 +{
 +    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 +    int                   *index, *cgindex;
 +    gmx_domdec_comm_t     *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t      *ind;
 +    rvec                   shift = {0, 0, 0}, *buf, *rbuf;
 +    gmx_bool               bPBC, bScrew;
 +
 +    comm = dd->comm;
 +
 +    cgindex = dd->cgindex;
 +
 +    buf = comm->vbuf.v;
 +
 +    nzone   = 1;
 +    nat_tot = dd->nat_home;
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        bPBC   = (dd->ci[dd->dim[d]] == 0);
 +        bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 +        if (bPBC)
 +        {
 +            copy_rvec(box[dd->dim[d]], shift);
 +        }
 +        cd = &comm->cd[d];
 +        for (p = 0; p < cd->np; p++)
 +        {
 +            ind   = &cd->ind[p];
 +            index = ind->index;
 +            n     = 0;
 +            if (!bPBC)
 +            {
 +                for (i = 0; i < ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for (j = at0; j < at1; j++)
 +                    {
 +                        copy_rvec(x[j], buf[n]);
 +                        n++;
 +                    }
 +                }
 +            }
 +            else if (!bScrew)
 +            {
 +                for (i = 0; i < ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for (j = at0; j < at1; j++)
 +                    {
 +                        /* We need to shift the coordinates */
 +                        rvec_add(x[j], shift, buf[n]);
 +                        n++;
 +                    }
 +                }
 +            }
 +            else
 +            {
 +                for (i = 0; i < ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for (j = at0; j < at1; j++)
 +                    {
 +                        /* Shift x */
 +                        buf[n][XX] = x[j][XX] + shift[XX];
 +                        /* Rotate y and z.
 +                         * This operation requires a special shift force
 +                         * treatment, which is performed in calc_vir.
 +                         */
 +                        buf[n][YY] = box[YY][YY] - x[j][YY];
 +                        buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 +                        n++;
 +                    }
 +                }
 +            }
 +
 +            if (cd->bInPlace)
 +            {
 +                rbuf = x + nat_tot;
 +            }
 +            else
 +            {
 +                rbuf = comm->vbuf2.v;
 +            }
 +            /* Send and receive the coordinates */
 +            dd_sendrecv_rvec(dd, d, dddirBackward,
 +                             buf,  ind->nsend[nzone+1],
 +                             rbuf, ind->nrecv[nzone+1]);
 +            if (!cd->bInPlace)
 +            {
 +                j = 0;
 +                for (zone = 0; zone < nzone; zone++)
 +                {
 +                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 +                    {
 +                        copy_rvec(rbuf[j], x[i]);
 +                        j++;
 +                    }
 +                }
 +            }
 +            nat_tot += ind->nrecv[nzone+1];
 +        }
 +        nzone += nzone;
 +    }
 +}
 +
 +void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
 +{
 +    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 +    int                   *index, *cgindex;
 +    gmx_domdec_comm_t     *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t      *ind;
 +    rvec                  *buf, *sbuf;
 +    ivec                   vis;
 +    int                    is;
 +    gmx_bool               bPBC, bScrew;
 +
 +    comm = dd->comm;
 +
 +    cgindex = dd->cgindex;
 +
 +    buf = comm->vbuf.v;
 +
 +    n       = 0;
 +    nzone   = comm->zones.n/2;
 +    nat_tot = dd->nat_tot;
 +    for (d = dd->ndim-1; d >= 0; d--)
 +    {
 +        bPBC   = (dd->ci[dd->dim[d]] == 0);
 +        bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 +        if (fshift == NULL && !bScrew)
 +        {
 +            bPBC = FALSE;
 +        }
 +        /* Determine which shift vector we need */
 +        clear_ivec(vis);
 +        vis[dd->dim[d]] = 1;
 +        is              = IVEC2IS(vis);
 +
 +        cd = &comm->cd[d];
 +        for (p = cd->np-1; p >= 0; p--)
 +        {
 +            ind      = &cd->ind[p];
 +            nat_tot -= ind->nrecv[nzone+1];
 +            if (cd->bInPlace)
 +            {
 +                sbuf = f + nat_tot;
 +            }
 +            else
 +            {
 +                sbuf = comm->vbuf2.v;
 +                j    = 0;
 +                for (zone = 0; zone < nzone; zone++)
 +                {
 +                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 +                    {
 +                        copy_rvec(f[i], sbuf[j]);
 +                        j++;
 +                    }
 +                }
 +            }
 +            /* Communicate the forces */
 +            dd_sendrecv_rvec(dd, d, dddirForward,
 +                             sbuf, ind->nrecv[nzone+1],
 +                             buf,  ind->nsend[nzone+1]);
 +            index = ind->index;
 +            /* Add the received forces */
 +            n = 0;
 +            if (!bPBC)
 +            {
 +                for (i = 0; i < ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for (j = at0; j < at1; j++)
 +                    {
 +                        rvec_inc(f[j], buf[n]);
 +                        n++;
 +                    }
 +                }
 +            }
 +            else if (!bScrew)
 +            {
 +                for (i = 0; i < ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for (j = at0; j < at1; j++)
 +                    {
 +                        rvec_inc(f[j], buf[n]);
 +                        /* Add this force to the shift force */
 +                        rvec_inc(fshift[is], buf[n]);
 +                        n++;
 +                    }
 +                }
 +            }
 +            else
 +            {
 +                for (i = 0; i < ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for (j = at0; j < at1; j++)
 +                    {
 +                        /* Rotate the force */
 +                        f[j][XX] += buf[n][XX];
 +                        f[j][YY] -= buf[n][YY];
 +                        f[j][ZZ] -= buf[n][ZZ];
 +                        if (fshift)
 +                        {
 +                            /* Add this force to the shift force */
 +                            rvec_inc(fshift[is], buf[n]);
 +                        }
 +                        n++;
 +                    }
 +                }
 +            }
 +        }
 +        nzone /= 2;
 +    }
 +}
 +
 +void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
 +{
 +    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 +    int                   *index, *cgindex;
 +    gmx_domdec_comm_t     *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t      *ind;
 +    real                  *buf, *rbuf;
 +
 +    comm = dd->comm;
 +
 +    cgindex = dd->cgindex;
 +
 +    buf = &comm->vbuf.v[0][0];
 +
 +    nzone   = 1;
 +    nat_tot = dd->nat_home;
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        cd = &comm->cd[d];
 +        for (p = 0; p < cd->np; p++)
 +        {
 +            ind   = &cd->ind[p];
 +            index = ind->index;
 +            n     = 0;
 +            for (i = 0; i < ind->nsend[nzone]; i++)
 +            {
 +                at0 = cgindex[index[i]];
 +                at1 = cgindex[index[i]+1];
 +                for (j = at0; j < at1; j++)
 +                {
 +                    buf[n] = v[j];
 +                    n++;
 +                }
 +            }
 +
 +            if (cd->bInPlace)
 +            {
 +                rbuf = v + nat_tot;
 +            }
 +            else
 +            {
 +                rbuf = &comm->vbuf2.v[0][0];
 +            }
 +            /* Send and receive the coordinates */
 +            dd_sendrecv_real(dd, d, dddirBackward,
 +                             buf,  ind->nsend[nzone+1],
 +                             rbuf, ind->nrecv[nzone+1]);
 +            if (!cd->bInPlace)
 +            {
 +                j = 0;
 +                for (zone = 0; zone < nzone; zone++)
 +                {
 +                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 +                    {
 +                        v[i] = rbuf[j];
 +                        j++;
 +                    }
 +                }
 +            }
 +            nat_tot += ind->nrecv[nzone+1];
 +        }
 +        nzone += nzone;
 +    }
 +}
 +
 +void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
 +{
 +    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 +    int                   *index, *cgindex;
 +    gmx_domdec_comm_t     *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t      *ind;
 +    real                  *buf, *sbuf;
 +
 +    comm = dd->comm;
 +
 +    cgindex = dd->cgindex;
 +
 +    buf = &comm->vbuf.v[0][0];
 +
 +    n       = 0;
 +    nzone   = comm->zones.n/2;
 +    nat_tot = dd->nat_tot;
 +    for (d = dd->ndim-1; d >= 0; d--)
 +    {
 +        cd = &comm->cd[d];
 +        for (p = cd->np-1; p >= 0; p--)
 +        {
 +            ind      = &cd->ind[p];
 +            nat_tot -= ind->nrecv[nzone+1];
 +            if (cd->bInPlace)
 +            {
 +                sbuf = v + nat_tot;
 +            }
 +            else
 +            {
 +                sbuf = &comm->vbuf2.v[0][0];
 +                j    = 0;
 +                for (zone = 0; zone < nzone; zone++)
 +                {
 +                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 +                    {
 +                        sbuf[j] = v[i];
 +                        j++;
 +                    }
 +                }
 +            }
 +            /* Communicate the forces */
 +            dd_sendrecv_real(dd, d, dddirForward,
 +                             sbuf, ind->nrecv[nzone+1],
 +                             buf,  ind->nsend[nzone+1]);
 +            index = ind->index;
 +            /* Add the received forces */
 +            n = 0;
 +            for (i = 0; i < ind->nsend[nzone]; i++)
 +            {
 +                at0 = cgindex[index[i]];
 +                at1 = cgindex[index[i]+1];
 +                for (j = at0; j < at1; j++)
 +                {
 +                    v[j] += buf[n];
 +                    n++;
 +                }
 +            }
 +        }
 +        nzone /= 2;
 +    }
 +}
 +
 +static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
 +{
 +    fprintf(fp, "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
 +            d, i, j,
 +            zone->min0, zone->max1,
 +            zone->mch0, zone->mch1,
 +            zone->p1_0, zone->p1_1);
 +}
 +
 +
 +#define DDZONECOMM_MAXZONE  5
 +#define DDZONECOMM_BUFSIZE  3
 +
 +static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
 +                               int ddimind, int direction,
 +                               gmx_ddzone_t *buf_s, int n_s,
 +                               gmx_ddzone_t *buf_r, int n_r)
 +{
 +#define ZBS  DDZONECOMM_BUFSIZE
 +    rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
 +    rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
 +    int  i;
 +
 +    for (i = 0; i < n_s; i++)
 +    {
 +        vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
 +        vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
 +        vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
 +        vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
 +        vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
 +        vbuf_s[i*ZBS+1][2] = 0;
 +        vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
 +        vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
 +        vbuf_s[i*ZBS+2][2] = 0;
 +    }
 +
 +    dd_sendrecv_rvec(dd, ddimind, direction,
 +                     vbuf_s, n_s*ZBS,
 +                     vbuf_r, n_r*ZBS);
 +
 +    for (i = 0; i < n_r; i++)
 +    {
 +        buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
 +        buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
 +        buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
 +        buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
 +        buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
 +        buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
 +        buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
 +    }
 +
 +#undef ZBS
 +}
 +
 +static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
 +                          rvec cell_ns_x0, rvec cell_ns_x1)
 +{
 +    int                d, d1, dim, pos, buf_size, i, j, p, npulse, npulse_min;
 +    gmx_ddzone_t      *zp;
 +    gmx_ddzone_t       buf_s[DDZONECOMM_MAXZONE];
 +    gmx_ddzone_t       buf_r[DDZONECOMM_MAXZONE];
 +    gmx_ddzone_t       buf_e[DDZONECOMM_MAXZONE];
 +    rvec               extr_s[2], extr_r[2];
 +    rvec               dh;
 +    real               dist_d, c = 0, det;
 +    gmx_domdec_comm_t *comm;
 +    gmx_bool           bPBC, bUse;
 +
 +    comm = dd->comm;
 +
 +    for (d = 1; d < dd->ndim; d++)
 +    {
 +        dim      = dd->dim[d];
 +        zp       = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
 +        zp->min0 = cell_ns_x0[dim];
 +        zp->max1 = cell_ns_x1[dim];
 +        zp->min1 = cell_ns_x1[dim];
 +        zp->mch0 = cell_ns_x0[dim];
 +        zp->mch1 = cell_ns_x1[dim];
 +        zp->p1_0 = cell_ns_x0[dim];
 +        zp->p1_1 = cell_ns_x1[dim];
 +    }
 +
 +    for (d = dd->ndim-2; d >= 0; d--)
 +    {
 +        dim  = dd->dim[d];
 +        bPBC = (dim < ddbox->npbcdim);
 +
 +        /* Use an rvec to store three reals: the cell fraction extremes */
 +        extr_s[d][0] = comm->cell_f0[d+1];
 +        extr_s[d][1] = comm->cell_f1[d+1];
 +        extr_s[d][2] = comm->cell_f1[d+1];
 +
 +        pos = 0;
 +        /* Store the extremes in the backward sending buffer,
 +         * so they get updated separately from the forward communication.
 +         */
 +        for (d1 = d; d1 < dd->ndim-1; d1++)
 +        {
 +            /* We invert the order to be able to use the same loop for buf_e */
 +            buf_s[pos].min0 = extr_s[d1][1];
 +            buf_s[pos].max1 = extr_s[d1][0];
 +            buf_s[pos].min1 = extr_s[d1][2];
 +            buf_s[pos].mch0 = 0;
 +            buf_s[pos].mch1 = 0;
 +            /* Store the cell corner of the dimension we communicate along */
 +            buf_s[pos].p1_0 = comm->cell_x0[dim];
 +            buf_s[pos].p1_1 = 0;
 +            pos++;
 +        }
 +
 +        buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
 +        pos++;
 +
 +        if (dd->ndim == 3 && d == 0)
 +        {
 +            buf_s[pos] = comm->zone_d2[0][1];
 +            pos++;
 +            buf_s[pos] = comm->zone_d1[0];
 +            pos++;
 +        }
 +
 +        /* We only need to communicate the extremes
 +         * in the forward direction
 +         */
 +        npulse = comm->cd[d].np;
 +        if (bPBC)
 +        {
 +            /* Take the minimum to avoid double communication */
 +            npulse_min = min(npulse, dd->nc[dim]-1-npulse);
 +        }
 +        else
 +        {
 +            /* Without PBC we should really not communicate over
 +             * the boundaries, but implementing that complicates
 +             * the communication setup and therefore we simply
 +             * do all communication, but ignore some data.
 +             */
 +            npulse_min = npulse;
 +        }
 +        for (p = 0; p < npulse_min; p++)
 +        {
 +            /* Communicate the extremes forward */
 +            bUse = (bPBC || dd->ci[dim] > 0);
 +
 +            dd_sendrecv_rvec(dd, d, dddirForward,
 +                             extr_s+d, dd->ndim-d-1,
 +                             extr_r+d, dd->ndim-d-1);
 +
 +            if (bUse)
 +            {
 +                for (d1 = d; d1 < dd->ndim-1; d1++)
 +                {
 +                    extr_s[d1][0] = max(extr_s[d1][0], extr_r[d1][0]);
 +                    extr_s[d1][1] = min(extr_s[d1][1], extr_r[d1][1]);
 +                    extr_s[d1][2] = min(extr_s[d1][2], extr_r[d1][2]);
 +                }
 +            }
 +        }
 +
 +        buf_size = pos;
 +        for (p = 0; p < npulse; p++)
 +        {
 +            /* Communicate all the zone information backward */
 +            bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
 +
 +            dd_sendrecv_ddzone(dd, d, dddirBackward,
 +                               buf_s, buf_size,
 +                               buf_r, buf_size);
 +
 +            clear_rvec(dh);
 +            if (p > 0)
 +            {
 +                for (d1 = d+1; d1 < dd->ndim; d1++)
 +                {
 +                    /* Determine the decrease of maximum required
 +                     * communication height along d1 due to the distance
 +                     * along d; this avoids a lot of useless atom
 +                     * communication.
 +                     */
 +                    dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
 +
 +                    if (ddbox->tric_dir[dim])
 +                    {
 +                        /* c is the off-diagonal coupling between the cell planes
 +                         * along directions d and d1.
 +                         */
 +                        c = ddbox->v[dim][dd->dim[d1]][dim];
 +                    }
 +                    else
 +                    {
 +                        c = 0;
 +                    }
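 +                    /* The expressions below solve, for the boundary
 +                     * height h along d1, the condition that a point at
 +                     * in-plane distance dist_d and at height h sits
 +                     * exactly at the cut-off:
 +                     *   (dist_d - c*h)^2 + h^2 = cutoff^2
 +                     * The larger root is
 +                     *   h = (c*dist_d + sqrt(det))/(1 + c^2)
 +                     * with det as computed below, and dh = cutoff - h is
 +                     * the reduction of the required communication height.
 +                     */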
 +                    det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
 +                    if (det > 0)
 +                    {
 +                        dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
 +                    }
 +                    else
 +                    {
 +                        /* A negative value signals out of range */
 +                        dh[d1] = -1;
 +                    }
 +                }
 +            }
 +
 +            /* Accumulate the extremes over all pulses */
 +            for (i = 0; i < buf_size; i++)
 +            {
 +                if (p == 0)
 +                {
 +                    buf_e[i] = buf_r[i];
 +                }
 +                else
 +                {
 +                    if (bUse)
 +                    {
 +                        buf_e[i].min0 = min(buf_e[i].min0, buf_r[i].min0);
 +                        buf_e[i].max1 = max(buf_e[i].max1, buf_r[i].max1);
 +                        buf_e[i].min1 = min(buf_e[i].min1, buf_r[i].min1);
 +                    }
 +
 +                    if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
 +                    {
 +                        d1 = 1;
 +                    }
 +                    else
 +                    {
 +                        d1 = d + 1;
 +                    }
 +                    if (bUse && dh[d1] >= 0)
 +                    {
 +                        buf_e[i].mch0 = max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
 +                        buf_e[i].mch1 = max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
 +                    }
 +                }
 +                /* Copy the received buffer to the send buffer,
 +                 * to pass the data through with the next pulse.
 +                 */
 +                buf_s[i] = buf_r[i];
 +            }
 +            if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
 +                (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
 +            {
 +                /* Store the extremes */
 +                pos = 0;
 +
 +                for (d1 = d; d1 < dd->ndim-1; d1++)
 +                {
 +                    extr_s[d1][1] = min(extr_s[d1][1], buf_e[pos].min0);
 +                    extr_s[d1][0] = max(extr_s[d1][0], buf_e[pos].max1);
 +                    extr_s[d1][2] = min(extr_s[d1][2], buf_e[pos].min1);
 +                    pos++;
 +                }
 +
 +                if (d == 1 || (d == 0 && dd->ndim == 3))
 +                {
 +                    for (i = d; i < 2; i++)
 +                    {
 +                        comm->zone_d2[1-d][i] = buf_e[pos];
 +                        pos++;
 +                    }
 +                }
 +                if (d == 0)
 +                {
 +                    comm->zone_d1[1] = buf_e[pos];
 +                    pos++;
 +                }
 +            }
 +        }
 +    }
 +
 +    if (dd->ndim >= 2)
 +    {
 +        dim = dd->dim[1];
 +        for (i = 0; i < 2; i++)
 +        {
 +            if (debug)
 +            {
 +                print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
 +            }
 +            cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d1[i].min0);
 +            cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d1[i].max1);
 +        }
 +    }
 +    if (dd->ndim >= 3)
 +    {
 +        dim = dd->dim[2];
 +        for (i = 0; i < 2; i++)
 +        {
 +            for (j = 0; j < 2; j++)
 +            {
 +                if (debug)
 +                {
 +                    print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
 +                }
 +                cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
 +                cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
 +            }
 +        }
 +    }
 +    for (d = 1; d < dd->ndim; d++)
 +    {
 +        comm->cell_f_max0[d] = extr_s[d-1][0];
 +        comm->cell_f_min1[d] = extr_s[d-1][1];
 +        if (debug)
 +        {
 +            fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
 +                    d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
 +        }
 +    }
 +}
 +
 +static void dd_collect_cg(gmx_domdec_t *dd,
 +                          t_state      *state_local)
 +{
 +    gmx_domdec_master_t *ma = NULL;
 +    int                  buf2[2], *ibuf, i, ncg_home = 0, *cg = NULL, nat_home = 0;
 +    t_block             *cgs_gl;
 +
 +    if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
 +    {
 +        /* The master has the correct distribution */
 +        return;
 +    }
 +
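 +    /* ddp_count is incremented at every repartitioning, so it identifies
 +     * which DD distribution the arrays in a state correspond to;
 +     * master_cg_ddp_count records the distribution the master last
 +     * collected. Below we pick the matching source for the global
 +     * charge group indices.
 +     */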
 +    if (state_local->ddp_count == dd->ddp_count)
 +    {
 +        ncg_home = dd->ncg_home;
 +        cg       = dd->index_gl;
 +        nat_home = dd->nat_home;
 +    }
 +    else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
 +    {
 +        cgs_gl = &dd->comm->cgs_gl;
 +
 +        ncg_home = state_local->ncg_gl;
 +        cg       = state_local->cg_gl;
 +        nat_home = 0;
 +        for (i = 0; i < ncg_home; i++)
 +        {
 +            nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
 +        }
 +    }
 +    else
 +    {
 +        gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
 +    }
 +
 +    buf2[0] = ncg_home;
 +    buf2[1] = nat_home;
 +    if (DDMASTER(dd))
 +    {
 +        ma   = dd->ma;
 +        ibuf = ma->ibuf;
 +    }
 +    else
 +    {
 +        ibuf = NULL;
 +    }
 +    /* Collect the charge group and atom counts on the master */
 +    dd_gather(dd, 2*sizeof(int), buf2, ibuf);
 +
 +    if (DDMASTER(dd))
 +    {
 +        ma->index[0] = 0;
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            ma->ncg[i]     = ma->ibuf[2*i];
 +            ma->nat[i]     = ma->ibuf[2*i+1];
 +            ma->index[i+1] = ma->index[i] + ma->ncg[i];
 +        }
 +        /* Make byte counts and indices */
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
 +            ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug, "Initial charge group distribution: ");
 +            for (i = 0; i < dd->nnodes; i++)
 +            {
 +                fprintf(debug, " %d", ma->ncg[i]);
 +            }
 +            fprintf(debug, "\n");
 +        }
 +    }
 +
 +    /* Collect the charge group indices on the master */
 +    dd_gatherv(dd,
 +               ncg_home*sizeof(int), cg,
 +               DDMASTER(dd) ? ma->ibuf : NULL,
 +               DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
 +               DDMASTER(dd) ? ma->cg : NULL);
 +
 +    dd->comm->master_cg_ddp_count = state_local->ddp_count;
 +}
 +
 +static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
 +                                    rvec *lv, rvec *v)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                  n, i, c, a, nalloc = 0;
 +    rvec                *buf = NULL;
 +    t_block             *cgs_gl;
 +
 +    ma = dd->ma;
 +
 +    if (!DDMASTER(dd))
 +    {
 +#ifdef GMX_MPI
 +        MPI_Send(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
 +                 dd->rank, dd->mpi_comm_all);
 +#endif
 +    }
 +    else
 +    {
 +        /* Copy the master coordinates to the global array */
 +        cgs_gl = &dd->comm->cgs_gl;
 +
 +        n = DDMASTERRANK(dd);
 +        a = 0;
 +        for (i = ma->index[n]; i < ma->index[n+1]; i++)
 +        {
 +            for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
 +            {
 +                copy_rvec(lv[a++], v[c]);
 +            }
 +        }
 +
 +        for (n = 0; n < dd->nnodes; n++)
 +        {
 +            if (n != dd->rank)
 +            {
 +                if (ma->nat[n] > nalloc)
 +                {
 +                    nalloc = over_alloc_dd(ma->nat[n]);
 +                    srenew(buf, nalloc);
 +                }
 +#ifdef GMX_MPI
 +                MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
 +                         n, dd->mpi_comm_all, MPI_STATUS_IGNORE);
 +#endif
 +                a = 0;
 +                for (i = ma->index[n]; i < ma->index[n+1]; i++)
 +                {
 +                    for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
 +                    {
 +                        copy_rvec(buf[a++], v[c]);
 +                    }
 +                }
 +            }
 +        }
 +        sfree(buf);
 +    }
 +}
 +
 +static void get_commbuffer_counts(gmx_domdec_t *dd,
 +                                  int **counts, int **disps)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                  n;
 +
 +    ma = dd->ma;
 +
 +    /* Make the rvec count and displacement arrays */
 +    *counts  = ma->ibuf;
 +    *disps   = ma->ibuf + dd->nnodes;
 +    for (n = 0; n < dd->nnodes; n++)
 +    {
 +        (*counts)[n] = ma->nat[n]*sizeof(rvec);
 +        (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
 +    }
 +}
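 +/* Example: with three nodes holding nat = {4, 2, 3} home atoms, the byte
 + * counts become {4, 2, 3}*sizeof(rvec) and the displacements
 + * {0, 4, 6}*sizeof(rvec): each node's block starts where the previous
 + * one ends.
 + */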
 +
 +static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
 +                                   rvec *lv, rvec *v)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                 *rcounts = NULL, *disps = NULL;
 +    int                  n, i, c, a;
 +    rvec                *buf = NULL;
 +    t_block             *cgs_gl;
 +
 +    ma = dd->ma;
 +
 +    if (DDMASTER(dd))
 +    {
 +        get_commbuffer_counts(dd, &rcounts, &disps);
 +
 +        buf = ma->vbuf;
 +    }
 +
 +    dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv, rcounts, disps, buf);
 +
 +    if (DDMASTER(dd))
 +    {
 +        cgs_gl = &dd->comm->cgs_gl;
 +
 +        a = 0;
 +        for (n = 0; n < dd->nnodes; n++)
 +        {
 +            for (i = ma->index[n]; i < ma->index[n+1]; i++)
 +            {
 +                for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
 +                {
 +                    copy_rvec(buf[a++], v[c]);
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +void dd_collect_vec(gmx_domdec_t *dd,
 +                    t_state *state_local, rvec *lv, rvec *v)
 +{
 +    dd_collect_cg(dd, state_local);
 +
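 +    /* With few nodes pairwise send/receive avoids collective setup
 +     * overhead; above GMX_DD_NNODES_SENDRECV nodes a single MPI gatherv
 +     * is used instead, which scales better with the node count.
 +     */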
 +    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
 +    {
 +        dd_collect_vec_sendrecv(dd, lv, v);
 +    }
 +    else
 +    {
 +        dd_collect_vec_gatherv(dd, lv, v);
 +    }
 +}
 +
 +
 +void dd_collect_state(gmx_domdec_t *dd,
 +                      t_state *state_local, t_state *state)
 +{
 +    int est, i, j, nh;
 +
 +    nh = state->nhchainlength;
 +
 +    if (DDMASTER(dd))
 +    {
 +        for (i = 0; i < efptNR; i++)
 +        {
 +            state->lambda[i] = state_local->lambda[i];
 +        }
 +        state->fep_state = state_local->fep_state;
 +        state->veta      = state_local->veta;
 +        state->vol0      = state_local->vol0;
 +        copy_mat(state_local->box, state->box);
 +        copy_mat(state_local->boxv, state->boxv);
 +        copy_mat(state_local->svir_prev, state->svir_prev);
 +        copy_mat(state_local->fvir_prev, state->fvir_prev);
 +        copy_mat(state_local->pres_prev, state->pres_prev);
 +
 +        for (i = 0; i < state_local->ngtc; i++)
 +        {
 +            for (j = 0; j < nh; j++)
 +            {
 +                state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
 +                state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
 +            }
 +            state->therm_integral[i] = state_local->therm_integral[i];
 +        }
 +        for (i = 0; i < state_local->nnhpres; i++)
 +        {
 +            for (j = 0; j < nh; j++)
 +            {
 +                state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
 +                state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
 +            }
 +        }
 +    }
 +    for (est = 0; est < estNR; est++)
 +    {
 +        if (EST_DISTR(est) && (state_local->flags & (1<<est)))
 +        {
 +            switch (est)
 +            {
 +                case estX:
 +                    dd_collect_vec(dd, state_local, state_local->x, state->x);
 +                    break;
 +                case estV:
 +                    dd_collect_vec(dd, state_local, state_local->v, state->v);
 +                    break;
 +                case estSDX:
 +                    dd_collect_vec(dd, state_local, state_local->sd_X, state->sd_X);
 +                    break;
 +                case estCGP:
 +                    dd_collect_vec(dd, state_local, state_local->cg_p, state->cg_p);
 +                    break;
 +                case estLD_RNG:
 +                    if (state->nrngi == 1)
 +                    {
 +                        if (DDMASTER(dd))
 +                        {
 +                            for (i = 0; i < state_local->nrng; i++)
 +                            {
 +                                state->ld_rng[i] = state_local->ld_rng[i];
 +                            }
 +                        }
 +                    }
 +                    else
 +                    {
 +                        dd_gather(dd, state_local->nrng*sizeof(state->ld_rng[0]),
 +                                  state_local->ld_rng, state->ld_rng);
 +                    }
 +                    break;
 +                case estLD_RNGI:
 +                    if (state->nrngi == 1)
 +                    {
 +                        if (DDMASTER(dd))
 +                        {
 +                            state->ld_rngi[0] = state_local->ld_rngi[0];
 +                        }
 +                    }
 +                    else
 +                    {
 +                        dd_gather(dd, sizeof(state->ld_rngi[0]),
 +                                  state_local->ld_rngi, state->ld_rngi);
 +                    }
 +                    break;
 +                case estDISRE_INITF:
 +                case estDISRE_RM3TAV:
 +                case estORIRE_INITF:
 +                case estORIRE_DTAV:
 +                    break;
 +                default:
 +                    gmx_incons("Unknown state entry encountered in dd_collect_state");
 +            }
 +        }
 +    }
 +}
 +
 +static void dd_realloc_state(t_state *state, rvec **f, int nalloc)
 +{
 +    int est;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Reallocating state: currently %d, required %d, allocating %d\n", state->nalloc, nalloc, over_alloc_dd(nalloc));
 +    }
 +
 +    state->nalloc = over_alloc_dd(nalloc);
 +
 +    for (est = 0; est < estNR; est++)
 +    {
 +        if (EST_DISTR(est) && (state->flags & (1<<est)))
 +        {
 +            switch (est)
 +            {
 +                case estX:
 +                    srenew(state->x, state->nalloc);
 +                    break;
 +                case estV:
 +                    srenew(state->v, state->nalloc);
 +                    break;
 +                case estSDX:
 +                    srenew(state->sd_X, state->nalloc);
 +                    break;
 +                case estCGP:
 +                    srenew(state->cg_p, state->nalloc);
 +                    break;
 +                case estLD_RNG:
 +                case estLD_RNGI:
 +                case estDISRE_INITF:
 +                case estDISRE_RM3TAV:
 +                case estORIRE_INITF:
 +                case estORIRE_DTAV:
 +                    /* No reallocation required */
 +                    break;
 +                default:
 +                    gmx_incons("Unknown state entry encountered in dd_realloc_state");
 +            }
 +        }
 +    }
 +
 +    if (f != NULL)
 +    {
 +        srenew(*f, state->nalloc);
 +    }
 +}
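 +/* over_alloc_dd() returns the requested size increased by a fixed factor
 + * (roughly 20% when over-allocation is enabled), so that a slowly growing
 + * home atom count does not trigger an srenew at every repartitioning.
 + */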
 +
 +static void dd_check_alloc_ncg(t_forcerec *fr, t_state *state, rvec **f,
 +                               int nalloc)
 +{
 +    if (nalloc > fr->cg_nalloc)
 +    {
 +        if (debug)
 +        {
 +            fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, nalloc, over_alloc_dd(nalloc));
 +        }
 +        fr->cg_nalloc = over_alloc_dd(nalloc);
 +        srenew(fr->cginfo, fr->cg_nalloc);
 +        if (fr->cutoff_scheme == ecutsGROUP)
 +        {
 +            srenew(fr->cg_cm, fr->cg_nalloc);
 +        }
 +    }
 +    if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
 +    {
 +        /* We don't use charge groups, we use x in state to set up
 +         * the atom communication.
 +         */
 +        dd_realloc_state(state, f, nalloc);
 +    }
 +}
 +
 +static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
 +                                       rvec *v, rvec *lv)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                  n, i, c, a, nalloc = 0;
 +    rvec                *buf = NULL;
 +
 +    if (DDMASTER(dd))
 +    {
 +        ma  = dd->ma;
 +
 +        for (n = 0; n < dd->nnodes; n++)
 +        {
 +            if (n != dd->rank)
 +            {
 +                if (ma->nat[n] > nalloc)
 +                {
 +                    nalloc = over_alloc_dd(ma->nat[n]);
 +                    srenew(buf, nalloc);
 +                }
 +                /* Use buf as a temporary send buffer */
 +                a = 0;
 +                for (i = ma->index[n]; i < ma->index[n+1]; i++)
 +                {
 +                    for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
 +                    {
 +                        copy_rvec(v[c], buf[a++]);
 +                    }
 +                }
 +                if (a != ma->nat[n])
 +                {
 +                    gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
 +                              a, ma->nat[n]);
 +                }
 +
 +#ifdef GMX_MPI
 +                MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
 +                         DDRANK(dd, n), n, dd->mpi_comm_all);
 +#endif
 +            }
 +        }
 +        sfree(buf);
 +        n = DDMASTERRANK(dd);
 +        a = 0;
 +        for (i = ma->index[n]; i < ma->index[n+1]; i++)
 +        {
 +            for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
 +            {
 +                copy_rvec(v[c], lv[a++]);
 +            }
 +        }
 +    }
 +    else
 +    {
 +#ifdef GMX_MPI
 +        MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
 +                 MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
 +#endif
 +    }
 +}
 +
 +static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
 +                                       rvec *v, rvec *lv)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                 *scounts = NULL, *disps = NULL;
 +    int                  n, i, c, a, nalloc = 0;
 +    rvec                *buf = NULL;
 +
 +    if (DDMASTER(dd))
 +    {
 +        ma  = dd->ma;
 +
 +        get_commbuffer_counts(dd, &scounts, &disps);
 +
 +        buf = ma->vbuf;
 +        a   = 0;
 +        for (n = 0; n < dd->nnodes; n++)
 +        {
 +            for (i = ma->index[n]; i < ma->index[n+1]; i++)
 +            {
 +                for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
 +                {
 +                    copy_rvec(v[c], buf[a++]);
 +                }
 +            }
 +        }
 +    }
 +
 +    dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
 +}
 +
 +static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs, rvec *v, rvec *lv)
 +{
 +    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
 +    {
 +        dd_distribute_vec_sendrecv(dd, cgs, v, lv);
 +    }
 +    else
 +    {
 +        dd_distribute_vec_scatterv(dd, cgs, v, lv);
 +    }
 +}
 +
 +static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
 +                                t_state *state, t_state *state_local,
 +                                rvec **f)
 +{
 +    int  i, j, nh;
 +
 +    nh = state->nhchainlength;
 +
 +    if (DDMASTER(dd))
 +    {
 +        for (i = 0; i < efptNR; i++)
 +        {
 +            state_local->lambda[i] = state->lambda[i];
 +        }
 +        state_local->fep_state = state->fep_state;
 +        state_local->veta      = state->veta;
 +        state_local->vol0      = state->vol0;
 +        copy_mat(state->box, state_local->box);
 +        copy_mat(state->box_rel, state_local->box_rel);
 +        copy_mat(state->boxv, state_local->boxv);
 +        copy_mat(state->svir_prev, state_local->svir_prev);
 +        copy_mat(state->fvir_prev, state_local->fvir_prev);
 +        for (i = 0; i < state_local->ngtc; i++)
 +        {
 +            for (j = 0; j < nh; j++)
 +            {
 +                state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
 +                state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
 +            }
 +            state_local->therm_integral[i] = state->therm_integral[i];
 +        }
 +        for (i = 0; i < state_local->nnhpres; i++)
 +        {
 +            for (j = 0; j < nh; j++)
 +            {
 +                state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
 +                state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
 +            }
 +        }
 +    }
 +    dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda);
 +    dd_bcast(dd, sizeof(int), &state_local->fep_state);
 +    dd_bcast(dd, sizeof(real), &state_local->veta);
 +    dd_bcast(dd, sizeof(real), &state_local->vol0);
 +    dd_bcast(dd, sizeof(state_local->box), state_local->box);
 +    dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
 +    dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
 +    dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
 +    dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
 +    dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi);
 +    dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi);
 +    dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral);
 +    dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi);
 +    dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi);
 +
 +    if (dd->nat_home > state_local->nalloc)
 +    {
 +        dd_realloc_state(state_local, f, dd->nat_home);
 +    }
 +    for (i = 0; i < estNR; i++)
 +    {
 +        if (EST_DISTR(i) && (state_local->flags & (1<<i)))
 +        {
 +            switch (i)
 +            {
 +                case estX:
 +                    dd_distribute_vec(dd, cgs, state->x, state_local->x);
 +                    break;
 +                case estV:
 +                    dd_distribute_vec(dd, cgs, state->v, state_local->v);
 +                    break;
 +                case estSDX:
 +                    dd_distribute_vec(dd, cgs, state->sd_X, state_local->sd_X);
 +                    break;
 +                case estCGP:
 +                    dd_distribute_vec(dd, cgs, state->cg_p, state_local->cg_p);
 +                    break;
 +                case estLD_RNG:
 +                    if (state->nrngi == 1)
 +                    {
 +                        dd_bcastc(dd,
 +                                  state_local->nrng*sizeof(state_local->ld_rng[0]),
 +                                  state->ld_rng, state_local->ld_rng);
 +                    }
 +                    else
 +                    {
 +                        dd_scatter(dd,
 +                                   state_local->nrng*sizeof(state_local->ld_rng[0]),
 +                                   state->ld_rng, state_local->ld_rng);
 +                    }
 +                    break;
 +                case estLD_RNGI:
 +                    if (state->nrngi == 1)
 +                    {
 +                        dd_bcastc(dd, sizeof(state_local->ld_rngi[0]),
 +                                  state->ld_rngi, state_local->ld_rngi);
 +                    }
 +                    else
 +                    {
 +                        dd_scatter(dd, sizeof(state_local->ld_rngi[0]),
 +                                   state->ld_rngi, state_local->ld_rngi);
 +                    }
 +                    break;
 +                case estDISRE_INITF:
 +                case estDISRE_RM3TAV:
 +                case estORIRE_INITF:
 +                case estORIRE_DTAV:
 +                    /* Not implemented yet */
 +                    break;
 +                default:
 +                    gmx_incons("Unknown state entry encountered in dd_distribute_state");
 +            }
 +        }
 +    }
 +}
 +
 +static char dim2char(int dim)
 +{
 +    char c = '?';
 +
 +    switch (dim)
 +    {
 +        case XX: c = 'X'; break;
 +        case YY: c = 'Y'; break;
 +        case ZZ: c = 'Z'; break;
 +        default: gmx_fatal(FARGS, "Unknown dim %d", dim);
 +    }
 +
 +    return c;
 +}
 +
 +static void write_dd_grid_pdb(const char *fn, gmx_large_int_t step,
 +                              gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
 +{
 +    rvec   grid_s[2], *grid_r = NULL, cx, r;
 +    char   fname[STRLEN], format[STRLEN], buf[22];
 +    FILE  *out;
 +    int    a, i, d, z, y, x;
 +    matrix tric;
 +    real   vol;
 +
 +    copy_rvec(dd->comm->cell_x0, grid_s[0]);
 +    copy_rvec(dd->comm->cell_x1, grid_s[1]);
 +
 +    if (DDMASTER(dd))
 +    {
 +        snew(grid_r, 2*dd->nnodes);
 +    }
 +
 +    dd_gather(dd, 2*sizeof(rvec), grid_s[0], DDMASTER(dd) ? grid_r[0] : NULL);
 +
 +    if (DDMASTER(dd))
 +    {
 +        for (d = 0; d < DIM; d++)
 +        {
 +            for (i = 0; i < DIM; i++)
 +            {
 +                if (d == i)
 +                {
 +                    tric[d][i] = 1;
 +                }
 +                else
 +                {
 +                    if (d < ddbox->npbcdim && dd->nc[d] > 1)
 +                    {
 +                        tric[d][i] = box[i][d]/box[i][i];
 +                    }
 +                    else
 +                    {
 +                        tric[d][i] = 0;
 +                    }
 +                }
 +            }
 +        }
 +        sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
 +        sprintf(format, "%s%s\n", get_pdbformat(), "%6.2f%6.2f");
 +        out = gmx_fio_fopen(fname, "w");
 +        gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
 +        a = 1;
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
 +            for (d = 0; d < DIM; d++)
 +            {
 +                vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
 +            }
 +            for (z = 0; z < 2; z++)
 +            {
 +                for (y = 0; y < 2; y++)
 +                {
 +                    for (x = 0; x < 2; x++)
 +                    {
 +                        cx[XX] = grid_r[i*2+x][XX];
 +                        cx[YY] = grid_r[i*2+y][YY];
 +                        cx[ZZ] = grid_r[i*2+z][ZZ];
 +                        mvmul(tric, cx, r);
 +                        fprintf(out, format, "ATOM", a++, "CA", "GLY", ' ', 1+i,
++                                ' ', 10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol);
 +                    }
 +                }
 +            }
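 +            /* The 8 corners of cell i were written above with x as the
 +             * fastest running index, so corner a = 1 + i*8 + (z*4 + y*2 + x).
 +             * The loop below emits the 12 cell edges: for each dimension d
 +             * it selects the 4 corners with bit d clear and connects each
 +             * to its neighbor along d, which has index y + (1<<d).
 +             */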
 +            for (d = 0; d < DIM; d++)
 +            {
 +                for (x = 0; x < 4; x++)
 +                {
 +                    switch (d)
 +                    {
 +                        case 0: y = 1 + i*8 + 2*x; break;
 +                        case 1: y = 1 + i*8 + 2*x - (x % 2); break;
 +                        case 2: y = 1 + i*8 + x; break;
 +                    }
 +                    fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
 +                }
 +            }
 +        }
 +        gmx_fio_fclose(out);
 +        sfree(grid_r);
 +    }
 +}
 +
 +void write_dd_pdb(const char *fn, gmx_large_int_t step, const char *title,
 +                  gmx_mtop_t *mtop, t_commrec *cr,
 +                  int natoms, rvec x[], matrix box)
 +{
 +    char          fname[STRLEN], format[STRLEN], format4[STRLEN], buf[22];
 +    FILE         *out;
 +    int           i, ii, resnr, c;
 +    char         *atomname, *resname;
 +    real          b;
 +    gmx_domdec_t *dd;
 +
 +    dd = cr->dd;
 +    if (natoms == -1)
 +    {
 +        natoms = dd->comm->nat[ddnatVSITE];
 +    }
 +
 +    sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
 +
 +    sprintf(format, "%s%s\n", get_pdbformat(), "%6.2f%6.2f");
 +    sprintf(format4, "%s%s\n", get_pdbformat4(), "%6.2f%6.2f");
 +
 +    out = gmx_fio_fopen(fname, "w");
 +
 +    fprintf(out, "TITLE     %s\n", title);
 +    gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
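 +    /* The B-factor column written below encodes where each atom comes
 +     * from: atoms in the home or communicated zones get their zone index,
 +     * atoms communicated for vsite construction get zones.n and atoms
 +     * communicated only for constraints zones.n + 1.
 +     */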
 +    for (i = 0; i < natoms; i++)
 +    {
 +        ii = dd->gatindex[i];
 +        gmx_mtop_atominfo_global(mtop, ii, &atomname, &resnr, &resname);
 +        if (i < dd->comm->nat[ddnatZONE])
 +        {
 +            c = 0;
 +            while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
 +            {
 +                c++;
 +            }
 +            b = c;
 +        }
 +        else if (i < dd->comm->nat[ddnatVSITE])
 +        {
 +            b = dd->comm->zones.n;
 +        }
 +        else
 +        {
 +            b = dd->comm->zones.n + 1;
 +        }
 +        fprintf(out, strlen(atomname) < 4 ? format : format4,
 +                "ATOM", (ii+1)%100000,
 +                atomname, resname, ' ', resnr%10000, ' ',
 +                10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b);
 +    }
 +    fprintf(out, "TER\n");
 +
 +    gmx_fio_fclose(out);
 +}
 +
 +real dd_cutoff_mbody(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                di;
 +    real               r;
 +
 +    comm = dd->comm;
 +
 +    r = -1;
 +    if (comm->bInterCGBondeds)
 +    {
 +        if (comm->cutoff_mbody > 0)
 +        {
 +            r = comm->cutoff_mbody;
 +        }
 +        else
 +        {
 +            /* cutoff_mbody=0 means we do not have DLB */
 +            r = comm->cellsize_min[dd->dim[0]];
 +            for (di = 1; di < dd->ndim; di++)
 +            {
 +                r = min(r, comm->cellsize_min[dd->dim[di]]);
 +            }
 +            if (comm->bBondComm)
 +            {
 +                r = max(r, comm->cutoff_mbody);
 +            }
 +            else
 +            {
 +                r = min(r, comm->cutoff);
 +            }
 +        }
 +    }
 +
 +    return r;
 +}
 +
 +real dd_cutoff_twobody(gmx_domdec_t *dd)
 +{
 +    real r_mb;
 +
 +    r_mb = dd_cutoff_mbody(dd);
 +
 +    return max(dd->comm->cutoff, r_mb);
 +}
 +
 +
 +static void dd_cart_coord2pmecoord(gmx_domdec_t *dd, ivec coord, ivec coord_pme)
 +{
 +    int nc, ntot;
 +
 +    nc   = dd->nc[dd->comm->cartpmedim];
 +    ntot = dd->comm->ntot[dd->comm->cartpmedim];
 +    copy_ivec(coord, coord_pme);
 +    coord_pme[dd->comm->cartpmedim] =
 +        nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
 +}
 +
 +static int low_ddindex2pmeindex(int ndd, int npme, int ddindex)
 +{
 +    /* Here we assign a PME node to communicate with this DD node
 +     * by assuming that the major index of both is x.
 +     * We add npme/2 to obtain an even distribution.
 +     */
 +    return (ddindex*npme + npme/2)/ndd;
 +}
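 +/* Example: with ndd = 6 DD nodes and npme = 2 PME nodes this maps
 + * ddindex 0,1,2 to PME node 0 and ddindex 3,4,5 to PME node 1,
 + * since (ddindex*2 + 1)/6 evaluates to 0,0,0,1,1,1.
 + */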
 +
 +static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
 +{
 +    return low_ddindex2pmeindex(dd->nnodes, dd->comm->npmenodes, ddindex);
 +}
 +
 +static int cr_ddindex2pmeindex(const t_commrec *cr, int ddindex)
 +{
 +    return low_ddindex2pmeindex(cr->dd->nnodes, cr->npmenodes, ddindex);
 +}
 +
 +static int *dd_pmenodes(t_commrec *cr)
 +{
 +    int *pmenodes;
 +    int  n, i, p0, p1;
 +
 +    snew(pmenodes, cr->npmenodes);
 +    n = 0;
 +    for (i = 0; i < cr->dd->nnodes; i++)
 +    {
 +        p0 = cr_ddindex2pmeindex(cr, i);
 +        p1 = cr_ddindex2pmeindex(cr, i+1);
 +        if (i+1 == cr->dd->nnodes || p1 > p0)
 +        {
 +            if (debug)
 +            {
 +                fprintf(debug, "pmenode[%d] = %d\n", n, i+1+n);
 +            }
 +            pmenodes[n] = i + 1 + n;
 +            n++;
 +        }
 +    }
 +
 +    return pmenodes;
 +}
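 +/* Continuing the example of 6 PP and 2 PME ranks: a PME rank is inserted
 + * after every block of 3 PP ranks, so the PP ranks get sim node ids
 + * {0, 1, 2, 4, 5, 6} and pmenodes = {3, 7}.
 + */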
 +
 +static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
 +{
 +    gmx_domdec_t *dd;
 +    ivec          coords, coords_pme, nc;
 +    int           slab;
 +
 +    dd = cr->dd;
 +    /*
 +       if (dd->comm->bCartesian) {
 +       gmx_ddindex2xyz(dd->nc,ddindex,coords);
 +       dd_coords2pmecoords(dd,coords,coords_pme);
 +       copy_ivec(dd->ntot,nc);
 +       nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
 +       coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
 +
 +       slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
 +       } else {
 +       slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
 +       }
 +     */
 +    coords[XX] = x;
 +    coords[YY] = y;
 +    coords[ZZ] = z;
 +    slab       = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
 +
 +    return slab;
 +}
 +
 +static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z)
 +{
 +    gmx_domdec_comm_t *comm;
 +    ivec               coords;
 +    int                ddindex, nodeid = -1;
 +
 +    comm = cr->dd->comm;
 +
 +    coords[XX] = x;
 +    coords[YY] = y;
 +    coords[ZZ] = z;
 +    if (comm->bCartesianPP_PME)
 +    {
 +#ifdef GMX_MPI
 +        MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
 +#endif
 +    }
 +    else
 +    {
 +        ddindex = dd_index(cr->dd->nc, coords);
 +        if (comm->bCartesianPP)
 +        {
 +            nodeid = comm->ddindex2simnodeid[ddindex];
 +        }
 +        else
 +        {
 +            if (comm->pmenodes)
 +            {
 +                nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
 +            }
 +            else
 +            {
 +                nodeid = ddindex;
 +            }
 +        }
 +    }
 +
 +    return nodeid;
 +}
 +
 +static int dd_simnode2pmenode(t_commrec *cr, int sim_nodeid)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    ivec               coord, coord_pme;
 +    int                i;
 +    int                pmenode = -1;
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +    /* This assumes a uniform x domain decomposition grid cell size */
 +    if (comm->bCartesianPP_PME)
 +    {
 +#ifdef GMX_MPI
 +        MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
 +        if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
 +        {
 +            /* This is a PP node */
 +            dd_cart_coord2pmecoord(dd, coord, coord_pme);
 +            MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
 +        }
 +#endif
 +    }
 +    else if (comm->bCartesianPP)
 +    {
 +        if (sim_nodeid < dd->nnodes)
 +        {
 +            pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
 +        }
 +    }
 +    else
 +    {
 +        /* This assumes DD cells with identical x coordinates
 +         * are numbered sequentially.
 +         */
 +        if (dd->comm->pmenodes == NULL)
 +        {
 +            if (sim_nodeid < dd->nnodes)
 +            {
 +                /* The DD index equals the nodeid */
 +                pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
 +            }
 +        }
 +        else
 +        {
 +            i = 0;
 +            while (sim_nodeid > dd->comm->pmenodes[i])
 +            {
 +                i++;
 +            }
 +            if (sim_nodeid < dd->comm->pmenodes[i])
 +            {
 +                pmenode = dd->comm->pmenodes[i];
 +            }
 +        }
 +    }
 +
 +    return pmenode;
 +}
 +
 +void get_pme_nnodes(const gmx_domdec_t *dd,
 +                    int *npmenodes_x, int *npmenodes_y)
 +{
 +    if (dd != NULL)
 +    {
 +        *npmenodes_x = dd->comm->npmenodes_x;
 +        *npmenodes_y = dd->comm->npmenodes_y;
 +    }
 +    else
 +    {
 +        *npmenodes_x = 1;
 +        *npmenodes_y = 1;
 +    }
 +}
 +
 +gmx_bool gmx_pmeonlynode(t_commrec *cr, int sim_nodeid)
 +{
 +    gmx_bool bPMEOnlyNode;
 +
 +    if (DOMAINDECOMP(cr))
 +    {
 +        bPMEOnlyNode = (dd_simnode2pmenode(cr, sim_nodeid) == -1);
 +    }
 +    else
 +    {
 +        bPMEOnlyNode = FALSE;
 +    }
 +
 +    return bPMEOnlyNode;
 +}
 +
 +void get_pme_ddnodes(t_commrec *cr, int pmenodeid,
 +                     int *nmy_ddnodes, int **my_ddnodes, int *node_peer)
 +{
 +    gmx_domdec_t *dd;
 +    int           x, y, z;
 +    ivec          coord, coord_pme;
 +
 +    dd = cr->dd;
 +
 +    snew(*my_ddnodes, (dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
 +
 +    *nmy_ddnodes = 0;
 +    for (x = 0; x < dd->nc[XX]; x++)
 +    {
 +        for (y = 0; y < dd->nc[YY]; y++)
 +        {
 +            for (z = 0; z < dd->nc[ZZ]; z++)
 +            {
 +                if (dd->comm->bCartesianPP_PME)
 +                {
 +                    coord[XX] = x;
 +                    coord[YY] = y;
 +                    coord[ZZ] = z;
 +                    dd_cart_coord2pmecoord(dd, coord, coord_pme);
 +                    if (dd->ci[XX] == coord_pme[XX] &&
 +                        dd->ci[YY] == coord_pme[YY] &&
 +                        dd->ci[ZZ] == coord_pme[ZZ])
 +                    {
 +                        (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
 +                    }
 +                }
 +                else
 +                {
 +                    /* The slab corresponds to the nodeid in the PME group */
 +                    if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
 +                    {
 +                        (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    /* The last PP-only node is the peer node */
 +    *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Receive coordinates from PP nodes:");
 +        for (x = 0; x < *nmy_ddnodes; x++)
 +        {
 +            fprintf(debug, " %d", (*my_ddnodes)[x]);
 +        }
 +        fprintf(debug, "\n");
 +    }
 +}
 +
 +static gmx_bool receive_vir_ener(t_commrec *cr)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                pmenode, coords[DIM], rank;
 +    gmx_bool           bReceive;
 +
 +    bReceive = TRUE;
 +    if (cr->npmenodes < cr->dd->nnodes)
 +    {
 +        comm = cr->dd->comm;
 +        if (comm->bCartesianPP_PME)
 +        {
 +            pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
 +#ifdef GMX_MPI
 +            MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
 +            coords[comm->cartpmedim]++;
 +            if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
 +            {
 +                MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
 +                if (dd_simnode2pmenode(cr, rank) == pmenode)
 +                {
 +                    /* This is not the last PP node for pmenode */
 +                    bReceive = FALSE;
 +                }
 +            }
 +#endif
 +        }
 +        else
 +        {
 +            pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
 +            if (cr->sim_nodeid+1 < cr->nnodes &&
 +                dd_simnode2pmenode(cr, cr->sim_nodeid+1) == pmenode)
 +            {
 +                /* This is not the last PP node for pmenode */
 +                bReceive = FALSE;
 +            }
 +        }
 +    }
 +
 +    return bReceive;
 +}
 +
 +static void set_zones_ncg_home(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_zones_t *zones;
 +    int                 i;
 +
 +    zones = &dd->comm->zones;
 +
 +    zones->cg_range[0] = 0;
 +    for (i = 1; i < zones->n+1; i++)
 +    {
 +        zones->cg_range[i] = dd->ncg_home;
 +    }
 +    /* zone_ncg1[0] should always be equal to ncg_home */
 +    dd->comm->zone_ncg1[0] = dd->ncg_home;
 +}
 +
 +static void rebuild_cgindex(gmx_domdec_t *dd,
 +                            const int *gcgs_index, t_state *state)
 +{
 +    int nat, i, *ind, *dd_cg_gl, *cgindex, cg_gl;
 +
 +    ind        = state->cg_gl;
 +    dd_cg_gl   = dd->index_gl;
 +    cgindex    = dd->cgindex;
 +    nat        = 0;
 +    cgindex[0] = nat;
 +    for (i = 0; i < state->ncg_gl; i++)
 +    {
 +        cgindex[i]  = nat;
 +        cg_gl       = ind[i];
 +        dd_cg_gl[i] = cg_gl;
 +        nat        += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
 +    }
 +    cgindex[i] = nat;
 +
 +    dd->ncg_home = state->ncg_gl;
 +    dd->nat_home = nat;
 +
 +    set_zones_ncg_home(dd);
 +}
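 +/* Example: with home charge groups of global sizes {3, 2, 4} (taken from
 + * gcgs_index), the loop above produces cgindex = {0, 3, 5, 9} and
 + * nat_home = 9.
 + */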
 +
 +static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
 +{
 +    while (cg >= cginfo_mb->cg_end)
 +    {
 +        cginfo_mb++;
 +    }
 +
 +    return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
 +}
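 +/* cginfo is stored once per molecule block, with entries for the charge
 + * groups of a single molecule only; the modular index
 + * (cg - cg_start) % cg_mod maps every copy of the molecule onto those
 + * entries, which avoids storing cginfo for all molecules in the system.
 + */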
 +
 +static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
 +                          t_forcerec *fr, char *bLocalCG)
 +{
 +    cginfo_mb_t *cginfo_mb;
 +    int         *cginfo;
 +    int          cg;
 +
 +    if (fr != NULL)
 +    {
 +        cginfo_mb = fr->cginfo_mb;
 +        cginfo    = fr->cginfo;
 +
 +        for (cg = cg0; cg < cg1; cg++)
 +        {
 +            cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
 +        }
 +    }
 +
 +    if (bLocalCG != NULL)
 +    {
 +        for (cg = cg0; cg < cg1; cg++)
 +        {
 +            bLocalCG[index_gl[cg]] = TRUE;
 +        }
 +    }
 +}
 +
 +static void make_dd_indices(gmx_domdec_t *dd,
 +                            const int *gcgs_index, int cg_start)
 +{
 +    int          nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
 +    int         *zone2cg, *zone_ncg1, *index_gl, *gatindex;
 +    gmx_ga2la_t *ga2la;
 +    char        *bLocalCG;
 +    gmx_bool     bCGs;
 +
 +    bLocalCG = dd->comm->bLocalCG;
 +
 +    if (dd->nat_tot > dd->gatindex_nalloc)
 +    {
 +        dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
 +        srenew(dd->gatindex, dd->gatindex_nalloc);
 +    }
 +
 +    nzone      = dd->comm->zones.n;
 +    zone2cg    = dd->comm->zones.cg_range;
 +    zone_ncg1  = dd->comm->zone_ncg1;
 +    index_gl   = dd->index_gl;
 +    gatindex   = dd->gatindex;
 +    bCGs       = dd->comm->bCGs;
 +
 +    if (zone2cg[1] != dd->ncg_home)
 +    {
 +        gmx_incons("dd->ncg_zone is not up to date");
 +    }
 +
 +    /* Make the local to global and global to local atom index */
 +    a = dd->cgindex[cg_start];
 +    for (zone = 0; zone < nzone; zone++)
 +    {
 +        if (zone == 0)
 +        {
 +            cg0 = cg_start;
 +        }
 +        else
 +        {
 +            cg0 = zone2cg[zone];
 +        }
 +        cg1    = zone2cg[zone+1];
 +        cg1_p1 = cg0 + zone_ncg1[zone];
 +
 +        for (cg = cg0; cg < cg1; cg++)
 +        {
 +            zone1 = zone;
 +            if (cg >= cg1_p1)
 +            {
 +                /* Signal that this cg is from more than one pulse away */
 +                zone1 += nzone;
 +            }
 +            cg_gl = index_gl[cg];
 +            if (bCGs)
 +            {
 +                for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
 +                {
 +                    gatindex[a] = a_gl;
 +                    ga2la_set(dd->ga2la, a_gl, a, zone1);
 +                    a++;
 +                }
 +            }
 +            else
 +            {
 +                gatindex[a] = cg_gl;
 +                ga2la_set(dd->ga2la, cg_gl, a, zone1);
 +                a++;
 +            }
 +        }
 +    }
 +}
 +
 +static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
 +                          const char *where)
 +{
 +    int i, ngl, nerr;
 +
 +    nerr = 0;
 +    if (bLocalCG == NULL)
 +    {
 +        return nerr;
 +    }
 +    for (i = 0; i < dd->ncg_tot; i++)
 +    {
 +        if (!bLocalCG[dd->index_gl[i]])
 +        {
 +            fprintf(stderr,
 +                    "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
 +            nerr++;
 +        }
 +    }
 +    ngl = 0;
 +    for (i = 0; i < ncg_sys; i++)
 +    {
 +        if (bLocalCG[i])
 +        {
 +            ngl++;
 +        }
 +    }
 +    if (ngl != dd->ncg_tot)
 +    {
 +        fprintf(stderr, "DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot);
 +        nerr++;
 +    }
 +
 +    return nerr;
 +}
 +
 +static void check_index_consistency(gmx_domdec_t *dd,
 +                                    int natoms_sys, int ncg_sys,
 +                                    const char *where)
 +{
 +    int   nerr, ngl, i, a, cell;
 +    int  *have;
 +
 +    nerr = 0;
 +
 +    if (dd->comm->DD_debug > 1)
 +    {
 +        snew(have, natoms_sys);
 +        for (a = 0; a < dd->nat_tot; a++)
 +        {
 +            if (have[dd->gatindex[a]] > 0)
 +            {
 +                fprintf(stderr, "DD node %d: global atom %d occurs twice: index %d and %d\n", dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
 +            }
 +            else
 +            {
 +                have[dd->gatindex[a]] = a + 1;
 +            }
 +        }
 +        sfree(have);
 +    }
 +
 +    snew(have, dd->nat_tot);
 +
 +    ngl  = 0;
 +    for (i = 0; i < natoms_sys; i++)
 +    {
 +        if (ga2la_get(dd->ga2la, i, &a, &cell))
 +        {
 +            if (a >= dd->nat_tot)
 +            {
 +                fprintf(stderr, "DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot);
 +                nerr++;
 +            }
 +            else
 +            {
 +                have[a] = 1;
 +                if (dd->gatindex[a] != i)
 +                {
 +                    fprintf(stderr, "DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1);
 +                    nerr++;
 +                }
 +            }
 +            ngl++;
 +        }
 +    }
 +    if (ngl != dd->nat_tot)
 +    {
 +        fprintf(stderr,
 +                "DD node %d, %s: %d global atom indices, %d local atoms\n",
 +                dd->rank, where, ngl, dd->nat_tot);
 +    }
 +    for (a = 0; a < dd->nat_tot; a++)
 +    {
 +        if (have[a] == 0)
 +        {
 +            fprintf(stderr,
 +                    "DD node %d, %s: local atom %d, global %d is missing from the global to local index\n",
 +                    dd->rank, where, a+1, dd->gatindex[a]+1);
 +        }
 +    }
 +    sfree(have);
 +
 +    nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
 +
 +    if (nerr > 0)
 +    {
 +        gmx_fatal(FARGS, "DD node %d, %s: %d atom/cg index inconsistencies",
 +                  dd->rank, where, nerr);
 +    }
 +}
 +
 +static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
 +{
 +    int   i;
 +    char *bLocalCG;
 +
 +    if (a_start == 0)
 +    {
 +        /* Clear the whole list without searching */
 +        ga2la_clear(dd->ga2la);
 +    }
 +    else
 +    {
 +        for (i = a_start; i < dd->nat_tot; i++)
 +        {
 +            ga2la_del(dd->ga2la, dd->gatindex[i]);
 +        }
 +    }
 +
 +    bLocalCG = dd->comm->bLocalCG;
 +    if (bLocalCG)
 +    {
 +        for (i = cg_start; i < dd->ncg_tot; i++)
 +        {
 +            bLocalCG[dd->index_gl[i]] = FALSE;
 +        }
 +    }
 +
 +    dd_clear_local_vsite_indices(dd);
 +
 +    if (dd->constraints)
 +    {
 +        dd_clear_local_constraint_indices(dd);
 +    }
 +}
 +
 +/* This function should be used for moving the domain boundaries during DLB,
 + * for obtaining the minimum cell size. It checks the initially set limit
 + * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
 + * and, possibly, a longer cut-off limit set for PME load balancing.
 + */
 +static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
 +{
 +    real cellsize_min;
 +
 +    cellsize_min = comm->cellsize_min[dim];
 +
 +    if (!comm->bVacDLBNoLimit)
 +    {
 +        /* The cut-off might have changed, e.g. by PME load balancing,
 +         * from the value used to set comm->cellsize_min, so check it.
 +         */
 +        cellsize_min = max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
 +
 +        if (comm->bPMELoadBalDLBLimits)
 +        {
 +            /* Check for the cut-off limit set by the PME load balancing */
 +            cellsize_min = max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
 +        }
 +    }
 +
 +    return cellsize_min;
 +}
 +
 +static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
 +                            int dim_ind)
 +{
 +    real grid_jump_limit;
 +
 +    /* The distance between the boundaries of cells at distance
 +     * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
 +     * and by the fact that cells should not be shifted by more than
 +     * half their size, such that cgs only shift by one cell
 +     * at redecomposition.
 +     */
 +    grid_jump_limit = comm->cellsize_limit;
 +    if (!comm->bVacDLBNoLimit)
 +    {
 +        if (comm->bPMELoadBalDLBLimits)
 +        {
 +            cutoff = max(cutoff, comm->PMELoadBal_max_cutoff);
 +        }
 +        grid_jump_limit = max(grid_jump_limit,
 +                              cutoff/comm->cd[dim_ind].np);
 +    }
 +
 +    return grid_jump_limit;
 +}
 +
 +static gmx_bool check_grid_jump(gmx_large_int_t step,
 +                                gmx_domdec_t   *dd,
 +                                real            cutoff,
 +                                gmx_ddbox_t    *ddbox,
 +                                gmx_bool        bFatal)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d, dim;
 +    real               limit, bfac;
 +    gmx_bool           bInvalid;
 +
 +    bInvalid = FALSE;
 +
 +    comm = dd->comm;
 +
 +    for (d = 1; d < dd->ndim; d++)
 +    {
 +        dim   = dd->dim[d];
 +        limit = grid_jump_limit(comm, cutoff, d);
 +        bfac  = ddbox->box_size[dim];
 +        if (ddbox->tric_dir[dim])
 +        {
 +            bfac *= ddbox->skew_fac[dim];
 +        }
 +        if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
 +            (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
 +        {
 +            bInvalid = TRUE;
 +
 +            if (bFatal)
 +            {
 +                char buf[22];
 +
 +                /* This error should never be triggered under normal
 +                 * circumstances, but you never know ...
 +                 */
 +                gmx_fatal(FARGS, "Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer nodes might avoid this issue.",
 +                          gmx_step_str(step, buf),
 +                          dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 +            }
 +        }
 +    }
 +
 +    return bInvalid;
 +}
 +
 +static int dd_load_count(gmx_domdec_comm_t *comm)
 +{
 +    return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
 +}
 +
 +static float dd_force_load(gmx_domdec_comm_t *comm)
 +{
 +    float load;
 +
 +    if (comm->eFlop)
 +    {
 +        load = comm->flop;
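 +        /* For eFlop > 1 the count is perturbed by a random factor of up
 +         * to +/- 5%*(eFlop - 1); this can be used to test how the dynamic
 +         * load balancing copes with load imbalance.
 +         */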
 +        if (comm->eFlop > 1)
 +        {
 +            load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
 +        }
 +    }
 +    else
 +    {
 +        load = comm->cycl[ddCyclF];
 +        if (comm->cycl_n[ddCyclF] > 1)
 +        {
 +            /* Subtract the maximum of the last n cycle counts
 +             * to get rid of possible high counts due to other sources,
 +             * for instance system activity, that would otherwise
 +             * affect the dynamic load balancing.
 +             */
 +            load -= comm->cycl_max[ddCyclF];
 +        }
 +    }
 +
 +    return load;
 +}
 +
 +static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                i;
 +
 +    comm = dd->comm;
 +
 +    snew(*dim_f, dd->nc[dim]+1);
 +    (*dim_f)[0] = 0;
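 +    /* Accumulate the static slab fractions into cumulative cell boundaries running from 0 to 1 */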
 +    for (i = 1; i < dd->nc[dim]; i++)
 +    {
 +        if (comm->slb_frac[dim])
 +        {
 +            (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
 +        }
 +        else
 +        {
 +            (*dim_f)[i] = (real)i/(real)dd->nc[dim];
 +        }
 +    }
 +    (*dim_f)[dd->nc[dim]] = 1;
 +}
 +
 +static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
 +{
 +    int  pmeindex, slab, nso, i;
 +    ivec xyz;
 +
 +    if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
 +    {
 +        ddpme->dim = YY;
 +    }
 +    else
 +    {
 +        ddpme->dim = dimind;
 +    }
 +    ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
 +
 +    ddpme->nslab = (ddpme->dim == 0 ?
 +                    dd->comm->npmenodes_x :
 +                    dd->comm->npmenodes_y);
 +
 +    if (ddpme->nslab <= 1)
 +    {
 +        return;
 +    }
 +
 +    nso = dd->comm->npmenodes/ddpme->nslab;
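 +    /* nso is the number of PME ranks per slab along this dimension */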
 +    /* Determine for each PME slab the PP location range for dimension dim */
 +    snew(ddpme->pp_min, ddpme->nslab);
 +    snew(ddpme->pp_max, ddpme->nslab);
 +    for (slab = 0; slab < ddpme->nslab; slab++)
 +    {
 +        ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
 +        ddpme->pp_max[slab] = 0;
 +    }
 +    for (i = 0; i < dd->nnodes; i++)
 +    {
 +        ddindex2xyz(dd->nc, i, xyz);
 +        /* For y only use our y/z slab.
 +         * This assumes that the PME x grid size matches the DD grid size.
 +         */
 +        if (dimind == 0 || xyz[XX] == dd->ci[XX])
 +        {
 +            pmeindex = ddindex2pmeindex(dd, i);
 +            if (dimind == 0)
 +            {
 +                slab = pmeindex/nso;
 +            }
 +            else
 +            {
 +                slab = pmeindex % ddpme->nslab;
 +            }
 +            ddpme->pp_min[slab] = min(ddpme->pp_min[slab], xyz[dimind]);
 +            ddpme->pp_max[slab] = max(ddpme->pp_max[slab], xyz[dimind]);
 +        }
 +    }
 +
 +    set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
 +}
 +
 +int dd_pme_maxshift_x(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->ddpme[0].dim == XX)
 +    {
 +        return dd->comm->ddpme[0].maxshift;
 +    }
 +    else
 +    {
 +        return 0;
 +    }
 +}
 +
 +int dd_pme_maxshift_y(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->ddpme[0].dim == YY)
 +    {
 +        return dd->comm->ddpme[0].maxshift;
 +    }
 +    else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
 +    {
 +        return dd->comm->ddpme[1].maxshift;
 +    }
 +    else
 +    {
 +        return 0;
 +    }
 +}
 +
 +static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
 +                             gmx_bool bUniform, gmx_ddbox_t *ddbox, real *cell_f)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                nc, ns, s;
 +    int               *xmin, *xmax;
 +    real               range, pme_boundary;
 +    int                sh;
 +
 +    comm = dd->comm;
 +    nc   = dd->nc[ddpme->dim];
 +    ns   = ddpme->nslab;
 +
 +    if (!ddpme->dim_match)
 +    {
 +        /* PP decomposition is not along dim: the worst situation */
 +        sh = ns/2;
 +    }
 +    else if (ns <= 3 || (bUniform && ns == nc))
 +    {
 +        /* The optimal situation */
 +        sh = 1;
 +    }
 +    else
 +    {
 +        /* We need to check for all pme nodes which nodes they
 +         * could possibly need to communicate with.
 +         */
 +        xmin = ddpme->pp_min;
 +        xmax = ddpme->pp_max;
 +        /* Allow for atoms to be maximally 2/3 times the cut-off
 +         * out of their DD cell. This is a reasonable balance between
 +         * performance and support for most charge-group/cut-off
 +         * combinations.
 +         */
 +        range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
 +        /* Avoid extra communication when we are exactly at a boundary */
 +        range *= 0.999;
 +
 +        sh = 1;
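 +        /* For each slab, increase sh while PP cells sh+1 slabs away can
 +         * still have atoms within range of the slab boundary, wrapping
 +         * periodically where needed.
 +         */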
 +        for (s = 0; s < ns; s++)
 +        {
 +            /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
 +            pme_boundary = (real)s/ns;
 +            while (sh+1 < ns &&
 +                   ((s-(sh+1) >= 0 &&
 +                     cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
 +                    (s-(sh+1) <  0 &&
 +                     cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
 +            {
 +                sh++;
 +            }
 +            pme_boundary = (real)(s+1)/ns;
 +            while (sh+1 < ns &&
 +                   ((s+(sh+1) <  ns &&
 +                     cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
 +                    (s+(sh+1) >= ns &&
 +                     cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
 +            {
 +                sh++;
 +            }
 +        }
 +    }
 +
 +    ddpme->maxshift = sh;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "PME slab communication range for dim %d is %d\n",
 +                ddpme->dim, ddpme->maxshift);
 +    }
 +}
 +
 +static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
 +{
 +    int d, dim;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        if (dim < ddbox->nboundeddim &&
 +            ddbox->box_size[dim]*ddbox->skew_fac[dim] <
 +            dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
 +        {
 +            gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
 +                      dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
 +                      dd->nc[dim], dd->comm->cellsize_limit);
 +        }
 +    }
 +}
 +
 +static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
 +                                  gmx_bool bMaster, ivec npulse)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d, j;
 +    rvec               cellsize_min;
 +    real              *cell_x, cell_dx, cellsize;
 +
 +    comm = dd->comm;
 +
 +    for (d = 0; d < DIM; d++)
 +    {
 +        cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
 +        npulse[d]       = 1;
 +        if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
 +        {
 +            /* Uniform grid */
 +            cell_dx = ddbox->box_size[d]/dd->nc[d];
 +            if (bMaster)
 +            {
 +                for (j = 0; j < dd->nc[d]+1; j++)
 +                {
 +                    dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
 +                }
 +            }
 +            else
 +            {
 +                comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
 +                comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
 +            }
 +            cellsize = cell_dx*ddbox->skew_fac[d];
 +            while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
 +            {
 +                npulse[d]++;
 +            }
 +            cellsize_min[d] = cellsize;
 +        }
 +        else
 +        {
 +            /* Statically load balanced grid */
 +            /* Even when we are not doing a master distribution we determine
 +             * all cell borders in a loop, to obtain values identical to
 +             * the master distribution case and to determine npulse.
 +             */
 +            if (bMaster)
 +            {
 +                cell_x = dd->ma->cell_x[d];
 +            }
 +            else
 +            {
 +                snew(cell_x, dd->nc[d]+1);
 +            }
 +            cell_x[0] = ddbox->box0[d];
 +            for (j = 0; j < dd->nc[d]; j++)
 +            {
 +                cell_dx     = ddbox->box_size[d]*comm->slb_frac[d][j];
 +                cell_x[j+1] = cell_x[j] + cell_dx;
 +                cellsize    = cell_dx*ddbox->skew_fac[d];
 +                while (cellsize*npulse[d] < comm->cutoff &&
 +                       npulse[d] < dd->nc[d]-1)
 +                {
 +                    npulse[d]++;
 +                }
 +                cellsize_min[d] = min(cellsize_min[d], cellsize);
 +            }
 +            if (!bMaster)
 +            {
 +                comm->cell_x0[d] = cell_x[dd->ci[d]];
 +                comm->cell_x1[d] = cell_x[dd->ci[d]+1];
 +                sfree(cell_x);
 +            }
 +        }
 +        /* The following limitation avoids a cell receiving some of its
 +         * own home charge groups back over the periodic boundary.
 +         * Duplicated charge groups cause trouble with the global indices.
 +         */
 +        if (d < ddbox->npbcdim &&
 +            dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
 +        {
 +            gmx_fatal_collective(FARGS, NULL, dd,
 +                                 "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
 +                                 dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
 +                                 comm->cutoff,
 +                                 dd->nc[d], dd->nc[d],
 +                                 dd->nnodes > dd->nc[d] ? "cells" : "processors");
 +        }
 +    }
 +
 +    if (!comm->bDynLoadBal)
 +    {
 +        copy_rvec(cellsize_min, comm->cellsize_min);
 +    }
 +
 +    for (d = 0; d < comm->npmedecompdim; d++)
 +    {
 +        set_pme_maxshift(dd, &comm->ddpme[d],
 +                         comm->slb_frac[dd->dim[d]] == NULL, ddbox,
 +                         comm->ddpme[d].slb_dim_f);
 +    }
 +}
 +
 +
 +static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
 +                                                  int d, int dim, gmx_domdec_root_t *root,
 +                                                  gmx_ddbox_t *ddbox,
 +                                                  gmx_bool bUniform, gmx_large_int_t step, real cellsize_limit_f, int range[])
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                ncd, i, j, nmin, nmin_old;
 +    gmx_bool           bLimLo, bLimHi;
 +    real              *cell_size;
 +    real               fac, halfway, cellsize_limit_f_i, region_size;
 +    gmx_bool           bPBC, bLastHi = FALSE;
 +    int                nrange[] = {range[0], range[1]};
 +
 +    region_size = root->cell_f[range[1]]-root->cell_f[range[0]];
 +
 +    comm = dd->comm;
 +
 +    ncd = dd->nc[dim];
 +
 +    bPBC = (dim < ddbox->npbcdim);
 +
 +    cell_size = root->buf_ncd;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]);
 +    }
 +
 +    /* First we need to check if the scaling does not make cells
 +     * smaller than the smallest allowed size.
 +     * We need to do this iteratively, since if a cell is too small,
 +     * it needs to be enlarged, which makes all the other cells smaller,
 +     * which could in turn make another cell smaller than allowed.
 +     */
 +    for (i = range[0]; i < range[1]; i++)
 +    {
 +        root->bCellMin[i] = FALSE;
 +    }
 +    nmin = 0;
 +    do
 +    {
 +        nmin_old = nmin;
 +        /* We need the total for normalization */
 +        fac = 0;
 +        for (i = range[0]; i < range[1]; i++)
 +        {
 +            if (root->bCellMin[i] == FALSE)
 +            {
 +                fac += cell_size[i];
 +            }
 +        }
 +        fac = (region_size - nmin*cellsize_limit_f)/fac; /* subtracting cells already set to cellsize_limit_f */
 +        /* Determine the cell boundaries */
 +        for (i = range[0]; i < range[1]; i++)
 +        {
 +            if (root->bCellMin[i] == FALSE)
 +            {
 +                cell_size[i] *= fac;
 +                if (!bPBC && (i == 0 || i == dd->nc[dim] - 1))
 +                {
 +                    cellsize_limit_f_i = 0;
 +                }
 +                else
 +                {
 +                    cellsize_limit_f_i = cellsize_limit_f;
 +                }
 +                if (cell_size[i] < cellsize_limit_f_i)
 +                {
 +                    root->bCellMin[i] = TRUE;
 +                    cell_size[i]      = cellsize_limit_f_i;
 +                    nmin++;
 +                }
 +            }
 +            root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
 +        }
 +    }
 +    while (nmin > nmin_old);
 +
 +    i            = range[1]-1;
 +    cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
 +    /* For this check we should not use DD_CELL_MARGIN,
 +     * but a slightly smaller factor,
 +     * since rounding could get us below the limit.
 +     */
 +    if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
 +    {
 +        char buf[22];
 +        gmx_fatal(FARGS, "Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
 +                  gmx_step_str(step, buf),
 +                  dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
 +                  ncd, comm->cellsize_min[dim]);
 +    }
 +
 +    root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd);
 +
 +    if (!bUniform)
 +    {
 +        /* Check that no boundary has moved more than halfway across
 +         * either of the cells it bounds, as this could cause problems,
 +         * especially when the differences between cell sizes are large.
 +         * If changes are applied, they will not make cells smaller
 +         * than the cut-off, as we check all the boundaries which
 +         * might be affected by a change and if the old state was ok,
 +         * the cells will at most be shrunk back to their old size.
 +         */
 +        for (i = range[0]+1; i < range[1]; i++)
 +        {
 +            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
 +            if (root->cell_f[i] < halfway)
 +            {
 +                root->cell_f[i] = halfway;
 +                /* Check if the change also causes shifts of the next boundaries */
 +                for (j = i+1; j < range[1]; j++)
 +                {
 +                    if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
 +                    {
 +                        root->cell_f[j] = root->cell_f[j-1] + cellsize_limit_f;
 +                    }
 +                }
 +            }
 +            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
 +            if (root->cell_f[i] > halfway)
 +            {
 +                root->cell_f[i] = halfway;
 +                /* Check if the change also causes shifts of the next boundaries */
 +                for (j = i-1; j >= range[0]+1; j--)
 +                {
 +                    if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
 +                    {
 +                        root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    /* nrange is defined as the [lower, upper) range for a new call to enforce_limits */
 +    /* Find the highest violation of LimLo (a) and the lowest following violation
 +     * of LimHi (b), then call enforce_limits for (oldb, a) and (a, b). In the next
 +     * step: (b, nexta); oldb and nexta can be the range boundaries.
 +     * nrange is used to store a and b. */
 +    if (d > 0)
 +    {
 +        /* Take care of the staggering of the cell boundaries */
 +        if (bUniform)
 +        {
 +            for (i = range[0]; i < range[1]; i++)
 +            {
 +                root->cell_f_max0[i] = root->cell_f[i];
 +                root->cell_f_min1[i] = root->cell_f[i+1];
 +            }
 +        }
 +        else
 +        {
 +            for (i = range[0]+1; i < range[1]; i++)
 +            {
 +                bLimLo = (root->cell_f[i] < root->bound_min[i]);
 +                bLimHi = (root->cell_f[i] > root->bound_max[i]);
 +                if (bLimLo && bLimHi)
 +                {
 +                    /* Both limits violated, try the best we can */
 +                    /* For this case we split the original range (range) into two parts and handle the other limitations in the recursive calls below. */
 +                    root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
 +                    nrange[0]       = range[0];
 +                    nrange[1]       = i;
 +                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +
 +                    nrange[0] = i;
 +                    nrange[1] = range[1];
 +                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +
 +                    return;
 +                }
 +                else if (bLimLo)
 +                {
 +                    /* root->cell_f[i] = root->bound_min[i]; */
 +                    nrange[1] = i;  /* Only store the violation location; there could be a LimLo violation following with a higher index */
 +                    bLastHi   = FALSE;
 +                }
 +                else if (bLimHi && !bLastHi)
 +                {
 +                    bLastHi = TRUE;
 +                    if (nrange[1] < range[1])   /* found a LimLo before */
 +                    {
 +                        root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
 +                        dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +                        nrange[0] = nrange[1];
 +                    }
 +                    root->cell_f[i] = root->bound_max[i];
 +                    nrange[1]       = i;
 +                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +                    nrange[0] = i;
 +                    nrange[1] = range[1];
 +                }
 +            }
 +            if (nrange[1] < range[1])   /* a LimLo was found last */
 +            {
 +                root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
 +                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +                nrange[0] = nrange[1];
 +                nrange[1] = range[1];
 +                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +            }
 +            else if (nrange[0] > range[0]) /* found at least one LimHi */
 +            {
 +                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +            }
 +        }
 +    }
 +}
 +
 +
 +static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
 +                                       int d, int dim, gmx_domdec_root_t *root,
 +                                       gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
 +                                       gmx_bool bUniform, gmx_large_int_t step)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                ncd, d1, i, j, pos;
 +    real              *cell_size;
 +    real               load_aver, load_i, imbalance, change, change_max, sc;
 +    real               cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
 +    real               change_limit;
 +    real               relax = 0.5;
 +    gmx_bool           bPBC;
 +    int                range[] = { 0, 0 };
 +
 +    comm = dd->comm;
 +
 +    /* Convert the maximum change from the input percentage to a fraction */
 +    change_limit = comm->dlb_scale_lim*0.01;
 +
 +    ncd = dd->nc[dim];
 +
 +    bPBC = (dim < ddbox->npbcdim);
 +
 +    cell_size = root->buf_ncd;
 +
 +    /* Store the original boundaries */
 +    for (i = 0; i < ncd+1; i++)
 +    {
 +        root->old_cell_f[i] = root->cell_f[i];
 +    }
 +    if (bUniform)
 +    {
 +        for (i = 0; i < ncd; i++)
 +        {
 +            cell_size[i] = 1.0/ncd;
 +        }
 +    }
 +    else if (dd_load_count(comm))
 +    {
 +        load_aver  = comm->load[d].sum_m/ncd;
 +        change_max = 0;
 +        for (i = 0; i < ncd; i++)
 +        {
 +            /* Determine the relative imbalance of cell i */
 +            load_i    = comm->load[d].load[i*comm->load[d].nload+2];
 +            imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
 +            /* Determine the change of the cell size using underrelaxation */
 +            change     = -relax*imbalance;
 +            change_max = max(change_max, max(change, -change));
 +        }
 +        /* Limit the amount of scaling.
 +         * We need to use the same rescaling for all cells in one row,
 +         * otherwise the load balancing might not converge.
 +         */
 +        sc = relax;
 +        if (change_max > change_limit)
 +        {
 +            sc *= change_limit/change_max;
 +        }
 +        for (i = 0; i < ncd; i++)
 +        {
 +            /* Determine the relative imbalance of cell i */
 +            load_i    = comm->load[d].load[i*comm->load[d].nload+2];
 +            imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
 +            /* Determine the change of the cell size using underrelaxation */
 +            change       = -sc*imbalance;
 +            cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
 +        }
 +    }
 +
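 +    /* Convert the absolute size limits to box fractions and apply safety margins */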
 +    cellsize_limit_f  = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim];
 +    cellsize_limit_f *= DD_CELL_MARGIN;
 +    dist_min_f_hard   = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim];
 +    dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
 +    if (ddbox->tric_dir[dim])
 +    {
 +        cellsize_limit_f /= ddbox->skew_fac[dim];
 +        dist_min_f       /= ddbox->skew_fac[dim];
 +    }
 +    if (bDynamicBox && d > 0)
 +    {
 +        dist_min_f *= DD_PRES_SCALE_MARGIN;
 +    }
 +    if (d > 0 && !bUniform)
 +    {
 +        /* Make sure that the grid is not shifted too much */
 +        for (i = 1; i < ncd; i++)
 +        {
 +            if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
 +            {
 +                gmx_incons("Inconsistent DD boundary staggering limits!");
 +            }
 +            root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
 +            space              = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
 +            if (space > 0)
 +            {
 +                root->bound_min[i] += 0.5*space;
 +            }
 +            root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
 +            space              = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
 +            if (space < 0)
 +            {
 +                root->bound_max[i] += 0.5*space;
 +            }
 +            if (debug)
 +            {
 +                fprintf(debug,
 +                        "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
 +                        d, i,
 +                        root->cell_f_max0[i-1] + dist_min_f,
 +                        root->bound_min[i], root->cell_f[i], root->bound_max[i],
 +                        root->cell_f_min1[i] - dist_min_f);
 +            }
 +        }
 +    }
 +    range[1]          = ncd;
 +    root->cell_f[0]   = 0;
 +    root->cell_f[ncd] = 1;
 +    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
 +
 +
 +    /* After the checks above, the cells should obey the cut-off
 +     * restrictions, but it does not hurt to check.
 +     */
 +    for (i = 0; i < ncd; i++)
 +    {
 +        if (debug)
 +        {
 +            fprintf(debug, "Relative bounds dim %d  cell %d: %f %f\n",
 +                    dim, i, root->cell_f[i], root->cell_f[i+1]);
 +        }
 +
 +        if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
 +            root->cell_f[i+1] - root->cell_f[i] <
 +            cellsize_limit_f/DD_CELL_MARGIN)
 +        {
 +            char buf[22];
 +            fprintf(stderr,
 +                    "\nWARNING step %s: direction %c, cell %d too small: %f\n",
 +                    gmx_step_str(step, buf), dim2char(dim), i,
 +                    (root->cell_f[i+1] - root->cell_f[i])
 +                    *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
 +        }
 +    }
 +
 +    pos = ncd + 1;
 +    /* Store the cell boundaries of the lower dimensions at the end */
 +    for (d1 = 0; d1 < d; d1++)
 +    {
 +        root->cell_f[pos++] = comm->cell_f0[d1];
 +        root->cell_f[pos++] = comm->cell_f1[d1];
 +    }
 +
 +    if (d < comm->npmedecompdim)
 +    {
 +        /* The master determines the maximum shift for
 +         * the coordinate communication between separate PME nodes.
 +         */
 +        set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f);
 +    }
 +    root->cell_f[pos++] = comm->ddpme[0].maxshift;
 +    if (d >= 1)
 +    {
 +        root->cell_f[pos++] = comm->ddpme[1].maxshift;
 +    }
 +}
 +
 +static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
 +                                             gmx_ddbox_t *ddbox, int dimind)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                dim;
 +
 +    comm = dd->comm;
 +
 +    /* Set the cell dimensions */
 +    dim                = dd->dim[dimind];
 +    comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
 +    comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
 +    if (dim >= ddbox->nboundeddim)
 +    {
 +        comm->cell_x0[dim] += ddbox->box0[dim];
 +        comm->cell_x1[dim] += ddbox->box0[dim];
 +    }
 +}
 +
 +static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
 +                                         int d, int dim, real *cell_f_row,
 +                                         gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d1, dim1, pos;
 +
 +    comm = dd->comm;
 +
 +#ifdef GMX_MPI
 +    /* Each node would only need to know two fractions,
 +     * but it is probably cheaper to broadcast the whole array.
 +     */
 +    MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
 +              0, comm->mpi_comm_load[d]);
 +#endif
 +    /* Copy the fractions for this dimension from the buffer */
 +    comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
 +    comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
 +    /* The whole array was communicated, so set the buffer position */
 +    pos = dd->nc[dim] + 1;
 +    for (d1 = 0; d1 <= d; d1++)
 +    {
 +        if (d1 < d)
 +        {
 +            /* Copy the cell fractions of the lower dimensions */
 +            comm->cell_f0[d1] = cell_f_row[pos++];
 +            comm->cell_f1[d1] = cell_f_row[pos++];
 +        }
 +        relative_to_absolute_cell_bounds(dd, ddbox, d1);
 +    }
 +    /* Convert the communicated shift from float to int */
 +    comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
 +    if (d >= 1)
 +    {
 +        comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
 +    }
 +}
 +
 +static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
 +                                         gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
 +                                         gmx_bool bUniform, gmx_large_int_t step)
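 +    /* Determine which distributed state entries are present and must move with the charge groups */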
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d, dim, d1;
 +    gmx_bool           bRowMember, bRowRoot;
 +    real              *cell_f_row;
 +
 +    comm = dd->comm;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim        = dd->dim[d];
 +        bRowMember = TRUE;
 +        bRowRoot   = TRUE;
 +        for (d1 = d; d1 < dd->ndim; d1++)
 +        {
 +            if (dd->ci[dd->dim[d1]] > 0)
 +            {
 +                if (d1 > d)
 +                {
 +                    bRowMember = FALSE;
 +                }
 +                bRowRoot = FALSE;
 +            }
 +        }
 +        if (bRowMember)
 +        {
 +            if (bRowRoot)
 +            {
 +                set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
 +                                           ddbox, bDynamicBox, bUniform, step);
 +                cell_f_row = comm->root[d]->cell_f;
 +            }
 +            else
 +            {
 +                cell_f_row = comm->cell_f_row;
 +            }
 +            distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
 +        }
 +    }
 +}
 +
 +static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
 +{
 +    int d;
 +
 +    /* This function assumes the box is static and should therefore
 +     * not be called when the box has changed since the last
 +     * call to dd_partition_system.
 +     */
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        relative_to_absolute_cell_bounds(dd, ddbox, d);
 +    }
 +}
 +
 +
 +
 +static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
 +                                  gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
 +                                  gmx_bool bUniform, gmx_bool bDoDLB, gmx_large_int_t step,
 +                                  gmx_wallcycle_t wcycle)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                dim;
 +
 +    comm = dd->comm;
 +
 +    if (bDoDLB)
 +    {
 +        wallcycle_start(wcycle, ewcDDCOMMBOUND);
 +        set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
 +        wallcycle_stop(wcycle, ewcDDCOMMBOUND);
 +    }
 +    else if (bDynamicBox)
 +    {
 +        set_dd_cell_sizes_dlb_nochange(dd, ddbox);
 +    }
 +
 +    /* Set the dimensions for which no DD is used */
 +    for (dim = 0; dim < DIM; dim++)
 +    {
 +        if (dd->nc[dim] == 1)
 +        {
 +            comm->cell_x0[dim] = 0;
 +            comm->cell_x1[dim] = ddbox->box_size[dim];
 +            if (dim >= ddbox->nboundeddim)
 +            {
 +                comm->cell_x0[dim] += ddbox->box0[dim];
 +                comm->cell_x1[dim] += ddbox->box0[dim];
 +            }
 +        }
 +    }
 +}
 +
 +static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
 +{
 +    int                    d, np, i;
 +    gmx_domdec_comm_dim_t *cd;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        cd = &dd->comm->cd[d];
 +        np = npulse[dd->dim[d]];
 +        if (np > cd->np_nalloc)
 +        {
 +            if (debug)
 +            {
 +                fprintf(debug, "(Re)allocing cd for %c to %d pulses\n",
 +                        dim2char(dd->dim[d]), np);
 +            }
 +            if (DDMASTER(dd) && cd->np_nalloc > 0)
 +            {
 +                fprintf(stderr, "\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
 +            }
 +            srenew(cd->ind, np);
 +            for (i = cd->np_nalloc; i < np; i++)
 +            {
 +                cd->ind[i].index  = NULL;
 +                cd->ind[i].nalloc = 0;
 +            }
 +            cd->np_nalloc = np;
 +        }
 +        cd->np = np;
 +    }
 +}
 +
 +
 +static void set_dd_cell_sizes(gmx_domdec_t *dd,
 +                              gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
 +                              gmx_bool bUniform, gmx_bool bDoDLB, gmx_large_int_t step,
 +                              gmx_wallcycle_t wcycle)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d;
 +    ivec               npulse;
 +
 +    comm = dd->comm;
 +
 +    /* Copy the old cell boundaries for the cg displacement check */
 +    copy_rvec(comm->cell_x0, comm->old_cell_x0);
 +    copy_rvec(comm->cell_x1, comm->old_cell_x1);
 +
 +    if (comm->bDynLoadBal)
 +    {
 +        if (DDMASTER(dd))
 +        {
 +            check_box_size(dd, ddbox);
 +        }
 +        set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
 +    }
 +    else
 +    {
 +        set_dd_cell_sizes_slb(dd, ddbox, FALSE, npulse);
 +        realloc_comm_ind(dd, npulse);
 +    }
 +
 +    if (debug)
 +    {
 +        for (d = 0; d < DIM; d++)
 +        {
 +            fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
 +                    d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
 +        }
 +    }
 +}
 +
 +static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
 +                                  gmx_ddbox_t *ddbox,
 +                                  rvec cell_ns_x0, rvec cell_ns_x1,
 +                                  gmx_large_int_t step)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                dim_ind, dim;
 +
 +    comm = dd->comm;
 +
 +    for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
 +    {
 +        dim = dd->dim[dim_ind];
 +
 +        /* Without PBC we don't have restrictions on the outer cells */
 +        if (!(dim >= ddbox->npbcdim &&
 +              (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
 +            comm->bDynLoadBal &&
 +            (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
 +            comm->cellsize_min[dim])
 +        {
 +            char buf[22];
 +            gmx_fatal(FARGS, "Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
 +                      gmx_step_str(step, buf), dim2char(dim),
 +                      comm->cell_x1[dim] - comm->cell_x0[dim],
 +                      ddbox->skew_fac[dim],
 +                      dd->comm->cellsize_min[dim],
 +                      dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 +        }
 +    }
 +
 +    if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
 +    {
 +        /* Communicate the boundaries and update cell_ns_x0/1 */
 +        dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
 +        if (dd->bGridJump && dd->ndim > 1)
 +        {
 +            check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
 +        }
 +    }
 +}
 +
 +static void make_tric_corr_matrix(int npbcdim, matrix box, matrix tcm)
 +{
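 +    /* Build the triclinic correction matrix: adding cm[j]*tcm[j][d] for j > d
 +     * removes the tilt contributions of the higher box vectors, so coordinates
 +     * can be compared directly against the rectangular cell boundaries along d.
 +     */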
 +    if (YY < npbcdim)
 +    {
 +        tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
 +    }
 +    else
 +    {
 +        tcm[YY][XX] = 0;
 +    }
 +    if (ZZ < npbcdim)
 +    {
 +        tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
 +        tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
 +    }
 +    else
 +    {
 +        tcm[ZZ][XX] = 0;
 +        tcm[ZZ][YY] = 0;
 +    }
 +}
 +
 +static void check_screw_box(matrix box)
 +{
 +    /* Mathematical limitation */
 +    if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
 +    {
 +        gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components");
 +    }
 +
 +    /* Limitation due to the asymmetry of the eighth shell method */
 +    if (box[ZZ][YY] != 0)
 +    {
 +        gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
 +    }
 +}
 +
 +static void distribute_cg(FILE *fplog, gmx_large_int_t step,
 +                          matrix box, ivec tric_dir, t_block *cgs, rvec pos[],
 +                          gmx_domdec_t *dd)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                **tmp_ind = NULL, *tmp_nalloc = NULL;
 +    int                  i, icg, j, k, k0, k1, d, npbcdim;
 +    matrix               tcm;
 +    rvec                 box_size, cg_cm;
 +    ivec                 ind;
 +    real                 nrcg, inv_ncg, pos_d;
 +    atom_id             *cgindex;
 +    gmx_bool             bUnbounded, bScrew;
 +
 +    ma = dd->ma;
 +
 +    if (tmp_ind == NULL)
 +    {
 +        snew(tmp_nalloc, dd->nnodes);
 +        snew(tmp_ind, dd->nnodes);
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
 +            snew(tmp_ind[i], tmp_nalloc[i]);
 +        }
 +    }
 +
 +    /* Clear the count */
 +    for (i = 0; i < dd->nnodes; i++)
 +    {
 +        ma->ncg[i] = 0;
 +        ma->nat[i] = 0;
 +    }
 +
 +    make_tric_corr_matrix(dd->npbcdim, box, tcm);
 +
 +    cgindex = cgs->index;
 +
 +    /* Compute the center of geometry for all charge groups */
 +    for (icg = 0; icg < cgs->nr; icg++)
 +    {
 +        k0      = cgindex[icg];
 +        k1      = cgindex[icg+1];
 +        nrcg    = k1 - k0;
 +        if (nrcg == 1)
 +        {
 +            copy_rvec(pos[k0], cg_cm);
 +        }
 +        else
 +        {
 +            inv_ncg = 1.0/nrcg;
 +
 +            clear_rvec(cg_cm);
 +            for (k = k0; (k < k1); k++)
 +            {
 +                rvec_inc(cg_cm, pos[k]);
 +            }
 +            for (d = 0; (d < DIM); d++)
 +            {
 +                cg_cm[d] *= inv_ncg;
 +            }
 +        }
 +        /* Put the charge group in the box and determine the cell index */
 +        for (d = DIM-1; d >= 0; d--)
 +        {
 +            pos_d = cg_cm[d];
 +            if (d < dd->npbcdim)
 +            {
 +                bScrew = (dd->bScrewPBC && d == XX);
 +                if (tric_dir[d] && dd->nc[d] > 1)
 +                {
 +                    /* Use triclinic coordinates for this dimension */
 +                    for (j = d+1; j < DIM; j++)
 +                    {
 +                        pos_d += cg_cm[j]*tcm[j][d];
 +                    }
 +                }
 +                while (pos_d >= box[d][d])
 +                {
 +                    pos_d -= box[d][d];
 +                    rvec_dec(cg_cm, box[d]);
 +                    if (bScrew)
 +                    {
 +                        cg_cm[YY] = box[YY][YY] - cg_cm[YY];
 +                        cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
 +                    }
 +                    for (k = k0; (k < k1); k++)
 +                    {
 +                        rvec_dec(pos[k], box[d]);
 +                        if (bScrew)
 +                        {
 +                            pos[k][YY] = box[YY][YY] - pos[k][YY];
 +                            pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
 +                        }
 +                    }
 +                }
 +                while (pos_d < 0)
 +                {
 +                    pos_d += box[d][d];
 +                    rvec_inc(cg_cm, box[d]);
 +                    if (bScrew)
 +                    {
 +                        cg_cm[YY] = box[YY][YY] - cg_cm[YY];
 +                        cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
 +                    }
 +                    for (k = k0; (k < k1); k++)
 +                    {
 +                        rvec_inc(pos[k], box[d]);
 +                        if (bScrew)
 +                        {
 +                            pos[k][YY] = box[YY][YY] - pos[k][YY];
 +                            pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
 +                        }
 +                    }
 +                }
 +            }
 +            /* This could be done more efficiently */
 +            ind[d] = 0;
 +            while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
 +            {
 +                ind[d]++;
 +            }
 +        }
 +        i = dd_index(dd->nc, ind);
 +        if (ma->ncg[i] == tmp_nalloc[i])
 +        {
 +            tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
 +            srenew(tmp_ind[i], tmp_nalloc[i]);
 +        }
 +        tmp_ind[i][ma->ncg[i]] = icg;
 +        ma->ncg[i]++;
 +        ma->nat[i] += cgindex[icg+1] - cgindex[icg];
 +    }
 +
 +    k1 = 0;
 +    for (i = 0; i < dd->nnodes; i++)
 +    {
 +        ma->index[i] = k1;
 +        for (k = 0; k < ma->ncg[i]; k++)
 +        {
 +            ma->cg[k1++] = tmp_ind[i][k];
 +        }
 +    }
 +    ma->index[dd->nnodes] = k1;
 +
 +    for (i = 0; i < dd->nnodes; i++)
 +    {
 +        sfree(tmp_ind[i]);
 +    }
 +    sfree(tmp_ind);
 +    sfree(tmp_nalloc);
 +
 +    if (fplog)
 +    {
 +        char buf[22];
 +        fprintf(fplog, "Charge group distribution at step %s:",
 +                gmx_step_str(step, buf));
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            fprintf(fplog, " %d", ma->ncg[i]);
 +        }
 +        fprintf(fplog, "\n");
 +    }
 +}
 +
 +static void get_cg_distribution(FILE *fplog, gmx_large_int_t step, gmx_domdec_t *dd,
 +                                t_block *cgs, matrix box, gmx_ddbox_t *ddbox,
 +                                rvec pos[])
 +{
 +    gmx_domdec_master_t *ma = NULL;
 +    ivec                 npulse;
 +    int                  i, cg_gl;
 +    int                 *ibuf, buf2[2] = { 0, 0 };
 +    gmx_bool             bMaster = DDMASTER(dd);
 +    if (bMaster)
 +    {
 +        ma = dd->ma;
 +
 +        if (dd->bScrewPBC)
 +        {
 +            check_screw_box(box);
 +        }
 +
 +        set_dd_cell_sizes_slb(dd, ddbox, TRUE, npulse);
 +
 +        distribute_cg(fplog, step, box, ddbox->tric_dir, cgs, pos, dd);
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            ma->ibuf[2*i]   = ma->ncg[i];
 +            ma->ibuf[2*i+1] = ma->nat[i];
 +        }
 +        ibuf = ma->ibuf;
 +    }
 +    else
 +    {
 +        ibuf = NULL;
 +    }
 +    dd_scatter(dd, 2*sizeof(int), ibuf, buf2);
 +
 +    dd->ncg_home = buf2[0];
 +    dd->nat_home = buf2[1];
 +    dd->ncg_tot  = dd->ncg_home;
 +    dd->nat_tot  = dd->nat_home;
 +    if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
 +    {
 +        dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
 +        srenew(dd->index_gl, dd->cg_nalloc);
 +        srenew(dd->cgindex, dd->cg_nalloc+1);
 +    }
 +    if (bMaster)
 +    {
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
 +            ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
 +        }
 +    }
 +
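 +    /* Scatter the global charge group indices; counts and displacements are in bytes */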
 +    dd_scatterv(dd,
 +                DDMASTER(dd) ? ma->ibuf : NULL,
 +                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
 +                DDMASTER(dd) ? ma->cg : NULL,
 +                dd->ncg_home*sizeof(int), dd->index_gl);
 +
 +    /* Determine the home charge group sizes */
 +    dd->cgindex[0] = 0;
 +    for (i = 0; i < dd->ncg_home; i++)
 +    {
 +        cg_gl            = dd->index_gl[i];
 +        dd->cgindex[i+1] =
 +            dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Home charge groups:\n");
 +        for (i = 0; i < dd->ncg_home; i++)
 +        {
 +            fprintf(debug, " %d", dd->index_gl[i]);
 +            if (i % 10 == 9)
 +            {
 +                fprintf(debug, "\n");
 +            }
 +        }
 +        fprintf(debug, "\n");
 +    }
 +}
 +
 +static int compact_and_copy_vec_at(int ncg, int *move,
 +                                   int *cgindex,
 +                                   int nvec, int vec,
 +                                   rvec *src, gmx_domdec_comm_t *comm,
 +                                   gmx_bool bCompact)
 +{
 +    int m, icg, i, i0, i1, nrcg;
 +    int home_pos;
 +    int pos_vec[DIM*2];
 +
 +    home_pos = 0;
 +
 +    for (m = 0; m < DIM*2; m++)
 +    {
 +        pos_vec[m] = 0;
 +    }
 +
 +    i0 = 0;
 +    for (icg = 0; icg < ncg; icg++)
 +    {
 +        i1 = cgindex[icg+1];
 +        m  = move[icg];
 +        if (m == -1)
 +        {
 +            if (bCompact)
 +            {
 +                /* Compact the home array in place */
 +                for (i = i0; i < i1; i++)
 +                {
 +                    copy_rvec(src[i], src[home_pos++]);
 +                }
 +            }
 +        }
 +        else
 +        {
 +            /* Copy to the communication buffer */
 +            nrcg        = i1 - i0;
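 +            /* Per cg the buffer holds the cg center (1 rvec) followed by
 +             * nvec vectors of nrcg rvecs each; skip to the slot of vector vec.
 +             */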
 +            pos_vec[m] += 1 + vec*nrcg;
 +            for (i = i0; i < i1; i++)
 +            {
 +                copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]);
 +            }
 +            pos_vec[m] += (nvec - vec - 1)*nrcg;
 +        }
 +        if (!bCompact)
 +        {
 +            home_pos += i1 - i0;
 +        }
 +        i0 = i1;
 +    }
 +
 +    return home_pos;
 +}
 +
 +static int compact_and_copy_vec_cg(int ncg, int *move,
 +                                   int *cgindex,
 +                                   int nvec, rvec *src, gmx_domdec_comm_t *comm,
 +                                   gmx_bool bCompact)
 +{
 +    int m, icg, i0, i1, nrcg;
 +    int home_pos;
 +    int pos_vec[DIM*2];
 +
 +    home_pos = 0;
 +
 +    for (m = 0; m < DIM*2; m++)
 +    {
 +        pos_vec[m] = 0;
 +    }
 +
 +    i0 = 0;
 +    for (icg = 0; icg < ncg; icg++)
 +    {
 +        i1 = cgindex[icg+1];
 +        m  = move[icg];
 +        if (m == -1)
 +        {
 +            if (bCompact)
 +            {
 +                /* Compact the home array in place */
 +                copy_rvec(src[icg], src[home_pos++]);
 +            }
 +        }
 +        else
 +        {
 +            nrcg = i1 - i0;
 +            /* Copy to the communication buffer */
 +            copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]);
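 +            /* Advance past this cg's full entry: the center plus nvec vectors */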
 +            pos_vec[m] += 1 + nrcg*nvec;
 +        }
 +        i0 = i1;
 +    }
 +    if (!bCompact)
 +    {
 +        home_pos = ncg;
 +    }
 +
 +    return home_pos;
 +}
 +
 +static int compact_ind(int ncg, int *move,
 +                       int *index_gl, int *cgindex,
 +                       int *gatindex,
 +                       gmx_ga2la_t ga2la, char *bLocalCG,
 +                       int *cginfo)
 +{
 +    int cg, nat, a0, a1, a, a_gl;
 +    int home_pos;
 +
 +    home_pos = 0;
 +    nat      = 0;
 +    for (cg = 0; cg < ncg; cg++)
 +    {
 +        a0 = cgindex[cg];
 +        a1 = cgindex[cg+1];
 +        if (move[cg] == -1)
 +        {
 +            /* Compact the home arrays in place.
 +             * Anything that can be done here avoids access to global arrays.
 +             */
 +            cgindex[home_pos] = nat;
 +            for (a = a0; a < a1; a++)
 +            {
 +                a_gl          = gatindex[a];
 +                gatindex[nat] = a_gl;
 +                /* The cell number stays 0, so we don't need to set it */
 +                ga2la_change_la(ga2la, a_gl, nat);
 +                nat++;
 +            }
 +            index_gl[home_pos] = index_gl[cg];
 +            cginfo[home_pos]   = cginfo[cg];
 +            /* The charge group remains local, so bLocalCG does not change */
 +            home_pos++;
 +        }
 +        else
 +        {
 +            /* Clear the global indices */
 +            for (a = a0; a < a1; a++)
 +            {
 +                ga2la_del(ga2la, gatindex[a]);
 +            }
 +            if (bLocalCG)
 +            {
 +                bLocalCG[index_gl[cg]] = FALSE;
 +            }
 +        }
 +    }
 +    cgindex[home_pos] = nat;
 +
 +    return home_pos;
 +}
 +
 +static void clear_and_mark_ind(int ncg, int *move,
 +                               int *index_gl, int *cgindex, int *gatindex,
 +                               gmx_ga2la_t ga2la, char *bLocalCG,
 +                               int *cell_index)
 +{
 +    int cg, a0, a1, a;
 +
 +    for (cg = 0; cg < ncg; cg++)
 +    {
 +        if (move[cg] >= 0)
 +        {
 +            a0 = cgindex[cg];
 +            a1 = cgindex[cg+1];
 +            /* Clear the global indices */
 +            for (a = a0; a < a1; a++)
 +            {
 +                ga2la_del(ga2la, gatindex[a]);
 +            }
 +            if (bLocalCG)
 +            {
 +                bLocalCG[index_gl[cg]] = FALSE;
 +            }
 +            /* Signal that this cg has moved using the ns cell index.
 +             * Here we set it to -1. fill_grid will change it
 +             * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
 +             */
 +            cell_index[cg] = -1;
 +        }
 +    }
 +}
 +
 +static void print_cg_move(FILE *fplog,
 +                          gmx_domdec_t *dd,
 +                          gmx_large_int_t step, int cg, int dim, int dir,
 +                          gmx_bool bHaveLimitdAndCMOld, real limitd,
 +                          rvec cm_old, rvec cm_new, real pos_d)
 +{
 +    gmx_domdec_comm_t *comm;
 +    char               buf[22];
 +
 +    comm = dd->comm;
 +
 +    fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
 +    if (bHaveLimitdAndCMOld)
 +    {
 +        fprintf(fplog, "The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
 +                ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
 +    }
 +    else
 +    {
 +        fprintf(fplog, "The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
 +                ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
 +    }
 +    fprintf(fplog, "distance out of cell %f\n",
 +            dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
 +    if (bHaveLimitdAndCMOld)
 +    {
 +        fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
 +                cm_old[XX], cm_old[YY], cm_old[ZZ]);
 +    }
 +    fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
 +            cm_new[XX], cm_new[YY], cm_new[ZZ]);
 +    fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
 +            dim2char(dim),
 +            comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
 +    fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
 +            dim2char(dim),
 +            comm->cell_x0[dim], comm->cell_x1[dim]);
 +}
 +
 +static void cg_move_error(FILE *fplog,
 +                          gmx_domdec_t *dd,
 +                          gmx_large_int_t step, int cg, int dim, int dir,
 +                          gmx_bool bHaveLimitdAndCMOld, real limitd,
 +                          rvec cm_old, rvec cm_new, real pos_d)
 +{
 +    if (fplog)
 +    {
 +        print_cg_move(fplog, dd, step, cg, dim, dir,
 +                      bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
 +    }
 +    print_cg_move(stderr, dd, step, cg, dim, dir,
 +                  bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
 +    gmx_fatal(FARGS,
 +              "A charge group moved too far between two domain decomposition steps\n"
 +              "This usually means that your system is not well equilibrated");
 +}
 +
 +static void rotate_state_atom(t_state *state, int a)
 +{
 +    int est;
 +
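 +    /* Crossing the screw-PBC x-boundary rotates the system 180 degrees around
 +     * the x-axis: mirror the y/z positions in the box and negate the y/z
 +     * components of velocity-like state vectors.
 +     */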
 +    for (est = 0; est < estNR; est++)
 +    {
 +        if (EST_DISTR(est) && (state->flags & (1<<est)))
 +        {
 +            switch (est)
 +            {
 +                case estX:
 +                    /* Rotate the complete state; for a rectangular box only */
 +                    state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
 +                    state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
 +                    break;
 +                case estV:
 +                    state->v[a][YY] = -state->v[a][YY];
 +                    state->v[a][ZZ] = -state->v[a][ZZ];
 +                    break;
 +                case estSDX:
 +                    state->sd_X[a][YY] = -state->sd_X[a][YY];
 +                    state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
 +                    break;
 +                case estCGP:
 +                    state->cg_p[a][YY] = -state->cg_p[a][YY];
 +                    state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
 +                    break;
 +                case estDISRE_INITF:
 +                case estDISRE_RM3TAV:
 +                case estORIRE_INITF:
 +                case estORIRE_DTAV:
 +                    /* These are distances, so not affected by rotation */
 +                    break;
 +                default:
 +                    gmx_incons("Unknown state entry encountered in rotate_state_atom");
 +            }
 +        }
 +    }
 +}
 +
 +static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
 +{
 +    if (natoms > comm->moved_nalloc)
 +    {
 +        /* Contents should be preserved here */
 +        comm->moved_nalloc = over_alloc_dd(natoms);
 +        srenew(comm->moved, comm->moved_nalloc);
 +    }
 +
 +    return comm->moved;
 +}
 +
 +static void calc_cg_move(FILE *fplog, gmx_large_int_t step,
 +                         gmx_domdec_t *dd,
 +                         t_state *state,
 +                         ivec tric_dir, matrix tcm,
 +                         rvec cell_x0, rvec cell_x1,
 +                         rvec limitd, rvec limit0, rvec limit1,
 +                         const int *cgindex,
 +                         int cg_start, int cg_end,
 +                         rvec *cg_cm,
 +                         int *move)
 +{
 +    int      npbcdim;
 +    int      c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
 +    int      mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
 +    int      flag;
 +    gmx_bool bScrew;
 +    ivec     dev;
 +    real     inv_ncg, pos_d;
 +    rvec     cm_new;
 +
 +    npbcdim = dd->npbcdim;
 +
 +    for (cg = cg_start; cg < cg_end; cg++)
 +    {
 +        k0   = cgindex[cg];
 +        k1   = cgindex[cg+1];
 +        nrcg = k1 - k0;
 +        if (nrcg == 1)
 +        {
 +            copy_rvec(state->x[k0], cm_new);
 +        }
 +        else
 +        {
 +            inv_ncg = 1.0/nrcg;
 +
 +            clear_rvec(cm_new);
 +            for (k = k0; (k < k1); k++)
 +            {
 +                rvec_inc(cm_new, state->x[k]);
 +            }
 +            for (d = 0; (d < DIM); d++)
 +            {
 +                cm_new[d] = inv_ncg*cm_new[d];
 +            }
 +        }
 +
 +        clear_ivec(dev);
 +        /* Do pbc and check DD cell boundary crossings */
 +        for (d = DIM-1; d >= 0; d--)
 +        {
 +            if (dd->nc[d] > 1)
 +            {
 +                bScrew = (dd->bScrewPBC && d == XX);
 +                /* Determine the location of this cg in lattice coordinates */
 +                pos_d = cm_new[d];
 +                if (tric_dir[d])
 +                {
 +                    for (d2 = d+1; d2 < DIM; d2++)
 +                    {
 +                        pos_d += cm_new[d2]*tcm[d2][d];
 +                    }
 +                }
 +                /* Put the charge group in the triclinic unit-cell */
 +                if (pos_d >= cell_x1[d])
 +                {
 +                    if (pos_d >= limit1[d])
 +                    {
 +                        cg_move_error(fplog, dd, step, cg, d, 1, TRUE, limitd[d],
 +                                      cg_cm[cg], cm_new, pos_d);
 +                    }
 +                    dev[d] = 1;
 +                    if (dd->ci[d] == dd->nc[d] - 1)
 +                    {
 +                        rvec_dec(cm_new, state->box[d]);
 +                        if (bScrew)
 +                        {
 +                            cm_new[YY] = state->box[YY][YY] - cm_new[YY];
 +                            cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
 +                        }
 +                        for (k = k0; (k < k1); k++)
 +                        {
 +                            rvec_dec(state->x[k], state->box[d]);
 +                            if (bScrew)
 +                            {
 +                                rotate_state_atom(state, k);
 +                            }
 +                        }
 +                    }
 +                }
 +                else if (pos_d < cell_x0[d])
 +                {
 +                    if (pos_d < limit0[d])
 +                    {
 +                        cg_move_error(fplog, dd, step, cg, d, -1, TRUE, limitd[d],
 +                                      cg_cm[cg], cm_new, pos_d);
 +                    }
 +                    dev[d] = -1;
 +                    if (dd->ci[d] == 0)
 +                    {
 +                        rvec_inc(cm_new, state->box[d]);
 +                        if (bScrew)
 +                        {
 +                            cm_new[YY] = state->box[YY][YY] - cm_new[YY];
 +                            cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
 +                        }
 +                        for (k = k0; (k < k1); k++)
 +                        {
 +                            rvec_inc(state->x[k], state->box[d]);
 +                            if (bScrew)
 +                            {
 +                                rotate_state_atom(state, k);
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +            else if (d < npbcdim)
 +            {
 +                /* Put the charge group in the rectangular unit-cell */
 +                while (cm_new[d] >= state->box[d][d])
 +                {
 +                    rvec_dec(cm_new, state->box[d]);
 +                    for (k = k0; (k < k1); k++)
 +                    {
 +                        rvec_dec(state->x[k], state->box[d]);
 +                    }
 +                }
 +                while (cm_new[d] < 0)
 +                {
 +                    rvec_inc(cm_new, state->box[d]);
 +                    for (k = k0; (k < k1); k++)
 +                    {
 +                        rvec_inc(state->x[k], state->box[d]);
 +                    }
 +                }
 +            }
 +        }
 +
 +        copy_rvec(cm_new, cg_cm[cg]);
 +
 +        /* Determine where this cg should go */
 +        flag = 0;
 +        mc   = -1;
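 +        /* mc encodes the first communication pulse for this cg:
 +         * 2*d for the forward direction in DD dimension d and 2*d+1 for
 +         * backward; with only two cells along a dimension forward and
 +         * backward reach the same neighbor, so both use buffer 2*d.
 +         */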
 +        for (d = 0; d < dd->ndim; d++)
 +        {
 +            dim = dd->dim[d];
 +            if (dev[dim] == 1)
 +            {
 +                flag |= DD_FLAG_FW(d);
 +                if (mc == -1)
 +                {
 +                    mc = d*2;
 +                }
 +            }
 +            else if (dev[dim] == -1)
 +            {
 +                flag |= DD_FLAG_BW(d);
 +                if (mc == -1)
 +                {
 +                    if (dd->nc[dim] > 2)
 +                    {
 +                        mc = d*2 + 1;
 +                    }
 +                    else
 +                    {
 +                        mc = d*2;
 +                    }
 +                }
 +            }
 +        }
 +        /* Temporarily store the flag in move */
 +        move[cg] = mc + flag;
 +    }
 +}
 +
 +static void dd_redistribute_cg(FILE *fplog, gmx_large_int_t step,
 +                               gmx_domdec_t *dd, ivec tric_dir,
 +                               t_state *state, rvec **f,
 +                               t_forcerec *fr,
 +                               gmx_bool bCompact,
 +                               t_nrnb *nrnb,
 +                               int *ncg_stay_home,
 +                               int *ncg_moved)
 +{
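 +    /* Outline: determine per-cg move flags in parallel (calc_cg_move),
 +     * compact the moved cgs out of the local state into per-direction
 +     * send buffers, then for each DD dimension exchange those buffers
 +     * with the neighbors; received cgs either become new home cgs or
 +     * are re-routed along the remaining dimensions.
 +     */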
 +    int               *move;
 +    int                npbcdim;
 +    int                ncg[DIM*2], nat[DIM*2];
 +    int                c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
 +    int                mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
 +    int                sbuf[2], rbuf[2];
 +    int                home_pos_cg, home_pos_at, buf_pos;
 +    int                flag;
 +    gmx_bool           bV = FALSE, bSDX = FALSE, bCGP = FALSE;
 +    gmx_bool           bScrew;
 +    ivec               dev;
 +    real               inv_ncg, pos_d;
 +    matrix             tcm;
 +    rvec              *cg_cm = NULL, cell_x0, cell_x1, limitd, limit0, limit1, cm_new;
 +    atom_id           *cgindex;
 +    cginfo_mb_t       *cginfo_mb;
 +    gmx_domdec_comm_t *comm;
 +    int               *moved;
 +    int                nthread, thread;
 +
 +    if (dd->bScrewPBC)
 +    {
 +        check_screw_box(state->box);
 +    }
 +
 +    comm  = dd->comm;
 +    if (fr->cutoff_scheme == ecutsGROUP)
 +    {
 +        cg_cm = fr->cg_cm;
 +    }
 +
 +    for (i = 0; i < estNR; i++)
 +    {
 +        if (EST_DISTR(i))
 +        {
 +            switch (i)
 +            {
 +                case estX: /* Always present */ break;
 +                case estV:   bV   = (state->flags & (1<<i)); break;
 +                case estSDX: bSDX = (state->flags & (1<<i)); break;
 +                case estCGP: bCGP = (state->flags & (1<<i)); break;
 +                case estLD_RNG:
 +                case estLD_RNGI:
 +                case estDISRE_INITF:
 +                case estDISRE_RM3TAV:
 +                case estORIRE_INITF:
 +                case estORIRE_DTAV:
 +                    /* No processing required */
 +                    break;
 +                default:
 +                    gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
 +            }
 +        }
 +    }
 +
 +    if (dd->ncg_tot > comm->nalloc_int)
 +    {
 +        comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
 +        srenew(comm->buf_int, comm->nalloc_int);
 +    }
 +    move = comm->buf_int;
 +
 +    /* Clear the count */
 +    for (c = 0; c < dd->ndim*2; c++)
 +    {
 +        ncg[c] = 0;
 +        nat[c] = 0;
 +    }
 +
 +    npbcdim = dd->npbcdim;
 +
 +    for (d = 0; (d < DIM); d++)
 +    {
 +        limitd[d] = dd->comm->cellsize_min[d];
 +        if (d >= npbcdim && dd->ci[d] == 0)
 +        {
 +            cell_x0[d] = -GMX_FLOAT_MAX;
 +        }
 +        else
 +        {
 +            cell_x0[d] = comm->cell_x0[d];
 +        }
 +        if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
 +        {
 +            cell_x1[d] = GMX_FLOAT_MAX;
 +        }
 +        else
 +        {
 +            cell_x1[d] = comm->cell_x1[d];
 +        }
 +        if (d < npbcdim)
 +        {
 +            limit0[d] = comm->old_cell_x0[d] - limitd[d];
 +            limit1[d] = comm->old_cell_x1[d] + limitd[d];
 +        }
 +        else
 +        {
 +            /* We check after communication whether a charge group moved
 +             * more than one cell. Set the pre-comm check limit to float_max.
 +             */
 +            limit0[d] = -GMX_FLOAT_MAX;
 +            limit1[d] =  GMX_FLOAT_MAX;
 +        }
 +    }
 +
 +    make_tric_corr_matrix(npbcdim, state->box, tcm);
 +
 +    cgindex = dd->cgindex;
 +
 +    nthread = gmx_omp_nthreads_get(emntDomdec);
 +
 +    /* Compute the center of geometry for all home charge groups
 +     * and put them in the box and determine where they should go.
 +     */
 +#pragma omp parallel for num_threads(nthread) schedule(static)
 +    for (thread = 0; thread < nthread; thread++)
 +    {
 +        calc_cg_move(fplog, step, dd, state, tric_dir, tcm,
 +                     cell_x0, cell_x1, limitd, limit0, limit1,
 +                     cgindex,
 +                     ( thread   *dd->ncg_home)/nthread,
 +                     ((thread+1)*dd->ncg_home)/nthread,
 +                     fr->cutoff_scheme == ecutsGROUP ? cg_cm : state->x,
 +                     move);
 +    }
 +
 +    for (cg = 0; cg < dd->ncg_home; cg++)
 +    {
 +        if (move[cg] >= 0)
 +        {
 +            mc       = move[cg];
 +            flag     = mc & ~DD_FLAG_NRCG;
 +            mc       = mc & DD_FLAG_NRCG;
 +            move[cg] = mc;
 +
 +            if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
 +            {
 +                comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
 +                srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
 +            }
 +            comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
 +            /* We store the cg size in the lower 16 bits
 +             * and the place where the charge group should go
 +             * in the next 6 bits. This saves some communication volume.
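 +             * Illustrative layout of a flag word:
 +             *   bits  0..15  nrcg            (mask DD_FLAG_NRCG)
 +             *   bits 16..21  direction flags (DD_FLAG_FW(d)/DD_FLAG_BW(d))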
 +             */
 +            nrcg = cgindex[cg+1] - cgindex[cg];
 +            comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
 +            ncg[mc] += 1;
 +            nat[mc] += nrcg;
 +        }
 +    }
 +
 +    inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
 +    inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home);
 +
 +    *ncg_moved = 0;
 +    for (i = 0; i < dd->ndim*2; i++)
 +    {
 +        *ncg_moved += ncg[i];
 +    }
 +
 +    nvec = 1;
 +    if (bV)
 +    {
 +        nvec++;
 +    }
 +    if (bSDX)
 +    {
 +        nvec++;
 +    }
 +    if (bCGP)
 +    {
 +        nvec++;
 +    }
 +
 +    /* Make sure the communication buffers are large enough */
 +    for (mc = 0; mc < dd->ndim*2; mc++)
 +    {
 +        nvr = ncg[mc] + nat[mc]*nvec;
 +        if (nvr > comm->cgcm_state_nalloc[mc])
 +        {
 +            comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
 +            srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
 +        }
 +    }
 +
 +    switch (fr->cutoff_scheme)
 +    {
 +        case ecutsGROUP:
 +            /* Recalculating cg_cm might be cheaper than communicating,
 +             * but that could give rise to rounding issues.
 +             */
 +            home_pos_cg =
 +                compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
 +                                        nvec, cg_cm, comm, bCompact);
 +            break;
 +        case ecutsVERLET:
 +            /* Without charge groups we send the moved atom coordinates
 +             * over twice. This is so the code below can be used without
 +             * many conditionals both with and without charge groups.
 +             */
 +            home_pos_cg =
 +                compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
 +                                        nvec, state->x, comm, FALSE);
 +            if (bCompact)
 +            {
 +                home_pos_cg -= *ncg_moved;
 +            }
 +            break;
 +        default:
 +            gmx_incons("unimplemented");
 +            home_pos_cg = 0;
 +    }
 +
 +    vec         = 0;
 +    home_pos_at =
 +        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
 +                                nvec, vec++, state->x, comm, bCompact);
 +    if (bV)
 +    {
 +        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
 +                                nvec, vec++, state->v, comm, bCompact);
 +    }
 +    if (bSDX)
 +    {
 +        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
 +                                nvec, vec++, state->sd_X, comm, bCompact);
 +    }
 +    if (bCGP)
 +    {
 +        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
 +                                nvec, vec++, state->cg_p, comm, bCompact);
 +    }
 +
 +    if (bCompact)
 +    {
 +        compact_ind(dd->ncg_home, move,
 +                    dd->index_gl, dd->cgindex, dd->gatindex,
 +                    dd->ga2la, comm->bLocalCG,
 +                    fr->cginfo);
 +    }
 +    else
 +    {
 +        if (fr->cutoff_scheme == ecutsVERLET)
 +        {
 +            moved = get_moved(comm, dd->ncg_home);
 +
 +            for (k = 0; k < dd->ncg_home; k++)
 +            {
 +                moved[k] = 0;
 +            }
 +        }
 +        else
 +        {
 +            moved = fr->ns.grid->cell_index;
 +        }
 +
 +        clear_and_mark_ind(dd->ncg_home, move,
 +                           dd->index_gl, dd->cgindex, dd->gatindex,
 +                           dd->ga2la, comm->bLocalCG,
 +                           moved);
 +    }
 +
 +    cginfo_mb = fr->cginfo_mb;
 +
 +    *ncg_stay_home = home_pos_cg;
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim      = dd->dim[d];
 +        ncg_recv = 0;
 +        nat_recv = 0;
 +        nvr      = 0;
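 +        /* With only two cells along this dimension the forward and
 +         * backward neighbor are the same rank, so a single pulse suffices.
 +         */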
 +        for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
 +        {
 +            cdd = d*2 + dir;
 +            /* Communicate the cg and atom counts */
 +            sbuf[0] = ncg[cdd];
 +            sbuf[1] = nat[cdd];
 +            if (debug)
 +            {
 +                fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n",
 +                        d, dir, sbuf[0], sbuf[1]);
 +            }
 +            dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
 +
 +            if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
 +            {
 +                comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
 +                srenew(comm->buf_int, comm->nalloc_int);
 +            }
 +
 +            /* Communicate the charge group indices, sizes and flags */
 +            dd_sendrecv_int(dd, d, dir,
 +                            comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
 +                            comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
 +
 +            nvs = ncg[cdd] + nat[cdd]*nvec;
 +            i   = rbuf[0]  + rbuf[1] *nvec;
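 +            /* Buffer layout: one rvec per cg (its center) followed by
 +             * nvec rvecs per atom (x and, when present, v, sd_X, cg_p).
 +             */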
 +            vec_rvec_check_alloc(&comm->vbuf, nvr+i);
 +
 +            /* Communicate cgcm and state */
 +            dd_sendrecv_rvec(dd, d, dir,
 +                             comm->cgcm_state[cdd], nvs,
 +                             comm->vbuf.v+nvr, i);
 +            ncg_recv += rbuf[0];
 +            nat_recv += rbuf[1];
 +            nvr      += i;
 +        }
 +
 +        /* Process the received charge groups */
 +        buf_pos = 0;
 +        for (cg = 0; cg < ncg_recv; cg++)
 +        {
 +            flag = comm->buf_int[cg*DD_CGIBS+1];
 +
 +            if (dim >= npbcdim && dd->nc[dim] > 2)
 +            {
 +                /* No pbc in this dim and more than one domain boundary.
 +                 * We check separately whether a charge group moved too far.
 +                 */
 +                if (((flag & DD_FLAG_FW(d)) &&
 +                     comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
 +                    ((flag & DD_FLAG_BW(d)) &&
 +                     comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
 +                {
 +                    cg_move_error(fplog, dd, step, cg, dim,
 +                                  (flag & DD_FLAG_FW(d)) ? 1 : 0,
 +                                  FALSE, 0,
 +                                  comm->vbuf.v[buf_pos],
 +                                  comm->vbuf.v[buf_pos],
 +                                  comm->vbuf.v[buf_pos][dim]);
 +                }
 +            }
 +
 +            mc = -1;
 +            if (d < dd->ndim-1)
 +            {
 +                /* Check which direction this cg should go */
 +                for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
 +                {
 +                    if (dd->bGridJump)
 +                    {
 +                        /* The cell boundaries for dimension d2 are not equal
 +                         * for each cell row of the lower dimension(s),
 +                         * therefore we might need to redetermine where
 +                         * this cg should go.
 +                         */
 +                        dim2 = dd->dim[d2];
 +                        /* If this cg crosses the box boundary in dimension d2
 +                         * we can use the communicated flag, so we do not
 +                         * have to worry about pbc.
 +                         */
 +                        if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
 +                               (flag & DD_FLAG_FW(d2))) ||
 +                              (dd->ci[dim2] == 0 &&
 +                               (flag & DD_FLAG_BW(d2)))))
 +                        {
 +                            /* Clear the two flags for this dimension */
 +                            flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
 +                            /* Determine the location of this cg
 +                             * in lattice coordinates
 +                             */
 +                            pos_d = comm->vbuf.v[buf_pos][dim2];
 +                            if (tric_dir[dim2])
 +                            {
 +                                for (d3 = dim2+1; d3 < DIM; d3++)
 +                                {
 +                                    pos_d +=
 +                                        comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
 +                                }
 +                            }
 +                            /* Check that we are not at the box edge.
 +                             * pbc is only handled in the first step above,
 +                             * but due to different rounding this check
 +                             * could move a cg over pbc where the first step did not.
 +                             */
 +                            if (pos_d >= cell_x1[dim2] &&
 +                                dd->ci[dim2] != dd->nc[dim2]-1)
 +                            {
 +                                flag |= DD_FLAG_FW(d2);
 +                            }
 +                            else if (pos_d < cell_x0[dim2] &&
 +                                     dd->ci[dim2] != 0)
 +                            {
 +                                flag |= DD_FLAG_BW(d2);
 +                            }
 +                            comm->buf_int[cg*DD_CGIBS+1] = flag;
 +                        }
 +                    }
 +                    /* Determine to which neighboring cell this cg should go */
 +                    if (flag & DD_FLAG_FW(d2))
 +                    {
 +                        mc = d2*2;
 +                    }
 +                    else if (flag & DD_FLAG_BW(d2))
 +                    {
 +                        if (dd->nc[dd->dim[d2]] > 2)
 +                        {
 +                            mc = d2*2+1;
 +                        }
 +                        else
 +                        {
 +                            mc = d2*2;
 +                        }
 +                    }
 +                }
 +            }
 +
 +            nrcg = flag & DD_FLAG_NRCG;
 +            if (mc == -1)
 +            {
 +                if (home_pos_cg+1 > dd->cg_nalloc)
 +                {
 +                    dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
 +                    srenew(dd->index_gl, dd->cg_nalloc);
 +                    srenew(dd->cgindex, dd->cg_nalloc+1);
 +                }
 +                /* Set the global charge group index and size */
 +                dd->index_gl[home_pos_cg]  = comm->buf_int[cg*DD_CGIBS];
 +                dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
 +                /* Copy the state from the buffer */
 +                dd_check_alloc_ncg(fr, state, f, home_pos_cg+1);
 +                if (fr->cutoff_scheme == ecutsGROUP)
 +                {
 +                    cg_cm = fr->cg_cm;
 +                    copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]);
 +                }
 +                buf_pos++;
 +
 +                /* Set the cginfo */
 +                fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
 +                                                   dd->index_gl[home_pos_cg]);
 +                if (comm->bLocalCG)
 +                {
 +                    comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
 +                }
 +
 +                if (home_pos_at+nrcg > state->nalloc)
 +                {
 +                    dd_realloc_state(state, f, home_pos_at+nrcg);
 +                }
 +                for (i = 0; i < nrcg; i++)
 +                {
 +                    copy_rvec(comm->vbuf.v[buf_pos++],
 +                              state->x[home_pos_at+i]);
 +                }
 +                if (bV)
 +                {
 +                    for (i = 0; i < nrcg; i++)
 +                    {
 +                        copy_rvec(comm->vbuf.v[buf_pos++],
 +                                  state->v[home_pos_at+i]);
 +                    }
 +                }
 +                if (bSDX)
 +                {
 +                    for (i = 0; i < nrcg; i++)
 +                    {
 +                        copy_rvec(comm->vbuf.v[buf_pos++],
 +                                  state->sd_X[home_pos_at+i]);
 +                    }
 +                }
 +                if (bCGP)
 +                {
 +                    for (i = 0; i < nrcg; i++)
 +                    {
 +                        copy_rvec(comm->vbuf.v[buf_pos++],
 +                                  state->cg_p[home_pos_at+i]);
 +                    }
 +                }
 +                home_pos_cg += 1;
 +                home_pos_at += nrcg;
 +            }
 +            else
 +            {
 +                /* Reallocate the buffers if necessary */
 +                if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
 +                {
 +                    comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
 +                    srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
 +                }
 +                nvr = ncg[mc] + nat[mc]*nvec;
 +                if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
 +                {
 +                    comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
 +                    srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
 +                }
 +                /* Copy from the receive to the send buffers */
 +                memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
 +                       comm->buf_int + cg*DD_CGIBS,
 +                       DD_CGIBS*sizeof(int));
 +                memcpy(comm->cgcm_state[mc][nvr],
 +                       comm->vbuf.v[buf_pos],
 +                       (1+nrcg*nvec)*sizeof(rvec));
 +                buf_pos += 1 + nrcg*nvec;
 +                ncg[mc] += 1;
 +                nat[mc] += nrcg;
 +            }
 +        }
 +    }
 +
 +    /* With sorting (!bCompact) the indices are now only partially up to date
 +     * and ncg_home and nat_home are not the real count, since there are
 +     * "holes" in the arrays for the charge groups that moved to neighbors.
 +     */
 +    if (fr->cutoff_scheme == ecutsVERLET)
 +    {
 +        moved = get_moved(comm, home_pos_cg);
 +
 +        for (i = dd->ncg_home; i < home_pos_cg; i++)
 +        {
 +            moved[i] = 0;
 +        }
 +    }
 +    dd->ncg_home = home_pos_cg;
 +    dd->nat_home = home_pos_at;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,
 +                "Finished repartitioning: cgs moved out %d, new home %d\n",
 +                *ncg_moved, dd->ncg_home-*ncg_moved);
 +    }
 +}
 +
 +void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl)
 +{
 +    dd->comm->cycl[ddCycl] += cycles;
 +    dd->comm->cycl_n[ddCycl]++;
 +    if (cycles > dd->comm->cycl_max[ddCycl])
 +    {
 +        dd->comm->cycl_max[ddCycl] = cycles;
 +    }
 +}
 +
 +static double force_flop_count(t_nrnb *nrnb)
 +{
 +    int         i;
 +    double      sum;
 +    const char *name;
 +
 +    sum = 0;
 +    for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++)
 +    {
 +        /* To get closer to the real timings, we halve the count
 +         * for the normal loops and halve it again for water loops.
 +         */
 +        name = nrnb_str(i);
 +        if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
 +        {
 +            sum += nrnb->n[i]*0.25*cost_nrnb(i);
 +        }
 +        else
 +        {
 +            sum += nrnb->n[i]*0.50*cost_nrnb(i);
 +        }
 +    }
 +    for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++)
 +    {
 +        name = nrnb_str(i);
 +        if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
 +        {
 +            sum += nrnb->n[i]*cost_nrnb(i);
 +        }
 +    }
 +    for (i = eNR_BONDS; i <= eNR_WALLS; i++)
 +    {
 +        sum += nrnb->n[i]*cost_nrnb(i);
 +    }
 +
 +    return sum;
 +}
 +
 +void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb)
 +{
 +    if (dd->comm->eFlop)
 +    {
 +        dd->comm->flop -= force_flop_count(nrnb);
 +    }
 +}
 +void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb)
 +{
 +    if (dd->comm->eFlop)
 +    {
 +        dd->comm->flop += force_flop_count(nrnb);
 +        dd->comm->flop_n++;
 +    }
 +}
 +
 +static void clear_dd_cycle_counts(gmx_domdec_t *dd)
 +{
 +    int i;
 +
 +    for (i = 0; i < ddCyclNr; i++)
 +    {
 +        dd->comm->cycl[i]     = 0;
 +        dd->comm->cycl_n[i]   = 0;
 +        dd->comm->cycl_max[i] = 0;
 +    }
 +    dd->comm->flop   = 0;
 +    dd->comm->flop_n = 0;
 +}
 +
 +static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
 +{
 +    gmx_domdec_comm_t *comm;
 +    gmx_domdec_load_t *load;
 +    gmx_domdec_root_t *root = NULL;
 +    int                d, dim, cid, i, pos;
 +    float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
 +    gmx_bool           bSepPME;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "get_load_distribution start\n");
 +    }
 +
 +    wallcycle_start(wcycle, ewcDDCOMMLOAD);
 +
 +    comm = dd->comm;
 +
 +    bSepPME = (dd->pme_nodeid >= 0);
 +
 +    for (d = dd->ndim-1; d >= 0; d--)
 +    {
 +        dim = dd->dim[d];
 +        /* Check if we participate in the communication in this dimension */
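 +        /* The load is reduced one dimension at a time, starting with
 +         * the last; only the ranks at ci == 0 of the dimensions that
 +         * were already reduced take part in the next round.
 +         */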
 +        if (d == dd->ndim-1 ||
 +            (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
 +        {
 +            load = &comm->load[d];
 +            if (dd->bGridJump)
 +            {
 +                cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
 +            }
 +            pos = 0;
 +            if (d == dd->ndim-1)
 +            {
 +                sbuf[pos++] = dd_force_load(comm);
 +                sbuf[pos++] = sbuf[0];
 +                if (dd->bGridJump)
 +                {
 +                    sbuf[pos++] = sbuf[0];
 +                    sbuf[pos++] = cell_frac;
 +                    if (d > 0)
 +                    {
 +                        sbuf[pos++] = comm->cell_f_max0[d];
 +                        sbuf[pos++] = comm->cell_f_min1[d];
 +                    }
 +                }
 +                if (bSepPME)
 +                {
 +                    sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
 +                    sbuf[pos++] = comm->cycl[ddCyclPME];
 +                }
 +            }
 +            else
 +            {
 +                sbuf[pos++] = comm->load[d+1].sum;
 +                sbuf[pos++] = comm->load[d+1].max;
 +                if (dd->bGridJump)
 +                {
 +                    sbuf[pos++] = comm->load[d+1].sum_m;
 +                    sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
 +                    sbuf[pos++] = comm->load[d+1].flags;
 +                    if (d > 0)
 +                    {
 +                        sbuf[pos++] = comm->cell_f_max0[d];
 +                        sbuf[pos++] = comm->cell_f_min1[d];
 +                    }
 +                }
 +                if (bSepPME)
 +                {
 +                    sbuf[pos++] = comm->load[d+1].mdf;
 +                    sbuf[pos++] = comm->load[d+1].pme;
 +                }
 +            }
 +            load->nload = pos;
 +            /* Communicate a row in DD direction d.
 +             * The communicators are set up such that the root always has rank 0.
 +             */
 +#ifdef GMX_MPI
 +            MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
 +                       load->load, load->nload*sizeof(float), MPI_BYTE,
 +                       0, comm->mpi_comm_load[d]);
 +#endif
 +            if (dd->ci[dim] == dd->master_ci[dim])
 +            {
 +                /* We are the root, process this row */
 +                if (comm->bDynLoadBal)
 +                {
 +                    root = comm->root[d];
 +                }
 +                load->sum      = 0;
 +                load->max      = 0;
 +                load->sum_m    = 0;
 +                load->cvol_min = 1;
 +                load->flags    = 0;
 +                load->mdf      = 0;
 +                load->pme      = 0;
 +                pos            = 0;
 +                for (i = 0; i < dd->nc[dim]; i++)
 +                {
 +                    load->sum += load->load[pos++];
 +                    load->max  = max(load->max, load->load[pos]);
 +                    pos++;
 +                    if (dd->bGridJump)
 +                    {
 +                        if (root->bLimited)
 +                        {
 +                            /* This direction could not be load balanced properly,
 +                             * therefore we need to use the maximum instead of the average load.
 +                             */
 +                            load->sum_m = max(load->sum_m, load->load[pos]);
 +                        }
 +                        else
 +                        {
 +                            load->sum_m += load->load[pos];
 +                        }
 +                        pos++;
 +                        load->cvol_min = min(load->cvol_min, load->load[pos]);
 +                        pos++;
 +                        if (d < dd->ndim-1)
 +                        {
 +                            load->flags = (int)(load->load[pos++] + 0.5);
 +                        }
 +                        if (d > 0)
 +                        {
 +                            root->cell_f_max0[i] = load->load[pos++];
 +                            root->cell_f_min1[i] = load->load[pos++];
 +                        }
 +                    }
 +                    if (bSepPME)
 +                    {
 +                        load->mdf = max(load->mdf, load->load[pos]);
 +                        pos++;
 +                        load->pme = max(load->pme, load->load[pos]);
 +                        pos++;
 +                    }
 +                }
 +                if (comm->bDynLoadBal && root->bLimited)
 +                {
 +                    load->sum_m *= dd->nc[dim];
 +                    load->flags |= (1<<d);
 +                }
 +            }
 +        }
 +    }
 +
 +    if (DDMASTER(dd))
 +    {
 +        comm->nload      += dd_load_count(comm);
 +        comm->load_step  += comm->cycl[ddCyclStep];
 +        comm->load_sum   += comm->load[0].sum;
 +        comm->load_max   += comm->load[0].max;
 +        if (comm->bDynLoadBal)
 +        {
 +            for (d = 0; d < dd->ndim; d++)
 +            {
 +                if (comm->load[0].flags & (1<<d))
 +                {
 +                    comm->load_lim[d]++;
 +                }
 +            }
 +        }
 +        if (bSepPME)
 +        {
 +            comm->load_mdf += comm->load[0].mdf;
 +            comm->load_pme += comm->load[0].pme;
 +        }
 +    }
 +
 +    wallcycle_stop(wcycle, ewcDDCOMMLOAD);
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "get_load_distribution finished\n");
 +    }
 +}
 +
 +static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
 +{
 +    /* Return the relative performance loss on the total run time
 +     * due to the force calculation load imbalance.
 +     */
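 +    /* Per step the lost time is max - average force load, so the
 +     * relative loss is (load_max - load_sum/nnodes)/load_step, which
 +     * is what the expression below computes.
 +     */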
 +    if (dd->comm->nload > 0)
 +    {
 +        return
 +            (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
 +            (dd->comm->load_step*dd->nnodes);
 +    }
 +    else
 +    {
 +        return 0;
 +    }
 +}
 +
 +static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
 +{
 +    char               buf[STRLEN];
 +    int                npp, npme, nnodes, d, limp;
 +    float              imbal, pme_f_ratio, lossf, lossp = 0;
 +    gmx_bool           bLim;
 +    gmx_domdec_comm_t *comm;
 +
 +    comm = dd->comm;
 +    if (DDMASTER(dd) && comm->nload > 0)
 +    {
 +        npp    = dd->nnodes;
 +        npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
 +        nnodes = npp + npme;
 +        imbal  = comm->load_max*npp/comm->load_sum - 1;
 +        lossf  = dd_force_imb_perf_loss(dd);
 +        sprintf(buf, " Average load imbalance: %.1f %%\n", imbal*100);
 +        fprintf(fplog, "%s", buf);
 +        fprintf(stderr, "\n");
 +        fprintf(stderr, "%s", buf);
 +        sprintf(buf, " Part of the total run time spent waiting due to load imbalance: %.1f %%\n", lossf*100);
 +        fprintf(fplog, "%s", buf);
 +        fprintf(stderr, "%s", buf);
 +        bLim = FALSE;
 +        if (comm->bDynLoadBal)
 +        {
 +            sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
 +            for (d = 0; d < dd->ndim; d++)
 +            {
 +                limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
 +                sprintf(buf+strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limp);
 +                if (limp >= 50)
 +                {
 +                    bLim = TRUE;
 +                }
 +            }
 +            sprintf(buf+strlen(buf), "\n");
 +            fprintf(fplog, "%s", buf);
 +            fprintf(stderr, "%s", buf);
 +        }
 +        if (npme > 0)
 +        {
 +            pme_f_ratio = comm->load_pme/comm->load_mdf;
 +            lossp       = (comm->load_pme - comm->load_mdf)/comm->load_step;
 +            if (lossp <= 0)
 +            {
 +                lossp *= (float)npme/(float)nnodes;
 +            }
 +            else
 +            {
 +                lossp *= (float)npp/(float)nnodes;
 +            }
 +            sprintf(buf, " Average PME mesh/force load: %5.3f\n", pme_f_ratio);
 +            fprintf(fplog, "%s", buf);
 +            fprintf(stderr, "%s", buf);
 +            sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossp)*100);
 +            fprintf(fplog, "%s", buf);
 +            fprintf(stderr, "%s", buf);
 +        }
 +        fprintf(fplog, "\n");
 +        fprintf(stderr, "\n");
 +
 +        if (lossf >= DD_PERF_LOSS)
 +        {
 +            sprintf(buf,
 +                    "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
 +                    "      in the domain decomposition.\n", lossf*100);
 +            if (!comm->bDynLoadBal)
 +            {
 +                sprintf(buf+strlen(buf), "      You might want to use dynamic load balancing (option -dlb).\n");
 +            }
 +            else if (bLim)
 +            {
 +                sprintf(buf+strlen(buf), "      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
 +            }
 +            fprintf(fplog, "%s\n", buf);
 +            fprintf(stderr, "%s\n", buf);
 +        }
 +        if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
 +        {
 +            sprintf(buf,
 +                    "NOTE: %.1f %% performance was lost because the PME nodes\n"
 +                    "      had %s work to do than the PP nodes.\n"
 +                    "      You might want to %s the number of PME nodes\n"
 +                    "      or %s the cut-off and the grid spacing.\n",
 +                    fabs(lossp*100),
 +                    (lossp < 0) ? "less"     : "more",
 +                    (lossp < 0) ? "decrease" : "increase",
 +                    (lossp < 0) ? "decrease" : "increase");
 +            fprintf(fplog, "%s\n", buf);
 +            fprintf(stderr, "%s\n", buf);
 +        }
 +    }
 +}
 +
 +static float dd_vol_min(gmx_domdec_t *dd)
 +{
 +    return dd->comm->load[0].cvol_min*dd->nnodes;
 +}
 +
 +static gmx_bool dd_load_flags(gmx_domdec_t *dd)
 +{
 +    return dd->comm->load[0].flags;
 +}
 +
 +static float dd_f_imbal(gmx_domdec_t *dd)
 +{
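 +    /* Relative force-load imbalance: the maximum over the average minus 1 */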
 +    return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
 +}
 +
 +float dd_pme_f_ratio(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->cycl_n[ddCyclPME] > 0)
 +    {
 +        return dd->comm->load[0].pme/dd->comm->load[0].mdf;
 +    }
 +    else
 +    {
 +        return -1.0;
 +    }
 +}
 +
 +static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_large_int_t step)
 +{
 +    int  flags, d;
 +    char buf[22];
 +
 +    flags = dd_load_flags(dd);
 +    if (flags)
 +    {
 +        fprintf(fplog,
 +                "DD  load balancing is limited by minimum cell size in dimension");
 +        for (d = 0; d < dd->ndim; d++)
 +        {
 +            if (flags & (1<<d))
 +            {
 +                fprintf(fplog, " %c", dim2char(dd->dim[d]));
 +            }
 +        }
 +        fprintf(fplog, "\n");
 +    }
 +    fprintf(fplog, "DD  step %s", gmx_step_str(step, buf));
 +    if (dd->comm->bDynLoadBal)
 +    {
 +        fprintf(fplog, "  vol min/aver %5.3f%c",
 +                dd_vol_min(dd), flags ? '!' : ' ');
 +    }
 +    fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
 +    if (dd->comm->cycl_n[ddCyclPME])
 +    {
 +        fprintf(fplog, "  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
 +    }
 +    fprintf(fplog, "\n\n");
 +}
 +
 +static void dd_print_load_verbose(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->bDynLoadBal)
 +    {
 +        fprintf(stderr, "vol %4.2f%c ",
 +                dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
 +    }
 +    fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
 +    if (dd->comm->cycl_n[ddCyclPME])
 +    {
 +        fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
 +    }
 +}
 +
 +#ifdef GMX_MPI
 +static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
 +{
 +    MPI_Comm           c_row;
 +    int                dim, i, rank;
 +    ivec               loc_c;
 +    gmx_domdec_root_t *root;
 +    gmx_bool           bPartOfGroup = FALSE;
 +
 +    dim = dd->dim[dim_ind];
 +    copy_ivec(loc, loc_c);
 +    for (i = 0; i < dd->nc[dim]; i++)
 +    {
 +        loc_c[dim] = i;
 +        rank       = dd_index(dd->nc, loc_c);
 +        if (rank == dd->rank)
 +        {
 +            /* This process is part of the group */
 +            bPartOfGroup = TRUE;
 +        }
 +    }
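 +    /* Ranks that are not part of this row pass MPI_UNDEFINED as color
 +     * and get MPI_COMM_NULL back from MPI_Comm_split.
 +     */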
 +    MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
 +                   &c_row);
 +    if (bPartOfGroup)
 +    {
 +        dd->comm->mpi_comm_load[dim_ind] = c_row;
 +        if (dd->comm->eDLB != edlbNO)
 +        {
 +            if (dd->ci[dim] == dd->master_ci[dim])
 +            {
 +                /* This is the root process of this row */
 +                snew(dd->comm->root[dim_ind], 1);
 +                root = dd->comm->root[dim_ind];
 +                snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind));
 +                snew(root->old_cell_f, dd->nc[dim]+1);
 +                snew(root->bCellMin, dd->nc[dim]);
 +                if (dim_ind > 0)
 +                {
 +                    snew(root->cell_f_max0, dd->nc[dim]);
 +                    snew(root->cell_f_min1, dd->nc[dim]);
 +                    snew(root->bound_min, dd->nc[dim]);
 +                    snew(root->bound_max, dd->nc[dim]);
 +                }
 +                snew(root->buf_ncd, dd->nc[dim]);
 +            }
 +            else
 +            {
 +                /* This is not a root process, we only need to receive cell_f */
 +                snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind));
 +            }
 +        }
 +        if (dd->ci[dim] == dd->master_ci[dim])
 +        {
 +            snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
 +        }
 +    }
 +}
 +#endif
 +
 +static void make_load_communicators(gmx_domdec_t *dd)
 +{
 +#ifdef GMX_MPI
 +    int  dim0, dim1, i, j;
 +    ivec loc;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Making load communicators\n");
 +    }
 +
 +    snew(dd->comm->load, dd->ndim);
 +    snew(dd->comm->mpi_comm_load, dd->ndim);
 +
 +    clear_ivec(loc);
 +    make_load_communicator(dd, 0, loc);
 +    if (dd->ndim > 1)
 +    {
 +        dim0 = dd->dim[0];
 +        for (i = 0; i < dd->nc[dim0]; i++)
 +        {
 +            loc[dim0] = i;
 +            make_load_communicator(dd, 1, loc);
 +        }
 +    }
 +    if (dd->ndim > 2)
 +    {
 +        dim0 = dd->dim[0];
 +        for (i = 0; i < dd->nc[dim0]; i++)
 +        {
 +            loc[dim0] = i;
 +            dim1      = dd->dim[1];
 +            for (j = 0; j < dd->nc[dim1]; j++)
 +            {
 +                loc[dim1] = j;
 +                make_load_communicator(dd, 2, loc);
 +            }
 +        }
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Finished making load communicators\n");
 +    }
 +#endif
 +}
 +
 +void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd)
 +{
 +    gmx_bool                bZYX;
 +    int                     d, dim, i, j, m;
 +    ivec                    tmp, s;
 +    int                     nzone, nzonep;
 +    ivec                    dd_zp[DD_MAXIZONE];
 +    gmx_domdec_zones_t     *zones;
 +    gmx_domdec_ns_ranges_t *izone;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        copy_ivec(dd->ci, tmp);
 +        tmp[dim]           = (tmp[dim] + 1) % dd->nc[dim];
 +        dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
 +        copy_ivec(dd->ci, tmp);
 +        tmp[dim]           = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
 +        dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
 +        if (debug)
 +        {
 +            fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
 +                    dd->rank, dim,
 +                    dd->neighbor[d][0],
 +                    dd->neighbor[d][1]);
 +        }
 +    }
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog, "\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
 +                dd->ndim,
 +                dd->nc[XX], dd->nc[YY], dd->nc[ZZ],
 +                dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 +    }
 +    switch (dd->ndim)
 +    {
 +        case 3:
 +            nzone  = dd_z3n;
 +            nzonep = dd_zp3n;
 +            for (i = 0; i < nzonep; i++)
 +            {
 +                copy_ivec(dd_zp3[i], dd_zp[i]);
 +            }
 +            break;
 +        case 2:
 +            nzone  = dd_z2n;
 +            nzonep = dd_zp2n;
 +            for (i = 0; i < nzonep; i++)
 +            {
 +                copy_ivec(dd_zp2[i], dd_zp[i]);
 +            }
 +            break;
 +        case 1:
 +            nzone  = dd_z1n;
 +            nzonep = dd_zp1n;
 +            for (i = 0; i < nzonep; i++)
 +            {
 +                copy_ivec(dd_zp1[i], dd_zp[i]);
 +            }
 +            break;
 +        default:
 +            gmx_fatal(FARGS, "Can only do 1, 2 or 3D domain decomposition");
 +            nzone  = 0;
 +            nzonep = 0;
 +    }
 +
 +    zones = &dd->comm->zones;
 +
 +    for (i = 0; i < nzone; i++)
 +    {
 +        m = 0;
 +        clear_ivec(zones->shift[i]);
 +        for (d = 0; d < dd->ndim; d++)
 +        {
 +            zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
 +        }
 +    }
 +
 +    zones->n = nzone;
 +    for (i = 0; i < nzone; i++)
 +    {
 +        for (d = 0; d < DIM; d++)
 +        {
 +            s[d] = dd->ci[d] - zones->shift[i][d];
 +            if (s[d] < 0)
 +            {
 +                s[d] += dd->nc[d];
 +            }
 +            else if (s[d] >= dd->nc[d])
 +            {
 +                s[d] -= dd->nc[d];
 +            }
 +        }
 +    }
 +    zones->nizone = nzonep;
 +    for (i = 0; i < zones->nizone; i++)
 +    {
 +        if (dd_zp[i][0] != i)
 +        {
 +            gmx_fatal(FARGS, "Internal inconsistency in the dd grid setup");
 +        }
 +        izone     = &zones->izone[i];
 +        izone->j0 = dd_zp[i][1];
 +        izone->j1 = dd_zp[i][2];
 +        for (dim = 0; dim < DIM; dim++)
 +        {
 +            if (dd->nc[dim] == 1)
 +            {
 +                /* All shifts should be allowed */
 +                izone->shift0[dim] = -1;
 +                izone->shift1[dim] = 1;
 +            }
 +            else
 +            {
 +                /*
 +                   izone->shift0[d] = 0;
 +                   izone->shift1[d] = 0;
 +                   for(j=izone->j0; j<izone->j1; j++) {
 +                   if (dd->shift[j][d] > dd->shift[i][d])
 +                   izone->shift0[d] = -1;
 +                   if (dd->shift[j][d] < dd->shift[i][d])
 +                   izone->shift1[d] = 1;
 +                   }
 +                 */
 +
 +                int shift_diff;
 +
 +                /* Assume the shifts are not more than one cell */
 +                izone->shift0[dim] = 1;
 +                izone->shift1[dim] = -1;
 +                for (j = izone->j0; j < izone->j1; j++)
 +                {
 +                    shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
 +                    if (shift_diff < izone->shift0[dim])
 +                    {
 +                        izone->shift0[dim] = shift_diff;
 +                    }
 +                    if (shift_diff > izone->shift1[dim])
 +                    {
 +                        izone->shift1[dim] = shift_diff;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    if (dd->comm->eDLB != edlbNO)
 +    {
 +        snew(dd->comm->root, dd->ndim);
 +    }
 +
 +    if (dd->comm->bRecordLoad)
 +    {
 +        make_load_communicators(dd);
 +    }
 +}
 +
 +static void make_pp_communicator(FILE *fplog, t_commrec *cr, int reorder)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    int                i, rank, *buf;
 +    ivec               periods;
 +#ifdef GMX_MPI
 +    MPI_Comm           comm_cart;
 +#endif
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +#ifdef GMX_MPI
 +    if (comm->bCartesianPP)
 +    {
 +        /* Set up cartesian communication for the particle-particle part */
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n",
 +                    dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
 +        }
 +
 +        for (i = 0; i < DIM; i++)
 +        {
 +            periods[i] = TRUE;
 +        }
 +        MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder,
 +                        &comm_cart);
 +        /* We overwrite the old communicator with the new cartesian one */
 +        cr->mpi_comm_mygroup = comm_cart;
 +    }
 +
 +    dd->mpi_comm_all = cr->mpi_comm_mygroup;
 +    MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
 +
 +    if (comm->bCartesianPP_PME)
 +    {
 +        /* Since we want to use the original Cartesian setup for the
 +         * simulation, and not the one after the split, we need to make an index.
 +         */
 +        snew(comm->ddindex2ddnodeid, dd->nnodes);
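 +        /* Each rank fills in only its own entry; the global sum then
 +         * assembles the complete index table on all ranks.
 +         */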
 +        comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
 +        gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
 +        /* Get the rank of the DD master,
 +         * above we made sure that the master node is a PP node.
 +         */
 +        if (MASTER(cr))
 +        {
 +            rank = dd->rank;
 +        }
 +        else
 +        {
 +            rank = 0;
 +        }
 +        MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
 +    }
 +    else if (comm->bCartesianPP)
 +    {
 +        if (cr->npmenodes == 0)
 +        {
 +            /* The PP communicator is also
 +             * the communicator for this simulation
 +             */
 +            cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
 +        }
 +        cr->nodeid = dd->rank;
 +
 +        MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
 +
 +        /* We need to make an index to go from the coordinates
 +         * to the nodeid of this simulation.
 +         */
 +        snew(comm->ddindex2simnodeid, dd->nnodes);
 +        snew(buf, dd->nnodes);
 +        if (cr->duty & DUTY_PP)
 +        {
 +            buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
 +        }
 +        /* Communicate the ddindex to simulation nodeid index */
 +        MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
 +                      cr->mpi_comm_mysim);
 +        sfree(buf);
 +
 +        /* Determine the master coordinates and rank.
 +         * The DD master should be the same node as the master of this sim.
 +         */
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            if (comm->ddindex2simnodeid[i] == 0)
 +            {
 +                ddindex2xyz(dd->nc, i, dd->master_ci);
 +                MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
 +            }
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug, "The master rank is %d\n", dd->masterrank);
 +        }
 +    }
 +    else
 +    {
 +        /* No Cartesian communicators */
 +        /* We use the rank in dd->mpi_comm_all as the DD index */
 +        ddindex2xyz(dd->nc, dd->rank, dd->ci);
 +        /* The simulation master nodeid is 0, so the DD master rank is also 0 */
 +        dd->masterrank = 0;
 +        clear_ivec(dd->master_ci);
 +    }
 +#endif
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog,
 +                "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
 +                dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug,
 +                "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
 +                dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 +    }
 +}
 +
 +static void receive_ddindex2simnodeid(t_commrec *cr)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    int               *buf;
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +#ifdef GMX_MPI
 +    if (!comm->bCartesianPP_PME && comm->bCartesianPP)
 +    {
 +        snew(comm->ddindex2simnodeid, dd->nnodes);
 +        snew(buf, dd->nnodes);
 +        if (cr->duty & DUTY_PP)
 +        {
 +            buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
 +        }
 +        /* Communicate the ddindex to simulation nodeid index */
 +        MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
 +                      cr->mpi_comm_mysim);
 +        sfree(buf);
 +    }
 +#endif
 +}
 +
 +static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
 +                                                     int ncg, int natoms)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                  i;
 +
 +    snew(ma, 1);
 +
 +    snew(ma->ncg, dd->nnodes);
 +    snew(ma->index, dd->nnodes+1);
 +    snew(ma->cg, ncg);
 +    snew(ma->nat, dd->nnodes);
 +    snew(ma->ibuf, dd->nnodes*2);
 +    snew(ma->cell_x, DIM);
 +    for (i = 0; i < DIM; i++)
 +    {
 +        snew(ma->cell_x[i], dd->nc[i]+1);
 +    }
 +
 +    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
 +    {
 +        ma->vbuf = NULL;
 +    }
 +    else
 +    {
 +        snew(ma->vbuf, natoms);
 +    }
 +
 +    return ma;
 +}
 +
 +static void split_communicator(FILE *fplog, t_commrec *cr, int dd_node_order,
 +                               int reorder)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    int                i, rank;
 +    gmx_bool           bDiv[DIM];
 +    ivec               periods;
 +#ifdef GMX_MPI
 +    MPI_Comm           comm_cart;
 +#endif
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +    if (comm->bCartesianPP)
 +    {
 +        for (i = 1; i < DIM; i++)
 +        {
 +            bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
 +        }
 +        if (bDiv[YY] || bDiv[ZZ])
 +        {
 +            comm->bCartesianPP_PME = TRUE;
 +            /* If we have 2D PME decomposition, which is always in x+y,
 +             * we stack the PME-only nodes in z.
 +             * Otherwise we choose the direction that provides the thinnest
 +             * slab of PME-only nodes, as this will have the least effect
 +             * on the PP communication.
 +             * But for the PME communication the opposite might be better.
 +             */
 +            if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
 +                             !bDiv[YY] ||
 +                             dd->nc[YY] > dd->nc[ZZ]))
 +            {
 +                comm->cartpmedim = ZZ;
 +            }
 +            else
 +            {
 +                comm->cartpmedim = YY;
 +            }
 +            comm->ntot[comm->cartpmedim]
 +                += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
 +        }
 +        else if (fplog)
 +        {
 +            fprintf(fplog, "#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
 +            fprintf(fplog,
 +                    "Will not use a Cartesian communicator for PP <-> PME\n\n");
 +        }
 +    }
 +
 +#ifdef GMX_MPI
 +    if (comm->bCartesianPP_PME)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
 +        }
 +
 +        for (i = 0; i < DIM; i++)
 +        {
 +            periods[i] = TRUE;
 +        }
 +        MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
 +                        &comm_cart);
 +
 +        MPI_Comm_rank(comm_cart, &rank);
 +        if (MASTERNODE(cr) && rank != 0)
 +        {
 +            gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
 +        }
 +
 +        /* With this assignment we lose the link to the original communicator,
 +         * which will usually be MPI_COMM_WORLD, unless we have multisim.
 +         */
 +        cr->mpi_comm_mysim = comm_cart;
 +        cr->sim_nodeid     = rank;
 +
 +        MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
 +
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Cartesian nodeid %d, coordinates %d %d %d\n\n",
 +                    cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 +        }
 +
 +        if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
 +        {
 +            cr->duty = DUTY_PP;
 +        }
 +        if (cr->npmenodes == 0 ||
 +            dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
 +        {
 +            cr->duty = DUTY_PME;
 +        }
 +
 +        /* Split the sim communicator into PP and PME-only nodes */
 +        MPI_Comm_split(cr->mpi_comm_mysim,
 +                       cr->duty,
 +                       dd_index(comm->ntot, dd->ci),
 +                       &cr->mpi_comm_mygroup);
 +    }
 +    else
 +    {
 +        switch (dd_node_order)
 +        {
 +            case ddnoPP_PME:
 +                if (fplog)
 +                {
 +                    fprintf(fplog, "Order of the nodes: PP first, PME last\n");
 +                }
 +                break;
 +            case ddnoINTERLEAVE:
 +                /* Interleave the PP-only and PME-only nodes,
 +                 * as on clusters with dual-core machines this will double
 +                 * the communication bandwidth of the PME processes
 +                 * and thus speed up the PP <-> PME and inter-PME communication.
 +                 */
 +                if (fplog)
 +                {
 +                    fprintf(fplog, "Interleaving PP and PME nodes\n");
 +                }
 +                comm->pmenodes = dd_pmenodes(cr);
 +                break;
 +            case ddnoCARTESIAN:
 +                break;
 +            default:
 +                gmx_fatal(FARGS, "Unknown dd_node_order=%d", dd_node_order);
 +        }
 +
 +        if (dd_simnode2pmenode(cr, cr->sim_nodeid) == -1)
 +        {
 +            cr->duty = DUTY_PME;
 +        }
 +        else
 +        {
 +            cr->duty = DUTY_PP;
 +        }
 +
 +        /* Split the sim communicator into PP and PME-only nodes */
 +        MPI_Comm_split(cr->mpi_comm_mysim,
 +                       cr->duty,
 +                       cr->nodeid,
 +                       &cr->mpi_comm_mygroup);
 +        MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
 +    }
 +#endif
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog, "This is a %s only node\n\n",
 +                (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
 +    }
 +}
 +
 +void make_dd_communicators(FILE *fplog, t_commrec *cr, int dd_node_order)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    int                CartReorder;
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +    copy_ivec(dd->nc, comm->ntot);
 +
 +    comm->bCartesianPP     = (dd_node_order == ddnoCARTESIAN);
 +    comm->bCartesianPP_PME = FALSE;
 +
 +    /* Reorder the nodes by default. This might change the MPI ranks.
 +     * Real reordering is only supported on very few architectures,
 +     * Blue Gene is one of them.
 +     */
 +    CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
 +
 +    if (cr->npmenodes > 0)
 +    {
 +        /* Split the communicator into a PP and PME part */
 +        split_communicator(fplog, cr, dd_node_order, CartReorder);
 +        if (comm->bCartesianPP_PME)
 +        {
 +            /* We (possibly) reordered the nodes in split_communicator,
 +             * so it is no longer required in make_pp_communicator.
 +             */
 +            CartReorder = FALSE;
 +        }
 +    }
 +    else
 +    {
 +        /* All nodes do PP and PME */
 +#ifdef GMX_MPI
 +        /* We do not require separate communicators */
 +        cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
 +#endif
 +    }
 +
 +    if (cr->duty & DUTY_PP)
 +    {
 +        /* Copy or make a new PP communicator */
 +        make_pp_communicator(fplog, cr, CartReorder);
 +    }
 +    else
 +    {
 +        receive_ddindex2simnodeid(cr);
 +    }
 +
 +    if (!(cr->duty & DUTY_PME))
 +    {
 +        /* Set up the communication to our PME node */
 +        dd->pme_nodeid           = dd_simnode2pmenode(cr, cr->sim_nodeid);
 +        dd->pme_receive_vir_ener = receive_vir_ener(cr);
 +        if (debug)
 +        {
 +            fprintf(debug, "My pme_nodeid %d receive ener %d\n",
 +                    dd->pme_nodeid, dd->pme_receive_vir_ener);
 +        }
 +    }
 +    else
 +    {
 +        dd->pme_nodeid = -1;
 +    }
 +
 +    if (DDMASTER(dd))
 +    {
 +        dd->ma = init_gmx_domdec_master_t(dd,
 +                                          comm->cgs_gl.nr,
 +                                          comm->cgs_gl.index[comm->cgs_gl.nr]);
 +    }
 +}
 +
 +static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string)
 +{
 +    real  *slb_frac, tot;
 +    int    i, n;
 +    double dbl;
 +
 +    slb_frac = NULL;
 +    if (nc > 1 && size_string != NULL)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Using static load balancing for the %s direction\n",
 +                    dir);
 +        }
 +        snew(slb_frac, nc);
 +        tot = 0;
 +        for (i = 0; i < nc; i++)
 +        {
 +            dbl = 0;
 +            sscanf(size_string, "%lf%n", &dbl, &n);
 +            if (dbl == 0)
 +            {
 +                gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
 +            }
 +            slb_frac[i]  = dbl;
 +            size_string += n;
 +            tot         += slb_frac[i];
 +        }
 +        /* Normalize */
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Relative cell sizes:");
 +        }
 +        for (i = 0; i < nc; i++)
 +        {
 +            slb_frac[i] /= tot;
 +            if (fplog)
 +            {
 +                fprintf(fplog, " %5.3f", slb_frac[i]);
 +            }
 +        }
 +        if (fplog)
 +        {
 +            fprintf(fplog, "\n");
 +        }
 +    }
 +
 +    return slb_frac;
 +}
 +
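 +/* Count the bonded interactions that involve more than two atoms,
 + * such as angles and dihedrals; each interaction occupies
 + * 1 + NRAL(ftype) entries in il[ftype] (the type index plus the
 + * atom indices), hence the division below.
 + */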
 +static int multi_body_bondeds_count(gmx_mtop_t *mtop)
 +{
 +    int                  n, nmol, ftype;
 +    gmx_mtop_ilistloop_t iloop;
 +    t_ilist             *il;
 +
 +    n     = 0;
 +    iloop = gmx_mtop_ilistloop_init(mtop);
 +    while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
 +    {
 +        for (ftype = 0; ftype < F_NRE; ftype++)
 +        {
 +            if ((interaction_function[ftype].flags & IF_BOND) &&
 +                NRAL(ftype) >  2)
 +            {
 +                n += nmol*il[ftype].nr/(1 + NRAL(ftype));
 +            }
 +        }
 +    }
 +
 +    return n;
 +}
 +
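 +/* Return the integer value of environment variable env_var,
 + * or def when it is not set; a set but non-numeric value yields 1.
 + */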
 +static int dd_nst_env(FILE *fplog, const char *env_var, int def)
 +{
 +    char *val;
 +    int   nst;
 +
 +    nst = def;
 +    val = getenv(env_var);
 +    if (val)
 +    {
 +        if (sscanf(val, "%d", &nst) <= 0)
 +        {
 +            nst = 1;
 +        }
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
 +                    env_var, val, nst);
 +        }
 +    }
 +
 +    return nst;
 +}
 +
 +static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string)
 +{
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr, "\n%s\n", warn_string);
 +    }
 +    if (fplog)
 +    {
 +        fprintf(fplog, "\n%s\n", warn_string);
 +    }
 +}
 +
 +static void check_dd_restrictions(t_commrec *cr, gmx_domdec_t *dd,
 +                                  t_inputrec *ir, FILE *fplog)
 +{
 +    if (ir->ePBC == epbcSCREW &&
 +        (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
 +    {
 +        gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
 +    }
 +
 +    if (ir->ns_type == ensSIMPLE)
 +    {
 +        gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
 +    }
 +
 +    if (ir->nstlist == 0)
 +    {
 +        gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
 +    }
 +
 +    if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
 +    {
 +        dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
 +    }
 +}
 +
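 +/* Return the minimum, over the decomposed dimensions, of the initial
 + * average cell size; skew_fac accounts for triclinic box skew.
 + */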
 +static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
 +{
 +    int  di, d;
 +    real r;
 +
 +    r = ddbox->box_size[XX];
 +    for (di = 0; di < dd->ndim; di++)
 +    {
 +        d = dd->dim[di];
 +        /* Check using the initial average cell size */
 +        r = min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
 +    }
 +
 +    return r;
 +}
 +
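 +/* Decide the DLB setting from the dlb option string ('a'uto, 'n'o, 'y'es).
 + * Reruns, non-dynamical integrators and missing cycle counters force "no";
 + * with reproducibility requested, "auto" becomes "no" and "yes" only
 + * produces a warning.
 + */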
 +static int check_dlb_support(FILE *fplog, t_commrec *cr,
 +                             const char *dlb_opt, gmx_bool bRecordLoad,
 +                             unsigned long Flags, t_inputrec *ir)
 +{
 +    int           eDLB = -1;
 +    char          buf[STRLEN];
 +
 +    switch (dlb_opt[0])
 +    {
 +        case 'a': eDLB = edlbAUTO; break;
 +        case 'n': eDLB = edlbNO;   break;
 +        case 'y': eDLB = edlbYES;  break;
 +        default: gmx_incons("Unknown dlb_opt");
 +    }
 +
 +    if (Flags & MD_RERUN)
 +    {
 +        return edlbNO;
 +    }
 +
 +    if (!EI_DYNAMICS(ir->eI))
 +    {
 +        if (eDLB == edlbYES)
 +        {
 +            sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
 +            dd_warning(cr, fplog, buf);
 +        }
 +
 +        return edlbNO;
 +    }
 +
 +    if (!bRecordLoad)
 +    {
 +        dd_warning(cr, fplog, "NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
 +
 +        return edlbNO;
 +    }
 +
 +    if (Flags & MD_REPRODUCIBLE)
 +    {
 +        switch (eDLB)
 +        {
 +            case edlbNO:
 +                break;
 +            case edlbAUTO:
 +                dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
 +                eDLB = edlbNO;
 +                break;
 +            case edlbYES:
 +                dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
 +                break;
 +            default:
 +                gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", eDLB);
 +                break;
 +        }
 +    }
 +
 +    return eDLB;
 +}
 +
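 +/* Fill dd->dim with the dimensions that are actually decomposed
 + * (nc > 1), in x,y,z order, or z,y,x when GMX_DD_ORDER_ZYX is set.
 + * E.g. dd->nc = {4, 2, 1} gives dd->ndim = 2 and dd->dim = {XX, YY}.
 + */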
 +static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
 +{
 +    int dim;
 +
 +    dd->ndim = 0;
 +    if (getenv("GMX_DD_ORDER_ZYX") != NULL)
 +    {
 +        /* Decomposition order z,y,x */
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Using domain decomposition order z, y, x\n");
 +        }
 +        for (dim = DIM-1; dim >= 0; dim--)
 +        {
 +            if (dd->nc[dim] > 1)
 +            {
 +                dd->dim[dd->ndim++] = dim;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        /* Decomposition order x,y,z */
 +        for (dim = 0; dim < DIM; dim++)
 +        {
 +            if (dd->nc[dim] > 1)
 +            {
 +                dd->dim[dd->ndim++] = dim;
 +            }
 +        }
 +    }
 +}
 +
 +static gmx_domdec_comm_t *init_dd_comm()
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                i;
 +
 +    snew(comm, 1);
 +    snew(comm->cggl_flag, DIM*2);
 +    snew(comm->cgcm_state, DIM*2);
 +    for (i = 0; i < DIM*2; i++)
 +    {
 +        comm->cggl_flag_nalloc[i]  = 0;
 +        comm->cgcm_state_nalloc[i] = 0;
 +    }
 +
 +    comm->nalloc_int = 0;
 +    comm->buf_int    = NULL;
 +
 +    vec_rvec_init(&comm->vbuf);
 +
 +    comm->n_load_have    = 0;
 +    comm->n_load_collect = 0;
 +
 +    for (i = 0; i < ddnatNR-ddnatZONE; i++)
 +    {
 +        comm->sum_nat[i] = 0;
 +    }
 +    comm->ndecomp   = 0;
 +    comm->nload     = 0;
 +    comm->load_step = 0;
 +    comm->load_sum  = 0;
 +    comm->load_max  = 0;
 +    clear_ivec(comm->load_lim);
 +    comm->load_mdf  = 0;
 +    comm->load_pme  = 0;
 +
 +    return comm;
 +}
 +
 +gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
 +                                        unsigned long Flags,
 +                                        ivec nc,
 +                                        real comm_distance_min, real rconstr,
 +                                        const char *dlb_opt, real dlb_scale,
 +                                        const char *sizex, const char *sizey, const char *sizez,
 +                                        gmx_mtop_t *mtop, t_inputrec *ir,
 +                                        matrix box, rvec *x,
 +                                        gmx_ddbox_t *ddbox,
 +                                        int *npme_x, int *npme_y)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    int                recload;
 +    real               r_2b, r_mb, r_bonded = -1, r_bonded_limit = -1, limit, acs;
 +    gmx_bool           bC;
 +    char               buf[STRLEN];
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog,
 +                "\nInitializing Domain Decomposition on %d nodes\n", cr->nnodes);
 +    }
 +
 +    snew(dd, 1);
 +
 +    dd->comm = init_dd_comm();
 +    comm     = dd->comm;
 +
 +    dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
 +    dd->bScrewPBC = (ir->ePBC == epbcSCREW);
 +
 +    dd->bSendRecv2      = dd_nst_env(fplog, "GMX_DD_SENDRECV2", 0);
 +    comm->dlb_scale_lim = dd_nst_env(fplog, "GMX_DLB_MAX", 10);
 +    comm->eFlop         = dd_nst_env(fplog, "GMX_DLB_FLOP", 0);
 +    recload             = dd_nst_env(fplog, "GMX_DD_LOAD", 1);
 +    comm->nstSortCG     = dd_nst_env(fplog, "GMX_DD_SORT", 1);
 +    comm->nstDDDump     = dd_nst_env(fplog, "GMX_DD_DUMP", 0);
 +    comm->nstDDDumpGrid = dd_nst_env(fplog, "GMX_DD_DUMP_GRID", 0);
 +    comm->DD_debug      = dd_nst_env(fplog, "GMX_DD_DEBUG", 0);
 +
 +    dd->pme_recv_f_alloc = 0;
 +    dd->pme_recv_f_buf   = NULL;
 +
 +    if (dd->bSendRecv2 && fplog)
 +    {
 +        fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
 +    }
 +    if (comm->eFlop)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Will load balance based on FLOP count\n");
 +        }
 +        if (comm->eFlop > 1)
 +        {
 +            srand(1+cr->nodeid);
 +        }
 +        comm->bRecordLoad = TRUE;
 +    }
 +    else
 +    {
 +        comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
 +    }
 +
 +    comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
 +
 +    comm->bDynLoadBal = (comm->eDLB == edlbYES);
 +    if (fplog)
 +    {
 +        fprintf(fplog, "Dynamic load balancing: %s\n", edlb_names[comm->eDLB]);
 +    }
 +    dd->bGridJump              = comm->bDynLoadBal;
 +    comm->bPMELoadBalDLBLimits = FALSE;
 +
 +    if (comm->nstSortCG)
 +    {
 +        if (fplog)
 +        {
 +            if (comm->nstSortCG == 1)
 +            {
 +                fprintf(fplog, "Will sort the charge groups at every domain (re)decomposition\n");
 +            }
 +            else
 +            {
 +                fprintf(fplog, "Will sort the charge groups every %d steps\n",
 +                        comm->nstSortCG);
 +            }
 +        }
 +        snew(comm->sort, 1);
 +    }
 +    else
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Will not sort the charge groups\n");
 +        }
 +    }
 +
 +    comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
 +
 +    comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
 +    if (comm->bInterCGBondeds)
 +    {
 +        comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
 +    }
 +    else
 +    {
 +        comm->bInterCGMultiBody = FALSE;
 +    }
 +
 +    dd->bInterCGcons    = inter_charge_group_constraints(mtop);
 +    dd->bInterCGsettles = inter_charge_group_settles(mtop);
 +
 +    if (ir->rlistlong == 0)
 +    {
 +        /* Set the cut-off to some very large value,
 +         * so we don't need if statements everywhere in the code.
 +         * We use the square root of the maximum representable value,
 +         * since the cut-off is squared in some places.
 +         */
 +        comm->cutoff   = GMX_CUTOFF_INF;
 +    }
 +    else
 +    {
 +        comm->cutoff   = ir->rlistlong;
 +    }
 +    comm->cutoff_mbody = 0;
 +
 +    comm->cellsize_limit = 0;
 +    comm->bBondComm      = FALSE;
 +
 +    if (comm->bInterCGBondeds)
 +    {
 +        if (comm_distance_min > 0)
 +        {
 +            comm->cutoff_mbody = comm_distance_min;
 +            if (Flags & MD_DDBONDCOMM)
 +            {
 +                comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
 +            }
 +            else
 +            {
 +                comm->cutoff = max(comm->cutoff, comm->cutoff_mbody);
 +            }
 +            r_bonded_limit = comm->cutoff_mbody;
 +        }
 +        else if (ir->bPeriodicMols)
 +        {
 +            /* Cannot easily determine the required cut-off */
 +            dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
 +            comm->cutoff_mbody = comm->cutoff/2;
 +            r_bonded_limit     = comm->cutoff_mbody;
 +        }
 +        else
 +        {
 +            if (MASTER(cr))
 +            {
 +                dd_bonded_cg_distance(fplog, mtop, ir, x, box,
 +                                      Flags & MD_DDBONDCHECK, &r_2b, &r_mb);
 +            }
 +            gmx_bcast(sizeof(r_2b), &r_2b, cr);
 +            gmx_bcast(sizeof(r_mb), &r_mb, cr);
 +
 +            /* We use an initial margin of 10% for the minimum cell size,
 +             * except when we are just below the non-bonded cut-off.
 +             */
 +            if (Flags & MD_DDBONDCOMM)
 +            {
 +                if (max(r_2b, r_mb) > comm->cutoff)
 +                {
 +                    r_bonded        = max(r_2b, r_mb);
 +                    r_bonded_limit  = 1.1*r_bonded;
 +                    comm->bBondComm = TRUE;
 +                }
 +                else
 +                {
 +                    r_bonded       = r_mb;
 +                    r_bonded_limit = min(1.1*r_bonded, comm->cutoff);
 +                }
 +                /* We determine cutoff_mbody later */
 +            }
 +            else
 +            {
 +                /* No special bonded communication,
 +                 * simply increase the DD cut-off.
 +                 */
 +                r_bonded_limit     = 1.1*max(r_2b, r_mb);
 +                comm->cutoff_mbody = r_bonded_limit;
 +                comm->cutoff       = max(comm->cutoff, comm->cutoff_mbody);
 +            }
 +        }
 +        comm->cellsize_limit = max(comm->cellsize_limit, r_bonded_limit);
 +        if (fplog)
 +        {
 +            fprintf(fplog,
 +                    "Minimum cell size due to bonded interactions: %.3f nm\n",
 +                    comm->cellsize_limit);
 +        }
 +    }
 +
 +    if (dd->bInterCGcons && rconstr <= 0)
 +    {
 +        /* There is a cell size limit due to the constraints (P-LINCS) */
 +        rconstr = constr_r_max(fplog, mtop, ir);
 +        if (fplog)
 +        {
 +            fprintf(fplog,
 +                    "Estimated maximum distance required for P-LINCS: %.3f nm\n",
 +                    rconstr);
 +            if (rconstr > comm->cellsize_limit)
 +            {
 +                fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n");
 +            }
 +        }
 +    }
 +    else if (rconstr > 0 && fplog)
 +    {
 +        /* Here we do not check for dd->bInterCGcons,
 +         * because one can also set a cell size limit for virtual sites only
 +         * and at this point we don't know yet if there are intercg v-sites.
 +         */
 +        fprintf(fplog,
 +                "User supplied maximum distance required for P-LINCS: %.3f nm\n",
 +                rconstr);
 +    }
 +    comm->cellsize_limit = max(comm->cellsize_limit, rconstr);
 +
 +    comm->cgs_gl = gmx_mtop_global_cgs(mtop);
 +
 +    if (nc[XX] > 0)
 +    {
 +        copy_ivec(nc, dd->nc);
 +        set_dd_dim(fplog, dd);
 +        set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, x, ddbox);
 +
 +        if (cr->npmenodes == -1)
 +        {
 +            cr->npmenodes = 0;
 +        }
 +        acs = average_cellsize_min(dd, ddbox);
 +        if (acs < comm->cellsize_limit)
 +        {
 +            if (fplog)
 +            {
 +                fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit);
 +            }
 +            gmx_fatal_collective(FARGS, cr, NULL,
 +                                 "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
 +                                 acs, comm->cellsize_limit);
 +        }
 +    }
 +    else
 +    {
 +        set_ddbox_cr(cr, NULL, ir, box, &comm->cgs_gl, x, ddbox);
 +
 +        /* We need to choose the optimal DD grid and possibly PME nodes */
 +        limit = dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
 +                               comm->eDLB != edlbNO, dlb_scale,
 +                               comm->cellsize_limit, comm->cutoff,
 +                               comm->bInterCGBondeds);
 +
 +        if (dd->nc[XX] == 0)
 +        {
 +            bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
 +            sprintf(buf, "Change the number of nodes or mdrun option %s%s%s",
 +                    !bC ? "-rdd" : "-rcon",
 +                    comm->eDLB != edlbNO ? " or -dds" : "",
 +                    bC ? " or your LINCS settings" : "");
 +
 +            gmx_fatal_collective(FARGS, cr, NULL,
 +                                 "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
 +                                 "%s\n"
 +                                 "Look in the log file for details on the domain decomposition",
 +                                 cr->nnodes-cr->npmenodes, limit, buf);
 +        }
 +        set_dd_dim(fplog, dd);
 +    }
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog,
 +                "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
 +                dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
 +    }
 +
 +    dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
 +    if (cr->nnodes - dd->nnodes != cr->npmenodes)
 +    {
 +        gmx_fatal_collective(FARGS, cr, NULL,
 +                             "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
 +                             dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
 +    }
 +    if (cr->npmenodes > dd->nnodes)
 +    {
 +        gmx_fatal_collective(FARGS, cr, NULL,
 +                             "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.", cr->npmenodes, dd->nnodes);
 +    }
 +    if (cr->npmenodes > 0)
 +    {
 +        comm->npmenodes = cr->npmenodes;
 +    }
 +    else
 +    {
 +        comm->npmenodes = dd->nnodes;
 +    }
 +
 +    if (EEL_PME(ir->coulombtype))
 +    {
 +        /* The following choices should match those
 +         * in comm_cost_est in domdec_setup.c.
 +         * Note that here the checks have to take into account
 +         * that the decomposition might occur in a different order than xyz
 +         * (for instance through the env.var. GMX_DD_ORDER_ZYX),
 +         * in which case they will not match those in comm_cost_est,
 +         * but since that is mainly for testing purposes that's fine.
 +         */
 +        if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
 +            comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
 +            getenv("GMX_PMEONEDD") == NULL)
 +        {
 +            comm->npmedecompdim = 2;
 +            comm->npmenodes_x   = dd->nc[XX];
 +            comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
 +        }
 +        else
 +        {
 +            /* In case nc is 1 in both x and y we could still choose to
 +             * decompose PME in y instead of x, but we use x for simplicity.
 +             */
 +            comm->npmedecompdim = 1;
 +            if (dd->dim[0] == YY)
 +            {
 +                comm->npmenodes_x = 1;
 +                comm->npmenodes_y = comm->npmenodes;
 +            }
 +            else
 +            {
 +                comm->npmenodes_x = comm->npmenodes;
 +                comm->npmenodes_y = 1;
 +            }
 +        }
 +        if (fplog)
 +        {
 +            fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
 +                    comm->npmenodes_x, comm->npmenodes_y, 1);
 +        }
 +    }
 +    else
 +    {
 +        comm->npmedecompdim = 0;
 +        comm->npmenodes_x   = 0;
 +        comm->npmenodes_y   = 0;
 +    }
 +
 +    /* Technically we don't need both of these,
 +     * but it simplifies the code not to have to recalculate them.
 +     */
 +    *npme_x = comm->npmenodes_x;
 +    *npme_y = comm->npmenodes_y;
 +
 +    snew(comm->slb_frac, DIM);
 +    if (comm->eDLB == edlbNO)
 +    {
 +        comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
 +        comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
 +        comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], sizez);
 +    }
 +
 +    if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
 +    {
 +        if (comm->bBondComm || comm->eDLB != edlbNO)
 +        {
 +            /* Set the bonded communication distance to halfway
 +             * between the minimum and the maximum,
 +             * since the extra communication cost is nearly zero.
 +             */
 +            acs                = average_cellsize_min(dd, ddbox);
 +            comm->cutoff_mbody = 0.5*(r_bonded + acs);
 +            if (comm->eDLB != edlbNO)
 +            {
 +                /* Check if this does not limit the scaling */
 +                comm->cutoff_mbody = min(comm->cutoff_mbody, dlb_scale*acs);
 +            }
 +            if (!comm->bBondComm)
 +            {
 +                /* Without bBondComm do not go beyond the n.b. cut-off */
 +                comm->cutoff_mbody = min(comm->cutoff_mbody, comm->cutoff);
 +                if (comm->cellsize_limit >= comm->cutoff)
 +                {
 +                    /* We don't lose much efficiency
 +                     * when increasing it to the n.b. cut-off.
 +                     * It can even be slightly faster, because we need
 +                     * fewer checks for the communication setup.
 +                     */
 +                    comm->cutoff_mbody = comm->cutoff;
 +                }
 +            }
 +            /* Check if we did not end up below our original limit */
 +            comm->cutoff_mbody = max(comm->cutoff_mbody, r_bonded_limit);
 +
 +            if (comm->cutoff_mbody > comm->cellsize_limit)
 +            {
 +                comm->cellsize_limit = comm->cutoff_mbody;
 +            }
 +        }
 +        /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
 +                "cellsize limit %f\n",
 +                comm->bBondComm, comm->cellsize_limit);
 +    }
 +
 +    if (MASTER(cr))
 +    {
 +        check_dd_restrictions(cr, dd, ir, fplog);
 +    }
 +
 +    comm->partition_step = INT_MIN;
 +    dd->ddp_count        = 0;
 +
 +    clear_dd_cycle_counts(dd);
 +
 +    return dd;
 +}
 +
 +static void set_dlb_limits(gmx_domdec_t *dd)
 +{
 +    int d;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dd->comm->cd[d].np                 = dd->comm->cd[d].np_dlb;
 +        dd->comm->cellsize_min[dd->dim[d]] =
 +            dd->comm->cellsize_min_dlb[dd->dim[d]];
 +    }
 +}
 +
 +
 +static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_large_int_t step)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    real               cellsize_min;
 +    int                d, nc, i;
 +    char               buf[STRLEN];
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog, "At step %s the performance loss due to force load imbalance is %.1f %%\n", gmx_step_str(step, buf), dd_force_imb_perf_loss(dd)*100);
 +    }
 +
 +    cellsize_min = comm->cellsize_min[dd->dim[0]];
 +    for (d = 1; d < dd->ndim; d++)
 +    {
 +        cellsize_min = min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
 +    }
 +
 +    if (cellsize_min < comm->cellsize_limit*1.05)
 +    {
 +        dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
 +
 +        /* Change DLB from "auto" to "no". */
 +        comm->eDLB = edlbNO;
 +
 +        return;
 +    }
 +
 +    dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n");
 +    comm->bDynLoadBal = TRUE;
 +    dd->bGridJump     = TRUE;
 +
 +    set_dlb_limits(dd);
 +
 +    /* We can set the required cell size info here,
 +     * so we do not need to communicate this.
 +     * The grid is completely uniform.
 +     */
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        if (comm->root[d])
 +        {
 +            comm->load[d].sum_m = comm->load[d].sum;
 +
 +            nc = dd->nc[dd->dim[d]];
 +            for (i = 0; i < nc; i++)
 +            {
 +                comm->root[d]->cell_f[i]    = i/(real)nc;
 +                if (d > 0)
 +                {
 +                    comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
 +                    comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
 +                }
 +            }
 +            comm->root[d]->cell_f[nc] = 1.0;
 +        }
 +    }
 +}
 +
 +static char *init_bLocalCG(gmx_mtop_t *mtop)
 +{
 +    int   ncg, cg;
 +    char *bLocalCG;
 +
 +    ncg = ncg_mtop(mtop);
 +    snew(bLocalCG, ncg);
 +    for (cg = 0; cg < ncg; cg++)
 +    {
 +        bLocalCG[cg] = FALSE;
 +    }
 +
 +    return bLocalCG;
 +}
 +
 +void dd_init_bondeds(FILE *fplog,
 +                     gmx_domdec_t *dd, gmx_mtop_t *mtop,
 +                     gmx_vsite_t *vsite,
 +                     t_inputrec *ir, gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
 +{
 +    gmx_domdec_comm_t *comm;
 +
 +    dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
 +
 +    comm = dd->comm;
 +
 +    if (comm->bBondComm)
 +    {
 +        /* Communicate atoms beyond the cut-off for bonded interactions */
 +        comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
 +
 +        comm->bLocalCG = init_bLocalCG(mtop);
 +    }
 +    else
 +    {
 +        /* Only communicate atoms based on cut-off */
 +        comm->cglink   = NULL;
 +        comm->bLocalCG = NULL;
 +    }
 +}
 +
 +static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
 +                              t_inputrec *ir,
 +                              gmx_bool bDynLoadBal, real dlb_scale,
 +                              gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d;
 +    ivec               np;
 +    real               limit, shrink;
 +    char               buf[64];
 +
 +    if (fplog == NULL)
 +    {
 +        return;
 +    }
 +
 +    comm = dd->comm;
 +
 +    if (bDynLoadBal)
 +    {
 +        fprintf(fplog, "The maximum number of communication pulses is:");
 +        for (d = 0; d < dd->ndim; d++)
 +        {
 +            fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
 +        }
 +        fprintf(fplog, "\n");
 +        fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit);
 +        fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale);
 +        fprintf(fplog, "The allowed shrink of domain decomposition cells is:");
 +        for (d = 0; d < DIM; d++)
 +        {
 +            if (dd->nc[d] > 1)
 +            {
 +                if (d >= ddbox->npbcdim && dd->nc[d] == 2)
 +                {
 +                    shrink = 0;
 +                }
 +                else
 +                {
 +                    shrink =
 +                        comm->cellsize_min_dlb[d]/
 +                        (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
 +                }
 +                fprintf(fplog, " %c %.2f", dim2char(d), shrink);
 +            }
 +        }
 +        fprintf(fplog, "\n");
 +    }
 +    else
 +    {
 +        set_dd_cell_sizes_slb(dd, ddbox, FALSE, np);
 +        fprintf(fplog, "The initial number of communication pulses is:");
 +        for (d = 0; d < dd->ndim; d++)
 +        {
 +            fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
 +        }
 +        fprintf(fplog, "\n");
 +        fprintf(fplog, "The initial domain decomposition cell size is:");
 +        for (d = 0; d < DIM; d++)
 +        {
 +            if (dd->nc[d] > 1)
 +            {
 +                fprintf(fplog, " %c %.2f nm",
 +                        dim2char(d), dd->comm->cellsize_min[d]);
 +            }
 +        }
 +        fprintf(fplog, "\n\n");
 +    }
 +
 +    if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
 +    {
 +        fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n");
 +        fprintf(fplog, "%40s  %-7s %6.3f nm\n",
 +                "non-bonded interactions", "", comm->cutoff);
 +
 +        if (bDynLoadBal)
 +        {
 +            limit = dd->comm->cellsize_limit;
 +        }
 +        else
 +        {
 +            if (dynamic_dd_box(ddbox, ir))
 +            {
 +                fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n");
 +            }
 +            limit = dd->comm->cellsize_min[XX];
 +            for (d = 1; d < DIM; d++)
 +            {
 +                limit = min(limit, dd->comm->cellsize_min[d]);
 +            }
 +        }
 +
 +        if (comm->bInterCGBondeds)
 +        {
 +            fprintf(fplog, "%40s  %-7s %6.3f nm\n",
 +                    "two-body bonded interactions", "(-rdd)",
 +                    max(comm->cutoff, comm->cutoff_mbody));
 +            fprintf(fplog, "%40s  %-7s %6.3f nm\n",
 +                    "multi-body bonded interactions", "(-rdd)",
 +                    (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff, limit));
 +        }
 +        if (dd->vsite_comm)
 +        {
 +            fprintf(fplog, "%40s  %-7s %6.3f nm\n",
 +                    "virtual site constructions", "(-rcon)", limit);
 +        }
 +        if (dd->constraint_comm)
 +        {
 +            sprintf(buf, "atoms separated by up to %d constraints",
 +                    1+ir->nProjOrder);
 +            fprintf(fplog, "%40s  %-7s %6.3f nm\n",
 +                    buf, "(-rcon)", limit);
 +        }
 +        fprintf(fplog, "\n");
 +    }
 +
 +    fflush(fplog);
 +}
 +
 +static void set_cell_limits_dlb(gmx_domdec_t      *dd,
 +                                real               dlb_scale,
 +                                const t_inputrec  *ir,
 +                                const gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d, dim, npulse, npulse_d_max, npulse_d;
 +    gmx_bool           bNoCutOff;
 +
 +    comm = dd->comm;
 +
 +    bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
 +
 +    /* Determine the maximum number of comm. pulses in one dimension */
 +
 +    comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
 +
 +    /* Determine the maximum required number of grid pulses */
 +    if (comm->cellsize_limit >= comm->cutoff)
 +    {
 +        /* Only a single pulse is required */
 +        npulse = 1;
 +    }
 +    else if (!bNoCutOff && comm->cellsize_limit > 0)
 +    {
 +        /* We round down slightly here to avoid overhead due to the latency
 +         * of extra communication calls when the cut-off
 +         * would be only slightly longer than the cell size.
 +         * Later cellsize_limit is redetermined,
 +         * so we cannot miss interactions due to this rounding.
 +         */
 +        npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
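 +        /* E.g. cutoff = 1.2 nm with cellsize_limit = 0.63 nm gives
 +         * npulse = (int)(0.96 + 1.905) = 2.
 +         */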
 +    }
 +    else
 +    {
 +        /* There is no cell size limit */
 +        npulse = max(dd->nc[XX]-1, max(dd->nc[YY]-1, dd->nc[ZZ]-1));
 +    }
 +
 +    if (!bNoCutOff && npulse > 1)
 +    {
 +        /* See if we can do with less pulses, based on dlb_scale */
 +        npulse_d_max = 0;
 +        for (d = 0; d < dd->ndim; d++)
 +        {
 +            dim      = dd->dim[d];
 +            npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
 +                             /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
 +            npulse_d_max = max(npulse_d_max, npulse_d);
 +        }
 +        npulse = min(npulse, npulse_d_max);
 +    }
 +
 +    /* This env var can override npulse */
 +    d = dd_nst_env(debug, "GMX_DD_NPULSE", 0);
 +    if (d > 0)
 +    {
 +        npulse = d;
 +    }
 +
 +    comm->maxpulse       = 1;
 +    comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        comm->cd[d].np_dlb    = min(npulse, dd->nc[dd->dim[d]]-1);
 +        comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
 +        snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
 +        comm->maxpulse = max(comm->maxpulse, comm->cd[d].np_dlb);
 +        if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
 +        {
 +            comm->bVacDLBNoLimit = FALSE;
 +        }
 +    }
 +
 +    /* cellsize_limit is set for LINCS in init_domain_decomposition */
 +    if (!comm->bVacDLBNoLimit)
 +    {
 +        comm->cellsize_limit = max(comm->cellsize_limit,
 +                                   comm->cutoff/comm->maxpulse);
 +    }
 +    comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
 +    /* Set the minimum cell size for each DD dimension */
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        if (comm->bVacDLBNoLimit ||
 +            comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
 +        {
 +            comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
 +        }
 +        else
 +        {
 +            comm->cellsize_min_dlb[dd->dim[d]] =
 +                comm->cutoff/comm->cd[d].np_dlb;
 +        }
 +    }
 +    if (comm->cutoff_mbody <= 0)
 +    {
 +        comm->cutoff_mbody = min(comm->cutoff, comm->cellsize_limit);
 +    }
 +    if (comm->bDynLoadBal)
 +    {
 +        set_dlb_limits(dd);
 +    }
 +}
 +
 +gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd, int ePBC)
 +{
 +    /* If each molecule is a single charge group
 +     * or we use domain decomposition for each periodic dimension,
 +     * we do not need to take pbc into account for the bonded interactions.
 +     */
 +    return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
 +            !(dd->nc[XX] > 1 &&
 +              dd->nc[YY] > 1 &&
 +              (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
 +}
 +
 +void set_dd_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
 +                       t_inputrec *ir, gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                natoms_tot;
 +    real               vol_frac;
 +
 +    comm = dd->comm;
 +
 +    /* Initialize the thread data.
 +     * This can not be done in init_domain_decomposition,
 +     * as the numbers of threads is determined later.
 +     */
 +    comm->nth = gmx_omp_nthreads_get(emntDomdec);
 +    if (comm->nth > 1)
 +    {
 +        snew(comm->dth, comm->nth);
 +    }
 +
 +    if (EEL_PME(ir->coulombtype))
 +    {
 +        init_ddpme(dd, &comm->ddpme[0], 0);
 +        if (comm->npmedecompdim >= 2)
 +        {
 +            init_ddpme(dd, &comm->ddpme[1], 1);
 +        }
 +    }
 +    else
 +    {
 +        comm->npmenodes = 0;
 +        if (dd->pme_nodeid >= 0)
 +        {
 +            gmx_fatal_collective(FARGS, NULL, dd,
 +                                 "Can not have separate PME nodes without PME electrostatics");
 +        }
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
 +    }
 +    if (comm->eDLB != edlbNO)
 +    {
 +        set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
 +    }
 +
 +    print_dd_settings(fplog, dd, ir, comm->bDynLoadBal, dlb_scale, ddbox);
 +    if (comm->eDLB == edlbAUTO)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n");
 +        }
 +        print_dd_settings(fplog, dd, ir, TRUE, dlb_scale, ddbox);
 +    }
 +
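 +    /* Estimate the fraction of the total volume, and thus of all atoms,
 +     * that a single domain sees: the home zone plus the communicated
 +     * zones. It is used below to size the global-to-local atom table.
 +     */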
 +    if (ir->ePBC == epbcNONE)
 +    {
 +        vol_frac = 1 - 1/(double)dd->nnodes;
 +    }
 +    else
 +    {
 +        vol_frac =
 +            (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes;
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
 +    }
 +    natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
 +
 +    dd->ga2la = ga2la_init(natoms_tot, vol_frac*natoms_tot);
 +}
 +
 +static gmx_bool test_dd_cutoff(t_commrec *cr,
 +                               t_state *state, t_inputrec *ir,
 +                               real cutoff_req)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_ddbox_t   ddbox;
 +    int           d, dim, np;
 +    real          inv_cell_size;
 +    int           LocallyLimited;
 +
 +    dd = cr->dd;
 +
 +    set_ddbox(dd, FALSE, cr, ir, state->box,
 +              TRUE, &dd->comm->cgs_gl, state->x, &ddbox);
 +
 +    LocallyLimited = 0;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +
 +        inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
 +        if (dynamic_dd_box(&ddbox, ir))
 +        {
 +            inv_cell_size *= DD_PRES_SCALE_MARGIN;
 +        }
 +
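 +        /* The number of communication pulses needed in this dimension
 +         * to cover the requested cut-off.
 +         */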
 +        np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
 +
 +        if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
 +            dd->comm->cd[d].np_dlb > 0)
 +        {
 +            if (np > dd->comm->cd[d].np_dlb)
 +            {
 +                return FALSE;
 +            }
 +
 +            /* If a current local cell size is smaller than the requested
 +             * cut-off, we could still fix it, but this gets very complicated.
 +             * Without fixing it here, we might actually need more checks.
 +             */
 +            if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
 +            {
 +                LocallyLimited = 1;
 +            }
 +        }
 +    }
 +
 +    if (dd->comm->eDLB != edlbNO)
 +    {
 +        /* If DLB is not active yet, we don't need to check the grid jumps.
 +         * Actually we shouldn't, because then the grid jump data is not set.
 +         */
 +        if (dd->comm->bDynLoadBal &&
 +            check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
 +        {
 +            LocallyLimited = 1;
 +        }
 +
 +        gmx_sumi(1, &LocallyLimited, cr);
 +
 +        if (LocallyLimited > 0)
 +        {
 +            return FALSE;
 +        }
 +    }
 +
 +    return TRUE;
 +}
 +
 +gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, t_inputrec *ir,
 +                          real cutoff_req)
 +{
 +    gmx_bool bCutoffAllowed;
 +
 +    bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req);
 +
 +    if (bCutoffAllowed)
 +    {
 +        cr->dd->comm->cutoff = cutoff_req;
 +    }
 +
 +    return bCutoffAllowed;
 +}
 +
 +void change_dd_dlb_cutoff_limit(t_commrec *cr)
 +{
 +    gmx_domdec_comm_t *comm;
 +
 +    comm = cr->dd->comm;
 +
 +    /* Turn on the DLB limiting (might have been on already) */
 +    comm->bPMELoadBalDLBLimits = TRUE;
 +
 +    /* Change the cut-off limit */
 +    comm->PMELoadBal_max_cutoff = comm->cutoff;
 +}
 +
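 +/* Merge the charge groups received in this pulse into the zone arrays,
 + * first shifting the entries stored by earlier pulses so that each
 + * zone stays contiguous in index_gl, cg_cm, cgindex and cginfo.
 + */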
 +static void merge_cg_buffers(int ncell,
 +                             gmx_domdec_comm_dim_t *cd, int pulse,
 +                             int  *ncg_cell,
 +                             int  *index_gl, int  *recv_i,
 +                             rvec *cg_cm,    rvec *recv_vr,
 +                             int *cgindex,
 +                             cginfo_mb_t *cginfo_mb, int *cginfo)
 +{
 +    gmx_domdec_ind_t *ind, *ind_p;
 +    int               p, cell, c, cg, cg0, cg1, cg_gl, nat;
 +    int               shift, shift_at;
 +
 +    ind = &cd->ind[pulse];
 +
 +    /* First correct the already stored data */
 +    shift = ind->nrecv[ncell];
 +    for (cell = ncell-1; cell >= 0; cell--)
 +    {
 +        shift -= ind->nrecv[cell];
 +        if (shift > 0)
 +        {
 +            /* Move the cg's present from previous grid pulses */
 +            cg0                = ncg_cell[ncell+cell];
 +            cg1                = ncg_cell[ncell+cell+1];
 +            cgindex[cg1+shift] = cgindex[cg1];
 +            for (cg = cg1-1; cg >= cg0; cg--)
 +            {
 +                index_gl[cg+shift] = index_gl[cg];
 +                copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
 +                cgindex[cg+shift] = cgindex[cg];
 +                cginfo[cg+shift]  = cginfo[cg];
 +            }
 +            /* Correct the already stored send indices for the shift */
 +            for (p = 1; p <= pulse; p++)
 +            {
 +                ind_p = &cd->ind[p];
 +                cg0   = 0;
 +                for (c = 0; c < cell; c++)
 +                {
 +                    cg0 += ind_p->nsend[c];
 +                }
 +                cg1 = cg0 + ind_p->nsend[cell];
 +                for (cg = cg0; cg < cg1; cg++)
 +                {
 +                    ind_p->index[cg] += shift;
 +                }
 +            }
 +        }
 +    }
 +
 +    /* Merge in the communicated buffers */
 +    shift    = 0;
 +    shift_at = 0;
 +    cg0      = 0;
 +    for (cell = 0; cell < ncell; cell++)
 +    {
 +        cg1 = ncg_cell[ncell+cell+1] + shift;
 +        if (shift_at > 0)
 +        {
 +            /* Correct the old cg indices */
 +            for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
 +            {
 +                cgindex[cg+1] += shift_at;
 +            }
 +        }
 +        for (cg = 0; cg < ind->nrecv[cell]; cg++)
 +        {
 +            /* Copy this charge group from the buffer */
 +            index_gl[cg1] = recv_i[cg0];
 +            copy_rvec(recv_vr[cg0], cg_cm[cg1]);
 +            /* Add it to the cgindex */
 +            cg_gl          = index_gl[cg1];
 +            cginfo[cg1]    = ddcginfo(cginfo_mb, cg_gl);
 +            nat            = GET_CGINFO_NATOMS(cginfo[cg1]);
 +            cgindex[cg1+1] = cgindex[cg1] + nat;
 +            cg0++;
 +            cg1++;
 +            shift_at += nat;
 +        }
 +        shift                 += ind->nrecv[cell];
 +        ncg_cell[ncell+cell+1] = cg1;
 +    }
 +}
 +
 +static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
 +                               int nzone, int cg0, const int *cgindex)
 +{
 +    int cg, zone, p;
 +
 +    /* Store the atom block boundaries for easy copying of communication buffers */
 +    cg = cg0;
 +    for (zone = 0; zone < nzone; zone++)
 +    {
 +        for (p = 0; p < cd->np; p++)
 +        {
 +            cd->ind[p].cell2at0[zone] = cgindex[cg];
 +            cg += cd->ind[p].nrecv[zone];
 +            cd->ind[p].cell2at1[zone] = cgindex[cg];
 +        }
 +    }
 +}
 +
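 +/* Return TRUE when charge group cg_gl is linked through bonded
 + * interactions to at least one charge group that is not local.
 + */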
 +static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
 +{
 +    int      i;
 +    gmx_bool bMiss;
 +
 +    bMiss = FALSE;
 +    for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
 +    {
 +        if (!bLocalCG[link->a[i]])
 +        {
 +            bMiss = TRUE;
 +        }
 +    }
 +
 +    return bMiss;
 +}
 +
 +/* Domain corners for communication, a maximum of 4 i-zones see a j domain */
 +typedef struct {
 +    real c[DIM][4]; /* the corners for the non-bonded communication */
 +    real cr0;       /* corner for rounding */
 +    real cr1[4];    /* corners for rounding */
 +    real bc[DIM];   /* corners for bonded communication */
 +    real bcr1;      /* corner for rounding for bonded communication */
 +} dd_corners_t;
 +
 +/* Determine the corners of the domain(s) we are communicating with */
 +static void
 +set_dd_corners(const gmx_domdec_t *dd,
 +               int dim0, int dim1, int dim2,
 +               gmx_bool bDistMB,
 +               dd_corners_t *c)
 +{
 +    const gmx_domdec_comm_t  *comm;
 +    const gmx_domdec_zones_t *zones;
 +    int i, j;
 +
 +    comm = dd->comm;
 +
 +    zones = &comm->zones;
 +
 +    /* Keep the compiler happy */
 +    c->cr0  = 0;
 +    c->bcr1 = 0;
 +
 +    /* The first dimension is equal for all cells */
 +    c->c[0][0] = comm->cell_x0[dim0];
 +    if (bDistMB)
 +    {
 +        c->bc[0] = c->c[0][0];
 +    }
 +    if (dd->ndim >= 2)
 +    {
 +        dim1 = dd->dim[1];
 +        /* This cell row is only seen from the first row */
 +        c->c[1][0] = comm->cell_x0[dim1];
 +        /* All rows can see this row */
 +        c->c[1][1] = comm->cell_x0[dim1];
 +        if (dd->bGridJump)
 +        {
 +            c->c[1][1] = max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
 +            if (bDistMB)
 +            {
 +                /* For the multi-body distance we need the maximum */
 +                c->bc[1] = max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
 +            }
 +        }
 +        /* Set the upper-right corner for rounding */
 +        c->cr0 = comm->cell_x1[dim0];
 +
 +        if (dd->ndim >= 3)
 +        {
 +            dim2 = dd->dim[2];
 +            for (j = 0; j < 4; j++)
 +            {
 +                c->c[2][j] = comm->cell_x0[dim2];
 +            }
 +            if (dd->bGridJump)
 +            {
 +                /* Use the maximum of the i-cells that see a j-cell */
 +                for (i = 0; i < zones->nizone; i++)
 +                {
 +                    for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
 +                    {
 +                        if (j >= 4)
 +                        {
 +                            c->c[2][j-4] =
 +                                max(c->c[2][j-4],
 +                                    comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
 +                        }
 +                    }
 +                }
 +                if (bDistMB)
 +                {
 +                    /* For the multi-body distance we need the maximum */
 +                    c->bc[2] = comm->cell_x0[dim2];
 +                    for (i = 0; i < 2; i++)
 +                    {
 +                        for (j = 0; j < 2; j++)
 +                        {
 +                            c->bc[2] = max(c->bc[2], comm->zone_d2[i][j].p1_0);
 +                        }
 +                    }
 +                }
 +            }
 +
 +            /* Set the upper-right corner for rounding */
 +            /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
 +             * Only cell (0,0,0) can see cell 7 (1,1,1)
 +             */
 +            c->cr1[0] = comm->cell_x1[dim1];
 +            c->cr1[3] = comm->cell_x1[dim1];
 +            if (dd->bGridJump)
 +            {
 +                c->cr1[0] = max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
 +                if (bDistMB)
 +                {
 +                    /* For the multi-body distance we need the maximum */
 +                    c->bcr1 = max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +/* Determine which cg's we need to send in this pulse from this zone */
 +static void
 +get_zone_pulse_cgs(gmx_domdec_t *dd,
 +                   int zonei, int zone,
 +                   int cg0, int cg1,
 +                   const int *index_gl,
 +                   const int *cgindex,
 +                   int dim, int dim_ind,
 +                   int dim0, int dim1, int dim2,
 +                   real r_comm2, real r_bcomm2,
 +                   matrix box,
 +                   ivec tric_dist,
 +                   rvec *normal,
 +                   real skew_fac2_d, real skew_fac_01,
 +                   rvec *v_d, rvec *v_0, rvec *v_1,
 +                   const dd_corners_t *c,
 +                   rvec sf2_round,
 +                   gmx_bool bDistBonded,
 +                   gmx_bool bBondComm,
 +                   gmx_bool bDist2B,
 +                   gmx_bool bDistMB,
 +                   rvec *cg_cm,
 +                   int *cginfo,
 +                   gmx_domdec_ind_t *ind,
 +                   int **ibuf, int *ibuf_nalloc,
 +                   vec_rvec_t *vbuf,
 +                   int *nsend_ptr,
 +                   int *nat_ptr,
 +                   int *nsend_z_ptr)
 +{
 +    gmx_domdec_comm_t *comm;
 +    gmx_bool           bScrew;
 +    gmx_bool           bDistMB_pulse;
 +    int                cg, i;
 +    real               r2, rb2, r, tric_sh;
 +    rvec               rn, rb;
 +    int                dimd;
 +    int                nsend_z, nsend, nat;
 +
 +    comm = dd->comm;
 +
 +    bScrew = (dd->bScrewPBC && dim == XX);
 +
 +    bDistMB_pulse = (bDistMB && bDistBonded);
 +
 +    nsend_z = 0;
 +    nsend   = *nsend_ptr;
 +    nat     = *nat_ptr;
 +
 +    for (cg = cg0; cg < cg1; cg++)
 +    {
 +        r2  = 0;
 +        rb2 = 0;
 +        if (tric_dist[dim_ind] == 0)
 +        {
 +            /* Rectangular direction, easy */
 +            r = cg_cm[cg][dim] - c->c[dim_ind][zone];
 +            if (r > 0)
 +            {
 +                r2 += r*r;
 +            }
 +            if (bDistMB_pulse)
 +            {
 +                r = cg_cm[cg][dim] - c->bc[dim_ind];
 +                if (r > 0)
 +                {
 +                    rb2 += r*r;
 +                }
 +            }
 +            /* Rounding gives at most a 16% reduction
 +             * in communicated atoms.
 +             */
 +            if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
 +            {
 +                r = cg_cm[cg][dim0] - c->cr0;
 +                /* This is the first dimension, so always r >= 0 */
 +                r2 += r*r;
 +                if (bDistMB_pulse)
 +                {
 +                    rb2 += r*r;
 +                }
 +            }
 +            if (dim_ind == 2 && (zonei == 2 || zonei == 3))
 +            {
 +                r = cg_cm[cg][dim1] - c->cr1[zone];
 +                if (r > 0)
 +                {
 +                    r2 += r*r;
 +                }
 +                if (bDistMB_pulse)
 +                {
 +                    r = cg_cm[cg][dim1] - c->bcr1;
 +                    if (r > 0)
 +                    {
 +                        rb2 += r*r;
 +                    }
 +                }
 +            }
 +        }
 +        else
 +        {
 +            /* Triclinic direction, more complicated */
 +            clear_rvec(rn);
 +            clear_rvec(rb);
 +            /* Rounding, conservative as the skew_fac multiplication
 +             * will slightly underestimate the distance.
 +             */
 +            if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
 +            {
 +                rn[dim0] = cg_cm[cg][dim0] - c->cr0;
 +                for (i = dim0+1; i < DIM; i++)
 +                {
 +                    rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
 +                }
 +                r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
 +                if (bDistMB_pulse)
 +                {
 +                    rb[dim0] = rn[dim0];
 +                    rb2      = r2;
 +                }
 +                /* Take care that the cell planes along dim0 might not
 +                 * be orthogonal to those along dim1 and dim2.
 +                 */
 +                for (i = 1; i <= dim_ind; i++)
 +                {
 +                    dimd = dd->dim[i];
 +                    if (normal[dim0][dimd] > 0)
 +                    {
 +                        rn[dimd] -= rn[dim0]*normal[dim0][dimd];
 +                        if (bDistMB_pulse)
 +                        {
 +                            rb[dimd] -= rb[dim0]*normal[dim0][dimd];
 +                        }
 +                    }
 +                }
 +            }
 +            if (dim_ind == 2 && (zonei == 2 || zonei == 3))
 +            {
 +                rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
 +                tric_sh   = 0;
 +                for (i = dim1+1; i < DIM; i++)
 +                {
 +                    tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
 +                }
 +                rn[dim1] += tric_sh;
 +                if (rn[dim1] > 0)
 +                {
 +                    r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
 +                    /* Take care of coupling of the distances
 +                     * to the planes along dim0 and dim1 through dim2.
 +                     */
 +                    r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
 +                    /* Take care that the cell planes along dim1
 +                     * might not be orthogonal to that along dim2.
 +                     */
 +                    if (normal[dim1][dim2] > 0)
 +                    {
 +                        rn[dim2] -= rn[dim1]*normal[dim1][dim2];
 +                    }
 +                }
 +                if (bDistMB_pulse)
 +                {
 +                    rb[dim1] +=
 +                        cg_cm[cg][dim1] - c->bcr1 + tric_sh;
 +                    if (rb[dim1] > 0)
 +                    {
 +                        rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
 +                        /* Take care of coupling of the distances
 +                         * to the planes along dim0 and dim1 through dim2.
 +                         */
 +                        rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
 +                        /* Take care that the cell planes along dim1
 +                         * might not be orthogonal to that along dim2.
 +                         */
 +                        if (normal[dim1][dim2] > 0)
 +                        {
 +                            rb[dim2] -= rb[dim1]*normal[dim1][dim2];
 +                        }
 +                    }
 +                }
 +            }
 +            /* The distance along the communication direction */
 +            rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
 +            tric_sh  = 0;
 +            for (i = dim+1; i < DIM; i++)
 +            {
 +                tric_sh -= cg_cm[cg][i]*v_d[i][dim];
 +            }
 +            rn[dim] += tric_sh;
 +            if (rn[dim] > 0)
 +            {
 +                r2 += rn[dim]*rn[dim]*skew_fac2_d;
 +                /* Take care of coupling of the distances
 +                 * to the planes along dim0 and dim1 through dim2.
 +                 */
 +                if (dim_ind == 1 && zonei == 1)
 +                {
 +                    r2 -= rn[dim0]*rn[dim]*skew_fac_01;
 +                }
 +            }
 +            if (bDistMB_pulse)
 +            {
 +                clear_rvec(rb);
 +                rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
 +                if (rb[dim] > 0)
 +                {
 +                    rb2 += rb[dim]*rb[dim]*skew_fac2_d;
 +                    /* Take care of coupling of the distances
 +                     * to the planes along dim0 and dim1 through dim2.
 +                     */
 +                    if (dim_ind == 1 && zonei == 1)
 +                    {
 +                        rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
 +                    }
 +                }
 +            }
 +        }
 +
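 +        /* Send this charge group when it is within the non-bonded
 +         * cut-off or, when checking bonded distances, within the
 +         * bonded cut-off and, with bBondComm, linked to a charge
 +         * group that is missing locally.
 +         */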
 +        if (r2 < r_comm2 ||
 +            (bDistBonded &&
 +             ((bDistMB && rb2 < r_bcomm2) ||
 +              (bDist2B && r2  < r_bcomm2)) &&
 +             (!bBondComm ||
 +              (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
 +               missing_link(comm->cglink, index_gl[cg],
 +                            comm->bLocalCG)))))
 +        {
 +            /* Make an index to the local charge groups */
 +            if (nsend+1 > ind->nalloc)
 +            {
 +                ind->nalloc = over_alloc_large(nsend+1);
 +                srenew(ind->index, ind->nalloc);
 +            }
 +            if (nsend+1 > *ibuf_nalloc)
 +            {
 +                *ibuf_nalloc = over_alloc_large(nsend+1);
 +                srenew(*ibuf, *ibuf_nalloc);
 +            }
 +            ind->index[nsend] = cg;
 +            (*ibuf)[nsend]    = index_gl[cg];
 +            nsend_z++;
 +            vec_rvec_check_alloc(vbuf, nsend+1);
 +
 +            if (dd->ci[dim] == 0)
 +            {
 +                /* Correct cg_cm for pbc */
 +                rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
 +                if (bScrew)
 +                {
 +                    vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
 +                    vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
 +                }
 +            }
 +            else
 +            {
 +                copy_rvec(cg_cm[cg], vbuf->v[nsend]);
 +            }
 +            nsend++;
 +            nat += cgindex[cg+1] - cgindex[cg];
 +        }
 +    }
 +
 +    *nsend_ptr   = nsend;
 +    *nat_ptr     = nat;
 +    *nsend_z_ptr = nsend_z;
 +}
 +
 +static void setup_dd_communication(gmx_domdec_t *dd,
 +                                   matrix box, gmx_ddbox_t *ddbox,
 +                                   t_forcerec *fr, t_state *state, rvec **f)
 +{
 +    int                    dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
 +    int                    nzone, nzone_send, zone, zonei, cg0, cg1;
 +    int                    c, i, j, cg, cg_gl, nrcg;
 +    int                   *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
 +    gmx_domdec_comm_t     *comm;
 +    gmx_domdec_zones_t    *zones;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t      *ind;
 +    cginfo_mb_t           *cginfo_mb;
 +    gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
 +    real                   r_mb, r_comm2, r_scomm2, r_bcomm2, r_0, r_1, r2inc, inv_ncg;
 +    dd_corners_t           corners;
 +    ivec                   tric_dist;
 +    rvec                  *cg_cm, *normal, *v_d, *v_0 = NULL, *v_1 = NULL, *recv_vr;
 +    real                   skew_fac2_d, skew_fac_01;
 +    rvec                   sf2_round;
 +    int                    nsend, nat;
 +    int                    th;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Setting up DD communication\n");
 +    }
 +
 +    comm  = dd->comm;
 +
 +    switch (fr->cutoff_scheme)
 +    {
 +        case ecutsGROUP:
 +            cg_cm = fr->cg_cm;
 +            break;
 +        case ecutsVERLET:
 +            cg_cm = state->x;
 +            break;
 +        default:
 +            gmx_incons("unimplemented");
 +            cg_cm = NULL;
 +    }
 +
 +    for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
 +    {
 +        dim = dd->dim[dim_ind];
 +
 +        /* Check if we need to use triclinic distances */
 +        tric_dist[dim_ind] = 0;
 +        for (i = 0; i <= dim_ind; i++)
 +        {
 +            if (ddbox->tric_dir[dd->dim[i]])
 +            {
 +                tric_dist[dim_ind] = 1;
 +            }
 +        }
 +    }
 +
 +    bBondComm = comm->bBondComm;
 +
 +    /* Do we need to determine extra distances for multi-body bondeds? */
 +    bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
 +
 +    /* Do we need to determine extra distances for only two-body bondeds? */
 +    bDist2B = (bBondComm && !bDistMB);
 +
 +    r_comm2  = sqr(comm->cutoff);
 +    r_bcomm2 = sqr(comm->cutoff_mbody);
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, sqrt(r_bcomm2));
 +    }
 +
 +    zones = &comm->zones;
 +
 +    dim0 = dd->dim[0];
 +    dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
 +    dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
 +
 +    set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
 +
 +    /* Triclinic stuff */
 +    normal      = ddbox->normal;
 +    skew_fac_01 = 0;
 +    if (dd->ndim >= 2)
 +    {
 +        v_0 = ddbox->v[dim0];
 +        if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
 +        {
 +            /* Determine the coupling coefficient for the distances
 +             * to the cell planes along dim0 and dim1 through dim2.
 +             * This is required for correct rounding.
 +             */
 +            skew_fac_01 =
 +                ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
 +            if (debug)
 +            {
 +                fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
 +            }
 +        }
 +    }
 +    if (dd->ndim >= 3)
 +    {
 +        v_1 = ddbox->v[dim1];
 +    }
 +
 +    zone_cg_range = zones->cg_range;
 +    index_gl      = dd->index_gl;
 +    cgindex       = dd->cgindex;
 +    cginfo_mb     = fr->cginfo_mb;
 +
 +    zone_cg_range[0]   = 0;
 +    zone_cg_range[1]   = dd->ncg_home;
 +    comm->zone_ncg1[0] = dd->ncg_home;
 +    pos_cg             = dd->ncg_home;
 +
 +    nat_tot = dd->nat_home;
 +    nzone   = 1;
 +    for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
 +    {
 +        dim = dd->dim[dim_ind];
 +        cd  = &comm->cd[dim_ind];
 +
 +        if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
 +        {
 +            /* No pbc in this dimension, the first node should not communicate. */
 +            nzone_send = 0;
 +        }
 +        else
 +        {
 +            nzone_send = nzone;
 +        }
 +
 +        v_d         = ddbox->v[dim];
 +        skew_fac2_d = sqr(ddbox->skew_fac[dim]);
 +
 +        cd->bInPlace = TRUE;
 +        for (p = 0; p < cd->np; p++)
 +        {
 +            /* Only atoms communicated in the first pulse are used
 +             * for multi-body bonded interactions or for bBondComm.
 +             */
 +            bDistBonded = ((bDistMB || bDist2B) && p == 0);
 +
 +            ind   = &cd->ind[p];
 +            nsend = 0;
 +            nat   = 0;
 +            for (zone = 0; zone < nzone_send; zone++)
 +            {
 +                if (tric_dist[dim_ind] && dim_ind > 0)
 +                {
 +                    /* Determine slightly more optimized skew_fac's
 +                     * for rounding.
 +                     * This reduces the number of communicated atoms
 +                     * by about 10% for 3D DD of rhombic dodecahedra.
 +                     */
 +                    for (dimd = 0; dimd < dim; dimd++)
 +                    {
 +                        sf2_round[dimd] = 1;
 +                        if (ddbox->tric_dir[dimd])
 +                        {
 +                            for (i = dd->dim[dimd]+1; i < DIM; i++)
 +                            {
 +                                /* If we are shifted in dimension i
 +                                 * and the cell plane is tilted forward
 +                                 * in dimension i, skip this coupling.
 +                                 */
 +                                if (!(zones->shift[nzone+zone][i] &&
 +                                      ddbox->v[dimd][i][dimd] >= 0))
 +                                {
 +                                    sf2_round[dimd] +=
 +                                        sqr(ddbox->v[dimd][i][dimd]);
 +                                }
 +                            }
 +                            sf2_round[dimd] = 1/sf2_round[dimd];
 +                        }
 +                    }
 +                }
 +
 +                zonei = zone_perm[dim_ind][zone];
 +                if (p == 0)
 +                {
 +                    /* Here we permute the zones to obtain a convenient order
 +                     * for neighbor searching.
 +                     */
 +                    cg0 = zone_cg_range[zonei];
 +                    cg1 = zone_cg_range[zonei+1];
 +                }
 +                else
 +                {
 +                    /* Look only at the cg's received in the previous grid pulse
 +                     */
 +                    cg1 = zone_cg_range[nzone+zone+1];
 +                    cg0 = cg1 - cd->ind[p-1].nrecv[zone];
 +                }
 +
 +#pragma omp parallel for num_threads(comm->nth) schedule(static)
 +                for (th = 0; th < comm->nth; th++)
 +                {
 +                    gmx_domdec_ind_t *ind_p;
 +                    int             **ibuf_p, *ibuf_nalloc_p;
 +                    vec_rvec_t       *vbuf_p;
 +                    int              *nsend_p, *nat_p;
 +                    int              *nsend_zone_p;
 +                    int               cg0_th, cg1_th;
 +
 +                    if (th == 0)
 +                    {
 +                        /* Thread 0 writes in the comm buffers */
 +                        ind_p         = ind;
 +                        ibuf_p        = &comm->buf_int;
 +                        ibuf_nalloc_p = &comm->nalloc_int;
 +                        vbuf_p        = &comm->vbuf;
 +                        nsend_p       = &nsend;
 +                        nat_p         = &nat;
 +                        nsend_zone_p  = &ind->nsend[zone];
 +                    }
 +                    else
 +                    {
 +                        /* Other threads write into temp buffers */
 +                        ind_p         = &comm->dth[th].ind;
 +                        ibuf_p        = &comm->dth[th].ibuf;
 +                        ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
 +                        vbuf_p        = &comm->dth[th].vbuf;
 +                        nsend_p       = &comm->dth[th].nsend;
 +                        nat_p         = &comm->dth[th].nat;
 +                        nsend_zone_p  = &comm->dth[th].nsend_zone;
 +
 +                        comm->dth[th].nsend      = 0;
 +                        comm->dth[th].nat        = 0;
 +                        comm->dth[th].nsend_zone = 0;
 +                    }
 +
 +                    if (comm->nth == 1)
 +                    {
 +                        cg0_th = cg0;
 +                        cg1_th = cg1;
 +                    }
 +                    else
 +                    {
 +                        cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
 +                        cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
 +                    }
 +
 +                    /* Get the cg's for this pulse in this zone */
 +                    get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
 +                                       index_gl, cgindex,
 +                                       dim, dim_ind, dim0, dim1, dim2,
 +                                       r_comm2, r_bcomm2,
 +                                       box, tric_dist,
 +                                       normal, skew_fac2_d, skew_fac_01,
 +                                       v_d, v_0, v_1, &corners, sf2_round,
 +                                       bDistBonded, bBondComm,
 +                                       bDist2B, bDistMB,
 +                                       cg_cm, fr->cginfo,
 +                                       ind_p,
 +                                       ibuf_p, ibuf_nalloc_p,
 +                                       vbuf_p,
 +                                       nsend_p, nat_p,
 +                                       nsend_zone_p);
 +                }
 +
 +                /* Append data of threads>=1 to the communication buffers */
 +                for (th = 1; th < comm->nth; th++)
 +                {
 +                    dd_comm_setup_work_t *dth;
 +                    int                   i, ns1;
 +
 +                    dth = &comm->dth[th];
 +
 +                    ns1 = nsend + dth->nsend_zone;
 +                    if (ns1 > ind->nalloc)
 +                    {
 +                        ind->nalloc = over_alloc_dd(ns1);
 +                        srenew(ind->index, ind->nalloc);
 +                    }
 +                    if (ns1 > comm->nalloc_int)
 +                    {
 +                        comm->nalloc_int = over_alloc_dd(ns1);
 +                        srenew(comm->buf_int, comm->nalloc_int);
 +                    }
 +                    if (ns1 > comm->vbuf.nalloc)
 +                    {
 +                        comm->vbuf.nalloc = over_alloc_dd(ns1);
 +                        srenew(comm->vbuf.v, comm->vbuf.nalloc);
 +                    }
 +
 +                    for (i = 0; i < dth->nsend_zone; i++)
 +                    {
 +                        ind->index[nsend]    = dth->ind.index[i];
 +                        comm->buf_int[nsend] = dth->ibuf[i];
 +                        copy_rvec(dth->vbuf.v[i],
 +                                  comm->vbuf.v[nsend]);
 +                        nsend++;
 +                    }
 +                    nat              += dth->nat;
 +                    ind->nsend[zone] += dth->nsend_zone;
 +                }
 +            }
 +            /* Clear the counts in case we do not have pbc */
 +            for (zone = nzone_send; zone < nzone; zone++)
 +            {
 +                ind->nsend[zone] = 0;
 +            }
 +            ind->nsend[nzone]   = nsend;
 +            ind->nsend[nzone+1] = nat;
 +            /* Communicate the number of cg's and atoms to receive */
 +            dd_sendrecv_int(dd, dim_ind, dddirBackward,
 +                            ind->nsend, nzone+2,
 +                            ind->nrecv, nzone+2);
 +
 +            /* The rvec buffer is also required for atom buffers of size nsend
 +             * in dd_move_x and dd_move_f.
 +             */
 +            vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);
 +
 +            if (p > 0)
 +            {
 +                /* We can receive in place if only the last zone is not empty */
 +                for (zone = 0; zone < nzone-1; zone++)
 +                {
 +                    if (ind->nrecv[zone] > 0)
 +                    {
 +                        cd->bInPlace = FALSE;
 +                    }
 +                }
 +                if (!cd->bInPlace)
 +                {
 +                    /* The int buffer is only required here for the cg indices */
 +                    if (ind->nrecv[nzone] > comm->nalloc_int2)
 +                    {
 +                        comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
 +                        srenew(comm->buf_int2, comm->nalloc_int2);
 +                    }
 +                    /* The rvec buffer is also required for atom buffers
 +                     * of size nrecv in dd_move_x and dd_move_f.
 +                     */
 +                    i = max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
 +                    vec_rvec_check_alloc(&comm->vbuf2, i);
 +                }
 +            }
 +
 +            /* Make space for the global cg indices */
 +            if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
 +                || dd->cg_nalloc == 0)
 +            {
 +                dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
 +                srenew(index_gl, dd->cg_nalloc);
 +                srenew(cgindex, dd->cg_nalloc+1);
 +            }
 +            /* Communicate the global cg indices */
 +            if (cd->bInPlace)
 +            {
 +                recv_i = index_gl + pos_cg;
 +            }
 +            else
 +            {
 +                recv_i = comm->buf_int2;
 +            }
 +            dd_sendrecv_int(dd, dim_ind, dddirBackward,
 +                            comm->buf_int, nsend,
 +                            recv_i,        ind->nrecv[nzone]);
 +
 +            /* Make space for cg_cm */
 +            dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
 +            if (fr->cutoff_scheme == ecutsGROUP)
 +            {
 +                cg_cm = fr->cg_cm;
 +            }
 +            else
 +            {
 +                cg_cm = state->x;
 +            }
 +            /* Communicate cg_cm */
 +            if (cd->bInPlace)
 +            {
 +                recv_vr = cg_cm + pos_cg;
 +            }
 +            else
 +            {
 +                recv_vr = comm->vbuf2.v;
 +            }
 +            dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
 +                             comm->vbuf.v, nsend,
 +                             recv_vr,      ind->nrecv[nzone]);
 +
 +            /* Make the charge group index */
 +            if (cd->bInPlace)
 +            {
 +                zone = (p == 0 ? 0 : nzone - 1);
 +                while (zone < nzone)
 +                {
 +                    for (cg = 0; cg < ind->nrecv[zone]; cg++)
 +                    {
 +                        cg_gl              = index_gl[pos_cg];
 +                        fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
 +                        nrcg               = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
 +                        cgindex[pos_cg+1]  = cgindex[pos_cg] + nrcg;
 +                        if (bBondComm)
 +                        {
 +                            /* Update the charge group presence,
 +                             * so we can use it in the next pass of the loop.
 +                             */
 +                            comm->bLocalCG[cg_gl] = TRUE;
 +                        }
 +                        pos_cg++;
 +                    }
 +                    if (p == 0)
 +                    {
 +                        comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
 +                    }
 +                    zone++;
 +                    zone_cg_range[nzone+zone] = pos_cg;
 +                }
 +            }
 +            else
 +            {
 +                /* This part of the code is never executed with bBondComm. */
 +                merge_cg_buffers(nzone, cd, p, zone_cg_range,
 +                                 index_gl, recv_i, cg_cm, recv_vr,
 +                                 cgindex, fr->cginfo_mb, fr->cginfo);
 +                pos_cg += ind->nrecv[nzone];
 +            }
 +            nat_tot += ind->nrecv[nzone+1];
 +        }
 +        if (!cd->bInPlace)
 +        {
 +            /* Store the atom block for easy copying of communication buffers */
 +            make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
 +        }
 +        nzone += nzone;
 +    }
 +    dd->index_gl = index_gl;
 +    dd->cgindex  = cgindex;
 +
 +    dd->ncg_tot          = zone_cg_range[zones->n];
 +    dd->nat_tot          = nat_tot;
 +    comm->nat[ddnatHOME] = dd->nat_home;
 +    for (i = ddnatZONE; i < ddnatNR; i++)
 +    {
 +        comm->nat[i] = dd->nat_tot;
 +    }
 +
 +    if (!bBondComm)
 +    {
 +        /* We don't need to update cginfo, since that was already done above.
 +         * So we pass NULL for the forcerec.
 +         */
 +        dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
 +                      NULL, comm->bLocalCG);
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Finished setting up DD communication, zones:");
 +        for (c = 0; c < zones->n; c++)
 +        {
 +            fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
 +        }
 +        fprintf(debug, "\n");
 +    }
 +}
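
setup_dd_communication first exchanges the counts (ind->nsend, nat) backward
along each dimension and only then the index and coordinate buffers, so the
receiver can size its arrays. A hedged sketch of that handshake in plain MPI
(dd_sendrecv_int wraps something equivalent; ranks, tags and counts here are
invented):

    /* Sketch, assuming MPI: exchange the send count along one
     * decomposition dimension before sending the actual buffers. */
    #include <mpi.h>
    #include <stdio.h>

    int main(int argc, char **argv)
    {
        int rank, size, fwd, bwd, nsend, nrecv;

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &size);

        nsend = 10 + rank;                 /* pretend selection result */
        fwd   = (rank + 1) % size;         /* neighbor we send to */
        bwd   = (rank - 1 + size) % size;  /* neighbor we receive from */

        /* First the count, so the receiver can size its buffers ... */
        MPI_Sendrecv(&nsend, 1, MPI_INT, fwd, 0,
                     &nrecv, 1, MPI_INT, bwd, 0,
                     MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        /* ... then the indices/coordinates would follow, sized by nrecv. */

        printf("rank %d: send %d recv %d\n", rank, nsend, nrecv);
        MPI_Finalize();
        return 0;
    }
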
 +
 +static void set_cg_boundaries(gmx_domdec_zones_t *zones)
 +{
 +    int c;
 +
 +    for (c = 0; c < zones->nizone; c++)
 +    {
 +        zones->izone[c].cg1  = zones->cg_range[c+1];
 +        zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
 +        zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
 +    }
 +}
 +
 +static void set_zones_size(gmx_domdec_t *dd,
 +                           matrix box, const gmx_ddbox_t *ddbox,
 +                           int zone_start, int zone_end)
 +{
 +    gmx_domdec_comm_t  *comm;
 +    gmx_domdec_zones_t *zones;
 +    gmx_bool            bDistMB;
 +    int                 z, zi, zj0, zj1, d, dim;
 +    real                rcs, rcmbs;
 +    int                 i, j;
 +    real                size_j, add_tric;
 +    real                vol;
 +
 +    comm = dd->comm;
 +
 +    zones = &comm->zones;
 +
 +    /* Do we need to determine extra distances for multi-body bondeds? */
 +    bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
 +
 +    for (z = zone_start; z < zone_end; z++)
 +    {
 +        /* Copy cell limits to zone limits.
 +         * Valid for non-DD dims and non-shifted dims.
 +         */
 +        copy_rvec(comm->cell_x0, zones->size[z].x0);
 +        copy_rvec(comm->cell_x1, zones->size[z].x1);
 +    }
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +
 +        for (z = 0; z < zones->n; z++)
 +        {
 +            /* With a staggered grid we have different sizes
 +             * for non-shifted dimensions.
 +             */
 +            if (dd->bGridJump && zones->shift[z][dim] == 0)
 +            {
 +                if (d == 1)
 +                {
 +                    zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
 +                    zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
 +                }
 +                else if (d == 2)
 +                {
 +                    zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
 +                    zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
 +                }
 +            }
 +        }
 +
 +        rcs   = comm->cutoff;
 +        rcmbs = comm->cutoff_mbody;
 +        if (ddbox->tric_dir[dim])
 +        {
 +            rcs   /= ddbox->skew_fac[dim];
 +            rcmbs /= ddbox->skew_fac[dim];
 +        }
 +
 +        /* Set the lower limit for the shifted zone dimensions */
 +        for (z = zone_start; z < zone_end; z++)
 +        {
 +            if (zones->shift[z][dim] > 0)
 +            {
 +                dim = dd->dim[d];
 +                if (!dd->bGridJump || d == 0)
 +                {
 +                    zones->size[z].x0[dim] = comm->cell_x1[dim];
 +                    zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
 +                }
 +                else
 +                {
 +                    /* Here we take the lower limit of the zone from
 +                     * the lowest domain of the zone below.
 +                     */
 +                    if (z < 4)
 +                    {
 +                        zones->size[z].x0[dim] =
 +                            comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
 +                    }
 +                    else
 +                    {
 +                        if (d == 1)
 +                        {
 +                            zones->size[z].x0[dim] =
 +                                zones->size[zone_perm[2][z-4]].x0[dim];
 +                        }
 +                        else
 +                        {
 +                            zones->size[z].x0[dim] =
 +                                comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
 +                        }
 +                    }
 +                    /* A temporary limit, is updated below */
 +                    zones->size[z].x1[dim] = zones->size[z].x0[dim];
 +
 +                    if (bDistMB)
 +                    {
 +                        for (zi = 0; zi < zones->nizone; zi++)
 +                        {
 +                            if (zones->shift[zi][dim] == 0)
 +                            {
 +                                /* This takes the whole zone into account.
 +                                 * With multiple pulses this will lead
 +                                 * to a larger zone than strictly necessary.
 +                                 */
 +                                zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
 +                                                             zones->size[zi].x1[dim]+rcmbs);
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +
 +        /* Loop over the i-zones to set the upper limit of each
 +         * j-zone they see.
 +         */
 +        for (zi = 0; zi < zones->nizone; zi++)
 +        {
 +            if (zones->shift[zi][dim] == 0)
 +            {
 +                for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
 +                {
 +                    if (zones->shift[z][dim] > 0)
 +                    {
 +                        zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
 +                                                     zones->size[zi].x1[dim]+rcs);
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    for (z = zone_start; z < zone_end; z++)
 +    {
 +        /* Initialization only required to keep the compiler happy */
 +        rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
 +        int  nc, c;
 +
 +        /* To determine the bounding box for a zone we need to find
 +         * the extremes over 4, 2 or 1 corners.
 +         */
 +        nc = 1 << (ddbox->npbcdim - 1);
 +
 +        for (c = 0; c < nc; c++)
 +        {
 +            /* Set up a zone corner at x=0, ignoring triclinic couplings */
 +            corner[XX] = 0;
 +            if ((c & 1) == 0)
 +            {
 +                corner[YY] = zones->size[z].x0[YY];
 +            }
 +            else
 +            {
 +                corner[YY] = zones->size[z].x1[YY];
 +            }
 +            if ((c & 2) == 0)
 +            {
 +                corner[ZZ] = zones->size[z].x0[ZZ];
 +            }
 +            else
 +            {
 +                corner[ZZ] = zones->size[z].x1[ZZ];
 +            }
 +            if (dd->ndim == 1 && box[ZZ][YY] != 0)
 +            {
 +                /* With 1D domain decomposition the cg's are not in
 +                 * the triclinic box, but triclinic x-y and rectangular y-z.
 +                 * Shift y back, so it will later end up at 0.
 +                 */
 +                corner[YY] -= corner[ZZ]*box[ZZ][YY]/box[ZZ][ZZ];
 +            }
 +            /* Apply the triclinic couplings */
 +            for (i = YY; i < ddbox->npbcdim; i++)
 +            {
 +                for (j = XX; j < i; j++)
 +                {
 +                    corner[j] += corner[i]*box[i][j]/box[i][i];
 +                }
 +            }
 +            if (c == 0)
 +            {
 +                copy_rvec(corner, corner_min);
 +                copy_rvec(corner, corner_max);
 +            }
 +            else
 +            {
 +                for (i = 0; i < DIM; i++)
 +                {
 +                    corner_min[i] = min(corner_min[i], corner[i]);
 +                    corner_max[i] = max(corner_max[i], corner[i]);
 +                }
 +            }
 +        }
 +        /* Copy the extreme corners without offset along x */
 +        for (i = 0; i < DIM; i++)
 +        {
 +            zones->size[z].bb_x0[i] = corner_min[i];
 +            zones->size[z].bb_x1[i] = corner_max[i];
 +        }
 +        /* Add the offset along x */
 +        zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
 +        zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
 +    }
 +
 +    if (zone_start == 0)
 +    {
 +        vol = 1;
 +        for (dim = 0; dim < DIM; dim++)
 +        {
 +            vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
 +        }
 +        zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
 +    }
 +
 +    if (debug)
 +    {
 +        for (z = zone_start; z < zone_end; z++)
 +        {
 +            fprintf(debug, "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
 +                    z,
 +                    zones->size[z].x0[XX], zones->size[z].x1[XX],
 +                    zones->size[z].x0[YY], zones->size[z].x1[YY],
 +                    zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
 +            fprintf(debug, "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
 +                    z,
 +                    zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
 +                    zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
 +                    zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
 +        }
 +    }
 +}
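
The bounding boxes above are obtained by shearing each candidate corner with
the off-diagonal box elements, corner[j] += corner[i]*box[i][j]/box[i][i] for
j < i. The same coupling on a made-up lower-triangular box, as a standalone
sketch (the box values are illustrative only):

    #include <stdio.h>

    #define XX  0
    #define YY  1
    #define ZZ  2
    #define DIM 3

    int main(void)
    {
        double box[DIM][DIM] = {{4, 0, 0}, {1, 4, 0}, {0, 1, 4}};
        double corner[DIM]   = {0, 2, 3};  /* x = 0, one y/z corner */
        int    i, j;

        /* Apply the triclinic couplings, as in set_zones_size */
        for (i = YY; i < DIM; i++)
        {
            for (j = XX; j < i; j++)
            {
                corner[j] += corner[i]*box[i][j]/box[i][i];
            }
        }
        printf("coupled corner: %g %g %g\n",
               corner[XX], corner[YY], corner[ZZ]); /* 0.5 2.75 3 */
        return 0;
    }
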
 +
 +static int comp_cgsort(const void *a, const void *b)
 +{
 +    int           comp;
 +
 +    gmx_cgsort_t *cga, *cgb;
 +    cga = (gmx_cgsort_t *)a;
 +    cgb = (gmx_cgsort_t *)b;
 +
 +    comp = cga->nsc - cgb->nsc;
 +    if (comp == 0)
 +    {
 +        comp = cga->ind_gl - cgb->ind_gl;
 +    }
 +
 +    return comp;
 +}
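
comp_cgsort orders primarily on the ns grid cell index (nsc) and breaks ties
on the global topology index (ind_gl), which keeps the order, and therefore
restarts, reproducible. A self-contained sketch of the same two-key comparator
with the standard qsort (the struct is a stand-in for gmx_cgsort_t):

    #include <stdio.h>
    #include <stdlib.h>

    typedef struct { int nsc; int ind_gl; int ind; } cgsort_t;

    static int cmp(const void *a, const void *b)
    {
        const cgsort_t *ca = (const cgsort_t *)a;
        const cgsort_t *cb = (const cgsort_t *)b;

        /* Primary key: ns grid cell; tie-break on the global index */
        return (ca->nsc != cb->nsc) ? ca->nsc - cb->nsc
                                    : ca->ind_gl - cb->ind_gl;
    }

    int main(void)
    {
        cgsort_t s[3] = {{2, 7, 0}, {1, 9, 1}, {2, 3, 2}};

        qsort(s, 3, sizeof(s[0]), cmp);
        printf("%d %d %d\n", s[0].ind, s[1].ind, s[2].ind); /* 1 2 0 */
        return 0;
    }
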
 +
 +static void order_int_cg(int n, const gmx_cgsort_t *sort,
 +                         int *a, int *buf)
 +{
 +    int i;
 +
 +    /* Order the data */
 +    for (i = 0; i < n; i++)
 +    {
 +        buf[i] = a[sort[i].ind];
 +    }
 +
 +    /* Copy back to the original array */
 +    for (i = 0; i < n; i++)
 +    {
 +        a[i] = buf[i];
 +    }
 +}
 +
 +static void order_vec_cg(int n, const gmx_cgsort_t *sort,
 +                         rvec *v, rvec *buf)
 +{
 +    int i;
 +
 +    /* Order the data */
 +    for (i = 0; i < n; i++)
 +    {
 +        copy_rvec(v[sort[i].ind], buf[i]);
 +    }
 +
 +    /* Copy back to the original array */
 +    for (i = 0; i < n; i++)
 +    {
 +        copy_rvec(buf[i], v[i]);
 +    }
 +}
 +
 +static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
 +                           rvec *v, rvec *buf)
 +{
 +    int a, atot, cg, cg0, cg1, i;
 +
 +    if (cgindex == NULL)
 +    {
 +        /* Avoid the useless loop of the atoms within a cg */
 +        order_vec_cg(ncg, sort, v, buf);
 +
 +        return;
 +    }
 +
 +    /* Order the data */
 +    a = 0;
 +    for (cg = 0; cg < ncg; cg++)
 +    {
 +        cg0 = cgindex[sort[cg].ind];
 +        cg1 = cgindex[sort[cg].ind+1];
 +        for (i = cg0; i < cg1; i++)
 +        {
 +            copy_rvec(v[i], buf[a]);
 +            a++;
 +        }
 +    }
 +    atot = a;
 +
 +    /* Copy back to the original array */
 +    for (a = 0; a < atot; a++)
 +    {
 +        copy_rvec(buf[a], v[a]);
 +    }
 +}
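
order_int_cg, order_vec_cg and order_vec_atom share one pattern: gather the
data into a scratch buffer in sorted order, then copy it back, which avoids
in-place permutation cycle chasing. A minimal sketch on plain ints
(apply_order is an illustrative name, not part of the source):

    #include <stdio.h>

    static void apply_order(int n, const int *order, int *a, int *buf)
    {
        int i;

        for (i = 0; i < n; i++)
        {
            buf[i] = a[order[i]];   /* gather in sorted order */
        }
        for (i = 0; i < n; i++)
        {
            a[i] = buf[i];          /* copy back to the original array */
        }
    }

    int main(void)
    {
        int a[4]     = {40, 10, 30, 20};
        int order[4] = {1, 3, 2, 0};
        int buf[4], i;

        apply_order(4, order, a, buf);
        for (i = 0; i < 4; i++)
        {
            printf("%d ", a[i]);    /* 10 20 30 40 */
        }
        printf("\n");
        return 0;
    }
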
 +
 +static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
 +                         int nsort_new, gmx_cgsort_t *sort_new,
 +                         gmx_cgsort_t *sort1)
 +{
 +    int i1, i2, i_new;
 +
 +    /* The new indices are not very ordered, so we qsort them */
 +    qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);
 +
 +    /* sort2 is already ordered, so now we can merge the two arrays */
 +    i1    = 0;
 +    i2    = 0;
 +    i_new = 0;
 +    while (i2 < nsort2 || i_new < nsort_new)
 +    {
 +        if (i2 == nsort2)
 +        {
 +            sort1[i1++] = sort_new[i_new++];
 +        }
 +        else if (i_new == nsort_new)
 +        {
 +            sort1[i1++] = sort2[i2++];
 +        }
 +        else if (sort2[i2].nsc < sort_new[i_new].nsc ||
 +                 (sort2[i2].nsc == sort_new[i_new].nsc &&
 +                  sort2[i2].ind_gl < sort_new[i_new].ind_gl))
 +        {
 +            sort1[i1++] = sort2[i2++];
 +        }
 +        else
 +        {
 +            sort1[i1++] = sort_new[i_new++];
 +        }
 +    }
 +}
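
ordered_sort qsorts only the new entries and merges them with the
already-ordered list, costing O(n_new log n_new + n) rather than a full
re-sort. The merge itself is the classic two-pointer scheme; a sketch on
plain ints:

    #include <stdio.h>

    /* Merge two sorted arrays s2 (n2 entries) and sn (nn entries) into out */
    static void merge_sorted(int n2, const int *s2,
                             int nn, const int *sn, int *out)
    {
        int i1 = 0, i2 = 0, in = 0;

        while (i2 < n2 || in < nn)
        {
            if (i2 == n2)              { out[i1++] = sn[in++]; }
            else if (in == nn)         { out[i1++] = s2[i2++]; }
            else if (s2[i2] <= sn[in]) { out[i1++] = s2[i2++]; }
            else                       { out[i1++] = sn[in++]; }
        }
    }

    int main(void)
    {
        int a[3] = {1, 4, 6}, b[2] = {2, 5}, out[5], i;

        merge_sorted(3, a, 2, b, out);
        for (i = 0; i < 5; i++)
        {
            printf("%d ", out[i]); /* 1 2 4 5 6 */
        }
        printf("\n");
        return 0;
    }
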
 +
 +static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
 +{
 +    gmx_domdec_sort_t *sort;
 +    gmx_cgsort_t      *cgsort, *sort_i;
 +    int                ncg_new, nsort2, nsort_new, i, *a, moved, *ibuf;
 +    int                sort_last, sort_skip;
 +
 +    sort = dd->comm->sort;
 +
 +    a = fr->ns.grid->cell_index;
 +
 +    moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;
 +
 +    if (ncg_home_old >= 0)
 +    {
 +        /* The charge groups that remained in the same ns grid cell
 +         * are completely ordered. So we can sort efficiently by sorting
 +         * only the charge groups that moved and merging them with
 +         * the stationary list.
 +         */
 +        ncg_new   = 0;
 +        nsort2    = 0;
 +        nsort_new = 0;
 +        for (i = 0; i < dd->ncg_home; i++)
 +        {
 +            /* Check if this cg did not move to another node */
 +            if (a[i] < moved)
 +            {
 +                if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
 +                {
 +                    /* This cg is new on this node or moved to another ns grid cell */
 +                    if (nsort_new >= sort->sort_new_nalloc)
 +                    {
 +                        sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
 +                        srenew(sort->sort_new, sort->sort_new_nalloc);
 +                    }
 +                    sort_i = &(sort->sort_new[nsort_new++]);
 +                }
 +                else
 +                {
 +                    /* This cg did not move */
 +                    sort_i = &(sort->sort2[nsort2++]);
 +                }
 +                /* Sort on the ns grid cell indices
 +                 * and the global topology index.
 +                 * index_gl is irrelevant with cell ns,
 +                 * but we set it here anyhow to avoid a conditional.
 +                 */
 +                sort_i->nsc    = a[i];
 +                sort_i->ind_gl = dd->index_gl[i];
 +                sort_i->ind    = i;
 +                ncg_new++;
 +            }
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
 +                    nsort2, nsort_new);
 +        }
 +        /* Sort efficiently */
 +        ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
 +                     sort->sort);
 +    }
 +    else
 +    {
 +        cgsort  = sort->sort;
 +        ncg_new = 0;
 +        for (i = 0; i < dd->ncg_home; i++)
 +        {
 +            /* Sort on the ns grid cell indices
 +             * and the global topology index
 +             */
 +            cgsort[i].nsc    = a[i];
 +            cgsort[i].ind_gl = dd->index_gl[i];
 +            cgsort[i].ind    = i;
 +            if (cgsort[i].nsc < moved)
 +            {
 +                ncg_new++;
 +            }
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
 +        }
 +        /* Determine the order of the charge groups using qsort */
 +        qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
 +    }
 +
 +    return ncg_new;
 +}
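
The incremental branch of dd_sort_order classifies each home charge group by
comparing its current ns cell key with the one stored at the previous sort:
an unchanged key means the group kept its relative order. A toy version of
that classification (the keys are invented):

    #include <stdio.h>

    int main(void)
    {
        int old_key[5] = {1, 2, 4, 6, 9};  /* keys after the previous sort */
        int new_key[5] = {1, 3, 4, 6, 2};  /* keys after redistribution */
        int i, nstationary = 0, nmoved = 0;

        for (i = 0; i < 5; i++)
        {
            if (new_key[i] == old_key[i])
            {
                nstationary++; /* same ns grid cell: stays ordered */
            }
            else
            {
                nmoved++;      /* changed cell: goes to the list to qsort */
            }
        }
        printf("stationary %d moved %d\n", nstationary, nmoved); /* 3 2 */
        return 0;
    }
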
 +
 +static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
 +{
 +    gmx_cgsort_t *sort;
 +    int           ncg_new, i, *a, na;
 +
 +    sort = dd->comm->sort->sort;
 +
 +    nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);
 +
 +    ncg_new = 0;
 +    for (i = 0; i < na; i++)
 +    {
 +        if (a[i] >= 0)
 +        {
 +            sort[ncg_new].ind = a[i];
 +            ncg_new++;
 +        }
 +    }
 +
 +    return ncg_new;
 +}
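
dd_sort_order_nbnxn relies on the nbnxn grid already holding the atoms in
sorted order, so it only compacts away the negative (invalid) entries while
preserving order. The same stream-compaction idiom on a toy array:

    #include <stdio.h>

    int main(void)
    {
        int a[6] = {3, -1, 0, 5, -1, 2};
        int keep[6], n = 0, i;

        /* Keep only valid (non-negative) entries, preserving order */
        for (i = 0; i < 6; i++)
        {
            if (a[i] >= 0)
            {
                keep[n++] = a[i];
            }
        }
        for (i = 0; i < n; i++)
        {
            printf("%d ", keep[i]); /* 3 0 5 2 */
        }
        printf("\n");
        return 0;
    }
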
 +
 +static void dd_sort_state(gmx_domdec_t *dd, rvec *cgcm, t_forcerec *fr, t_state *state,
 +                          int ncg_home_old)
 +{
 +    gmx_domdec_sort_t *sort;
 +    gmx_cgsort_t      *cgsort, *sort_i;
 +    int               *cgindex;
 +    int                ncg_new, i, *ibuf, cgsize;
 +    rvec              *vbuf;
 +
 +    sort = dd->comm->sort;
 +
 +    if (dd->ncg_home > sort->sort_nalloc)
 +    {
 +        sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
 +        srenew(sort->sort, sort->sort_nalloc);
 +        srenew(sort->sort2, sort->sort_nalloc);
 +    }
 +    cgsort = sort->sort;
 +
 +    switch (fr->cutoff_scheme)
 +    {
 +        case ecutsGROUP:
 +            ncg_new = dd_sort_order(dd, fr, ncg_home_old);
 +            break;
 +        case ecutsVERLET:
 +            ncg_new = dd_sort_order_nbnxn(dd, fr);
 +            break;
 +        default:
 +            gmx_incons("unimplemented");
 +            ncg_new = 0;
 +    }
 +
 +    /* We alloc with the old size, since cgindex is still old */
 +    vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
 +    vbuf = dd->comm->vbuf.v;
 +
 +    if (dd->comm->bCGs)
 +    {
 +        cgindex = dd->cgindex;
 +    }
 +    else
 +    {
 +        cgindex = NULL;
 +    }
 +
 +    /* Remove the charge groups which are no longer at home here */
 +    dd->ncg_home = ncg_new;
 +    if (debug)
 +    {
 +        fprintf(debug, "Set the new home charge group count to %d\n",
 +                dd->ncg_home);
 +    }
 +
 +    /* Reorder the state */
 +    for (i = 0; i < estNR; i++)
 +    {
 +        if (EST_DISTR(i) && (state->flags & (1<<i)))
 +        {
 +            switch (i)
 +            {
 +                case estX:
 +                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->x, vbuf);
 +                    break;
 +                case estV:
 +                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->v, vbuf);
 +                    break;
 +                case estSDX:
 +                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->sd_X, vbuf);
 +                    break;
 +                case estCGP:
 +                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->cg_p, vbuf);
 +                    break;
 +                case estLD_RNG:
 +                case estLD_RNGI:
 +                case estDISRE_INITF:
 +                case estDISRE_RM3TAV:
 +                case estORIRE_INITF:
 +                case estORIRE_DTAV:
 +                    /* No ordering required */
 +                    break;
 +                default:
 +                    gmx_incons("Unknown state entry encountered in dd_sort_state");
 +                    break;
 +            }
 +        }
 +    }
 +    if (fr->cutoff_scheme == ecutsGROUP)
 +    {
 +        /* Reorder cgcm */
 +        order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
 +    }
 +
 +    if (dd->ncg_home+1 > sort->ibuf_nalloc)
 +    {
 +        sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
 +        srenew(sort->ibuf, sort->ibuf_nalloc);
 +    }
 +    ibuf = sort->ibuf;
 +    /* Reorder the global cg index */
 +    order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
 +    /* Reorder the cginfo */
 +    order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
 +    /* Rebuild the local cg index */
 +    if (dd->comm->bCGs)
 +    {
 +        ibuf[0] = 0;
 +        for (i = 0; i < dd->ncg_home; i++)
 +        {
 +            cgsize    = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
 +            ibuf[i+1] = ibuf[i] + cgsize;
 +        }
 +        for (i = 0; i < dd->ncg_home+1; i++)
 +        {
 +            dd->cgindex[i] = ibuf[i];
 +        }
 +    }
 +    else
 +    {
 +        for (i = 0; i < dd->ncg_home+1; i++)
 +        {
 +            dd->cgindex[i] = i;
 +        }
 +    }
 +    /* Set the home atom number */
 +    dd->nat_home = dd->cgindex[dd->ncg_home];
 +
 +    if (fr->cutoff_scheme == ecutsVERLET)
 +    {
 +        /* The atoms are now exactly in grid order, update the grid order */
 +        nbnxn_set_atomorder(fr->nbv->nbs);
 +    }
 +    else
 +    {
 +        /* Copy the sorted ns cell indices back to the ns grid struct */
 +        for (i = 0; i < dd->ncg_home; i++)
 +        {
 +            fr->ns.grid->cell_index[i] = cgsort[i].nsc;
 +        }
 +        fr->ns.grid->nr = dd->ncg_home;
 +    }
 +}
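
After sorting, dd_sort_state rebuilds dd->cgindex as a prefix sum over the
reordered charge-group sizes, so that group i owns atoms
[cgindex[i], cgindex[i+1]) and nat_home falls out as the last entry. The same
construction on made-up sizes:

    #include <stdio.h>

    int main(void)
    {
        int size[4] = {3, 1, 2, 2}; /* atoms per charge group, sorted order */
        int index[5];
        int i;

        index[0] = 0;
        for (i = 0; i < 4; i++)
        {
            /* group i owns atoms [index[i], index[i+1]) */
            index[i+1] = index[i] + size[i];
        }
        printf("nat_home = %d\n", index[4]); /* 8 */
        return 0;
    }
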
 +
 +static void add_dd_statistics(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                ddnat;
 +
 +    comm = dd->comm;
 +
 +    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
 +    {
 +        comm->sum_nat[ddnat-ddnatZONE] +=
 +            comm->nat[ddnat] - comm->nat[ddnat-1];
 +    }
 +    comm->ndecomp++;
 +}
 +
 +void reset_dd_statistics_counters(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                ddnat;
 +
 +    comm = dd->comm;
 +
 +    /* Reset all the statistics and counters for total run counting */
 +    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
 +    {
 +        comm->sum_nat[ddnat-ddnatZONE] = 0;
 +    }
 +    comm->ndecomp   = 0;
 +    comm->nload     = 0;
 +    comm->load_step = 0;
 +    comm->load_sum  = 0;
 +    comm->load_max  = 0;
 +    clear_ivec(comm->load_lim);
 +    comm->load_mdf = 0;
 +    comm->load_pme = 0;
 +}
 +
 +void print_dd_statistics(t_commrec *cr, t_inputrec *ir, FILE *fplog)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                ddnat;
 +    double             av;
 +
 +    comm = cr->dd->comm;
 +
 +    gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
 +
 +    if (fplog == NULL)
 +    {
 +        return;
 +    }
 +
 +    fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
 +
 +    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
 +    {
 +        av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
 +        switch (ddnat)
 +        {
 +            case ddnatZONE:
 +                fprintf(fplog,
 +                        " av. #atoms communicated per step for force:  %d x %.1f\n",
 +                        2, av);
 +                break;
 +            case ddnatVSITE:
 +                if (cr->dd->vsite_comm)
 +                {
 +                    fprintf(fplog,
 +                            " av. #atoms communicated per step for vsites: %d x %.1f\n",
 +                            (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
 +                            av);
 +                }
 +                break;
 +            case ddnatCON:
 +                if (cr->dd->constraint_comm)
 +                {
 +                    fprintf(fplog,
 +                            " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
 +                            1 + ir->nLincsIter, av);
 +                }
 +                break;
 +            default:
 +                gmx_incons(" Unknown type for DD statistics");
 +        }
 +    }
 +    fprintf(fplog, "\n");
 +
 +    if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
 +    {
 +        print_dd_load_av(fplog, cr->dd);
 +    }
 +}
 +
 +void dd_partition_system(FILE                *fplog,
 +                         gmx_large_int_t      step,
 +                         t_commrec           *cr,
 +                         gmx_bool             bMasterState,
 +                         int                  nstglobalcomm,
 +                         t_state             *state_global,
 +                         gmx_mtop_t          *top_global,
 +                         t_inputrec          *ir,
 +                         t_state             *state_local,
 +                         rvec               **f,
 +                         t_mdatoms           *mdatoms,
 +                         gmx_localtop_t      *top_local,
 +                         t_forcerec          *fr,
 +                         gmx_vsite_t         *vsite,
 +                         gmx_shellfc_t        shellfc,
 +                         gmx_constr_t         constr,
 +                         t_nrnb              *nrnb,
 +                         gmx_wallcycle_t      wcycle,
 +                         gmx_bool             bVerbose)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    gmx_ddbox_t        ddbox = {0};
 +    t_block           *cgs_gl;
 +    gmx_large_int_t    step_pcoupl;
 +    rvec               cell_ns_x0, cell_ns_x1;
 +    int                i, j, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
 +    gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckDLB, bTurnOnDLB, bLogLoad;
 +    gmx_bool           bRedist, bSortCG, bResortAll;
 +    ivec               ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
 +    real               grid_density;
 +    char               sbuf[22];
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +    bBoxChanged = (bMasterState || DEFORM(*ir));
 +    if (ir->epc != epcNO)
 +    {
 +        /* With nstpcouple > 1, pressure coupling happens
 +         * one step after calculating the pressure.
 +         * Box scaling happens at the end of the MD step,
 +         * after the DD partitioning.
 +         * We therefore have to do DLB in the first partitioning
 +         * after an MD step in which P-coupling occurred.
 +         * We need to determine the last step in which P-coupling occurred.
 +         * MRS -- need to validate this for vv?
 +         */
 +        n = ir->nstpcouple;
 +        if (n == 1)
 +        {
 +            step_pcoupl = step - 1;
 +        }
 +        else
 +        {
 +            step_pcoupl = ((step - 1)/n)*n + 1;
 +        }
 +        if (step_pcoupl >= comm->partition_step)
 +        {
 +            bBoxChanged = TRUE;
 +        }
 +    }
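
With nstpcouple = n > 1, the expression ((step - 1)/n)*n + 1 in integer
arithmetic selects the most recent step at which P-coupling occurred: for
example, n = 5 and step = 13 gives (12/5)*5 + 1 = 2*5 + 1 = 11. A sketch that
checks this:

    #include <stdio.h>

    int main(void)
    {
        long step = 13, n = 5;
        /* Last P-coupling step for nstpcouple = n > 1 (integer division) */
        long step_pcoupl = ((step - 1)/n)*n + 1;

        printf("last P-coupling step: %ld\n", step_pcoupl); /* 11 */
        return 0;
    }
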
 +
 +    bNStGlobalComm = (step % nstglobalcomm == 0);
 +
 +    if (!comm->bDynLoadBal)
 +    {
 +        bDoDLB = FALSE;
 +    }
 +    else
 +    {
 +        /* Should we do dynamic load balancing this step?
 +         * Since it requires (possibly expensive) global communication,
 +         * we might want to do DLB less frequently.
 +         */
 +        if (bBoxChanged || ir->epc != epcNO)
 +        {
 +            bDoDLB = bBoxChanged;
 +        }
 +        else
 +        {
 +            bDoDLB = bNStGlobalComm;
 +        }
 +    }
 +
 +    /* Check if we have recorded loads on the nodes */
 +    if (comm->bRecordLoad && dd_load_count(comm))
 +    {
 +        if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
 +        {
 +            /* Check if we should use DLB at the second partitioning
 +             * and every 100 partitionings,
 +             * so the extra communication cost is negligible.
 +             */
 +            n         = max(100, nstglobalcomm);
 +            bCheckDLB = (comm->n_load_collect == 0 ||
 +                         comm->n_load_have % n == n-1);
 +        }
 +        else
 +        {
 +            bCheckDLB = FALSE;
 +        }
 +
 +        /* Print the load every nstlog steps, and at the first and last step, to the log file */
 +        bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
 +                    comm->n_load_collect == 0 ||
 +                    (ir->nsteps >= 0 &&
 +                     (step + ir->nstlist > ir->init_step + ir->nsteps)));
 +
 +        /* Avoid extra communication due to verbose screen output
 +         * when nstglobalcomm is set.
 +         */
 +        if (bDoDLB || bLogLoad || bCheckDLB ||
 +            (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
 +        {
 +            get_load_distribution(dd, wcycle);
 +            if (DDMASTER(dd))
 +            {
 +                if (bLogLoad)
 +                {
 +                    dd_print_load(fplog, dd, step-1);
 +                }
 +                if (bVerbose)
 +                {
 +                    dd_print_load_verbose(dd);
 +                }
 +            }
 +            comm->n_load_collect++;
 +
 +            if (bCheckDLB)
 +            {
 +                /* Since the timings are node dependent, the master decides */
 +                if (DDMASTER(dd))
 +                {
 +                    bTurnOnDLB =
 +                        (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
 +                    if (debug)
 +                    {
 +                        fprintf(debug, "step %s, imb loss %f\n",
 +                                gmx_step_str(step, sbuf),
 +                                dd_force_imb_perf_loss(dd));
 +                    }
 +                }
 +                dd_bcast(dd, sizeof(bTurnOnDLB), &bTurnOnDLB);
 +                if (bTurnOnDLB)
 +                {
 +                    turn_on_dlb(fplog, cr, step);
 +                    bDoDLB = TRUE;
 +                }
 +            }
 +        }
 +        comm->n_load_have++;
 +    }
 +
 +    cgs_gl = &comm->cgs_gl;
 +
 +    bRedist = FALSE;
 +    if (bMasterState)
 +    {
 +        /* Clear the old state */
 +        clear_dd_indices(dd, 0, 0);
 +        ncgindex_set = 0;
 +
 +        set_ddbox(dd, bMasterState, cr, ir, state_global->box,
 +                  TRUE, cgs_gl, state_global->x, &ddbox);
 +
 +        get_cg_distribution(fplog, step, dd, cgs_gl,
 +                            state_global->box, &ddbox, state_global->x);
 +
 +        dd_distribute_state(dd, cgs_gl,
 +                            state_global, state_local, f);
 +
 +        dd_make_local_cgs(dd, &top_local->cgs);
 +
 +        /* Ensure that we have space for the new distribution */
 +        dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
 +
 +        if (fr->cutoff_scheme == ecutsGROUP)
 +        {
 +            calc_cgcm(fplog, 0, dd->ncg_home,
 +                      &top_local->cgs, state_local->x, fr->cg_cm);
 +        }
 +
 +        inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
 +
 +        dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
 +    }
 +    else if (state_local->ddp_count != dd->ddp_count)
 +    {
 +        if (state_local->ddp_count > dd->ddp_count)
 +        {
 +            gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
 +        }
 +
 +        if (state_local->ddp_count_cg_gl != state_local->ddp_count)
 +        {
 +            gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
 +        }
 +
 +        /* Clear the old state */
 +        clear_dd_indices(dd, 0, 0);
 +
 +        /* Build the new indices */
 +        rebuild_cgindex(dd, cgs_gl->index, state_local);
 +        make_dd_indices(dd, cgs_gl->index, 0);
 +        ncgindex_set = dd->ncg_home;
 +
 +        if (fr->cutoff_scheme == ecutsGROUP)
 +        {
 +            /* Redetermine the cg COMs */
 +            calc_cgcm(fplog, 0, dd->ncg_home,
 +                      &top_local->cgs, state_local->x, fr->cg_cm);
 +        }
 +
 +        inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
 +
 +        dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
 +
 +        set_ddbox(dd, bMasterState, cr, ir, state_local->box,
 +                  TRUE, &top_local->cgs, state_local->x, &ddbox);
 +
 +        bRedist = comm->bDynLoadBal;
 +    }
 +    else
 +    {
 +        /* We have the full state, only redistribute the cgs */
 +
 +        /* Clear the non-home indices */
 +        clear_dd_indices(dd, dd->ncg_home, dd->nat_home);
 +        ncgindex_set = 0;
 +
 +        /* Avoid global communication for dim's without pbc and -gcom */
 +        if (!bNStGlobalComm)
 +        {
 +            copy_rvec(comm->box0, ddbox.box0    );
 +            copy_rvec(comm->box_size, ddbox.box_size);
 +        }
 +        set_ddbox(dd, bMasterState, cr, ir, state_local->box,
 +                  bNStGlobalComm, &top_local->cgs, state_local->x, &ddbox);
 +
 +        bBoxChanged = TRUE;
 +        bRedist     = TRUE;
 +    }
 +    /* For dim's without pbc and -gcom */
 +    copy_rvec(ddbox.box0, comm->box0    );
 +    copy_rvec(ddbox.box_size, comm->box_size);
 +
 +    set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB,
 +                      step, wcycle);
 +
 +    if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
 +    {
 +        write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
 +    }
 +
 +    /* Check if we should sort the charge groups */
 +    if (comm->nstSortCG > 0)
 +    {
 +        bSortCG = (bMasterState ||
 +                   (bRedist && (step % comm->nstSortCG == 0)));
 +    }
 +    else
 +    {
 +        bSortCG = FALSE;
 +    }
 +
 +    ncg_home_old = dd->ncg_home;
 +
 +    ncg_moved = 0;
 +    if (bRedist)
 +    {
 +        wallcycle_sub_start(wcycle, ewcsDD_REDIST);
 +
 +        dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
 +                           state_local, f, fr,
 +                           !bSortCG, nrnb, &ncgindex_set, &ncg_moved);
 +
 +        wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
 +    }
 +
 +    get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
 +                          dd, &ddbox,
 +                          &comm->cell_x0, &comm->cell_x1,
 +                          dd->ncg_home, fr->cg_cm,
 +                          cell_ns_x0, cell_ns_x1, &grid_density);
 +
 +    if (bBoxChanged)
 +    {
 +        comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
 +    }
 +
 +    switch (fr->cutoff_scheme)
 +    {
 +        case ecutsGROUP:
 +            copy_ivec(fr->ns.grid->n, ncells_old);
 +            grid_first(fplog, fr->ns.grid, dd, &ddbox,
 +                       state_local->box, cell_ns_x0, cell_ns_x1,
 +                       fr->rlistlong, grid_density);
 +            break;
 +        case ecutsVERLET:
 +            nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
 +            break;
 +        default:
 +            gmx_incons("unimplemented");
 +    }
 +    /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
 +    copy_ivec(ddbox.tric_dir, comm->tric_dir);
 +
 +    if (bSortCG)
 +    {
 +        wallcycle_sub_start(wcycle, ewcsDD_GRID);
 +
 +        /* Sort the state on charge group position.
 +         * This enables exact restarts from this step.
 +         * It also improves performance by about 15% with larger numbers
 +         * of atoms per node.
 +         */
 +
 +        /* Fill the ns grid with the home cell,
 +         * so we can sort with the indices.
 +         */
 +        set_zones_ncg_home(dd);
 +
 +        switch (fr->cutoff_scheme)
 +        {
 +            case ecutsVERLET:
 +                set_zones_size(dd, state_local->box, &ddbox, 0, 1);
 +
 +                nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
 +                                  0,
 +                                  comm->zones.size[0].bb_x0,
 +                                  comm->zones.size[0].bb_x1,
 +                                  0, dd->ncg_home,
 +                                  comm->zones.dens_zone0,
 +                                  fr->cginfo,
 +                                  state_local->x,
 +                                  ncg_moved, bRedist ? comm->moved : NULL,
 +                                  fr->nbv->grp[eintLocal].kernel_type,
 +                                  fr->nbv->grp[eintLocal].nbat);
 +
 +                nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
 +                break;
 +            case ecutsGROUP:
 +                fill_grid(&comm->zones, fr->ns.grid, dd->ncg_home,
 +                          0, dd->ncg_home, fr->cg_cm);
 +
 +                copy_ivec(fr->ns.grid->n, ncells_new);
 +                break;
 +            default:
 +                gmx_incons("unimplemented");
 +        }
 +
 +        bResortAll = bMasterState;
 +
 +        /* Check if we can use the old order and ns grid cell indices
 +         * of the charge groups to sort the charge groups efficiently.
 +         */
 +        if (ncells_new[XX] != ncells_old[XX] ||
 +            ncells_new[YY] != ncells_old[YY] ||
 +            ncells_new[ZZ] != ncells_old[ZZ])
 +        {
 +            bResortAll = TRUE;
 +        }
 +
 +        if (debug)
 +        {
 +            fprintf(debug, "Step %s, sorting the %d home charge groups\n",
 +                    gmx_step_str(step, sbuf), dd->ncg_home);
 +        }
 +        dd_sort_state(dd, fr->cg_cm, fr, state_local,
 +                      bResortAll ? -1 : ncg_home_old);
 +        /* Rebuild all the indices */
 +        ga2la_clear(dd->ga2la);
 +        ncgindex_set = 0;
 +
 +        wallcycle_sub_stop(wcycle, ewcsDD_GRID);
 +    }
 +
 +    wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
 +
 +    /* Set up the communication and communicate the coordinates */
 +    setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
 +
 +    /* Set the indices */
 +    make_dd_indices(dd, cgs_gl->index, ncgindex_set);
 +
 +    /* Set the charge group boundaries for neighbor searching */
 +    set_cg_boundaries(&comm->zones);
 +
 +    if (fr->cutoff_scheme == ecutsVERLET)
 +    {
 +        set_zones_size(dd, state_local->box, &ddbox,
 +                       bSortCG ? 1 : 0, comm->zones.n);
 +    }
 +
 +    wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
 +
 +    /*
 +       write_dd_pdb("dd_home",step,"dump",top_global,cr,
 +                 -1,state_local->x,state_local->box);
 +     */
 +
 +    wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
 +
 +    /* Extract a local topology from the global topology */
 +    for (i = 0; i < dd->ndim; i++)
 +    {
 +        np[dd->dim[i]] = comm->cd[i].np;
 +    }
 +    dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box,
 +                      comm->cellsize_min, np,
 +                      fr,
 +                      fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : state_local->x,
 +                      vsite, top_global, top_local);
 +
 +    wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
 +
 +    wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
 +
 +    /* Set up the special atom communication */
 +    n = comm->nat[ddnatZONE];
 +    for (i = ddnatZONE+1; i < ddnatNR; i++)
 +    {
 +        switch (i)
 +        {
 +            case ddnatVSITE:
 +                if (vsite && vsite->n_intercg_vsite)
 +                {
 +                    n = dd_make_local_vsites(dd, n, top_local->idef.il);
 +                }
 +                break;
 +            case ddnatCON:
 +                if (dd->bInterCGcons || dd->bInterCGsettles)
 +                {
 +                    /* Only for inter-cg constraints do we need special code */
 +                    n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
 +                                                  constr, ir->nProjOrder,
 +                                                  top_local->idef.il);
 +                }
 +                break;
 +            default:
 +                gmx_incons("Unknown special atom type setup");
 +        }
 +        comm->nat[i] = n;
 +    }
 +
 +    wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
 +
 +    wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
 +
 +    /* Make space for the extra coordinates for virtual site
 +     * or constraint communication.
 +     */
 +    state_local->natoms = comm->nat[ddnatNR-1];
 +    if (state_local->natoms > state_local->nalloc)
 +    {
 +        dd_realloc_state(state_local, f, state_local->natoms);
 +    }
 +
 +    if (fr->bF_NoVirSum)
 +    {
 +        if (vsite && vsite->n_intercg_vsite)
 +        {
 +            nat_f_novirsum = comm->nat[ddnatVSITE];
 +        }
 +        else
 +        {
 +            if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
 +            {
 +                nat_f_novirsum = dd->nat_tot;
 +            }
 +            else
 +            {
 +                nat_f_novirsum = dd->nat_home;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        nat_f_novirsum = 0;
 +    }
 +
 +    /* Set the number of atoms required for the force calculation.
 +     * Forces need to be constrained when using a twin-range setup
 +     * or with energy minimization. For simple simulations we could
 +     * avoid some allocation, zeroing and copying, but this is
 +     * probably not worth the complications and checking.
 +     */
 +    forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
 +                        dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);
 +
 +    /* We make all the mdatoms up to nat_tot_con.
 +     * We could save some work by only setting invmass
 +     * between nat_tot and nat_tot_con.
 +     */
 +    /* This call also sets the new number of home particles to dd->nat_home */
 +    atoms2md(top_global, ir,
 +             comm->nat[ddnatCON], dd->gatindex, 0, dd->nat_home, mdatoms);
 +
 +    /* Now that we have the charges, we can sort the FE interactions */
 +    dd_sort_local_top(dd, mdatoms, top_local);
 +
 +    if (vsite != NULL)
 +    {
 +        /* Now that mdatoms is updated, we can do the last vsite bookkeeping */
 +        split_vsites_over_threads(top_local->idef.il, mdatoms, FALSE, vsite);
 +    }
 +
 +    if (shellfc)
 +    {
 +        /* Make the local shell data; currently no communication is done */
 +        make_local_shells(cr, mdatoms, shellfc);
 +    }
 +
 +    if (ir->implicit_solvent)
 +    {
 +        make_local_gb(cr, fr->born, ir->gb_algorithm);
 +    }
 +
 +    init_bonded_thread_force_reduction(fr, &top_local->idef);
 +
 +    if (!(cr->duty & DUTY_PME))
 +    {
 +        /* Send the charges to our PME-only node */
 +        gmx_pme_send_q(cr, mdatoms->nChargePerturbed,
 +                       mdatoms->chargeA, mdatoms->chargeB,
 +                       dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
 +    }
 +
 +    if (constr)
 +    {
 +        set_constraints(constr, top_local, ir, mdatoms, cr);
 +    }
 +
 +    if (ir->ePull != epullNO)
 +    {
 +        /* Update the local pull groups */
 +        dd_make_local_pull_groups(dd, ir->pull, mdatoms);
 +    }
 +
 +    if (ir->bRot)
 +    {
 +        /* Update the local rotation groups */
 +        dd_make_local_rotation_groups(dd, ir->rot);
 +    }
 +
 +
 +    add_dd_statistics(dd);
 +
 +    /* Make sure we only count the cycles for this DD partitioning */
 +    clear_dd_cycle_counts(dd);
 +
 +    /* Because the order of the atoms might have changed since
 +     * the last vsite construction, we need to communicate the constructing
 +     * atom coordinates again (for spreading the forces this MD step).
 +     */
 +    dd_move_x_vsites(dd, state_local->box, state_local->x);
 +
 +    wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
 +
 +    if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
 +    {
 +        dd_move_x(dd, state_local->box, state_local->x);
 +        write_dd_pdb("dd_dump", step, "dump", top_global, cr,
 +                     -1, state_local->x, state_local->box);
 +    }
 +
 +    /* Store the partitioning step */
 +    comm->partition_step = step;
 +
 +    /* Increase the DD partitioning counter */
 +    dd->ddp_count++;
 +    /* The state currently matches this DD partitioning count, store it */
 +    state_local->ddp_count = dd->ddp_count;
 +    if (bMasterState)
 +    {
 +        /* The DD master node knows the complete cg distribution,
 +         * store the count so we can possibly skip the cg info communication.
 +         */
 +        comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
 +    }
 +
 +    if (comm->DD_debug > 0)
 +    {
 +        /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
 +        check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
 +                                "after partitioning");
 +    }
 +}
index 45e6ce7a901b6883a76fe8f2ba2d0baee0579155,0000000000000000000000000000000000000000..9235b68fba3e385f79b0163afa5947b46d82e711
mode 100644,000000..100644
--- /dev/null
@@@ -1,1431 -1,0 +1,1435 @@@
- void GenerateGibbsProbabilities(real *ene, real *p_k, real *pks, int minfep, int maxfep)
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#ifdef GMX_CRAY_XT3
 +#include <catamount/dclock.h>
 +#endif
 +
 +
 +#include <stdio.h>
 +#include <time.h>
 +#ifdef HAVE_SYS_TIME_H
 +#include <sys/time.h>
 +#endif
 +#include <math.h>
 +#include "typedefs.h"
 +#include "string2.h"
 +#include "gmxfio.h"
 +#include "smalloc.h"
 +#include "names.h"
 +#include "confio.h"
 +#include "mvdata.h"
 +#include "txtdump.h"
 +#include "pbc.h"
 +#include "chargegroup.h"
 +#include "vec.h"
 +#include "nrnb.h"
 +#include "mshift.h"
 +#include "mdrun.h"
 +#include "update.h"
 +#include "physics.h"
 +#include "main.h"
 +#include "mdatoms.h"
 +#include "force.h"
 +#include "bondf.h"
 +#include "pme.h"
 +#include "disre.h"
 +#include "orires.h"
 +#include "network.h"
 +#include "calcmu.h"
 +#include "constr.h"
 +#include "xvgr.h"
 +#include "trnio.h"
 +#include "xtcio.h"
 +#include "gmx_random.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "gmx_wallcycle.h"
 +#include "macros.h"
 +
 +#include "gromacs/utility/gmxmpi.h"
 +
- void GenerateWeightedGibbsProbabilities(real *ene, real *p_k, real *pks, int nlim, real *nvals, real delta)
++void GenerateGibbsProbabilities(real *ene, double *p_k, double *pks, int minfep, int maxfep)
 +{
 +
 +    int  i;
 +    real maxene;
 +
 +    *pks   = 0.0;
 +    maxene = ene[minfep];
 +    /* find the maximum value */
 +    for (i = minfep; i <= maxfep; i++)
 +    {
 +        if (ene[i] > maxene)
 +        {
 +            maxene = ene[i];
 +        }
 +    }
 +    /* find the denominator */
 +    for (i = minfep; i <= maxfep; i++)
 +    {
 +        *pks += exp(ene[i]-maxene);
 +    }
 +    /*numerators*/
 +    for (i = minfep; i <= maxfep; i++)
 +    {
 +        p_k[i] = exp(ene[i]-maxene) / *pks;
 +    }
 +}
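
The max-subtraction in GenerateGibbsProbabilities is the standard numerically stable softmax. A minimal self-contained sketch of the same trick (stable_softmax is a hypothetical name, not a GROMACS function):

#include <math.h>

/* Stable softmax sketch: shifting every argument by the maximum leaves
 * all exponents <= 0, so exp() cannot overflow, while the normalized
 * probabilities are mathematically unchanged. */
static void stable_softmax(const double *ene, double *p, int n)
{
    double emax = ene[0], z = 0.0;
    int    i;

    for (i = 1; i < n; i++)
    {
        if (ene[i] > emax)
        {
            emax = ene[i];
        }
    }
    for (i = 0; i < n; i++)
    {
        z += exp(ene[i] - emax);
    }
    for (i = 0; i < n; i++)
    {
        p[i] = exp(ene[i] - emax) / z;
    }
}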
 +
-     real     de, de_function, dr, denom, maxdr, pks = 0;
++void GenerateWeightedGibbsProbabilities(real *ene, double *p_k, double *pks, int nlim, real *nvals, real delta)
 +{
 +
 +    int   i;
 +    real  maxene;
 +    real *nene;
 +    *pks = 0.0;
 +
 +    snew(nene, nlim);
 +    for (i = 0; i < nlim; i++)
 +    {
 +        if (nvals[i] == 0)
 +        {
 +            /* add delta, so that the argument of the log below is
 +               greater than zero */
 +            nene[i] = ene[i] + log(nvals[i]+delta);
 +        }
 +        else
 +        {
 +            nene[i] = ene[i] + log(nvals[i]);
 +        }
 +    }
 +
 +    /* find the maximum value */
 +    maxene = nene[0];
 +    for (i = 0; i < nlim; i++)
 +    {
 +        if (nene[i] > maxene)
 +        {
 +            maxene = nene[i];
 +        }
 +    }
 +
 +    /* subtract off the maximum, avoiding overflow */
 +    for (i = 0; i < nlim; i++)
 +    {
 +        nene[i] -= maxene;
 +    }
 +
 +    /* find the denominator */
 +    for (i = 0; i < nlim; i++)
 +    {
 +        *pks += exp(nene[i]);
 +    }
 +
 +    /*numerators*/
 +    for (i = 0; i < nlim; i++)
 +    {
 +        p_k[i] = exp(nene[i]) / *pks;
 +    }
 +    sfree(nene);
 +}
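
In the weighted variant above, adding log(nvals[i]) to the reduced energy before the softmax is equivalent to weighting each state's Boltzmann factor by its histogram count (with delta substituted for empty bins so the logarithm stays finite):

    p_k[i] ∝ N_i exp(ene[i]),   where N_i -> delta when N_i = 0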
 +
 +real do_logsum(int N, real *a_n)
 +{
 +
 +    /*     RETURN VALUE */
 +    /* log(\sum_{i=0}^(N-1) exp[a_n]) */
 +    real maxarg;
 +    real sum;
 +    int  i;
 +    real logsum;
 +    /*     compute maximum argument to exp(.) */
 +
 +    maxarg = a_n[0];
 +    for (i = 1; i < N; i++)
 +    {
 +        maxarg = max(maxarg, a_n[i]);
 +    }
 +
 +    /* compute sum of exp(a_n - maxarg) */
 +    sum = 0.0;
 +    for (i = 0; i < N; i++)
 +    {
 +        sum = sum + exp(a_n[i] - maxarg);
 +    }
 +
 +    /*     compute log sum */
 +    logsum = log(sum) + maxarg;
 +    return logsum;
 +}
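
do_logsum is the usual log-sum-exp identity; with m = max_i a_i,

    log( sum_{i=0}^{N-1} exp(a_i) ) = m + log( sum_{i=0}^{N-1} exp(a_i - m) ),

so the sum is evaluated with all exponents <= 0.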
 +
 +int FindMinimum(real *min_metric, int N)
 +{
 +
 +    real min_val;
 +    int  min_nval, nval;
 +
 +    min_nval = 0;
 +    min_val  = min_metric[0];
 +
 +    for (nval = 0; nval < N; nval++)
 +    {
 +        if (min_metric[nval] < min_val)
 +        {
 +            min_val  = min_metric[nval];
 +            min_nval = nval;
 +        }
 +    }
 +    return min_nval;
 +}
 +
 +static gmx_bool CheckHistogramRatios(int nhisto, real *histo, real ratio)
 +{
 +
 +    int      i;
 +    real     nmean;
 +    gmx_bool bIfFlat;
 +
 +    nmean = 0;
 +    for (i = 0; i < nhisto; i++)
 +    {
 +        nmean += histo[i];
 +    }
 +
 +    if (nmean == 0)
 +    {
 +        /* no samples! that is bad! */
 +        bIfFlat = FALSE;
 +        return bIfFlat;
 +    }
 +    nmean /= (real)nhisto;
 +
 +    bIfFlat = TRUE;
 +    for (i = 0; i < nhisto; i++)
 +    {
 +        /* make sure that all points are in the ratio < x <  1/ratio range  */
 +        if (!((histo[i]/nmean < 1.0/ratio) && (histo[i]/nmean > ratio)))
 +        {
 +            bIfFlat = FALSE;
 +            break;
 +        }
 +    }
 +    return bIfFlat;
 +}
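
A hedged usage sketch of the flatness test above (the values are invented for illustration): a histogram counts as flat when every bin lies strictly within (ratio, 1/ratio) of the mean.

/* With ratio = 0.8, each bin must stay between 80% and 125% of the
 * mean count for the histogram to count as flat. */
real     histo[4] = { 90.0, 110.0, 105.0, 95.0 };  /* mean = 100 */
gmx_bool bFlat    = CheckHistogramRatios(4, histo, 0.8);
/* 90/100 = 0.90 > 0.8 and 110/100 = 1.10 < 1.25, so bFlat == TRUE */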
 +
 +static gmx_bool CheckIfDoneEquilibrating(int nlim, t_expanded *expand, df_history_t *dfhist, gmx_large_int_t step)
 +{
 +
 +    int      i, totalsamples;
 +    gmx_bool bDoneEquilibrating = TRUE;
 +    gmx_bool bIfFlat;
 +
 +    /* assume we have equilibrated the weights, then check to see if any of the conditions are not met */
 +
 +    /* calculate the total number of samples */
 +    switch (expand->elmceq)
 +    {
 +        case elmceqNO:
 +            /* We have not equilibrated, and won't, ever. */
 +            return FALSE;
 +        case elmceqYES:
 +            /* we have equilibrated -- we're done */
 +            return TRUE;
 +        case elmceqSTEPS:
 +            /* if we are equilibrating by steps, check whether we are still under the limit */
 +            if (step < expand->equil_steps)
 +            {
 +                bDoneEquilibrating = FALSE;
 +            }
 +            break;
 +        case elmceqSAMPLES:
 +            totalsamples = 0;
 +            for (i = 0; i < nlim; i++)
 +            {
 +                totalsamples += dfhist->n_at_lam[i];
 +            }
 +            if (totalsamples < expand->equil_samples)
 +            {
 +                bDoneEquilibrating = FALSE;
 +            }
 +            break;
 +        case elmceqNUMATLAM:
 +            for (i = 0; i < nlim; i++)
 +            {
 +                if (dfhist->n_at_lam[i] < expand->equil_n_at_lam) /* not enough samples at this lambda yet,
 +                                                                     so we're not done equilibrating */
 +                {
 +                    bDoneEquilibrating  = FALSE;
 +                    break;
 +                }
 +            }
 +            break;
 +        case elmceqWLDELTA:
 +            if (EWL(expand->elamstats)) /* This check is in readir as well, but
 +                                           just to be sure */
 +            {
 +                if (dfhist->wl_delta > expand->equil_wl_delta)
 +                {
 +                    bDoneEquilibrating = FALSE;
 +                }
 +            }
 +            break;
 +        case elmceqRATIO:
 +            /* we can use the flatness as a judge of good weights, as long as
 +               we're not doing minvar or Wang-Landau.
 +               But turn off for now until we figure out exactly how we do this.
 +             */
 +
 +            if (!(EWL(expand->elamstats) || expand->elamstats == elamstatsMINVAR))
 +            {
 +                /* we want to use flatness -avoiding- the forced-through samples.  Plus, we need to convert to
 +                   floats for this histogram function. */
 +
 +                real *modhisto;
 +                snew(modhisto, nlim);
 +                for (i = 0; i < nlim; i++)
 +                {
 +                    modhisto[i] = 1.0*(dfhist->n_at_lam[i]-expand->lmc_forced_nstart);
 +                }
 +                bIfFlat = CheckHistogramRatios(nlim, modhisto, expand->equil_ratio);
 +                sfree(modhisto);
 +                if (!bIfFlat)
 +                {
 +                    bDoneEquilibrating = FALSE;
 +                }
 +            }
 +            break;
 +        default:
 +            bDoneEquilibrating = TRUE;
 +    }
 +    /* one last case to go through: if we are doing slow growth to get initial values, we haven't finished equilibrating */
 +
 +    if (expand->lmc_forced_nstart > 0)
 +    {
 +        for (i = 0; i < nlim; i++)
 +        {
 +            if (dfhist->n_at_lam[i] < expand->lmc_forced_nstart) /* we are still doing the initial sweep, so we're definitely not
 +                                                                    done equilibrating*/
 +            {
 +                bDoneEquilibrating = FALSE;
 +                break;
 +            }
 +        }
 +    }
 +    return bDoneEquilibrating;
 +}
 +
 +static gmx_bool UpdateWeights(int nlim, t_expanded *expand, df_history_t *dfhist,
 +                              int fep_state, real *scaled_lamee, real *weighted_lamee, gmx_large_int_t step)
 +{
 +    real     maxdiff = 0.000000001;
 +    gmx_bool bSufficientSamples;
 +    int      i, k, n, nz, indexi, indexk, min_n, max_n, nlam, totali;
 +    int      n0, np1, nm1, nval, min_nvalm, min_nvalp, maxc;
 +    real     omega_m1_0, omega_p1_m1, omega_m1_p1, omega_p1_0, clam_osum;
-     real    *lam_weights, *lam_minvar_corr, *lam_variance, *lam_dg, *p_k;
++    real     de, de_function, dr, denom, maxdr;
 +    real     min_val, cnval, zero_sum_weights;
 +    real    *omegam_array, *weightsm_array, *omegap_array, *weightsp_array, *varm_array, *varp_array, *dwp_array, *dwm_array;
 +    real     clam_varm, clam_varp, clam_weightsm, clam_weightsp, clam_minvar;
-                 dfhist->wl_histo[i] += p_k[i];
++    real    *lam_weights, *lam_minvar_corr, *lam_variance, *lam_dg;
++    double  *p_k;
++    double  pks = 0;
 +    real    *numweighted_lamee, *logfrac;
 +    int     *nonzero;
 +    real     chi_m1_0, chi_p1_0, chi_m2_0, chi_p2_0, chi_p1_m1, chi_p2_m1, chi_m1_p1, chi_m2_p1;
 +
 +    /* if we have equilibrated the weights, exit now */
 +    if (dfhist->bEquil)
 +    {
 +        return FALSE;
 +    }
 +
 +    if (CheckIfDoneEquilibrating(nlim, expand, dfhist, step))
 +    {
 +        dfhist->bEquil = TRUE;
 +        /* zero out the visited states so we know how many equilibrated states we have
 +           from here on out.*/
 +        for (i = 0; i < nlim; i++)
 +        {
 +            dfhist->n_at_lam[i] = 0;
 +        }
 +        return TRUE;
 +    }
 +
 +    /* If we got this far, we have not equilibrated yet; keep going
 +       and update the weights */
 +
 +    if (EWL(expand->elamstats))
 +    {
 +        if (expand->elamstats == elamstatsWL)  /* Standard Wang-Landau */
 +        {
 +            dfhist->sum_weights[fep_state] -= dfhist->wl_delta;
 +            dfhist->wl_histo[fep_state]    += 1.0;
 +        }
 +        else if (expand->elamstats == elamstatsWWL) /* Weighted Wang-Landau */
 +        {
 +            snew(p_k, nlim);
 +
 +            /* first increment count */
 +            GenerateGibbsProbabilities(weighted_lamee, p_k, &pks, 0, nlim-1);
 +            for (i = 0; i < nlim; i++)
 +            {
-                 dfhist->sum_weights[i] -= dfhist->wl_delta*p_k[i];
++                dfhist->wl_histo[i] += (real)p_k[i];
 +            }
 +
 +            /* then increment weights (uses count) */
 +            pks = 0.0;
 +            GenerateWeightedGibbsProbabilities(weighted_lamee, p_k, &pks, nlim, dfhist->wl_histo, dfhist->wl_delta);
 +
 +            for (i = 0; i < nlim; i++)
 +            {
-                 di = 1+dfhist->wl_delta*p_k[i];
++                dfhist->sum_weights[i] -= dfhist->wl_delta*(real)p_k[i];
 +            }
 +            /* Alternate definition, using logarithms. Shouldn't make very much difference! */
 +            /*
 +               real di;
 +               for (i=0;i<nlim;i++)
 +               {
- static int ChooseNewLambda(int nlim, t_expanded *expand, df_history_t *dfhist, int fep_state, real *weighted_lamee, real *p_k, gmx_rng_t rng)
++                di = (real)1.0 + dfhist->wl_delta*(real)p_k[i];
 +                dfhist->sum_weights[i] -= log(di);
 +               }
 +             */
 +            sfree(p_k);
 +        }
 +
 +        zero_sum_weights =  dfhist->sum_weights[0];
 +        for (i = 0; i < nlim; i++)
 +        {
 +            dfhist->sum_weights[i] -= zero_sum_weights;
 +        }
 +    }
 +
 +    if (expand->elamstats == elamstatsBARKER || expand->elamstats == elamstatsMETROPOLIS || expand->elamstats == elamstatsMINVAR)
 +    {
 +
 +        de_function = 0;  /* to get rid of warnings, but this value will not be used because of the logic */
 +        maxc        = 2*expand->c_range+1;
 +
 +        snew(lam_dg, nlim);
 +        snew(lam_variance, nlim);
 +
 +        snew(omegap_array, maxc);
 +        snew(weightsp_array, maxc);
 +        snew(varp_array, maxc);
 +        snew(dwp_array, maxc);
 +
 +        snew(omegam_array, maxc);
 +        snew(weightsm_array, maxc);
 +        snew(varm_array, maxc);
 +        snew(dwm_array, maxc);
 +
 +        /* unpack the current lambdas -- we will only update 2 of these */
 +
 +        for (i = 0; i < nlim-1; i++)
 +        {   /* only through the second to last */
 +            lam_dg[i]       = dfhist->sum_dg[i+1] - dfhist->sum_dg[i];
 +            lam_variance[i] = pow(dfhist->sum_variance[i+1], 2) - pow(dfhist->sum_variance[i], 2);
 +        }
 +
 +        /* accumulate running averages */
 +        for (nval = 0; nval < maxc; nval++)
 +        {
 +            /* constants for later use */
 +            cnval = (real)(nval-expand->c_range);
 +            /* actually, should be able to rewrite it w/o exponential, for better numerical stability */
 +            if (fep_state > 0)
 +            {
 +                de = exp(cnval - (scaled_lamee[fep_state]-scaled_lamee[fep_state-1]));
 +                if (expand->elamstats == elamstatsBARKER || expand->elamstats == elamstatsMINVAR)
 +                {
 +                    de_function = 1.0/(1.0+de);
 +                }
 +                else if (expand->elamstats == elamstatsMETROPOLIS)
 +                {
 +                    if (de < 1.0)
 +                    {
 +                        de_function = 1.0;
 +                    }
 +                    else
 +                    {
 +                        de_function = 1.0/de;
 +                    }
 +                }
 +                dfhist->accum_m[fep_state][nval]  += de_function;
 +                dfhist->accum_m2[fep_state][nval] += de_function*de_function;
 +            }
 +
 +            if (fep_state < nlim-1)
 +            {
 +                de = exp(-cnval + (scaled_lamee[fep_state+1]-scaled_lamee[fep_state]));
 +                if (expand->elamstats == elamstatsBARKER || expand->elamstats == elamstatsMINVAR)
 +                {
 +                    de_function = 1.0/(1.0+de);
 +                }
 +                else if (expand->elamstats == elamstatsMETROPOLIS)
 +                {
 +                    if (de < 1.0)
 +                    {
 +                        de_function = 1.0;
 +                    }
 +                    else
 +                    {
 +                        de_function = 1.0/de;
 +                    }
 +                }
 +                dfhist->accum_p[fep_state][nval]  += de_function;
 +                dfhist->accum_p2[fep_state][nval] += de_function*de_function;
 +            }
 +
 +            /* Metropolis transition and Barker transition (unoptimized Bennett) acceptance weight determination */
 +
 +            n0  = dfhist->n_at_lam[fep_state];
 +            if (fep_state > 0)
 +            {
 +                nm1 = dfhist->n_at_lam[fep_state-1];
 +            }
 +            else
 +            {
 +                nm1 = 0;
 +            }
 +            if (fep_state < nlim-1)
 +            {
 +                np1 = dfhist->n_at_lam[fep_state+1];
 +            }
 +            else
 +            {
 +                np1 = 0;
 +            }
 +
 +            /* the logic SHOULD keep these all set correctly, but the compiler can't verify that, so initialize them */
 +            chi_m1_0 = chi_p1_0 = chi_m2_0 = chi_p2_0 = chi_p1_m1 = chi_p2_m1 = chi_m1_p1 = chi_m2_p1 = 0;
 +
 +            if (n0 > 0)
 +            {
 +                chi_m1_0 = dfhist->accum_m[fep_state][nval]/n0;
 +                chi_p1_0 = dfhist->accum_p[fep_state][nval]/n0;
 +                chi_m2_0 = dfhist->accum_m2[fep_state][nval]/n0;
 +                chi_p2_0 = dfhist->accum_p2[fep_state][nval]/n0;
 +            }
 +
 +            if ((fep_state > 0 ) && (nm1 > 0))
 +            {
 +                chi_p1_m1 = dfhist->accum_p[fep_state-1][nval]/nm1;
 +                chi_p2_m1 = dfhist->accum_p2[fep_state-1][nval]/nm1;
 +            }
 +
 +            if ((fep_state < nlim-1) && (np1 > 0))
 +            {
 +                chi_m1_p1 = dfhist->accum_m[fep_state+1][nval]/np1;
 +                chi_m2_p1 = dfhist->accum_m2[fep_state+1][nval]/np1;
 +            }
 +
 +            omega_m1_0    = 0;
 +            omega_p1_0    = 0;
 +            clam_weightsm = 0;
 +            clam_weightsp = 0;
 +            clam_varm     = 0;
 +            clam_varp     = 0;
 +
 +            if (fep_state > 0)
 +            {
 +                if (n0 > 0)
 +                {
 +                    omega_m1_0 = chi_m2_0/(chi_m1_0*chi_m1_0) - 1.0;
 +                }
 +                if (nm1 > 0)
 +                {
 +                    omega_p1_m1 = chi_p2_m1/(chi_p1_m1*chi_p1_m1) - 1.0;
 +                }
 +                if ((n0 > 0) && (nm1 > 0))
 +                {
 +                    clam_weightsm = (log(chi_m1_0) - log(chi_p1_m1)) + cnval;
 +                    clam_varm     = (1.0/n0)*(omega_m1_0) + (1.0/nm1)*(omega_p1_m1);
 +                }
 +            }
 +
 +            if (fep_state < nlim-1)
 +            {
 +                if (n0 > 0)
 +                {
 +                    omega_p1_0 = chi_p2_0/(chi_p1_0*chi_p1_0) - 1.0;
 +                }
 +                if (np1 > 0)
 +                {
 +                    omega_m1_p1 = chi_m2_p1/(chi_m1_p1*chi_m1_p1) - 1.0;
 +                }
 +                if ((n0 > 0) && (np1 > 0))
 +                {
 +                    clam_weightsp = (log(chi_m1_p1) - log(chi_p1_0)) + cnval;
 +                    clam_varp     = (1.0/np1)*(omega_m1_p1) + (1.0/n0)*(omega_p1_0);
 +                }
 +            }
 +
 +            if (n0 > 0)
 +            {
 +                omegam_array[nval]             = omega_m1_0;
 +            }
 +            else
 +            {
 +                omegam_array[nval]             = 0;
 +            }
 +            weightsm_array[nval]           = clam_weightsm;
 +            varm_array[nval]               = clam_varm;
 +            if (nm1 > 0)
 +            {
 +                dwm_array[nval]  = fabs( (cnval + log((1.0*n0)/nm1)) - lam_dg[fep_state-1] );
 +            }
 +            else
 +            {
 +                dwm_array[nval]  = fabs( cnval - lam_dg[fep_state-1] );
 +            }
 +
 +            if (n0 > 0)
 +            {
 +                omegap_array[nval]             = omega_p1_0;
 +            }
 +            else
 +            {
 +                omegap_array[nval]             = 0;
 +            }
 +            weightsp_array[nval]           = clam_weightsp;
 +            varp_array[nval]               = clam_varp;
 +            if ((np1 > 0) && (n0 > 0))
 +            {
 +                dwp_array[nval]  = fabs( (cnval + log((1.0*np1)/n0)) - lam_dg[fep_state] );
 +            }
 +            else
 +            {
 +                dwp_array[nval]  = fabs( cnval - lam_dg[fep_state] );
 +            }
 +
 +        }
 +
 +        /* find the C's closest to the old weights value */
 +
 +        min_nvalm     = FindMinimum(dwm_array, maxc);
 +        omega_m1_0    = omegam_array[min_nvalm];
 +        clam_weightsm = weightsm_array[min_nvalm];
 +        clam_varm     = varm_array[min_nvalm];
 +
 +        min_nvalp     = FindMinimum(dwp_array, maxc);
 +        omega_p1_0    = omegap_array[min_nvalp];
 +        clam_weightsp = weightsp_array[min_nvalp];
 +        clam_varp     = varp_array[min_nvalp];
 +
 +        clam_osum   = omega_m1_0 + omega_p1_0;
 +        clam_minvar = 0;
 +        if (clam_osum > 0)
 +        {
 +            clam_minvar = 0.5*log(clam_osum);
 +        }
 +
 +        if (fep_state > 0)
 +        {
 +            lam_dg[fep_state-1]       = clam_weightsm;
 +            lam_variance[fep_state-1] = clam_varm;
 +        }
 +
 +        if (fep_state < nlim-1)
 +        {
 +            lam_dg[fep_state]       = clam_weightsp;
 +            lam_variance[fep_state] = clam_varp;
 +        }
 +
 +        if (expand->elamstats == elamstatsMINVAR)
 +        {
 +            bSufficientSamples = TRUE;
 +            /* make sure they are all past a threshold */
 +            for (i = 0; i < nlim; i++)
 +            {
 +                if (dfhist->n_at_lam[i] < expand->minvarmin)
 +                {
 +                    bSufficientSamples = FALSE;
 +                }
 +            }
 +            if (bSufficientSamples)
 +            {
 +                dfhist->sum_minvar[fep_state] = clam_minvar;
 +                if (fep_state == 0)
 +                {
 +                    for (i = 0; i < nlim; i++)
 +                    {
 +                        dfhist->sum_minvar[i] += (expand->minvar_const-clam_minvar);
 +                    }
 +                    expand->minvar_const          = clam_minvar;
 +                    dfhist->sum_minvar[fep_state] = 0.0;
 +                }
 +                else
 +                {
 +                    dfhist->sum_minvar[fep_state] -= expand->minvar_const;
 +                }
 +            }
 +        }
 +
 +        /* we need to rezero minvar now, since it could change at fep_state = 0 */
 +        dfhist->sum_dg[0]       = 0.0;
 +        dfhist->sum_variance[0] = 0.0;
 +        dfhist->sum_weights[0]  = dfhist->sum_dg[0] + dfhist->sum_minvar[0]; /* should be zero */
 +
 +        for (i = 1; i < nlim; i++)
 +        {
 +            dfhist->sum_dg[i]       = lam_dg[i-1] + dfhist->sum_dg[i-1];
 +            dfhist->sum_variance[i] = sqrt(lam_variance[i-1] + pow(dfhist->sum_variance[i-1], 2));
 +            dfhist->sum_weights[i]  = dfhist->sum_dg[i] + dfhist->sum_minvar[i];
 +        }
 +
 +        sfree(lam_dg);
 +        sfree(lam_variance);
 +
 +        sfree(omegam_array);
 +        sfree(weightsm_array);
 +        sfree(varm_array);
 +        sfree(dwm_array);
 +
 +        sfree(omegap_array);
 +        sfree(weightsp_array);
 +        sfree(varp_array);
 +        sfree(dwp_array);
 +    }
 +    return FALSE;
 +}
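
For reference, the standard Wang-Landau branch above boils down to a two-line update per visit plus a flatness-triggered shrink of the increment. A minimal sketch under that reading (hypothetical stand-alone helpers using plain doubles instead of the dfhist structures):

/* Bias the visited state down and count the visit. */
static void wl_visit(double *weights, double *histo, int state, double wl_delta)
{
    weights[state] -= wl_delta; /* visited states become less likely */
    histo[state]   += 1.0;      /* count the visit */
}

/* Once the histogram is flat, reset it and shrink the increment,
 * as the code above does with expand->wl_scale. */
static double wl_shrink(double *histo, int n, double wl_delta, double wl_scale)
{
    int i;

    for (i = 0; i < n; i++)
    {
        histo[i] = 0;
    }
    return wl_delta*wl_scale; /* e.g. wl_scale = 0.8 */
}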
 +
-     real     r1, r2, pks, de_old, de_new, de, trialprob, tprob = 0;
++static int ChooseNewLambda(int nlim, t_expanded *expand, df_history_t *dfhist, int fep_state, real *weighted_lamee, double *p_k, gmx_rng_t rng)
 +{
 +    /* Choose new lambda value, and update transition matrix */
 +
 +    int      i, ifep, jfep, minfep, maxfep, lamnew, lamtrial, starting_fep_state;
-     real    *propose, *accept, *remainder;
++    real     r1, r2, de_old, de_new, de, trialprob, tprob = 0;
 +    real   **Tij;
-                 if (remainder[fep_state] < 2.0e-15)
++    double  *propose, *accept, *remainder;
++    double   pks;
 +    real     sum, pnorm;
 +    gmx_bool bRestricted;
 +
 +    starting_fep_state = fep_state;
 +    lamnew             = fep_state; /* so that there is a default setting -- stays the same */
 +
 +    if (!EWL(expand->elamstats))    /* ignore equilibrating the weights if using WL */
 +    {
 +        if ((expand->lmc_forced_nstart > 0) && (dfhist->n_at_lam[nlim-1] <= expand->lmc_forced_nstart))
 +        {
 +            /* Use a marching method to run through the lambdas and get preliminary free energy data,
 +               before starting 'free' sampling.  We start free sampling when we have enough at each lambda */
 +
 +            /* if we have enough at this lambda, move on to the next one */
 +
 +            if (dfhist->n_at_lam[fep_state] == expand->lmc_forced_nstart)
 +            {
 +                lamnew = fep_state+1;
 +                if (lamnew == nlim)  /* whoops, stepped too far! */
 +                {
 +                    lamnew -= 1;
 +                }
 +            }
 +            else
 +            {
 +                lamnew = fep_state;
 +            }
 +            return lamnew;
 +        }
 +    }
 +
 +    snew(propose, nlim);
 +    snew(accept, nlim);
 +    snew(remainder, nlim);
 +
 +    for (i = 0; i < expand->lmc_repeats; i++)
 +    {
 +
 +        for (ifep = 0; ifep < nlim; ifep++)
 +        {
 +            propose[ifep] = 0;
 +            accept[ifep]  = 0;
 +        }
 +
 +        if ((expand->elmcmove == elmcmoveGIBBS) || (expand->elmcmove == elmcmoveMETGIBBS))
 +        {
 +            bRestricted = TRUE;
 +            /* use the Gibbs sampler, with restricted range */
 +            if (expand->gibbsdeltalam < 0)
 +            {
 +                minfep      = 0;
 +                maxfep      = nlim-1;
 +                bRestricted = FALSE;
 +            }
 +            else
 +            {
 +                minfep = fep_state - expand->gibbsdeltalam;
 +                maxfep = fep_state + expand->gibbsdeltalam;
 +                if (minfep < 0)
 +                {
 +                    minfep = 0;
 +                }
 +                if (maxfep > nlim-1)
 +                {
 +                    maxfep = nlim-1;
 +                }
 +            }
 +
 +            GenerateGibbsProbabilities(weighted_lamee, p_k, &pks, minfep, maxfep);
 +
 +            if (expand->elmcmove == elmcmoveGIBBS)
 +            {
 +                for (ifep = minfep; ifep <= maxfep; ifep++)
 +                {
 +                    propose[ifep] = p_k[ifep];
 +                    accept[ifep]  = 1.0;
 +                }
 +                /* Gibbs sampling */
 +                r1 = gmx_rng_uniform_real(rng);
 +                for (lamnew = minfep; lamnew <= maxfep; lamnew++)
 +                {
 +                    if (r1 <= p_k[lamnew])
 +                    {
 +                        break;
 +                    }
 +                    r1 -= p_k[lamnew];
 +                }
 +            }
 +            else if (expand->elmcmove == elmcmoveMETGIBBS)
 +            {
 +
 +                /* Metropolized Gibbs sampling */
 +                for (ifep = minfep; ifep <= maxfep; ifep++)
 +                {
 +                    remainder[ifep] = 1 - p_k[ifep];
 +                }
 +
 +                /* find the proposal probabilities */
 +
 +                if (remainder[fep_state] == 0)
 +                {
 +                    /* only the current state has any probability */
 +                    /* we have to stay at the current state */
 +                    lamnew = fep_state;
 +                }
 +                else
 +                {
 +                    for (ifep = minfep; ifep <= maxfep; ifep++)
 +                    {
 +                        if (ifep != fep_state)
 +                        {
 +                            propose[ifep] = p_k[ifep]/remainder[fep_state];
 +                        }
 +                        else
 +                        {
 +                            propose[ifep] = 0;
 +                        }
 +                    }
 +
 +                    r1 = gmx_rng_uniform_real(rng);
 +                    for (lamtrial = minfep; lamtrial <= maxfep; lamtrial++)
 +                    {
 +                        pnorm = p_k[lamtrial]/remainder[fep_state];
 +                        if (lamtrial != fep_state)
 +                        {
 +                            if (r1 <= pnorm)
 +                            {
 +                                break;
 +                            }
 +                            r1 -= pnorm;
 +                        }
 +                    }
 +
 +                    /* we have now selected lamtrial according to p(lamtrial)/(1-p(fep_state)) */
 +                    tprob = 1.0;
 +                    /* trial probability is min{1, (1-p(old))/(1-p(new))} MRS 1/8/2008 */
 +                    trialprob = (remainder[fep_state])/(remainder[lamtrial]);
 +                    if (trialprob < tprob)
 +                    {
 +                        tprob = trialprob;
 +                    }
 +                    r2 = gmx_rng_uniform_real(rng);
 +                    if (r2 < tprob)
 +                    {
 +                        lamnew = lamtrial;
 +                    }
 +                    else
 +                    {
 +                        lamnew = fep_state;
 +                    }
 +                }
 +
 +                /* now figure out the acceptance probability for each */
 +                for (ifep = minfep; ifep <= maxfep; ifep++)
 +                {
 +                    tprob = 1.0;
 +                    if (remainder[ifep] != 0)
 +                    {
 +                        trialprob = (remainder[fep_state])/(remainder[ifep]);
 +                    }
 +                    else
 +                    {
 +                        trialprob = 1.0; /* this state is the only choice! */
 +                    }
 +                    if (trialprob < tprob)
 +                    {
 +                        tprob = trialprob;
 +                    }
 +                    /* probability for fep_state=0, but that's fine, it's never proposed! */
 +                    accept[ifep] = tprob;
 +                }
 +            }
 +
 +            if (lamnew > maxfep)
 +            {
 +                /* it's possible some rounding is failing */
-                     /* probably numerical rounding error -- no state other than the original has weight */
++                if (gmx_within_tol(remainder[fep_state], 0, 50*GMX_DOUBLE_EPS))
 +                {
-     real       *pfep_lamee, *p_k, *scaled_lamee, *weighted_lamee;
++                    /* numerical rounding error -- no state other than the original has weight */
 +                    lamnew = fep_state;
 +                }
 +                else
 +                {
 +                    /* probably not a numerical issue */
 +                    int   loc    = 0;
 +                    int   nerror = 200+(maxfep-minfep+1)*60;
 +                    char *errorstr;
 +                    snew(errorstr, nerror);
 +                    /* if it's greater than maxfep, then something went wrong -- probably underflow in the calculation
 +                       of sum weights. Generate detailed info for the failure */
 +                    loc += sprintf(errorstr, "Something wrong in choosing new lambda state with a Gibbs move -- probably underflow in weight determination.\nDenominator is: %3d%17.10e\n  i                dE        numerator          weights\n", 0, pks);
 +                    for (ifep = minfep; ifep <= maxfep; ifep++)
 +                    {
 +                        loc += sprintf(&errorstr[loc], "%3d %17.10e%17.10e%17.10e\n", ifep, weighted_lamee[ifep], p_k[ifep], dfhist->sum_weights[ifep]);
 +                    }
 +                    gmx_fatal(FARGS, "%s", errorstr);
 +                }
 +            }
 +        }
 +        else if ((expand->elmcmove == elmcmoveMETROPOLIS) || (expand->elmcmove == elmcmoveBARKER))
 +        {
 +            /* use the metropolis sampler with trial +/- 1 */
 +            r1 = gmx_rng_uniform_real(rng);
 +            if (r1 < 0.5)
 +            {
 +                if (fep_state == 0)
 +                {
 +                    lamtrial = fep_state;
 +                }
 +                else
 +                {
 +                    lamtrial = fep_state-1;
 +                }
 +            }
 +            else
 +            {
 +                if (fep_state == nlim-1)
 +                {
 +                    lamtrial = fep_state;
 +                }
 +                else
 +                {
 +                    lamtrial = fep_state+1;
 +                }
 +            }
 +
 +            de = weighted_lamee[lamtrial] - weighted_lamee[fep_state];
 +            if (expand->elmcmove == elmcmoveMETROPOLIS)
 +            {
 +                tprob     = 1.0;
 +                trialprob = exp(de);
 +                if (trialprob < tprob)
 +                {
 +                    tprob = trialprob;
 +                }
 +                propose[fep_state] = 0;
 +                propose[lamtrial]  = 1.0; /* note that this overwrites the above line if fep_state = lamtrial, which only occurs at the ends */
 +                accept[fep_state]  = 1.0; /* doesn't actually matter, never proposed unless fep_state = lamtrial, in which case it's 1.0 anyway */
 +                accept[lamtrial]   = tprob;
 +
 +            }
 +            else if (expand->elmcmove == elmcmoveBARKER)
 +            {
 +                tprob = 1.0/(1.0+exp(-de));
 +
 +                propose[fep_state] = (1-tprob);
 +                propose[lamtrial] += tprob; /* we add, to account for the fact that at the end, they might be the same point */
 +                accept[fep_state]  = 1.0;
 +                accept[lamtrial]   = 1.0;
 +            }
 +
 +            r2 = gmx_rng_uniform_real(rng);
 +            if (r2 < tprob)
 +            {
 +                lamnew = lamtrial;
 +            }
 +            else
 +            {
 +                lamnew = fep_state;
 +            }
 +        }
 +
 +        for (ifep = 0; ifep < nlim; ifep++)
 +        {
 +            dfhist->Tij[fep_state][ifep]      += propose[ifep]*accept[ifep];
 +            dfhist->Tij[fep_state][fep_state] += propose[ifep]*(1.0-accept[ifep]);
 +        }
 +        fep_state = lamnew;
 +    }
 +
 +    dfhist->Tij_empirical[starting_fep_state][lamnew] += 1.0;
 +
 +    sfree(propose);
 +    sfree(accept);
 +    sfree(remainder);
 +
 +    return lamnew;
 +}
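
The Gibbs move above selects the new state by inverting the cumulative distribution with a single uniform variate. A self-contained sketch of that step (draw_from_pmf is a hypothetical name; r plays the role of gmx_rng_uniform_real(rng)):

/* Draw index k with probability p[k]; p must sum to 1 and r is
 * uniform in [0,1). Mirrors the subtract-and-break loop above. */
static int draw_from_pmf(const double *p, int n, double r)
{
    int k;

    for (k = 0; k < n-1; k++)
    {
        if (r <= p[k])
        {
            break;
        }
        r -= p[k];
    }
    return k; /* rounding that leaves r > 0 falls back to the last state */
}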
 +
 +/* print out the weights to the log, along with current state */
 +extern void PrintFreeEnergyInfoToFile(FILE *outfile, t_lambda *fep, t_expanded *expand, t_simtemp *simtemp, df_history_t *dfhist,
 +                                      int nlam, int frequency, gmx_large_int_t step)
 +{
 +    int         nlim, i, ifep, jfep;
 +    real        dw, dg, dv, dm, Tprint;
 +    real       *temps;
 +    const char *print_names[efptNR] = {" FEPL", "MassL", "CoulL", " VdwL", "BondL", "RestT", "Temp.(K)"};
 +    gmx_bool    bSimTemp            = FALSE;
 +
 +    nlim = fep->n_lambda;
 +    if (simtemp != NULL)
 +    {
 +        bSimTemp = TRUE;
 +    }
 +
 +    if (mod(step, frequency) == 0)
 +    {
 +        fprintf(outfile, "             MC-lambda information\n");
 +        if (EWL(expand->elamstats) && (!(dfhist->bEquil)))
 +        {
 +            fprintf(outfile, "  Wang-Landau incrementor is: %11.5g\n", dfhist->wl_delta);
 +        }
 +        fprintf(outfile, "  N");
 +        for (i = 0; i < efptNR; i++)
 +        {
 +            if (fep->separate_dvdl[i])
 +            {
 +                fprintf(outfile, "%7s", print_names[i]);
 +            }
 +            else if ((i == efptTEMPERATURE) && bSimTemp)
 +            {
 +                fprintf(outfile, "%10s", print_names[i]); /* more space for temperature formats */
 +            }
 +        }
 +        fprintf(outfile, "    Count   ");
 +        if (expand->elamstats == elamstatsMINVAR)
 +        {
 +            fprintf(outfile, "W(in kT)   G(in kT)  dG(in kT)  dV(in kT)\n");
 +        }
 +        else
 +        {
 +            fprintf(outfile, "G(in kT)  dG(in kT)\n");
 +        }
 +        for (ifep = 0; ifep < nlim; ifep++)
 +        {
 +            if (ifep == nlim-1)
 +            {
 +                dw = 0.0;
 +                dg = 0.0;
 +                dv = 0.0;
 +                dm = 0.0;
 +            }
 +            else
 +            {
 +                dw = dfhist->sum_weights[ifep+1] - dfhist->sum_weights[ifep];
 +                dg = dfhist->sum_dg[ifep+1] - dfhist->sum_dg[ifep];
 +                dv = sqrt(pow(dfhist->sum_variance[ifep+1], 2) - pow(dfhist->sum_variance[ifep], 2));
 +                dm = dfhist->sum_minvar[ifep+1] - dfhist->sum_minvar[ifep];
 +
 +            }
 +            fprintf(outfile, "%3d", (ifep+1));
 +            for (i = 0; i < efptNR; i++)
 +            {
 +                if (fep->separate_dvdl[i])
 +                {
 +                    fprintf(outfile, "%7.3f", fep->all_lambda[i][ifep]);
 +                }
 +                else if (i == efptTEMPERATURE && bSimTemp)
 +                {
 +                    fprintf(outfile, "%9.3f", simtemp->temperatures[ifep]);
 +                }
 +            }
 +            if (EWL(expand->elamstats) && (!(dfhist->bEquil)))  /* if performing WL and still haven't equilibrated */
 +            {
 +                if (expand->elamstats == elamstatsWL)
 +                {
 +                    fprintf(outfile, " %8d", (int)dfhist->wl_histo[ifep]);
 +                }
 +                else
 +                {
 +                    fprintf(outfile, " %8.3f", dfhist->wl_histo[ifep]);
 +                }
 +            }
 +            else   /* we have equilibrated weights */
 +            {
 +                fprintf(outfile, " %8d", dfhist->n_at_lam[ifep]);
 +            }
 +            if (expand->elamstats == elamstatsMINVAR)
 +            {
 +                fprintf(outfile, " %10.5f %10.5f %10.5f %10.5f", dfhist->sum_weights[ifep], dfhist->sum_dg[ifep], dg, dv);
 +            }
 +            else
 +            {
 +                fprintf(outfile, " %10.5f %10.5f", dfhist->sum_weights[ifep], dw);
 +            }
 +            if (ifep == nlam)
 +            {
 +                fprintf(outfile, " <<\n");
 +            }
 +            else
 +            {
 +                fprintf(outfile, "   \n");
 +            }
 +        }
 +        fprintf(outfile, "\n");
 +
 +        if ((mod(step, expand->nstTij) == 0) && (expand->nstTij > 0) && (step > 0))
 +        {
 +            fprintf(outfile, "                     Transition Matrix\n");
 +            for (ifep = 0; ifep < nlim; ifep++)
 +            {
 +                fprintf(outfile, "%12d", (ifep+1));
 +            }
 +            fprintf(outfile, "\n");
 +            for (ifep = 0; ifep < nlim; ifep++)
 +            {
 +                for (jfep = 0; jfep < nlim; jfep++)
 +                {
 +                    if (dfhist->n_at_lam[ifep] > 0)
 +                    {
 +                        if (expand->bSymmetrizedTMatrix)
 +                        {
 +                            Tprint = (dfhist->Tij[ifep][jfep]+dfhist->Tij[jfep][ifep])/(dfhist->n_at_lam[ifep]+dfhist->n_at_lam[jfep]);
 +                        }
 +                        else
 +                        {
 +                            Tprint = (dfhist->Tij[ifep][jfep])/(dfhist->n_at_lam[ifep]);
 +                        }
 +                    }
 +                    else
 +                    {
 +                        Tprint = 0.0;
 +                    }
 +                    fprintf(outfile, "%12.8f", Tprint);
 +                }
 +                fprintf(outfile, "%3d\n", (ifep+1));
 +            }
 +
 +            fprintf(outfile, "                  Empirical Transition Matrix\n");
 +            for (ifep = 0; ifep < nlim; ifep++)
 +            {
 +                fprintf(outfile, "%12d", (ifep+1));
 +            }
 +            fprintf(outfile, "\n");
 +            for (ifep = 0; ifep < nlim; ifep++)
 +            {
 +                for (jfep = 0; jfep < nlim; jfep++)
 +                {
 +                    if (dfhist->n_at_lam[ifep] > 0)
 +                    {
 +                        if (expand->bSymmetrizedTMatrix)
 +                        {
 +                            Tprint = (dfhist->Tij_empirical[ifep][jfep]+dfhist->Tij_empirical[jfep][ifep])/(dfhist->n_at_lam[ifep]+dfhist->n_at_lam[jfep]);
 +                        }
 +                        else
 +                        {
 +                            Tprint = dfhist->Tij_empirical[ifep][jfep]/(dfhist->n_at_lam[ifep]);
 +                        }
 +                    }
 +                    else
 +                    {
 +                        Tprint = 0.0;
 +                    }
 +                    fprintf(outfile, "%12.8f", Tprint);
 +                }
 +                fprintf(outfile, "%3d\n", (ifep+1));
 +            }
 +        }
 +    }
 +}
 +
 +extern void get_mc_state(gmx_rng_t rng, t_state *state)
 +{
 +    gmx_rng_get_state(rng, state->mc_rng, state->mc_rngi);
 +}
 +
 +extern void set_mc_state(gmx_rng_t rng, t_state *state)
 +{
 +    gmx_rng_set_state(rng, state->mc_rng, state->mc_rngi[0]);
 +}
 +
 +extern int ExpandedEnsembleDynamics(FILE *log, t_inputrec *ir, gmx_enerdata_t *enerd,
 +                                    t_state *state, t_extmass *MassQ, df_history_t *dfhist,
 +                                    gmx_large_int_t step, gmx_rng_t mcrng,
 +                                    rvec *v, t_mdatoms *mdatoms)
 +{
++    real       *pfep_lamee, *scaled_lamee, *weighted_lamee;
++    double     *p_k;
 +    int         i, nlam, nlim, lamnew, totalsamples;
 +    real        oneovert, maxscaled = 0, maxweighted = 0;
 +    t_expanded *expand;
 +    t_simtemp  *simtemp;
 +    double     *temperature_lambdas;
 +    gmx_bool    bIfReset, bSwitchtoOneOverT, bDoneEquilibrating = FALSE;
 +
 +    expand  = ir->expandedvals;
 +    simtemp = ir->simtempvals;
 +    nlim    = ir->fepvals->n_lambda;
 +    nlam    = state->fep_state;
 +
 +    snew(scaled_lamee, nlim);
 +    snew(weighted_lamee, nlim);
 +    snew(pfep_lamee, nlim);
 +    snew(p_k, nlim);
 +
 +    if (expand->bInit_weights)                    /* if initialized weights, we need to fill them in */
 +    {
 +        dfhist->wl_delta = expand->init_wl_delta; /* MRS -- this would fit better somewhere else? */
 +        for (i = 0; i < nlim; i++)
 +        {
 +            dfhist->sum_weights[i] = expand->init_lambda_weights[i];
 +            dfhist->sum_dg[i]      = expand->init_lambda_weights[i];
 +        }
 +        expand->bInit_weights = FALSE;
 +    }
 +
 +    /* update the count at the current lambda */
 +    dfhist->n_at_lam[nlam]++;
 +
 +    /* the PV term will need to be calculated somewhere eventually, but it is not needed here --
 +       not until there's a lambda state that is pressure controlled. */
 +    /*
 +       pVTerm = 0;
 +       where does this PV term go?
 +       for (i=0;i<nlim;i++)
 +       {
 +       fep_lamee[i] += pVTerm;
 +       }
 +     */
 +
 +    /* determine the maximum value and subtract it off, to avoid overflow.  There is probably a better way to do this */
 +    /* we don't need to include the pressure term, since the volume is the same between the two.
 +       is there some term we are neglecting, however? */
 +
 +    if (ir->efep != efepNO)
 +    {
 +        for (i = 0; i < nlim; i++)
 +        {
 +            if (ir->bSimTemp)
 +            {
 +                /* Note -- this assumes no mass changes, since kinetic energy is not added  . . . */
 +                scaled_lamee[i] = (enerd->enerpart_lambda[i+1]-enerd->enerpart_lambda[0])/(simtemp->temperatures[i]*BOLTZ)
 +                    + enerd->term[F_EPOT]*(1.0/(simtemp->temperatures[i])- 1.0/(simtemp->temperatures[nlam]))/BOLTZ;
 +            }
 +            else
 +            {
 +                scaled_lamee[i] = (enerd->enerpart_lambda[i+1]-enerd->enerpart_lambda[0])/(expand->mc_temp*BOLTZ);
 +                /* mc_temp is currently set to the system ref_t unless otherwise defined */
 +            }
 +
 +            /* save these energies for printing, so they don't get overwritten by the next step */
 +            /* they aren't overwritten in the non-free energy case, but we always print with these
 +               for simplicity */
 +        }
 +    }
 +    else
 +    {
 +        if (ir->bSimTemp)
 +        {
 +            for (i = 0; i < nlim; i++)
 +            {
 +                scaled_lamee[i] = enerd->term[F_EPOT]*(1.0/simtemp->temperatures[i] - 1.0/simtemp->temperatures[nlam])/BOLTZ;
 +            }
 +        }
 +    }
 +
 +    for (i = 0; i < nlim; i++)
 +    {
 +        pfep_lamee[i] = scaled_lamee[i];
 +
 +        weighted_lamee[i] = dfhist->sum_weights[i] - scaled_lamee[i];
 +        if (i == 0)
 +        {
 +            maxscaled   = scaled_lamee[i];
 +            maxweighted = weighted_lamee[i];
 +        }
 +        else
 +        {
 +            if (scaled_lamee[i] > maxscaled)
 +            {
 +                maxscaled = scaled_lamee[i];
 +            }
 +            if (weighted_lamee[i] > maxweighted)
 +            {
 +                maxweighted = weighted_lamee[i];
 +            }
 +        }
 +    }
 +
 +    for (i = 0; i < nlim; i++)
 +    {
 +        scaled_lamee[i]   -= maxscaled;
 +        weighted_lamee[i] -= maxweighted;
 +    }
 +
 +    /* update weights - we decide whether or not to actually do this inside */
 +
 +    bDoneEquilibrating = UpdateWeights(nlim, expand, dfhist, nlam, scaled_lamee, weighted_lamee, step);
 +    if (bDoneEquilibrating)
 +    {
 +        if (log)
 +        {
 +            fprintf(log, "\nStep %d: Weights have equilibrated, using criteria: %s\n", (int)step, elmceq_names[expand->elmceq]);
 +        }
 +    }
 +
 +    lamnew = ChooseNewLambda(nlim, expand, dfhist, nlam, weighted_lamee, p_k, mcrng);
 +    /* if using simulated tempering, we need to adjust the temperatures */
 +    if (ir->bSimTemp && (lamnew != nlam)) /* only need to change the temperatures if we change the state */
 +    {
 +        int   i, j, n, d;
 +        real *buf_ngtc;
 +        real  told;
 +        int   nstart, nend, gt;
 +
 +        snew(buf_ngtc, ir->opts.ngtc);
 +
 +        for (i = 0; i < ir->opts.ngtc; i++)
 +        {
 +            if (ir->opts.ref_t[i] > 0)
 +            {
 +                told              = ir->opts.ref_t[i];
 +                ir->opts.ref_t[i] =  simtemp->temperatures[lamnew];
 +                buf_ngtc[i]       = sqrt(ir->opts.ref_t[i]/told); /* using the buffer as temperature scaling */
 +            }
 +        }
 +
 +        /* we don't need to manipulate the ekind information, as it isn't due to be reset until the next step anyway */
 +
 +        nstart = mdatoms->start;
 +        nend   = nstart + mdatoms->homenr;
 +        for (n = nstart; n < nend; n++)
 +        {
 +            gt = 0;
 +            if (mdatoms->cTC)
 +            {
 +                gt = mdatoms->cTC[n];
 +            }
 +            for (d = 0; d < DIM; d++)
 +            {
 +                v[n][d] *= buf_ngtc[gt];
 +            }
 +        }
 +
 +        if (IR_NPT_TROTTER(ir) || IR_NPH_TROTTER(ir) || IR_NVT_TROTTER(ir))
 +        {
 +            /* we need to recalculate the masses if the temperature has changed */
 +            init_npt_masses(ir, state, MassQ, FALSE);
 +            for (i = 0; i < state->nnhpres; i++)
 +            {
 +                for (j = 0; j < ir->opts.nhchainlength; j++)
 +                {
 +                    state->nhpres_vxi[i+j] *= buf_ngtc[i];
 +                }
 +            }
 +            for (i = 0; i < ir->opts.ngtc; i++)
 +            {
 +                for (j = 0; j < ir->opts.nhchainlength; j++)
 +                {
 +                    state->nosehoover_vxi[i+j] *= buf_ngtc[i];
 +                }
 +            }
 +        }
 +        sfree(buf_ngtc);
 +    }
 +
 +    /* now check on the Wang-Landau updating criteria */
 +
 +    if (EWL(expand->elamstats))
 +    {
 +        bSwitchtoOneOverT = FALSE;
 +        if (expand->bWLoneovert)
 +        {
 +            totalsamples = 0;
 +            for (i = 0; i < nlim; i++)
 +            {
 +                totalsamples += dfhist->n_at_lam[i];
 +            }
 +            oneovert = (1.0*nlim)/totalsamples;
 +            /* oneovert has decreased a bit since last time, so we make sure wl_delta is within one sample of it */
 +            /* switch to 1/t incrementing when wl_delta has decreased at least once, and wl_delta is now less than 1/t */
 +            if ((dfhist->wl_delta <= ((totalsamples)/(totalsamples-1.00001))*oneovert) &&
 +                (dfhist->wl_delta < expand->init_wl_delta))
 +            {
 +                bSwitchtoOneOverT = TRUE;
 +            }
 +        }
 +        if (bSwitchtoOneOverT)
 +        {
 +            dfhist->wl_delta = oneovert; /* now we reduce by this each time, instead of only at flatness */
 +        }
 +        else
 +        {
 +            bIfReset = CheckHistogramRatios(nlim, dfhist->wl_histo, expand->wl_ratio);
 +            if (bIfReset)
 +            {
 +                for (i = 0; i < nlim; i++)
 +                {
 +                    dfhist->wl_histo[i] = 0;
 +                }
 +                dfhist->wl_delta *= expand->wl_scale;
 +                if (log)
 +                {
 +                    fprintf(log, "\nStep %d: weights are now:", (int)step);
 +                    for (i = 0; i < nlim; i++)
 +                    {
 +                        fprintf(log, " %.5f", dfhist->sum_weights[i]);
 +                    }
 +                    fprintf(log, "\n");
 +                }
 +            }
 +        }
 +    }
 +    sfree(pfep_lamee);
 +    sfree(scaled_lamee);
 +    sfree(weighted_lamee);
 +    sfree(p_k);
 +
 +    return lamnew;
 +}
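
When a simulated-tempering move changes the reference temperature, the rescaling loop above multiplies every velocity by sqrt(T_new/T_old); since the kinetic energy is quadratic in the velocities, this scales it by exactly the temperature ratio:

    v' = sqrt(T_new/T_old) v   =>   E_kin' = (T_new/T_old) E_kin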
index 971c27d9514d4e6144e31390210f706279937ccf,0000000000000000000000000000000000000000..ace2478b063e72575dfa33a8fb79432755d905c0
mode 100644,000000..100644
--- /dev/null
@@@ -1,864 -1,0 +1,862 @@@
- /* Each name should not exceed 19 characters */
 +/*  -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2008, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <string.h>
 +#include "gmx_wallcycle.h"
 +#include "gmx_cyclecounter.h"
 +#include "smalloc.h"
 +#include "gmx_fatal.h"
 +#include "md_logging.h"
 +#include "string2.h"
 +
 +#include "gromacs/utility/gmxmpi.h"
 +
 +/* DEBUG_WCYCLE adds consistency checking for the counters.
 + * It checks that the counter you stop matches the last one
 + * that was opened, and that you do not nest too deeply.
 + */
 +/* #define DEBUG_WCYCLE */
 +
 +typedef struct
 +{
 +    int          n;
 +    gmx_cycles_t c;
 +    gmx_cycles_t start;
 +    gmx_cycles_t last;
 +} wallcc_t;
 +
 +typedef struct gmx_wallcycle
 +{
 +    wallcc_t        *wcc;
 +    /* variables for testing/debugging */
 +    gmx_bool         wc_barrier;
 +    wallcc_t        *wcc_all;
 +    int              wc_depth;
 +#ifdef DEBUG_WCYCLE
 +#define DEPTH_MAX 6
 +    int               counterlist[DEPTH_MAX];
 +    int               count_depth;
 +#endif
 +    int               ewc_prev;
 +    gmx_cycles_t      cycle_prev;
 +    gmx_large_int_t   reset_counters;
 +#ifdef GMX_MPI
 +    MPI_Comm          mpi_comm_mygroup;
 +#endif
 +    int               nthreads_pp;
 +    int               nthreads_pme;
 +#ifdef GMX_CYCLE_SUBCOUNTERS
 +    wallcc_t         *wcsc;
 +#endif
 +    double           *cycles_sum;
 +} gmx_wallcycle_t_t;
 +
-         fprintf(fplog, " %-19s %4d %4s %10s  %10.3f %12.3f   %5.1f\n",
++/* Each name should not exceed 19 printing characters
++   (i.e. the terminating null can be the twentieth) */
 +static const char *wcn[ewcNR] =
 +{
 +    "Run", "Step", "PP during PME", "Domain decomp.", "DD comm. load",
 +    "DD comm. bounds", "Vsite constr.", "Send X to PME", "Neighbor search", "Launch GPU ops.",
 +    "Comm. coord.", "Born radii", "Force", "Wait + Comm. F", "PME mesh",
 +    "PME redist. X/F", "PME spread/gather", "PME 3D-FFT", "PME 3D-FFT Comm.", "PME solve",
 +    "PME wait for PP", "Wait + Recv. PME F", "Wait GPU nonlocal", "Wait GPU local", "NB X/F buffer ops.",
 +    "Vsite spread", "Write traj.", "Update", "Constraints", "Comm. energies",
 +    "Enforced rotation", "Add rot. forces", "Test"
 +};
 +
 +static const char *wcsn[ewcsNR] =
 +{
 +    "DD redist.", "DD NS grid + sort", "DD setup comm.",
 +    "DD make top.", "DD make constr.", "DD top. other",
 +    "NS grid local", "NS grid non-loc.", "NS search local", "NS search non-loc.",
 +    "Bonded F", "Nonbonded F", "Ewald F correction",
 +    "NB X buffer ops.", "NB F buffer ops."
 +};
 +
 +gmx_bool wallcycle_have_counter(void)
 +{
 +    return gmx_cycles_have_counter();
 +}
 +
 +gmx_wallcycle_t wallcycle_init(FILE *fplog, int resetstep, t_commrec *cr,
 +                               int nthreads_pp, int nthreads_pme)
 +{
 +    gmx_wallcycle_t wc;
 +
 +
 +    if (!wallcycle_have_counter())
 +    {
 +        return NULL;
 +    }
 +
 +    snew(wc, 1);
 +
 +    wc->wc_barrier          = FALSE;
 +    wc->wcc_all             = NULL;
 +    wc->wc_depth            = 0;
 +    wc->ewc_prev            = -1;
 +    wc->reset_counters      = resetstep;
 +    wc->nthreads_pp         = nthreads_pp;
 +    wc->nthreads_pme        = nthreads_pme;
 +    wc->cycles_sum          = NULL;
 +
 +#ifdef GMX_MPI
 +    if (PAR(cr) && getenv("GMX_CYCLE_BARRIER") != NULL)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog, "\nWill call MPI_Barrier before each cycle start/stop call\n\n");
 +        }
 +        wc->wc_barrier       = TRUE;
 +        wc->mpi_comm_mygroup = cr->mpi_comm_mygroup;
 +    }
 +#endif
 +
 +    snew(wc->wcc, ewcNR);
 +    if (getenv("GMX_CYCLE_ALL") != NULL)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog, "\nWill time all the code during the run\n\n");
 +        }
 +        snew(wc->wcc_all, ewcNR*ewcNR);
 +    }
 +
 +#ifdef GMX_CYCLE_SUBCOUNTERS
 +    snew(wc->wcsc, ewcsNR);
 +#endif
 +
 +#ifdef DEBUG_WCYCLE
 +    wc->count_depth = 0;
 +#endif
 +
 +    return wc;
 +}
 +
 +void wallcycle_destroy(gmx_wallcycle_t wc)
 +{
 +    if (wc == NULL)
 +    {
 +        return;
 +    }
 +
 +    if (wc->wcc != NULL)
 +    {
 +        sfree(wc->wcc);
 +    }
 +    if (wc->wcc_all != NULL)
 +    {
 +        sfree(wc->wcc_all);
 +    }
 +#ifdef GMX_CYCLE_SUBCOUNTERS
 +    if (wc->wcsc != NULL)
 +    {
 +        sfree(wc->wcsc);
 +    }
 +#endif
 +    sfree(wc);
 +}
 +
 +static void wallcycle_all_start(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
 +{
 +    wc->ewc_prev   = ewc;
 +    wc->cycle_prev = cycle;
 +}
 +
 +static void wallcycle_all_stop(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
 +{
 +    wc->wcc_all[wc->ewc_prev*ewcNR+ewc].n += 1;
 +    wc->wcc_all[wc->ewc_prev*ewcNR+ewc].c += cycle - wc->cycle_prev;
 +}
 +
 +
 +#ifdef DEBUG_WCYCLE
 +static void debug_start_check(gmx_wallcycle_t wc, int ewc)
 +{
 +    /* fprintf(stderr,"wcycle_start depth %d, %s\n",wc->count_depth,wcn[ewc]); */
 +
 +    if (wc->count_depth < 0 || wc->count_depth >= DEPTH_MAX)
 +    {
 +        gmx_fatal(FARGS, "wallcycle counter depth out of range: %d",
 +                  wc->count_depth);
 +    }
 +    wc->counterlist[wc->count_depth] = ewc;
 +    wc->count_depth++;
 +}
 +
 +static void debug_stop_check(gmx_wallcycle_t wc, int ewc)
 +{
 +    wc->count_depth--;
 +
 +    /* fprintf(stderr,"wcycle_stop depth %d, %s\n",wc->count_depth,wcn[ewc]); */
 +
 +    if (wc->count_depth < 0)
 +    {
 +        gmx_fatal(FARGS, "wallcycle counter depth out of range when stopping %s: %d", wcn[ewc], wc->count_depth);
 +    }
 +    if (wc->counterlist[wc->count_depth] != ewc)
 +    {
 +        gmx_fatal(FARGS, "wallcycle mismatch at stop, start %s, stop %s",
 +                  wcn[wc->counterlist[wc->count_depth]], wcn[ewc]);
 +    }
 +}
 +#endif
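With DEBUG_WCYCLE defined, the checks above catch a mismatched stop immediately; a deliberately wrong sequence (counter choice illustrative) would abort:

    wallcycle_start(wc, ewcFORCE);
    /* ... */
    wallcycle_stop(wc, ewcUPDATE); /* gmx_fatal: "wallcycle mismatch at stop, start Force, stop Update" */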
 +
 +void wallcycle_start(gmx_wallcycle_t wc, int ewc)
 +{
 +    gmx_cycles_t cycle;
 +
 +    if (wc == NULL)
 +    {
 +        return;
 +    }
 +
 +#ifdef GMX_MPI
 +    if (wc->wc_barrier)
 +    {
 +        MPI_Barrier(wc->mpi_comm_mygroup);
 +    }
 +#endif
 +
 +#ifdef DEBUG_WCYCLE
 +    debug_start_check(wc, ewc);
 +#endif
 +
 +    cycle              = gmx_cycles_read();
 +    wc->wcc[ewc].start = cycle;
 +    if (wc->wcc_all != NULL)
 +    {
 +        wc->wc_depth++;
 +        if (ewc == ewcRUN)
 +        {
 +            wallcycle_all_start(wc, ewc, cycle);
 +        }
 +        else if (wc->wc_depth == 3)
 +        {
 +            wallcycle_all_stop(wc, ewc, cycle);
 +        }
 +    }
 +}
 +
 +void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc)
 +{
 +    if (wc == NULL)
 +    {
 +        return;
 +    }
 +
 +    wallcycle_start(wc, ewc);
 +    wc->wcc[ewc].n--;
 +}
 +
 +double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
 +{
 +    gmx_cycles_t cycle, last;
 +
 +    if (wc == NULL)
 +    {
 +        return 0;
 +    }
 +
 +#ifdef GMX_MPI
 +    if (wc->wc_barrier)
 +    {
 +        MPI_Barrier(wc->mpi_comm_mygroup);
 +    }
 +#endif
 +
 +#ifdef DEBUG_WCYCLE
 +    debug_stop_check(wc, ewc);
 +#endif
 +
 +    cycle           = gmx_cycles_read();
 +    last            = cycle - wc->wcc[ewc].start;
 +    wc->wcc[ewc].c += last;
 +    wc->wcc[ewc].n++;
 +    if (wc->wcc_all)
 +    {
 +        wc->wc_depth--;
 +        if (ewc == ewcRUN)
 +        {
 +            wallcycle_all_stop(wc, ewc, cycle);
 +        }
 +        else if (wc->wc_depth == 2)
 +        {
 +            wallcycle_all_start(wc, ewc, cycle);
 +        }
 +    }
 +
 +    return last;
 +}
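A start/stop pair brackets one timed region: cycles accrue in wcc[ewc].c and the call count wcc[ewc].n grows by one per pair. A minimal usage sketch (the region body is a placeholder):

    double cycles_force;

    wallcycle_start(wc, ewcFORCE);
    /* ... force computation ... */
    cycles_force = wallcycle_stop(wc, ewcFORCE); /* cycle count for this call only */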
 +
 +void wallcycle_reset_all(gmx_wallcycle_t wc)
 +{
 +    int i;
 +
 +    if (wc == NULL)
 +    {
 +        return;
 +    }
 +
 +    for (i = 0; i < ewcNR; i++)
 +    {
 +        wc->wcc[i].n = 0;
 +        wc->wcc[i].c = 0;
 +    }
 +    if (wc->wcc_all)
 +    {
 +        for (i = 0; i < ewcNR*ewcNR; i++)
 +        {
 +            wc->wcc_all[i].n = 0;
 +            wc->wcc_all[i].c = 0;
 +        }
 +    }
 +#ifdef GMX_CYCLE_SUBCOUNTERS
 +    for (i = 0; i < ewcsNR; i++)
 +    {
 +        wc->wcsc[i].n = 0;
 +        wc->wcsc[i].c = 0;
 +    }
 +#endif
 +}
 +
 +static gmx_bool is_pme_counter(int ewc)
 +{
 +    return (ewc >= ewcPMEMESH && ewc <= ewcPMEWAITCOMM);
 +}
 +
 +static gmx_bool is_pme_subcounter(int ewc)
 +{
 +    return (ewc >= ewcPME_REDISTXF && ewc < ewcPMEWAITCOMM);
 +}
 +
 +void wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc)
 +{
 +    wallcc_t *wcc;
 +    double   *cycles;
 +    double    cycles_n[ewcNR+ewcsNR], buf[ewcNR+ewcsNR], *cyc_all, *buf_all;
 +    int       i, j;
 +    int       nsum;
 +
 +    if (wc == NULL)
 +    {
 +        return;
 +    }
 +
 +    snew(wc->cycles_sum, ewcNR+ewcsNR);
 +    cycles = wc->cycles_sum;
 +
 +    wcc = wc->wcc;
 +
 +    for (i = 0; i < ewcNR; i++)
 +    {
 +        if (is_pme_counter(i) || (i == ewcRUN && cr->duty == DUTY_PME))
 +        {
 +            wcc[i].c *= wc->nthreads_pme;
 +
 +            if (wc->wcc_all)
 +            {
 +                for (j = 0; j < ewcNR; j++)
 +                {
 +                    wc->wcc_all[i*ewcNR+j].c *= wc->nthreads_pme;
 +                }
 +            }
 +        }
 +        else
 +        {
 +            wcc[i].c *= wc->nthreads_pp;
 +
 +            if (wc->wcc_all)
 +            {
 +                for (j = 0; j < ewcNR; j++)
 +                {
 +                    wc->wcc_all[i*ewcNR+j].c *= wc->nthreads_pp;
 +                }
 +            }
 +        }
 +    }
 +
 +    if (wcc[ewcDDCOMMLOAD].n > 0)
 +    {
 +        wcc[ewcDOMDEC].c -= wcc[ewcDDCOMMLOAD].c;
 +    }
 +    if (wcc[ewcDDCOMMBOUND].n > 0)
 +    {
 +        wcc[ewcDOMDEC].c -= wcc[ewcDDCOMMBOUND].c;
 +    }
 +    if (wcc[ewcPME_FFTCOMM].n > 0)
 +    {
 +        wcc[ewcPME_FFT].c -= wcc[ewcPME_FFTCOMM].c;
 +    }
 +
 +    if (cr->npmenodes == 0)
 +    {
 +        /* All nodes do PME (or no PME at all) */
 +        if (wcc[ewcPMEMESH].n > 0)
 +        {
 +            wcc[ewcFORCE].c -= wcc[ewcPMEMESH].c;
 +        }
 +    }
 +    else
 +    {
 +        /* There are PME-only nodes */
 +        if (wcc[ewcPMEMESH].n > 0)
 +        {
 +            /* This must be a PME-only node, so calculate the Wait + Comm. time */
 +            wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c;
 +        }
 +    }
 +
 +    /* Store the cycles in a double buffer for summing */
 +    for (i = 0; i < ewcNR; i++)
 +    {
 +        cycles_n[i] = (double)wcc[i].n;
 +        cycles[i]   = (double)wcc[i].c;
 +    }
 +    nsum = ewcNR;
 +#ifdef GMX_CYCLE_SUBCOUNTERS
 +    for (i = 0; i < ewcsNR; i++)
 +    {
 +        wc->wcsc[i].c    *= wc->nthreads_pp;
 +        cycles_n[ewcNR+i] = (double)wc->wcsc[i].n;
 +        cycles[ewcNR+i]   = (double)wc->wcsc[i].c;
 +    }
 +    nsum += ewcsNR;
 +#endif
 +
 +#ifdef GMX_MPI
 +    if (cr->nnodes > 1)
 +    {
 +        MPI_Allreduce(cycles_n, buf, nsum, MPI_DOUBLE, MPI_MAX,
 +                      cr->mpi_comm_mysim);
 +        for (i = 0; i < ewcNR; i++)
 +        {
 +            wcc[i].n = (int)(buf[i] + 0.5);
 +        }
 +#ifdef GMX_CYCLE_SUBCOUNTERS
 +        for (i = 0; i < ewcsNR; i++)
 +        {
 +            wc->wcsc[i].n = (int)(buf[ewcNR+i] + 0.5);
 +        }
 +#endif
 +
 +        MPI_Allreduce(cycles, buf, nsum, MPI_DOUBLE, MPI_SUM,
 +                      cr->mpi_comm_mysim);
 +        for (i = 0; i < nsum; i++)
 +        {
 +            cycles[i] = buf[i];
 +        }
 +
 +        if (wc->wcc_all != NULL)
 +        {
 +            snew(cyc_all, ewcNR*ewcNR);
 +            snew(buf_all, ewcNR*ewcNR);
 +            for (i = 0; i < ewcNR*ewcNR; i++)
 +            {
 +                cyc_all[i] = wc->wcc_all[i].c;
 +            }
 +            MPI_Allreduce(cyc_all, buf_all, ewcNR*ewcNR, MPI_DOUBLE, MPI_SUM,
 +                          cr->mpi_comm_mysim);
 +            for (i = 0; i < ewcNR*ewcNR; i++)
 +            {
 +                wc->wcc_all[i].c = buf_all[i];
 +            }
 +            sfree(buf_all);
 +            sfree(cyc_all);
 +        }
 +    }
 +#endif
 +}
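Note the asymmetry between the two reductions above: call counts can differ per rank, so MPI_MAX picks a representative count, while cycle totals use MPI_SUM so that they measure total work across all ranks. The pattern in isolation (buffer and communicator names illustrative):

    /* counts: take the maximum as the representative call count */
    MPI_Allreduce(counts_local, counts_max, nsum, MPI_DOUBLE, MPI_MAX, comm);
    /* cycles: sum across ranks to obtain total work */
    MPI_Allreduce(cycles_local, cycles_sum, nsum, MPI_DOUBLE, MPI_SUM, comm);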
 +
 +static void print_cycles(FILE *fplog, double c2t, const char *name,
 +                         int nthreads_tot,
 +                         int nnodes, int nthreads,
 +                         int n, double c, double tot)
 +{
 +    char   num[11];
 +    char   thstr[6];
 +    double wallt;
 +
 +    if (c > 0)
 +    {
 +        if (n > 0)
 +        {
 +            snprintf(num, sizeof(num), "%10d", n);
 +            if (nthreads < 0)
 +            {
 +                snprintf(thstr, sizeof(thstr), "N/A");
 +            }
 +            else
 +            {
 +                snprintf(thstr, sizeof(thstr), "%4d", nthreads);
 +            }
 +        }
 +        else
 +        {
 +            sprintf(num, "          ");
 +            sprintf(thstr, "    ");
 +        }
 +        /* Convert the cycle count to wallclock time for this task */
 +        if (nthreads > 0)
 +        {
 +            /* Cycle count has been multiplied by the thread count,
 +             * correct for the number of threads used.
 +             */
 +            wallt = c*c2t*nthreads_tot/(double)(nnodes*nthreads);
 +        }
 +        else
 +        {
 +            /* nthreads=-1 signals total run time, no correction required */
 +            wallt = c*c2t;
 +        }
-                 snprintf(buf, 9, "%-9s", wcn[i]);
-                 buf[9] = ' ';
-                 snprintf(buf+10, 9, "%-9s", wcn[j]);
-                 buf[19] = '\0';
++        fprintf(fplog, " %-19.19s %4d %4s %10s  %10.3f %12.3f   %5.1f\n",
 +                name, nnodes, thstr, num, wallt, c*1e-9, 100*c/tot);
 +    }
 +}
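The %-19.19s conversion in the ++ line above both pads and truncates to 19 columns, which is what makes the 19-printing-character limit on the counter names safe to rely on; for instance:

    printf("[%-19.19s]\n", "PME spread/gather");            /* "[PME spread/gather  ]" */
    printf("[%-19.19s]\n", "a counter name that is too long"); /* cut at 19 chars */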
 +
 +static void print_gputimes(FILE *fplog, const char *name,
 +                           int n, double t, double tot_t)
 +{
 +    char num[11];
 +    char avg_perf[11];
 +
 +    if (n > 0)
 +    {
 +        snprintf(num, sizeof(num), "%10d", n);
 +        snprintf(avg_perf, sizeof(avg_perf), "%10.3f", t/n);
 +    }
 +    else
 +    {
 +        sprintf(num, "          ");
 +        sprintf(avg_perf, "          ");
 +    }
 +    if (t != tot_t)
 +    {
 +        fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n",
 +                name, num, t/1000, avg_perf, 100 * t/tot_t);
 +    }
 +    else
 +    {
 +        fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n",
 +                name, "", t/1000, avg_perf, 100.0);
 +    }
 +}
 +
 +void wallcycle_print(FILE *fplog, int nnodes, int npme, double realtime,
 +                     gmx_wallcycle_t wc, wallclock_gpu_t *gpu_t)
 +{
 +    double     *cycles;
 +    double      c2t, tot, tot_gpu, tot_cpu_overlap, gpu_cpu_ratio, sum, tot_k;
 +    int         i, j, npp, nth_pp, nth_pme, nth_tot;
 +    char        buf[STRLEN];
 +    const char *hline = "-----------------------------------------------------------------------------";
 +
 +    if (wc == NULL)
 +    {
 +        return;
 +    }
 +
 +    nth_pp  = wc->nthreads_pp;
 +    nth_pme = wc->nthreads_pme;
 +
 +    cycles = wc->cycles_sum;
 +
 +    if (npme > 0)
 +    {
 +        npp = nnodes - npme;
 +
 +        nth_tot = npp*nth_pp + npme*nth_pme;
 +    }
 +    else
 +    {
 +        npp  = nnodes;
 +        npme = nnodes;
 +
 +        nth_tot = npp*nth_pp;
 +    }
 +
 +    tot = cycles[ewcRUN];
 +
 +    /* Conversion factor from cycles to seconds */
 +    if (tot > 0)
 +    {
 +        c2t = realtime/tot;
 +    }
 +    else
 +    {
 +        c2t = 0;
 +    }
 +
 +    fprintf(fplog, "\n     R E A L   C Y C L E   A N D   T I M E   A C C O U N T I N G\n\n");
 +
 +    fprintf(fplog, " Computing:         Nodes   Th.     Count  Wall t (s)     G-Cycles       %c\n", '%');
 +    fprintf(fplog, "%s\n", hline);
 +    sum = 0;
 +    for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
 +    {
 +        if (!is_pme_subcounter(i))
 +        {
 +            print_cycles(fplog, c2t, wcn[i], nth_tot,
 +                         is_pme_counter(i) ? npme : npp,
 +                         is_pme_counter(i) ? nth_pme : nth_pp,
 +                         wc->wcc[i].n, cycles[i], tot);
 +            sum += cycles[i];
 +        }
 +    }
 +    if (wc->wcc_all != NULL)
 +    {
 +        for (i = 0; i < ewcNR; i++)
 +        {
 +            for (j = 0; j < ewcNR; j++)
 +            {
++                snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]);
 +                print_cycles(fplog, c2t, buf, nth_tot,
 +                             is_pme_counter(i) ? npme : npp,
 +                             is_pme_counter(i) ? nth_pme : nth_pp,
 +                             wc->wcc_all[i*ewcNR+j].n,
 +                             wc->wcc_all[i*ewcNR+j].c,
 +                             tot);
 +            }
 +        }
 +    }
 +    print_cycles(fplog, c2t, "Rest", nth_tot, npp, -1, 0, tot-sum, tot);
 +    fprintf(fplog, "%s\n", hline);
 +    print_cycles(fplog, c2t, "Total", nth_tot, nnodes, -1, 0, tot, tot);
 +    fprintf(fplog, "%s\n", hline);
 +
 +    if (wc->wcc[ewcPMEMESH].n > 0)
 +    {
 +        fprintf(fplog, "%s\n", hline);
 +        for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
 +        {
 +            if (is_pme_subcounter(i))
 +            {
 +                print_cycles(fplog, c2t, wcn[i], nth_tot,
 +                             is_pme_counter(i) ? npme : npp,
 +                             is_pme_counter(i) ? nth_pme : nth_pp,
 +                             wc->wcc[i].n, cycles[i], tot);
 +            }
 +        }
 +        fprintf(fplog, "%s\n", hline);
 +    }
 +
 +#ifdef GMX_CYCLE_SUBCOUNTERS
 +    fprintf(fplog, "%s\n", hline);
 +    for (i = 0; i < ewcsNR; i++)
 +    {
 +        print_cycles(fplog, c2t, wcsn[i], nth_tot, npp, nth_pp,
 +                     wc->wcsc[i].n, cycles[ewcNR+i], tot);
 +    }
 +    fprintf(fplog, "%s\n", hline);
 +#endif
 +
 +    /* print GPU timing summary */
 +    if (gpu_t)
 +    {
 +        const char *k_log_str[2][2] = {
 +            {"Nonbonded F kernel", "Nonbonded F+ene k."},
 +            {"Nonbonded F+prune k.", "Nonbonded F+ene+prune k."}
 +        };
 +
 +        tot_gpu = gpu_t->pl_h2d_t + gpu_t->nb_h2d_t + gpu_t->nb_d2h_t;
 +
 +        /* add up the kernel timings */
 +        tot_k = 0.0;
 +        for (i = 0; i < 2; i++)
 +        {
 +            for (j = 0; j < 2; j++)
 +            {
 +                tot_k += gpu_t->ktime[i][j].t;
 +            }
 +        }
 +        tot_gpu += tot_k;
 +
 +        tot_cpu_overlap = wc->wcc[ewcFORCE].c;
 +        if (wc->wcc[ewcPMEMESH].n > 0)
 +        {
 +            tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
 +        }
 +        tot_cpu_overlap *= c2t * 1000; /* convert s to ms */
 +
 +        fprintf(fplog, "\n GPU timings\n%s\n", hline);
 +        fprintf(fplog, " Computing:                         Count  Wall t (s)      ms/step       %c\n", '%');
 +        fprintf(fplog, "%s\n", hline);
 +        print_gputimes(fplog, "Pair list H2D",
 +                       gpu_t->pl_h2d_c, gpu_t->pl_h2d_t, tot_gpu);
 +        print_gputimes(fplog, "X / q H2D",
 +                       gpu_t->nb_c, gpu_t->nb_h2d_t, tot_gpu);
 +
 +        for (i = 0; i < 2; i++)
 +        {
 +            for (j = 0; j < 2; j++)
 +            {
 +                if (gpu_t->ktime[i][j].c)
 +                {
 +                    print_gputimes(fplog, k_log_str[i][j],
 +                                   gpu_t->ktime[i][j].c, gpu_t->ktime[i][j].t, tot_gpu);
 +                }
 +            }
 +        }
 +
 +        print_gputimes(fplog, "F D2H",  gpu_t->nb_c, gpu_t->nb_d2h_t, tot_gpu);
 +        fprintf(fplog, "%s\n", hline);
 +        print_gputimes(fplog, "Total ", gpu_t->nb_c, tot_gpu, tot_gpu);
 +        fprintf(fplog, "%s\n", hline);
 +
 +        gpu_cpu_ratio = tot_gpu/tot_cpu_overlap;
 +        fprintf(fplog, "\nForce evaluation time GPU/CPU: %.3f ms/%.3f ms = %.3f\n",
 +                tot_gpu/gpu_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n,
 +                gpu_cpu_ratio);
 +
 +        /* only print notes related to CPU-GPU load balance with PME */
 +        if (wc->wcc[ewcPMEMESH].n > 0)
 +        {
 +            fprintf(fplog, "For optimal performance this ratio should be close to 1!\n");
 +
 +            /* print a note if the imbalance is high, in the PME case where
 +             * CPU-GPU load balancing is possible */
 +            if (gpu_cpu_ratio < 0.75 || gpu_cpu_ratio > 1.2)
 +            {
 +                /* Only the sim master calls this function, so always print to stderr */
 +                if (gpu_cpu_ratio < 0.75)
 +                {
 +                    if (npp > 1)
 +                    {
 +                        /* The user could have used -notunepme,
 +                         * but we currently can't check that here.
 +                         */
 +                        md_print_warn(NULL, fplog,
 +                                      "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
 +                                      "      performance loss. Maybe the domain decomposition limits the PME tuning.\n"
 +                                      "      In that case, try setting the DD grid manually (-dd) or lowering -dds.");
 +                    }
 +                    else
 +                    {
 +                        /* We should not end up here, unless the box is
 +                         * too small for increasing the cut-off for PME tuning.
 +                         */
 +                        md_print_warn(NULL, fplog,
 +                                      "\nNOTE: The GPU has >25%% less load than the CPU. This imbalance causes\n"
 +                                      "      performance loss.");
 +                    }
 +                }
 +                if (gpu_cpu_ratio > 1.2)
 +                {
 +                    md_print_warn(NULL, fplog,
 +                                  "\nNOTE: The GPU has >20%% more load than the CPU. This imbalance causes\n"
 +                                  "      performance loss, consider using a shorter cut-off and a finer PME grid.");
 +                }
 +            }
 +        }
 +    }
 +
 +    if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 &&
 +        (cycles[ewcDOMDEC] > tot*0.1 ||
 +         cycles[ewcNS] > tot*0.1))
 +    {
 +        /* Only the sim master calls this function, so always print to stderr */
 +        if (wc->wcc[ewcDOMDEC].n == 0)
 +        {
 +            md_print_warn(NULL, fplog,
 +                          "NOTE: %d %% of the run time was spent in pair search,\n"
 +                          "      you might want to increase nstlist (this has no effect on accuracy)\n",
 +                          (int)(100*cycles[ewcNS]/tot+0.5));
 +        }
 +        else
 +        {
 +            md_print_warn(NULL, fplog,
 +                          "NOTE: %d %% of the run time was spent in domain decomposition,\n"
 +                          "      %d %% of the run time was spent in pair search,\n"
 +                          "      you might want to increase nstlist (this has no effect on accuracy)\n",
 +                          (int)(100*cycles[ewcDOMDEC]/tot+0.5),
 +                          (int)(100*cycles[ewcNS]/tot+0.5));
 +        }
 +    }
 +
 +    if (cycles[ewcMoveE] > tot*0.05)
 +    {
 +        /* Only the sim master calls this function, so always print to stderr */
 +        md_print_warn(NULL, fplog,
 +                      "NOTE: %d %% of the run time was spent communicating energies,\n"
 +                      "      you might want to use the -gcom option of mdrun\n",
 +                      (int)(100*cycles[ewcMoveE]/tot+0.5));
 +    }
 +}
 +
 +extern gmx_large_int_t wcycle_get_reset_counters(gmx_wallcycle_t wc)
 +{
 +    if (wc == NULL)
 +    {
 +        return -1;
 +    }
 +
 +    return wc->reset_counters;
 +}
 +
 +extern void wcycle_set_reset_counters(gmx_wallcycle_t wc, gmx_large_int_t reset_counters)
 +{
 +    if (wc == NULL)
 +    {
 +        return;
 +    }
 +
 +    wc->reset_counters = reset_counters;
 +}
 +
 +#ifdef GMX_CYCLE_SUBCOUNTERS
 +
 +void wallcycle_sub_start(gmx_wallcycle_t wc, int ewcs)
 +{
 +    if (wc != NULL)
 +    {
 +        wc->wcsc[ewcs].start = gmx_cycles_read();
 +    }
 +}
 +
 +void wallcycle_sub_stop(gmx_wallcycle_t wc, int ewcs)
 +{
 +    if (wc != NULL)
 +    {
 +        wc->wcsc[ewcs].c += gmx_cycles_read() - wc->wcsc[ewcs].start;
 +        wc->wcsc[ewcs].n++;
 +    }
 +}
 +
 +#endif /* GMX_CYCLE_SUBCOUNTERS */
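When built with GMX_CYCLE_SUBCOUNTERS these behave as lightweight versions of the main counters (no MPI barrier, no nesting checks). A minimal sketch; the enum value ewcsNONBONDED is assumed to correspond to the "Nonbonded F" entry in wcsn above:

    #ifdef GMX_CYCLE_SUBCOUNTERS
        wallcycle_sub_start(wc, ewcsNONBONDED);
        /* ... non-bonded kernel ... */
        wallcycle_sub_stop(wc, ewcsNONBONDED);
    #endif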
index 2d7fa7907975f3c88c09c44f7ffea83fa49aa28d,0000000000000000000000000000000000000000..5fa7e693cd828f2fe96f805c7d3193d67ae840ce
mode 100644,000000..100644
--- /dev/null
@@@ -1,2863 -1,0 +1,2865 @@@
-                 "\nEnergy minimization reached the maximum number"
-                 "of steps before the forces reached the requested"
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <string.h>
 +#include <time.h>
 +#include <math.h>
 +#include "sysstuff.h"
 +#include "string2.h"
 +#include "network.h"
 +#include "confio.h"
 +#include "smalloc.h"
 +#include "nrnb.h"
 +#include "main.h"
 +#include "force.h"
 +#include "macros.h"
 +#include "random.h"
 +#include "names.h"
 +#include "gmx_fatal.h"
 +#include "txtdump.h"
 +#include "typedefs.h"
 +#include "update.h"
 +#include "constr.h"
 +#include "vec.h"
 +#include "statutil.h"
 +#include "tgroup.h"
 +#include "mdebin.h"
 +#include "vsite.h"
 +#include "force.h"
 +#include "mdrun.h"
 +#include "md_support.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "trnio.h"
 +#include "mdatoms.h"
 +#include "ns.h"
 +#include "gmx_wallcycle.h"
 +#include "mtop_util.h"
 +#include "gmxfio.h"
 +#include "pme.h"
 +#include "bondf.h"
 +#include "gmx_omp_nthreads.h"
++#include "md_logging.h"
 +
 +
 +#include "gromacs/linearalgebra/mtxio.h"
 +#include "gromacs/linearalgebra/sparsematrix.h"
 +
 +typedef struct {
 +    t_state  s;
 +    rvec    *f;
 +    real     epot;
 +    real     fnorm;
 +    real     fmax;
 +    int      a_fmax;
 +} em_state_t;
 +
 +static em_state_t *init_em_state()
 +{
 +    em_state_t *ems;
 +
 +    snew(ems, 1);
 +
 +    /* does this need to be here? Should the array be declared differently (statically) in the state definition? */
 +    snew(ems->s.lambda, efptNR);
 +
 +    return ems;
 +}
 +
 +static void print_em_start(FILE *fplog, t_commrec *cr, gmx_runtime_t *runtime,
 +                           gmx_wallcycle_t wcycle,
 +                           const char *name)
 +{
 +    char buf[STRLEN];
 +
 +    runtime_start(runtime);
 +
 +    sprintf(buf, "Started %s", name);
 +    print_date_and_time(fplog, cr->nodeid, buf, NULL);
 +
 +    wallcycle_start(wcycle, ewcRUN);
 +}
 +static void em_time_end(gmx_runtime_t  *runtime,
 +                        gmx_wallcycle_t wcycle)
 +{
 +    wallcycle_stop(wcycle, ewcRUN);
 +
 +    runtime_end(runtime);
 +}
 +
 +static void sp_header(FILE *out, const char *minimizer, real ftol, int nsteps)
 +{
 +    fprintf(out, "\n");
 +    fprintf(out, "%s:\n", minimizer);
 +    fprintf(out, "   Tolerance (Fmax)   = %12.5e\n", ftol);
 +    fprintf(out, "   Number of steps    = %12d\n", nsteps);
 +}
 +
 +static void warn_step(FILE *fp, real ftol, gmx_bool bLastStep, gmx_bool bConstrain)
 +{
 +    char buffer[2048];
 +    if (bLastStep)
 +    {
 +        sprintf(buffer,
-                 "\nEnergy minimization has stopped, but the forces have"
-                 "not converged to the requested precision Fmax < %g (which"
-                 "may not be possible for your system). It stopped"
-                 "because the algorithm tried to make a new step whose size"
-                 "was too small, or there was no change in the energy since"
-                 "last step. Either way, we regard the minimization as"
-                 "converged to within the available machine precision,"
++                "\nEnergy minimization reached the maximum number "
++                "of steps before the forces reached the requested "
 +                "precision Fmax < %g.\n", ftol);
 +    }
 +    else
 +    {
 +        sprintf(buffer,
-                 "\nDouble precision normally gives you higher accuracy, but"
-                 "this is often not needed for preparing to run molecular"
++                "\nEnergy minimization has stopped, but the forces have "
++                "not converged to the requested precision Fmax < %g (which "
++                "may not be possible for your system). It stopped "
++                "because the algorithm tried to make a new step whose size "
++                "was too small, or there was no change in the energy since "
++                "last step. Either way, we regard the minimization as "
++                "converged to within the available machine precision, "
 +                "given your starting configuration and EM parameters.\n%s%s",
 +                ftol,
 +                sizeof(real) < sizeof(double) ?
-     size_t               sz;
++                "\nDouble precision normally gives you higher accuracy, but "
++                "this is often not needed for preparing to run molecular "
 +                "dynamics.\n" :
 +                "",
 +                bConstrain ?
 +                "You might need to increase your constraint accuracy, or turn\n"
 +                "off constraints altogether (set constraints = none in mdp file)\n" :
 +                "");
 +    }
 +    fputs(wrap_lines(buffer, 78, 0, FALSE), fp);
 +}
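The trailing spaces inside the reflowed string literals above are significant: adjacent C string literals concatenate with no separator, so without them words run together ("...maximum numberof steps..."). A tiny illustration:

    const char *bad  = "maximum number"  "of steps"; /* "maximum numberof steps" */
    const char *good = "maximum number " "of steps"; /* "maximum number of steps" */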
 +
 +
 +
 +static void print_converged(FILE *fp, const char *alg, real ftol,
 +                            gmx_large_int_t count, gmx_bool bDone, gmx_large_int_t nsteps,
 +                            real epot, real fmax, int nfmax, real fnorm)
 +{
 +    char buf[STEPSTRSIZE];
 +
 +    if (bDone)
 +    {
 +        fprintf(fp, "\n%s converged to Fmax < %g in %s steps\n",
 +                alg, ftol, gmx_step_str(count, buf));
 +    }
 +    else if (count < nsteps)
 +    {
 +        fprintf(fp, "\n%s converged to machine precision in %s steps,\n"
 +                "but did not reach the requested Fmax < %g.\n",
 +                alg, gmx_step_str(count, buf), ftol);
 +    }
 +    else
 +    {
 +        fprintf(fp, "\n%s did not converge to Fmax < %g in %s steps.\n",
 +                alg, ftol, gmx_step_str(count, buf));
 +    }
 +
 +#ifdef GMX_DOUBLE
 +    fprintf(fp, "Potential Energy  = %21.14e\n", epot);
 +    fprintf(fp, "Maximum force     = %21.14e on atom %d\n", fmax, nfmax+1);
 +    fprintf(fp, "Norm of force     = %21.14e\n", fnorm);
 +#else
 +    fprintf(fp, "Potential Energy  = %14.7e\n", epot);
 +    fprintf(fp, "Maximum force     = %14.7e on atom %d\n", fmax, nfmax+1);
 +    fprintf(fp, "Norm of force     = %14.7e\n", fnorm);
 +#endif
 +}
 +
 +static void get_f_norm_max(t_commrec *cr,
 +                           t_grpopts *opts, t_mdatoms *mdatoms, rvec *f,
 +                           real *fnorm, real *fmax, int *a_fmax)
 +{
 +    double fnorm2, *sum;
 +    real   fmax2, fmax2_0, fam;
 +    int    la_max, a_max, start, end, i, m, gf;
 +
 +    /* This routine finds the largest force and returns it.
 +     * On parallel machines the global max is taken.
 +     */
 +    fnorm2 = 0;
 +    fmax2  = 0;
 +    la_max = -1;
 +    gf     = 0;
 +    start  = mdatoms->start;
 +    end    = mdatoms->homenr + start;
 +    if (mdatoms->cFREEZE)
 +    {
 +        for (i = start; i < end; i++)
 +        {
 +            gf  = mdatoms->cFREEZE[i];
 +            fam = 0;
 +            for (m = 0; m < DIM; m++)
 +            {
 +                if (!opts->nFreeze[gf][m])
 +                {
 +                    fam += sqr(f[i][m]);
 +                }
 +            }
 +            fnorm2 += fam;
 +            if (fam > fmax2)
 +            {
 +                fmax2  = fam;
 +                la_max = i;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        for (i = start; i < end; i++)
 +        {
 +            fam     = norm2(f[i]);
 +            fnorm2 += fam;
 +            if (fam > fmax2)
 +            {
 +                fmax2  = fam;
 +                la_max = i;
 +            }
 +        }
 +    }
 +
 +    if (la_max >= 0 && DOMAINDECOMP(cr))
 +    {
 +        a_max = cr->dd->gatindex[la_max];
 +    }
 +    else
 +    {
 +        a_max = la_max;
 +    }
 +    if (PAR(cr))
 +    {
 +        snew(sum, 2*cr->nnodes+1);
 +        sum[2*cr->nodeid]   = fmax2;
 +        sum[2*cr->nodeid+1] = a_max;
 +        sum[2*cr->nnodes]   = fnorm2;
 +        gmx_sumd(2*cr->nnodes+1, sum, cr);
 +        fnorm2 = sum[2*cr->nnodes];
 +        /* Determine the global maximum */
 +        for (i = 0; i < cr->nnodes; i++)
 +        {
 +            if (sum[2*i] > fmax2)
 +            {
 +                fmax2 = sum[2*i];
 +                a_max = (int)(sum[2*i+1] + 0.5);
 +            }
 +        }
 +        sfree(sum);
 +    }
 +
 +    if (fnorm)
 +    {
 +        *fnorm = sqrt(fnorm2);
 +    }
 +    if (fmax)
 +    {
 +        *fmax  = sqrt(fmax2);
 +    }
 +    if (a_fmax)
 +    {
 +        *a_fmax = a_max;
 +    }
 +}
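The parallel branch above uses a pack-and-sum idiom: each rank writes its local (fmax^2, atom index) pair into its own two slots and its squared norm into a shared slot; a single gmx_sumd then acts as a gather (all other slots are zero), after which a scan recovers the global maximum and the owning atom. The idiom in isolation, with illustrative names:

    snew(sum, 2*nnodes + 1);
    sum[2*nodeid]     = fmax2_local;  /* only this rank writes these two slots */
    sum[2*nodeid + 1] = a_max_local;
    sum[2*nnodes]     = fnorm2_local; /* every rank contributes to the norm slot */
    gmx_sumd(2*nnodes + 1, sum, cr);  /* sum == gather for the per-rank slots */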
 +
 +static void get_state_f_norm_max(t_commrec *cr,
 +                                 t_grpopts *opts, t_mdatoms *mdatoms,
 +                                 em_state_t *ems)
 +{
 +    get_f_norm_max(cr, opts, mdatoms, ems->f, &ems->fnorm, &ems->fmax, &ems->a_fmax);
 +}
 +
 +void init_em(FILE *fplog, const char *title,
 +             t_commrec *cr, t_inputrec *ir,
 +             t_state *state_global, gmx_mtop_t *top_global,
 +             em_state_t *ems, gmx_localtop_t **top,
 +             rvec **f, rvec **f_global,
 +             t_nrnb *nrnb, rvec mu_tot,
 +             t_forcerec *fr, gmx_enerdata_t **enerd,
 +             t_graph **graph, t_mdatoms *mdatoms, gmx_global_stat_t *gstat,
 +             gmx_vsite_t *vsite, gmx_constr_t constr,
 +             int nfile, const t_filenm fnm[],
 +             gmx_mdoutf_t **outf, t_mdebin **mdebin)
 +{
 +    int  start, homenr, i;
 +    real dvdl_constr;
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog, "Initiating %s\n", title);
 +    }
 +
 +    state_global->ngtc = 0;
 +
 +    /* Initialize lambda variables */
 +    initialize_lambdas(fplog, ir, &(state_global->fep_state), state_global->lambda, NULL);
 +
 +    init_nrnb(nrnb);
 +
 +    if (DOMAINDECOMP(cr))
 +    {
 +        *top = dd_init_local_top(top_global);
 +
 +        dd_init_local_state(cr->dd, state_global, &ems->s);
 +
 +        *f = NULL;
 +
 +        /* Distribute the charge groups over the nodes from the master node */
 +        dd_partition_system(fplog, ir->init_step, cr, TRUE, 1,
 +                            state_global, top_global, ir,
 +                            &ems->s, &ems->f, mdatoms, *top,
 +                            fr, vsite, NULL, constr,
 +                            nrnb, NULL, FALSE);
 +        dd_store_state(cr->dd, &ems->s);
 +
 +        if (ir->nstfout)
 +        {
 +            snew(*f_global, top_global->natoms);
 +        }
 +        else
 +        {
 +            *f_global = NULL;
 +        }
 +        *graph = NULL;
 +    }
 +    else
 +    {
 +        snew(*f, top_global->natoms);
 +
 +        /* Just copy the state */
 +        ems->s = *state_global;
 +        snew(ems->s.x, ems->s.nalloc);
 +        snew(ems->f, ems->s.nalloc);
 +        for (i = 0; i < state_global->natoms; i++)
 +        {
 +            copy_rvec(state_global->x[i], ems->s.x[i]);
 +        }
 +        copy_mat(state_global->box, ems->s.box);
 +
 +        if (PAR(cr) && ir->eI != eiNM)
 +        {
 +            /* Initialize the particle decomposition and split the topology */
 +            *top = split_system(fplog, top_global, ir, cr);
 +
 +            pd_cg_range(cr, &fr->cg0, &fr->hcg);
 +        }
 +        else
 +        {
 +            *top = gmx_mtop_generate_local_top(top_global, ir);
 +        }
 +        *f_global = *f;
 +
 +        forcerec_set_excl_load(fr, *top, cr);
 +
 +        init_bonded_thread_force_reduction(fr, &(*top)->idef);
 +
 +        if (ir->ePBC != epbcNONE && !fr->bMolPBC)
 +        {
 +            *graph = mk_graph(fplog, &((*top)->idef), 0, top_global->natoms, FALSE, FALSE);
 +        }
 +        else
 +        {
 +            *graph = NULL;
 +        }
 +
 +        if (PARTDECOMP(cr))
 +        {
 +            pd_at_range(cr, &start, &homenr);
 +            homenr -= start;
 +        }
 +        else
 +        {
 +            start  = 0;
 +            homenr = top_global->natoms;
 +        }
 +        atoms2md(top_global, ir, 0, NULL, start, homenr, mdatoms);
 +        update_mdatoms(mdatoms, state_global->lambda[efptFEP]);
 +
 +        if (vsite)
 +        {
 +            set_vsite_top(vsite, *top, mdatoms, cr);
 +        }
 +    }
 +
 +    if (constr)
 +    {
 +        if (ir->eConstrAlg == econtSHAKE &&
 +            gmx_mtop_ftype_count(top_global, F_CONSTR) > 0)
 +        {
 +            gmx_fatal(FARGS, "Can not do energy minimization with %s, use %s\n",
 +                      econstr_names[econtSHAKE], econstr_names[econtLINCS]);
 +        }
 +
 +        if (!DOMAINDECOMP(cr))
 +        {
 +            set_constraints(constr, *top, ir, mdatoms, cr);
 +        }
 +
 +        if (!ir->bContinuation)
 +        {
 +            /* Constrain the starting coordinates */
 +            dvdl_constr = 0;
 +            constrain(PAR(cr) ? NULL : fplog, TRUE, TRUE, constr, &(*top)->idef,
 +                      ir, NULL, cr, -1, 0, mdatoms,
 +                      ems->s.x, ems->s.x, NULL, fr->bMolPBC, ems->s.box,
 +                      ems->s.lambda[efptFEP], &dvdl_constr,
 +                      NULL, NULL, nrnb, econqCoord, FALSE, 0, 0);
 +        }
 +    }
 +
 +    if (PAR(cr))
 +    {
 +        *gstat = global_stat_init(ir);
 +    }
 +
 +    *outf = init_mdoutf(nfile, fnm, 0, cr, ir, NULL);
 +
 +    snew(*enerd, 1);
 +    init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda,
 +                  *enerd);
 +
 +    if (mdebin != NULL)
 +    {
 +        /* Init bin for energy stuff */
 +        *mdebin = init_mdebin((*outf)->fp_ene, top_global, ir, NULL);
 +    }
 +
 +    clear_rvec(mu_tot);
 +    calc_shifts(ems->s.box, fr->shift_vec);
 +}
 +
 +static void finish_em(t_commrec *cr, gmx_mdoutf_t *outf,
 +                      gmx_runtime_t *runtime, gmx_wallcycle_t wcycle)
 +{
 +    if (!(cr->duty & DUTY_PME))
 +    {
 +        /* Tell the PME only node to finish */
 +        gmx_pme_send_finish(cr);
 +    }
 +
 +    done_mdoutf(outf);
 +
 +    em_time_end(runtime, wcycle);
 +}
 +
 +static void swap_em_state(em_state_t *ems1, em_state_t *ems2)
 +{
 +    em_state_t tmp;
 +
 +    tmp   = *ems1;
 +    *ems1 = *ems2;
 +    *ems2 = tmp;
 +}
 +
 +static void copy_em_coords(em_state_t *ems, t_state *state)
 +{
 +    int i;
 +
 +    for (i = 0; (i < state->natoms); i++)
 +    {
 +        copy_rvec(ems->s.x[i], state->x[i]);
 +    }
 +}
 +
 +static void write_em_traj(FILE *fplog, t_commrec *cr,
 +                          gmx_mdoutf_t *outf,
 +                          gmx_bool bX, gmx_bool bF, const char *confout,
 +                          gmx_mtop_t *top_global,
 +                          t_inputrec *ir, gmx_large_int_t step,
 +                          em_state_t *state,
 +                          t_state *state_global, rvec *f_global)
 +{
 +    int mdof_flags;
 +
 +    if ((bX || bF || confout != NULL) && !DOMAINDECOMP(cr))
 +    {
 +        copy_em_coords(state, state_global);
 +        f_global = state->f;
 +    }
 +
 +    mdof_flags = 0;
 +    if (bX)
 +    {
 +        mdof_flags |= MDOF_X;
 +    }
 +    if (bF)
 +    {
 +        mdof_flags |= MDOF_F;
 +    }
 +    write_traj(fplog, cr, outf, mdof_flags,
 +               top_global, step, (double)step,
 +               &state->s, state_global, state->f, f_global, NULL, NULL);
 +
 +    if (confout != NULL && MASTER(cr))
 +    {
 +        if (ir->ePBC != epbcNONE && !ir->bPeriodicMols && DOMAINDECOMP(cr))
 +        {
 +            /* Make molecules whole only for confout writing */
 +            do_pbc_mtop(fplog, ir->ePBC, state_global->box, top_global,
 +                        state_global->x);
 +        }
 +
 +        write_sto_conf_mtop(confout,
 +                            *top_global->name, top_global,
 +                            state_global->x, NULL, ir->ePBC, state_global->box);
 +    }
 +}
 +
 +static void do_em_step(t_commrec *cr, t_inputrec *ir, t_mdatoms *md,
 +                       gmx_bool bMolPBC,
 +                       em_state_t *ems1, real a, rvec *f, em_state_t *ems2,
 +                       gmx_constr_t constr, gmx_localtop_t *top,
 +                       t_nrnb *nrnb, gmx_wallcycle_t wcycle,
 +                       gmx_large_int_t count)
 +
 +{
 +    t_state *s1, *s2;
 +    int      i;
 +    int      start, end;
 +    rvec    *x1, *x2;
 +    real     dvdl_constr;
 +
 +    s1 = &ems1->s;
 +    s2 = &ems2->s;
 +
 +    if (DOMAINDECOMP(cr) && s1->ddp_count != cr->dd->ddp_count)
 +    {
 +        gmx_incons("state mismatch in do_em_step");
 +    }
 +
 +    s2->flags = s1->flags;
 +
 +    if (s2->nalloc != s1->nalloc)
 +    {
 +        s2->nalloc = s1->nalloc;
 +        srenew(s2->x, s1->nalloc);
 +        srenew(ems2->f,  s1->nalloc);
 +        if (s2->flags & (1<<estCGP))
 +        {
 +            srenew(s2->cg_p,  s1->nalloc);
 +        }
 +    }
 +
 +    s2->natoms = s1->natoms;
 +    copy_mat(s1->box, s2->box);
 +    /* Copy free energy state */
 +    for (i = 0; i < efptNR; i++)
 +    {
 +        s2->lambda[i] = s1->lambda[i];
 +    }
 +    copy_mat(s1->box, s2->box);
 +
 +    start = md->start;
 +    end   = md->start + md->homenr;
 +
 +    x1 = s1->x;
 +    x2 = s2->x;
 +
 +#pragma omp parallel num_threads(gmx_omp_nthreads_get(emntUpdate))
 +    {
 +        int gf, i, m;
 +
 +        gf = 0;
 +#pragma omp for schedule(static) nowait
 +        for (i = start; i < end; i++)
 +        {
 +            if (md->cFREEZE)
 +            {
 +                gf = md->cFREEZE[i];
 +            }
 +            for (m = 0; m < DIM; m++)
 +            {
 +                if (ir->opts.nFreeze[gf][m])
 +                {
 +                    x2[i][m] = x1[i][m];
 +                }
 +                else
 +                {
 +                    x2[i][m] = x1[i][m] + a*f[i][m];
 +                }
 +            }
 +        }
 +
 +        if (s2->flags & (1<<estCGP))
 +        {
 +            /* Copy the CG p vector */
 +            x1 = s1->cg_p;
 +            x2 = s2->cg_p;
 +#pragma omp for schedule(static) nowait
 +            for (i = start; i < end; i++)
 +            {
 +                copy_rvec(x1[i], x2[i]);
 +            }
 +        }
 +
 +        if (DOMAINDECOMP(cr))
 +        {
 +            s2->ddp_count = s1->ddp_count;
 +            if (s2->cg_gl_nalloc < s1->cg_gl_nalloc)
 +            {
 +#pragma omp barrier
 +                s2->cg_gl_nalloc = s1->cg_gl_nalloc;
 +                srenew(s2->cg_gl, s2->cg_gl_nalloc);
 +#pragma omp barrier
 +            }
 +            s2->ncg_gl = s1->ncg_gl;
 +#pragma omp for schedule(static) nowait
 +            for (i = 0; i < s2->ncg_gl; i++)
 +            {
 +                s2->cg_gl[i] = s1->cg_gl[i];
 +            }
 +            s2->ddp_count_cg_gl = s1->ddp_count_cg_gl;
 +        }
 +    }
 +
 +    if (constr)
 +    {
 +        wallcycle_start(wcycle, ewcCONSTR);
 +        dvdl_constr = 0;
 +        constrain(NULL, TRUE, TRUE, constr, &top->idef,
 +                  ir, NULL, cr, count, 0, md,
 +                  s1->x, s2->x, NULL, bMolPBC, s2->box,
 +                  s2->lambda[efptBONDED], &dvdl_constr,
 +                  NULL, NULL, nrnb, econqCoord, FALSE, 0, 0);
 +        wallcycle_stop(wcycle, ewcCONSTR);
 +    }
 +}
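Stripped of freeze groups and constraints, the position update above is a single step of length a along the force; in LaTeX:

    x_{i,m}^{\mathrm{new}} =
    \begin{cases}
        x_{i,m}^{\mathrm{old}}, & \text{dimension } m \text{ of atom } i \text{ frozen},\\
        x_{i,m}^{\mathrm{old}} + a\,F_{i,m}, & \text{otherwise.}
    \end{cases}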
 +
 +static void em_dd_partition_system(FILE *fplog, int step, t_commrec *cr,
 +                                   gmx_mtop_t *top_global, t_inputrec *ir,
 +                                   em_state_t *ems, gmx_localtop_t *top,
 +                                   t_mdatoms *mdatoms, t_forcerec *fr,
 +                                   gmx_vsite_t *vsite, gmx_constr_t constr,
 +                                   t_nrnb *nrnb, gmx_wallcycle_t wcycle)
 +{
 +    /* Repartition the domain decomposition */
 +    wallcycle_start(wcycle, ewcDOMDEC);
 +    dd_partition_system(fplog, step, cr, FALSE, 1,
 +                        NULL, top_global, ir,
 +                        &ems->s, &ems->f,
 +                        mdatoms, top, fr, vsite, NULL, constr,
 +                        nrnb, wcycle, FALSE);
 +    dd_store_state(cr->dd, &ems->s);
 +    wallcycle_stop(wcycle, ewcDOMDEC);
 +}
 +
 +static void evaluate_energy(FILE *fplog, t_commrec *cr,
 +                            gmx_mtop_t *top_global,
 +                            em_state_t *ems, gmx_localtop_t *top,
 +                            t_inputrec *inputrec,
 +                            t_nrnb *nrnb, gmx_wallcycle_t wcycle,
 +                            gmx_global_stat_t gstat,
 +                            gmx_vsite_t *vsite, gmx_constr_t constr,
 +                            t_fcdata *fcd,
 +                            t_graph *graph, t_mdatoms *mdatoms,
 +                            t_forcerec *fr, rvec mu_tot,
 +                            gmx_enerdata_t *enerd, tensor vir, tensor pres,
 +                            gmx_large_int_t count, gmx_bool bFirst)
 +{
 +    real     t;
 +    gmx_bool bNS;
 +    int      nabnsb;
 +    tensor   force_vir, shake_vir, ekin;
 +    real     dvdl_constr, prescorr, enercorr, dvdlcorr;
 +    real     terminate = 0;
 +
 +    /* Set the time to the initial time; the time does not change during EM */
 +    t = inputrec->init_t;
 +
 +    if (bFirst ||
 +        (DOMAINDECOMP(cr) && ems->s.ddp_count < cr->dd->ddp_count))
 +    {
 +        /* This is the first state, or an old state used before the last neighbor search */
 +        bNS = TRUE;
 +    }
 +    else
 +    {
 +        bNS = FALSE;
 +        if (inputrec->nstlist > 0)
 +        {
 +            bNS = TRUE;
 +        }
 +        else if (inputrec->nstlist == -1)
 +        {
 +            nabnsb = natoms_beyond_ns_buffer(inputrec, fr, &top->cgs, NULL, ems->s.x);
 +            if (PAR(cr))
 +            {
 +                gmx_sumi(1, &nabnsb, cr);
 +            }
 +            bNS = (nabnsb > 0);
 +        }
 +    }
 +
 +    if (vsite)
 +    {
 +        construct_vsites(vsite, ems->s.x, 1, NULL,
 +                         top->idef.iparams, top->idef.il,
 +                         fr->ePBC, fr->bMolPBC, graph, cr, ems->s.box);
 +    }
 +
 +    if (DOMAINDECOMP(cr))
 +    {
 +        if (bNS)
 +        {
 +            /* Repartition the domain decomposition */
 +            em_dd_partition_system(fplog, count, cr, top_global, inputrec,
 +                                   ems, top, mdatoms, fr, vsite, constr,
 +                                   nrnb, wcycle);
 +        }
 +    }
 +
 +    /* Calc force & energy on new trial position  */
 +    /* do_force always puts the charge groups in the box and shifts again
 +     * We do not unshift, so molecules are always whole in congrad.c
 +     */
 +    do_force(fplog, cr, inputrec,
 +             count, nrnb, wcycle, top, &top_global->groups,
 +             ems->s.box, ems->s.x, &ems->s.hist,
 +             ems->f, force_vir, mdatoms, enerd, fcd,
 +             ems->s.lambda, graph, fr, vsite, mu_tot, t, NULL, NULL, TRUE,
 +             GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES |
 +             GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY |
 +             (bNS ? GMX_FORCE_NS | GMX_FORCE_DO_LR : 0));
 +
 +    /* Clear the unused shake virial and pressure */
 +    clear_mat(shake_vir);
 +    clear_mat(pres);
 +
 +    /* Communicate stuff when parallel */
 +    if (PAR(cr) && inputrec->eI != eiNM)
 +    {
 +        wallcycle_start(wcycle, ewcMoveE);
 +
 +        global_stat(fplog, gstat, cr, enerd, force_vir, shake_vir, mu_tot,
 +                    inputrec, NULL, NULL, NULL, 1, &terminate,
 +                    top_global, &ems->s, FALSE,
 +                    CGLO_ENERGY |
 +                    CGLO_PRESSURE |
 +                    CGLO_CONSTRAINT |
 +                    CGLO_FIRSTITERATE);
 +
 +        wallcycle_stop(wcycle, ewcMoveE);
 +    }
 +
 +    /* Calculate long range corrections to pressure and energy */
 +    calc_dispcorr(fplog, inputrec, fr, count, top_global->natoms, ems->s.box, ems->s.lambda[efptVDW],
 +                  pres, force_vir, &prescorr, &enercorr, &dvdlcorr);
 +    enerd->term[F_DISPCORR] = enercorr;
 +    enerd->term[F_EPOT]    += enercorr;
 +    enerd->term[F_PRES]    += prescorr;
 +    enerd->term[F_DVDL]    += dvdlcorr;
 +
 +    ems->epot = enerd->term[F_EPOT];
 +
 +    if (constr)
 +    {
 +        /* Project out the constraint components of the force */
 +        wallcycle_start(wcycle, ewcCONSTR);
 +        dvdl_constr = 0;
 +        constrain(NULL, FALSE, FALSE, constr, &top->idef,
 +                  inputrec, NULL, cr, count, 0, mdatoms,
 +                  ems->s.x, ems->f, ems->f, fr->bMolPBC, ems->s.box,
 +                  ems->s.lambda[efptBONDED], &dvdl_constr,
 +                  NULL, &shake_vir, nrnb, econqForceDispl, FALSE, 0, 0);
 +        if (fr->bSepDVDL && fplog)
 +        {
 +            gmx_print_sepdvdl(fplog, "Constraints", t, dvdl_constr);
 +        }
 +        enerd->term[F_DVDL_CONSTR] += dvdl_constr;
 +        m_add(force_vir, shake_vir, vir);
 +        wallcycle_stop(wcycle, ewcCONSTR);
 +    }
 +    else
 +    {
 +        copy_mat(force_vir, vir);
 +    }
 +
 +    clear_mat(ekin);
 +    enerd->term[F_PRES] =
 +        calc_pres(fr->ePBC, inputrec->nwall, ems->s.box, ekin, vir, pres);
 +
 +    sum_dhdl(enerd, ems->s.lambda, inputrec->fepvals);
 +
 +    if (EI_ENERGY_MINIMIZATION(inputrec->eI))
 +    {
 +        get_state_f_norm_max(cr, &(inputrec->opts), mdatoms, ems);
 +    }
 +}
 +
 +static double reorder_partsum(t_commrec *cr, t_grpopts *opts, t_mdatoms *mdatoms,
 +                              gmx_mtop_t *mtop,
 +                              em_state_t *s_min, em_state_t *s_b)
 +{
 +    rvec          *fm, *fb, *fmg;
 +    t_block       *cgs_gl;
 +    int            ncg, *cg_gl, *index, c, cg, i, a0, a1, a, gf, m;
 +    double         partsum;
 +    unsigned char *grpnrFREEZE;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Doing reorder_partsum\n");
 +    }
 +
 +    fm = s_min->f;
 +    fb = s_b->f;
 +
 +    cgs_gl = dd_charge_groups_global(cr->dd);
 +    index  = cgs_gl->index;
 +
 +    /* Collect fm in a global vector fmg.
 +     * This conflicts with the spirit of domain decomposition,
 +     * but to fully optimize this a much more complicated algorithm is required.
 +     */
 +    snew(fmg, mtop->natoms);
 +
 +    ncg   = s_min->s.ncg_gl;
 +    cg_gl = s_min->s.cg_gl;
 +    i     = 0;
 +    for (c = 0; c < ncg; c++)
 +    {
 +        cg = cg_gl[c];
 +        a0 = index[cg];
 +        a1 = index[cg+1];
 +        for (a = a0; a < a1; a++)
 +        {
 +            copy_rvec(fm[i], fmg[a]);
 +            i++;
 +        }
 +    }
 +    gmx_sum(mtop->natoms*3, fmg[0], cr);
 +
 +    /* Now we will determine the part of the sum for the cgs in state s_b */
 +    ncg         = s_b->s.ncg_gl;
 +    cg_gl       = s_b->s.cg_gl;
 +    partsum     = 0;
 +    i           = 0;
 +    gf          = 0;
 +    grpnrFREEZE = mtop->groups.grpnr[egcFREEZE];
 +    for (c = 0; c < ncg; c++)
 +    {
 +        cg = cg_gl[c];
 +        a0 = index[cg];
 +        a1 = index[cg+1];
 +        for (a = a0; a < a1; a++)
 +        {
 +            if (mdatoms->cFREEZE && grpnrFREEZE)
 +            {
 +                gf = grpnrFREEZE[i];
 +            }
 +            for (m = 0; m < DIM; m++)
 +            {
 +                if (!opts->nFreeze[gf][m])
 +                {
 +                    partsum += (fb[i][m] - fmg[a][m])*fb[i][m];
 +                }
 +            }
 +            i++;
 +        }
 +    }
 +
 +    sfree(fmg);
 +
 +    return partsum;
 +}
 +
 +static real pr_beta(t_commrec *cr, t_grpopts *opts, t_mdatoms *mdatoms,
 +                    gmx_mtop_t *mtop,
 +                    em_state_t *s_min, em_state_t *s_b)
 +{
 +    rvec  *fm, *fb;
 +    double sum;
 +    int    gf, i, m;
 +
 +    /* This is just the classical Polak-Ribiere calculation of beta;
 +     * it looks a bit complicated since we take freeze groups into account,
 +     * and might have to sum it in parallel runs.
 +     */
 +
 +    if (!DOMAINDECOMP(cr) ||
 +        (s_min->s.ddp_count == cr->dd->ddp_count &&
 +         s_b->s.ddp_count   == cr->dd->ddp_count))
 +    {
 +        fm  = s_min->f;
 +        fb  = s_b->f;
 +        sum = 0;
 +        gf  = 0;
 +        /* This part of code can be incorrect with DD,
 +         * since the atom ordering in s_b and s_min might differ.
 +         */
 +        for (i = mdatoms->start; i < mdatoms->start+mdatoms->homenr; i++)
 +        {
 +            if (mdatoms->cFREEZE)
 +            {
 +                gf = mdatoms->cFREEZE[i];
 +            }
 +            for (m = 0; m < DIM; m++)
 +            {
 +                if (!opts->nFreeze[gf][m])
 +                {
 +                    sum += (fb[i][m] - fm[i][m])*fb[i][m];
 +                }
 +            }
 +        }
 +    }
 +    else
 +    {
 +        /* We need to reorder cgs while summing */
 +        sum = reorder_partsum(cr, opts, mdatoms, mtop, s_min, s_b);
 +    }
 +    if (PAR(cr))
 +    {
 +        gmx_sumd(1, &sum, cr);
 +    }
 +
 +    return sum/sqr(s_min->fnorm);
 +}
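In the serial, unfrozen case the value returned above is the standard Polak-Ribiere coefficient, with F^b the force at the trial state s_b and F^min the force at the current minimum s_min:

    \beta^{\mathrm{PR}} = \frac{\sum_i \left(F_i^{b} - F_i^{\min}\right)\cdot F_i^{b}}{\lVert F^{\min}\rVert^{2}}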
 +
 +double do_cg(FILE *fplog, t_commrec *cr,
 +             int nfile, const t_filenm fnm[],
 +             const output_env_t gmx_unused oenv, gmx_bool bVerbose, gmx_bool gmx_unused bCompact,
 +             int gmx_unused nstglobalcomm,
 +             gmx_vsite_t *vsite, gmx_constr_t constr,
 +             int gmx_unused stepout,
 +             t_inputrec *inputrec,
 +             gmx_mtop_t *top_global, t_fcdata *fcd,
 +             t_state *state_global,
 +             t_mdatoms *mdatoms,
 +             t_nrnb *nrnb, gmx_wallcycle_t wcycle,
 +             gmx_edsam_t gmx_unused ed,
 +             t_forcerec *fr,
 +             int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed,
 +             gmx_membed_t gmx_unused membed,
 +             real gmx_unused cpt_period, real gmx_unused max_hours,
 +             const char gmx_unused *deviceOptions,
 +             unsigned long gmx_unused Flags,
 +             gmx_runtime_t *runtime)
 +{
 +    const char       *CG = "Polak-Ribiere Conjugate Gradients";
 +
 +    em_state_t       *s_min, *s_a, *s_b, *s_c;
 +    gmx_localtop_t   *top;
 +    gmx_enerdata_t   *enerd;
 +    rvec             *f;
 +    gmx_global_stat_t gstat;
 +    t_graph          *graph;
 +    rvec             *f_global, *p, *sf, *sfm;
 +    double            gpa, gpb, gpc, tmp, sum[2], minstep;
 +    real              fnormn;
 +    real              stepsize;
 +    real              a, b, c, beta = 0.0;
 +    real              epot_repl = 0;
 +    real              pnorm;
 +    t_mdebin         *mdebin;
 +    gmx_bool          converged, foundlower;
 +    rvec              mu_tot;
 +    gmx_bool          do_log = FALSE, do_ene = FALSE, do_x, do_f;
 +    tensor            vir, pres;
 +    int               number_steps, neval = 0, nstcg = inputrec->nstcgsteep;
 +    gmx_mdoutf_t     *outf;
 +    int               i, m, gf, step, nminstep;
 +    real              terminate = 0;
 +
 +    step = 0;
 +
 +    s_min = init_em_state();
 +    s_a   = init_em_state();
 +    s_b   = init_em_state();
 +    s_c   = init_em_state();
 +
 +    /* Init em and store the local state in s_min */
 +    init_em(fplog, CG, cr, inputrec,
 +            state_global, top_global, s_min, &top, &f, &f_global,
 +            nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr,
 +            nfile, fnm, &outf, &mdebin);
 +
 +    /* Print to log file */
 +    print_em_start(fplog, cr, runtime, wcycle, CG);
 +
 +    /* Max number of steps */
 +    number_steps = inputrec->nsteps;
 +
 +    if (MASTER(cr))
 +    {
 +        sp_header(stderr, CG, inputrec->em_tol, number_steps);
 +    }
 +    if (fplog)
 +    {
 +        sp_header(fplog, CG, inputrec->em_tol, number_steps);
 +    }
 +
 +    /* Call the force routine and some auxiliary (neighboursearching etc.) */
 +    /* do_force always puts the charge groups in the box and shifts again;
 +     * we do not unshift, so molecules are always whole in congrad.c
 +     */
 +    evaluate_energy(fplog, cr,
 +                    top_global, s_min, top,
 +                    inputrec, nrnb, wcycle, gstat,
 +                    vsite, constr, fcd, graph, mdatoms, fr,
 +                    mu_tot, enerd, vir, pres, -1, TRUE);
 +    where();
 +
 +    if (MASTER(cr))
 +    {
 +        /* Copy stuff to the energy bin for easy printing etc. */
 +        upd_mdebin(mdebin, FALSE, FALSE, (double)step,
 +                   mdatoms->tmass, enerd, &s_min->s, inputrec->fepvals, inputrec->expandedvals, s_min->s.box,
 +                   NULL, NULL, vir, pres, NULL, mu_tot, constr);
 +
 +        print_ebin_header(fplog, step, step, s_min->s.lambda[efptFEP]);
 +        print_ebin(outf->fp_ene, TRUE, FALSE, FALSE, fplog, step, step, eprNORMAL,
 +                   TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts));
 +    }
 +    where();
 +
 +    /* Estimate/guess the initial stepsize */
 +    stepsize = inputrec->em_stepsize/s_min->fnorm;
 +
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr, "   F-max             = %12.5e on atom %d\n",
 +                s_min->fmax, s_min->a_fmax+1);
 +        fprintf(stderr, "   F-Norm            = %12.5e\n",
 +                s_min->fnorm/sqrt(state_global->natoms));
 +        fprintf(stderr, "\n");
 +        /* and copy to the log file too... */
 +        fprintf(fplog, "   F-max             = %12.5e on atom %d\n",
 +                s_min->fmax, s_min->a_fmax+1);
 +        fprintf(fplog, "   F-Norm            = %12.5e\n",
 +                s_min->fnorm/sqrt(state_global->natoms));
 +        fprintf(fplog, "\n");
 +    }
 +    /* Start the loop over CG steps.
 +     * Each successful step is counted, and we continue until
 +     * we either converge or reach the max number of steps.
 +     */
 +    converged = FALSE;
 +    for (step = 0; (number_steps < 0 || (number_steps >= 0 && step <= number_steps)) && !converged; step++)
 +    {
 +
 +        /* Start taking steps in a new direction.
 +         * The first time we enter the routine, beta=0, and the direction is
 +         * simply the negative gradient.
 +         */
 +
 +        /* Calculate the new direction in p, and the gradient in this direction, gpa */
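 +        /* Conjugate-direction update: p_new = f + beta*p_old, with f = -grad(E),
 +         * so beta = 0 reduces to a plain steepest-descent direction.
 +         */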
 +        p   = s_min->s.cg_p;
 +        sf  = s_min->f;
 +        gpa = 0;
 +        gf  = 0;
 +        for (i = mdatoms->start; i < mdatoms->start+mdatoms->homenr; i++)
 +        {
 +            if (mdatoms->cFREEZE)
 +            {
 +                gf = mdatoms->cFREEZE[i];
 +            }
 +            for (m = 0; m < DIM; m++)
 +            {
 +                if (!inputrec->opts.nFreeze[gf][m])
 +                {
 +                    p[i][m] = sf[i][m] + beta*p[i][m];
 +                    gpa    -= p[i][m]*sf[i][m];
 +                    /* f is negative gradient, thus the sign */
 +                }
 +                else
 +                {
 +                    p[i][m] = 0;
 +                }
 +            }
 +        }
 +
 +        /* Sum the gradient along the line across CPUs */
 +        if (PAR(cr))
 +        {
 +            gmx_sumd(1, &gpa, cr);
 +        }
 +
 +        /* Calculate the norm of the search vector */
 +        get_f_norm_max(cr, &(inputrec->opts), mdatoms, p, &pnorm, NULL, NULL);
 +
 +        /* Just in case stepsize reaches zero due to numerical precision... */
 +        if (stepsize <= 0)
 +        {
 +            stepsize = inputrec->em_stepsize/pnorm;
 +        }
 +
 +        /*
 +         * Double check the value of the derivative in the search direction.
 +         * If it is positive it must be due to the old information in the
 +         * CG formula, so just remove that and start over with beta=0.
 +         * This corresponds to a steepest descent step.
 +         */
 +        if (gpa > 0)
 +        {
 +            beta = 0;
 +            step--;   /* Don't count this step since we are restarting */
 +            continue; /* Go back to the beginning of the big for-loop */
 +        }
 +
 +        /* Calculate the minimum allowed stepsize, below which the average
 +         * (norm) relative change in coordinates is smaller than the machine
 +         * precision
 +         */
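 +        /* For a unit step along p, each term below is the squared relative
 +         * change of one coordinate, so a step h changes the coordinates by
 +         * about h*sqrt(minstep/(3*natoms)) in relative RMS terms; equating
 +         * that to GMX_REAL_EPS gives the minstep computed after the loop.
 +         */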
 +        minstep = 0;
 +        for (i = mdatoms->start; i < mdatoms->start+mdatoms->homenr; i++)
 +        {
 +            for (m = 0; m < DIM; m++)
 +            {
 +                tmp = fabs(s_min->s.x[i][m]);
 +                if (tmp < 1.0)
 +                {
 +                    tmp = 1.0;
 +                }
 +                tmp      = p[i][m]/tmp;
 +                minstep += tmp*tmp;
 +            }
 +        }
 +        /* Add up from all CPUs */
 +        if (PAR(cr))
 +        {
 +            gmx_sumd(1, &minstep, cr);
 +        }
 +
 +        minstep = GMX_REAL_EPS/sqrt(minstep/(3*state_global->natoms));
 +
 +        if (stepsize < minstep)
 +        {
 +            converged = TRUE;
 +            break;
 +        }
 +
 +        /* Write coordinates if necessary */
 +        do_x = do_per_step(step, inputrec->nstxout);
 +        do_f = do_per_step(step, inputrec->nstfout);
 +
 +        write_em_traj(fplog, cr, outf, do_x, do_f, NULL,
 +                      top_global, inputrec, step,
 +                      s_min, state_global, f_global);
 +
 +        /* Take a step downhill.
 +         * In theory, we should minimize the function along this direction.
 +         * That is quite possible, but it turns out to take 5-10 function evaluations
 +         * for each line. However, we don't really need to find the exact minimum -
 +         * it is much better to start a new CG step in a modified direction as soon
 +         * as we are close to it. This will save a lot of energy evaluations.
 +         *
 +         * In practice, we just try to take a single step.
 +         * If it worked (i.e. lowered the energy), we increase the stepsize but
 +         * then continue straight to the next CG step without trying to find any minimum.
 +         * If it didn't work (higher energy), there must be a minimum somewhere between
 +         * the old position and the new one.
 +         *
 +         * Due to the finite numerical accuracy, it turns out that it is a good idea
 +         * to even accept a SMALL increase in energy, if the derivative is still downhill.
 +         * This leads to lower final energies in the tests I've done. / Erik
 +         */
 +        s_a->epot = s_min->epot;
 +        a         = 0.0;
 +        c         = a + stepsize; /* reference position along line is zero */
 +
 +        if (DOMAINDECOMP(cr) && s_min->s.ddp_count < cr->dd->ddp_count)
 +        {
 +            em_dd_partition_system(fplog, step, cr, top_global, inputrec,
 +                                   s_min, top, mdatoms, fr, vsite, constr,
 +                                   nrnb, wcycle);
 +        }
 +
 +        /* Take a trial step (new coords in s_c) */
 +        do_em_step(cr, inputrec, mdatoms, fr->bMolPBC, s_min, c, s_min->s.cg_p, s_c,
 +                   constr, top, nrnb, wcycle, -1);
 +
 +        neval++;
 +        /* Calculate energy for the trial step */
 +        evaluate_energy(fplog, cr,
 +                        top_global, s_c, top,
 +                        inputrec, nrnb, wcycle, gstat,
 +                        vsite, constr, fcd, graph, mdatoms, fr,
 +                        mu_tot, enerd, vir, pres, -1, FALSE);
 +
 +        /* Calc derivative along line */
 +        p   = s_c->s.cg_p;
 +        sf  = s_c->f;
 +        gpc = 0;
 +        for (i = mdatoms->start; i < mdatoms->start+mdatoms->homenr; i++)
 +        {
 +            for (m = 0; m < DIM; m++)
 +            {
 +                gpc -= p[i][m]*sf[i][m]; /* f is negative gradient, thus the sign */
 +            }
 +        }
 +        /* Sum the gradient along the line across CPUs */
 +        if (PAR(cr))
 +        {
 +            gmx_sumd(1, &gpc, cr);
 +        }
 +
 +        /* This is the max amount of increase in energy we tolerate */
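 +        /* sqrt(GMX_REAL_EPS)*|Epot| serves as an estimate of the numerical
 +         * uncertainty in the energy; increases smaller than this are treated
 +         * as indistinguishable from round-off.
 +         */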
 +        tmp = sqrt(GMX_REAL_EPS)*fabs(s_a->epot);
 +
 +        /* Accept the step if the energy is lower, or if it is not significantly higher
 +         * and the line derivative is still negative.
 +         */
 +        if (s_c->epot < s_a->epot || (gpc < 0 && s_c->epot < (s_a->epot + tmp)))
 +        {
 +            foundlower = TRUE;
 +            /* Great, we found a better energy. Increase step for next iteration
 +             * if we are still going down, decrease it otherwise
 +             */
 +            if (gpc < 0)
 +            {
 +                stepsize *= 1.618034; /* The golden section */
 +            }
 +            else
 +            {
 +                stepsize *= 0.618034; /* 1/golden section */
 +            }
 +        }
 +        else
 +        {
 +            /* New energy is the same or higher. We will have to do some work
 +             * to find a smaller value in the interval. Take smaller step next time!
 +             */
 +            foundlower = FALSE;
 +            stepsize  *= 0.618034;
 +        }
 +
 +
 +        /* OK, if we didn't find a lower value we will have to locate one now - there must
 +         * be one in the interval [a=0,c].
 +         * The same thing is valid here, though: Don't spend dozens of iterations to find
 +         * the line minimum. We try to interpolate based on the derivative at the endpoints,
 +         * and only continue until we find a lower value. In most cases this means 1-2 iterations.
 +         *
 +         * I also have a safeguard for potentially really pathological functions so we never
 +         * take more than 20 steps before we give up ...
 +         *
 +         * If we already found a lower value we just skip this step and continue to the update.
 +         */
 +        if (!foundlower)
 +        {
 +            nminstep = 0;
 +
 +            do
 +            {
 +                /* Select a new trial point.
 +                 * If the derivatives at points a & c have different sign we interpolate to zero,
 +                 * otherwise just do a bisection.
 +                 */
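 +                /* The interpolation is a secant (regula falsi) step: the linear
 +                 * model of the derivative through (a, gpa) and (c, gpc) vanishes
 +                 * at b = a - gpa*(c - a)/(gpc - gpa), which is the expression
 +                 * used below.
 +                 */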
 +                if (gpa < 0 && gpc > 0)
 +                {
 +                    b = a + gpa*(a-c)/(gpc-gpa);
 +                }
 +                else
 +                {
 +                    b = 0.5*(a+c);
 +                }
 +
 +                /* safeguard if interpolation close to machine accuracy causes errors:
 +                 * never go outside the interval
 +                 */
 +                if (b <= a || b >= c)
 +                {
 +                    b = 0.5*(a+c);
 +                }
 +
 +                if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count)
 +                {
 +                    /* Reload the old state */
 +                    em_dd_partition_system(fplog, -1, cr, top_global, inputrec,
 +                                           s_min, top, mdatoms, fr, vsite, constr,
 +                                           nrnb, wcycle);
 +                }
 +
 +                /* Take a trial step to this new point - new coords in s_b */
 +                do_em_step(cr, inputrec, mdatoms, fr->bMolPBC, s_min, b, s_min->s.cg_p, s_b,
 +                           constr, top, nrnb, wcycle, -1);
 +
 +                neval++;
 +                /* Calculate energy for the trial step */
 +                evaluate_energy(fplog, cr,
 +                                top_global, s_b, top,
 +                                inputrec, nrnb, wcycle, gstat,
 +                                vsite, constr, fcd, graph, mdatoms, fr,
 +                                mu_tot, enerd, vir, pres, -1, FALSE);
 +
 +                /* p does not change within a step, but since the domain decomposition
 +                 * might change, we have to use cg_p of s_b here.
 +                 */
 +                p   = s_b->s.cg_p;
 +                sf  = s_b->f;
 +                gpb = 0;
 +                for (i = mdatoms->start; i < mdatoms->start+mdatoms->homenr; i++)
 +                {
 +                    for (m = 0; m < DIM; m++)
 +                    {
 +                        gpb -= p[i][m]*sf[i][m]; /* f is negative gradient, thus the sign */
 +                    }
 +                }
 +                /* Sum the gradient along the line across CPUs */
 +                if (PAR(cr))
 +                {
 +                    gmx_sumd(1, &gpb, cr);
 +                }
 +
 +                if (debug)
 +                {
 +                    fprintf(debug, "CGE: EpotA %f EpotB %f EpotC %f gpb %f\n",
 +                            s_a->epot, s_b->epot, s_c->epot, gpb);
 +                }
 +
 +                epot_repl = s_b->epot;
 +
 +                /* Keep one of the intervals based on the value of the derivative at the new point */
 +                if (gpb > 0)
 +                {
 +                    /* Replace c endpoint with b */
 +                    swap_em_state(s_b, s_c);
 +                    c   = b;
 +                    gpc = gpb;
 +                }
 +                else
 +                {
 +                    /* Replace a endpoint with b */
 +                    swap_em_state(s_b, s_a);
 +                    a   = b;
 +                    gpa = gpb;
 +                }
 +
 +                /*
 +                 * Stop search as soon as we find a value smaller than the endpoints.
 +                 * Never run more than 20 steps, no matter what.
 +                 */
 +                nminstep++;
 +            }
 +            while ((epot_repl > s_a->epot || epot_repl > s_c->epot) &&
 +                   (nminstep < 20));
 +
 +            if (fabs(epot_repl - s_min->epot) < fabs(s_min->epot)*GMX_REAL_EPS ||
 +                nminstep >= 20)
 +            {
 +                /* OK. We couldn't find a significantly lower energy.
 +                 * If beta==0 this was steepest descent, and then we give up.
 +                 * If not, set beta=0 and restart with steepest descent before quitting.
 +                 */
 +                if (beta == 0.0)
 +                {
 +                    /* Converged */
 +                    converged = TRUE;
 +                    break;
 +                }
 +                else
 +                {
 +                    /* Reset memory before giving up */
 +                    beta = 0.0;
 +                    continue;
 +                }
 +            }
 +
 +            /* Select min energy state of A & C, put the best in B.
 +             */
 +            if (s_c->epot < s_a->epot)
 +            {
 +                if (debug)
 +                {
 +                    fprintf(debug, "CGE: C (%f) is lower than A (%f), moving C to B\n",
 +                            s_c->epot, s_a->epot);
 +                }
 +                swap_em_state(s_b, s_c);
 +                gpb = gpc;
 +                b   = c;
 +            }
 +            else
 +            {
 +                if (debug)
 +                {
 +                    fprintf(debug, "CGE: A (%f) is lower than C (%f), moving A to B\n",
 +                            s_a->epot, s_c->epot);
 +                }
 +                swap_em_state(s_b, s_a);
 +                gpb = gpa;
 +                b   = a;
 +            }
 +
 +        }
 +        else
 +        {
 +            if (debug)
 +            {
 +                fprintf(debug, "CGE: Found a lower energy %f, moving C to B\n",
 +                        s_c->epot);
 +            }
 +            swap_em_state(s_b, s_c);
 +            gpb = gpc;
 +            b   = c;
 +        }
 +
 +        /* new search direction */
 +        /* beta = 0 means forget all memory and restart with steepest descents. */
 +        if (nstcg && ((step % nstcg) == 0))
 +        {
 +            beta = 0.0;
 +        }
 +        else
 +        {
 +            /* s_min->fnorm cannot be zero, because then we would have converged
 +             * and broken out.
 +             */
 +
 +            /* Polak-Ribiere update.
 +             * Change to fnorm2/fnorm2_old for Fletcher-Reeves
 +             */
 +            beta = pr_beta(cr, &inputrec->opts, mdatoms, top_global, s_min, s_b);
 +        }
 +        /* Limit beta to prevent oscillations */
 +        if (fabs(beta) > 5.0)
 +        {
 +            beta = 0.0;
 +        }
 +
 +        /* update positions */
 +        swap_em_state(s_min, s_b);
 +        gpa = gpb;
 +
 +        /* Print it if necessary */
 +        if (MASTER(cr))
 +        {
 +            if (bVerbose)
 +            {
 +                fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n",
 +                        step, s_min->epot, s_min->fnorm/sqrt(state_global->natoms),
 +                        s_min->fmax, s_min->a_fmax+1);
 +            }
 +            /* Store the new (lower) energies */
 +            upd_mdebin(mdebin, FALSE, FALSE, (double)step,
 +                       mdatoms->tmass, enerd, &s_min->s, inputrec->fepvals, inputrec->expandedvals, s_min->s.box,
 +                       NULL, NULL, vir, pres, NULL, mu_tot, constr);
 +
 +            do_log = do_per_step(step, inputrec->nstlog);
 +            do_ene = do_per_step(step, inputrec->nstenergy);
 +            if (do_log)
 +            {
 +                print_ebin_header(fplog, step, step, s_min->s.lambda[efptFEP]);
 +            }
 +            print_ebin(outf->fp_ene, do_ene, FALSE, FALSE,
 +                       do_log ? fplog : NULL, step, step, eprNORMAL,
 +                       TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts));
 +        }
 +
 +        /* Stop when the maximum force lies below tolerance.
 +         * If we have reached machine precision, converged is already set to true.
 +         */
 +        converged = converged || (s_min->fmax < inputrec->em_tol);
 +
 +    } /* End of the loop */
 +
 +    if (converged)
 +    {
 +        step--; /* we never took that last step in this case */
 +
 +    }
 +    if (s_min->fmax > inputrec->em_tol)
 +    {
 +        if (MASTER(cr))
 +        {
 +            warn_step(stderr, inputrec->em_tol, step-1 == number_steps, FALSE);
 +            warn_step(fplog, inputrec->em_tol, step-1 == number_steps, FALSE);
 +        }
 +        converged = FALSE;
 +    }
 +
 +    if (MASTER(cr))
 +    {
 +        /* If we printed energy and/or logfile last step (which was the last step)
 +         * we don't have to do it again, but otherwise print the final values.
 +         */
 +        if (!do_log)
 +        {
 +            /* Write final value to log since we didn't do anything the last step */
 +            print_ebin_header(fplog, step, step, s_min->s.lambda[efptFEP]);
 +        }
 +        if (!do_ene || !do_log)
 +        {
 +            /* Write final energy file entries */
 +            print_ebin(outf->fp_ene, !do_ene, FALSE, FALSE,
 +                       !do_log ? fplog : NULL, step, step, eprNORMAL,
 +                       TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts));
 +        }
 +    }
 +
 +    /* Print some stuff... */
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr, "\nwriting lowest energy coordinates.\n");
 +    }
 +
 +    /* IMPORTANT!
 +     * For accurate normal mode calculation it is imperative that we
 +     * store the last conformation into the full precision binary trajectory.
 +     *
 +     * However, we should only do it if we did NOT already write this step
 +     * above (which we did if do_x or do_f was true).
 +     */
 +    do_x = !do_per_step(step, inputrec->nstxout);
 +    do_f = (inputrec->nstfout > 0 && !do_per_step(step, inputrec->nstfout));
 +
 +    write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm),
 +                  top_global, inputrec, step,
 +                  s_min, state_global, f_global);
 +
 +    fnormn = s_min->fnorm/sqrt(state_global->natoms);
 +
 +    if (MASTER(cr))
 +    {
 +        print_converged(stderr, CG, inputrec->em_tol, step, converged, number_steps,
 +                        s_min->epot, s_min->fmax, s_min->a_fmax, fnormn);
 +        print_converged(fplog, CG, inputrec->em_tol, step, converged, number_steps,
 +                        s_min->epot, s_min->fmax, s_min->a_fmax, fnormn);
 +
 +        fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval);
 +    }
 +
 +    finish_em(cr, outf, runtime, wcycle);
 +
 +    /* To print the actual number of steps we needed somewhere */
 +    runtime->nsteps_done = step;
 +
 +    return 0;
 +} /* That's all folks */
 +
 +
 +double do_lbfgs(FILE *fplog, t_commrec *cr,
 +                int nfile, const t_filenm fnm[],
 +                const output_env_t gmx_unused oenv, gmx_bool bVerbose, gmx_bool gmx_unused bCompact,
 +                int gmx_unused nstglobalcomm,
 +                gmx_vsite_t *vsite, gmx_constr_t constr,
 +                int gmx_unused stepout,
 +                t_inputrec *inputrec,
 +                gmx_mtop_t *top_global, t_fcdata *fcd,
 +                t_state *state,
 +                t_mdatoms *mdatoms,
 +                t_nrnb *nrnb, gmx_wallcycle_t wcycle,
 +                gmx_edsam_t gmx_unused ed,
 +                t_forcerec *fr,
 +                int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed,
 +                gmx_membed_t gmx_unused membed,
 +                real gmx_unused cpt_period, real gmx_unused max_hours,
 +                const char gmx_unused *deviceOptions,
 +                unsigned long gmx_unused Flags,
 +                gmx_runtime_t *runtime)
 +{
 +    static const char *LBFGS = "Low-Memory BFGS Minimizer";
 +    em_state_t         ems;
 +    gmx_localtop_t    *top;
 +    gmx_enerdata_t    *enerd;
 +    rvec              *f;
 +    gmx_global_stat_t  gstat;
 +    t_graph           *graph;
 +    rvec              *f_global;
 +    int                ncorr, nmaxcorr, point, cp, neval, nminstep;
 +    double             stepsize, gpa, gpb, gpc, tmp, minstep;
 +    real              *rho, *alpha, *ff, *xx, *p, *s, *lastx, *lastf, **dx, **dg;
 +    real              *xa, *xb, *xc, *fa, *fb, *fc, *xtmp, *ftmp;
 +    real               a, b, c, maxdelta, delta;
 +    real               diag, Epot0, Epot, EpotA, EpotB, EpotC;
 +    real               dgdx, dgdg, sq, yr, beta;
 +    t_mdebin          *mdebin;
 +    gmx_bool           converged, first;
 +    rvec               mu_tot;
 +    real               fnorm, fmax;
 +    gmx_bool           do_log, do_ene, do_x, do_f, foundlower, *frozen;
 +    tensor             vir, pres;
 +    int                start, end, number_steps;
 +    gmx_mdoutf_t      *outf;
 +    int                i, k, m, n, nfmax, gf, step;
 +    int                mdof_flags;
 +    /* not used */
 +    real               terminate;
 +
 +    if (PAR(cr))
 +    {
 +        gmx_fatal(FARGS, "Cannot do parallel L-BFGS Minimization - yet.\n");
 +    }
 +
 +    if (NULL != constr)
 +    {
 +        gmx_fatal(FARGS, "The combination of constraints and L-BFGS minimization is not implemented. Either do not use constraints, or use another minimizer (e.g. steepest descent).");
 +    }
 +
 +    n        = 3*state->natoms;
 +    nmaxcorr = inputrec->nbfgscorr;
 +
 +    /* Allocate memory */
 +    /* Use pointers to real so we don't have to loop over both atoms and
 +     * dimensions all the time...
 +     * x/f are allocated as rvec *, so make new xx/ff pointers-to-real
 +     * that point to the same memory.
 +     */
 +    snew(xa, n);
 +    snew(xb, n);
 +    snew(xc, n);
 +    snew(fa, n);
 +    snew(fb, n);
 +    snew(fc, n);
 +    snew(frozen, n);
 +
 +    snew(p, n);
 +    snew(lastx, n);
 +    snew(lastf, n);
 +    snew(rho, nmaxcorr);
 +    snew(alpha, nmaxcorr);
 +
 +    snew(dx, nmaxcorr);
 +    for (i = 0; i < nmaxcorr; i++)
 +    {
 +        snew(dx[i], n);
 +    }
 +
 +    snew(dg, nmaxcorr);
 +    for (i = 0; i < nmaxcorr; i++)
 +    {
 +        snew(dg[i], n);
 +    }
 +
 +    step  = 0;
 +    neval = 0;
 +
 +    /* Init em */
 +    init_em(fplog, LBFGS, cr, inputrec,
 +            state, top_global, &ems, &top, &f, &f_global,
 +            nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr,
 +            nfile, fnm, &outf, &mdebin);
 +    /* Do_lbfgs is not completely updated like do_steep and do_cg,
 +     * so we free some memory again.
 +     */
 +    sfree(ems.s.x);
 +    sfree(ems.f);
 +
 +    xx = (real *)state->x;
 +    ff = (real *)f;
 +
 +    start = mdatoms->start;
 +    end   = mdatoms->homenr + start;
 +
 +    /* Print to log file */
 +    print_em_start(fplog, cr, runtime, wcycle, LBFGS);
 +
 +    do_log = do_ene = do_x = do_f = TRUE;
 +
 +    /* Max number of steps */
 +    number_steps = inputrec->nsteps;
 +
 +    /* Create a 3*natoms index to tell whether each degree of freedom is frozen */
 +    gf = 0;
 +    for (i = start; i < end; i++)
 +    {
 +        if (mdatoms->cFREEZE)
 +        {
 +            gf = mdatoms->cFREEZE[i];
 +        }
 +        for (m = 0; m < DIM; m++)
 +        {
 +            frozen[3*i+m] = inputrec->opts.nFreeze[gf][m];
 +        }
 +    }
 +    if (MASTER(cr))
 +    {
 +        sp_header(stderr, LBFGS, inputrec->em_tol, number_steps);
 +    }
 +    if (fplog)
 +    {
 +        sp_header(fplog, LBFGS, inputrec->em_tol, number_steps);
 +    }
 +
 +    if (vsite)
 +    {
 +        construct_vsites(vsite, state->x, 1, NULL,
 +                         top->idef.iparams, top->idef.il,
 +                         fr->ePBC, fr->bMolPBC, graph, cr, state->box);
 +    }
 +
 +    /* Call the force routine and some auxiliary (neighboursearching etc.) */
 +    /* do_force always puts the charge groups in the box and shifts again;
 +     * we do not unshift, so molecules are always whole
 +     */
 +    neval++;
 +    ems.s.x = state->x;
 +    ems.f   = f;
 +    evaluate_energy(fplog, cr,
 +                    top_global, &ems, top,
 +                    inputrec, nrnb, wcycle, gstat,
 +                    vsite, constr, fcd, graph, mdatoms, fr,
 +                    mu_tot, enerd, vir, pres, -1, TRUE);
 +    where();
 +
 +    if (MASTER(cr))
 +    {
 +        /* Copy stuff to the energy bin for easy printing etc. */
 +        upd_mdebin(mdebin, FALSE, FALSE, (double)step,
 +                   mdatoms->tmass, enerd, state, inputrec->fepvals, inputrec->expandedvals, state->box,
 +                   NULL, NULL, vir, pres, NULL, mu_tot, constr);
 +
 +        print_ebin_header(fplog, step, step, state->lambda[efptFEP]);
 +        print_ebin(outf->fp_ene, TRUE, FALSE, FALSE, fplog, step, step, eprNORMAL,
 +                   TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts));
 +    }
 +    where();
 +
 +    /* This is the starting energy */
 +    Epot = enerd->term[F_EPOT];
 +
 +    fnorm = ems.fnorm;
 +    fmax  = ems.fmax;
 +    nfmax = ems.a_fmax;
 +
 +    /* Set the initial step.
 +     * Since it will be multiplied by the non-normalized search direction
 +     * vector (force vector the first time), we scale it by the
 +     * norm of the force.
 +     */
 +
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr, "Using %d BFGS correction steps.\n\n", nmaxcorr);
 +        fprintf(stderr, "   F-max             = %12.5e on atom %d\n", fmax, nfmax+1);
 +        fprintf(stderr, "   F-Norm            = %12.5e\n", fnorm/sqrt(state->natoms));
 +        fprintf(stderr, "\n");
 +        /* and copy to the log file too... */
 +        fprintf(fplog, "Using %d BFGS correction steps.\n\n", nmaxcorr);
 +        fprintf(fplog, "   F-max             = %12.5e on atom %d\n", fmax, nfmax+1);
 +        fprintf(fplog, "   F-Norm            = %12.5e\n", fnorm/sqrt(state->natoms));
 +        fprintf(fplog, "\n");
 +    }
 +
 +    point = 0;
 +    for (i = 0; i < n; i++)
 +    {
 +        if (!frozen[i])
 +        {
 +            dx[point][i] = ff[i]; /* Initial search direction */
 +        }
 +        else
 +        {
 +            dx[point][i] = 0;
 +        }
 +    }
 +
 +    stepsize  = 1.0/fnorm;
 +    converged = FALSE;
 +
 +    /* Start the loop over BFGS steps.
 +     * Each successful step is counted, and we continue until
 +     * we either converge or reach the max number of steps.
 +     */
 +
 +    ncorr = 0;
 +
 +    /* Set the gradient from the force */
 +    converged = FALSE;
 +    for (step = 0; (number_steps < 0 || (number_steps >= 0 && step <= number_steps)) && !converged; step++)
 +    {
 +
 +        /* Write coordinates if necessary */
 +        do_x = do_per_step(step, inputrec->nstxout);
 +        do_f = do_per_step(step, inputrec->nstfout);
 +
 +        mdof_flags = 0;
 +        if (do_x)
 +        {
 +            mdof_flags |= MDOF_X;
 +        }
 +
 +        if (do_f)
 +        {
 +            mdof_flags |= MDOF_F;
 +        }
 +
 +        write_traj(fplog, cr, outf, mdof_flags,
 +                   top_global, step, (real)step, state, state, f, f, NULL, NULL);
 +
 +        /* Do the linesearching in the direction dx[point][0..(n-1)] */
 +
 +        /* pointer to current direction - point=0 first time here */
 +        s = dx[point];
 +
 +        /* calculate line gradient */
 +        for (gpa = 0, i = 0; i < n; i++)
 +        {
 +            gpa -= s[i]*ff[i];
 +        }
 +
 +        /* Calculate the minimum allowed stepsize, below which the average
 +         * (norm) relative change in coordinates is smaller than the machine
 +         * precision
 +         */
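 +        /* Same machine-precision criterion as in do_cg, here with the
 +         * flattened arrays: n = 3*natoms degrees of freedom.
 +         */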
 +        for (minstep = 0, i = 0; i < n; i++)
 +        {
 +            tmp = fabs(xx[i]);
 +            if (tmp < 1.0)
 +            {
 +                tmp = 1.0;
 +            }
 +            tmp      = s[i]/tmp;
 +            minstep += tmp*tmp;
 +        }
 +        minstep = GMX_REAL_EPS/sqrt(minstep/n);
 +
 +        if (stepsize < minstep)
 +        {
 +            converged = TRUE;
 +            break;
 +        }
 +
 +        /* Store old forces and coordinates */
 +        for (i = 0; i < n; i++)
 +        {
 +            lastx[i] = xx[i];
 +            lastf[i] = ff[i];
 +        }
 +        Epot0 = Epot;
 +
 +        first = TRUE;
 +
 +        for (i = 0; i < n; i++)
 +        {
 +            xa[i] = xx[i];
 +        }
 +
 +        /* Take a step downhill.
 +         * In theory, we should minimize the function along this direction.
 +         * That is quite possible, but it turns out to take 5-10 function evaluations
 +         * for each line. However, we don't really need to find the exact minimum -
 +         * it is much better to start a new BFGS step in a modified direction as soon
 +         * as we are close to it. This will save a lot of energy evaluations.
 +         *
 +         * In practice, we just try to take a single step.
 +         * If it worked (i.e. lowered the energy), we increase the stepsize but
 +         * then continue straight to the next BFGS step without trying to find any minimum.
 +         * If it didn't work (higher energy), there must be a minimum somewhere between
 +         * the old position and the new one.
 +         *
 +         * Due to the finite numerical accuracy, it turns out that it is a good idea
 +         * to even accept a SMALL increase in energy, if the derivative is still downhill.
 +         * This leads to lower final energies in the tests I've done. / Erik
 +         */
 +        foundlower = FALSE;
 +        EpotA      = Epot0;
 +        a          = 0.0;
 +        c          = a + stepsize; /* reference position along line is zero */
 +
 +        /* Check stepsize first. We do not allow displacements
 +         * larger than emstep.
 +         */
 +        do
 +        {
 +            c        = a + stepsize;
 +            maxdelta = 0;
 +            for (i = 0; i < n; i++)
 +            {
 +                delta = c*s[i];
 +                if (delta > maxdelta)
 +                {
 +                    maxdelta = delta;
 +                }
 +            }
 +            if (maxdelta > inputrec->em_stepsize)
 +            {
 +                stepsize *= 0.1;
 +            }
 +        }
 +        while (maxdelta > inputrec->em_stepsize);
 +
 +        /* Take a trial step */
 +        for (i = 0; i < n; i++)
 +        {
 +            xc[i] = lastx[i] + c*s[i];
 +        }
 +
 +        neval++;
 +        /* Calculate energy for the trial step */
 +        ems.s.x = (rvec *)xc;
 +        ems.f   = (rvec *)fc;
 +        evaluate_energy(fplog, cr,
 +                        top_global, &ems, top,
 +                        inputrec, nrnb, wcycle, gstat,
 +                        vsite, constr, fcd, graph, mdatoms, fr,
 +                        mu_tot, enerd, vir, pres, step, FALSE);
 +        EpotC = ems.epot;
 +
 +        /* Calc derivative along line */
 +        for (gpc = 0, i = 0; i < n; i++)
 +        {
 +            gpc -= s[i]*fc[i]; /* f is negative gradient, thus the sign */
 +        }
 +        /* Sum the gradient along the line across CPUs */
 +        if (PAR(cr))
 +        {
 +            gmx_sumd(1, &gpc, cr);
 +        }
 +
 +        /* This is the max amount of increase in energy we tolerate */
 +        tmp = sqrt(GMX_REAL_EPS)*fabs(EpotA);
 +
 +        /* Accept the step if the energy is lower, or if it is not significantly higher
 +         * and the line derivative is still negative.
 +         */
 +        if (EpotC < EpotA || (gpc < 0 && EpotC < (EpotA+tmp)))
 +        {
 +            foundlower = TRUE;
 +            /* Great, we found a better energy. Increase step for next iteration
 +             * if we are still going down, decrease it otherwise
 +             */
 +            if (gpc < 0)
 +            {
 +                stepsize *= 1.618034; /* The golden section */
 +            }
 +            else
 +            {
 +                stepsize *= 0.618034; /* 1/golden section */
 +            }
 +        }
 +        else
 +        {
 +            /* New energy is the same or higher. We will have to do some work
 +             * to find a smaller value in the interval. Take smaller step next time!
 +             */
 +            foundlower = FALSE;
 +            stepsize  *= 0.618034;
 +        }
 +
 +        /* OK, if we didn't find a lower value we will have to locate one now - there must
 +         * be one in the interval [a=0,c].
 +         * The same thing is valid here, though: Don't spend dozens of iterations to find
 +         * the line minimum. We try to interpolate based on the derivative at the endpoints,
 +         * and only continue until we find a lower value. In most cases this means 1-2 iterations.
 +         *
 +         * I also have a safeguard for potentially really pathological functions so we never
 +         * take more than 20 steps before we give up ...
 +         *
 +         * If we already found a lower value we just skip this step and continue to the update.
 +         */
 +
 +        if (!foundlower)
 +        {
 +
 +            nminstep = 0;
 +            do
 +            {
 +                /* Select a new trial point.
 +                 * If the derivatives at points a & c have different sign we interpolate to zero,
 +                 * otherwise just do a bisection.
 +                 */
 +
 +                if (gpa < 0 && gpc > 0)
 +                {
 +                    b = a + gpa*(a-c)/(gpc-gpa);
 +                }
 +                else
 +                {
 +                    b = 0.5*(a+c);
 +                }
 +
 +                /* safeguard if interpolation close to machine accuracy causes errors:
 +                 * never go outside the interval
 +                 */
 +                if (b <= a || b >= c)
 +                {
 +                    b = 0.5*(a+c);
 +                }
 +
 +                /* Take a trial step */
 +                for (i = 0; i < n; i++)
 +                {
 +                    xb[i] = lastx[i] + b*s[i];
 +                }
 +
 +                neval++;
 +                /* Calculate energy for the trial step */
 +                ems.s.x = (rvec *)xb;
 +                ems.f   = (rvec *)fb;
 +                evaluate_energy(fplog, cr,
 +                                top_global, &ems, top,
 +                                inputrec, nrnb, wcycle, gstat,
 +                                vsite, constr, fcd, graph, mdatoms, fr,
 +                                mu_tot, enerd, vir, pres, step, FALSE);
 +                EpotB = ems.epot;
 +
 +                fnorm = ems.fnorm;
 +
 +                for (gpb = 0, i = 0; i < n; i++)
 +                {
 +                    gpb -= s[i]*fb[i]; /* f is negative gradient, thus the sign */
 +
 +                }
 +                /* Sum the gradient along the line across CPUs */
 +                if (PAR(cr))
 +                {
 +                    gmx_sumd(1, &gpb, cr);
 +                }
 +
 +                /* Keep one of the intervals based on the value of the derivative at the new point */
 +                if (gpb > 0)
 +                {
 +                    /* Replace c endpoint with b */
 +                    EpotC = EpotB;
 +                    c     = b;
 +                    gpc   = gpb;
 +                    /* swap coord pointers b/c */
 +                    xtmp = xb;
 +                    ftmp = fb;
 +                    xb   = xc;
 +                    fb   = fc;
 +                    xc   = xtmp;
 +                    fc   = ftmp;
 +                }
 +                else
 +                {
 +                    /* Replace a endpoint with b */
 +                    EpotA = EpotB;
 +                    a     = b;
 +                    gpa   = gpb;
 +                    /* swap coord pointers a/b */
 +                    xtmp = xb;
 +                    ftmp = fb;
 +                    xb   = xa;
 +                    fb   = fa;
 +                    xa   = xtmp;
 +                    fa   = ftmp;
 +                }
 +
 +                /*
 +                 * Stop search as soon as we find a value smaller than the endpoints,
 +                 * or if the tolerance is below machine precision.
 +                 * Never run more than 20 steps, no matter what.
 +                 */
 +                nminstep++;
 +            }
 +            while ((EpotB > EpotA || EpotB > EpotC) && (nminstep < 20));
 +
 +            if (fabs(EpotB-Epot0) < GMX_REAL_EPS || nminstep >= 20)
 +            {
 +                /* OK. We couldn't find a significantly lower energy.
 +                 * If ncorr==0 this was steepest descent, and then we give up.
 +                 * If not, reset memory to restart as steepest descent before quitting.
 +                 */
 +                if (ncorr == 0)
 +                {
 +                    /* Converged */
 +                    converged = TRUE;
 +                    break;
 +                }
 +                else
 +                {
 +                    /* Reset memory */
 +                    ncorr = 0;
 +                    /* Search in gradient direction */
 +                    for (i = 0; i < n; i++)
 +                    {
 +                        dx[point][i] = ff[i];
 +                    }
 +                    /* Reset stepsize */
 +                    stepsize = 1.0/fnorm;
 +                    continue;
 +                }
 +            }
 +
 +            /* Select min energy state of A & C, put the best in xx/ff/Epot
 +             */
 +            if (EpotC < EpotA)
 +            {
 +                Epot = EpotC;
 +                /* Use state C */
 +                for (i = 0; i < n; i++)
 +                {
 +                    xx[i] = xc[i];
 +                    ff[i] = fc[i];
 +                }
 +                stepsize = c;
 +            }
 +            else
 +            {
 +                Epot = EpotA;
 +                /* Use state A */
 +                for (i = 0; i < n; i++)
 +                {
 +                    xx[i] = xa[i];
 +                    ff[i] = fa[i];
 +                }
 +                stepsize = a;
 +            }
 +
 +        }
 +        else
 +        {
 +            /* found lower */
 +            Epot = EpotC;
 +            /* Use state C */
 +            for (i = 0; i < n; i++)
 +            {
 +                xx[i] = xc[i];
 +                ff[i] = fc[i];
 +            }
 +            stepsize = c;
 +        }
 +
 +        /* Update the memory information, and calculate a new
 +         * approximation of the inverse hessian
 +         */
 +
 +        /* Have new data in Epot, xx, ff */
 +        if (ncorr < nmaxcorr)
 +        {
 +            ncorr++;
 +        }
 +
 +        for (i = 0; i < n; i++)
 +        {
 +            dg[point][i]  = lastf[i]-ff[i];
 +            dx[point][i] *= stepsize;
 +        }
 +
 +        dgdg = 0;
 +        dgdx = 0;
 +        for (i = 0; i < n; i++)
 +        {
 +            dgdg += dg[point][i]*dg[point][i];
 +            dgdx += dg[point][i]*dx[point][i];
 +        }
 +
 +        diag = dgdx/dgdg;
 +
 +        rho[point] = 1.0/dgdx;
 +        point++;
 +
 +        if (point >= nmaxcorr)
 +        {
 +            point = 0;
 +        }
 +
 +        /* Update */
 +        for (i = 0; i < n; i++)
 +        {
 +            p[i] = ff[i];
 +        }
 +
 +        cp = point;
 +
 +        /* Recursive update. First go back over the memory points */
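 +        /* This is the standard L-BFGS two-loop recursion: going backwards over
 +         * the stored pairs, compute alpha_k = rho_k*(s_k.q) and q -= alpha_k*y_k;
 +         * scale q by gamma = (s.y)/(y.y) (held in diag); then going forwards,
 +         * q += (alpha_k - rho_k*(y_k.q))*s_k. Here dx[] holds the position
 +         * differences s_k, dg[] the gradient differences y_k, and p plays the
 +         * role of q, initialized to the force (i.e. -grad(E)).
 +         */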
 +        for (k = 0; k < ncorr; k++)
 +        {
 +            cp--;
 +            if (cp < 0)
 +            {
 +                cp = ncorr-1;
 +            }
 +
 +            sq = 0;
 +            for (i = 0; i < n; i++)
 +            {
 +                sq += dx[cp][i]*p[i];
 +            }
 +
 +            alpha[cp] = rho[cp]*sq;
 +
 +            for (i = 0; i < n; i++)
 +            {
 +                p[i] -= alpha[cp]*dg[cp][i];
 +            }
 +        }
 +
 +        for (i = 0; i < n; i++)
 +        {
 +            p[i] *= diag;
 +        }
 +
 +        /* And then go forward again */
 +        for (k = 0; k < ncorr; k++)
 +        {
 +            yr = 0;
 +            for (i = 0; i < n; i++)
 +            {
 +                yr += p[i]*dg[cp][i];
 +            }
 +
 +            beta = rho[cp]*yr;
 +            beta = alpha[cp]-beta;
 +
 +            for (i = 0; i < n; i++)
 +            {
 +                p[i] += beta*dx[cp][i];
 +            }
 +
 +            cp++;
 +            if (cp >= ncorr)
 +            {
 +                cp = 0;
 +            }
 +        }
 +
 +        for (i = 0; i < n; i++)
 +        {
 +            if (!frozen[i])
 +            {
 +                dx[point][i] = p[i];
 +            }
 +            else
 +            {
 +                dx[point][i] = 0;
 +            }
 +        }
 +
 +        stepsize = 1.0;
 +
 +        /* Test whether the convergence criterion is met */
 +        get_f_norm_max(cr, &(inputrec->opts), mdatoms, f, &fnorm, &fmax, &nfmax);
 +
 +        /* Print it if necessary */
 +        if (MASTER(cr))
 +        {
 +            if (bVerbose)
 +            {
 +                fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n",
 +                        step, Epot, fnorm/sqrt(state->natoms), fmax, nfmax+1);
 +            }
 +            /* Store the new (lower) energies */
 +            upd_mdebin(mdebin, FALSE, FALSE, (double)step,
 +                       mdatoms->tmass, enerd, state, inputrec->fepvals, inputrec->expandedvals, state->box,
 +                       NULL, NULL, vir, pres, NULL, mu_tot, constr);
 +            do_log = do_per_step(step, inputrec->nstlog);
 +            do_ene = do_per_step(step, inputrec->nstenergy);
 +            if (do_log)
 +            {
 +                print_ebin_header(fplog, step, step, state->lambda[efptFEP]);
 +            }
 +            print_ebin(outf->fp_ene, do_ene, FALSE, FALSE,
 +                       do_log ? fplog : NULL, step, step, eprNORMAL,
 +                       TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts));
 +        }
 +
 +        /* Stop when the maximum force lies below tolerance.
 +         * If we have reached machine precision, converged is already set to true.
 +         */
 +
 +        converged = converged || (fmax < inputrec->em_tol);
 +
 +    } /* End of the loop */
 +
 +    if (converged)
 +    {
 +        step--; /* we never took that last step in this case */
 +
 +    }
 +    if (fmax > inputrec->em_tol)
 +    {
 +        if (MASTER(cr))
 +        {
 +            warn_step(stderr, inputrec->em_tol, step-1 == number_steps, FALSE);
 +            warn_step(fplog, inputrec->em_tol, step-1 == number_steps, FALSE);
 +        }
 +        converged = FALSE;
 +    }
 +
 +    /* If we printed energy and/or logfile last step (which was the last step)
 +     * we don't have to do it again, but otherwise print the final values.
 +     */
 +    if (!do_log) /* Write final value to log since we didn't do anything last step */
 +    {
 +        print_ebin_header(fplog, step, step, state->lambda[efptFEP]);
 +    }
 +    if (!do_ene || !do_log) /* Write final energy file entries */
 +    {
 +        print_ebin(outf->fp_ene, !do_ene, FALSE, FALSE,
 +                   !do_log ? fplog : NULL, step, step, eprNORMAL,
 +                   TRUE, mdebin, fcd, &(top_global->groups), &(inputrec->opts));
 +    }
 +
 +    /* Print some stuff... */
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr, "\nwriting lowest energy coordinates.\n");
 +    }
 +
 +    /* IMPORTANT!
 +     * For accurate normal mode calculation it is imperative that we
 +     * store the last conformation into the full precision binary trajectory.
 +     *
 +     * However, we should only do it if we did NOT already write this step
 +     * above (which we did if do_x or do_f was true).
 +     */
 +    do_x = !do_per_step(step, inputrec->nstxout);
 +    do_f = !do_per_step(step, inputrec->nstfout);
 +    write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm),
 +                  top_global, inputrec, step,
 +                  &ems, state, f);
 +
 +    if (MASTER(cr))
 +    {
 +        print_converged(stderr, LBFGS, inputrec->em_tol, step, converged,
 +                        number_steps, Epot, fmax, nfmax, fnorm/sqrt(state->natoms));
 +        print_converged(fplog, LBFGS, inputrec->em_tol, step, converged,
 +                        number_steps, Epot, fmax, nfmax, fnorm/sqrt(state->natoms));
 +
 +        fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval);
 +    }
 +
 +    finish_em(cr, outf, runtime, wcycle);
 +
 +    /* To print the actual number of steps we needed somewhere */
 +    runtime->nsteps_done = step;
 +
 +    return 0;
 +} /* That's all folks */
 +
 +
 +double do_steep(FILE *fplog, t_commrec *cr,
 +                int nfile, const t_filenm fnm[],
 +                const output_env_t gmx_unused oenv, gmx_bool bVerbose, gmx_bool gmx_unused bCompact,
 +                int gmx_unused nstglobalcomm,
 +                gmx_vsite_t *vsite, gmx_constr_t constr,
 +                int gmx_unused stepout,
 +                t_inputrec *inputrec,
 +                gmx_mtop_t *top_global, t_fcdata *fcd,
 +                t_state *state_global,
 +                t_mdatoms *mdatoms,
 +                t_nrnb *nrnb, gmx_wallcycle_t wcycle,
 +                gmx_edsam_t gmx_unused  ed,
 +                t_forcerec *fr,
 +                int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed,
 +                gmx_membed_t gmx_unused membed,
 +                real gmx_unused cpt_period, real gmx_unused max_hours,
 +                const char  gmx_unused *deviceOptions,
 +                unsigned long gmx_unused Flags,
 +                gmx_runtime_t *runtime)
 +{
 +    const char       *SD = "Steepest Descents";
 +    em_state_t       *s_min, *s_try;
 +    rvec             *f_global;
 +    gmx_localtop_t   *top;
 +    gmx_enerdata_t   *enerd;
 +    rvec             *f;
 +    gmx_global_stat_t gstat;
 +    t_graph          *graph;
 +    real              stepsize, constepsize;
 +    real              ustep, fnormn;
 +    gmx_mdoutf_t     *outf;
 +    t_mdebin         *mdebin;
 +    gmx_bool          bDone, bAbort, do_x, do_f;
 +    tensor            vir, pres;
 +    rvec              mu_tot;
 +    int               nsteps;
 +    int               count          = 0;
 +    int               steps_accepted = 0;
 +    /* not used */
 +    real              terminate = 0;
 +
 +    s_min = init_em_state();
 +    s_try = init_em_state();
 +
 +    /* Init em and store the local state in s_try */
 +    init_em(fplog, SD, cr, inputrec,
 +            state_global, top_global, s_try, &top, &f, &f_global,
 +            nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr,
 +            nfile, fnm, &outf, &mdebin);
 +
 +    /* Print to log file  */
 +    print_em_start(fplog, cr, runtime, wcycle, SD);
 +
 +    /* Set variables for stepsize (in nm). This is the largest
 +     * step that we are going to make in any direction.
 +     */
 +    ustep    = inputrec->em_stepsize;
 +    stepsize = 0;
 +
 +    /* Max number of steps  */
 +    nsteps = inputrec->nsteps;
 +
 +    if (MASTER(cr))
 +    {
 +        /* Print to the screen  */
 +        sp_header(stderr, SD, inputrec->em_tol, nsteps);
 +    }
 +    if (fplog)
 +    {
 +        sp_header(fplog, SD, inputrec->em_tol, nsteps);
 +    }
 +
 +    /**** HERE STARTS THE LOOP ****
 +     * count is the counter for the number of steps
 +     * bDone will be TRUE when the minimization has converged
 +     * bAbort will be TRUE when nsteps steps have been performed or when
 +     * the stepsize becomes smaller than is reasonable for machine precision
 +     */
 +    count  = 0;
 +    bDone  = FALSE;
 +    bAbort = FALSE;
 +    while (!bDone && !bAbort)
 +    {
 +        bAbort = (nsteps >= 0) && (count == nsteps);
 +
 +        /* set new coordinates, except for first step */
 +        if (count > 0)
 +        {
 +            do_em_step(cr, inputrec, mdatoms, fr->bMolPBC,
 +                       s_min, stepsize, s_min->f, s_try,
 +                       constr, top, nrnb, wcycle, count);
 +        }
 +
 +        evaluate_energy(fplog, cr,
 +                        top_global, s_try, top,
 +                        inputrec, nrnb, wcycle, gstat,
 +                        vsite, constr, fcd, graph, mdatoms, fr,
 +                        mu_tot, enerd, vir, pres, count, count == 0);
 +
 +        if (MASTER(cr))
 +        {
 +            print_ebin_header(fplog, count, count, s_try->s.lambda[efptFEP]);
 +        }
 +
 +        if (count == 0)
 +        {
 +            s_min->epot = s_try->epot + 1;
 +        }
 +
 +        /* Print it if necessary  */
 +        if (MASTER(cr))
 +        {
 +            if (bVerbose)
 +            {
 +                fprintf(stderr, "Step=%5d, Dmax= %6.1e nm, Epot= %12.5e Fmax= %11.5e, atom= %d%c",
 +                        count, ustep, s_try->epot, s_try->fmax, s_try->a_fmax+1,
 +                        (s_try->epot < s_min->epot) ? '\n' : '\r');
 +            }
 +
 +            if (s_try->epot < s_min->epot)
 +            {
 +                /* Store the new (lower) energies  */
 +                upd_mdebin(mdebin, FALSE, FALSE, (double)count,
 +                           mdatoms->tmass, enerd, &s_try->s, inputrec->fepvals, inputrec->expandedvals,
 +                           s_try->s.box, NULL, NULL, vir, pres, NULL, mu_tot, constr);
 +                print_ebin(outf->fp_ene, TRUE,
 +                           do_per_step(steps_accepted, inputrec->nstdisreout),
 +                           do_per_step(steps_accepted, inputrec->nstorireout),
 +                           fplog, count, count, eprNORMAL, TRUE,
 +                           mdebin, fcd, &(top_global->groups), &(inputrec->opts));
 +                fflush(fplog);
 +            }
 +        }
 +
 +        /* Accept the step if the new energy is smaller than the previous one,
 +         * or if this is the first step.
 +         */
 +
 +        if ( (count == 0) || (s_try->epot < s_min->epot) )
 +        {
 +            steps_accepted++;
 +
 +            /* Test whether the convergence criterion is met...  */
 +            bDone = (s_try->fmax < inputrec->em_tol);
 +
 +            /* Copy the arrays for force, positions and energy  */
 +            /* The 'Min' array always holds the coords and forces of the minimal
 +               sampled energy  */
 +            swap_em_state(s_min, s_try);
 +            if (count > 0)
 +            {
 +                ustep *= 1.2;
 +            }
 +
 +            /* Write to trn, if necessary */
 +            do_x = do_per_step(steps_accepted, inputrec->nstxout);
 +            do_f = do_per_step(steps_accepted, inputrec->nstfout);
 +            write_em_traj(fplog, cr, outf, do_x, do_f, NULL,
 +                          top_global, inputrec, count,
 +                          s_min, state_global, f_global);
 +        }
 +        else
 +        {
 +            /* If energy is not smaller make the step smaller...  */
 +            ustep *= 0.5;
 +
 +            if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count)
 +            {
 +                /* Reload the old state */
 +                em_dd_partition_system(fplog, count, cr, top_global, inputrec,
 +                                       s_min, top, mdatoms, fr, vsite, constr,
 +                                       nrnb, wcycle);
 +            }
 +        }
 +
 +        /* Determine new step  */
 +        stepsize = ustep/s_min->fmax;
 +
 +        /* Check if stepsize is too small, with 1 nm as a characteristic length */
 +#ifdef GMX_DOUBLE
 +        if (count == nsteps || ustep < 1e-12)
 +#else
 +        if (count == nsteps || ustep < 1e-6)
 +#endif
 +        {
 +            if (MASTER(cr))
 +            {
 +                warn_step(stderr, inputrec->em_tol, count == nsteps, constr != NULL);
 +                warn_step(fplog, inputrec->em_tol, count == nsteps, constr != NULL);
 +            }
 +            bAbort = TRUE;
 +        }
 +
 +        count++;
 +    } /* End of the loop  */
 +
 +    /* Print some stuff...  */
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr, "\nwriting lowest energy coordinates.\n");
 +    }
 +    write_em_traj(fplog, cr, outf, TRUE, inputrec->nstfout, ftp2fn(efSTO, nfile, fnm),
 +                  top_global, inputrec, count,
 +                  s_min, state_global, f_global);
 +
 +    fnormn = s_min->fnorm/sqrt(state_global->natoms);
 +
 +    if (MASTER(cr))
 +    {
 +        print_converged(stderr, SD, inputrec->em_tol, count, bDone, nsteps,
 +                        s_min->epot, s_min->fmax, s_min->a_fmax, fnormn);
 +        print_converged(fplog, SD, inputrec->em_tol, count, bDone, nsteps,
 +                        s_min->epot, s_min->fmax, s_min->a_fmax, fnormn);
 +    }
 +
 +    finish_em(cr, outf, runtime, wcycle);
 +
 +    /* To print the actual number of steps we needed somewhere */
 +    inputrec->nsteps = count;
 +
 +    runtime->nsteps_done = count;
 +
 +    return 0;
 +} /* That's all folks */
 +
 +
 +double do_nm(FILE *fplog, t_commrec *cr,
 +             int nfile, const t_filenm fnm[],
 +             const output_env_t gmx_unused oenv, gmx_bool bVerbose, gmx_bool gmx_unused  bCompact,
 +             int gmx_unused nstglobalcomm,
 +             gmx_vsite_t *vsite, gmx_constr_t constr,
 +             int gmx_unused stepout,
 +             t_inputrec *inputrec,
 +             gmx_mtop_t *top_global, t_fcdata *fcd,
 +             t_state *state_global,
 +             t_mdatoms *mdatoms,
 +             t_nrnb *nrnb, gmx_wallcycle_t wcycle,
 +             gmx_edsam_t  gmx_unused ed,
 +             t_forcerec *fr,
 +             int gmx_unused repl_ex_nst, int gmx_unused repl_ex_nex, int gmx_unused repl_ex_seed,
 +             gmx_membed_t gmx_unused membed,
 +             real gmx_unused cpt_period, real gmx_unused max_hours,
 +             const char gmx_unused *deviceOptions,
 +             unsigned long gmx_unused Flags,
 +             gmx_runtime_t *runtime)
 +{
 +    const char          *NM = "Normal Mode Analysis";
 +    gmx_mdoutf_t        *outf;
 +    int                  natoms, atom, d;
 +    int                  nnodes, node;
 +    rvec                *f_global;
 +    gmx_localtop_t      *top;
 +    gmx_enerdata_t      *enerd;
 +    rvec                *f;
 +    gmx_global_stat_t    gstat;
 +    t_graph             *graph;
 +    real                 t, t0, lambda, lam0;
 +    gmx_bool             bNS;
 +    tensor               vir, pres;
 +    rvec                 mu_tot;
 +    rvec                *fneg, *dfdx;
 +    gmx_bool             bSparse; /* use sparse matrix storage format */
-         fprintf(stderr, "Non-cutoff electrostatics used, forcing full Hessian format.\n");
++    size_t               sz = 0;
 +    gmx_sparsematrix_t * sparse_matrix           = NULL;
 +    real           *     full_matrix             = NULL;
 +    em_state_t       *   state_work;
 +
 +    /* added with respect to mdrun */
 +    int        i, j, k, row, col;
 +    real       der_range = 10.0*sqrt(GMX_REAL_EPS);
 +    real       x_min;
 +    real       fnorm, fmax;
 +
 +    if (constr != NULL)
 +    {
 +        gmx_fatal(FARGS, "Constraints present with Normal Mode Analysis, this combination is not supported");
 +    }
 +
 +    state_work = init_em_state();
 +
 +    /* Init em and store the local state in state_minimum */
 +    init_em(fplog, NM, cr, inputrec,
 +            state_global, top_global, state_work, &top,
 +            &f, &f_global,
 +            nrnb, mu_tot, fr, &enerd, &graph, mdatoms, &gstat, vsite, constr,
 +            nfile, fnm, &outf, NULL);
 +
 +    natoms = top_global->natoms;
 +    snew(fneg, natoms);
 +    snew(dfdx, natoms);
 +
 +#ifndef GMX_DOUBLE
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr,
 +                "NOTE: This version of Gromacs has been compiled in single precision,\n"
 +                "      which MIGHT not be accurate enough for normal mode analysis.\n"
 +                "      Gromacs now uses sparse matrix storage, so the memory requirements\n"
 +                "      are fairly modest even if you recompile in double precision.\n\n");
 +    }
 +#endif
 +
 +    /* Check if we can/should use sparse storage format.
 +     *
 +     * Sparse format is only useful when the Hessian itself is sparse, which it
 +     * will be when we use a cutoff.
 +     * For small systems (n<1000) it is easier to always use full matrix format, though.
 +     */
 +    if (EEL_FULL(fr->eeltype) || fr->rlist == 0.0)
 +    {
-         fprintf(stderr, "Small system size (N=%d), using full Hessian format.\n", top_global->natoms);
++        md_print_info(cr, fplog, "Non-cutoff electrostatics used, forcing full Hessian format.\n");
 +        bSparse = FALSE;
 +    }
 +    else if (top_global->natoms < 1000)
 +    {
-         fprintf(stderr, "Using compressed symmetric sparse Hessian format.\n");
++        md_print_info(cr, fplog, "Small system size (N=%d), using full Hessian format.\n", top_global->natoms);
 +        bSparse = FALSE;
 +    }
 +    else
 +    {
-     sz = DIM*top_global->natoms;
++        md_print_info(cr, fplog, "Using compressed symmetric sparse Hessian format.\n");
 +        bSparse = TRUE;
 +    }
 +
-     fprintf(stderr, "Allocating Hessian memory...\n\n");
++    if (MASTER(cr))
++    {
++        sz = DIM*top_global->natoms;
 +
-     if (bSparse)
-     {
-         sparse_matrix = gmx_sparsematrix_init(sz);
-         sparse_matrix->compressed_symmetric = TRUE;
-     }
-     else
-     {
-         snew(full_matrix, sz*sz);
++        fprintf(stderr, "Allocating Hessian memory...\n\n");
 +
-     if (MASTER(cr))
++        if (bSparse)
++        {
++            sparse_matrix = gmx_sparsematrix_init(sz);
++            sparse_matrix->compressed_symmetric = TRUE;
++        }
++        else
++        {
++            snew(full_matrix, sz*sz);
++        }
 +    }
 +
 +    /* Initial values */
 +    t0           = inputrec->init_t;
 +    lam0         = inputrec->fepvals->init_lambda;
 +    t            = t0;
 +    lambda       = lam0;
 +
 +    init_nrnb(nrnb);
 +
 +    where();
 +
 +    /* Write start time and temperature */
 +    print_em_start(fplog, cr, runtime, wcycle, NM);
 +
 +    /* Fudge the number of steps to twice the number of atoms */
 +    inputrec->nsteps = natoms*2;
 +
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr, "starting normal mode calculation '%s'\n%d steps.\n\n",
 +                *(top_global->name), (int)inputrec->nsteps);
 +    }
 +
 +    nnodes = cr->nnodes;
 +
 +    /* Make evaluate_energy do a single node force calculation */
 +    cr->nnodes = 1;
 +    evaluate_energy(fplog, cr,
 +                    top_global, state_work, top,
 +                    inputrec, nrnb, wcycle, gstat,
 +                    vsite, constr, fcd, graph, mdatoms, fr,
 +                    mu_tot, enerd, vir, pres, -1, TRUE);
 +    cr->nnodes = nnodes;
 +
 +    /* if forces are not small, warn user */
 +    get_state_f_norm_max(cr, &(inputrec->opts), mdatoms, state_work);
 +
-         fprintf(stderr, "Maximum force:%12.5e\n", state_work->fmax);
-         if (state_work->fmax > 1.0e-3)
-         {
-             fprintf(stderr, "Maximum force probably not small enough to");
-             fprintf(stderr, " ensure that you are in an \nenergy well. ");
-             fprintf(stderr, "Be aware that negative eigenvalues may occur");
-             fprintf(stderr, " when the\nresulting matrix is diagonalized.\n");
-         }
++    md_print_info(cr, fplog, "Maximum force:%12.5e\n", state_work->fmax);
++    if (state_work->fmax > 1.0e-3)
 +    {
++        md_print_info(cr, fplog,
++                      "The force is probably not small enough to "
++                      "ensure that you are at a minimum.\n"
++                      "Be aware that negative eigenvalues may occur\n"
++                      "when the resulting matrix is diagonalized.\n\n");
 +    }
 +
 +    /***********************************************************
 +     *
 +     *      Loop over all pairs in matrix
 +     *
 +     *      do_force called twice. Once with positive and
 +     *      once with negative displacement
 +     *
 +     ************************************************************/
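 +
 +    /* Illustration: with h = der_range, each Hessian element is the
 +     * central finite difference of the force, matching the dfdx loop
 +     * below:
 +     *
 +     *     H[row][col] = -(f_col(x + h*e_row) - f_col(x - h*e_row))/(2*h)
 +     */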
 +
 +    /* Steps are divided one by one over the nodes */
 +    for (atom = cr->nodeid; atom < natoms; atom += nnodes)
 +    {
 +
 +        for (d = 0; d < DIM; d++)
 +        {
 +            x_min = state_work->s.x[atom][d];
 +
 +            state_work->s.x[atom][d] = x_min - der_range;
 +
 +            /* Make evaluate_energy do a single node force calculation */
 +            cr->nnodes = 1;
 +            evaluate_energy(fplog, cr,
 +                            top_global, state_work, top,
 +                            inputrec, nrnb, wcycle, gstat,
 +                            vsite, constr, fcd, graph, mdatoms, fr,
 +                            mu_tot, enerd, vir, pres, atom*2, FALSE);
 +
 +            for (i = 0; i < natoms; i++)
 +            {
 +                copy_rvec(state_work->f[i], fneg[i]);
 +            }
 +
 +            state_work->s.x[atom][d] = x_min + der_range;
 +
 +            evaluate_energy(fplog, cr,
 +                            top_global, state_work, top,
 +                            inputrec, nrnb, wcycle, gstat,
 +                            vsite, constr, fcd, graph, mdatoms, fr,
 +                            mu_tot, enerd, vir, pres, atom*2+1, FALSE);
 +            cr->nnodes = nnodes;
 +
 +            /* x is restored to original */
 +            state_work->s.x[atom][d] = x_min;
 +
 +            for (j = 0; j < natoms; j++)
 +            {
 +                for (k = 0; (k < DIM); k++)
 +                {
 +                    dfdx[j][k] =
 +                        -(state_work->f[j][k] - fneg[j][k])/(2*der_range);
 +                }
 +            }
 +
 +            if (!MASTER(cr))
 +            {
 +#ifdef GMX_MPI
 +#ifdef GMX_DOUBLE
 +#define mpi_type MPI_DOUBLE
 +#else
 +#define mpi_type MPI_FLOAT
 +#endif
 +                MPI_Send(dfdx[0], natoms*DIM, mpi_type, MASTERNODE(cr), cr->nodeid,
 +                         cr->mpi_comm_mygroup);
 +#endif
 +            }
 +            else
 +            {
 +                for (node = 0; (node < nnodes && atom+node < natoms); node++)
 +                {
 +                    if (node > 0)
 +                    {
 +#ifdef GMX_MPI
 +                        MPI_Status stat;
 +                        MPI_Recv(dfdx[0], natoms*DIM, mpi_type, node, node,
 +                                 cr->mpi_comm_mygroup, &stat);
 +#undef mpi_type
 +#endif
 +                    }
 +
 +                    row = (atom + node)*DIM + d;
 +
 +                    for (j = 0; j < natoms; j++)
 +                    {
 +                        for (k = 0; k < DIM; k++)
 +                        {
 +                            col = j*DIM + k;
 +
 +                            if (bSparse)
 +                            {
 +                                if (col >= row && dfdx[j][k] != 0.0)
 +                                {
 +                                    gmx_sparsematrix_increment_value(sparse_matrix,
 +                                                                     row, col, dfdx[j][k]);
 +                                }
 +                            }
 +                            else
 +                            {
 +                                full_matrix[row*sz+col] = dfdx[j][k];
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +
 +            if (bVerbose && fplog)
 +            {
 +                fflush(fplog);
 +            }
 +        }
 +        /* write progress */
 +        if (MASTER(cr) && bVerbose)
 +        {
 +            fprintf(stderr, "\rFinished step %d out of %d",
 +                    min(atom+nnodes, natoms), natoms);
 +            fflush(stderr);
 +        }
 +    }
 +
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr, "\n\nWriting Hessian...\n");
 +        gmx_mtxio_write(ftp2fn(efMTX, nfile, fnm), sz, sz, full_matrix, sparse_matrix);
 +    }
 +
 +    finish_em(cr, outf, runtime, wcycle);
 +
 +    runtime->nsteps_done = natoms*2;
 +
 +    return 0;
 +}
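 +
 +/* Illustration (not part of the build): the Hessian uses one row/column
 + * per Cartesian degree of freedom,
 + *
 + *     row = atom*DIM + d,    col = j*DIM + k,
 + *
 + * and in the sparse case only the upper triangle (col >= row) is stored,
 + * since the matrix is stored as compressed symmetric.
 + */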
index 836201be04933e8268a9fdc953e7b941dbf8f6a1,0000000000000000000000000000000000000000..432c3ad5ea77a56d1f677251cda652da710a6e58
mode 100644,000000..100644
--- /dev/null
@@@ -1,259 -1,0 +1,280 @@@
-     int     *nsubc;            /* The number of sub cells for each super cell */
-     float   *bbcz;             /* Bounding boxes in z for the super cells     */
-     float   *bb;               /* 3D bounding boxes for the sub cells         */
-     float   *bbj;              /* 3D j-b.boxes for SSE-double or AVX-single   */
-     int     *flags;            /* Flag for the super cells                    */
-     int      nc_nalloc;        /* Allocation size for the pointers above      */
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +
 +#ifndef _nbnxn_internal_h
 +#define _nbnxn_internal_h
 +
 +#include "typedefs.h"
 +#include "domdec.h"
 +#include "gmx_cyclecounter.h"
 +
 +#ifdef GMX_NBNXN_SIMD
 +/* The include below sets the SIMD instruction type (precision+width)
 + * for all nbnxn SIMD search and non-bonded kernel code.
 + */
 +#ifdef GMX_NBNXN_HALF_WIDTH_SIMD
 +#define GMX_USE_HALF_WIDTH_SIMD_HERE
 +#endif
 +#include "gmx_simd_macros.h"
 +#endif
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +
 +
 +#ifdef GMX_X86_SSE2
 +/* Always use 4-way, single precision SIMD for bounding box calculations */
 +#define NBNXN_SEARCH_BB_SSE
 +#endif
 +
 +
 +#ifdef GMX_NBNXN_SIMD
 +/* Memory alignment in bytes as required by SIMD aligned loads/stores */
 +#define NBNXN_MEM_ALIGN  (GMX_SIMD_WIDTH_HERE*sizeof(real))
 +#else
 +/* No alignment required, but set it so we can call the same routines */
 +#define NBNXN_MEM_ALIGN  32
 +#endif
 +
 +
++/* Pair search box lower and upper corner in x,y,z.
++ * Store this in 4 instead of 3 reals, which is useful with SSE.
++ * To avoid complicating the code we also use 4 without SSE.
++ */
++#define NNBSBB_C         4
++/* Pair search box lower and upper bound in z only. */
++#define NNBSBB_D         2
++/* Pair search box lower and upper corner x,y,z indices, entry 3 is unused */
++#define BB_X  0
++#define BB_Y  1
++#define BB_Z  2
++
++/* Bounding box for a nbnxn atom cluster */
++typedef struct {
++    float lower[NNBSBB_C];
++    float upper[NNBSBB_C];
++} nbnxn_bb_t;
++
++
 +/* A pair-search grid struct for one domain decomposition zone */
 +typedef struct {
 +    rvec     c0;               /* The lower corner of the (local) grid        */
 +    rvec     c1;               /* The upper corner of the (local) grid        */
 +    real     atom_density;     /* The atom number density for the local grid  */
 +
 +    gmx_bool bSimple;          /* Is this grid simple or super/sub            */
 +    int      na_c;             /* Number of atoms per cluster                 */
 +    int      na_cj;            /* Number of atoms for list j-clusters         */
 +    int      na_sc;            /* Number of atoms per super-cluster           */
 +    int      na_c_2log;        /* 2log of na_c                                */
 +
 +    int      ncx;              /* Number of (super-)cells along x             */
 +    int      ncy;              /* Number of (super-)cells along y             */
 +    int      nc;               /* Total number of (super-)cells               */
 +
 +    real     sx;               /* x-size of a (super-)cell                    */
 +    real     sy;               /* y-size of a (super-)cell                    */
 +    real     inv_sx;           /* 1/sx                                        */
 +    real     inv_sy;           /* 1/sy                                        */
 +
 +    int      cell0;            /* Index in nbs->cell corresponding to cell 0  */
 +
 +    int     *cxy_na;           /* The number of atoms for each column in x,y  */
 +    int     *cxy_ind;          /* Grid (super)cell index, offset from cell0   */
 +    int      cxy_nalloc;       /* Allocation size for cxy_na and cxy_ind      */
 +
-     float   *bbcz_simple;      /* bbcz for simple grid converted from super   */
-     float   *bb_simple;        /* bb for simple grid converted from super     */
-     int     *flags_simple;     /* flags for simple grid converted from super  */
-     int      nc_nalloc_simple; /* Allocation size for the pointers above   */
++    int        *nsubc;         /* The number of sub cells for each super cell */
++    float      *bbcz;          /* Bounding boxes in z for the super cells     */
++    nbnxn_bb_t *bb;            /* 3D bounding boxes for the sub cells         */
++    nbnxn_bb_t *bbj;           /* 3D j-b.boxes for SSE-double or AVX-single   */
++    float      *pbb;           /* 3D b. boxes in xxxx format per super cell   */
++    int        *flags;         /* Flag for the super cells                    */
++    int         nc_nalloc;     /* Allocation size for the pointers above      */
 +
-     gmx_cache_protect_t     cp0;   /* Protect cache between threads               */
++    float      *bbcz_simple;   /* bbcz for simple grid converted from super   */
++    nbnxn_bb_t *bb_simple;     /* bb for simple grid converted from super     */
++    int        *flags_simple;  /* flags for simple grid converted from super  */
++    int         nc_nalloc_simple; /* Allocation size for the pointers above   */
 +
 +    int      nsubc_tot;        /* Total number of subcells, used for printing */
 +} nbnxn_grid_t;
 +
 +#ifdef GMX_NBNXN_SIMD
 +
 +typedef struct nbnxn_x_ci_simd_4xn {
 +    /* The i-cluster coordinates for simple search */
 +    gmx_mm_pr ix_SSE0, iy_SSE0, iz_SSE0;
 +    gmx_mm_pr ix_SSE1, iy_SSE1, iz_SSE1;
 +    gmx_mm_pr ix_SSE2, iy_SSE2, iz_SSE2;
 +    gmx_mm_pr ix_SSE3, iy_SSE3, iz_SSE3;
 +} nbnxn_x_ci_simd_4xn_t;
 +
 +typedef struct nbnxn_x_ci_simd_2xnn {
 +    /* The i-cluster coordinates for simple search */
 +    gmx_mm_pr ix_SSE0, iy_SSE0, iz_SSE0;
 +    gmx_mm_pr ix_SSE2, iy_SSE2, iz_SSE2;
 +} nbnxn_x_ci_simd_2xnn_t;
 +
 +#endif
 +
 +/* Working data for the actual i-supercell during pair search */
 +typedef struct nbnxn_list_work {
-     float                  *bb_ci; /* The bounding boxes, pbc shifted, for each cluster */
-     real                   *x_ci;  /* The coordinates, pbc shifted, for each atom       */
++    gmx_cache_protect_t     cp0;    /* Protect cache between threads               */
 +
++    nbnxn_bb_t             *bb_ci;  /* The bounding boxes, pbc shifted, for each cluster */
++    float                  *pbb_ci; /* As bb_ci, but in xxxx packed format               */
++    real                   *x_ci;   /* The coordinates, pbc shifted, for each atom       */
 +#ifdef GMX_NBNXN_SIMD
 +    nbnxn_x_ci_simd_4xn_t  *x_ci_simd_4xn;
 +    nbnxn_x_ci_simd_2xnn_t *x_ci_simd_2xnn;
 +#endif
 +    int                     cj_ind;          /* The current cj_ind index for the current list     */
 +    int                     cj4_init;        /* The first uninitialized cj4 block                 */
 +
 +    float                  *d2;              /* Bounding box distance work array                  */
 +
 +    nbnxn_cj_t             *cj;              /* The j-cell list                                   */
 +    int                     cj_nalloc;       /* Allocation size of cj                             */
 +
 +    int                     ncj_noq;         /* Nr. of cluster pairs without Coul for flop count  */
 +    int                     ncj_hlj;         /* Nr. of cluster pairs with 1/2 LJ for flop count   */
 +
 +    int                    *sort;            /* Sort index                    */
 +    int                     sort_nalloc;     /* Allocation size of sort       */
 +
 +    nbnxn_sci_t            *sci_sort;        /* Second sci array, for sorting */
 +    int                     sci_sort_nalloc; /* Allocation size of sci_sort   */
 +
 +    gmx_cache_protect_t     cp1;             /* Protect cache between threads               */
 +} nbnxn_list_work_t;
 +
 +/* Function type for setting the i-atom coordinate working data */
 +typedef void
 +    gmx_icell_set_x_t (int ci,
 +                       real shx, real shy, real shz,
 +                       int na_c,
 +                       int stride, const real *x,
 +                       nbnxn_list_work_t *work);
 +
 +static gmx_icell_set_x_t icell_set_x_simple;
 +#ifdef GMX_NBNXN_SIMD
 +static gmx_icell_set_x_t icell_set_x_simple_simd_4xn;
 +static gmx_icell_set_x_t icell_set_x_simple_simd_2xnn;
 +#endif
 +static gmx_icell_set_x_t icell_set_x_supersub;
 +#ifdef NBNXN_SEARCH_SSE
 +static gmx_icell_set_x_t icell_set_x_supersub_sse8;
 +#endif
 +
 +/* Local cycle count struct for profiling */
 +typedef struct {
 +    int          count;
 +    gmx_cycles_t c;
 +    gmx_cycles_t start;
 +} nbnxn_cycle_t;
 +
 +/* Local cycle count enum for profiling */
 +enum {
 +    enbsCCgrid, enbsCCsearch, enbsCCcombine, enbsCCreducef, enbsCCnr
 +};
 +
 +/* Thread-local work struct, contains part of nbnxn_grid_t */
 +typedef struct {
 +    gmx_cache_protect_t  cp0;
 +
 +    int                 *cxy_na;
 +    int                  cxy_na_nalloc;
 +
 +    int                 *sort_work;
 +    int                  sort_work_nalloc;
 +
 +    nbnxn_buffer_flags_t buffer_flags; /* Flags for force buffer access */
 +
 +    int                  ndistc;       /* Number of distance checks for flop counting */
 +
 +    nbnxn_cycle_t        cc[enbsCCnr];
 +
 +    gmx_cache_protect_t  cp1;
 +} nbnxn_search_work_t;
 +
 +/* Main pair-search struct, contains the grid(s), not the pair-list(s) */
 +typedef struct nbnxn_search {
 +    int                 ePBC;            /* PBC type enum                              */
 +    matrix              box;             /* The periodic unit-cell                     */
 +
 +    gmx_bool            DomDec;          /* Are we doing domain decomposition?         */
 +    ivec                dd_dim;          /* Are we doing DD in x,y,z?                  */
 +    gmx_domdec_zones_t *zones;           /* The domain decomposition zones        */
 +
 +    int                 ngrid;           /* The number of grids, equal to #DD-zones    */
 +    nbnxn_grid_t       *grid;            /* Array of grids, size ngrid                 */
 +    int                *cell;            /* Actual allocated cell array for all grids  */
 +    int                 cell_nalloc;     /* Allocation size of cell                    */
 +    int                *a;               /* Atom index for grid, the inverse of cell   */
 +    int                 a_nalloc;        /* Allocation size of a                       */
 +
 +    int                 natoms_local;    /* The local atoms run from 0 to natoms_local */
 +    int                 natoms_nonlocal; /* The non-local atoms run from natoms_local
 +                                          * to natoms_nonlocal */
 +
 +    gmx_bool             print_cycles;
 +    int                  search_count;
 +    nbnxn_cycle_t        cc[enbsCCnr];
 +
 +    gmx_icell_set_x_t   *icell_set_x; /* Function for setting i-coords    */
 +
 +    int                  nthread_max; /* Maximum number of threads for pair-search  */
 +    nbnxn_search_work_t *work;        /* Work array, size nthread_max          */
 +} nbnxn_search_t_t;
 +
 +
 +static void nbs_cycle_start(nbnxn_cycle_t *cc)
 +{
 +    cc->start = gmx_cycles_read();
 +}
 +
 +static void nbs_cycle_stop(nbnxn_cycle_t *cc)
 +{
 +    cc->c += gmx_cycles_read() - cc->start;
 +    cc->count++;
 +}
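 +
 +/* Usage sketch (illustrative only): a search phase is timed by
 + * bracketing it with the helpers above, e.g.
 + *
 + *     nbs_cycle_start(&nbs->cc[enbsCCsearch]);
 + *     ...do the pair search...
 + *     nbs_cycle_stop(&nbs->cc[enbsCCsearch]);
 + *
 + * cc->c then accumulates the cycles and cc->count the invocations.
 + */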
 +
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +#endif
index 7baaf5ac5db204c121e7e53b4379e4360cf000f6,0000000000000000000000000000000000000000..fecd353a4ec438ed0306829e3f231509d2f41a93
mode 100644,000000..100644
--- /dev/null
@@@ -1,333 -1,0 +1,369 @@@
 +/*
 + * This file is part of the GROMACS molecular simulation package.
 + *
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + * Copyright (c) 2012, by the GROMACS development team, led by
 + * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 + * others, as listed in the AUTHORS file in the top-level source
 + * directory and at http://www.gromacs.org.
 + *
 + * GROMACS is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2.1
 + * of the License, or (at your option) any later version.
 + *
 + * GROMACS is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with GROMACS; if not, see
 + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 + *
 + * If you want to redistribute modifications to GROMACS, please
 + * consider that scientific software is very special. Version
 + * control is crucial - bugs must be traceable. We will be happy to
 + * consider code for inclusion in the official distribution, but
 + * derived work must not be called official GROMACS. Details are found
 + * in the README & COPYING files - if they are missing, get the
 + * official version at http://www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "typedefs.h"
 +#include "vec.h"
 +#include "smalloc.h"
 +#include "force.h"
 +#include "gmx_omp_nthreads.h"
 +#include "../nbnxn_consts.h"
 +#include "nbnxn_kernel_common.h"
 +
 +#ifdef GMX_NBNXN_SIMD_2XNN
 +
 +/* Include the full width SIMD macros */
 +#include "gmx_simd_macros.h"
 +#include "gmx_simd_vec.h"
 +
 +#include "nbnxn_kernel_simd_2xnn.h"
 +
 +#if !(GMX_SIMD_WIDTH_HERE == 8 || GMX_SIMD_WIDTH_HERE == 16)
 +#error "unsupported SIMD width"
 +#endif
 +
++#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
++
++#define UNROLLI    NBNXN_CPU_CLUSTER_I_SIZE
++#define UNROLLJ    (GMX_SIMD_WIDTH_HERE/2)
++
++/* The stride of all the atom data arrays is equal to half the SIMD width */
++#define STRIDE     (GMX_SIMD_WIDTH_HERE/2)
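++
++/* Example (illustrative): with GMX_SIMD_WIDTH_HERE == 8 this gives
++ * UNROLLJ == STRIDE == 4, i.e. 4-atom j-clusters with x, y and z
++ * each packed 4 wide in the atom data arrays.
++ */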
++
++#if GMX_SIMD_WIDTH_HERE == 8
++#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
++#else
++#if GMX_SIMD_WIDTH_HERE == 16
++/* This is getting ridiculous; SIMD horizontal adds would help,
++ * but this is not performance critical (only used to reduce energies).
++ */
++#define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7]+x[8]+x[9]+x[10]+x[11]+x[12]+x[13]+x[14]+x[15])
++#else
++#error "unsupported kernel configuration"
++#endif
++#endif
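++
++/* Usage sketch (illustrative only): SUM_SIMD horizontally reduces a
++ * SIMD accumulator after storing it to an aligned buffer, e.g.
++ *
++ *     real tmpa[2*GMX_SIMD_WIDTH_HERE], *tmp;
++ *     tmp = gmx_simd_align_real(tmpa);
++ *     gmx_store_pr(tmp, vctot_S);
++ *     vc += SUM_SIMD(tmp);
++ */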
++
++
++#include "nbnxn_kernel_simd_utils.h"
++
++static inline void
++gmx_load_simd_2xnn_interactions(int            excl,
++                                gmx_exclfilter filter_S0,
++                                gmx_exclfilter filter_S2,
++                                gmx_mm_pb     *interact_S0,
++                                gmx_mm_pb     *interact_S2)
++{
++    /* Load integer topology exclusion interaction mask */
++    gmx_exclfilter mask_pr_S = gmx_load1_exclfilter(excl);
++    *interact_S0  = gmx_checkbitmask_pb(mask_pr_S, filter_S0);
++    *interact_S2  = gmx_checkbitmask_pb(mask_pr_S, filter_S2);
++}
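++
++/* Usage sketch (illustrative only): the inner loop uses the resulting
++ * masks to zero excluded pairs, e.g.
++ *
++ *     gmx_mm_pb interact_S0, interact_S2;
++ *     gmx_load_simd_2xnn_interactions(l_cj[cjind].excl,
++ *                                     filter_S0, filter_S2,
++ *                                     &interact_S0, &interact_S2);
++ *     rinv_S0 = gmx_blendzero_pr(rinv_S0, interact_S0);
++ */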
 +
 +/* Include all flavors of the SSE or AVX 2x(N+N) kernel loops */
 +
 +/* Analytical reaction-field kernels */
 +#define CALC_COUL_RF
 +
 +#include "nbnxn_kernel_simd_2xnn_includes.h"
 +
 +#undef CALC_COUL_RF
 +
 +/* Tabulated exclusion interaction electrostatics kernels */
 +#define CALC_COUL_TAB
 +
 +/* Single cut-off: rcoulomb = rvdw */
 +#include "nbnxn_kernel_simd_2xnn_includes.h"
 +
 +/* Twin cut-off: rcoulomb >= rvdw */
 +#define VDW_CUTOFF_CHECK
 +#include "nbnxn_kernel_simd_2xnn_includes.h"
 +#undef VDW_CUTOFF_CHECK
 +
 +#undef CALC_COUL_TAB
 +
 +/* Analytical Ewald exclusion interaction electrostatics kernels */
 +#define CALC_COUL_EWALD
 +
 +/* Single cut-off: rcoulomb = rvdw */
 +#include "nbnxn_kernel_simd_2xnn_includes.h"
 +
 +/* Twin cut-off: rcoulomb >= rvdw */
 +#define VDW_CUTOFF_CHECK
 +#include "nbnxn_kernel_simd_2xnn_includes.h"
 +#undef VDW_CUTOFF_CHECK
 +
 +#undef CALC_COUL_EWALD
 +
 +
 +typedef void (*p_nbk_func_ener)(const nbnxn_pairlist_t     *nbl,
 +                                const nbnxn_atomdata_t     *nbat,
 +                                const interaction_const_t  *ic,
 +                                rvec                       *shift_vec,
 +                                real                       *f,
 +                                real                       *fshift,
 +                                real                       *Vvdw,
 +                                real                       *Vc);
 +
 +typedef void (*p_nbk_func_noener)(const nbnxn_pairlist_t     *nbl,
 +                                  const nbnxn_atomdata_t     *nbat,
 +                                  const interaction_const_t  *ic,
 +                                  rvec                       *shift_vec,
 +                                  real                       *f,
 +                                  real                       *fshift);
 +
 +enum {
 +    coultRF, coultTAB, coultTAB_TWIN, coultEWALD, coultEWALD_TWIN, coultNR
 +};
 +
 +#define NBK_FN(elec, ljcomb) nbnxn_kernel_simd_2xnn_ ## elec ## _comb_ ## ljcomb ## _ener
 +static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] =
 +{ { NBK_FN(rf, geom), NBK_FN(rf, lb), NBK_FN(rf, none) },
 +  { NBK_FN(tab, geom), NBK_FN(tab, lb), NBK_FN(tab, none) },
 +  { NBK_FN(tab_twin, geom), NBK_FN(tab_twin, lb), NBK_FN(tab_twin, none) },
 +  { NBK_FN(ewald, geom), NBK_FN(ewald, lb), NBK_FN(ewald, none) },
 +  { NBK_FN(ewald_twin, geom), NBK_FN(ewald_twin, lb), NBK_FN(ewald_twin, none) } };
 +#undef NBK_FN
 +
 +#define NBK_FN(elec, ljcomb) nbnxn_kernel_simd_2xnn_ ## elec ## _comb_ ## ljcomb ## _energrp
 +static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] =
 +{ { NBK_FN(rf, geom), NBK_FN(rf, lb), NBK_FN(rf, none) },
 +  { NBK_FN(tab, geom), NBK_FN(tab, lb), NBK_FN(tab, none) },
 +  { NBK_FN(tab_twin, geom), NBK_FN(tab_twin, lb), NBK_FN(tab_twin, none) },
 +  { NBK_FN(ewald, geom), NBK_FN(ewald, lb), NBK_FN(ewald, none) },
 +  { NBK_FN(ewald_twin, geom), NBK_FN(ewald_twin, lb), NBK_FN(ewald_twin, none) } };
 +#undef NBK_FN
 +
 +#define NBK_FN(elec, ljcomb) nbnxn_kernel_simd_2xnn_ ## elec ## _comb_ ## ljcomb ## _noener
 +static p_nbk_func_noener p_nbk_noener[coultNR][ljcrNR] =
 +{ { NBK_FN(rf, geom), NBK_FN(rf, lb), NBK_FN(rf, none) },
 +  { NBK_FN(tab, geom), NBK_FN(tab, lb), NBK_FN(tab, none) },
 +  { NBK_FN(tab_twin, geom), NBK_FN(tab_twin, lb), NBK_FN(tab_twin, none) },
 +  { NBK_FN(ewald, geom), NBK_FN(ewald, lb), NBK_FN(ewald, none) },
 +  { NBK_FN(ewald_twin, geom), NBK_FN(ewald_twin, lb), NBK_FN(ewald_twin, none) } };
 +#undef NBK_FN
 +
 +
 +static void reduce_group_energies(int ng, int ng_2log,
 +                                  const real *VSvdw, const real *VSc,
 +                                  real *Vvdw, real *Vc)
 +{
 +    const int unrollj      = GMX_SIMD_WIDTH_HERE/2;
 +    const int unrollj_half = unrollj/2;
 +    int       ng_p2, i, j, j0, j1, c, s;
 +
 +    ng_p2 = (1<<ng_2log);
 +
 +    /* The size of the x86 SIMD energy group buffer array is:
 +     * ng*ng*ng_p2*unrollj_half*simd_width
 +     */
 +    for (i = 0; i < ng; i++)
 +    {
 +        for (j = 0; j < ng; j++)
 +        {
 +            Vvdw[i*ng+j] = 0;
 +            Vc[i*ng+j]   = 0;
 +        }
 +
 +        for (j1 = 0; j1 < ng; j1++)
 +        {
 +            for (j0 = 0; j0 < ng; j0++)
 +            {
 +                c = ((i*ng + j1)*ng_p2 + j0)*unrollj_half*unrollj;
 +                for (s = 0; s < unrollj_half; s++)
 +                {
 +                    Vvdw[i*ng+j0] += VSvdw[c+0];
 +                    Vvdw[i*ng+j1] += VSvdw[c+1];
 +                    Vc  [i*ng+j0] += VSc  [c+0];
 +                    Vc  [i*ng+j1] += VSc  [c+1];
 +                    c             += unrollj + 2;
 +                }
 +            }
 +        }
 +    }
 +}
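 +
 +/* Illustration (reading off the loop above): the SIMD energy buffer is
 + * indexed as
 + *
 + *     c = ((i*ng + j1)*ng_p2 + j0)*unrollj_half*unrollj
 + *
 + * with ng_p2 = 1<<ng_2log, i.e. ng rounded up to a power of two; at each
 + * of the unrollj_half positions two consecutive entries hold partial
 + * sums that are added to the (i,j0) and (i,j1) group pairs.
 + */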
 +
 +#endif /* GMX_NBNXN_SIMD_2XNN */
 +
 +void
 +nbnxn_kernel_simd_2xnn(nbnxn_pairlist_set_t       gmx_unused *nbl_list,
 +                       const nbnxn_atomdata_t     gmx_unused *nbat,
 +                       const interaction_const_t  gmx_unused *ic,
 +                       int                        gmx_unused ewald_excl,
 +                       rvec                       gmx_unused *shift_vec,
 +                       int                        gmx_unused  force_flags,
 +                       int                        gmx_unused  clearF,
 +                       real                       gmx_unused *fshift,
 +                       real                       gmx_unused *Vc,
 +                       real                       gmx_unused *Vvdw)
 +#ifdef GMX_NBNXN_SIMD_2XNN
 +{
 +    int                nnbl;
 +    nbnxn_pairlist_t **nbl;
 +    int                coult;
 +    int                nb;
 +
 +    nnbl = nbl_list->nnbl;
 +    nbl  = nbl_list->nbl;
 +
 +    if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
 +    {
 +        coult = coultRF;
 +    }
 +    else
 +    {
 +        if (ewald_excl == ewaldexclTable)
 +        {
 +            if (ic->rcoulomb == ic->rvdw)
 +            {
 +                coult = coultTAB;
 +            }
 +            else
 +            {
 +                coult = coultTAB_TWIN;
 +            }
 +        }
 +        else
 +        {
 +            if (ic->rcoulomb == ic->rvdw)
 +            {
 +                coult = coultEWALD;
 +            }
 +            else
 +            {
 +                coult = coultEWALD_TWIN;
 +            }
 +        }
 +    }
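 +
 +    /* Summary (illustrative) of the selection above:
 +     *   RF or plain cut-off             -> coultRF
 +     *   tabulated Ewald, rc == rvdw     -> coultTAB
 +     *   tabulated Ewald, rc != rvdw     -> coultTAB_TWIN
 +     *   analytical Ewald, rc == rvdw    -> coultEWALD
 +     *   analytical Ewald, rc != rvdw    -> coultEWALD_TWIN
 +     */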
 +
 +#pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded))
 +    for (nb = 0; nb < nnbl; nb++)
 +    {
 +        nbnxn_atomdata_output_t *out;
 +        real                    *fshift_p;
 +
 +        out = &nbat->out[nb];
 +
 +        if (clearF == enbvClearFYes)
 +        {
 +            clear_f(nbat, nb, out->f);
 +        }
 +
 +        if ((force_flags & GMX_FORCE_VIRIAL) && nnbl == 1)
 +        {
 +            fshift_p = fshift;
 +        }
 +        else
 +        {
 +            fshift_p = out->fshift;
 +
 +            if (clearF == enbvClearFYes)
 +            {
 +                clear_fshift(fshift_p);
 +            }
 +        }
 +
 +        /* With Ewald-type electrostatics the forces on excluded atom pairs
 +         * should not contribute to the virial sum. The exclusion forces
 +         * are not calculated in the energy kernels, but they are in _noener.
 +         */
 +        if (!((force_flags & GMX_FORCE_ENERGY) ||
 +              (EEL_FULL(ic->eeltype) && (force_flags & GMX_FORCE_VIRIAL))))
 +        {
 +            /* Don't calculate energies */
 +            p_nbk_noener[coult][nbat->comb_rule](nbl[nb], nbat,
 +                                                 ic,
 +                                                 shift_vec,
 +                                                 out->f,
 +                                                 fshift_p);
 +        }
 +        else if (out->nV == 1 || !(force_flags & GMX_FORCE_ENERGY))
 +        {
 +            /* No energy groups */
 +            out->Vvdw[0] = 0;
 +            out->Vc[0]   = 0;
 +
 +            p_nbk_ener[coult][nbat->comb_rule](nbl[nb], nbat,
 +                                               ic,
 +                                               shift_vec,
 +                                               out->f,
 +                                               fshift_p,
 +                                               out->Vvdw,
 +                                               out->Vc);
 +        }
 +        else
 +        {
 +            /* Calculate energy group contributions */
 +            int i;
 +
 +            for (i = 0; i < out->nVS; i++)
 +            {
 +                out->VSvdw[i] = 0;
 +            }
 +            for (i = 0; i < out->nVS; i++)
 +            {
 +                out->VSc[i] = 0;
 +            }
 +
 +            p_nbk_energrp[coult][nbat->comb_rule](nbl[nb], nbat,
 +                                                  ic,
 +                                                  shift_vec,
 +                                                  out->f,
 +                                                  fshift_p,
 +                                                  out->VSvdw,
 +                                                  out->VSc);
 +
 +            reduce_group_energies(nbat->nenergrp, nbat->neg_2log,
 +                                  out->VSvdw, out->VSc,
 +                                  out->Vvdw, out->Vc);
 +        }
 +    }
 +
 +    if (force_flags & GMX_FORCE_ENERGY)
 +    {
 +        reduce_energies_over_lists(nbat, nnbl, Vvdw, Vc);
 +    }
 +}
 +#else
 +{
 +    gmx_incons("nbnxn_kernel_simd_2xnn called while GROMACS was configured without 2x(N+N) SIMD kernels enabled");
 +}
 +#endif
index 9068062d7de8f35146b4a9a0c0bf7bb7a62b9eb2,0000000000000000000000000000000000000000..e0960f44ad60c602c127e2181836cdad17649f2e
mode 100644,000000..100644
--- /dev/null
@@@ -1,773 -1,0 +1,753 @@@
- #ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
-     {
-         /* Load integer topology exclusion interaction mask */
-         gmx_epi32 mask_pr_S = gmx_set1_epi32(l_cj[cjind].excl);
-         interact_S0  = gmx_checkbitmask_epi32(mask_pr_S, filter_S0);
-         interact_S2  = gmx_checkbitmask_epi32(mask_pr_S, filter_S2);
-     }
- #else
- #ifdef GMX_SIMD_HAVE_CHECKBITMASK_PR
-     {
-         /* Integer mask set, cast to real and real mask operations */
-         gmx_mm_pr mask_pr_S = gmx_castsi_pr(gmx_set1_epi32(l_cj[cjind].excl));
-         interact_S0  = gmx_checkbitmask_pr(mask_pr_S, filter_S0);
-         interact_S2  = gmx_checkbitmask_pr(mask_pr_S, filter_S2);
-     }
- #else
- #error "No SIMD bitmask operation available"
- #endif
- #endif
 +/*
 + * This file is part of the GROMACS molecular simulation package.
 + *
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2009, The GROMACS Development Team
 + * Copyright (c) 2012, by the GROMACS development team, led by
 + * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 + * others, as listed in the AUTHORS file in the top-level source
 + * directory and at http://www.gromacs.org.
 + *
 + * GROMACS is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2.1
 + * of the License, or (at your option) any later version.
 + *
 + * GROMACS is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with GROMACS; if not, see
 + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 + *
 + * If you want to redistribute modifications to GROMACS, please
 + * consider that scientific software is very special. Version
 + * control is crucial - bugs must be traceable. We will be happy to
 + * consider code for inclusion in the official distribution, but
 + * derived work must not be called official GROMACS. Details are found
 + * in the README & COPYING files - if they are missing, get the
 + * official version at http://www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org.
 + */
 +
 +/* This is the innermost loop contents for the 4 x N atom SIMD kernel.
 + * This flavor of the kernel duplicates the data for N j-particles in
 + * 2xN wide SIMD registers to operate on 2 i-particles at once.
 + * This leads to 4/2=2 sets of most instructions. Therefore we call
 + * this kernel 2x(N+N) = 2xnn.
 + *
 + * This 2xnn kernel is basically the 4xn equivalent with half the registers
 + * and instructions removed.
 + *
 + * An alternative would be to load two different clusters of N j-particles
 + * into SIMD registers, giving a 4x(N+N) kernel. This doubles the amount
 + * of instructions, which could lead to better scheduling. But we actually
 + * observed worse scheduling for the AVX-256 4x8 normal analytical PME
 + * kernel, which has a lower pair throughput than 2x(4+4) with gcc 4.7.
 + * It could be worth trying this option, but it takes some more effort.
 + */
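 +
 +/* Schematic register layout (illustration): gmx_loaddh_pr duplicates the
 + * N j-coordinates into both register halves,
 + *
 + *     jx_S  = [ x_j0 .. x_j(N-1) | x_j0 .. x_j(N-1) ]
 + *
 + * while ix_S0 holds i-particle 0 broadcast over the lower half and
 + * i-particle 1 over the upper half, so one subtraction yields the
 + * distances of N j-particles to two i-particles at once.
 + */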
 +
 +
 +/* When calculating RF or Ewald interactions we calculate the electrostatic
 + * forces on excluded atom pairs here in the non-bonded loops.
 + * But when energies and/or the virial are required we calculate them
 + * separately, as it is then easier to separate the energy and virial
 + * contributions.
 + */
 +#if defined CHECK_EXCLS && defined CALC_COULOMB
 +#define EXCL_FORCES
 +#endif
 +
 +/* Without exclusions and energies we only need to mask the cut-off,
 + * this can be faster with blendv.
 + */
 +#if !(defined CHECK_EXCLS || defined CALC_ENERGIES) && defined GMX_SIMD_HAVE_BLENDV && !defined COUNT_PAIRS
 +/* With RF and tabulated Coulomb we replace cmp+and with sub+blendv.
 + * With gcc this is slower, except for RF on Sandy Bridge.
 + * Tested with gcc 4.6.2, 4.6.3 and 4.7.1.
 + */
 +#if (defined CALC_COUL_RF || defined CALC_COUL_TAB) && (!defined __GNUC__ || (defined CALC_COUL_RF && defined GMX_X86_AVX_256))
 +#define CUTOFF_BLENDV
 +#endif
 +/* With analytical Ewald we replace cmp+and+and with sub+blendv+blendv.
 + * This is only faster with icc on Sandy Bridge (PS kernel slower than gcc 4.7).
 + * Tested with icc 13.
 + */
 +#if defined CALC_COUL_EWALD && defined __INTEL_COMPILER && defined GMX_X86_AVX_256
 +#define CUTOFF_BLENDV
 +#endif
 +#endif
 +
 +{
 +    int        cj, aj, ajx, ajy, ajz;
 +
 +#ifdef ENERGY_GROUPS
 +    /* Energy group indices for two atoms packed into one int */
 +    int        egp_jj[UNROLLJ/2];
 +#endif
 +
 +#ifdef CHECK_EXCLS
 +    /* Interaction (non-exclusion) mask of all 1's or 0's */
 +    gmx_mm_pb  interact_S0;
 +    gmx_mm_pb  interact_S2;
 +#endif
 +
 +    gmx_mm_pr  jx_S, jy_S, jz_S;
 +    gmx_mm_pr  dx_S0, dy_S0, dz_S0;
 +    gmx_mm_pr  dx_S2, dy_S2, dz_S2;
 +    gmx_mm_pr  tx_S0, ty_S0, tz_S0;
 +    gmx_mm_pr  tx_S2, ty_S2, tz_S2;
 +    gmx_mm_pr  rsq_S0, rinv_S0, rinvsq_S0;
 +    gmx_mm_pr  rsq_S2, rinv_S2, rinvsq_S2;
 +#ifndef CUTOFF_BLENDV
 +    /* wco: within cut-off, mask of all 1's or 0's */
 +    gmx_mm_pb  wco_S0;
 +    gmx_mm_pb  wco_S2;
 +#endif
 +#ifdef VDW_CUTOFF_CHECK
 +    gmx_mm_pb  wco_vdw_S0;
 +#ifndef HALF_LJ
 +    gmx_mm_pb  wco_vdw_S2;
 +#endif
 +#endif
 +#ifdef CALC_COULOMB
 +#ifdef CHECK_EXCLS
 +    /* 1/r masked with the interaction mask */
 +    gmx_mm_pr  rinv_ex_S0;
 +    gmx_mm_pr  rinv_ex_S2;
 +#endif
 +    gmx_mm_pr  jq_S;
 +    gmx_mm_pr  qq_S0;
 +    gmx_mm_pr  qq_S2;
 +#ifdef CALC_COUL_TAB
 +    /* The force (PME mesh force) we need to subtract from 1/r^2 */
 +    gmx_mm_pr  fsub_S0;
 +    gmx_mm_pr  fsub_S2;
 +#endif
 +#ifdef CALC_COUL_EWALD
 +    gmx_mm_pr  brsq_S0, brsq_S2;
 +    gmx_mm_pr  ewcorr_S0, ewcorr_S2;
 +#endif
 +
 +    /* frcoul = (1/r - fsub)*r */
 +    gmx_mm_pr  frcoul_S0;
 +    gmx_mm_pr  frcoul_S2;
 +#ifdef CALC_COUL_TAB
 +    /* For tables: r, rs=r/sp, rf=floor(rs), frac=rs-rf */
 +    gmx_mm_pr  r_S0, rs_S0, rf_S0, frac_S0;
 +    gmx_mm_pr  r_S2, rs_S2, rf_S2, frac_S2;
 +    /* Table index: rs truncated to an int */
 +    gmx_epi32  ti_S0, ti_S2;
 +    /* Linear force table values */
 +    gmx_mm_pr  ctab0_S0, ctab1_S0;
 +    gmx_mm_pr  ctab0_S2, ctab1_S2;
 +#ifdef CALC_ENERGIES
 +    /* Quadratic energy table value */
 +    gmx_mm_pr  ctabv_S0;
 +    gmx_mm_pr  ctabv_S2;
 +#endif
 +#endif
 +#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
 +    /* The potential (PME mesh) we need to subtract from 1/r */
 +    gmx_mm_pr  vc_sub_S0;
 +    gmx_mm_pr  vc_sub_S2;
 +#endif
 +#ifdef CALC_ENERGIES
 +    /* Electrostatic potential */
 +    gmx_mm_pr  vcoul_S0;
 +    gmx_mm_pr  vcoul_S2;
 +#endif
 +#endif
 +    /* The force times 1/r */
 +    gmx_mm_pr  fscal_S0;
 +    gmx_mm_pr  fscal_S2;
 +
 +#ifdef CALC_LJ
 +#ifdef LJ_COMB_LB
 +    /* LJ sigma_j/2 and sqrt(epsilon_j) */
 +    gmx_mm_pr  hsig_j_S, seps_j_S;
 +    /* LJ sigma_ij and epsilon_ij */
 +    gmx_mm_pr  sig_S0, eps_S0;
 +#ifndef HALF_LJ
 +    gmx_mm_pr  sig_S2, eps_S2;
 +#endif
 +#ifdef CALC_ENERGIES
 +    gmx_mm_pr  sig2_S0, sig6_S0;
 +#ifndef HALF_LJ
 +    gmx_mm_pr  sig2_S2, sig6_S2;
 +#endif
 +#endif /* LJ_COMB_LB */
 +#endif /* CALC_LJ */
 +
 +#ifdef LJ_COMB_GEOM
 +    gmx_mm_pr  c6s_j_S, c12s_j_S;
 +#endif
 +
 +#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
 +    /* Index for loading LJ parameters, complicated when interleaving */
 +    int         aj2;
 +#endif
 +
 +#ifndef FIX_LJ_C
 +    /* LJ C6 and C12 parameters, used with geometric comb. rule */
 +    gmx_mm_pr  c6_S0, c12_S0;
 +#ifndef HALF_LJ
 +    gmx_mm_pr  c6_S2, c12_S2;
 +#endif
 +#endif
 +
 +    /* Intermediate variables for LJ calculation */
 +#ifndef LJ_COMB_LB
 +    gmx_mm_pr  rinvsix_S0;
 +#ifndef HALF_LJ
 +    gmx_mm_pr  rinvsix_S2;
 +#endif
 +#endif
 +#ifdef LJ_COMB_LB
 +    gmx_mm_pr  sir_S0, sir2_S0, sir6_S0;
 +#ifndef HALF_LJ
 +    gmx_mm_pr  sir_S2, sir2_S2, sir6_S2;
 +#endif
 +#endif
 +
 +    gmx_mm_pr  FrLJ6_S0, FrLJ12_S0;
 +#ifndef HALF_LJ
 +    gmx_mm_pr  FrLJ6_S2, FrLJ12_S2;
 +#endif
 +#ifdef CALC_ENERGIES
 +    gmx_mm_pr  VLJ6_S0, VLJ12_S0, VLJ_S0;
 +#ifndef HALF_LJ
 +    gmx_mm_pr  VLJ6_S2, VLJ12_S2, VLJ_S2;
 +#endif
 +#endif
 +#endif /* CALC_LJ */
 +
 +    gmx_mm_hpr fjx_S, fjy_S, fjz_S;
 +
 +    /* j-cluster index */
 +    cj            = l_cj[cjind].cj;
 +
 +    /* Atom indices (of the first atom in the cluster) */
 +    aj            = cj*UNROLLJ;
 +#if defined CALC_LJ && (defined LJ_COMB_GEOM || defined LJ_COMB_LB)
 +#if UNROLLJ == STRIDE
 +    aj2           = aj*2;
 +#else
 +    aj2           = (cj>>1)*2*STRIDE + (cj & 1)*UNROLLJ;
 +#endif
 +#endif
 +#if UNROLLJ == STRIDE
 +    ajx           = aj*DIM;
 +#else
 +    ajx           = (cj>>1)*DIM*STRIDE + (cj & 1)*UNROLLJ;
 +#endif
 +    ajy           = ajx + STRIDE;
 +    ajz           = ajy + STRIDE;
 +
 +#ifdef CHECK_EXCLS
-     /* No exclusion forces: remove all excluded atom pairs from the list */
++    gmx_load_simd_2xnn_interactions(l_cj[cjind].excl, filter_S0, filter_S2, &interact_S0, &interact_S2);
 +#endif /* CHECK_EXCLS */
 +
 +    /* load j atom coordinates */
 +    gmx_loaddh_pr(&jx_S, x+ajx);
 +    gmx_loaddh_pr(&jy_S, x+ajy);
 +    gmx_loaddh_pr(&jz_S, x+ajz);
 +
 +    /* Calculate distance */
 +    dx_S0       = gmx_sub_pr(ix_S0, jx_S);
 +    dy_S0       = gmx_sub_pr(iy_S0, jy_S);
 +    dz_S0       = gmx_sub_pr(iz_S0, jz_S);
 +    dx_S2       = gmx_sub_pr(ix_S2, jx_S);
 +    dy_S2       = gmx_sub_pr(iy_S2, jy_S);
 +    dz_S2       = gmx_sub_pr(iz_S2, jz_S);
 +
 +    /* rsq = dx*dx+dy*dy+dz*dz */
 +    rsq_S0      = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0);
 +    rsq_S2      = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2);
 +
 +#ifndef CUTOFF_BLENDV
 +    wco_S0      = gmx_cmplt_pr(rsq_S0, rc2_S);
 +    wco_S2      = gmx_cmplt_pr(rsq_S2, rc2_S);
 +#endif
 +
 +#ifdef CHECK_EXCLS
 +#ifdef EXCL_FORCES
 +    /* Only remove the (sub-)diagonal to avoid double counting */
 +#if UNROLLJ == UNROLLI
 +    if (cj == ci_sh)
 +    {
 +        wco_S0  = gmx_and_pb(wco_S0, diagonal_mask_S0);
 +        wco_S2  = gmx_and_pb(wco_S2, diagonal_mask_S2);
 +    }
 +#else
 +#if UNROLLJ == 2*UNROLLI
 +    if (cj*2 == ci_sh)
 +    {
 +        wco_S0  = gmx_and_pb(wco_S0, diagonal_mask0_S0);
 +        wco_S2  = gmx_and_pb(wco_S2, diagonal_mask0_S2);
 +    }
 +    else if (cj*2 + 1 == ci_sh)
 +    {
 +        wco_S0  = gmx_and_pb(wco_S0, diagonal_mask1_S0);
 +        wco_S2  = gmx_and_pb(wco_S2, diagonal_mask1_S2);
 +    }
 +#else
 +#error "only UNROLLJ == UNROLLI*(1 or 2) currently supported in 2xnn kernels"
 +#endif
 +#endif
 +#else /* EXCL_FORCES */
++    /* No exclusion forces: remove all excluded atom pairs from the list */
 +    wco_S0      = gmx_and_pb(wco_S0, interact_S0);
 +    wco_S2      = gmx_and_pb(wco_S2, interact_S2);
 +#endif
 +#endif
 +
 +#ifdef COUNT_PAIRS
 +    {
 +        int  i, j;
 +        real tmpa[2*GMX_SIMD_WIDTH_HERE], *tmp;
 +        tmp = gmx_simd_align_real(tmpa);
 +        for (i = 0; i < UNROLLI; i+=2)
 +        {
 +            gmx_store_pr(tmp, i == 0 ? wco_S0 : wco_S2);
 +            for (j = 0; j < 2*UNROLLJ; j++)
 +            {
 +                if (!(tmp[j] == 0))
 +                {
 +                    npair++;
 +                }
 +            }
 +        }
 +    }
 +#endif
 +
 +#ifdef CHECK_EXCLS
 +    /* For excluded pairs add a small number to avoid r^-6 = NaN */
 +    rsq_S0      = gmx_masknot_add_pr(interact_S0, rsq_S0, avoid_sing_S);
 +    rsq_S2      = gmx_masknot_add_pr(interact_S2, rsq_S2, avoid_sing_S);
 +#endif
 +
 +    /* Calculate 1/r */
 +    rinv_S0     = gmx_invsqrt_pr(rsq_S0);
 +    rinv_S2     = gmx_invsqrt_pr(rsq_S2);
 +
 +#ifdef CALC_COULOMB
 +    /* Load parameters for j atom */
 +    gmx_loaddh_pr(&jq_S, q+aj);
 +    qq_S0       = gmx_mul_pr(iq_S0, jq_S);
 +    qq_S2       = gmx_mul_pr(iq_S2, jq_S);
 +#endif
 +
 +#ifdef CALC_LJ
 +
 +#if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C
 +    load_lj_pair_params2(nbfp0, nbfp1, type, aj, &c6_S0, &c12_S0);
 +#ifndef HALF_LJ
 +    load_lj_pair_params2(nbfp2, nbfp3, type, aj, &c6_S2, &c12_S2);
 +#endif
 +#endif /* not defined any LJ rule */
 +
 +#ifdef LJ_COMB_GEOM
 +    gmx_loaddh_pr(&c6s_j_S,  ljc+aj2+0);
 +    gmx_loaddh_pr(&c12s_j_S, ljc+aj2+STRIDE);
 +    c6_S0       = gmx_mul_pr(c6s_S0, c6s_j_S );
 +#ifndef HALF_LJ
 +    c6_S2       = gmx_mul_pr(c6s_S2, c6s_j_S );
 +#endif
 +    c12_S0      = gmx_mul_pr(c12s_S0, c12s_j_S);
 +#ifndef HALF_LJ
 +    c12_S2      = gmx_mul_pr(c12s_S2, c12s_j_S);
 +#endif
 +#endif /* LJ_COMB_GEOM */
 +
 +#ifdef LJ_COMB_LB
 +    gmx_loaddh_pr(&hsig_j_S, ljc+aj2+0);
 +    gmx_loaddh_pr(&seps_j_S, ljc+aj2+STRIDE);
 +
 +    sig_S0      = gmx_add_pr(hsig_i_S0, hsig_j_S);
 +    eps_S0      = gmx_mul_pr(seps_i_S0, seps_j_S);
 +#ifndef HALF_LJ
 +    sig_S2      = gmx_add_pr(hsig_i_S2, hsig_j_S);
 +    eps_S2      = gmx_mul_pr(seps_i_S2, seps_j_S);
 +#endif
 +#endif /* LJ_COMB_LB */
 +
 +#endif /* CALC_LJ */
 +
 +#ifndef CUTOFF_BLENDV
 +    rinv_S0     = gmx_blendzero_pr(rinv_S0, wco_S0);
 +    rinv_S2     = gmx_blendzero_pr(rinv_S2, wco_S2);
 +#else
 +    /* We only need to mask for the cut-off: blendv is faster */
 +    rinv_S0     = gmx_blendv_pr(rinv_S0, zero_S, gmx_sub_pr(rc2_S, rsq_S0));
 +    rinv_S2     = gmx_blendv_pr(rinv_S2, zero_S, gmx_sub_pr(rc2_S, rsq_S2));
 +#endif
 +
 +    rinvsq_S0   = gmx_mul_pr(rinv_S0, rinv_S0);
 +    rinvsq_S2   = gmx_mul_pr(rinv_S2, rinv_S2);
 +
 +#ifdef CALC_COULOMB
 +    /* Note that here we calculate force*r, not the usual force/r.
 +     * This allows avoiding masking the reaction-field contribution,
 +     * as frcoul is later multiplied by rinvsq which has been
 +     * masked with the cut-off check.
 +     */
 +
 +#ifdef EXCL_FORCES
 +    /* Only add 1/r for non-excluded atom pairs */
 +    rinv_ex_S0  = gmx_blendzero_pr(rinv_S0, interact_S0);
 +    rinv_ex_S2  = gmx_blendzero_pr(rinv_S2, interact_S2);
 +#else
 +    /* No exclusion forces, we always need 1/r */
 +#define     rinv_ex_S0    rinv_S0
 +#define     rinv_ex_S2    rinv_S2
 +#endif
 +
 +#ifdef CALC_COUL_RF
 +    /* Electrostatic interactions */
 +    frcoul_S0   = gmx_mul_pr(qq_S0, gmx_madd_pr(rsq_S0, mrc_3_S, rinv_ex_S0));
 +    frcoul_S2   = gmx_mul_pr(qq_S2, gmx_madd_pr(rsq_S2, mrc_3_S, rinv_ex_S2));
 +
 +#ifdef CALC_ENERGIES
 +    vcoul_S0    = gmx_mul_pr(qq_S0, gmx_add_pr(rinv_ex_S0, gmx_add_pr(gmx_mul_pr(rsq_S0, hrc_3_S), moh_rc_S)));
 +    vcoul_S2    = gmx_mul_pr(qq_S2, gmx_add_pr(rinv_ex_S2, gmx_add_pr(gmx_mul_pr(rsq_S2, hrc_3_S), moh_rc_S)));
 +#endif
 +#endif
 +
 +#ifdef CALC_COUL_EWALD
 +    /* We need to mask (or limit) rsq for the cut-off,
 +     * as large distances can cause an overflow in gmx_pmecorrF/V.
 +     */
 +#ifndef CUTOFF_BLENDV
 +    brsq_S0     = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S0, wco_S0));
 +    brsq_S2     = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S2, wco_S2));
 +#else
 +    /* Strangely, putting mul on a separate line is slower (icc 13) */
 +    brsq_S0     = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S0, zero_S, gmx_sub_pr(rc2_S, rsq_S0)));
 +    brsq_S2     = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S2, zero_S, gmx_sub_pr(rc2_S, rsq_S2)));
 +#endif
 +    ewcorr_S0   = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S0), beta_S);
 +    ewcorr_S2   = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S2), beta_S);
 +    frcoul_S0   = gmx_mul_pr(qq_S0, gmx_madd_pr(ewcorr_S0, brsq_S0, rinv_ex_S0));
 +    frcoul_S2   = gmx_mul_pr(qq_S2, gmx_madd_pr(ewcorr_S2, brsq_S2, rinv_ex_S2));
 +
 +#ifdef CALC_ENERGIES
 +    vc_sub_S0   = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S0), beta_S);
 +    vc_sub_S2   = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S2), beta_S);
 +#endif
 +
 +#endif /* CALC_COUL_EWALD */
 +
 +#ifdef CALC_COUL_TAB
 +    /* Electrostatic interactions */
 +    r_S0        = gmx_mul_pr(rsq_S0, rinv_S0);
 +    r_S2        = gmx_mul_pr(rsq_S2, rinv_S2);
 +    /* Convert r to scaled table units */
 +    rs_S0       = gmx_mul_pr(r_S0, invtsp_S);
 +    rs_S2       = gmx_mul_pr(r_S2, invtsp_S);
 +    /* Truncate scaled r to an int */
 +    ti_S0       = gmx_cvttpr_epi32(rs_S0);
 +    ti_S2       = gmx_cvttpr_epi32(rs_S2);
 +#ifdef GMX_SIMD_HAVE_FLOOR
 +    rf_S0       = gmx_floor_pr(rs_S0);
 +    rf_S2       = gmx_floor_pr(rs_S2);
 +#else
 +    rf_S0       = gmx_cvtepi32_pr(ti_S0);
 +    rf_S2       = gmx_cvtepi32_pr(ti_S2);
 +#endif
 +    frac_S0     = gmx_sub_pr(rs_S0, rf_S0);
 +    frac_S2     = gmx_sub_pr(rs_S2, rf_S2);
 +
 +    /* Load and interpolate table forces and possibly energies.
 +     * Force and energy can be combined in one table, stride 4: FDV0
 +     * or in two separate tables with stride 1: F and V
 +     * Currently single precision uses FDV0, double F and V.
 +     */
 +#ifndef CALC_ENERGIES
 +    load_table_f(tab_coul_F, ti_S0, ti0, &ctab0_S0, &ctab1_S0);
 +    load_table_f(tab_coul_F, ti_S2, ti2, &ctab0_S2, &ctab1_S2);
 +#else
 +#ifdef TAB_FDV0
 +    load_table_f_v(tab_coul_F, ti_S0, ti0, &ctab0_S0, &ctab1_S0, &ctabv_S0);
 +    load_table_f_v(tab_coul_F, ti_S2, ti2, &ctab0_S2, &ctab1_S2, &ctabv_S2);
 +#else
 +    load_table_f_v(tab_coul_F, tab_coul_V, ti_S0, ti0, &ctab0_S0, &ctab1_S0, &ctabv_S0);
 +    load_table_f_v(tab_coul_F, tab_coul_V, ti_S2, ti2, &ctab0_S2, &ctab1_S2, &ctabv_S2);
 +#endif
 +#endif
 +    fsub_S0     = gmx_add_pr(ctab0_S0, gmx_mul_pr(frac_S0, ctab1_S0));
 +    fsub_S2     = gmx_add_pr(ctab0_S2, gmx_mul_pr(frac_S2, ctab1_S2));
 +    frcoul_S0   = gmx_mul_pr(qq_S0, gmx_sub_pr(rinv_ex_S0, gmx_mul_pr(fsub_S0, r_S0)));
 +    frcoul_S2   = gmx_mul_pr(qq_S2, gmx_sub_pr(rinv_ex_S2, gmx_mul_pr(fsub_S2, r_S2)));
 +
 +#ifdef CALC_ENERGIES
 +    vc_sub_S0   = gmx_add_pr(ctabv_S0, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S0), gmx_add_pr(ctab0_S0, fsub_S0)));
 +    vc_sub_S2   = gmx_add_pr(ctabv_S2, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S2), gmx_add_pr(ctab0_S2, fsub_S2)));
 +#endif
 +#endif /* CALC_COUL_TAB */
 +
 +#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
 +#ifndef NO_SHIFT_EWALD
 +    /* Add Ewald potential shift to vc_sub for convenience */
 +#ifdef CHECK_EXCLS
 +    vc_sub_S0   = gmx_add_pr(vc_sub_S0, gmx_blendzero_pr(sh_ewald_S, interact_S0));
 +    vc_sub_S2   = gmx_add_pr(vc_sub_S2, gmx_blendzero_pr(sh_ewald_S, interact_S2));
 +#else
 +    vc_sub_S0   = gmx_add_pr(vc_sub_S0, sh_ewald_S);
 +    vc_sub_S2   = gmx_add_pr(vc_sub_S2, sh_ewald_S);
 +#endif
 +#endif
 +
 +    vcoul_S0    = gmx_mul_pr(qq_S0, gmx_sub_pr(rinv_ex_S0, vc_sub_S0));
 +    vcoul_S2    = gmx_mul_pr(qq_S2, gmx_sub_pr(rinv_ex_S2, vc_sub_S2));
 +#endif
 +
 +#ifdef CALC_ENERGIES
 +    /* Mask energy for cut-off and diagonal */
 +    vcoul_S0    = gmx_blendzero_pr(vcoul_S0, wco_S0);
 +    vcoul_S2    = gmx_blendzero_pr(vcoul_S2, wco_S2);
 +#endif
 +
 +#endif /* CALC_COULOMB */
 +
 +#ifdef CALC_LJ
 +    /* Lennard-Jones interaction */
 +
 +#ifdef VDW_CUTOFF_CHECK
 +    wco_vdw_S0  = gmx_cmplt_pr(rsq_S0, rcvdw2_S);
 +#ifndef HALF_LJ
 +    wco_vdw_S2  = gmx_cmplt_pr(rsq_S2, rcvdw2_S);
 +#endif
 +#else
 +    /* Same cut-off for Coulomb and VdW, reuse the registers */
 +#define     wco_vdw_S0    wco_S0
 +#define     wco_vdw_S2    wco_S2
 +#endif
 +
 +#ifndef LJ_COMB_LB
 +    rinvsix_S0  = gmx_mul_pr(rinvsq_S0, gmx_mul_pr(rinvsq_S0, rinvsq_S0));
 +#ifdef EXCL_FORCES
 +    rinvsix_S0  = gmx_blendzero_pr(rinvsix_S0, interact_S0);
 +#endif
 +#ifndef HALF_LJ
 +    rinvsix_S2  = gmx_mul_pr(rinvsq_S2, gmx_mul_pr(rinvsq_S2, rinvsq_S2));
 +#ifdef EXCL_FORCES
 +    rinvsix_S2  = gmx_blendzero_pr(rinvsix_S2, interact_S2);
 +#endif
 +#endif
 +#ifdef VDW_CUTOFF_CHECK
 +    rinvsix_S0  = gmx_blendzero_pr(rinvsix_S0, wco_vdw_S0);
 +#ifndef HALF_LJ
 +    rinvsix_S2  = gmx_blendzero_pr(rinvsix_S2, wco_vdw_S2);
 +#endif
 +#endif
 +    FrLJ6_S0    = gmx_mul_pr(c6_S0, rinvsix_S0);
 +#ifndef HALF_LJ
 +    FrLJ6_S2    = gmx_mul_pr(c6_S2, rinvsix_S2);
 +#endif
 +    FrLJ12_S0   = gmx_mul_pr(c12_S0, gmx_mul_pr(rinvsix_S0, rinvsix_S0));
 +#ifndef HALF_LJ
 +    FrLJ12_S2   = gmx_mul_pr(c12_S2, gmx_mul_pr(rinvsix_S2, rinvsix_S2));
 +#endif
 +#endif /* not LJ_COMB_LB */
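 +
 +/* Worked equations for the branch above, under the apparent convention
 + * (suggested by the sixth_S/twelveth_S factors in the energy section
 + * below) that the stored c6 and c12 include the factors 6 and 12:
 + *   FrLJ6  = c6*r^-6,  FrLJ12 = c12*r^-12
 + *   F*r    = FrLJ12 - FrLJ6
 + *   V_LJ   = FrLJ12/12 - FrLJ6/6   (plus the cut-off shift terms)
 + */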
 +
 +#ifdef LJ_COMB_LB
 +    sir_S0      = gmx_mul_pr(sig_S0, rinv_S0);
 +#ifndef HALF_LJ
 +    sir_S2      = gmx_mul_pr(sig_S2, rinv_S2);
 +#endif
 +    sir2_S0     = gmx_mul_pr(sir_S0, sir_S0);
 +#ifndef HALF_LJ
 +    sir2_S2     = gmx_mul_pr(sir_S2, sir_S2);
 +#endif
 +    sir6_S0     = gmx_mul_pr(sir2_S0, gmx_mul_pr(sir2_S0, sir2_S0));
 +#ifdef EXCL_FORCES
 +    sir6_S0     = gmx_blendzero_pr(sir6_S0, interact_S0);
 +#endif
 +#ifndef HALF_LJ
 +    sir6_S2     = gmx_mul_pr(sir2_S2, gmx_mul_pr(sir2_S2, sir2_S2));
 +#ifdef EXCL_FORCES
 +    sir6_S2     = gmx_blendzero_pr(sir6_S2, interact_S2);
 +#endif
 +#endif
 +#ifdef VDW_CUTOFF_CHECK
 +    sir6_S0     = gmx_blendzero_pr(sir6_S0, wco_vdw_S0);
 +#ifndef HALF_LJ
 +    sir6_S2     = gmx_blendzero_pr(sir6_S2, wco_vdw_S2);
 +#endif
 +#endif
 +    FrLJ6_S0    = gmx_mul_pr(eps_S0, sir6_S0);
 +#ifndef HALF_LJ
 +    FrLJ6_S2    = gmx_mul_pr(eps_S2, sir6_S2);
 +#endif
 +    FrLJ12_S0   = gmx_mul_pr(FrLJ6_S0, sir6_S0);
 +#ifndef HALF_LJ
 +    FrLJ12_S2   = gmx_mul_pr(FrLJ6_S2, sir6_S2);
 +#endif
 +#if defined CALC_ENERGIES
 +    /* We need C6 and C12 to calculate the LJ potential shift */
 +    sig2_S0     = gmx_mul_pr(sig_S0, sig_S0);
 +#ifndef HALF_LJ
 +    sig2_S2     = gmx_mul_pr(sig_S2, sig_S2);
 +#endif
 +    sig6_S0     = gmx_mul_pr(sig2_S0, gmx_mul_pr(sig2_S0, sig2_S0));
 +#ifndef HALF_LJ
 +    sig6_S2     = gmx_mul_pr(sig2_S2, gmx_mul_pr(sig2_S2, sig2_S2));
 +#endif
 +    c6_S0       = gmx_mul_pr(eps_S0, sig6_S0);
 +#ifndef HALF_LJ
 +    c6_S2       = gmx_mul_pr(eps_S2, sig6_S2);
 +#endif
 +    c12_S0      = gmx_mul_pr(c6_S0, sig6_S0);
 +#ifndef HALF_LJ
 +    c12_S2      = gmx_mul_pr(c6_S2, sig6_S2);
 +#endif
 +#endif
 +#endif /* LJ_COMB_LB */
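 +
 +/* Worked equations for the Lorentz-Berthelot branch above: with
 + * sig = sigma_ij and eps = epsilon_ij combined outside this loop,
 + * sir6 = (sig/r)^6, so
 + *   FrLJ6  = eps*sir6   = c6*r^-6    since c6  = eps*sig^6
 + *   FrLJ12 = FrLJ6*sir6 = c12*r^-12  since c12 = c6*sig^6,
 + * exactly the c6/c12 reconstruction used above for the shift terms.
 + */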
 +
 +#endif /* CALC_LJ */
 +
 +#ifdef CALC_ENERGIES
 +#ifdef ENERGY_GROUPS
 +    /* Extract the group pair index per j pair.
 +     * Energy groups are stored per i-cluster, so things get
 +     * complicated when the i- and j-cluster sizes don't match.
 +     */
 +    {
 +        int egps_j;
 +#if UNROLLJ == 2
 +        egps_j    = nbat->energrp[cj>>1];
 +        egp_jj[0] = ((egps_j >> ((cj & 1)*egps_jshift)) & egps_jmask)*egps_jstride;
 +#else
 +        /* We assume UNROLLI <= UNROLLJ */
 +        int jdi;
 +        for (jdi = 0; jdi < UNROLLJ/UNROLLI; jdi++)
 +        {
 +            int jj;
 +            egps_j = nbat->energrp[cj*(UNROLLJ/UNROLLI)+jdi];
 +            for (jj = 0; jj < (UNROLLI/2); jj++)
 +            {
 +                egp_jj[jdi*(UNROLLI/2)+jj] = ((egps_j >> (jj*egps_jshift)) & egps_jmask)*egps_jstride;
 +            }
 +        }
 +#endif
 +    }
 +#endif
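 +
 +/* Worked example of the extraction above (illustrative, assuming
 + * neg_2log = 2, i.e. 2-bit group indices): then egps_jshift = 4 and
 + * egps_jmask = 0xf, so each 4-bit field of egps_j packs the group
 + * indices of a pair of j atoms, and the extracted value times
 + * egps_jstride is the offset of that pair's energy buffer.
 + */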
 +
 +#ifdef CALC_COULOMB
 +#ifndef ENERGY_GROUPS
 +    vctot_S      = gmx_add_pr(vctot_S, gmx_add_pr(vcoul_S0, vcoul_S2));
 +#else
 +    add_ener_grp_halves(vcoul_S0, vctp[0], vctp[1], egp_jj);
 +    add_ener_grp_halves(vcoul_S2, vctp[2], vctp[3], egp_jj);
 +#endif
 +#endif
 +
 +#ifdef CALC_LJ
 +    /* Calculate the LJ energies */
 +    VLJ6_S0     = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S0, gmx_mul_pr(c6_S0, sh_invrc6_S)));
 +#ifndef HALF_LJ
 +    VLJ6_S2     = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S2, gmx_mul_pr(c6_S2, sh_invrc6_S)));
 +#endif
 +    VLJ12_S0    = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S0, gmx_mul_pr(c12_S0, sh_invrc12_S)));
 +#ifndef HALF_LJ
 +    VLJ12_S2    = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S2, gmx_mul_pr(c12_S2, sh_invrc12_S)));
 +#endif
 +
 +    VLJ_S0      = gmx_sub_pr(VLJ12_S0, VLJ6_S0);
 +#ifndef HALF_LJ
 +    VLJ_S2      = gmx_sub_pr(VLJ12_S2, VLJ6_S2);
 +#endif
 +    /* The potential shift should be removed for pairs beyond cut-off */
 +    VLJ_S0      = gmx_blendzero_pr(VLJ_S0, wco_vdw_S0);
 +#ifndef HALF_LJ
 +    VLJ_S2      = gmx_blendzero_pr(VLJ_S2, wco_vdw_S2);
 +#endif
 +#ifdef CHECK_EXCLS
 +    /* The potential shift should be removed for excluded pairs */
 +    VLJ_S0      = gmx_blendzero_pr(VLJ_S0, interact_S0);
 +#ifndef HALF_LJ
 +    VLJ_S2      = gmx_blendzero_pr(VLJ_S2, interact_S2);
 +#endif
 +#endif
 +#ifndef ENERGY_GROUPS
 +    Vvdwtot_S    = gmx_add_pr(Vvdwtot_S,
 +#ifndef HALF_LJ
 +                              gmx_add_pr(VLJ_S0, VLJ_S2)
 +#else
 +                              VLJ_S0
 +#endif
 +                              );
 +#else
 +    add_ener_grp_halves(VLJ_S0, vvdwtp[0], vvdwtp[1], egp_jj);
 +#ifndef HALF_LJ
 +    add_ener_grp_halves(VLJ_S2, vvdwtp[2], vvdwtp[3], egp_jj);
 +#endif
 +#endif
 +#endif /* CALC_LJ */
 +#endif /* CALC_ENERGIES */
 +
 +#ifdef CALC_LJ
 +    fscal_S0    = gmx_mul_pr(rinvsq_S0,
 +#ifdef CALC_COULOMB
 +                             gmx_add_pr(frcoul_S0,
 +#else
 +                             (
 +#endif
 +                              gmx_sub_pr(FrLJ12_S0, FrLJ6_S0)));
 +#else
 +    fscal_S0    = gmx_mul_pr(rinvsq_S0, frcoul_S0);
 +#endif /* CALC_LJ */
 +#if defined CALC_LJ && !defined HALF_LJ
 +    fscal_S2    = gmx_mul_pr(rinvsq_S2,
 +#ifdef CALC_COULOMB
 +                             gmx_add_pr(frcoul_S2,
 +#else
 +                             (
 +#endif
 +                              gmx_sub_pr(FrLJ12_S2, FrLJ6_S2)));
 +#else
 +    /* Atoms 2 and 3 don't have LJ, so only add Coulomb forces */
 +    fscal_S2    = gmx_mul_pr(rinvsq_S2, frcoul_S2);
 +#endif
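 +
 +/* For clarity, the two expansions of the fscal expressions above:
 + *   with CALC_COULOMB:    fscal = rinvsq*(frcoul + (FrLJ12 - FrLJ6))
 + *   without CALC_COULOMB: fscal = rinvsq*((FrLJ12 - FrLJ6))
 + * The lone '(' in the #else branch only balances the parenthesis
 + * opened by gmx_add_pr in the #ifdef branch.
 + */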
 +
 +    /* Calculate temporary vectorial force */
 +    tx_S0       = gmx_mul_pr(fscal_S0, dx_S0);
 +    tx_S2       = gmx_mul_pr(fscal_S2, dx_S2);
 +    ty_S0       = gmx_mul_pr(fscal_S0, dy_S0);
 +    ty_S2       = gmx_mul_pr(fscal_S2, dy_S2);
 +    tz_S0       = gmx_mul_pr(fscal_S0, dz_S0);
 +    tz_S2       = gmx_mul_pr(fscal_S2, dz_S2);
 +
 +    /* Increment i atom force */
 +    fix_S0      = gmx_add_pr(fix_S0, tx_S0);
 +    fix_S2      = gmx_add_pr(fix_S2, tx_S2);
 +    fiy_S0      = gmx_add_pr(fiy_S0, ty_S0);
 +    fiy_S2      = gmx_add_pr(fiy_S2, ty_S2);
 +    fiz_S0      = gmx_add_pr(fiz_S0, tz_S0);
 +    fiz_S2      = gmx_add_pr(fiz_S2, tz_S2);
 +
 +    /* Decrement j atom force */
 +    gmx_load_hpr(&fjx_S, f+ajx);
 +    gmx_load_hpr(&fjy_S, f+ajy);
 +    gmx_load_hpr(&fjz_S, f+ajz);
 +    gmx_store_hpr(f+ajx, gmx_sub_hpr(fjx_S, gmx_sum4_hpr(tx_S0, tx_S2)));
 +    gmx_store_hpr(f+ajy, gmx_sub_hpr(fjy_S, gmx_sum4_hpr(ty_S0, ty_S2)));
 +    gmx_store_hpr(f+ajz, gmx_sub_hpr(fjz_S, gmx_sum4_hpr(tz_S0, tz_S2)));
 +}
 +
 +#undef  rinv_ex_S0
 +#undef  rinv_ex_S2
 +
 +#undef  wco_vdw_S0
 +#undef  wco_vdw_S2
 +
 +#undef  CUTOFF_BLENDV
 +
 +#undef  EXCL_FORCES
index fec158de8f1a38377f5ce6b1509562bb379be84f,0000000000000000000000000000000000000000..d6e49e31075a24d61478672b8f777766e2642398
mode 100644,000000..100644
--- /dev/null
@@@ -1,744 -1,0 +1,680 @@@
- #define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
- #define UNROLLI    NBNXN_CPU_CLUSTER_I_SIZE
- #define UNROLLJ    (GMX_SIMD_WIDTH_HERE/2)
- /* The stride of all the atom data arrays is equal to half the SIMD width */
- #define STRIDE     (GMX_SIMD_WIDTH_HERE/2)
- #if GMX_SIMD_WIDTH_HERE == 8
- #define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
- #else
- #if GMX_SIMD_WIDTH_HERE == 16
- /* This is getting ridiculous, SIMD horizontal adds would help,
-  * but this is not performance critical (only used to reduce energies)
-  */
- #define SUM_SIMD(x) (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7]+x[8]+x[9]+x[10]+x[11]+x[12]+x[13]+x[14]+x[15])
- #else
- #error "unsupported kernel configuration"
- #endif
- #endif
- #if defined GMX_X86_AVX_256 && !defined GMX_DOUBLE
- /* AVX-256 single precision 2x(4+4) kernel,
-  * we can do half SIMD-width aligned FDV0 table loads.
-  */
- #define TAB_FDV0
- #endif
- /* Currently stride 4 for the 2 LJ parameters is hard coded */
- #define NBFP_STRIDE  4
- #include "nbnxn_kernel_simd_utils.h"
 +/*
 + * This file is part of the GROMACS molecular simulation package.
 + *
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2009, The GROMACS Development Team
 + * Copyright (c) 2012, by the GROMACS development team, led by
 + * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 + * others, as listed in the AUTHORS file in the top-level source
 + * directory and at http://www.gromacs.org.
 + *
 + * GROMACS is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2.1
 + * of the License, or (at your option) any later version.
 + *
 + * GROMACS is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with GROMACS; if not, see
 + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 + *
 + * If you want to redistribute modifications to GROMACS, please
 + * consider that scientific software is very special. Version
 + * control is crucial - bugs must be traceable. We will be happy to
 + * consider code for inclusion in the official distribution, but
 + * derived work must not be called official GROMACS. Details are found
 + * in the README & COPYING files - if they are missing, get the
 + * official version at http://www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org.
 + */
 +
 +
 +/* Half-width SIMD operations are required here.
 + * As the 4xn kernels are the "standard" kernels and some special operations
 + * are required only here, we define those in nbnxn_kernel_simd_utils_...
 + *
 + * Half-width SIMD real type:
 + * gmx_mm_hpr
 + *
 + * Half-width SIMD operations
 + * Load reals at half-width aligned pointer b into half-width SIMD register a:
 + * gmx_load_hpr(a, b)
 + * Set all entries in half-width SIMD register *a to b:
 + * gmx_set1_hpr(a, b)
 + * Load one real at b and one real at b+1 into halves of a, respectively:
 + * gmx_load1p1_pr(a, b)
 + * Load reals at half-width aligned pointer b into two halves of a:
 + * gmx_loaddh_pr(a, b)
 + * Store half-width SIMD register b into half-width aligned memory a:
 + * gmx_store_hpr(a, b)
 + * gmx_add_hpr(a, b)
 + * gmx_sub_hpr(a, b)
 + * Sum over the four half-width parts of two full-width registers:
 + * gmx_sum4_hpr(a, b)
 + * Sum the elements of the halves of each input register and return the sums:
 + * gmx_mm_transpose_sum4h_pr(a, b)
 + * Extract two half-width registers *b, *c from a full-width register a:
 + * gmx_pr_to_2hpr(a, b, c)
 + */
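 +
 +/* Reference sketch of the half-width load semantics listed above
 + * (illustrative, for a hypothetical full width of 8, half width 4):
 + *   gmx_load1p1_pr(&a, b): a = {b[0],b[0],b[0],b[0], b[1],b[1],b[1],b[1]}
 + *   gmx_loaddh_pr(&a, b):  a = {b[0],b[1],b[2],b[3], b[0],b[1],b[2],b[3]}
 + */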
 +
 +
-     int                 nbfp_stride;
 +/* All functionality defines are set here, except for:
 + * CALC_ENERGIES, ENERGY_GROUPS which are defined before.
 + * CHECK_EXCLS, which is set just before including the inner loop contents.
 + * The combination rule defines, LJ_COMB_GEOM or LJ_COMB_LB are currently
 + * set before calling the kernel function. We might want to move that
 + * to inside the n-loop and have a different combination rule for different
 + * ci's, as using no combination rule gives a ~50% performance hit for LJ.
 + */
 +
 +/* We always calculate shift forces, because it's cheap anyhow */
 +#define CALC_SHIFTFORCES
 +
 +/* Assumes all LJ parameters are identical */
 +/* #define FIX_LJ_C */
 +
 +/* The NBK_FUNC_NAME... macros below generate the whole zoo of kernel names
 + * with all combinations of electrostatics (coul), LJ combination rules (ljc)
 + * and energy calculations (ene), depending on the defines set.
 + */
 +
 +#define NBK_FUNC_NAME_C_LJC(base, coul, ljc, ene) base ## _ ## coul ## _comb_ ## ljc ## _ ## ene
 +
 +#if defined LJ_COMB_GEOM
 +#define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, geom, ene)
 +#else
 +#if defined LJ_COMB_LB
 +#define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, lb, ene)
 +#else
 +#define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, none, ene)
 +#endif
 +#endif
 +
 +#ifdef CALC_COUL_RF
 +#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, rf, ene)
 +#endif
 +#ifdef CALC_COUL_TAB
 +#ifndef VDW_CUTOFF_CHECK
 +#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, tab, ene)
 +#else
 +#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, tab_twin, ene)
 +#endif
 +#endif
 +#ifdef CALC_COUL_EWALD
 +#ifndef VDW_CUTOFF_CHECK
 +#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, ewald, ene)
 +#else
 +#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, ewald_twin, ene)
 +#endif
 +#endif
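 +
 +/* Example expansion, following directly from the defines above: with
 + * CALC_COUL_RF and LJ_COMB_GEOM set,
 + * NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn, ener) expands to
 + * nbnxn_kernel_simd_2xnn_rf_comb_geom_ener.
 + */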
 +
 +static void
 +#ifndef CALC_ENERGIES
 +NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn, noener)
 +#else
 +#ifndef ENERGY_GROUPS
 +NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn, ener)
 +#else
 +NBK_FUNC_NAME(nbnxn_kernel_simd_2xnn, energrp)
 +#endif
 +#endif
 +#undef NBK_FUNC_NAME
 +#undef NBK_FUNC_NAME_C
 +#undef NBK_FUNC_NAME_C_LJC
 +(const nbnxn_pairlist_t     *nbl,
 + const nbnxn_atomdata_t     *nbat,
 + const interaction_const_t  *ic,
 + rvec                       *shift_vec,
 + real                       *f
 +#ifdef CALC_SHIFTFORCES
 + ,
 + real                       *fshift
 +#endif
 +#ifdef CALC_ENERGIES
 + ,
 + real                       *Vvdw,
 + real                       *Vc
 +#endif
 +)
 +{
 +    const nbnxn_ci_t   *nbln;
 +    const nbnxn_cj_t   *l_cj;
 +    const int          *type;
 +    const real         *q;
 +    const real         *shiftvec;
 +    const real         *x;
 +    const real         *nbfp0, *nbfp1, *nbfp2 = NULL, *nbfp3 = NULL;
 +    real                facel;
 +    real               *nbfp_ptr;
-     unsigned   *excl_filter;
- #ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
-     gmx_epi32  filter_S0, filter_S2;
- #else
-     gmx_mm_pr  filter_S0, filter_S2;
- #endif
 +    int                 n, ci, ci_sh;
 +    int                 ish, ish3;
 +    gmx_bool            do_LJ, half_LJ, do_coul;
 +    int                 sci, scix, sciy, sciz, sci2;
 +    int                 cjind0, cjind1, cjind;
 +    int                 ip, jp;
 +
 +#ifdef ENERGY_GROUPS
 +    int         Vstride_i;
 +    int         egps_ishift, egps_imask;
 +    int         egps_jshift, egps_jmask, egps_jstride;
 +    int         egps_i;
 +    real       *vvdwtp[UNROLLI];
 +    real       *vctp[UNROLLI];
 +#endif
 +
 +    gmx_mm_pr  shX_S;
 +    gmx_mm_pr  shY_S;
 +    gmx_mm_pr  shZ_S;
 +    gmx_mm_pr  ix_S0, iy_S0, iz_S0;
 +    gmx_mm_pr  ix_S2, iy_S2, iz_S2;
 +    gmx_mm_pr  fix_S0, fiy_S0, fiz_S0;
 +    gmx_mm_pr  fix_S2, fiy_S2, fiz_S2;
 +    /* We use an i-force SIMD register width of 4 */
 +    /* The pr4 stuff is defined in nbnxn_kernel_simd_utils.h */
 +    gmx_mm_pr4 fix_S, fiy_S, fiz_S;
 +
 +    gmx_mm_pr  diagonal_jmi_S;
 +#if UNROLLI == UNROLLJ
 +    gmx_mm_pb  diagonal_mask_S0, diagonal_mask_S2;
 +#else
 +    gmx_mm_pb  diagonal_mask0_S0, diagonal_mask0_S2;
 +    gmx_mm_pb  diagonal_mask1_S0, diagonal_mask1_S2;
 +#endif
 +
- #if NBFP_STRIDE == 2
-     nbfp_ptr    = nbat->nbfp;
- #else
- #if NBFP_STRIDE == 4
-     nbfp_ptr    = nbat->nbfp_s4;
- #else
- #error "Only NBFP_STRIDE 2 and 4 are currently supported"
- #endif
- #endif
-     nbfp_stride = NBFP_STRIDE;
++    unsigned      *exclusion_filter;
++    gmx_exclfilter filter_S0, filter_S2;
 +
 +    gmx_mm_pr  zero_S = gmx_set1_pr(0);
 +
 +    gmx_mm_pr  one_S = gmx_set1_pr(1.0);
 +    gmx_mm_pr  iq_S0 = gmx_setzero_pr();
 +    gmx_mm_pr  iq_S2 = gmx_setzero_pr();
 +    gmx_mm_pr  mrc_3_S;
 +#ifdef CALC_ENERGIES
 +    gmx_mm_pr  hrc_3_S, moh_rc_S;
 +#endif
 +
 +#ifdef CALC_COUL_TAB
 +    /* Coulomb table variables */
 +    gmx_mm_pr   invtsp_S;
 +    const real *tab_coul_F;
 +#ifndef TAB_FDV0
 +    const real *tab_coul_V;
 +#endif
 +    int        ti0_array[2*GMX_SIMD_WIDTH_HERE], *ti0;
 +    int        ti2_array[2*GMX_SIMD_WIDTH_HERE], *ti2;
 +#ifdef CALC_ENERGIES
 +    gmx_mm_pr  mhalfsp_S;
 +#endif
 +#endif
 +
 +#ifdef CALC_COUL_EWALD
 +    gmx_mm_pr beta2_S, beta_S;
 +#endif
 +
 +#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
 +    gmx_mm_pr  sh_ewald_S;
 +#endif
 +
 +#ifdef LJ_COMB_LB
 +    const real *ljc;
 +
 +    gmx_mm_pr   hsig_i_S0, seps_i_S0;
 +    gmx_mm_pr   hsig_i_S2, seps_i_S2;
 +#else
 +#ifdef FIX_LJ_C
 +    real        pvdw_array[2*UNROLLI*UNROLLJ+GMX_SIMD_WIDTH_HERE];
 +    real       *pvdw_c6, *pvdw_c12;
 +    gmx_mm_pr   c6_S0, c12_S0;
 +    gmx_mm_pr   c6_S2, c12_S2;
 +#endif
 +
 +#ifdef LJ_COMB_GEOM
 +    const real *ljc;
 +
 +    gmx_mm_pr   c6s_S0, c12s_S0;
 +    gmx_mm_pr   c6s_S1, c12s_S1;
 +    gmx_mm_pr   c6s_S2 = gmx_setzero_pr(), c12s_S2 = gmx_setzero_pr();
 +    gmx_mm_pr   c6s_S3 = gmx_setzero_pr(), c12s_S3 = gmx_setzero_pr();
 +#endif
 +#endif /* LJ_COMB_LB */
 +
 +    gmx_mm_pr  vctot_S, Vvdwtot_S;
 +    gmx_mm_pr  sixth_S, twelveth_S;
 +
 +    gmx_mm_pr  avoid_sing_S;
 +    gmx_mm_pr  rc2_S;
 +#ifdef VDW_CUTOFF_CHECK
 +    gmx_mm_pr  rcvdw2_S;
 +#endif
 +
 +#ifdef CALC_ENERGIES
 +    gmx_mm_pr  sh_invrc6_S, sh_invrc12_S;
 +
 +    /* cppcheck-suppress unassignedVariable */
 +    real       tmpsum_array[2*GMX_SIMD_WIDTH_HERE], *tmpsum;
 +#endif
 +#ifdef CALC_SHIFTFORCES
 +    /* cppcheck-suppress unassignedVariable */
 +    real       shf_array[2*GMX_SIMD_WIDTH_HERE], *shf;
 +#endif
 +
 +    int ninner;
 +
 +#ifdef COUNT_PAIRS
 +    int npair = 0;
 +#endif
 +
 +#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
 +    ljc = nbat->lj_comb;
 +#else
 +    /* No combination rule used */
-     /* Load masks for topology exclusion masking */
- #ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
- #define FILTER_STRIDE  (GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE)
- #else
- #ifdef GMX_DOUBLE
- #define FILTER_STRIDE  2
- #else
- #define FILTER_STRIDE  1
- #endif
- #endif
- #if FILTER_STRIDE == 1
-     excl_filter = nbat->simd_exclusion_filter1;
- #else
-     excl_filter = nbat->simd_exclusion_filter2;
- #endif
-     /* Here we cast the exclusion filters from unsigned * to int * or real *.
-      * Since we only check bits, the actual value they represent does not
-      * matter, as long as both filter and mask data are treated the same way.
++    nbfp_ptr    = (4 == nbfp_stride) ? nbat->nbfp_s4 : nbat->nbfp;
 +#endif
 +
 +    /* Load j-i for the first i */
 +    diagonal_jmi_S    = gmx_load_pr(nbat->simd_2xnn_diagonal_j_minus_i);
 +    /* Generate all the diagonal masks as comparison results */
 +#if UNROLLI == UNROLLJ
 +    diagonal_mask_S0  = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 +    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 +    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 +    diagonal_mask_S2  = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 +#else
 +#if 2*UNROLLI == UNROLLJ
 +    diagonal_mask0_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 +    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 +    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 +    diagonal_mask0_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 +    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 +    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 +    diagonal_mask1_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 +    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 +    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 +    diagonal_mask1_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 +#endif
 +#endif
 +
- #ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
-     filter_S0 = gmx_load_si((int *)excl_filter + 0*2*UNROLLJ*FILTER_STRIDE);
-     filter_S2 = gmx_load_si((int *)excl_filter + 1*2*UNROLLJ*FILTER_STRIDE);
- #else
-     filter_S0 = gmx_load_pr((real *)excl_filter + 0*2*UNROLLJ);
-     filter_S2 = gmx_load_pr((real *)excl_filter + 1*2*UNROLLJ);
- #endif
- #undef FILTER_STRIDE
++    /* Load masks for topology exclusion masking. filter_stride is
++       static const, so the conditional will be optimized away. */
++    if (1 == filter_stride)
++    {
++        exclusion_filter = nbat->simd_exclusion_filter1;
++    }
++    else /* (2 == filter_stride) */
++    {
++        exclusion_filter = nbat->simd_exclusion_filter2;
++    }
++
++    /* Here we cast the exclusion masks from unsigned * to int * or
++     * real *.  Since we only check bits, the actual value they
++     * represent does not matter, as long as both mask and exclusion
++     * info are treated the same way.
 +     */
-     ti0 = gmx_simd_align_int(ti0_array);
-     ti2 = gmx_simd_align_int(ti2_array);
++    filter_S0 = gmx_load_exclusion_filter(exclusion_filter + 0*2*UNROLLJ*filter_stride);
++    filter_S2 = gmx_load_exclusion_filter(exclusion_filter + 1*2*UNROLLJ*filter_stride);
 +
 +#ifdef CALC_COUL_TAB
 +    /* Generate aligned table index pointers */
- #undef UNROLLI
- #undef UNROLLJ
- #undef STRIDE
- #undef TAB_FDV0
- #undef NBFP_STRIDE
++    ti0 = prepare_table_load_buffer(ti0_array);
++    ti2 = prepare_table_load_buffer(ti2_array);
 +
 +    invtsp_S  = gmx_set1_pr(ic->tabq_scale);
 +#ifdef CALC_ENERGIES
 +    mhalfsp_S = gmx_set1_pr(-0.5/ic->tabq_scale);
 +#endif
 +
 +#ifdef TAB_FDV0
 +    tab_coul_F = ic->tabq_coul_FDV0;
 +#else
 +    tab_coul_F = ic->tabq_coul_F;
 +    tab_coul_V = ic->tabq_coul_V;
 +#endif
 +#endif /* CALC_COUL_TAB */
 +
 +#ifdef CALC_COUL_EWALD
 +    beta2_S = gmx_set1_pr(ic->ewaldcoeff*ic->ewaldcoeff);
 +    beta_S  = gmx_set1_pr(ic->ewaldcoeff);
 +#endif
 +
 +#if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
 +    sh_ewald_S = gmx_set1_pr(ic->sh_ewald);
 +#endif
 +
 +    q                   = nbat->q;
 +    type                = nbat->type;
 +    facel               = ic->epsfac;
 +    shiftvec            = shift_vec[0];
 +    x                   = nbat->x;
 +
 +    avoid_sing_S = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);
 +
 +    /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
 +    rc2_S    = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
 +#ifdef VDW_CUTOFF_CHECK
 +    rcvdw2_S = gmx_set1_pr(ic->rvdw*ic->rvdw);
 +#endif
 +
 +#ifdef CALC_ENERGIES
 +    sixth_S      = gmx_set1_pr(1.0/6.0);
 +    twelveth_S   = gmx_set1_pr(1.0/12.0);
 +
 +    sh_invrc6_S  = gmx_set1_pr(ic->sh_invrc6);
 +    sh_invrc12_S = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
 +#endif
 +
 +    mrc_3_S  = gmx_set1_pr(-2*ic->k_rf);
 +
 +#ifdef CALC_ENERGIES
 +    hrc_3_S  = gmx_set1_pr(ic->k_rf);
 +
 +    moh_rc_S = gmx_set1_pr(-ic->c_rf);
 +#endif
 +
 +#ifdef CALC_ENERGIES
 +    tmpsum   = gmx_simd_align_real(tmpsum_array);
 +#endif
 +#ifdef CALC_SHIFTFORCES
 +    shf      = gmx_simd_align_real(shf_array);
 +#endif
 +
 +#ifdef FIX_LJ_C
 +    pvdw_c6  = gmx_simd_align_real(pvdw_array);
 +    pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
 +
 +    for (jp = 0; jp < UNROLLJ; jp++)
 +    {
 +        pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
 +        pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
 +        pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
 +        pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];
 +
 +        pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
 +        pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
 +        pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
 +        pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
 +    }
 +    c6_S0            = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
 +    c6_S1            = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
 +    c6_S2            = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
 +    c6_S3            = gmx_load_pr(pvdw_c6 +3*UNROLLJ);
 +
 +    c12_S0           = gmx_load_pr(pvdw_c12+0*UNROLLJ);
 +    c12_S1           = gmx_load_pr(pvdw_c12+1*UNROLLJ);
 +    c12_S2           = gmx_load_pr(pvdw_c12+2*UNROLLJ);
 +    c12_S3           = gmx_load_pr(pvdw_c12+3*UNROLLJ);
 +#endif /* FIX_LJ_C */
 +
 +#ifdef ENERGY_GROUPS
 +    egps_ishift  = nbat->neg_2log;
 +    egps_imask   = (1<<egps_ishift) - 1;
 +    egps_jshift  = 2*nbat->neg_2log;
 +    egps_jmask   = (1<<egps_jshift) - 1;
 +    egps_jstride = (UNROLLJ>>1)*UNROLLJ;
 +    /* The major division is over i-particle energy groups; determine the stride */
 +    Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
 +#endif
 +
 +    l_cj = nbl->cj;
 +
 +    ninner = 0;
 +    for (n = 0; n < nbl->nci; n++)
 +    {
 +        nbln = &nbl->ci[n];
 +
 +        ish              = (nbln->shift & NBNXN_CI_SHIFT);
 +        ish3             = ish*3;
 +        cjind0           = nbln->cj_ind_start;
 +        cjind1           = nbln->cj_ind_end;
 +        ci               = nbln->ci;
 +        ci_sh            = (ish == CENTRAL ? ci : -1);
 +
 +        shX_S = gmx_load1_pr(shiftvec+ish3);
 +        shY_S = gmx_load1_pr(shiftvec+ish3+1);
 +        shZ_S = gmx_load1_pr(shiftvec+ish3+2);
 +
 +#if UNROLLJ <= 4
 +        sci              = ci*STRIDE;
 +        scix             = sci*DIM;
 +        sci2             = sci*2;
 +#else
 +        sci              = (ci>>1)*STRIDE;
 +        scix             = sci*DIM + (ci & 1)*(STRIDE>>1);
 +        sci2             = sci*2 + (ci & 1)*(STRIDE>>1);
 +        sci             += (ci & 1)*(STRIDE>>1);
 +#endif
 +
 +        /* We have 5 LJ/C combinations, but use only three inner loops,
 +         * as the other combinations are unlikely and/or not much faster:
 +         * inner half-LJ + C for half-LJ + C / no-LJ + C
 +         * inner LJ + C      for full-LJ + C
 +         * inner LJ          for full-LJ + no-C / half-LJ + no-C
 +         */
 +        do_LJ   = (nbln->shift & NBNXN_CI_DO_LJ(0));
 +        do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
 +        half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
 +
 +#ifdef ENERGY_GROUPS
 +        egps_i = nbat->energrp[ci];
 +        {
 +            int ia, egp_ia;
 +
 +            for (ia = 0; ia < UNROLLI; ia++)
 +            {
 +                egp_ia     = (egps_i >> (ia*egps_ishift)) & egps_imask;
 +                vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
 +                vctp[ia]   = Vc   + egp_ia*Vstride_i;
 +            }
 +        }
 +#endif
 +#if defined CALC_ENERGIES
 +#if UNROLLJ == 4
 +        if (do_coul && l_cj[nbln->cj_ind_start].cj == ci_sh)
 +#endif
 +#if UNROLLJ == 8
 +        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))
 +#endif
 +        {
 +            int  ia;
 +            real Vc_sub_self;
 +
 +#ifdef CALC_COUL_RF
 +            Vc_sub_self = 0.5*ic->c_rf;
 +#endif
 +#ifdef CALC_COUL_TAB
 +#ifdef TAB_FDV0
 +            Vc_sub_self = 0.5*tab_coul_F[2];
 +#else
 +            Vc_sub_self = 0.5*tab_coul_V[0];
 +#endif
 +#endif
 +#ifdef CALC_COUL_EWALD
 +            /* beta/sqrt(pi) */
 +            Vc_sub_self = 0.5*ic->ewaldcoeff*M_2_SQRTPI;
 +#endif
 +
 +            for (ia = 0; ia < UNROLLI; ia++)
 +            {
 +                real qi;
 +
 +                qi = q[sci+ia];
 +#ifdef ENERGY_GROUPS
 +                vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
 +#else
 +                Vc[0]
 +#endif
 +                    -= facel*qi*qi*Vc_sub_self;
 +            }
 +        }
 +#endif
 +
 +        /* Load i atom data */
 +        sciy             = scix + STRIDE;
 +        sciz             = sciy + STRIDE;
 +        gmx_load1p1_pr(&ix_S0, x+scix);
 +        gmx_load1p1_pr(&ix_S2, x+scix+2);
 +        gmx_load1p1_pr(&iy_S0, x+sciy);
 +        gmx_load1p1_pr(&iy_S2, x+sciy+2);
 +        gmx_load1p1_pr(&iz_S0, x+sciz);
 +        gmx_load1p1_pr(&iz_S2, x+sciz+2);
 +        ix_S0          = gmx_add_pr(ix_S0, shX_S);
 +        ix_S2          = gmx_add_pr(ix_S2, shX_S);
 +        iy_S0          = gmx_add_pr(iy_S0, shY_S);
 +        iy_S2          = gmx_add_pr(iy_S2, shY_S);
 +        iz_S0          = gmx_add_pr(iz_S0, shZ_S);
 +        iz_S2          = gmx_add_pr(iz_S2, shZ_S);
 +
 +        if (do_coul)
 +        {
 +            gmx_mm_pr facel_S;
 +
 +            facel_S    = gmx_set1_pr(facel);
 +
 +            gmx_load1p1_pr(&iq_S0, q+sci);
 +            gmx_load1p1_pr(&iq_S2, q+sci+2);
 +            iq_S0      = gmx_mul_pr(facel_S, iq_S0);
 +            iq_S2      = gmx_mul_pr(facel_S, iq_S2);
 +        }
 +
 +#ifdef LJ_COMB_LB
 +        gmx_load1p1_pr(&hsig_i_S0, ljc+sci2+0);
 +        gmx_load1p1_pr(&hsig_i_S2, ljc+sci2+2);
 +        gmx_load1p1_pr(&seps_i_S0, ljc+sci2+STRIDE+0);
 +        gmx_load1p1_pr(&seps_i_S2, ljc+sci2+STRIDE+2);
 +#else
 +#ifdef LJ_COMB_GEOM
 +        gmx_load1p1_pr(&c6s_S0, ljc+sci2+0);
 +        if (!half_LJ)
 +        {
 +            gmx_load1p1_pr(&c6s_S2, ljc+sci2+2);
 +        }
 +        gmx_load1p1_pr(&c12s_S0, ljc+sci2+STRIDE+0);
 +        if (!half_LJ)
 +        {
 +            gmx_load1p1_pr(&c12s_S2, ljc+sci2+STRIDE+2);
 +        }
 +#else
 +        nbfp0     = nbfp_ptr + type[sci  ]*nbat->ntype*nbfp_stride;
 +        nbfp1     = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;
 +        if (!half_LJ)
 +        {
 +            nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
 +            nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;
 +        }
 +#endif
 +#endif
 +
 +        /* Zero the potential energy for this list */
 +        Vvdwtot_S        = gmx_setzero_pr();
 +        vctot_S          = gmx_setzero_pr();
 +
 +        /* Clear i atom forces */
 +        fix_S0           = gmx_setzero_pr();
 +        fix_S2           = gmx_setzero_pr();
 +        fiy_S0           = gmx_setzero_pr();
 +        fiy_S2           = gmx_setzero_pr();
 +        fiz_S0           = gmx_setzero_pr();
 +        fiz_S2           = gmx_setzero_pr();
 +
 +        cjind = cjind0;
 +
 +        /* Currently all kernels use (at least half) LJ */
 +#define CALC_LJ
 +        if (half_LJ)
 +        {
 +#define CALC_COULOMB
 +#define HALF_LJ
 +#define CHECK_EXCLS
 +            while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
 +            {
 +#include "nbnxn_kernel_simd_2xnn_inner.h"
 +                cjind++;
 +            }
 +#undef CHECK_EXCLS
 +            for (; (cjind < cjind1); cjind++)
 +            {
 +#include "nbnxn_kernel_simd_2xnn_inner.h"
 +            }
 +#undef HALF_LJ
 +#undef CALC_COULOMB
 +        }
 +        else if (do_coul)
 +        {
 +#define CALC_COULOMB
 +#define CHECK_EXCLS
 +            while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
 +            {
 +#include "nbnxn_kernel_simd_2xnn_inner.h"
 +                cjind++;
 +            }
 +#undef CHECK_EXCLS
 +            for (; (cjind < cjind1); cjind++)
 +            {
 +#include "nbnxn_kernel_simd_2xnn_inner.h"
 +            }
 +#undef CALC_COULOMB
 +        }
 +        else
 +        {
 +#define CHECK_EXCLS
 +            while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
 +            {
 +#include "nbnxn_kernel_simd_2xnn_inner.h"
 +                cjind++;
 +            }
 +#undef CHECK_EXCLS
 +            for (; (cjind < cjind1); cjind++)
 +            {
 +#include "nbnxn_kernel_simd_2xnn_inner.h"
 +            }
 +        }
 +#undef CALC_LJ
 +        ninner += cjind1 - cjind0;
 +
 +        /* Add accumulated i-forces to the force array */
 +        fix_S = gmx_mm_transpose_sum4h_pr(fix_S0, fix_S2);
 +        gmx_store_pr4(f+scix, gmx_add_pr4(fix_S, gmx_load_pr4(f+scix)));
 +
 +        fiy_S = gmx_mm_transpose_sum4h_pr(fiy_S0, fiy_S2);
 +        gmx_store_pr4(f+sciy, gmx_add_pr4(fiy_S, gmx_load_pr4(f+sciy)));
 +
 +        fiz_S = gmx_mm_transpose_sum4h_pr(fiz_S0, fiz_S2);
 +        gmx_store_pr4(f+sciz, gmx_add_pr4(fiz_S, gmx_load_pr4(f+sciz)));
 +
 +#ifdef CALC_SHIFTFORCES
 +        gmx_store_pr4(shf, fix_S);
 +        fshift[ish3+0] += SUM_SIMD4(shf);
 +        gmx_store_pr4(shf, fiy_S);
 +        fshift[ish3+1] += SUM_SIMD4(shf);
 +        gmx_store_pr4(shf, fiz_S);
 +        fshift[ish3+2] += SUM_SIMD4(shf);
 +#endif
 +
 +#ifdef CALC_ENERGIES
 +        if (do_coul)
 +        {
 +            gmx_store_pr(tmpsum, vctot_S);
 +            *Vc += SUM_SIMD(tmpsum);
 +        }
 +
 +        gmx_store_pr(tmpsum, Vvdwtot_S);
 +        *Vvdw += SUM_SIMD(tmpsum);
 +#endif
 +
 +        /* Outer loop uses 6 flops/iteration */
 +    }
 +
 +#ifdef COUNT_PAIRS
 +    printf("atom pairs %d\n", npair);
 +#endif
 +}
 +
 +
 +#undef CALC_SHIFTFORCES
index 3faedd9b4538e48fa6727e4fb9a46127da2bafe0,0000000000000000000000000000000000000000..ca07b5bc3babcbdae8fecb078337ee09bb088c3e
mode 100644,000000..100644
--- /dev/null
@@@ -1,335 -1,0 +1,382 @@@
 +/*
 + * This file is part of the GROMACS molecular simulation package.
 + *
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + * Copyright (c) 2012, by the GROMACS development team, led by
 + * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 + * others, as listed in the AUTHORS file in the top-level source
 + * directory and at http://www.gromacs.org.
 + *
 + * GROMACS is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2.1
 + * of the License, or (at your option) any later version.
 + *
 + * GROMACS is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with GROMACS; if not, see
 + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 + *
 + * If you want to redistribute modifications to GROMACS, please
 + * consider that scientific software is very special. Version
 + * control is crucial - bugs must be traceable. We will be happy to
 + * consider code for inclusion in the official distribution, but
 + * derived work must not be called official GROMACS. Details are found
 + * in the README & COPYING files - if they are missing, get the
 + * official version at http://www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "typedefs.h"
 +#include "vec.h"
 +#include "smalloc.h"
 +#include "force.h"
 +#include "gmx_omp_nthreads.h"
 +#include "../nbnxn_consts.h"
 +#include "nbnxn_kernel_common.h"
 +
 +#ifdef GMX_NBNXN_SIMD_4XN
 +
 +#ifdef GMX_NBNXN_HALF_WIDTH_SIMD
 +#define GMX_USE_HALF_WIDTH_SIMD_HERE
 +#endif
 +#include "gmx_simd_macros.h"
 +#include "gmx_simd_vec.h"
 +
 +#include "nbnxn_kernel_simd_4xn.h"
 +
 +#if !(GMX_SIMD_WIDTH_HERE == 2 || GMX_SIMD_WIDTH_HERE == 4 || GMX_SIMD_WIDTH_HERE == 8)
 +#error "unsupported SIMD width"
 +#endif
 +
++#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
++
++#define UNROLLI    NBNXN_CPU_CLUSTER_I_SIZE
++#define UNROLLJ    GMX_SIMD_WIDTH_HERE
++
++/* The stride of all the atom data arrays is max(UNROLLI,UNROLLJ) */
++#if GMX_SIMD_WIDTH_HERE >= UNROLLI
++#define STRIDE     GMX_SIMD_WIDTH_HERE
++#else
++#define STRIDE     UNROLLI
++#endif
++
++#if GMX_SIMD_WIDTH_HERE == 2
++#define SUM_SIMD(x)  (x[0]+x[1])
++#else
++#if GMX_SIMD_WIDTH_HERE == 4
++#define SUM_SIMD(x)  SUM_SIMD4(x)
++#else
++#if GMX_SIMD_WIDTH_HERE == 8
++#define SUM_SIMD(x)  (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
++#else
++#error "unsupported kernel configuration"
++#endif
++#endif
++#endif
++
++
++#include "nbnxn_kernel_simd_utils.h"
++
++static inline void
++gmx_load_simd_4xn_interactions(int            excl,
++                               gmx_exclfilter filter_S0,
++                               gmx_exclfilter filter_S1,
++                               gmx_exclfilter filter_S2,
++                               gmx_exclfilter filter_S3,
++                               gmx_mm_pb     *interact_S0,
++                               gmx_mm_pb     *interact_S1,
++                               gmx_mm_pb     *interact_S2,
++                               gmx_mm_pb     *interact_S3)
++{
++    /* Load integer interaction mask */
++    gmx_exclfilter mask_pr_S = gmx_load1_exclfilter(excl);
++    *interact_S0  = gmx_checkbitmask_pb(mask_pr_S, filter_S0);
++    *interact_S1  = gmx_checkbitmask_pb(mask_pr_S, filter_S1);
++    *interact_S2  = gmx_checkbitmask_pb(mask_pr_S, filter_S2);
++    *interact_S3  = gmx_checkbitmask_pb(mask_pr_S, filter_S3);
++}
 +
 +/* Include all flavors of the SSE or AVX 4xN kernel loops */
 +
 +/* Analytical reaction-field kernels */
 +#define CALC_COUL_RF
 +
 +#include "nbnxn_kernel_simd_4xn_includes.h"
 +
 +#undef CALC_COUL_RF
 +
 +/* Tabulated exclusion interaction electrostatics kernels */
 +#define CALC_COUL_TAB
 +
 +/* Single cut-off: rcoulomb = rvdw */
 +#include "nbnxn_kernel_simd_4xn_includes.h"
 +
 +/* Twin cut-off: rcoulomb >= rvdw */
 +#define VDW_CUTOFF_CHECK
 +#include "nbnxn_kernel_simd_4xn_includes.h"
 +#undef VDW_CUTOFF_CHECK
 +
 +#undef CALC_COUL_TAB
 +
 +/* Analytical Ewald exclusion interaction electrostatics kernels */
 +#define CALC_COUL_EWALD
 +
 +/* Single cut-off: rcoulomb = rvdw */
 +#include "nbnxn_kernel_simd_4xn_includes.h"
 +
 +/* Twin cut-off: rcoulomb >= rvdw */
 +#define VDW_CUTOFF_CHECK
 +#include "nbnxn_kernel_simd_4xn_includes.h"
 +#undef VDW_CUTOFF_CHECK
 +
 +#undef CALC_COUL_EWALD
 +
 +
 +typedef void (*p_nbk_func_ener)(const nbnxn_pairlist_t     *nbl,
 +                                const nbnxn_atomdata_t     *nbat,
 +                                const interaction_const_t  *ic,
 +                                rvec                       *shift_vec,
 +                                real                       *f,
 +                                real                       *fshift,
 +                                real                       *Vvdw,
 +                                real                       *Vc);
 +
 +typedef void (*p_nbk_func_noener)(const nbnxn_pairlist_t     *nbl,
 +                                  const nbnxn_atomdata_t     *nbat,
 +                                  const interaction_const_t  *ic,
 +                                  rvec                       *shift_vec,
 +                                  real                       *f,
 +                                  real                       *fshift);
 +
 +enum {
 +    coultRF, coultTAB, coultTAB_TWIN, coultEWALD, coultEWALD_TWIN, coultNR
 +};
 +
 +#define NBK_FN(elec, ljcomb) nbnxn_kernel_simd_4xn_ ## elec ## _comb_ ## ljcomb ## _ener
 +static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] =
 +{ { NBK_FN(rf, geom), NBK_FN(rf, lb), NBK_FN(rf, none) },
 +  { NBK_FN(tab, geom), NBK_FN(tab, lb), NBK_FN(tab, none) },
 +  { NBK_FN(tab_twin, geom), NBK_FN(tab_twin, lb), NBK_FN(tab_twin, none) },
 +  { NBK_FN(ewald, geom), NBK_FN(ewald, lb), NBK_FN(ewald, none) },
 +  { NBK_FN(ewald_twin, geom), NBK_FN(ewald_twin, lb), NBK_FN(ewald_twin, none) } };
 +#undef NBK_FN
 +
 +#define NBK_FN(elec, ljcomb) nbnxn_kernel_simd_4xn_ ## elec ## _comb_ ## ljcomb ## _energrp
 +static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] =
 +{ { NBK_FN(rf, geom), NBK_FN(rf, lb), NBK_FN(rf, none) },
 +  { NBK_FN(tab, geom), NBK_FN(tab, lb), NBK_FN(tab, none) },
 +  { NBK_FN(tab_twin, geom), NBK_FN(tab_twin, lb), NBK_FN(tab_twin, none) },
 +  { NBK_FN(ewald, geom), NBK_FN(ewald, lb), NBK_FN(ewald, none) },
 +  { NBK_FN(ewald_twin, geom), NBK_FN(ewald_twin, lb), NBK_FN(ewald_twin, none) } };
 +#undef NBK_FN
 +
 +#define NBK_FN(elec, ljcomb) nbnxn_kernel_simd_4xn_ ## elec ## _comb_ ## ljcomb ## _noener
 +static p_nbk_func_noener p_nbk_noener[coultNR][ljcrNR] =
 +{ { NBK_FN(rf, geom), NBK_FN(rf, lb), NBK_FN(rf, none) },
 +  { NBK_FN(tab, geom), NBK_FN(tab, lb), NBK_FN(tab, none) },
 +  { NBK_FN(tab_twin, geom), NBK_FN(tab_twin, lb), NBK_FN(tab_twin, none) },
 +  { NBK_FN(ewald, geom), NBK_FN(ewald, lb), NBK_FN(ewald, none) },
 +  { NBK_FN(ewald_twin, geom), NBK_FN(ewald_twin, lb), NBK_FN(ewald_twin, none) } };
 +#undef NBK_FN
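 +
 +/* Dispatch example, derivable from the tables above (assuming ljcrGEOM
 + * indexes the first, geometric column): reaction-field electrostatics
 + * with energies dispatches via
 + *   p_nbk_ener[coultRF][ljcrGEOM] == nbnxn_kernel_simd_4xn_rf_comb_geom_ener
 + */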
 +
 +
 +static void reduce_group_energies(int ng, int ng_2log,
 +                                  const real *VSvdw, const real *VSc,
 +                                  real *Vvdw, real *Vc)
 +{
 +    const int unrollj      = GMX_SIMD_WIDTH_HERE;
 +    const int unrollj_half = unrollj/2;
 +    int       ng_p2, i, j, j0, j1, c, s;
 +
 +    ng_p2 = (1<<ng_2log);
 +
 +    /* The size of the x86 SIMD energy group buffer array is:
 +     * ng*ng*ng_p2*unrollj_half*simd_width
 +     */
 +    for (i = 0; i < ng; i++)
 +    {
 +        for (j = 0; j < ng; j++)
 +        {
 +            Vvdw[i*ng+j] = 0;
 +            Vc[i*ng+j]   = 0;
 +        }
 +
 +        for (j1 = 0; j1 < ng; j1++)
 +        {
 +            for (j0 = 0; j0 < ng; j0++)
 +            {
 +                c = ((i*ng + j1)*ng_p2 + j0)*unrollj_half*unrollj;
 +                for (s = 0; s < unrollj_half; s++)
 +                {
 +                    Vvdw[i*ng+j0] += VSvdw[c+0];
 +                    Vvdw[i*ng+j1] += VSvdw[c+1];
 +                    Vc  [i*ng+j0] += VSc  [c+0];
 +                    Vc  [i*ng+j1] += VSc  [c+1];
 +                    c             += unrollj + 2;
 +                }
 +            }
 +        }
 +    }
 +}
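 +
 +/* Layout note for the reduction above: each (i, j1, j0) block of
 + * VSvdw/VSc holds unrollj_half pairs of partial sums, one for the j0
 + * half-register and one for the j1 half-register; the loops fold these
 + * back into the plain ng x ng Vvdw and Vc matrices.
 + */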
 +
 +#endif /* GMX_NBNXN_SIMD_4XN */
 +
 +void
 +nbnxn_kernel_simd_4xn(nbnxn_pairlist_set_t       *nbl_list,
 +                      const nbnxn_atomdata_t     *nbat,
 +                      const interaction_const_t  *ic,
 +                      int                         ewald_excl,
 +                      rvec                       *shift_vec,
 +                      int                         force_flags,
 +                      int                         clearF,
 +                      real                       *fshift,
 +                      real                       *Vc,
 +                      real                       *Vvdw)
 +#ifdef GMX_NBNXN_SIMD_4XN
 +{
 +    int                nnbl;
 +    nbnxn_pairlist_t **nbl;
 +    int                coult;
 +    int                nb;
 +
 +    nnbl = nbl_list->nnbl;
 +    nbl  = nbl_list->nbl;
 +
 +    if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
 +    {
 +        coult = coultRF;
 +    }
 +    else
 +    {
 +        if (ewald_excl == ewaldexclTable)
 +        {
 +            if (ic->rcoulomb == ic->rvdw)
 +            {
 +                coult = coultTAB;
 +            }
 +            else
 +            {
 +                coult = coultTAB_TWIN;
 +            }
 +        }
 +        else
 +        {
 +            if (ic->rcoulomb == ic->rvdw)
 +            {
 +                coult = coultEWALD;
 +            }
 +            else
 +            {
 +                coult = coultEWALD_TWIN;
 +            }
 +        }
 +    }
 +
 +#pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded))
 +    for (nb = 0; nb < nnbl; nb++)
 +    {
 +        nbnxn_atomdata_output_t *out;
 +        real                    *fshift_p;
 +
 +        out = &nbat->out[nb];
 +
 +        if (clearF == enbvClearFYes)
 +        {
 +            clear_f(nbat, nb, out->f);
 +        }
 +
 +        if ((force_flags & GMX_FORCE_VIRIAL) && nnbl == 1)
 +        {
 +            fshift_p = fshift;
 +        }
 +        else
 +        {
 +            fshift_p = out->fshift;
 +
 +            if (clearF == enbvClearFYes)
 +            {
 +                clear_fshift(fshift_p);
 +            }
 +        }
 +
 +        /* With Ewald-type electrostatics, the forces on excluded atom pairs
 +         * should not contribute to the virial sum. The exclusion forces
 +         * are not calculated in the energy kernels, but they are in _noener.
 +         */
 +        if (!((force_flags & GMX_FORCE_ENERGY) ||
 +              (EEL_FULL(ic->eeltype) && (force_flags & GMX_FORCE_VIRIAL))))
 +        {
 +            /* Don't calculate energies */
 +            p_nbk_noener[coult][nbat->comb_rule](nbl[nb], nbat,
 +                                                 ic,
 +                                                 shift_vec,
 +                                                 out->f,
 +                                                 fshift_p);
 +        }
 +        else if (out->nV == 1 || !(force_flags & GMX_FORCE_ENERGY))
 +        {
 +            /* No energy groups */
 +            out->Vvdw[0] = 0;
 +            out->Vc[0]   = 0;
 +
 +            p_nbk_ener[coult][nbat->comb_rule](nbl[nb], nbat,
 +                                               ic,
 +                                               shift_vec,
 +                                               out->f,
 +                                               fshift_p,
 +                                               out->Vvdw,
 +                                               out->Vc);
 +        }
 +        else
 +        {
 +            /* Calculate energy group contributions */
 +            int i;
 +
 +            for (i = 0; i < out->nVS; i++)
 +            {
 +                out->VSvdw[i] = 0;
 +            }
 +            for (i = 0; i < out->nVS; i++)
 +            {
 +                out->VSc[i] = 0;
 +            }
 +
 +            p_nbk_energrp[coult][nbat->comb_rule](nbl[nb], nbat,
 +                                                  ic,
 +                                                  shift_vec,
 +                                                  out->f,
 +                                                  fshift_p,
 +                                                  out->VSvdw,
 +                                                  out->VSc);
 +
 +            reduce_group_energies(nbat->nenergrp, nbat->neg_2log,
 +                                  out->VSvdw, out->VSc,
 +                                  out->Vvdw, out->Vc);
 +        }
 +    }
 +
 +    if (force_flags & GMX_FORCE_ENERGY)
 +    {
 +        reduce_energies_over_lists(nbat, nnbl, Vvdw, Vc);
 +    }
 +}
 +#else
 +{
 +    gmx_incons("nbnxn_kernel_simd_4xn called while GROMACS was configured without 4xN SIMD kernels enabled");
 +}
 +#endif
index 241c180c901bb8709eb12e8a290e4c40d886c543,0000000000000000000000000000000000000000..c0e5e01bd87300daf51644ee95b2f78e2db06271
mode 100644,000000..100644
--- /dev/null
@@@ -1,1001 -1,0 +1,977 @@@
- #ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
-     {
-         /* Load integer topology exclusion interaction mask */
-         gmx_epi32 mask_pr_S = gmx_set1_epi32(l_cj[cjind].excl);
-         interact_S0  = gmx_checkbitmask_epi32(mask_pr_S, filter_S0);
-         interact_S1  = gmx_checkbitmask_epi32(mask_pr_S, filter_S1);
-         interact_S2  = gmx_checkbitmask_epi32(mask_pr_S, filter_S2);
-         interact_S3  = gmx_checkbitmask_epi32(mask_pr_S, filter_S3);
-     }
- #else
- #ifdef GMX_SIMD_HAVE_CHECKBITMASK_PR
-     {
-         /* Integer mask set, cast to real and real mask operations */
-         gmx_mm_pr mask_pr_S = gmx_castsi_pr(gmx_set1_epi32(l_cj[cjind].excl));
-         interact_S0  = gmx_checkbitmask_pr(mask_pr_S, filter_S0);
-         interact_S1  = gmx_checkbitmask_pr(mask_pr_S, filter_S1);
-         interact_S2  = gmx_checkbitmask_pr(mask_pr_S, filter_S2);
-         interact_S3  = gmx_checkbitmask_pr(mask_pr_S, filter_S3);
-     }
- #else
- #error "No SIMD bitmask operation available"
- #endif
- #endif
 +/*
 + * This file is part of the GROMACS molecular simulation package.
 + *
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2009, The GROMACS Development Team
 + * Copyright (c) 2012, by the GROMACS development team, led by
 + * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 + * others, as listed in the AUTHORS file in the top-level source
 + * directory and at http://www.gromacs.org.
 + *
 + * GROMACS is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2.1
 + * of the License, or (at your option) any later version.
 + *
 + * GROMACS is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with GROMACS; if not, see
 + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 + *
 + * If you want to redistribute modifications to GROMACS, please
 + * consider that scientific software is very special. Version
 + * control is crucial - bugs must be traceable. We will be happy to
 + * consider code for inclusion in the official distribution, but
 + * derived work must not be called official GROMACS. Details are found
 + * in the README & COPYING files - if they are missing, get the
 + * official version at http://www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org.
 + */
 +
 +/* This is the innermost loop contents for the 4 x N atom SIMD kernel.
 + * This flavor of the kernel calculates interactions of 4 i-atoms
 + * with N j-atoms stored in N wide SIMD registers.
 + */
 +
 +
 +/* When calculating RF or Ewald interactions we calculate the electrostatic
 + * forces on excluded atom pairs here in the non-bonded loops.
 + * But when energies and/or the virial are required, we calculate them
 + * separately, as it is then easier to separate the energy and virial
 + * contributions.
 + */
 +#if defined CHECK_EXCLS && defined CALC_COULOMB
 +#define EXCL_FORCES
 +#endif
 +
 +/* Without exclusions and energies we only need to mask the cut-off,
 + * this can be faster when we have defined gmx_blendv_pr, i.e. an instruction
 + * that selects from two SIMD registers based on the contents of a third.
 + */
 +#if !(defined CHECK_EXCLS || defined CALC_ENERGIES) && defined GMX_SIMD_HAVE_BLENDV
 +/* With RF and tabulated Coulomb we replace cmp+and with sub+blendv.
 + * With gcc this is slower, except for RF on Sandy Bridge.
 + * Tested with gcc 4.6.2, 4.6.3 and 4.7.1.
 + */
 +#if (defined CALC_COUL_RF || defined CALC_COUL_TAB) && (!defined __GNUC__ || (defined CALC_COUL_RF && defined GMX_X86_AVX_256))
 +#define NBNXN_CUTOFF_USE_BLENDV
 +#endif
 +/* With analytical Ewald we replace cmp+and+and with sub+blendv+blendv.
 + * This is only faster with icc on Sandy Bridge (PS kernel slower than gcc 4.7).
 + * Tested with icc 13.
 + */
 +#if defined CALC_COUL_EWALD && defined __INTEL_COMPILER && defined GMX_X86_AVX_256
 +#define NBNXN_CUTOFF_USE_BLENDV
 +#endif
 +#endif
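 +
 +/* Illustrative scalar form of the two cut-off schemes compared above:
 + *   cmp+and:    wco = (rsq < rc2);  x = wco ? x : 0;
 + *   sub+blendv: select between x and 0 on the sign of a subtraction
 + *               (presumably rsq - rc2) in a single blend instruction.
 + */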
 +
 +{
 +    int        cj, aj, ajx, ajy, ajz;
 +
 +#ifdef ENERGY_GROUPS
 +    /* Energy group indices for two atoms packed into one int */
 +    int        egp_jj[UNROLLJ/2];
 +#endif
 +
 +#ifdef CHECK_EXCLS
 +    /* Interaction (non-exclusion) mask of all 1's or 0's */
 +    gmx_mm_pb  interact_S0;
 +    gmx_mm_pb  interact_S1;
 +    gmx_mm_pb  interact_S2;
 +    gmx_mm_pb  interact_S3;
 +#endif
 +
 +    gmx_mm_pr  jx_S, jy_S, jz_S;
 +    gmx_mm_pr  dx_S0, dy_S0, dz_S0;
 +    gmx_mm_pr  dx_S1, dy_S1, dz_S1;
 +    gmx_mm_pr  dx_S2, dy_S2, dz_S2;
 +    gmx_mm_pr  dx_S3, dy_S3, dz_S3;
 +    gmx_mm_pr  tx_S0, ty_S0, tz_S0;
 +    gmx_mm_pr  tx_S1, ty_S1, tz_S1;
 +    gmx_mm_pr  tx_S2, ty_S2, tz_S2;
 +    gmx_mm_pr  tx_S3, ty_S3, tz_S3;
 +    gmx_mm_pr  rsq_S0, rinv_S0, rinvsq_S0;
 +    gmx_mm_pr  rsq_S1, rinv_S1, rinvsq_S1;
 +    gmx_mm_pr  rsq_S2, rinv_S2, rinvsq_S2;
 +    gmx_mm_pr  rsq_S3, rinv_S3, rinvsq_S3;
 +#ifndef NBNXN_CUTOFF_USE_BLENDV
 +    /* wco: within cut-off, mask of all 1's or 0's */
 +    gmx_mm_pb  wco_S0;
 +    gmx_mm_pb  wco_S1;
 +    gmx_mm_pb  wco_S2;
 +    gmx_mm_pb  wco_S3;
 +#endif
 +#ifdef VDW_CUTOFF_CHECK
 +    gmx_mm_pb  wco_vdw_S0;
 +    gmx_mm_pb  wco_vdw_S1;
 +#ifndef HALF_LJ
 +    gmx_mm_pb  wco_vdw_S2;
 +    gmx_mm_pb  wco_vdw_S3;
 +#endif
 +#endif
 +#ifdef CALC_COULOMB
 +#ifdef CHECK_EXCLS
 +    /* 1/r masked with the interaction mask */
 +    gmx_mm_pr  rinv_ex_S0;
 +    gmx_mm_pr  rinv_ex_S1;
 +    gmx_mm_pr  rinv_ex_S2;
 +    gmx_mm_pr  rinv_ex_S3;
 +#endif
 +    gmx_mm_pr  jq_S;
 +    gmx_mm_pr  qq_S0;
 +    gmx_mm_pr  qq_S1;
 +    gmx_mm_pr  qq_S2;
 +    gmx_mm_pr  qq_S3;
 +#ifdef CALC_COUL_TAB
 +    /* The force (PME mesh force) we need to subtract from 1/r^2 */
 +    gmx_mm_pr  fsub_S0;
 +    gmx_mm_pr  fsub_S1;
 +    gmx_mm_pr  fsub_S2;
 +    gmx_mm_pr  fsub_S3;
 +#endif
 +#ifdef CALC_COUL_EWALD
 +    gmx_mm_pr  brsq_S0, brsq_S1, brsq_S2, brsq_S3;
 +    gmx_mm_pr  ewcorr_S0, ewcorr_S1, ewcorr_S2, ewcorr_S3;
 +#endif
 +
 +    /* frcoul = (1/r - fsub)*r */
 +    gmx_mm_pr  frcoul_S0;
 +    gmx_mm_pr  frcoul_S1;
 +    gmx_mm_pr  frcoul_S2;
 +    gmx_mm_pr  frcoul_S3;
 +#ifdef CALC_COUL_TAB
 +    /* For tables: r, rs=r/sp, rf=floor(rs), frac=rs-rf */
 +    gmx_mm_pr  r_S0, rs_S0, rf_S0, frac_S0;
 +    gmx_mm_pr  r_S1, rs_S1, rf_S1, frac_S1;
 +    gmx_mm_pr  r_S2, rs_S2, rf_S2, frac_S2;
 +    gmx_mm_pr  r_S3, rs_S3, rf_S3, frac_S3;
 +    /* Table index: rs truncated to an int */
 +    gmx_epi32  ti_S0, ti_S1, ti_S2, ti_S3;
 +    /* Linear force table values */
 +    gmx_mm_pr  ctab0_S0, ctab1_S0;
 +    gmx_mm_pr  ctab0_S1, ctab1_S1;
 +    gmx_mm_pr  ctab0_S2, ctab1_S2;
 +    gmx_mm_pr  ctab0_S3, ctab1_S3;
 +#ifdef CALC_ENERGIES
 +    /* Quadratic energy table value */
 +    gmx_mm_pr  ctabv_S0;
 +    gmx_mm_pr  ctabv_S1;
 +    gmx_mm_pr  ctabv_S2;
 +    gmx_mm_pr  ctabv_S3;
 +#endif
 +#endif
 +#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
 +    /* The potential (PME mesh) we need to subtract from 1/r */
 +    gmx_mm_pr  vc_sub_S0;
 +    gmx_mm_pr  vc_sub_S1;
 +    gmx_mm_pr  vc_sub_S2;
 +    gmx_mm_pr  vc_sub_S3;
 +#endif
 +#ifdef CALC_ENERGIES
 +    /* Electrostatic potential */
 +    gmx_mm_pr  vcoul_S0;
 +    gmx_mm_pr  vcoul_S1;
 +    gmx_mm_pr  vcoul_S2;
 +    gmx_mm_pr  vcoul_S3;
 +#endif
 +#endif
 +    /* The force times 1/r */
 +    gmx_mm_pr  fscal_S0;
 +    gmx_mm_pr  fscal_S1;
 +    gmx_mm_pr  fscal_S2;
 +    gmx_mm_pr  fscal_S3;
 +
 +#ifdef CALC_LJ
 +#ifdef LJ_COMB_LB
 +    /* LJ sigma_j/2 and sqrt(epsilon_j) */
 +    gmx_mm_pr  hsig_j_S, seps_j_S;
 +    /* LJ sigma_ij and epsilon_ij */
 +    gmx_mm_pr  sig_S0, eps_S0;
 +    gmx_mm_pr  sig_S1, eps_S1;
 +#ifndef HALF_LJ
 +    gmx_mm_pr  sig_S2, eps_S2;
 +    gmx_mm_pr  sig_S3, eps_S3;
 +#endif
 +#ifdef CALC_ENERGIES
 +    gmx_mm_pr  sig2_S0, sig6_S0;
 +    gmx_mm_pr  sig2_S1, sig6_S1;
 +#ifndef HALF_LJ
 +    gmx_mm_pr  sig2_S2, sig6_S2;
 +    gmx_mm_pr  sig2_S3, sig6_S3;
 +#endif
 +#endif /* LJ_COMB_LB */
 +#endif /* CALC_LJ */
 +
 +#ifdef LJ_COMB_GEOM
 +    gmx_mm_pr  c6s_j_S, c12s_j_S;
 +#endif
 +
 +#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
 +    /* Index for loading LJ parameters, complicated when interleaving */
 +    int         aj2;
 +#endif
 +
 +#ifndef FIX_LJ_C
 +    /* LJ C6 and C12 parameters, used with geometric comb. rule */
 +    gmx_mm_pr  c6_S0, c12_S0;
 +    gmx_mm_pr  c6_S1, c12_S1;
 +#ifndef HALF_LJ
 +    gmx_mm_pr  c6_S2, c12_S2;
 +    gmx_mm_pr  c6_S3, c12_S3;
 +#endif
 +#endif
 +
 +    /* Intermediate variables for LJ calculation */
 +#ifndef LJ_COMB_LB
 +    gmx_mm_pr  rinvsix_S0;
 +    gmx_mm_pr  rinvsix_S1;
 +#ifndef HALF_LJ
 +    gmx_mm_pr  rinvsix_S2;
 +    gmx_mm_pr  rinvsix_S3;
 +#endif
 +#endif
 +#ifdef LJ_COMB_LB
 +    gmx_mm_pr  sir_S0, sir2_S0, sir6_S0;
 +    gmx_mm_pr  sir_S1, sir2_S1, sir6_S1;
 +#ifndef HALF_LJ
 +    gmx_mm_pr  sir_S2, sir2_S2, sir6_S2;
 +    gmx_mm_pr  sir_S3, sir2_S3, sir6_S3;
 +#endif
 +#endif
 +
 +    gmx_mm_pr  FrLJ6_S0, FrLJ12_S0;
 +    gmx_mm_pr  FrLJ6_S1, FrLJ12_S1;
 +#ifndef HALF_LJ
 +    gmx_mm_pr  FrLJ6_S2, FrLJ12_S2;
 +    gmx_mm_pr  FrLJ6_S3, FrLJ12_S3;
 +#endif
 +#ifdef CALC_ENERGIES
 +    gmx_mm_pr  VLJ6_S0, VLJ12_S0, VLJ_S0;
 +    gmx_mm_pr  VLJ6_S1, VLJ12_S1, VLJ_S1;
 +#ifndef HALF_LJ
 +    gmx_mm_pr  VLJ6_S2, VLJ12_S2, VLJ_S2;
 +    gmx_mm_pr  VLJ6_S3, VLJ12_S3, VLJ_S3;
 +#endif
 +#endif
 +#endif /* CALC_LJ */
 +
 +    /* j-cluster index */
 +    cj            = l_cj[cjind].cj;
 +
 +    /* Atom indices (of the first atom in the cluster) */
 +    aj            = cj*UNROLLJ;
 +#if defined CALC_LJ && (defined LJ_COMB_GEOM || defined LJ_COMB_LB)
 +#if UNROLLJ == STRIDE
 +    aj2           = aj*2;
 +#else
 +    aj2           = (cj>>1)*2*STRIDE + (cj & 1)*UNROLLJ;
 +#endif
 +#endif
 +#if UNROLLJ == STRIDE
 +    ajx           = aj*DIM;
 +#else
 +    ajx           = (cj>>1)*DIM*STRIDE + (cj & 1)*UNROLLJ;
 +#endif
 +    ajy           = ajx + STRIDE;
 +    ajz           = ajy + STRIDE;
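 +    /* Indexing sketch (with STRIDE = max(UNROLLI, UNROLLJ), as set up in
 +     * the kernel macro header): when UNROLLJ == STRIDE each j-cluster
 +     * owns a full stride-wide block, so ajx = aj*DIM. When UNROLLJ <
 +     * STRIDE, two j-clusters share one block: (cj>>1) selects the block
 +     * and (cj & 1)*UNROLLJ the half within it.
 +     */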
 +
 +#ifdef CHECK_EXCLS
++    gmx_load_simd_4xn_interactions(l_cj[cjind].excl, filter_S0, filter_S1, filter_S2, filter_S3, &interact_S0, &interact_S1, &interact_S2, &interact_S3);
 +#endif /* CHECK_EXCLS */
 +
 +    /* load j atom coordinates */
 +    jx_S        = gmx_load_pr(x+ajx);
 +    jy_S        = gmx_load_pr(x+ajy);
 +    jz_S        = gmx_load_pr(x+ajz);
 +
 +    /* Calculate distance */
 +    dx_S0       = gmx_sub_pr(ix_S0, jx_S);
 +    dy_S0       = gmx_sub_pr(iy_S0, jy_S);
 +    dz_S0       = gmx_sub_pr(iz_S0, jz_S);
 +    dx_S1       = gmx_sub_pr(ix_S1, jx_S);
 +    dy_S1       = gmx_sub_pr(iy_S1, jy_S);
 +    dz_S1       = gmx_sub_pr(iz_S1, jz_S);
 +    dx_S2       = gmx_sub_pr(ix_S2, jx_S);
 +    dy_S2       = gmx_sub_pr(iy_S2, jy_S);
 +    dz_S2       = gmx_sub_pr(iz_S2, jz_S);
 +    dx_S3       = gmx_sub_pr(ix_S3, jx_S);
 +    dy_S3       = gmx_sub_pr(iy_S3, jy_S);
 +    dz_S3       = gmx_sub_pr(iz_S3, jz_S);
 +
 +    /* rsq = dx*dx+dy*dy+dz*dz */
 +    rsq_S0      = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0);
 +    rsq_S1      = gmx_calc_rsq_pr(dx_S1, dy_S1, dz_S1);
 +    rsq_S2      = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2);
 +    rsq_S3      = gmx_calc_rsq_pr(dx_S3, dy_S3, dz_S3);
 +
 +#ifndef NBNXN_CUTOFF_USE_BLENDV
 +    wco_S0      = gmx_cmplt_pr(rsq_S0, rc2_S);
 +    wco_S1      = gmx_cmplt_pr(rsq_S1, rc2_S);
 +    wco_S2      = gmx_cmplt_pr(rsq_S2, rc2_S);
 +    wco_S3      = gmx_cmplt_pr(rsq_S3, rc2_S);
 +#endif
 +
 +#ifdef CHECK_EXCLS
 +#ifdef EXCL_FORCES
 +    /* Only remove the (sub-)diagonal to avoid double counting */
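 +    /* The diagonal masks, set up in the outer loop from j-i comparisons,
 +     * keep a pair only where j > i, so each intra-cluster pair is
 +     * counted exactly once.
 +     */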
 +#if UNROLLJ == UNROLLI
 +    if (cj == ci_sh)
 +    {
 +        wco_S0  = gmx_and_pb(wco_S0, diagonal_mask_S0);
 +        wco_S1  = gmx_and_pb(wco_S1, diagonal_mask_S1);
 +        wco_S2  = gmx_and_pb(wco_S2, diagonal_mask_S2);
 +        wco_S3  = gmx_and_pb(wco_S3, diagonal_mask_S3);
 +    }
 +#else
 +#if UNROLLJ < UNROLLI
 +    if (cj == ci_sh*2)
 +    {
 +        wco_S0  = gmx_and_pb(wco_S0, diagonal_mask0_S0);
 +        wco_S1  = gmx_and_pb(wco_S1, diagonal_mask0_S1);
 +        wco_S2  = gmx_and_pb(wco_S2, diagonal_mask0_S2);
 +        wco_S3  = gmx_and_pb(wco_S3, diagonal_mask0_S3);
 +    }
 +    if (cj == ci_sh*2 + 1)
 +    {
 +        wco_S0  = gmx_and_pb(wco_S0, diagonal_mask1_S0);
 +        wco_S1  = gmx_and_pb(wco_S1, diagonal_mask1_S1);
 +        wco_S2  = gmx_and_pb(wco_S2, diagonal_mask1_S2);
 +        wco_S3  = gmx_and_pb(wco_S3, diagonal_mask1_S3);
 +    }
 +#else
 +    if (cj*2 == ci_sh)
 +    {
 +        wco_S0  = gmx_and_pb(wco_S0, diagonal_mask0_S0);
 +        wco_S1  = gmx_and_pb(wco_S1, diagonal_mask0_S1);
 +        wco_S2  = gmx_and_pb(wco_S2, diagonal_mask0_S2);
 +        wco_S3  = gmx_and_pb(wco_S3, diagonal_mask0_S3);
 +    }
 +    else if (cj*2 + 1 == ci_sh)
 +    {
 +        wco_S0  = gmx_and_pb(wco_S0, diagonal_mask1_S0);
 +        wco_S1  = gmx_and_pb(wco_S1, diagonal_mask1_S1);
 +        wco_S2  = gmx_and_pb(wco_S2, diagonal_mask1_S2);
 +        wco_S3  = gmx_and_pb(wco_S3, diagonal_mask1_S3);
 +    }
 +#endif
 +#endif
 +#else /* EXCL_FORCES */
 +    /* No exclusion forces: remove all excluded atom pairs from the list */
 +    wco_S0      = gmx_and_pb(wco_S0, interact_S0);
 +    wco_S1      = gmx_and_pb(wco_S1, interact_S1);
 +    wco_S2      = gmx_and_pb(wco_S2, interact_S2);
 +    wco_S3      = gmx_and_pb(wco_S3, interact_S3);
 +#endif
 +#endif
 +
 +#ifdef COUNT_PAIRS
 +    {
 +        int  i, j;
 +        real tmpa[2*GMX_SIMD_WIDTH_HERE], *tmp;
 +        tmp = gmx_simd_align_real(tmpa);
 +        for (i = 0; i < UNROLLI; i++)
 +        {
 +            gmx_store_pr(tmp, gmx_sub_pr(rc2_S, i == 0 ? rsq_S0 : (i == 1 ? rsq_S1 : (i == 2 ? rsq_S2 : rsq_S3))));
 +            for (j = 0; j < UNROLLJ; j++)
 +            {
 +                if (tmp[j] >= 0)
 +                {
 +                    npair++;
 +                }
 +            }
 +        }
 +    }
 +#endif
 +
 +#ifdef CHECK_EXCLS
 +    /* For excluded pairs add a small number to avoid r^-6 = NaN */
 +    rsq_S0      = gmx_masknot_add_pr(interact_S0, rsq_S0, avoid_sing_S);
 +    rsq_S1      = gmx_masknot_add_pr(interact_S1, rsq_S1, avoid_sing_S);
 +    rsq_S2      = gmx_masknot_add_pr(interact_S2, rsq_S2, avoid_sing_S);
 +    rsq_S3      = gmx_masknot_add_pr(interact_S3, rsq_S3, avoid_sing_S);
 +#endif
 +
 +    /* Calculate 1/r */
 +#ifndef GMX_DOUBLE
 +    rinv_S0     = gmx_invsqrt_pr(rsq_S0);
 +    rinv_S1     = gmx_invsqrt_pr(rsq_S1);
 +    rinv_S2     = gmx_invsqrt_pr(rsq_S2);
 +    rinv_S3     = gmx_invsqrt_pr(rsq_S3);
 +#else
 +    gmx_mm_invsqrt2_pd(rsq_S0, rsq_S1, &rinv_S0, &rinv_S1);
 +    gmx_mm_invsqrt2_pd(rsq_S2, rsq_S3, &rinv_S2, &rinv_S3);
 +#endif
 +
 +#ifdef CALC_COULOMB
 +    /* Load parameters for j atom */
 +    jq_S        = gmx_load_pr(q+aj);
 +    qq_S0       = gmx_mul_pr(iq_S0, jq_S);
 +    qq_S1       = gmx_mul_pr(iq_S1, jq_S);
 +    qq_S2       = gmx_mul_pr(iq_S2, jq_S);
 +    qq_S3       = gmx_mul_pr(iq_S3, jq_S);
 +#endif
 +
 +#ifdef CALC_LJ
 +
 +#if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C
 +    load_lj_pair_params(nbfp0, type, aj, &c6_S0, &c12_S0);
 +    load_lj_pair_params(nbfp1, type, aj, &c6_S1, &c12_S1);
 +#ifndef HALF_LJ
 +    load_lj_pair_params(nbfp2, type, aj, &c6_S2, &c12_S2);
 +    load_lj_pair_params(nbfp3, type, aj, &c6_S3, &c12_S3);
 +#endif
 +#endif /* not defined any LJ rule */
 +
 +#ifdef LJ_COMB_GEOM
 +    c6s_j_S     = gmx_load_pr(ljc+aj2+0);
 +    c12s_j_S    = gmx_load_pr(ljc+aj2+STRIDE);
 +    c6_S0       = gmx_mul_pr(c6s_S0, c6s_j_S );
 +    c6_S1       = gmx_mul_pr(c6s_S1, c6s_j_S );
 +#ifndef HALF_LJ
 +    c6_S2       = gmx_mul_pr(c6s_S2, c6s_j_S );
 +    c6_S3       = gmx_mul_pr(c6s_S3, c6s_j_S );
 +#endif
 +    c12_S0      = gmx_mul_pr(c12s_S0, c12s_j_S);
 +    c12_S1      = gmx_mul_pr(c12s_S1, c12s_j_S);
 +#ifndef HALF_LJ
 +    c12_S2      = gmx_mul_pr(c12s_S2, c12s_j_S);
 +    c12_S3      = gmx_mul_pr(c12s_S3, c12s_j_S);
 +#endif
 +#endif /* LJ_COMB_GEOM */
 +
 +#ifdef LJ_COMB_LB
 +    hsig_j_S    = gmx_load_pr(ljc+aj2+0);
 +    seps_j_S    = gmx_load_pr(ljc+aj2+STRIDE);
 +
 +    sig_S0      = gmx_add_pr(hsig_i_S0, hsig_j_S);
 +    sig_S1      = gmx_add_pr(hsig_i_S1, hsig_j_S);
 +    eps_S0      = gmx_mul_pr(seps_i_S0, seps_j_S);
 +    eps_S1      = gmx_mul_pr(seps_i_S1, seps_j_S);
 +#ifndef HALF_LJ
 +    sig_S2      = gmx_add_pr(hsig_i_S2, hsig_j_S);
 +    sig_S3      = gmx_add_pr(hsig_i_S3, hsig_j_S);
 +    eps_S2      = gmx_mul_pr(seps_i_S2, seps_j_S);
 +    eps_S3      = gmx_mul_pr(seps_i_S3, seps_j_S);
 +#endif
 +#endif /* LJ_COMB_LB */
 +
 +#endif /* CALC_LJ */
 +
 +#ifndef NBNXN_CUTOFF_USE_BLENDV
 +    rinv_S0     = gmx_blendzero_pr(rinv_S0, wco_S0);
 +    rinv_S1     = gmx_blendzero_pr(rinv_S1, wco_S1);
 +    rinv_S2     = gmx_blendzero_pr(rinv_S2, wco_S2);
 +    rinv_S3     = gmx_blendzero_pr(rinv_S3, wco_S3);
 +#else
 +    /* We only need to mask for the cut-off: blendv is faster */
 +    rinv_S0     = gmx_blendv_pr(rinv_S0, zero_S, gmx_sub_pr(rc2_S, rsq_S0));
 +    rinv_S1     = gmx_blendv_pr(rinv_S1, zero_S, gmx_sub_pr(rc2_S, rsq_S1));
 +    rinv_S2     = gmx_blendv_pr(rinv_S2, zero_S, gmx_sub_pr(rc2_S, rsq_S2));
 +    rinv_S3     = gmx_blendv_pr(rinv_S3, zero_S, gmx_sub_pr(rc2_S, rsq_S3));
 +#endif
 +
 +    rinvsq_S0   = gmx_mul_pr(rinv_S0, rinv_S0);
 +    rinvsq_S1   = gmx_mul_pr(rinv_S1, rinv_S1);
 +    rinvsq_S2   = gmx_mul_pr(rinv_S2, rinv_S2);
 +    rinvsq_S3   = gmx_mul_pr(rinv_S3, rinv_S3);
 +
 +#ifdef CALC_COULOMB
 +    /* Note that here we calculate force*r, not the usual force/r.
 +     * This lets us avoid masking the reaction-field contribution,
 +     * since frcoul is later multiplied by rinvsq, which has already
 +     * been masked with the cut-off check.
 +     */
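 +    /* Equivalently: F/r = (F*r)*rinvsq, so the single masked rinvsq
 +     * applies the cut-off to both the Coulomb and the LJ part at once.
 +     */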
 +
 +#ifdef EXCL_FORCES
 +    /* Only add 1/r for non-excluded atom pairs */
 +    rinv_ex_S0  = gmx_blendzero_pr(rinv_S0, interact_S0);
 +    rinv_ex_S1  = gmx_blendzero_pr(rinv_S1, interact_S1);
 +    rinv_ex_S2  = gmx_blendzero_pr(rinv_S2, interact_S2);
 +    rinv_ex_S3  = gmx_blendzero_pr(rinv_S3, interact_S3);
 +#else
 +    /* No exclusion forces, we always need 1/r */
 +#define     rinv_ex_S0    rinv_S0
 +#define     rinv_ex_S1    rinv_S1
 +#define     rinv_ex_S2    rinv_S2
 +#define     rinv_ex_S3    rinv_S3
 +#endif
 +
 +#ifdef CALC_COUL_RF
 +    /* Electrostatic interactions */
 +    frcoul_S0   = gmx_mul_pr(qq_S0, gmx_madd_pr(rsq_S0, mrc_3_S, rinv_ex_S0));
 +    frcoul_S1   = gmx_mul_pr(qq_S1, gmx_madd_pr(rsq_S1, mrc_3_S, rinv_ex_S1));
 +    frcoul_S2   = gmx_mul_pr(qq_S2, gmx_madd_pr(rsq_S2, mrc_3_S, rinv_ex_S2));
 +    frcoul_S3   = gmx_mul_pr(qq_S3, gmx_madd_pr(rsq_S3, mrc_3_S, rinv_ex_S3));
 +
 +#ifdef CALC_ENERGIES
 +    vcoul_S0    = gmx_mul_pr(qq_S0, gmx_add_pr(rinv_ex_S0, gmx_add_pr(gmx_mul_pr(rsq_S0, hrc_3_S), moh_rc_S)));
 +    vcoul_S1    = gmx_mul_pr(qq_S1, gmx_add_pr(rinv_ex_S1, gmx_add_pr(gmx_mul_pr(rsq_S1, hrc_3_S), moh_rc_S)));
 +    vcoul_S2    = gmx_mul_pr(qq_S2, gmx_add_pr(rinv_ex_S2, gmx_add_pr(gmx_mul_pr(rsq_S2, hrc_3_S), moh_rc_S)));
 +    vcoul_S3    = gmx_mul_pr(qq_S3, gmx_add_pr(rinv_ex_S3, gmx_add_pr(gmx_mul_pr(rsq_S3, hrc_3_S), moh_rc_S)));
 +#endif
 +#endif
 +
 +#ifdef CALC_COUL_EWALD
 +    /* We need to mask (or limit) rsq for the cut-off,
 +     * as large distances can cause an overflow in gmx_pmecorrF/V.
 +     */
 +#ifndef NBNXN_CUTOFF_USE_BLENDV
 +    brsq_S0     = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S0, wco_S0));
 +    brsq_S1     = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S1, wco_S1));
 +    brsq_S2     = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S2, wco_S2));
 +    brsq_S3     = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S3, wco_S3));
 +#else
 +    /* Strangely, putting mul on a separate line is slower (icc 13) */
 +    brsq_S0     = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S0, zero_S, gmx_sub_pr(rc2_S, rsq_S0)));
 +    brsq_S1     = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S1, zero_S, gmx_sub_pr(rc2_S, rsq_S1)));
 +    brsq_S2     = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S2, zero_S, gmx_sub_pr(rc2_S, rsq_S2)));
 +    brsq_S3     = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S3, zero_S, gmx_sub_pr(rc2_S, rsq_S3)));
 +#endif
 +    ewcorr_S0   = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S0), beta_S);
 +    ewcorr_S1   = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S1), beta_S);
 +    ewcorr_S2   = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S2), beta_S);
 +    ewcorr_S3   = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S3), beta_S);
 +    frcoul_S0   = gmx_mul_pr(qq_S0, gmx_madd_pr(ewcorr_S0, brsq_S0, rinv_ex_S0));
 +    frcoul_S1   = gmx_mul_pr(qq_S1, gmx_madd_pr(ewcorr_S1, brsq_S1, rinv_ex_S1));
 +    frcoul_S2   = gmx_mul_pr(qq_S2, gmx_madd_pr(ewcorr_S2, brsq_S2, rinv_ex_S2));
 +    frcoul_S3   = gmx_mul_pr(qq_S3, gmx_madd_pr(ewcorr_S3, brsq_S3, rinv_ex_S3));
 +
 +#ifdef CALC_ENERGIES
 +    vc_sub_S0   = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S0), beta_S);
 +    vc_sub_S1   = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S1), beta_S);
 +    vc_sub_S2   = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S2), beta_S);
 +    vc_sub_S3   = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S3), beta_S);
 +#endif
 +
 +#endif /* CALC_COUL_EWALD */
 +
 +#ifdef CALC_COUL_TAB
 +    /* Electrostatic interactions */
 +    r_S0        = gmx_mul_pr(rsq_S0, rinv_S0);
 +    r_S1        = gmx_mul_pr(rsq_S1, rinv_S1);
 +    r_S2        = gmx_mul_pr(rsq_S2, rinv_S2);
 +    r_S3        = gmx_mul_pr(rsq_S3, rinv_S3);
 +    /* Convert r to scaled table units */
 +    rs_S0       = gmx_mul_pr(r_S0, invtsp_S);
 +    rs_S1       = gmx_mul_pr(r_S1, invtsp_S);
 +    rs_S2       = gmx_mul_pr(r_S2, invtsp_S);
 +    rs_S3       = gmx_mul_pr(r_S3, invtsp_S);
 +    /* Truncate scaled r to an int */
 +    ti_S0       = gmx_cvttpr_epi32(rs_S0);
 +    ti_S1       = gmx_cvttpr_epi32(rs_S1);
 +    ti_S2       = gmx_cvttpr_epi32(rs_S2);
 +    ti_S3       = gmx_cvttpr_epi32(rs_S3);
 +#ifdef GMX_SIMD_HAVE_FLOOR
 +    /* SSE4.1 floor is faster than gmx_cvtepi32_ps int->float cast */
 +    rf_S0       = gmx_floor_pr(rs_S0);
 +    rf_S1       = gmx_floor_pr(rs_S1);
 +    rf_S2       = gmx_floor_pr(rs_S2);
 +    rf_S3       = gmx_floor_pr(rs_S3);
 +#else
 +    rf_S0       = gmx_cvtepi32_pr(ti_S0);
 +    rf_S1       = gmx_cvtepi32_pr(ti_S1);
 +    rf_S2       = gmx_cvtepi32_pr(ti_S2);
 +    rf_S3       = gmx_cvtepi32_pr(ti_S3);
 +#endif
 +    frac_S0     = gmx_sub_pr(rs_S0, rf_S0);
 +    frac_S1     = gmx_sub_pr(rs_S1, rf_S1);
 +    frac_S2     = gmx_sub_pr(rs_S2, rf_S2);
 +    frac_S3     = gmx_sub_pr(rs_S3, rf_S3);
 +
 +    /* Load and interpolate table forces and possibly energies.
 +     * Force and energy can be combined in one table with stride 4 (FDV0)
 +     * or kept in two separate tables with stride 1 (F and V).
 +     * Currently single precision uses FDV0, double precision F and V.
 +     */
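 +    /* Interpolation sketch, with h = 1/ic->tabq_scale the table spacing:
 +     * the tabulated force is linear within each interval,
 +     *   F(r) ~= fsub = ctab0 + frac*ctab1,
 +     * and integrating that force over the fractional interval
 +     * (trapezoid rule) gives the energy,
 +     *   V(r) ~= ctabv - (h/2)*frac*(ctab0 + fsub),
 +     * which is the vc_sub expression below with mhalfsp = -0.5*h.
 +     */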
 +#ifndef CALC_ENERGIES
 +    load_table_f(tab_coul_F, ti_S0, ti0, &ctab0_S0, &ctab1_S0);
 +    load_table_f(tab_coul_F, ti_S1, ti1, &ctab0_S1, &ctab1_S1);
 +    load_table_f(tab_coul_F, ti_S2, ti2, &ctab0_S2, &ctab1_S2);
 +    load_table_f(tab_coul_F, ti_S3, ti3, &ctab0_S3, &ctab1_S3);
 +#else
 +#ifdef TAB_FDV0
 +    load_table_f_v(tab_coul_F, ti_S0, ti0, &ctab0_S0, &ctab1_S0, &ctabv_S0);
 +    load_table_f_v(tab_coul_F, ti_S1, ti1, &ctab0_S1, &ctab1_S1, &ctabv_S1);
 +    load_table_f_v(tab_coul_F, ti_S2, ti2, &ctab0_S2, &ctab1_S2, &ctabv_S2);
 +    load_table_f_v(tab_coul_F, ti_S3, ti3, &ctab0_S3, &ctab1_S3, &ctabv_S3);
 +#else
 +    load_table_f_v(tab_coul_F, tab_coul_V, ti_S0, ti0, &ctab0_S0, &ctab1_S0, &ctabv_S0);
 +    load_table_f_v(tab_coul_F, tab_coul_V, ti_S1, ti1, &ctab0_S1, &ctab1_S1, &ctabv_S1);
 +    load_table_f_v(tab_coul_F, tab_coul_V, ti_S2, ti2, &ctab0_S2, &ctab1_S2, &ctabv_S2);
 +    load_table_f_v(tab_coul_F, tab_coul_V, ti_S3, ti3, &ctab0_S3, &ctab1_S3, &ctabv_S3);
 +#endif
 +#endif
 +    fsub_S0     = gmx_add_pr(ctab0_S0, gmx_mul_pr(frac_S0, ctab1_S0));
 +    fsub_S1     = gmx_add_pr(ctab0_S1, gmx_mul_pr(frac_S1, ctab1_S1));
 +    fsub_S2     = gmx_add_pr(ctab0_S2, gmx_mul_pr(frac_S2, ctab1_S2));
 +    fsub_S3     = gmx_add_pr(ctab0_S3, gmx_mul_pr(frac_S3, ctab1_S3));
 +    frcoul_S0   = gmx_mul_pr(qq_S0, gmx_sub_pr(rinv_ex_S0, gmx_mul_pr(fsub_S0, r_S0)));
 +    frcoul_S1   = gmx_mul_pr(qq_S1, gmx_sub_pr(rinv_ex_S1, gmx_mul_pr(fsub_S1, r_S1)));
 +    frcoul_S2   = gmx_mul_pr(qq_S2, gmx_sub_pr(rinv_ex_S2, gmx_mul_pr(fsub_S2, r_S2)));
 +    frcoul_S3   = gmx_mul_pr(qq_S3, gmx_sub_pr(rinv_ex_S3, gmx_mul_pr(fsub_S3, r_S3)));
 +
 +#ifdef CALC_ENERGIES
 +    vc_sub_S0   = gmx_add_pr(ctabv_S0, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S0), gmx_add_pr(ctab0_S0, fsub_S0)));
 +    vc_sub_S1   = gmx_add_pr(ctabv_S1, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S1), gmx_add_pr(ctab0_S1, fsub_S1)));
 +    vc_sub_S2   = gmx_add_pr(ctabv_S2, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S2), gmx_add_pr(ctab0_S2, fsub_S2)));
 +    vc_sub_S3   = gmx_add_pr(ctabv_S3, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S3), gmx_add_pr(ctab0_S3, fsub_S3)));
 +#endif
 +#endif /* CALC_COUL_TAB */
 +
 +#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
 +#ifndef NO_SHIFT_EWALD
 +    /* Add Ewald potential shift to vc_sub for convenience */
 +#ifdef CHECK_EXCLS
 +    vc_sub_S0   = gmx_add_pr(vc_sub_S0, gmx_blendzero_pr(sh_ewald_S, interact_S0));
 +    vc_sub_S1   = gmx_add_pr(vc_sub_S1, gmx_blendzero_pr(sh_ewald_S, interact_S1));
 +    vc_sub_S2   = gmx_add_pr(vc_sub_S2, gmx_blendzero_pr(sh_ewald_S, interact_S2));
 +    vc_sub_S3   = gmx_add_pr(vc_sub_S3, gmx_blendzero_pr(sh_ewald_S, interact_S3));
 +#else
 +    vc_sub_S0   = gmx_add_pr(vc_sub_S0, sh_ewald_S);
 +    vc_sub_S1   = gmx_add_pr(vc_sub_S1, sh_ewald_S);
 +    vc_sub_S2   = gmx_add_pr(vc_sub_S2, sh_ewald_S);
 +    vc_sub_S3   = gmx_add_pr(vc_sub_S3, sh_ewald_S);
 +#endif
 +#endif
 +
 +    vcoul_S0    = gmx_mul_pr(qq_S0, gmx_sub_pr(rinv_ex_S0, vc_sub_S0));
 +    vcoul_S1    = gmx_mul_pr(qq_S1, gmx_sub_pr(rinv_ex_S1, vc_sub_S1));
 +    vcoul_S2    = gmx_mul_pr(qq_S2, gmx_sub_pr(rinv_ex_S2, vc_sub_S2));
 +    vcoul_S3    = gmx_mul_pr(qq_S3, gmx_sub_pr(rinv_ex_S3, vc_sub_S3));
 +
 +#endif
 +
 +#ifdef CALC_ENERGIES
 +    /* Mask energy for cut-off and diagonal */
 +    vcoul_S0    = gmx_blendzero_pr(vcoul_S0, wco_S0);
 +    vcoul_S1    = gmx_blendzero_pr(vcoul_S1, wco_S1);
 +    vcoul_S2    = gmx_blendzero_pr(vcoul_S2, wco_S2);
 +    vcoul_S3    = gmx_blendzero_pr(vcoul_S3, wco_S3);
 +#endif
 +
 +#endif /* CALC_COULOMB */
 +
 +#ifdef CALC_LJ
 +    /* Lennard-Jones interaction */
 +
 +#ifdef VDW_CUTOFF_CHECK
 +    wco_vdw_S0  = gmx_cmplt_pr(rsq_S0, rcvdw2_S);
 +    wco_vdw_S1  = gmx_cmplt_pr(rsq_S1, rcvdw2_S);
 +#ifndef HALF_LJ
 +    wco_vdw_S2  = gmx_cmplt_pr(rsq_S2, rcvdw2_S);
 +    wco_vdw_S3  = gmx_cmplt_pr(rsq_S3, rcvdw2_S);
 +#endif
 +#else
 +    /* Same cut-off for Coulomb and VdW, reuse the registers */
 +#define     wco_vdw_S0    wco_S0
 +#define     wco_vdw_S1    wco_S1
 +#define     wco_vdw_S2    wco_S2
 +#define     wco_vdw_S3    wco_S3
 +#endif
 +
 +#ifndef LJ_COMB_LB
 +    rinvsix_S0  = gmx_mul_pr(rinvsq_S0, gmx_mul_pr(rinvsq_S0, rinvsq_S0));
 +    rinvsix_S1  = gmx_mul_pr(rinvsq_S1, gmx_mul_pr(rinvsq_S1, rinvsq_S1));
 +#ifdef EXCL_FORCES
 +    rinvsix_S0  = gmx_blendzero_pr(rinvsix_S0, interact_S0);
 +    rinvsix_S1  = gmx_blendzero_pr(rinvsix_S1, interact_S1);
 +#endif
 +#ifndef HALF_LJ
 +    rinvsix_S2  = gmx_mul_pr(rinvsq_S2, gmx_mul_pr(rinvsq_S2, rinvsq_S2));
 +    rinvsix_S3  = gmx_mul_pr(rinvsq_S3, gmx_mul_pr(rinvsq_S3, rinvsq_S3));
 +#ifdef EXCL_FORCES
 +    rinvsix_S2  = gmx_blendzero_pr(rinvsix_S2, interact_S2);
 +    rinvsix_S3  = gmx_blendzero_pr(rinvsix_S3, interact_S3);
 +#endif
 +#endif
 +#ifdef VDW_CUTOFF_CHECK
 +    rinvsix_S0  = gmx_blendzero_pr(rinvsix_S0, wco_vdw_S0);
 +    rinvsix_S1  = gmx_blendzero_pr(rinvsix_S1, wco_vdw_S1);
 +#ifndef HALF_LJ
 +    rinvsix_S2  = gmx_blendzero_pr(rinvsix_S2, wco_vdw_S2);
 +    rinvsix_S3  = gmx_blendzero_pr(rinvsix_S3, wco_vdw_S3);
 +#endif
 +#endif
 +    FrLJ6_S0    = gmx_mul_pr(c6_S0, rinvsix_S0);
 +    FrLJ6_S1    = gmx_mul_pr(c6_S1, rinvsix_S1);
 +#ifndef HALF_LJ
 +    FrLJ6_S2    = gmx_mul_pr(c6_S2, rinvsix_S2);
 +    FrLJ6_S3    = gmx_mul_pr(c6_S3, rinvsix_S3);
 +#endif
 +    FrLJ12_S0   = gmx_mul_pr(c12_S0, gmx_mul_pr(rinvsix_S0, rinvsix_S0));
 +    FrLJ12_S1   = gmx_mul_pr(c12_S1, gmx_mul_pr(rinvsix_S1, rinvsix_S1));
 +#ifndef HALF_LJ
 +    FrLJ12_S2   = gmx_mul_pr(c12_S2, gmx_mul_pr(rinvsix_S2, rinvsix_S2));
 +    FrLJ12_S3   = gmx_mul_pr(c12_S3, gmx_mul_pr(rinvsix_S3, rinvsix_S3));
 +#endif
 +#endif /* not LJ_COMB_LB */
 +
 +#ifdef LJ_COMB_LB
 +    sir_S0      = gmx_mul_pr(sig_S0, rinv_S0);
 +    sir_S1      = gmx_mul_pr(sig_S1, rinv_S1);
 +#ifndef HALF_LJ
 +    sir_S2      = gmx_mul_pr(sig_S2, rinv_S2);
 +    sir_S3      = gmx_mul_pr(sig_S3, rinv_S3);
 +#endif
 +    sir2_S0     = gmx_mul_pr(sir_S0, sir_S0);
 +    sir2_S1     = gmx_mul_pr(sir_S1, sir_S1);
 +#ifndef HALF_LJ
 +    sir2_S2     = gmx_mul_pr(sir_S2, sir_S2);
 +    sir2_S3     = gmx_mul_pr(sir_S3, sir_S3);
 +#endif
 +    sir6_S0     = gmx_mul_pr(sir2_S0, gmx_mul_pr(sir2_S0, sir2_S0));
 +    sir6_S1     = gmx_mul_pr(sir2_S1, gmx_mul_pr(sir2_S1, sir2_S1));
 +#ifdef EXCL_FORCES
 +    sir6_S0     = gmx_blendzero_pr(sir6_S0, interact_S0);
 +    sir6_S1     = gmx_blendzero_pr(sir6_S1, interact_S1);
 +#endif
 +#ifndef HALF_LJ
 +    sir6_S2     = gmx_mul_pr(sir2_S2, gmx_mul_pr(sir2_S2, sir2_S2));
 +    sir6_S3     = gmx_mul_pr(sir2_S3, gmx_mul_pr(sir2_S3, sir2_S3));
 +#ifdef EXCL_FORCES
 +    sir6_S2     = gmx_blendzero_pr(sir6_S2, interact_S2);
 +    sir6_S3     = gmx_blendzero_pr(sir6_S3, interact_S3);
 +#endif
 +#endif
 +#ifdef VDW_CUTOFF_CHECK
 +    sir6_S0     = gmx_blendzero_pr(sir6_S0, wco_vdw_S0);
 +    sir6_S1     = gmx_blendzero_pr(sir6_S1, wco_vdw_S1);
 +#ifndef HALF_LJ
 +    sir6_S2     = gmx_blendzero_pr(sir6_S2, wco_vdw_S2);
 +    sir6_S3     = gmx_blendzero_pr(sir6_S3, wco_vdw_S3);
 +#endif
 +#endif
 +    FrLJ6_S0    = gmx_mul_pr(eps_S0, sir6_S0);
 +    FrLJ6_S1    = gmx_mul_pr(eps_S1, sir6_S1);
 +#ifndef HALF_LJ
 +    FrLJ6_S2    = gmx_mul_pr(eps_S2, sir6_S2);
 +    FrLJ6_S3    = gmx_mul_pr(eps_S3, sir6_S3);
 +#endif
 +    FrLJ12_S0   = gmx_mul_pr(FrLJ6_S0, sir6_S0);
 +    FrLJ12_S1   = gmx_mul_pr(FrLJ6_S1, sir6_S1);
 +#ifndef HALF_LJ
 +    FrLJ12_S2   = gmx_mul_pr(FrLJ6_S2, sir6_S2);
 +    FrLJ12_S3   = gmx_mul_pr(FrLJ6_S3, sir6_S3);
 +#endif
 +#if defined CALC_ENERGIES
 +    /* We need C6 and C12 to calculate the LJ potential shift */
 +    sig2_S0     = gmx_mul_pr(sig_S0, sig_S0);
 +    sig2_S1     = gmx_mul_pr(sig_S1, sig_S1);
 +#ifndef HALF_LJ
 +    sig2_S2     = gmx_mul_pr(sig_S2, sig_S2);
 +    sig2_S3     = gmx_mul_pr(sig_S3, sig_S3);
 +#endif
 +    sig6_S0     = gmx_mul_pr(sig2_S0, gmx_mul_pr(sig2_S0, sig2_S0));
 +    sig6_S1     = gmx_mul_pr(sig2_S1, gmx_mul_pr(sig2_S1, sig2_S1));
 +#ifndef HALF_LJ
 +    sig6_S2     = gmx_mul_pr(sig2_S2, gmx_mul_pr(sig2_S2, sig2_S2));
 +    sig6_S3     = gmx_mul_pr(sig2_S3, gmx_mul_pr(sig2_S3, sig2_S3));
 +#endif
 +    c6_S0       = gmx_mul_pr(eps_S0, sig6_S0);
 +    c6_S1       = gmx_mul_pr(eps_S1, sig6_S1);
 +#ifndef HALF_LJ
 +    c6_S2       = gmx_mul_pr(eps_S2, sig6_S2);
 +    c6_S3       = gmx_mul_pr(eps_S3, sig6_S3);
 +#endif
 +    c12_S0      = gmx_mul_pr(c6_S0, sig6_S0);
 +    c12_S1      = gmx_mul_pr(c6_S1, sig6_S1);
 +#ifndef HALF_LJ
 +    c12_S2      = gmx_mul_pr(c6_S2, sig6_S2);
 +    c12_S3      = gmx_mul_pr(c6_S3, sig6_S3);
 +#endif
 +#endif
 +#endif /* LJ_COMB_LB */
 +
 +#endif /* CALC_LJ */
 +
 +#ifdef CALC_ENERGIES
 +#ifdef ENERGY_GROUPS
 +    /* Extract the group pair index per j pair.
 +     * Energy groups are stored per i-cluster, so things get
 +     * complicated when the i- and j-cluster sizes don't match.
 +     */
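 +    /* Example (sketch): with UNROLLJ == 2 and UNROLLI == 4, one stored
 +     * energrp entry covers two j-clusters, so cj>>1 picks the entry and
 +     * (cj & 1) selects which half of its bits belongs to this cluster.
 +     */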
 +    {
 +        int egps_j;
 +#if UNROLLJ == 2
 +        egps_j    = nbat->energrp[cj>>1];
 +        egp_jj[0] = ((egps_j >> ((cj & 1)*egps_jshift)) & egps_jmask)*egps_jstride;
 +#else
 +        /* We assume UNROLLI <= UNROLLJ */
 +        int jdi;
 +        for (jdi = 0; jdi < UNROLLJ/UNROLLI; jdi++)
 +        {
 +            int jj;
 +            egps_j = nbat->energrp[cj*(UNROLLJ/UNROLLI)+jdi];
 +            for (jj = 0; jj < (UNROLLI/2); jj++)
 +            {
 +                egp_jj[jdi*(UNROLLI/2)+jj] = ((egps_j >> (jj*egps_jshift)) & egps_jmask)*egps_jstride;
 +            }
 +        }
 +#endif
 +    }
 +#endif
 +
 +#ifdef CALC_COULOMB
 +#ifndef ENERGY_GROUPS
 +    vctot_S      = gmx_add_pr(vctot_S, gmx_sum4_pr(vcoul_S0, vcoul_S1, vcoul_S2, vcoul_S3));
 +#else
 +    add_ener_grp(vcoul_S0, vctp[0], egp_jj);
 +    add_ener_grp(vcoul_S1, vctp[1], egp_jj);
 +    add_ener_grp(vcoul_S2, vctp[2], egp_jj);
 +    add_ener_grp(vcoul_S3, vctp[3], egp_jj);
 +#endif
 +#endif
 +
 +#ifdef CALC_LJ
 +    /* Calculate the LJ energies */
 +    VLJ6_S0     = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S0, gmx_mul_pr(c6_S0, sh_invrc6_S)));
 +    VLJ6_S1     = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S1, gmx_mul_pr(c6_S1, sh_invrc6_S)));
 +#ifndef HALF_LJ
 +    VLJ6_S2     = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S2, gmx_mul_pr(c6_S2, sh_invrc6_S)));
 +    VLJ6_S3     = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S3, gmx_mul_pr(c6_S3, sh_invrc6_S)));
 +#endif
 +    VLJ12_S0    = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S0, gmx_mul_pr(c12_S0, sh_invrc12_S)));
 +    VLJ12_S1    = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S1, gmx_mul_pr(c12_S1, sh_invrc12_S)));
 +#ifndef HALF_LJ
 +    VLJ12_S2    = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S2, gmx_mul_pr(c12_S2, sh_invrc12_S)));
 +    VLJ12_S3    = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S3, gmx_mul_pr(c12_S3, sh_invrc12_S)));
 +#endif
 +
 +    VLJ_S0      = gmx_sub_pr(VLJ12_S0, VLJ6_S0);
 +    VLJ_S1      = gmx_sub_pr(VLJ12_S1, VLJ6_S1);
 +#ifndef HALF_LJ
 +    VLJ_S2      = gmx_sub_pr(VLJ12_S2, VLJ6_S2);
 +    VLJ_S3      = gmx_sub_pr(VLJ12_S3, VLJ6_S3);
 +#endif
 +    /* The potential shift should be removed for pairs beyond cut-off */
 +    VLJ_S0      = gmx_blendzero_pr(VLJ_S0, wco_vdw_S0);
 +    VLJ_S1      = gmx_blendzero_pr(VLJ_S1, wco_vdw_S1);
 +#ifndef HALF_LJ
 +    VLJ_S2      = gmx_blendzero_pr(VLJ_S2, wco_vdw_S2);
 +    VLJ_S3      = gmx_blendzero_pr(VLJ_S3, wco_vdw_S3);
 +#endif
 +#ifdef CHECK_EXCLS
 +    /* The potential shift should be removed for excluded pairs */
 +    VLJ_S0      = gmx_blendzero_pr(VLJ_S0, interact_S0);
 +    VLJ_S1      = gmx_blendzero_pr(VLJ_S1, interact_S1);
 +#ifndef HALF_LJ
 +    VLJ_S2      = gmx_blendzero_pr(VLJ_S2, interact_S2);
 +    VLJ_S3      = gmx_blendzero_pr(VLJ_S3, interact_S3);
 +#endif
 +#endif
 +#ifndef ENERGY_GROUPS
 +    Vvdwtot_S   = gmx_add_pr(Vvdwtot_S,
 +#ifndef HALF_LJ
 +                             gmx_sum4_pr(VLJ_S0, VLJ_S1, VLJ_S2, VLJ_S3)
 +#else
 +                             gmx_add_pr(VLJ_S0, VLJ_S1)
 +#endif
 +                             );
 +#else
 +    add_ener_grp(VLJ_S0, vvdwtp[0], egp_jj);
 +    add_ener_grp(VLJ_S1, vvdwtp[1], egp_jj);
 +#ifndef HALF_LJ
 +    add_ener_grp(VLJ_S2, vvdwtp[2], egp_jj);
 +    add_ener_grp(VLJ_S3, vvdwtp[3], egp_jj);
 +#endif
 +#endif
 +#endif /* CALC_LJ */
 +#endif /* CALC_ENERGIES */
 +
 +#ifdef CALC_LJ
 +    fscal_S0    = gmx_mul_pr(rinvsq_S0,
 +#ifdef CALC_COULOMB
 +                               gmx_add_pr(frcoul_S0,
 +#else
 +                               (
 +#endif
 +                                          gmx_sub_pr(FrLJ12_S0, FrLJ6_S0)));
 +    fscal_S1    = gmx_mul_pr(rinvsq_S1,
 +#ifdef CALC_COULOMB
 +                               gmx_add_pr(frcoul_S1,
 +#else
 +                               (
 +#endif
 +                                          gmx_sub_pr(FrLJ12_S1, FrLJ6_S1)));
 +#else
 +    fscal_S0    = gmx_mul_pr(rinvsq_S0, frcoul_S0);
 +    fscal_S1    = gmx_mul_pr(rinvsq_S1, frcoul_S1);
 +#endif /* CALC_LJ */
 +#if defined CALC_LJ && !defined HALF_LJ
 +    fscal_S2    = gmx_mul_pr(rinvsq_S2,
 +#ifdef CALC_COULOMB
 +                               gmx_add_pr(frcoul_S2,
 +#else
 +                               (
 +#endif
 +                                          gmx_sub_pr(FrLJ12_S2, FrLJ6_S2)));
 +    fscal_S3    = gmx_mul_pr(rinvsq_S3,
 +#ifdef CALC_COULOMB
 +                               gmx_add_pr(frcoul_S3,
 +#else
 +                               (
 +#endif
 +                                          gmx_sub_pr(FrLJ12_S3, FrLJ6_S3)));
 +#else
 +    /* Atoms 2 and 3 don't have LJ, so only add Coulomb forces */
 +    fscal_S2    = gmx_mul_pr(rinvsq_S2, frcoul_S2);
 +    fscal_S3    = gmx_mul_pr(rinvsq_S3, frcoul_S3);
 +#endif
 +
 +    /* Calculate temporary vectorial force */
 +    tx_S0       = gmx_mul_pr(fscal_S0, dx_S0);
 +    tx_S1       = gmx_mul_pr(fscal_S1, dx_S1);
 +    tx_S2       = gmx_mul_pr(fscal_S2, dx_S2);
 +    tx_S3       = gmx_mul_pr(fscal_S3, dx_S3);
 +    ty_S0       = gmx_mul_pr(fscal_S0, dy_S0);
 +    ty_S1       = gmx_mul_pr(fscal_S1, dy_S1);
 +    ty_S2       = gmx_mul_pr(fscal_S2, dy_S2);
 +    ty_S3       = gmx_mul_pr(fscal_S3, dy_S3);
 +    tz_S0       = gmx_mul_pr(fscal_S0, dz_S0);
 +    tz_S1       = gmx_mul_pr(fscal_S1, dz_S1);
 +    tz_S2       = gmx_mul_pr(fscal_S2, dz_S2);
 +    tz_S3       = gmx_mul_pr(fscal_S3, dz_S3);
 +
 +    /* Increment i atom force */
 +    fix_S0      = gmx_add_pr(fix_S0, tx_S0);
 +    fix_S1      = gmx_add_pr(fix_S1, tx_S1);
 +    fix_S2      = gmx_add_pr(fix_S2, tx_S2);
 +    fix_S3      = gmx_add_pr(fix_S3, tx_S3);
 +    fiy_S0      = gmx_add_pr(fiy_S0, ty_S0);
 +    fiy_S1      = gmx_add_pr(fiy_S1, ty_S1);
 +    fiy_S2      = gmx_add_pr(fiy_S2, ty_S2);
 +    fiy_S3      = gmx_add_pr(fiy_S3, ty_S3);
 +    fiz_S0      = gmx_add_pr(fiz_S0, tz_S0);
 +    fiz_S1      = gmx_add_pr(fiz_S1, tz_S1);
 +    fiz_S2      = gmx_add_pr(fiz_S2, tz_S2);
 +    fiz_S3      = gmx_add_pr(fiz_S3, tz_S3);
 +
 +    /* Decrement j atom force: the four i-row contributions are summed
 +     * and subtracted once (Newton's third law) */
 +    gmx_store_pr(f+ajx,
 +                 gmx_sub_pr( gmx_load_pr(f+ajx), gmx_sum4_pr(tx_S0, tx_S1, tx_S2, tx_S3) ));
 +    gmx_store_pr(f+ajy,
 +                 gmx_sub_pr( gmx_load_pr(f+ajy), gmx_sum4_pr(ty_S0, ty_S1, ty_S2, ty_S3) ));
 +    gmx_store_pr(f+ajz,
 +                 gmx_sub_pr( gmx_load_pr(f+ajz), gmx_sum4_pr(tz_S0, tz_S1, tz_S2, tz_S3) ));
 +}
 +
 +#undef  rinv_ex_S0
 +#undef  rinv_ex_S1
 +#undef  rinv_ex_S2
 +#undef  rinv_ex_S3
 +
 +#undef  wco_vdw_S0
 +#undef  wco_vdw_S1
 +#undef  wco_vdw_S2
 +#undef  wco_vdw_S3
 +
 +#undef  NBNXN_CUTOFF_USE_BLENDV
 +
 +#undef  EXCL_FORCES
index e5b71bac4f265896b0c4d16c4c801d1184893cd9,0000000000000000000000000000000000000000..49529fefd697de75a8eddc495e967f94c71a2929
mode 100644,000000..100644
--- /dev/null
@@@ -1,842 -1,0 +1,739 @@@
- #define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
- #define UNROLLI    NBNXN_CPU_CLUSTER_I_SIZE
- #define UNROLLJ    GMX_SIMD_WIDTH_HERE
- /* The stride of all the atom data arrays is max(UNROLLI,UNROLLJ) */
- #if GMX_SIMD_WIDTH_HERE >= UNROLLI
- #define STRIDE     GMX_SIMD_WIDTH_HERE
- #else
- #define STRIDE     UNROLLI
- #endif
- #if GMX_SIMD_WIDTH_HERE == 2
- #define SUM_SIMD(x)  (x[0]+x[1])
- #else
- #if GMX_SIMD_WIDTH_HERE == 4
- #define SUM_SIMD(x)  SUM_SIMD4(x)
- #else
- #if GMX_SIMD_WIDTH_HERE == 8
- #define SUM_SIMD(x)  (x[0]+x[1]+x[2]+x[3]+x[4]+x[5]+x[6]+x[7])
- #else
- #error "unsupported kernel configuration"
- #endif
- #endif
- #endif
- /* Decide if we should use the FDV0 table layout */
- #if defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE
- /* With full AVX-256 SIMD, half SIMD-width table loads are optimal */
- #if GMX_SIMD_WIDTH_HERE/2 == 4
- #define TAB_FDV0
- #endif
- #else
- /* We use the FDV0 table layout when we can use aligned table loads */
- #if GMX_SIMD_WIDTH_HERE == 4
- #define TAB_FDV0
- #endif
- #endif
- /* Decide the stride for the 2 LJ parameters */
- #ifdef GMX_X86_SSE2
- #ifdef GMX_DOUBLE
- #define NBFP_STRIDE  2
- #else
- #define NBFP_STRIDE  4
- #endif
- #else
- #if GMX_SIMD_WIDTH_HERE > 4
- #define NBFP_STRIDE  4
- #else
- #define NBFP_STRIDE  GMX_SIMD_WIDTH_HERE
- #endif
- #endif
- #include "nbnxn_kernel_simd_utils.h"
 +/*
 + * This file is part of the GROMACS molecular simulation package.
 + *
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2009, The GROMACS Development Team
 + * Copyright (c) 2012, by the GROMACS development team, led by
 + * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 + * others, as listed in the AUTHORS file in the top-level source
 + * directory and at http://www.gromacs.org.
 + *
 + * GROMACS is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2.1
 + * of the License, or (at your option) any later version.
 + *
 + * GROMACS is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with GROMACS; if not, see
 + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 + *
 + * If you want to redistribute modifications to GROMACS, please
 + * consider that scientific software is very special. Version
 + * control is crucial - bugs must be traceable. We will be happy to
 + * consider code for inclusion in the official distribution, but
 + * derived work must not be called official GROMACS. Details are found
 + * in the README & COPYING files - if they are missing, get the
 + * official version at http://www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org.
 + */
 +
-     int                 nbfp_stride;
 +/* All functionality defines are set here, except for:
 + * CALC_ENERGIES and ENERGY_GROUPS, which are defined before;
 + * CHECK_EXCLS, which is set just before including the inner-loop contents.
 + * The combination rule defines, LJ_COMB_GEOM or LJ_COMB_LB, are currently
 + * set before calling the kernel function. We might want to move that
 + * inside the n-loop and use a different combination rule for different
 + * ci's, as using no combination rule gives a 50% performance hit for LJ.
 + */
 +
 +/* We always calculate shift forces, because they are cheap anyway */
 +#define CALC_SHIFTFORCES
 +
 +/* Assumes all LJ parameters are identical */
 +/* #define FIX_LJ_C */
 +
 +/* The NBK_FUNC_NAME... macros below generate the whole zoo of kernel names
 + * with all combinations of electrostatics (coul), LJ combination rules (ljc)
 + * and energy calculations (ene), depending on the defines set.
 + */
 +
 +#define NBK_FUNC_NAME_C_LJC(base, coul, ljc, ene) base ## _ ## coul ## _comb_ ## ljc ## _ ## ene
 +
 +#if defined LJ_COMB_GEOM
 +#define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, geom, ene)
 +#else
 +#if defined LJ_COMB_LB
 +#define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, lb, ene)
 +#else
 +#define NBK_FUNC_NAME_C(base, coul, ene) NBK_FUNC_NAME_C_LJC(base, coul, none, ene)
 +#endif
 +#endif
 +
 +#ifdef CALC_COUL_RF
 +#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, rf, ene)
 +#endif
 +#ifdef CALC_COUL_TAB
 +#ifndef VDW_CUTOFF_CHECK
 +#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, tab, ene)
 +#else
 +#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, tab_twin, ene)
 +#endif
 +#endif
 +#ifdef CALC_COUL_EWALD
 +#ifndef VDW_CUTOFF_CHECK
 +#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, ewald, ene)
 +#else
 +#define NBK_FUNC_NAME(base, ene) NBK_FUNC_NAME_C(base, ewald_twin, ene)
 +#endif
 +#endif
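 +/* For example, with CALC_COUL_RF, LJ_COMB_GEOM and CALC_ENERGIES set
 + * (and ENERGY_GROUPS unset), the macros expand the function name below
 + * to nbnxn_kernel_simd_4xn_rf_comb_geom_ener.
 + */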
 +
 +static void
 +#ifndef CALC_ENERGIES
 +NBK_FUNC_NAME(nbnxn_kernel_simd_4xn, noener)
 +#else
 +#ifndef ENERGY_GROUPS
 +NBK_FUNC_NAME(nbnxn_kernel_simd_4xn, ener)
 +#else
 +NBK_FUNC_NAME(nbnxn_kernel_simd_4xn, energrp)
 +#endif
 +#endif
 +#undef NBK_FUNC_NAME
 +#undef NBK_FUNC_NAME_C
 +#undef NBK_FUNC_NAME_C_LJC
 +(const nbnxn_pairlist_t     *nbl,
 + const nbnxn_atomdata_t     *nbat,
 + const interaction_const_t  *ic,
 + rvec                       *shift_vec,
 + real                       *f
 +#ifdef CALC_SHIFTFORCES
 + ,
 + real                       *fshift
 +#endif
 +#ifdef CALC_ENERGIES
 + ,
 + real                       *Vvdw,
 + real                       *Vc
 +#endif
 +)
 +{
 +    const nbnxn_ci_t   *nbln;
 +    const nbnxn_cj_t   *l_cj;
 +    const int          *type;
 +    const real         *q;
 +    const real         *shiftvec;
 +    const real         *x;
 +    const real         *nbfp0, *nbfp1, *nbfp2 = NULL, *nbfp3 = NULL;
 +    real                facel;
 +    real               *nbfp_ptr;
-     unsigned   *excl_filter;
- #ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
-     gmx_epi32  filter_S0, filter_S1, filter_S2, filter_S3;
- #else
-     gmx_mm_pr  filter_S0, filter_S1, filter_S2, filter_S3;
- #endif
 +    int                 n, ci, ci_sh;
 +    int                 ish, ish3;
 +    gmx_bool            do_LJ, half_LJ, do_coul;
 +    int                 sci, scix, sciy, sciz, sci2;
 +    int                 cjind0, cjind1, cjind;
 +    int                 ip, jp;
 +
 +#ifdef ENERGY_GROUPS
 +    int         Vstride_i;
 +    int         egps_ishift, egps_imask;
 +    int         egps_jshift, egps_jmask, egps_jstride;
 +    int         egps_i;
 +    real       *vvdwtp[UNROLLI];
 +    real       *vctp[UNROLLI];
 +#endif
 +
 +    gmx_mm_pr  shX_S;
 +    gmx_mm_pr  shY_S;
 +    gmx_mm_pr  shZ_S;
 +    gmx_mm_pr  ix_S0, iy_S0, iz_S0;
 +    gmx_mm_pr  ix_S1, iy_S1, iz_S1;
 +    gmx_mm_pr  ix_S2, iy_S2, iz_S2;
 +    gmx_mm_pr  ix_S3, iy_S3, iz_S3;
 +    gmx_mm_pr  fix_S0, fiy_S0, fiz_S0;
 +    gmx_mm_pr  fix_S1, fiy_S1, fiz_S1;
 +    gmx_mm_pr  fix_S2, fiy_S2, fiz_S2;
 +    gmx_mm_pr  fix_S3, fiy_S3, fiz_S3;
 +#if UNROLLJ >= 4
 +    /* We use an i-force SIMD register width of 4 */
 +#if UNROLLJ == 4
 +#define gmx_mm_pr4     gmx_mm_pr
 +#define gmx_load_pr4   gmx_load_pr
 +#define gmx_store_pr4  gmx_store_pr
 +#define gmx_add_pr4    gmx_add_pr
 +#else
 +    /* The pr4 stuff is defined in nbnxn_kernel_simd_utils.h */
 +#endif
 +    gmx_mm_pr4 fix_S, fiy_S, fiz_S;
 +#else
 +    /* We use an i-force SIMD register width of 2 */
 +    gmx_mm_pr  fix0_S, fiy0_S, fiz0_S;
 +    gmx_mm_pr  fix2_S, fiy2_S, fiz2_S;
 +#endif
 +
 +    gmx_mm_pr  diagonal_jmi_S;
 +#if UNROLLI == UNROLLJ
 +    gmx_mm_pb  diagonal_mask_S0, diagonal_mask_S1, diagonal_mask_S2, diagonal_mask_S3;
 +#else
 +    gmx_mm_pb  diagonal_mask0_S0, diagonal_mask0_S1, diagonal_mask0_S2, diagonal_mask0_S3;
 +    gmx_mm_pb  diagonal_mask1_S0, diagonal_mask1_S1, diagonal_mask1_S2, diagonal_mask1_S3;
 +#endif
 +
- #if GMX_SIMD_WIDTH_HERE >= 8 || (defined GMX_DOUBLE && GMX_SIMD_WIDTH_HERE >= 4)
- #define STORE_TABLE_INDICES
- #endif
- #ifdef STORE_TABLE_INDICES
-     int        ti0_array[2*GMX_SIMD_WIDTH_HERE-1], *ti0;
-     int        ti1_array[2*GMX_SIMD_WIDTH_HERE-1], *ti1;
-     int        ti2_array[2*GMX_SIMD_WIDTH_HERE-1], *ti2;
-     int        ti3_array[2*GMX_SIMD_WIDTH_HERE-1], *ti3;
- #else
-     /* Table indices not used, but a function requires the argument */
-     int        *ti0 = NULL, *ti1 = NULL, *ti2 = NULL, *ti3 = NULL;
- #endif
++    unsigned      *exclusion_filter;
++    gmx_exclfilter filter_S0, filter_S1, filter_S2, filter_S3;
 +
 +    gmx_mm_pr  zero_S = gmx_set1_pr(0);
 +
 +    gmx_mm_pr  one_S  = gmx_set1_pr(1.0);
 +    gmx_mm_pr  iq_S0  = gmx_setzero_pr();
 +    gmx_mm_pr  iq_S1  = gmx_setzero_pr();
 +    gmx_mm_pr  iq_S2  = gmx_setzero_pr();
 +    gmx_mm_pr  iq_S3  = gmx_setzero_pr();
 +    gmx_mm_pr  mrc_3_S;
 +#ifdef CALC_ENERGIES
 +    gmx_mm_pr  hrc_3_S, moh_rc_S;
 +#endif
 +
 +#ifdef CALC_COUL_TAB
 +    /* Coulomb table variables */
 +    gmx_mm_pr   invtsp_S;
 +    const real *tab_coul_F;
 +#ifndef TAB_FDV0
 +    const real *tab_coul_V;
 +#endif
- #if NBFP_STRIDE == 2
-     nbfp_ptr    = nbat->nbfp;
- #else
- #if NBFP_STRIDE == 4
-     nbfp_ptr    = nbat->nbfp_s4;
- #else
- #error "Only NBFP_STRIDE 2 and 4 are currently supported"
- #endif
- #endif
-     nbfp_stride = NBFP_STRIDE;
++    /* Thread-local working buffers for force and potential lookups */
++    int        ti0_array[2*GMX_SIMD_WIDTH_HERE-1], *ti0 = NULL;
++    int        ti1_array[2*GMX_SIMD_WIDTH_HERE-1], *ti1 = NULL;
++    int        ti2_array[2*GMX_SIMD_WIDTH_HERE-1], *ti2 = NULL;
++    int        ti3_array[2*GMX_SIMD_WIDTH_HERE-1], *ti3 = NULL;
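++    /* Each array holds 2*GMX_SIMD_WIDTH_HERE-1 ints so that an aligned,
++     * SIMD-width window always fits inside it; prepare_table_load_buffer
++     * (from nbnxn_kernel_simd_utils.h) is expected to return that
++     * aligned pointer.
++     */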
 +#ifdef CALC_ENERGIES
 +    gmx_mm_pr  mhalfsp_S;
 +#endif
 +#endif
 +
 +#ifdef CALC_COUL_EWALD
 +    gmx_mm_pr beta2_S, beta_S;
 +#endif
 +
 +#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
 +    gmx_mm_pr  sh_ewald_S;
 +#endif
 +
 +#ifdef LJ_COMB_LB
 +    const real *ljc;
 +
 +    gmx_mm_pr   hsig_i_S0, seps_i_S0;
 +    gmx_mm_pr   hsig_i_S1, seps_i_S1;
 +    gmx_mm_pr   hsig_i_S2, seps_i_S2;
 +    gmx_mm_pr   hsig_i_S3, seps_i_S3;
 +#else
 +#ifdef FIX_LJ_C
 +    real        pvdw_array[2*UNROLLI*UNROLLJ+3];
 +    real       *pvdw_c6, *pvdw_c12;
 +    gmx_mm_pr   c6_S0, c12_S0;
 +    gmx_mm_pr   c6_S1, c12_S1;
 +    gmx_mm_pr   c6_S2, c12_S2;
 +    gmx_mm_pr   c6_S3, c12_S3;
 +#endif
 +
 +#ifdef LJ_COMB_GEOM
 +    const real *ljc;
 +
 +    gmx_mm_pr   c6s_S0, c12s_S0;
 +    gmx_mm_pr   c6s_S1, c12s_S1;
 +    gmx_mm_pr   c6s_S2 = gmx_setzero_pr(), c12s_S2 = gmx_setzero_pr();
 +    gmx_mm_pr   c6s_S3 = gmx_setzero_pr(), c12s_S3 = gmx_setzero_pr();
 +#endif
 +#endif /* LJ_COMB_LB */
 +
 +    gmx_mm_pr  vctot_S, Vvdwtot_S;
 +    gmx_mm_pr  sixth_S, twelveth_S;
 +
 +    gmx_mm_pr  avoid_sing_S;
 +    gmx_mm_pr  rc2_S;
 +#ifdef VDW_CUTOFF_CHECK
 +    gmx_mm_pr  rcvdw2_S;
 +#endif
 +
 +#ifdef CALC_ENERGIES
 +    gmx_mm_pr  sh_invrc6_S, sh_invrc12_S;
 +
 +    /* cppcheck-suppress unassignedVariable */
 +    real       tmpsum_array[15], *tmpsum;
 +#endif
 +#ifdef CALC_SHIFTFORCES
 +    /* cppcheck-suppress unassignedVariable */
 +    real       shf_array[15], *shf;
 +#endif
 +
 +    int ninner;
 +
 +#ifdef COUNT_PAIRS
 +    int npair = 0;
 +#endif
 +
 +#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
 +    ljc = nbat->lj_comb;
 +#else
 +    /* No combination rule used */
-     /* Load masks for topology exclusion masking */
- #ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
- #define FILTER_STRIDE  (GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE)
- #else
- #ifdef GMX_DOUBLE
- #define FILTER_STRIDE  2
- #else
- #define FILTER_STRIDE  1
- #endif
- #endif
- #if FILTER_STRIDE == 1
-     excl_filter = nbat->simd_exclusion_filter1;
- #else
-     excl_filter = nbat->simd_exclusion_filter2;
- #endif
++    nbfp_ptr    = (4 == nbfp_stride) ? nbat->nbfp_s4 : nbat->nbfp;
 +#endif
 +
 +    /* Load j-i for the first i */
 +    diagonal_jmi_S    = gmx_load_pr(nbat->simd_4xn_diagonal_j_minus_i);
 +    /* Generate all the diagonal masks as comparison results */
 +#if UNROLLI == UNROLLJ
 +    diagonal_mask_S0  = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 +    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 +    diagonal_mask_S1  = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 +    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 +    diagonal_mask_S2  = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 +    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 +    diagonal_mask_S3  = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 +#else
 +#if UNROLLI == 2*UNROLLJ || 2*UNROLLI == UNROLLJ
 +    diagonal_mask0_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 +    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 +    diagonal_mask0_S1 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 +    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 +    diagonal_mask0_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 +    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 +    diagonal_mask0_S3 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 +    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 +
 +#if UNROLLI == 2*UNROLLJ
 +    /* Load j-i for the second half of the j-cluster */
 +    diagonal_jmi_S    = gmx_load_pr(nbat->simd_4xn_diagonal_j_minus_i + UNROLLJ);
 +#endif
 +
 +    diagonal_mask1_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 +    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 +    diagonal_mask1_S1 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 +    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 +    diagonal_mask1_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 +    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 +    diagonal_mask1_S3 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 +#endif
 +#endif
 +
- #ifdef GMX_SIMD_HAVE_CHECKBITMASK_EPI32
-     filter_S0 = gmx_load_si((int *)excl_filter + 0*UNROLLJ*FILTER_STRIDE);
-     filter_S1 = gmx_load_si((int *)excl_filter + 1*UNROLLJ*FILTER_STRIDE);
-     filter_S2 = gmx_load_si((int *)excl_filter + 2*UNROLLJ*FILTER_STRIDE);
-     filter_S3 = gmx_load_si((int *)excl_filter + 3*UNROLLJ*FILTER_STRIDE);
- #else
-     filter_S0 = gmx_load_pr((real *)excl_filter + 0*UNROLLJ);
-     filter_S1 = gmx_load_pr((real *)excl_filter + 1*UNROLLJ);
-     filter_S2 = gmx_load_pr((real *)excl_filter + 2*UNROLLJ);
-     filter_S3 = gmx_load_pr((real *)excl_filter + 3*UNROLLJ);
- #endif
- #undef FILTER_STRIDE
++    /* Load masks for topology exclusion masking. filter_stride is
++       static const, so the conditional will be optimized away. */
++    if (1 == filter_stride)
++    {
++        exclusion_filter = nbat->simd_exclusion_filter1;
++    }
++    else /* (2 == filter_stride) */
++    {
++        exclusion_filter = nbat->simd_exclusion_filter2;
++    }
++
 +    /* Here we cast the exclusion filters from unsigned * to int * or real *.
 +     * Since we only check bits, the actual value they represent does not
 +     * matter, as long as both filter and mask data are treated the same way.
 +     */
- #ifdef STORE_TABLE_INDICES
++    filter_S0    = gmx_load_exclusion_filter(exclusion_filter + 0*UNROLLJ*filter_stride);
++    filter_S1    = gmx_load_exclusion_filter(exclusion_filter + 1*UNROLLJ*filter_stride);
++    filter_S2    = gmx_load_exclusion_filter(exclusion_filter + 2*UNROLLJ*filter_stride);
++    filter_S3    = gmx_load_exclusion_filter(exclusion_filter + 3*UNROLLJ*filter_stride);
 +
 +#ifdef CALC_COUL_TAB
-     ti0 = gmx_simd_align_int(ti0_array);
-     ti1 = gmx_simd_align_int(ti1_array);
-     ti2 = gmx_simd_align_int(ti2_array);
-     ti3 = gmx_simd_align_int(ti3_array);
- #endif
 +    /* Generate aligned table index pointers */
- #undef STORE_TABLE_INDICES
++    ti0 = prepare_table_load_buffer(ti0_array);
++    ti1 = prepare_table_load_buffer(ti1_array);
++    ti2 = prepare_table_load_buffer(ti2_array);
++    ti3 = prepare_table_load_buffer(ti3_array);
 +
 +    invtsp_S  = gmx_set1_pr(ic->tabq_scale);
 +#ifdef CALC_ENERGIES
 +    mhalfsp_S = gmx_set1_pr(-0.5/ic->tabq_scale);
 +#endif
 +
 +#ifdef TAB_FDV0
 +    tab_coul_F = ic->tabq_coul_FDV0;
 +#else
 +    tab_coul_F = ic->tabq_coul_F;
 +    tab_coul_V = ic->tabq_coul_V;
 +#endif
 +#endif /* CALC_COUL_TAB */
 +
 +#ifdef CALC_COUL_EWALD
 +    beta2_S = gmx_set1_pr(ic->ewaldcoeff*ic->ewaldcoeff);
 +    beta_S  = gmx_set1_pr(ic->ewaldcoeff);
 +#endif
 +
 +#if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
 +    sh_ewald_S = gmx_set1_pr(ic->sh_ewald);
 +#endif
 +
 +    q                   = nbat->q;
 +    type                = nbat->type;
 +    facel               = ic->epsfac;
 +    shiftvec            = shift_vec[0];
 +    x                   = nbat->x;
 +
 +    avoid_sing_S = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);
 +
 +    /* The kernel supports rcoulomb = rvdw or, with VDW_CUTOFF_CHECK, rcoulomb >= rvdw */
 +    rc2_S    = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
 +#ifdef VDW_CUTOFF_CHECK
 +    rcvdw2_S = gmx_set1_pr(ic->rvdw*ic->rvdw);
 +#endif
 +
 +#ifdef CALC_ENERGIES
 +    sixth_S      = gmx_set1_pr(1.0/6.0);
 +    twelveth_S   = gmx_set1_pr(1.0/12.0);
 +
 +    sh_invrc6_S  = gmx_set1_pr(ic->sh_invrc6);
 +    sh_invrc12_S = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
 +#endif
 +
 +    mrc_3_S  = gmx_set1_pr(-2*ic->k_rf);
 +
 +#ifdef CALC_ENERGIES
 +    hrc_3_S  = gmx_set1_pr(ic->k_rf);
 +
 +    moh_rc_S = gmx_set1_pr(-ic->c_rf);
 +#endif
 +
 +#ifdef CALC_ENERGIES
 +    tmpsum   = gmx_simd_align_real(tmpsum_array);
 +#endif
 +#ifdef CALC_SHIFTFORCES
 +    shf      = gmx_simd_align_real(shf_array);
 +#endif
 +
 +#ifdef FIX_LJ_C
 +    pvdw_c6  = gmx_simd_align_real(pvdw_array+3);
 +    pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
 +
 +    for (jp = 0; jp < UNROLLJ; jp++)
 +    {
 +        pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
 +        pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
 +        pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
 +        pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];
 +
 +        pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
 +        pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
 +        pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
 +        pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
 +    }
 +    c6_S0            = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
 +    c6_S1            = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
 +    c6_S2            = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
 +    c6_S3            = gmx_load_pr(pvdw_c6 +3*UNROLLJ);
 +
 +    c12_S0           = gmx_load_pr(pvdw_c12+0*UNROLLJ);
 +    c12_S1           = gmx_load_pr(pvdw_c12+1*UNROLLJ);
 +    c12_S2           = gmx_load_pr(pvdw_c12+2*UNROLLJ);
 +    c12_S3           = gmx_load_pr(pvdw_c12+3*UNROLLJ);
 +#endif /* FIX_LJ_C */
 +
 +#ifdef ENERGY_GROUPS
 +    egps_ishift  = nbat->neg_2log;
 +    egps_imask   = (1<<egps_ishift) - 1;
 +    egps_jshift  = 2*nbat->neg_2log;
 +    egps_jmask   = (1<<egps_jshift) - 1;
 +    egps_jstride = (UNROLLJ>>1)*UNROLLJ;
 +    /* Major division is over i-particle energy groups; determine the stride */
 +    Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
 +#endif
 +
 +    l_cj = nbl->cj;
 +
 +    ninner = 0;
 +    for (n = 0; n < nbl->nci; n++)
 +    {
 +        nbln = &nbl->ci[n];
 +
 +        ish              = (nbln->shift & NBNXN_CI_SHIFT);
 +        ish3             = ish*3;
 +        cjind0           = nbln->cj_ind_start;
 +        cjind1           = nbln->cj_ind_end;
 +        ci               = nbln->ci;
 +        ci_sh            = (ish == CENTRAL ? ci : -1);
 +
 +        shX_S = gmx_load1_pr(shiftvec+ish3);
 +        shY_S = gmx_load1_pr(shiftvec+ish3+1);
 +        shZ_S = gmx_load1_pr(shiftvec+ish3+2);
 +
 +#if UNROLLJ <= 4
 +        sci              = ci*STRIDE;
 +        scix             = sci*DIM;
 +        sci2             = sci*2;
 +#else
 +        sci              = (ci>>1)*STRIDE;
 +        scix             = sci*DIM + (ci & 1)*(STRIDE>>1);
 +        sci2             = sci*2 + (ci & 1)*(STRIDE>>1);
 +        sci             += (ci & 1)*(STRIDE>>1);
 +#endif
 +
 +        /* We have 5 LJ/C combinations, but use only three inner loops,
 +         * as the other combinations are unlikely and/or not much faster:
 +         * inner half-LJ + C for half-LJ + C / no-LJ + C
 +         * inner LJ + C      for full-LJ + C
 +         * inner LJ          for full-LJ + no-C / half-LJ + no-C
 +         */
 +        do_LJ   = (nbln->shift & NBNXN_CI_DO_LJ(0));
 +        do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
 +        half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
 +
 +#ifdef ENERGY_GROUPS
 +        egps_i = nbat->energrp[ci];
 +        {
 +            int ia, egp_ia;
 +
 +            for (ia = 0; ia < UNROLLI; ia++)
 +            {
 +                egp_ia     = (egps_i >> (ia*egps_ishift)) & egps_imask;
 +                vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
 +                vctp[ia]   = Vc   + egp_ia*Vstride_i;
 +            }
 +        }
 +#endif
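 +        /* Self-energy correction (sketch of the logic): the block below
 +         * is entered only when the first j-cluster in the list is the
 +         * diagonal one, so the Coulomb self-energy term
 +         * facel*qi*qi*Vc_sub_self (e.g. beta/sqrt(pi) per unit charge
 +         * squared for Ewald) is subtracted exactly once per i atom.
 +         */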
 +#if defined CALC_ENERGIES
 +#if UNROLLJ == 4
 +        if (do_coul && l_cj[nbln->cj_ind_start].cj == ci_sh)
 +#endif
 +#if UNROLLJ == 2
 +        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh<<1))
 +#endif
 +#if UNROLLJ == 8
 +        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))
 +#endif
 +        {
 +            int  ia;
 +            real Vc_sub_self;
 +
 +#ifdef CALC_COUL_RF
 +            Vc_sub_self = 0.5*ic->c_rf;
 +#endif
 +#ifdef CALC_COUL_TAB
 +#ifdef TAB_FDV0
 +            Vc_sub_self = 0.5*tab_coul_F[2];
 +#else
 +            Vc_sub_self = 0.5*tab_coul_V[0];
 +#endif
 +#endif
 +#ifdef CALC_COUL_EWALD
 +            /* beta/sqrt(pi) */
 +            Vc_sub_self = 0.5*ic->ewaldcoeff*M_2_SQRTPI;
 +#endif
 +
 +            for (ia = 0; ia < UNROLLI; ia++)
 +            {
 +                real qi;
 +
 +                qi = q[sci+ia];
 +#ifdef ENERGY_GROUPS
 +                vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
 +#else
 +                Vc[0]
 +#endif
 +                    -= facel*qi*qi*Vc_sub_self;
 +            }
 +        }
 +#endif
 +
 +        /* Load i atom data */
 +        sciy             = scix + STRIDE;
 +        sciz             = sciy + STRIDE;
 +        ix_S0          = gmx_add_pr(gmx_load1_pr(x+scix), shX_S);
 +        ix_S1          = gmx_add_pr(gmx_load1_pr(x+scix+1), shX_S);
 +        ix_S2          = gmx_add_pr(gmx_load1_pr(x+scix+2), shX_S);
 +        ix_S3          = gmx_add_pr(gmx_load1_pr(x+scix+3), shX_S);
 +        iy_S0          = gmx_add_pr(gmx_load1_pr(x+sciy), shY_S);
 +        iy_S1          = gmx_add_pr(gmx_load1_pr(x+sciy+1), shY_S);
 +        iy_S2          = gmx_add_pr(gmx_load1_pr(x+sciy+2), shY_S);
 +        iy_S3          = gmx_add_pr(gmx_load1_pr(x+sciy+3), shY_S);
 +        iz_S0          = gmx_add_pr(gmx_load1_pr(x+sciz), shZ_S);
 +        iz_S1          = gmx_add_pr(gmx_load1_pr(x+sciz+1), shZ_S);
 +        iz_S2          = gmx_add_pr(gmx_load1_pr(x+sciz+2), shZ_S);
 +        iz_S3          = gmx_add_pr(gmx_load1_pr(x+sciz+3), shZ_S);
 +
 +        if (do_coul)
 +        {
 +            iq_S0      = gmx_set1_pr(facel*q[sci]);
 +            iq_S1      = gmx_set1_pr(facel*q[sci+1]);
 +            iq_S2      = gmx_set1_pr(facel*q[sci+2]);
 +            iq_S3      = gmx_set1_pr(facel*q[sci+3]);
 +        }
 +
 +#ifdef LJ_COMB_LB
 +        hsig_i_S0      = gmx_load1_pr(ljc+sci2+0);
 +        hsig_i_S1      = gmx_load1_pr(ljc+sci2+1);
 +        hsig_i_S2      = gmx_load1_pr(ljc+sci2+2);
 +        hsig_i_S3      = gmx_load1_pr(ljc+sci2+3);
 +        seps_i_S0      = gmx_load1_pr(ljc+sci2+STRIDE+0);
 +        seps_i_S1      = gmx_load1_pr(ljc+sci2+STRIDE+1);
 +        seps_i_S2      = gmx_load1_pr(ljc+sci2+STRIDE+2);
 +        seps_i_S3      = gmx_load1_pr(ljc+sci2+STRIDE+3);
 +#else
 +#ifdef LJ_COMB_GEOM
 +        c6s_S0         = gmx_load1_pr(ljc+sci2+0);
 +        c6s_S1         = gmx_load1_pr(ljc+sci2+1);
 +        if (!half_LJ)
 +        {
 +            c6s_S2     = gmx_load1_pr(ljc+sci2+2);
 +            c6s_S3     = gmx_load1_pr(ljc+sci2+3);
 +        }
 +        c12s_S0        = gmx_load1_pr(ljc+sci2+STRIDE+0);
 +        c12s_S1        = gmx_load1_pr(ljc+sci2+STRIDE+1);
 +        if (!half_LJ)
 +        {
 +            c12s_S2    = gmx_load1_pr(ljc+sci2+STRIDE+2);
 +            c12s_S3    = gmx_load1_pr(ljc+sci2+STRIDE+3);
 +        }
 +#else
 +        nbfp0     = nbfp_ptr + type[sci  ]*nbat->ntype*nbfp_stride;
 +        nbfp1     = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;
 +        if (!half_LJ)
 +        {
 +            nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
 +            nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;
 +        }
 +#endif
 +#endif
 +
 +        /* Zero the potential energy for this list */
 +        Vvdwtot_S        = gmx_setzero_pr();
 +        vctot_S          = gmx_setzero_pr();
 +
 +        /* Clear i atom forces */
 +        fix_S0           = gmx_setzero_pr();
 +        fix_S1           = gmx_setzero_pr();
 +        fix_S2           = gmx_setzero_pr();
 +        fix_S3           = gmx_setzero_pr();
 +        fiy_S0           = gmx_setzero_pr();
 +        fiy_S1           = gmx_setzero_pr();
 +        fiy_S2           = gmx_setzero_pr();
 +        fiy_S3           = gmx_setzero_pr();
 +        fiz_S0           = gmx_setzero_pr();
 +        fiz_S1           = gmx_setzero_pr();
 +        fiz_S2           = gmx_setzero_pr();
 +        fiz_S3           = gmx_setzero_pr();
 +
 +        cjind = cjind0;
 +
 +        /* Currently all kernels use (at least half) LJ */
 +#define CALC_LJ
 +        if (half_LJ)
 +        {
 +#define CALC_COULOMB
 +#define HALF_LJ
 +#define CHECK_EXCLS
 +            while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
 +            {
 +#include "nbnxn_kernel_simd_4xn_inner.h"
 +                cjind++;
 +            }
 +#undef CHECK_EXCLS
 +            for (; (cjind < cjind1); cjind++)
 +            {
 +#include "nbnxn_kernel_simd_4xn_inner.h"
 +            }
 +#undef HALF_LJ
 +#undef CALC_COULOMB
 +        }
 +        else if (do_coul)
 +        {
 +#define CALC_COULOMB
 +#define CHECK_EXCLS
 +            while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
 +            {
 +#include "nbnxn_kernel_simd_4xn_inner.h"
 +                cjind++;
 +            }
 +#undef CHECK_EXCLS
 +            for (; (cjind < cjind1); cjind++)
 +            {
 +#include "nbnxn_kernel_simd_4xn_inner.h"
 +            }
 +#undef CALC_COULOMB
 +        }
 +        else
 +        {
 +#define CHECK_EXCLS
 +            while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
 +            {
 +#include "nbnxn_kernel_simd_4xn_inner.h"
 +                cjind++;
 +            }
 +#undef CHECK_EXCLS
 +            for (; (cjind < cjind1); cjind++)
 +            {
 +#include "nbnxn_kernel_simd_4xn_inner.h"
 +            }
 +        }
 +#undef CALC_LJ
 +        ninner += cjind1 - cjind0;
 +
 +        /* Add accumulated i-forces to the force array */
 +#if UNROLLJ >= 4
 +        fix_S = gmx_mm_transpose_sum4_pr(fix_S0, fix_S1, fix_S2, fix_S3);
 +        gmx_store_pr4(f+scix, gmx_add_pr4(fix_S, gmx_load_pr4(f+scix)));
 +
 +        fiy_S = gmx_mm_transpose_sum4_pr(fiy_S0, fiy_S1, fiy_S2, fiy_S3);
 +        gmx_store_pr4(f+sciy, gmx_add_pr4(fiy_S, gmx_load_pr4(f+sciy)));
 +
 +        fiz_S = gmx_mm_transpose_sum4_pr(fiz_S0, fiz_S1, fiz_S2, fiz_S3);
 +        gmx_store_pr4(f+sciz, gmx_add_pr4(fiz_S, gmx_load_pr4(f+sciz)));
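 +        /* gmx_mm_transpose_sum4_pr reduces each set of four row
 +         * accumulators (one per i atom, spread over the j lanes) to a
 +         * single 4-wide vector with the per-atom totals, added to f[]
 +         * above.
 +         */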
 +
 +#ifdef CALC_SHIFTFORCES
 +        gmx_store_pr4(shf, fix_S);
 +        fshift[ish3+0] += SUM_SIMD4(shf);
 +        gmx_store_pr4(shf, fiy_S);
 +        fshift[ish3+1] += SUM_SIMD4(shf);
 +        gmx_store_pr4(shf, fiz_S);
 +        fshift[ish3+2] += SUM_SIMD4(shf);
 +#endif
 +#else
 +        fix0_S = gmx_mm_transpose_sum2_pr(fix_S0, fix_S1);
 +        gmx_store_pr(f+scix, gmx_add_pr(fix0_S, gmx_load_pr(f+scix)));
 +        fix2_S = gmx_mm_transpose_sum2_pr(fix_S2, fix_S3);
 +        gmx_store_pr(f+scix+2, gmx_add_pr(fix2_S, gmx_load_pr(f+scix+2)));
 +
 +        fiy0_S = gmx_mm_transpose_sum2_pr(fiy_S0, fiy_S1);
 +        gmx_store_pr(f+sciy, gmx_add_pr(fiy0_S, gmx_load_pr(f+sciy)));
 +        fiy2_S = gmx_mm_transpose_sum2_pr(fiy_S2, fiy_S3);
 +        gmx_store_pr(f+sciy+2, gmx_add_pr(fiy2_S, gmx_load_pr(f+sciy+2)));
 +
 +        fiz0_S = gmx_mm_transpose_sum2_pr(fiz_S0, fiz_S1);
 +        gmx_store_pr(f+sciz, gmx_add_pr(fiz0_S, gmx_load_pr(f+sciz)));
 +        fiz2_S = gmx_mm_transpose_sum2_pr(fiz_S2, fiz_S3);
 +        gmx_store_pr(f+sciz+2, gmx_add_pr(fiz2_S, gmx_load_pr(f+sciz+2)));
 +
 +#ifdef CALC_SHIFTFORCES
 +        gmx_store_pr(shf, gmx_add_pr(fix0_S, fix2_S));
 +        fshift[ish3+0] += shf[0] + shf[1];
 +        gmx_store_pr(shf, gmx_add_pr(fiy0_S, fiy2_S));
 +        fshift[ish3+1] += shf[0] + shf[1];
 +        gmx_store_pr(shf, gmx_add_pr(fiz0_S, fiz2_S));
 +        fshift[ish3+2] += shf[0] + shf[1];
 +#endif
 +#endif
 +
 +#ifdef CALC_ENERGIES
 +        if (do_coul)
 +        {
 +            gmx_store_pr(tmpsum, vctot_S);
 +            *Vc += SUM_SIMD(tmpsum);
 +        }
 +
 +        gmx_store_pr(tmpsum, Vvdwtot_S);
 +        *Vvdw += SUM_SIMD(tmpsum);
 +#endif
 +
 +        /* Outer loop uses 6 flops/iteration */
 +    }
 +
 +#ifdef COUNT_PAIRS
 +    printf("atom pairs %d\n", npair);
 +#endif
 +}
 +
 +
 +#if UNROLLJ == 4
 +#undef gmx_mm_pr4
 +#undef gmx_load_pr4
 +#undef gmx_store_pr4
 +#endif
 +
- #undef UNROLLI
- #undef UNROLLJ
- #undef STRIDE
- #undef TAB_FDV0
- #undef NBFP_STRIDE
- #undef GMX_USE_HALF_WIDTH_SIMD_HERE
 +#undef CALC_SHIFTFORCES
index 4ad646534d521fa981ea90ec447c2e11ddec3590,0000000000000000000000000000000000000000..37cf50f093694e7cfa741a92aa2385df2915cdec
mode 100644,000000..100644
--- /dev/null
@@@ -1,121 -1,0 +1,189 @@@
- #ifndef _nbnxn_kernel_sse_utils_h_
- #define _nbnxn_kernel_sse_utils_h_
 +/*
 + * This file is part of the GROMACS molecular simulation package.
 + *
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS Development Team
 + * Copyright (c) 2012, by the GROMACS development team, led by
 + * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 + * others, as listed in the AUTHORS file in the top-level source
 + * directory and at http://www.gromacs.org.
 + *
 + * GROMACS is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2.1
 + * of the License, or (at your option) any later version.
 + *
 + * GROMACS is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with GROMACS; if not, see
 + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 + *
 + * If you want to redistribute modifications to GROMACS, please
 + * consider that scientific software is very special. Version
 + * control is crucial - bugs must be traceable. We will be happy to
 + * consider code for inclusion in the official distribution, but
 + * derived work must not be called official GROMACS. Details are found
 + * in the README & COPYING files - if they are missing, get the
 + * official version at http://www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org.
 + */
- /* This files contains all functions/macros for the SIMD kernels
-  * which have explicit dependencies on the j-cluster size and/or SIMD-width.
++#ifndef _nbnxn_kernel_simd_utils_h_
++#define _nbnxn_kernel_simd_utils_h_
 +
- /* Include SIMD architecture specific versions of the 4/5 functions above */
++/*! \brief Provides hardware-specific utility routines for the SIMD kernels.
++ *
++ * Defines all functions, typedefs, constants and macros that have
++ * explicit dependencies on the j-cluster size, precision, or SIMD
++ * width. This includes handling diagonal, Newton and topology
++ * exclusions.
++ *
 + * The functionality which depends on the j-cluster size is:
 + *   LJ-parameter lookup
 + *   force table lookup
 + *   energy group pair energy storage
 + */
 +
++#if !defined GMX_NBNXN_SIMD_2XNN && !defined GMX_NBNXN_SIMD_4XN
++#error "Must define an NBNxN kernel flavour before including NBNxN kernel utility functions"
++#endif
 +
- #else
 +#ifdef GMX_SIMD_REFERENCE_PLAIN_C
++
 +#include "nbnxn_kernel_simd_utils_ref.h"
- #else
++
++#else /* GMX_SIMD_REFERENCE_PLAIN_C */
++
 +#ifdef GMX_X86_SSE2
 +/* Include x86 SSE2 compatible SIMD functions */
++
++/* Set the stride for the lookup of the two LJ parameters from their
++   (padded) array. Only strides of 2 and 4 are currently supported. */
++#if defined GMX_NBNXN_SIMD_2XNN
++static const int nbfp_stride = 4;
++#elif defined GMX_DOUBLE
++static const int nbfp_stride = 2;
++#else
++static const int nbfp_stride = 4;
++#endif
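++/* A sketch of the assumed layout: with nbfp_stride 4, the C6/C12 pair
++ * for atom types (ti,tj) starts at nbfp[(ti*ntype + tj)*nbfp_stride],
++ * the remaining slots of each entry being alignment padding.
++ */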
++
++/* Align a stack-based thread-local working array. Table loads on
++ * full-width AVX_256 use the array, but other implementations do
++ * not. */
++static gmx_inline int *
++prepare_table_load_buffer(const int *array)
++{
 +#if defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE
++    return gmx_simd_align_int(array);
++#else
++    return NULL;
++#endif
++}
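++/* A usage sketch; the array name and over-allocation are illustrative
++ * only:
++ *
++ *     int  ti_array[2*GMX_SIMD_WIDTH_HERE];
++ *     int *ti = prepare_table_load_buffer(ti_array);
++ *
++ * With full-width AVX-256, ti points to an aligned region inside
++ * ti_array; otherwise the NULL return tells the kernel the buffer
++ * is not needed.
++ */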
++
++#if defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE
++
++/* With full AVX-256 SIMD, half SIMD-width table loads are optimal */
++#if GMX_SIMD_WIDTH_HERE == 8
++#define TAB_FDV0
++#endif
++
++/*
++Berk, 2xnn.c had the following code, but I think it is safe to remove now, given the code immediately above.
++
++#if defined GMX_X86_AVX_256 && !defined GMX_DOUBLE
++/ * AVX-256 single precision 2x(4+4) kernel,
++ * we can do half SIMD-width aligned FDV0 table loads.
++ * /
++#define TAB_FDV0
++#endif
++*/
++
 +#ifdef GMX_DOUBLE
 +#include "nbnxn_kernel_simd_utils_x86_256d.h"
- #else
++#else  /* GMX_DOUBLE */
 +#include "nbnxn_kernel_simd_utils_x86_256s.h"
++#endif /* GMX_DOUBLE */
++
++#else  /* defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE */
++
++/* We use the FDV0 table layout when we can use aligned table loads */
++#if GMX_SIMD_WIDTH_HERE == 4
++#define TAB_FDV0
 +#endif
- #else
++
 +#ifdef GMX_DOUBLE
 +#include "nbnxn_kernel_simd_utils_x86_128d.h"
- #endif
- #endif
- #endif
++#else  /* GMX_DOUBLE */
 +#include "nbnxn_kernel_simd_utils_x86_128s.h"
++#endif /* GMX_DOUBLE */
++
++#endif /* defined GMX_X86_AVX_256 && !defined GMX_USE_HALF_WIDTH_SIMD_HERE */
++
++#else  /* GMX_X86_SSE2 */
++
++#if GMX_SIMD_WIDTH_HERE > 4
++static const int nbfp_stride = 4;
++#else
++static const int nbfp_stride = GMX_SIMD_WIDTH_HERE;
 +#endif
- #endif /* _nbnxn_kernel_sse_utils_h_ */
++
++#endif /* GMX_X86_SSE2 */
++#endif /* GMX_SIMD_REFERENCE_PLAIN_C */
 +
 +
 +#ifdef UNROLLJ
 +/* Add energy register to possibly multiple terms in the energy array */
 +static inline void add_ener_grp(gmx_mm_pr e_S, real *v, const int *offset_jj)
 +{
 +    int jj;
 +
 +    /* We need to balance the number of store operations with
 +     * the rapidly increasing number of combinations of energy groups.
 +     * We add to a temporary buffer for 1 i-group vs 2 j-groups.
 +     */
 +    for (jj = 0; jj < (UNROLLJ/2); jj++)
 +    {
 +        gmx_mm_pr v_S;
 +
 +        v_S = gmx_load_pr(v+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE);
 +        gmx_store_pr(v+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE, gmx_add_pr(v_S, e_S));
 +    }
 +}
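 +/* Note that the same register is added at UNROLLJ/2 offsets, one per
 + * 2-j-atom group; presumably the buffer is reduced to the actual
 + * energy-group pair terms later, which is cheaper than extracting
 + * single lanes here.
 + */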
 +#endif
 +
 +#if defined GMX_NBNXN_SIMD_2XNN && defined UNROLLJ
 +/* As add_ener_grp, but for two groups of UNROLLJ/2 stored in
 + * a single SIMD register.
 + */
 +static inline void
 +add_ener_grp_halves(gmx_mm_pr e_S, real *v0, real *v1, const int *offset_jj)
 +{
 +    gmx_mm_hpr e_S0, e_S1;
 +    int        jj;
 +
 +    gmx_pr_to_2hpr(e_S, &e_S0, &e_S1);
 +
 +    for (jj = 0; jj < (UNROLLJ/2); jj++)
 +    {
 +        gmx_mm_hpr v_S;
 +
 +        gmx_load_hpr(&v_S, v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
 +        gmx_store_hpr(v0+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2, gmx_add_hpr(v_S, e_S0));
 +    }
 +    for (jj = 0; jj < (UNROLLJ/2); jj++)
 +    {
 +        gmx_mm_hpr v_S;
 +
 +        gmx_load_hpr(&v_S, v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2);
 +        gmx_store_hpr(v1+offset_jj[jj]+jj*GMX_SIMD_WIDTH_HERE/2, gmx_add_hpr(v_S, e_S1));
 +    }
 +}
 +#endif
 +
++#endif /* _nbnxn_kernel_simd_utils_h_ */
index bf33c95c31cee2ed810195eae26be3ea72b3db6c,0000000000000000000000000000000000000000..d4155ebdbd6456d72996f8e26cfc2eae3a110b0d
mode 100644,000000..100644
--- /dev/null
@@@ -1,5164 -1,0 +1,5184 @@@
- /* Pair search box lower and upper corner in x,y,z.
-  * Store this in 4 iso 3 reals, which is useful with SSE.
-  * To avoid complicating the code we also use 4 without SSE.
-  */
- #define NNBSBB_C         4
- #define NNBSBB_B         (2*NNBSBB_C)
- /* Pair search box lower and upper bound in z only. */
- #define NNBSBB_D         2
- /* Pair search box lower and upper corner x,y,z indices */
- #define BBL_X  0
- #define BBL_Y  1
- #define BBL_Z  2
- #define BBU_X  4
- #define BBU_Y  5
- #define BBU_Z  6
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + */
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +#include <string.h>
 +#include "sysstuff.h"
 +#include "smalloc.h"
 +#include "macros.h"
 +#include "maths.h"
 +#include "vec.h"
 +#include "pbc.h"
 +#include "nbnxn_consts.h"
 +/* nbnxn_internal.h includes gmx_simd_macros.h */
 +#include "nbnxn_internal.h"
 +#ifdef GMX_NBNXN_SIMD
 +#include "gmx_simd_vec.h"
 +#endif
 +#include "nbnxn_atomdata.h"
 +#include "nbnxn_search.h"
 +#include "gmx_cyclecounter.h"
 +#include "gmxfio.h"
 +#include "gmx_omp_nthreads.h"
 +#include "nrnb.h"
 +
 +
-         int bb_nalloc;
 +#ifdef NBNXN_SEARCH_BB_SSE
 +/* We use SSE or AVX-128bit for bounding box calculations */
 +
 +#ifndef GMX_DOUBLE
 +/* Single precision BBs + coordinates, we can also load coordinates using SSE */
 +#define NBNXN_SEARCH_SSE_SINGLE
 +#endif
 +
 +/* Include basic SSE2 stuff */
 +#include <emmintrin.h>
 +
 +#if defined NBNXN_SEARCH_SSE_SINGLE && (GPU_NSUBCELL == 4 || GPU_NSUBCELL == 8)
 +/* Store bounding boxes with x, y and z coordinates in packs of 4 */
 +#define NBNXN_PBB_SSE
 +#endif
 +
 +/* The width of SSE/AVX128 with single precision for bounding boxes with GPU.
 + * Here AVX-256 turns out to be slightly slower than AVX-128.
 + */
 +#define STRIDE_PBB        4
 +#define STRIDE_PBB_2LOG   2
 +
 +#endif /* NBNXN_SEARCH_BB_SSE */
 +
 +#ifdef GMX_NBNXN_SIMD
 +
 +/* The functions below are macros as they are performance sensitive */
 +
 +/* 4x4 list, pack=4: no complex conversion required */
 +/* i-cluster to j-cluster conversion */
 +#define CI_TO_CJ_J4(ci)   (ci)
 +/* cluster index to coordinate array index conversion */
 +#define X_IND_CI_J4(ci)  ((ci)*STRIDE_P4)
 +#define X_IND_CJ_J4(cj)  ((cj)*STRIDE_P4)
 +
 +/* 4x2 list, pack=4: j-cluster size is half the packing width */
 +/* i-cluster to j-cluster conversion */
 +#define CI_TO_CJ_J2(ci)  ((ci)<<1)
 +/* cluster index to coordinate array index conversion */
 +#define X_IND_CI_J2(ci)  ((ci)*STRIDE_P4)
 +#define X_IND_CJ_J2(cj)  (((cj)>>1)*STRIDE_P4 + ((cj) & 1)*(PACK_X4>>1))
 +
 +/* 4x8 list, pack=8: i-cluster size is half the packing width */
 +/* i-cluster to j-cluster conversion */
 +#define CI_TO_CJ_J8(ci)  ((ci)>>1)
 +/* cluster index to coordinate array index conversion */
 +#define X_IND_CI_J8(ci)  (((ci)>>1)*STRIDE_P8 + ((ci) & 1)*(PACK_X8>>1))
 +#define X_IND_CJ_J8(cj)  ((cj)*STRIDE_P8)
 +
 +/* The j-cluster size is matched to the SIMD width */
 +#if GMX_SIMD_WIDTH_HERE == 2
 +#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J2(ci)
 +#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J2(ci)
 +#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J2(cj)
 +#else
 +#if GMX_SIMD_WIDTH_HERE == 4
 +#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J4(ci)
 +#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J4(ci)
 +#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J4(cj)
 +#else
 +#if GMX_SIMD_WIDTH_HERE == 8
 +#define CI_TO_CJ_SIMD_4XN(ci)  CI_TO_CJ_J8(ci)
 +#define X_IND_CI_SIMD_4XN(ci)  X_IND_CI_J8(ci)
 +#define X_IND_CJ_SIMD_4XN(cj)  X_IND_CJ_J8(cj)
 +/* Half SIMD-width j-cluster size */
 +#define CI_TO_CJ_SIMD_2XNN(ci) CI_TO_CJ_J4(ci)
 +#define X_IND_CI_SIMD_2XNN(ci) X_IND_CI_J4(ci)
 +#define X_IND_CJ_SIMD_2XNN(cj) X_IND_CJ_J4(cj)
 +#else
 +#if GMX_SIMD_WIDTH_HERE == 16
 +#define CI_TO_CJ_SIMD_2XNN(ci) CI_TO_CJ_J8(ci)
 +#define X_IND_CI_SIMD_2XNN(ci) X_IND_CI_J8(ci)
 +#define X_IND_CJ_SIMD_2XNN(cj) X_IND_CJ_J8(cj)
 +#else
 +#error "unsupported GMX_NBNXN_SIMD_WIDTH"
 +#endif
 +#endif
 +#endif
 +#endif
 +
 +#endif /* GMX_NBNXN_SIMD */
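 +/* For example, with 8-wide SIMD in single precision the 4xN kernels use
 + * 8-atom j-clusters, so CI_TO_CJ_SIMD_4XN(ci) = ci>>1 (two 4-atom
 + * i-clusters per j-cluster), while the 2xNN kernels use 4-atom
 + * j-clusters and the mapping is one-to-one.
 + */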
 +
 +
 +#ifdef NBNXN_SEARCH_BB_SSE
 +/* Store bounding boxes corners as quadruplets: xxxxyyyyzzzz */
 +#define NBNXN_BBXXXX
 +/* Size of bounding box corners quadruplet */
 +#define NNBSBB_XXXX      (NNBSBB_D*DIM*STRIDE_PBB)
 +#endif
 +
 +/* We shift the i-particles backward for PBC.
 + * This leads to more conditionals than shifting forward.
 + * We do this to get more balanced pair lists.
 + */
 +#define NBNXN_SHIFT_BACKWARD
 +
 +
 +/* This define is a lazy way to avoid interdependence of the grid
 + * and searching data structures.
 + */
 +#define NBNXN_NA_SC_MAX (GPU_NSUBCELL*NBNXN_GPU_CLUSTER_SIZE)
 +
 +
 +static void nbs_cycle_clear(nbnxn_cycle_t *cc)
 +{
 +    int i;
 +
 +    for (i = 0; i < enbsCCnr; i++)
 +    {
 +        cc[i].count = 0;
 +        cc[i].c     = 0;
 +    }
 +}
 +
 +static double Mcyc_av(const nbnxn_cycle_t *cc)
 +{
 +    return (double)cc->c*1e-6/cc->count;
 +}
 +
 +static void nbs_cycle_print(FILE *fp, const nbnxn_search_t nbs)
 +{
 +    int n;
 +    int t;
 +
 +    fprintf(fp, "\n");
 +    fprintf(fp, "ns %4d grid %4.1f search %4.1f red.f %5.3f",
 +            nbs->cc[enbsCCgrid].count,
 +            Mcyc_av(&nbs->cc[enbsCCgrid]),
 +            Mcyc_av(&nbs->cc[enbsCCsearch]),
 +            Mcyc_av(&nbs->cc[enbsCCreducef]));
 +
 +    if (nbs->nthread_max > 1)
 +    {
 +        if (nbs->cc[enbsCCcombine].count > 0)
 +        {
 +            fprintf(fp, " comb %5.2f",
 +                    Mcyc_av(&nbs->cc[enbsCCcombine]));
 +        }
 +        fprintf(fp, " s. th");
 +        for (t = 0; t < nbs->nthread_max; t++)
 +        {
 +            fprintf(fp, " %4.1f",
 +                    Mcyc_av(&nbs->work[t].cc[enbsCCsearch]));
 +        }
 +    }
 +    fprintf(fp, "\n");
 +}
 +
 +static void nbnxn_grid_init(nbnxn_grid_t * grid)
 +{
 +    grid->cxy_na      = NULL;
 +    grid->cxy_ind     = NULL;
 +    grid->cxy_nalloc  = 0;
 +    grid->bb          = NULL;
 +    grid->bbj         = NULL;
 +    grid->nc_nalloc   = 0;
 +}
 +
 +static int get_2log(int n)
 +{
 +    int log2;
 +
 +    log2 = 0;
 +    while ((1<<log2) < n)
 +    {
 +        log2++;
 +    }
 +    if ((1<<log2) != n)
 +    {
 +        gmx_fatal(FARGS, "nbnxn na_c (%d) is not a power of 2", n);
 +    }
 +
 +    return log2;
 +}
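 +/* For example, get_2log(8) returns 3; any cluster size that is not
 + * a power of 2 triggers the fatal error above.
 + */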
 +
 +static int nbnxn_kernel_to_ci_size(int nb_kernel_type)
 +{
 +    switch (nb_kernel_type)
 +    {
 +        case nbnxnk4x4_PlainC:
 +        case nbnxnk4xN_SIMD_4xN:
 +        case nbnxnk4xN_SIMD_2xNN:
 +            return NBNXN_CPU_CLUSTER_I_SIZE;
 +        case nbnxnk8x8x8_CUDA:
 +        case nbnxnk8x8x8_PlainC:
 +            /* The cluster size for super/sub lists is only set here.
 +             * Any value should work for the pair-search and atomdata code.
 +             * The kernels, of course, might require a particular value.
 +             */
 +            return NBNXN_GPU_CLUSTER_SIZE;
 +        default:
 +            gmx_incons("unknown kernel type");
 +    }
 +
 +    return 0;
 +}
 +
 +int nbnxn_kernel_to_cj_size(int nb_kernel_type)
 +{
 +    int nbnxn_simd_width = 0;
 +    int cj_size          = 0;
 +
 +#ifdef GMX_NBNXN_SIMD
 +    nbnxn_simd_width = GMX_SIMD_WIDTH_HERE;
 +#endif
 +
 +    switch (nb_kernel_type)
 +    {
 +        case nbnxnk4x4_PlainC:
 +            cj_size = NBNXN_CPU_CLUSTER_I_SIZE;
 +            break;
 +        case nbnxnk4xN_SIMD_4xN:
 +            cj_size = nbnxn_simd_width;
 +            break;
 +        case nbnxnk4xN_SIMD_2xNN:
 +            cj_size = nbnxn_simd_width/2;
 +            break;
 +        case nbnxnk8x8x8_CUDA:
 +        case nbnxnk8x8x8_PlainC:
 +            cj_size = nbnxn_kernel_to_ci_size(nb_kernel_type);
 +            break;
 +        default:
 +            gmx_incons("unknown kernel type");
 +    }
 +
 +    return cj_size;
 +}
 +
 +static int ci_to_cj(int na_cj_2log, int ci)
 +{
 +    switch (na_cj_2log)
 +    {
 +        case 2: return ci;     break;
 +        case 1: return (ci<<1); break;
 +        case 3: return (ci>>1); break;
 +    }
 +
 +    return 0;
 +}
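 +/* The cases mirror the CI_TO_CJ_J* macros above: na_cj_2log == 2 keeps
 + * the index, == 1 doubles it (j-clusters are half the i-cluster size)
 + * and == 3 halves it (j-clusters are twice the i-cluster size).
 + */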
 +
 +gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type)
 +{
 +    if (nb_kernel_type == nbnxnkNotSet)
 +    {
 +        gmx_fatal(FARGS, "Non-bonded kernel type not set for Verlet-style pair-list.");
 +    }
 +
 +    switch (nb_kernel_type)
 +    {
 +        case nbnxnk8x8x8_CUDA:
 +        case nbnxnk8x8x8_PlainC:
 +            return FALSE;
 +
 +        case nbnxnk4x4_PlainC:
 +        case nbnxnk4xN_SIMD_4xN:
 +        case nbnxnk4xN_SIMD_2xNN:
 +            return TRUE;
 +
 +        default:
 +            gmx_incons("Invalid nonbonded kernel type passed!");
 +            return FALSE;
 +    }
 +}
 +
 +void nbnxn_init_search(nbnxn_search_t    * nbs_ptr,
 +                       ivec               *n_dd_cells,
 +                       gmx_domdec_zones_t *zones,
 +                       int                 nthread_max)
 +{
 +    nbnxn_search_t nbs;
 +    int            d, g, t;
 +
 +    snew(nbs, 1);
 +    *nbs_ptr = nbs;
 +
 +    nbs->DomDec = (n_dd_cells != NULL);
 +
 +    clear_ivec(nbs->dd_dim);
 +    nbs->ngrid = 1;
 +    if (nbs->DomDec)
 +    {
 +        nbs->zones = zones;
 +
 +        for (d = 0; d < DIM; d++)
 +        {
 +            if ((*n_dd_cells)[d] > 1)
 +            {
 +                nbs->dd_dim[d] = 1;
 +                /* Each grid matches a DD zone */
 +                nbs->ngrid *= 2;
 +            }
 +        }
 +    }
 +
 +    snew(nbs->grid, nbs->ngrid);
 +    for (g = 0; g < nbs->ngrid; g++)
 +    {
 +        nbnxn_grid_init(&nbs->grid[g]);
 +    }
 +    nbs->cell        = NULL;
 +    nbs->cell_nalloc = 0;
 +    nbs->a           = NULL;
 +    nbs->a_nalloc    = 0;
 +
 +    nbs->nthread_max = nthread_max;
 +
 +    /* Initialize the work data structures for each thread */
 +    snew(nbs->work, nbs->nthread_max);
 +    for (t = 0; t < nbs->nthread_max; t++)
 +    {
 +        nbs->work[t].cxy_na           = NULL;
 +        nbs->work[t].cxy_na_nalloc    = 0;
 +        nbs->work[t].sort_work        = NULL;
 +        nbs->work[t].sort_work_nalloc = 0;
 +    }
 +
 +    /* Initialize detailed nbsearch cycle counting */
 +    nbs->print_cycles = (getenv("GMX_NBNXN_CYCLE") != 0);
 +    nbs->search_count = 0;
 +    nbs_cycle_clear(nbs->cc);
 +    for (t = 0; t < nbs->nthread_max; t++)
 +    {
 +        nbs_cycle_clear(nbs->work[t].cc);
 +    }
 +}
 +
 +static real grid_atom_density(int n, rvec corner0, rvec corner1)
 +{
 +    rvec size;
 +
 +    rvec_sub(corner1, corner0, size);
 +
 +    return n/(size[XX]*size[YY]*size[ZZ]);
 +}
 +
 +static int set_grid_size_xy(const nbnxn_search_t nbs,
 +                            nbnxn_grid_t *grid,
 +                            int dd_zone,
 +                            int n, rvec corner0, rvec corner1,
 +                            real atom_density)
 +{
 +    rvec size;
 +    int  na_c;
 +    real adens, tlen, tlen_x, tlen_y, nc_max;
 +    int  t;
 +
 +    rvec_sub(corner1, corner0, size);
 +
 +    if (n > grid->na_sc)
 +    {
 +        /* target cell length */
 +        if (grid->bSimple)
 +        {
 +            /* To minimize the zero interactions, we should make
 +             * the larger of the i/j cells cubic.
 +             */
 +            na_c = max(grid->na_c, grid->na_cj);
 +
 +            /* Approximately cubic cells */
 +            tlen   = pow(na_c/atom_density, 1.0/3.0);
 +            tlen_x = tlen;
 +            tlen_y = tlen;
 +        }
 +        else
 +        {
 +            /* Approximately cubic sub cells */
 +            tlen   = pow(grid->na_c/atom_density, 1.0/3.0);
 +            tlen_x = tlen*GPU_NSUBCELL_X;
 +            tlen_y = tlen*GPU_NSUBCELL_Y;
 +        }
 +        /* We round ncx and ncy down, because we get fewer cell pairs
 +         * in the pair list when the fixed cell dimensions (x,y) are
 +         * larger than the variable one (z) than the other way around.
 +         */
 +        grid->ncx = max(1, (int)(size[XX]/tlen_x));
 +        grid->ncy = max(1, (int)(size[YY]/tlen_y));
 +    }
 +    else
 +    {
 +        grid->ncx = 1;
 +        grid->ncy = 1;
 +    }
 +
 +    grid->sx     = size[XX]/grid->ncx;
 +    grid->sy     = size[YY]/grid->ncy;
 +    grid->inv_sx = 1/grid->sx;
 +    grid->inv_sy = 1/grid->sy;
 +
 +    if (dd_zone > 0)
 +    {
 +        /* This is a non-home zone, add an extra row of cells
 +         * for particles communicated for bonded interactions.
 +         * These can be beyond the cut-off. It doesn't matter where
 +         * they end up on the grid, but for performance it's better
 +         * if they don't end up in cells that can be within cut-off range.
 +         */
 +        grid->ncx++;
 +        grid->ncy++;
 +    }
 +
 +    /* We need one additional cell entry for particles moved by DD */
 +    if (grid->ncx*grid->ncy+1 > grid->cxy_nalloc)
 +    {
 +        grid->cxy_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
 +        srenew(grid->cxy_na, grid->cxy_nalloc);
 +        srenew(grid->cxy_ind, grid->cxy_nalloc+1);
 +    }
 +    for (t = 0; t < nbs->nthread_max; t++)
 +    {
 +        if (grid->ncx*grid->ncy+1 > nbs->work[t].cxy_na_nalloc)
 +        {
 +            nbs->work[t].cxy_na_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
 +            srenew(nbs->work[t].cxy_na, nbs->work[t].cxy_na_nalloc);
 +        }
 +    }
 +
 +    /* Worst case scenario of 1 atom in each last cell */
 +    if (grid->na_cj <= grid->na_c)
 +    {
 +        nc_max = n/grid->na_sc + grid->ncx*grid->ncy;
 +    }
 +    else
 +    {
 +        nc_max = n/grid->na_sc + grid->ncx*grid->ncy*grid->na_cj/grid->na_c;
 +    }
 +
 +    if (nc_max > grid->nc_nalloc)
 +    {
- #ifdef NBNXN_PBB_SSE
-         bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX;
- #else
-         bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL*NNBSBB_B;
- #endif
 +        grid->nc_nalloc = over_alloc_large(nc_max);
 +        srenew(grid->nsubc, grid->nc_nalloc);
 +        srenew(grid->bbcz, grid->nc_nalloc*NNBSBB_D);
-         snew_aligned(grid->bb, bb_nalloc, 16);
++
 +        sfree_aligned(grid->bb);
 +        /* These snews also zero the contents, which avoids possible
 +         * floating-point exceptions in SSE with the unused bb elements.
 +         */
-                 snew_aligned(grid->bbj, bb_nalloc*grid->na_c/grid->na_cj, 16);
++        if (grid->bSimple)
++        {
++            snew_aligned(grid->bb, grid->nc_nalloc, 16);
++        }
++        else
++        {
++#ifdef NBNXN_BBXXXX
++            int pbb_nalloc;
++
++            pbb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX;
++            snew_aligned(grid->pbb, pbb_nalloc, 16);
++#else
++            snew_aligned(grid->bb, grid->nc_nalloc*GPU_NSUBCELL, 16);
++#endif
++        }
 +
 +        if (grid->bSimple)
 +        {
 +            if (grid->na_cj == grid->na_c)
 +            {
 +                grid->bbj = grid->bb;
 +            }
 +            else
 +            {
 +                sfree_aligned(grid->bbj);
- static void calc_bounding_box(int na, int stride, const real *x, float *bb)
++                snew_aligned(grid->bbj, grid->nc_nalloc*grid->na_c/grid->na_cj, 16);
 +            }
 +        }
 +
 +        srenew(grid->flags, grid->nc_nalloc);
 +    }
 +
 +    copy_rvec(corner0, grid->c0);
 +    copy_rvec(corner1, grid->c1);
 +
 +    return nc_max;
 +}
 +
 +/* We need to sort particles in grid columns on z-coordinate.
 + * As particles are very often distributed homogeneously, we use a sorting
 + * algorithm similar to pigeonhole sort. We multiply the z-coordinate
 + * by a factor, cast to an int and try to store in that hole. If the hole
 + * is full, we move this or another particle. A second pass is needed to make
 + * contiguous elements. SORT_GRID_OVERSIZE is the ratio of holes to particles.
 + * 4 is the optimal value for homogeneous particle distribution and allows
 + * for an O(#particles) sort up to distributions where all particles are
 + * concentrated in 1/4 of the space. No NlogN fallback is implemented,
 + * as it can be expensive to detect inhomogeneous particle distributions.
 + * SGSF is the maximum ratio of holes used, in the worst case all particles
 + * end up in the last hole and we need #particles extra holes at the end.
 + */
 +#define SORT_GRID_OVERSIZE 4
 +#define SGSF (SORT_GRID_OVERSIZE + 1)
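 +/* A worked example of the sizing: for a column with n = n_per_h = 100
 + * particles there are 400 holes, plus in the worst case n = 100 overflow
 + * slots at the end, so sort should hold at least 500 = n*SGSF elements.
 + */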
 +
 +/* Sort particle index a on coordinates x along dim.
 + * Backwards tells if we want decreasing instead of increasing coordinates.
 + * h0 is the minimum of the coordinate range.
 + * invh is the 1/length of the sorting range.
 + * n_per_h (>=n) is the expected average number of particles per 1/invh
 + * sort is the sorting work array.
 + * sort should have a size of at least n_per_h*SORT_GRID_OVERSIZE + n,
 + * or easier, allocate at least n*SGSF elements.
 + */
 +static void sort_atoms(int dim, gmx_bool Backwards,
 +                       int *a, int n, rvec *x,
 +                       real h0, real invh, int n_per_h,
 +                       int *sort)
 +{
 +    int nsort, i, c;
 +    int zi, zim, zi_min, zi_max;
 +    int cp, tmp;
 +
 +    if (n <= 1)
 +    {
 +        /* Nothing to do */
 +        return;
 +    }
 +
 +#ifndef NDEBUG
 +    if (n > n_per_h)
 +    {
 +        gmx_incons("n > n_per_h");
 +    }
 +#endif
 +
 +    /* Transform the inverse range height into the inverse hole height */
 +    invh *= n_per_h*SORT_GRID_OVERSIZE;
 +
 +    /* Set nsort to the maximum possible number of holes used.
 +     * In worst case all n elements end up in the last bin.
 +     */
 +    nsort = n_per_h*SORT_GRID_OVERSIZE + n;
 +
 +    /* Determine the index range used, so we can limit it for the second pass */
 +    zi_min = INT_MAX;
 +    zi_max = -1;
 +
 +    /* Sort the particles using a simple index sort */
 +    for (i = 0; i < n; i++)
 +    {
 +        /* The cast takes care of floating-point rounding effects below zero.
 +         * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
 +         * times the box height out of the box.
 +         */
 +        zi = (int)((x[a[i]][dim] - h0)*invh);
 +
 +#ifndef NDEBUG
 +        /* As we can have rounding effects, we use > instead of >= here */
 +        if (zi < 0 || zi > n_per_h*SORT_GRID_OVERSIZE)
 +        {
 +            gmx_fatal(FARGS, "(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d*%d\n",
 +                      a[i], 'x'+dim, x[a[i]][dim], h0, invh, zi,
 +                      n_per_h, SORT_GRID_OVERSIZE);
 +        }
 +#endif
 +
 +        /* Ideally this particle should go in sort cell zi,
 +         * but that might already be in use,
 +         * in that case find the first empty cell higher up
 +         */
 +        if (sort[zi] < 0)
 +        {
 +            sort[zi] = a[i];
 +            zi_min   = min(zi_min, zi);
 +            zi_max   = max(zi_max, zi);
 +        }
 +        else
 +        {
 +            /* We have multiple atoms in the same sorting slot.
 +             * Sort on real z for minimal bounding box size.
 +             * There is an extra check for identical z to give a
 +             * well-defined output order, independent of input order,
 +             * which ensures binary reproducibility after restarts.
 +             */
 +            while (sort[zi] >= 0 && ( x[a[i]][dim] >  x[sort[zi]][dim] ||
 +                                      (x[a[i]][dim] == x[sort[zi]][dim] &&
 +                                       a[i] > sort[zi])))
 +            {
 +                zi++;
 +            }
 +
 +            if (sort[zi] >= 0)
 +            {
 +                /* Shift all elements by one slot until we find an empty slot */
 +                cp  = sort[zi];
 +                zim = zi + 1;
 +                while (sort[zim] >= 0)
 +                {
 +                    tmp       = sort[zim];
 +                    sort[zim] = cp;
 +                    cp        = tmp;
 +                    zim++;
 +                }
 +                sort[zim] = cp;
 +                zi_max    = max(zi_max, zim);
 +            }
 +            sort[zi] = a[i];
 +            zi_max   = max(zi_max, zi);
 +        }
 +    }
 +
 +    c = 0;
 +    if (!Backwards)
 +    {
 +        for (zi = 0; zi < nsort; zi++)
 +        {
 +            if (sort[zi] >= 0)
 +            {
 +                a[c++]   = sort[zi];
 +                sort[zi] = -1;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        for (zi = zi_max; zi >= zi_min; zi--)
 +        {
 +            if (sort[zi] >= 0)
 +            {
 +                a[c++]   = sort[zi];
 +                sort[zi] = -1;
 +            }
 +        }
 +    }
 +    if (c < n)
 +    {
 +        gmx_incons("Lost particles while sorting");
 +    }
 +}
 +
 +#ifdef GMX_DOUBLE
 +#define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
 +#define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
 +#else
 +#define R2F_D(x) (x)
 +#define R2F_U(x) (x)
 +#endif
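 +/* In double precision R2F_D rounds down and R2F_U rounds up when
 + * converting to float, so the single-precision bounding boxes are
 + * conservative: they always contain the double-precision coordinates
 + * they were computed from.
 + */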
 +
 +/* Coordinate order x,y,z, bb order xyz0 */
-     bb[BBL_X] = R2F_D(xl);
-     bb[BBL_Y] = R2F_D(yl);
-     bb[BBL_Z] = R2F_D(zl);
-     bb[BBU_X] = R2F_U(xh);
-     bb[BBU_Y] = R2F_U(yh);
-     bb[BBU_Z] = R2F_U(zh);
++static void calc_bounding_box(int na, int stride, const real *x, nbnxn_bb_t *bb)
 +{
 +    int  i, j;
 +    real xl, xh, yl, yh, zl, zh;
 +
 +    i  = 0;
 +    xl = x[i+XX];
 +    xh = x[i+XX];
 +    yl = x[i+YY];
 +    yh = x[i+YY];
 +    zl = x[i+ZZ];
 +    zh = x[i+ZZ];
 +    i += stride;
 +    for (j = 1; j < na; j++)
 +    {
 +        xl = min(xl, x[i+XX]);
 +        xh = max(xh, x[i+XX]);
 +        yl = min(yl, x[i+YY]);
 +        yh = max(yh, x[i+YY]);
 +        zl = min(zl, x[i+ZZ]);
 +        zh = max(zh, x[i+ZZ]);
 +        i += stride;
 +    }
 +    /* Note: possible double to float conversion here */
- static void calc_bounding_box_x_x4(int na, const real *x, float *bb)
++    bb->lower[BB_X] = R2F_D(xl);
++    bb->lower[BB_Y] = R2F_D(yl);
++    bb->lower[BB_Z] = R2F_D(zl);
++    bb->upper[BB_X] = R2F_U(xh);
++    bb->upper[BB_Y] = R2F_U(yh);
++    bb->upper[BB_Z] = R2F_U(zh);
 +}
 +
 +/* Packed coordinates, bb order xyz0 */
-     bb[BBL_X] = R2F_D(xl);
-     bb[BBL_Y] = R2F_D(yl);
-     bb[BBL_Z] = R2F_D(zl);
-     bb[BBU_X] = R2F_U(xh);
-     bb[BBU_Y] = R2F_U(yh);
-     bb[BBU_Z] = R2F_U(zh);
++static void calc_bounding_box_x_x4(int na, const real *x, nbnxn_bb_t *bb)
 +{
 +    int  j;
 +    real xl, xh, yl, yh, zl, zh;
 +
 +    xl = x[XX*PACK_X4];
 +    xh = x[XX*PACK_X4];
 +    yl = x[YY*PACK_X4];
 +    yh = x[YY*PACK_X4];
 +    zl = x[ZZ*PACK_X4];
 +    zh = x[ZZ*PACK_X4];
 +    for (j = 1; j < na; j++)
 +    {
 +        xl = min(xl, x[j+XX*PACK_X4]);
 +        xh = max(xh, x[j+XX*PACK_X4]);
 +        yl = min(yl, x[j+YY*PACK_X4]);
 +        yh = max(yh, x[j+YY*PACK_X4]);
 +        zl = min(zl, x[j+ZZ*PACK_X4]);
 +        zh = max(zh, x[j+ZZ*PACK_X4]);
 +    }
 +    /* Note: possible double to float conversion here */
- static void calc_bounding_box_x_x8(int na, const real *x, float *bb)
++    bb->lower[BB_X] = R2F_D(xl);
++    bb->lower[BB_Y] = R2F_D(yl);
++    bb->lower[BB_Z] = R2F_D(zl);
++    bb->upper[BB_X] = R2F_U(xh);
++    bb->upper[BB_Y] = R2F_U(yh);
++    bb->upper[BB_Z] = R2F_U(zh);
 +}
 +
 +/* Packed coordinates, bb order xyz0 */
-     bb[BBL_X] = R2F_D(xl);
-     bb[BBL_Y] = R2F_D(yl);
-     bb[BBL_Z] = R2F_D(zl);
-     bb[BBU_X] = R2F_U(xh);
-     bb[BBU_Y] = R2F_U(yh);
-     bb[BBU_Z] = R2F_U(zh);
++static void calc_bounding_box_x_x8(int na, const real *x, nbnxn_bb_t *bb)
 +{
 +    int  j;
 +    real xl, xh, yl, yh, zl, zh;
 +
 +    xl = x[XX*PACK_X8];
 +    xh = x[XX*PACK_X8];
 +    yl = x[YY*PACK_X8];
 +    yh = x[YY*PACK_X8];
 +    zl = x[ZZ*PACK_X8];
 +    zh = x[ZZ*PACK_X8];
 +    for (j = 1; j < na; j++)
 +    {
 +        xl = min(xl, x[j+XX*PACK_X8]);
 +        xh = max(xh, x[j+XX*PACK_X8]);
 +        yl = min(yl, x[j+YY*PACK_X8]);
 +        yh = max(yh, x[j+YY*PACK_X8]);
 +        zl = min(zl, x[j+ZZ*PACK_X8]);
 +        zh = max(zh, x[j+ZZ*PACK_X8]);
 +    }
 +    /* Note: possible double to float conversion here */
-                                           float *bb, float *bbj)
++    bb->lower[BB_X] = R2F_D(xl);
++    bb->lower[BB_Y] = R2F_D(yl);
++    bb->lower[BB_Z] = R2F_D(zl);
++    bb->upper[BB_X] = R2F_U(xh);
++    bb->upper[BB_Y] = R2F_U(yh);
++    bb->upper[BB_Z] = R2F_U(zh);
 +}
 +
 +/* Packed coordinates, bb order xyz0 */
 +static void calc_bounding_box_x_x4_halves(int na, const real *x,
- #ifndef NBNXN_SEARCH_BB_SSE
-     int i;
- #endif
++                                          nbnxn_bb_t *bb, nbnxn_bb_t *bbj)
 +{
-         calc_bounding_box_x_x4(min(na-2, 2), x+(PACK_X4>>1), bbj+NNBSBB_B);
 +    calc_bounding_box_x_x4(min(na, 2), x, bbj);
 +
 +    if (na > 2)
 +    {
-         _mm_store_ps(bbj+NNBSBB_B, _mm_load_ps(bbj));
-         _mm_store_ps(bbj+NNBSBB_B+NNBSBB_C, _mm_load_ps(bbj+NNBSBB_C));
++        calc_bounding_box_x_x4(min(na-2, 2), x+(PACK_X4>>1), bbj+1);
 +    }
 +    else
 +    {
 +        /* Set the "empty" bounding box to the same as the first one,
 +         * so we don't need to treat special cases in the rest of the code.
 +         */
 +#ifdef NBNXN_SEARCH_BB_SSE
-         for (i = 0; i < NNBSBB_B; i++)
-         {
-             bbj[NNBSBB_B + i] = bbj[i];
-         }
++        _mm_store_ps(&bbj[1].lower[0], _mm_load_ps(&bbj[0].lower[0]));
++        _mm_store_ps(&bbj[1].upper[0], _mm_load_ps(&bbj[0].upper[0]));
 +#else
-     _mm_store_ps(bb, _mm_min_ps(_mm_load_ps(bbj),
-                                 _mm_load_ps(bbj+NNBSBB_B)));
-     _mm_store_ps(bb+NNBSBB_C, _mm_max_ps(_mm_load_ps(bbj+NNBSBB_C),
-                                          _mm_load_ps(bbj+NNBSBB_B+NNBSBB_C)));
++        bbj[1] = bbj[0];
 +#endif
 +    }
 +
 +#ifdef NBNXN_SEARCH_BB_SSE
-     for (i = 0; i < NNBSBB_C; i++)
++    _mm_store_ps(&bb->lower[0], _mm_min_ps(_mm_load_ps(&bbj[0].lower[0]),
++                                           _mm_load_ps(&bbj[1].lower[0])));
++    _mm_store_ps(&bb->upper[0], _mm_max_ps(_mm_load_ps(&bbj[0].upper[0]),
++                                           _mm_load_ps(&bbj[1].upper[0])));
 +#else
-         bb[           i] = min(bbj[           i], bbj[NNBSBB_B +            i]);
-         bb[NNBSBB_C + i] = max(bbj[NNBSBB_C + i], bbj[NNBSBB_B + NNBSBB_C + i]);
 +    {
- static void calc_bounding_box_sse(int na, const float *x, float *bb)
++        int i;
++
++        for (i = 0; i < NNBSBB_C; i++)
++        {
++            bb->lower[i] = min(bbj[0].lower[i], bbj[1].lower[i]);
++            bb->upper[i] = max(bbj[0].upper[i], bbj[1].upper[i]);
++        }
 +    }
 +#endif
 +}
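 +/* The halves variant above is used when the j-cluster size is half the
 + * i-cluster size (see the 2-wide SIMD call in fill_cell): it computes
 + * the two 2-atom j-cluster boxes directly and merges them into the
 + * 4-atom i-cluster box.
 + */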
 +
 +#ifdef NBNXN_SEARCH_BB_SSE
 +
 +/* Coordinate order xyz, bb order xxxxyyyyzzzz */
 +static void calc_bounding_box_xxxx(int na, int stride, const real *x, float *bb)
 +{
 +    int  i, j;
 +    real xl, xh, yl, yh, zl, zh;
 +
 +    i  = 0;
 +    xl = x[i+XX];
 +    xh = x[i+XX];
 +    yl = x[i+YY];
 +    yh = x[i+YY];
 +    zl = x[i+ZZ];
 +    zh = x[i+ZZ];
 +    i += stride;
 +    for (j = 1; j < na; j++)
 +    {
 +        xl = min(xl, x[i+XX]);
 +        xh = max(xh, x[i+XX]);
 +        yl = min(yl, x[i+YY]);
 +        yh = max(yh, x[i+YY]);
 +        zl = min(zl, x[i+ZZ]);
 +        zh = max(zh, x[i+ZZ]);
 +        i += stride;
 +    }
 +    /* Note: possible double to float conversion here */
 +    bb[0*STRIDE_PBB] = R2F_D(xl);
 +    bb[1*STRIDE_PBB] = R2F_D(yl);
 +    bb[2*STRIDE_PBB] = R2F_D(zl);
 +    bb[3*STRIDE_PBB] = R2F_U(xh);
 +    bb[4*STRIDE_PBB] = R2F_U(yh);
 +    bb[5*STRIDE_PBB] = R2F_U(zh);
 +}
 +
 +#endif /* NBNXN_SEARCH_BB_SSE */
 +
 +#ifdef NBNXN_SEARCH_SSE_SINGLE
 +
 +/* Coordinate order xyz?, bb order xyz0 */
-     _mm_store_ps(bb, bb_0_SSE);
-     _mm_store_ps(bb+4, bb_1_SSE);
++static void calc_bounding_box_sse(int na, const float *x, nbnxn_bb_t *bb)
 +{
 +    __m128 bb_0_SSE, bb_1_SSE;
 +    __m128 x_SSE;
 +
 +    int    i;
 +
 +    bb_0_SSE = _mm_load_ps(x);
 +    bb_1_SSE = bb_0_SSE;
 +
 +    for (i = 1; i < na; i++)
 +    {
 +        x_SSE    = _mm_load_ps(x+i*NNBSBB_C);
 +        bb_0_SSE = _mm_min_ps(bb_0_SSE, x_SSE);
 +        bb_1_SSE = _mm_max_ps(bb_1_SSE, x_SSE);
 +    }
 +
-                                        float *bb_work,
++    _mm_store_ps(&bb->lower[0], bb_0_SSE);
++    _mm_store_ps(&bb->upper[0], bb_1_SSE);
 +}
 +
 +/* Coordinate order xyz?, bb order xxxxyyyyzzzz */
 +static void calc_bounding_box_xxxx_sse(int na, const float *x,
-     calc_bounding_box_sse(na, x, bb_work);
-     bb[0*STRIDE_PBB] = bb_work[BBL_X];
-     bb[1*STRIDE_PBB] = bb_work[BBL_Y];
-     bb[2*STRIDE_PBB] = bb_work[BBL_Z];
-     bb[3*STRIDE_PBB] = bb_work[BBU_X];
-     bb[4*STRIDE_PBB] = bb_work[BBU_Y];
-     bb[5*STRIDE_PBB] = bb_work[BBU_Z];
++                                       nbnxn_bb_t *bb_work_aligned,
 +                                       real *bb)
 +{
- static void combine_bounding_box_pairs(nbnxn_grid_t *grid, const float *bb)
++    calc_bounding_box_sse(na, x, bb_work_aligned);
++
++    bb[0*STRIDE_PBB] = bb_work_aligned->lower[BB_X];
++    bb[1*STRIDE_PBB] = bb_work_aligned->lower[BB_Y];
++    bb[2*STRIDE_PBB] = bb_work_aligned->lower[BB_Z];
++    bb[3*STRIDE_PBB] = bb_work_aligned->upper[BB_X];
++    bb[4*STRIDE_PBB] = bb_work_aligned->upper[BB_Y];
++    bb[5*STRIDE_PBB] = bb_work_aligned->upper[BB_Z];
 +}
 +
 +#endif /* NBNXN_SEARCH_SSE_SINGLE */
 +
 +
 +/* Combines pairs of consecutive bounding boxes */
-             min_SSE = _mm_min_ps(_mm_load_ps(bb+(c2*4+0)*NNBSBB_C),
-                                  _mm_load_ps(bb+(c2*4+2)*NNBSBB_C));
-             max_SSE = _mm_max_ps(_mm_load_ps(bb+(c2*4+1)*NNBSBB_C),
-                                  _mm_load_ps(bb+(c2*4+3)*NNBSBB_C));
-             _mm_store_ps(grid->bbj+(c2*2+0)*NNBSBB_C, min_SSE);
-             _mm_store_ps(grid->bbj+(c2*2+1)*NNBSBB_C, max_SSE);
++static void combine_bounding_box_pairs(nbnxn_grid_t *grid, const nbnxn_bb_t *bb)
 +{
 +    int    i, j, sc2, nc2, c2;
 +
 +    for (i = 0; i < grid->ncx*grid->ncy; i++)
 +    {
 +        /* Starting bb in a column is expected to be 2-aligned */
 +        sc2 = grid->cxy_ind[i]>>1;
 +        /* For odd numbers skip the last bb here */
 +        nc2 = (grid->cxy_na[i]+3)>>(2+1);
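 +        /* With 4-atom i-cells, (na+3)>>2 is the number of bounding
 +         * boxes in the column and the extra >>1 counts complete pairs.
 +         */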
 +        for (c2 = sc2; c2 < sc2+nc2; c2++)
 +        {
 +#ifdef NBNXN_SEARCH_BB_SSE
 +            __m128 min_SSE, max_SSE;
 +
-                 grid->bbj[(c2*2+0)*NNBSBB_C+j] = min(bb[(c2*4+0)*NNBSBB_C+j],
-                                                      bb[(c2*4+2)*NNBSBB_C+j]);
-                 grid->bbj[(c2*2+1)*NNBSBB_C+j] = max(bb[(c2*4+1)*NNBSBB_C+j],
-                                                      bb[(c2*4+3)*NNBSBB_C+j]);
++            min_SSE = _mm_min_ps(_mm_load_ps(&bb[c2*2+0].lower[0]),
++                                 _mm_load_ps(&bb[c2*2+1].lower[0]));
++            max_SSE = _mm_max_ps(_mm_load_ps(&bb[c2*2+0].upper[0]),
++                                 _mm_load_ps(&bb[c2*2+1].upper[0]));
++            _mm_store_ps(&grid->bbj[c2].lower[0], min_SSE);
++            _mm_store_ps(&grid->bbj[c2].upper[0], max_SSE);
 +#else
 +            for (j = 0; j < NNBSBB_C; j++)
 +            {
-                 grid->bbj[(c2*2+0)*NNBSBB_C+j] = bb[(c2*4+0)*NNBSBB_C+j];
-                 grid->bbj[(c2*2+1)*NNBSBB_C+j] = bb[(c2*4+1)*NNBSBB_C+j];
++                grid->bbj[c2].lower[j] = min(bb[c2*2+0].lower[j],
++                                             bb[c2*2+1].lower[j]);
++                grid->bbj[c2].upper[j] = max(bb[c2*2+0].upper[j],
++                                             bb[c2*2+1].upper[j]);
 +            }
 +#endif
 +        }
 +        if (((grid->cxy_na[i]+3)>>2) & 1)
 +        {
 +            /* Copy the last bb for odd bb count in this column */
 +            for (j = 0; j < NNBSBB_C; j++)
 +            {
-             ba[d] += grid->bb[c*NNBSBB_B+NNBSBB_C+d] - grid->bb[c*NNBSBB_B+d];
++                grid->bbj[c2].lower[j] = bb[c2*2].lower[j];
++                grid->bbj[c2].upper[j] = bb[c2*2].upper[j];
 +            }
 +        }
 +    }
 +}
 +
 +
 +/* Prints the average bb size, used for debug output */
 +static void print_bbsizes_simple(FILE                *fp,
 +                                 const nbnxn_search_t nbs,
 +                                 const nbnxn_grid_t  *grid)
 +{
 +    int  c, d;
 +    dvec ba;
 +
 +    clear_dvec(ba);
 +    for (c = 0; c < grid->nc; c++)
 +    {
 +        for (d = 0; d < DIM; d++)
 +        {
-                         grid->bb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
-                         grid->bb[cs_w*NNBSBB_XXXX+     d *STRIDE_PBB+i];
++            ba[d] += grid->bb[c].upper[d] - grid->bb[c].lower[d];
 +        }
 +    }
 +    dsvmul(1.0/grid->nc, ba, ba);
 +
 +    fprintf(fp, "ns bb: %4.2f %4.2f %4.2f  %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
 +            nbs->box[XX][XX]/grid->ncx,
 +            nbs->box[YY][YY]/grid->ncy,
 +            nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/grid->nc,
 +            ba[XX], ba[YY], ba[ZZ],
 +            ba[XX]*grid->ncx/nbs->box[XX][XX],
 +            ba[YY]*grid->ncy/nbs->box[YY][YY],
 +            ba[ZZ]*grid->nc/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
 +}
 +
 +/* Prints the average bb size, used for debug output */
 +static void print_bbsizes_supersub(FILE                *fp,
 +                                   const nbnxn_search_t nbs,
 +                                   const nbnxn_grid_t  *grid)
 +{
 +    int  ns, c, s;
 +    dvec ba;
 +
 +    clear_dvec(ba);
 +    ns = 0;
 +    for (c = 0; c < grid->nc; c++)
 +    {
 +#ifdef NBNXN_BBXXXX
 +        for (s = 0; s < grid->nsubc[c]; s += STRIDE_PBB)
 +        {
 +            int cs_w, i, d;
 +
 +            cs_w = (c*GPU_NSUBCELL + s)/STRIDE_PBB;
 +            for (i = 0; i < STRIDE_PBB; i++)
 +            {
 +                for (d = 0; d < DIM; d++)
 +                {
 +                    ba[d] +=
-                 ba[d] +=
-                     grid->bb[cs*NNBSBB_B+NNBSBB_C+d] -
-                     grid->bb[cs*NNBSBB_B         +d];
++                        grid->pbb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_PBB+i] -
++                        grid->pbb[cs_w*NNBSBB_XXXX+     d *STRIDE_PBB+i];
 +                }
 +            }
 +        }
 +#else
 +        for (s = 0; s < grid->nsubc[c]; s++)
 +        {
 +            int cs, d;
 +
 +            cs = c*GPU_NSUBCELL + s;
 +            for (d = 0; d < DIM; d++)
 +            {
-                float *bb_work)
++                ba[d] += grid->bb[cs].upper[d] - grid->bb[cs].lower[d];
 +            }
 +        }
 +#endif
 +        ns += grid->nsubc[c];
 +    }
 +    dsvmul(1.0/ns, ba, ba);
 +
 +    fprintf(fp, "ns bb: %4.2f %4.2f %4.2f  %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
 +            nbs->box[XX][XX]/(grid->ncx*GPU_NSUBCELL_X),
 +            nbs->box[YY][YY]/(grid->ncy*GPU_NSUBCELL_Y),
 +            nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z),
 +            ba[XX], ba[YY], ba[ZZ],
 +            ba[XX]*grid->ncx*GPU_NSUBCELL_X/nbs->box[XX][XX],
 +            ba[YY]*grid->ncy*GPU_NSUBCELL_Y/nbs->box[YY][YY],
 +            ba[ZZ]*grid->nc*GPU_NSUBCELL_Z/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
 +}
 +
 +/* Potentially sorts atoms on LJ coefficients !=0 and ==0.
 + * Also sets interaction flags.
 + */
 +void sort_on_lj(int na_c,
 +                int a0, int a1, const int *atinfo,
 +                int *order,
 +                int *flags)
 +{
 +    int      subc, s, a, n1, n2, a_lj_max, i, j;
 +    int      sort1[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
 +    int      sort2[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
 +    gmx_bool haveQ;
 +
 +    *flags = 0;
 +
 +    subc = 0;
 +    for (s = a0; s < a1; s += na_c)
 +    {
 +        /* Make lists for this (sub-)cell on atoms with and without LJ */
 +        n1       = 0;
 +        n2       = 0;
 +        haveQ    = FALSE;
 +        a_lj_max = -1;
 +        for (a = s; a < min(s+na_c, a1); a++)
 +        {
 +            haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);
 +
 +            if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
 +            {
 +                sort1[n1++] = order[a];
 +                a_lj_max    = a;
 +            }
 +            else
 +            {
 +                sort2[n2++] = order[a];
 +            }
 +        }
 +
 +        /* If we don't have atoms with LJ, there's nothing to sort */
 +        if (n1 > 0)
 +        {
 +            *flags |= NBNXN_CI_DO_LJ(subc);
 +
 +            if (2*n1 <= na_c)
 +            {
 +                /* Only sort when strictly necessary. Ordering particles
 +                 * can lead to less accurate summation
 +                 * due to rounding, both for LJ and Coulomb interactions.
 +                 */
 +                if (2*(a_lj_max - s) >= na_c)
 +                {
 +                    for (i = 0; i < n1; i++)
 +                    {
 +                        order[a0+i] = sort1[i];
 +                    }
 +                    for (j = 0; j < n2; j++)
 +                    {
 +                        order[a0+n1+j] = sort2[j];
 +                    }
 +                }
 +
 +                *flags |= NBNXN_CI_HALF_LJ(subc);
 +            }
 +        }
 +        if (haveQ)
 +        {
 +            *flags |= NBNXN_CI_DO_COUL(subc);
 +        }
 +        subc++;
 +    }
 +}
 +
 +/* Fill a pair search cell with atoms.
 + * Potentially sorts atoms and sets the interaction flags.
 + */
 +void fill_cell(const nbnxn_search_t nbs,
 +               nbnxn_grid_t *grid,
 +               nbnxn_atomdata_t *nbat,
 +               int a0, int a1,
 +               const int *atinfo,
 +               rvec *x,
 +               int sx, int sy, int sz,
-     int     na, a;
-     size_t  offset;
-     float  *bb_ptr;
++               nbnxn_bb_t *bb_work_aligned)
 +{
-         offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
++    int        na, a;
++    size_t     offset;
++    nbnxn_bb_t *bb_ptr;
++#ifdef NBNXN_BBXXXX
++    float      *pbb_ptr;
++#endif
 +
 +    na = a1 - a0;
 +
 +    if (grid->bSimple)
 +    {
 +        sort_on_lj(grid->na_c, a0, a1, atinfo, nbs->a,
 +                   grid->flags+(a0>>grid->na_c_2log)-grid->cell0);
 +    }
 +
 +    /* Now we have sorted the atoms, set the cell indices */
 +    for (a = a0; a < a1; a++)
 +    {
 +        nbs->cell[nbs->a[a]] = a;
 +    }
 +
 +    copy_rvec_to_nbat_real(nbs->a+a0, a1-a0, grid->na_c, x,
 +                           nbat->XFormat, nbat->x, a0,
 +                           sx, sy, sz);
 +
 +    if (nbat->XFormat == nbatX4)
 +    {
 +        /* Store the bounding boxes as xyz.xyz. */
-         offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
++        offset = (a0 - grid->cell0*grid->na_sc) >> grid->na_c_2log;
 +        bb_ptr = grid->bb + offset;
 +
 +#if defined GMX_NBNXN_SIMD && GMX_SIMD_WIDTH_HERE == 2
 +        if (2*grid->na_cj == grid->na_c)
 +        {
 +            calc_bounding_box_x_x4_halves(na, nbat->x+X4_IND_A(a0), bb_ptr,
 +                                          grid->bbj+offset*2);
 +        }
 +        else
 +#endif
 +        {
 +            calc_bounding_box_x_x4(na, nbat->x+X4_IND_A(a0), bb_ptr);
 +        }
 +    }
 +    else if (nbat->XFormat == nbatX8)
 +    {
 +        /* Store the bounding boxes as xyz.xyz. */
-         bb_ptr =
-             grid->bb +
++        offset = (a0 - grid->cell0*grid->na_sc) >> grid->na_c_2log;
 +        bb_ptr = grid->bb + offset;
 +
 +        calc_bounding_box_x_x8(na, nbat->x+X8_IND_A(a0), bb_ptr);
 +    }
 +#ifdef NBNXN_BBXXXX
 +    else if (!grid->bSimple)
 +    {
 +        /* Store the bounding boxes in a format convenient
 +         * for SSE calculations: xxxxyyyyzzzz...
 +         */
-                                        bb_work, bb_ptr);
++        pbb_ptr =
++            grid->pbb +
 +            ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_PBB_2LOG))*NNBSBB_XXXX +
 +            (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_PBB-1));
 +
 +#ifdef NBNXN_SEARCH_SSE_SINGLE
 +        if (nbat->XFormat == nbatXYZQ)
 +        {
 +            calc_bounding_box_xxxx_sse(na, nbat->x+a0*nbat->xstride,
-                                    bb_ptr);
++                                       bb_work_aligned, pbb_ptr);
 +        }
 +        else
 +#endif
 +        {
 +            calc_bounding_box_xxxx(na, nbat->xstride, nbat->x+a0*nbat->xstride,
-                     bb_ptr[0*STRIDE_PBB], bb_ptr[3*STRIDE_PBB],
-                     bb_ptr[1*STRIDE_PBB], bb_ptr[4*STRIDE_PBB],
-                     bb_ptr[2*STRIDE_PBB], bb_ptr[5*STRIDE_PBB]);
++                                   pbb_ptr);
 +        }
 +        if (gmx_debug_at)
 +        {
 +            fprintf(debug, "%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
 +                    sx, sy, sz,
-         bb_ptr = grid->bb+((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
++                    pbb_ptr[0*STRIDE_PBB], pbb_ptr[3*STRIDE_PBB],
++                    pbb_ptr[1*STRIDE_PBB], pbb_ptr[4*STRIDE_PBB],
++                    pbb_ptr[2*STRIDE_PBB], pbb_ptr[5*STRIDE_PBB]);
 +        }
 +    }
 +#endif
 +    else
 +    {
 +        /* Store the bounding boxes as xyz.xyz. */
-                     (grid->bb+bbo*NNBSBB_B)[BBL_X],
-                     (grid->bb+bbo*NNBSBB_B)[BBU_X],
-                     (grid->bb+bbo*NNBSBB_B)[BBL_Y],
-                     (grid->bb+bbo*NNBSBB_B)[BBU_Y],
-                     (grid->bb+bbo*NNBSBB_B)[BBL_Z],
-                     (grid->bb+bbo*NNBSBB_B)[BBU_Z]);
++        bb_ptr = grid->bb+((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log);
 +
 +        calc_bounding_box(na, nbat->xstride, nbat->x+a0*nbat->xstride,
 +                          bb_ptr);
 +
 +        if (gmx_debug_at)
 +        {
 +            int bbo;
 +            bbo = (a0 - grid->cell0*grid->na_sc)/grid->na_c;
 +            fprintf(debug, "%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
 +                    sx, sy, sz,
-             grid->bbcz[c*NNBSBB_D  ] = grid->bb[cfilled*NNBSBB_B+2];
-             grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled*NNBSBB_B+6];
++                    grid->bb[bbo].lower[BB_X],
++                    grid->bb[bbo].lower[BB_Y],
++                    grid->bb[bbo].lower[BB_Z],
++                    grid->bb[bbo].upper[BB_X],
++                    grid->bb[bbo].upper[BB_Y],
++                    grid->bb[bbo].upper[BB_Z]);
 +        }
 +    }
 +}
 +
 +/* Spatially sort the atoms within one grid column */
 +static void sort_columns_simple(const nbnxn_search_t nbs,
 +                                int dd_zone,
 +                                nbnxn_grid_t *grid,
 +                                int a0, int a1,
 +                                const int *atinfo,
 +                                rvec *x,
 +                                nbnxn_atomdata_t *nbat,
 +                                int cxy_start, int cxy_end,
 +                                int *sort_work)
 +{
 +    int  cxy;
 +    int  cx, cy, cz, ncz, cfilled, c;
 +    int  na, ash, ind, a;
 +    int  na_c, ash_c;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
 +                grid->cell0, cxy_start, cxy_end, a0, a1);
 +    }
 +
 +    /* Sort the atoms within each x,y column in 3 dimensions */
 +    for (cxy = cxy_start; cxy < cxy_end; cxy++)
 +    {
 +        cx = cxy/grid->ncy;
 +        cy = cxy - cx*grid->ncy;
 +
 +        na  = grid->cxy_na[cxy];
 +        ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
 +        ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
 +
 +        /* Sort the atoms within each x,y column on z coordinate */
 +        sort_atoms(ZZ, FALSE,
 +                   nbs->a+ash, na, x,
 +                   grid->c0[ZZ],
 +                   1.0/nbs->box[ZZ][ZZ], ncz*grid->na_sc,
 +                   sort_work);
 +
 +        /* Fill the ncz cells in this column */
 +        cfilled = grid->cxy_ind[cxy];
 +        for (cz = 0; cz < ncz; cz++)
 +        {
 +            c  = grid->cxy_ind[cxy] + cz;
 +
 +            ash_c = ash + cz*grid->na_sc;
 +            na_c  = min(grid->na_sc, na-(ash_c-ash));
 +
 +            fill_cell(nbs, grid, nbat,
 +                      ash_c, ash_c+na_c, atinfo, x,
 +                      grid->na_sc*cx + (dd_zone >> 2),
 +                      grid->na_sc*cy + (dd_zone & 3),
 +                      grid->na_sc*cz,
 +                      NULL);
 +
 +            /* This copy to bbcz is not really necessary.
 +             * But it allows us to use the same grid search code
 +             * for the simple and supersub cell setups.
 +             */
 +            if (na_c > 0)
 +            {
 +                cfilled = c;
 +            }
-     float bb_work_array[NNBSBB_B+3], *bb_work_align;
++            grid->bbcz[c*NNBSBB_D  ] = grid->bb[cfilled].lower[BB_Z];
++            grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled].upper[BB_Z];
 +        }
 +
 +        /* Set the unused atom indices to -1 */
 +        for (ind = na; ind < ncz*grid->na_sc; ind++)
 +        {
 +            nbs->a[ash+ind] = -1;
 +        }
 +    }
 +}
 +
 +/* Spatially sort the atoms within one grid column */
 +static void sort_columns_supersub(const nbnxn_search_t nbs,
 +                                  int dd_zone,
 +                                  nbnxn_grid_t *grid,
 +                                  int a0, int a1,
 +                                  const int *atinfo,
 +                                  rvec *x,
 +                                  nbnxn_atomdata_t *nbat,
 +                                  int cxy_start, int cxy_end,
 +                                  int *sort_work)
 +{
 +    int  cxy;
 +    int  cx, cy, cz = -1, c = -1, ncz;
 +    int  na, ash, na_c, ind, a;
 +    int  subdiv_z, sub_z, na_z, ash_z;
 +    int  subdiv_y, sub_y, na_y, ash_y;
 +    int  subdiv_x, sub_x, na_x, ash_x;
 +
 +    /* cppcheck-suppress unassignedVariable */
-     bb_work_align = (float *)(((size_t)(bb_work_array+3)) & (~((size_t)15)));
++    nbnxn_bb_t bb_work_array[2], *bb_work_aligned;
 +
-                               bb_work_align);
++    bb_work_aligned = (nbnxn_bb_t *)(((size_t)(bb_work_array+1)) & (~((size_t)15)));
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "cell0 %d sorting columns %d - %d, atoms %d - %d\n",
 +                grid->cell0, cxy_start, cxy_end, a0, a1);
 +    }
 +
 +    subdiv_x = grid->na_c;
 +    subdiv_y = GPU_NSUBCELL_X*subdiv_x;
 +    subdiv_z = GPU_NSUBCELL_Y*subdiv_y;
 +
 +    /* Sort the atoms within each x,y column in 3 dimensions */
 +    for (cxy = cxy_start; cxy < cxy_end; cxy++)
 +    {
 +        cx = cxy/grid->ncy;
 +        cy = cxy - cx*grid->ncy;
 +
 +        na  = grid->cxy_na[cxy];
 +        ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
 +        ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
 +
 +        /* Sort the atoms within each x,y column on z coordinate */
 +        sort_atoms(ZZ, FALSE,
 +                   nbs->a+ash, na, x,
 +                   grid->c0[ZZ],
 +                   1.0/nbs->box[ZZ][ZZ], ncz*grid->na_sc,
 +                   sort_work);
 +
 +        /* This loop goes over the supercells and subcells along z at once */
 +        for (sub_z = 0; sub_z < ncz*GPU_NSUBCELL_Z; sub_z++)
 +        {
 +            ash_z = ash + sub_z*subdiv_z;
 +            na_z  = min(subdiv_z, na-(ash_z-ash));
 +
 +            /* We have already sorted on z */
 +
 +            if (sub_z % GPU_NSUBCELL_Z == 0)
 +            {
 +                cz = sub_z/GPU_NSUBCELL_Z;
 +                c  = grid->cxy_ind[cxy] + cz;
 +
 +                /* The number of atoms in this supercell */
 +                na_c = min(grid->na_sc, na-(ash_z-ash));
 +
 +                grid->nsubc[c] = min(GPU_NSUBCELL, (na_c+grid->na_c-1)/grid->na_c);
 +
 +                /* Store the z-boundaries of the super cell */
 +                grid->bbcz[c*NNBSBB_D  ] = x[nbs->a[ash_z]][ZZ];
 +                grid->bbcz[c*NNBSBB_D+1] = x[nbs->a[ash_z+na_c-1]][ZZ];
 +            }
 +
 +#if GPU_NSUBCELL_Y > 1
 +            /* Sort the atoms along y */
 +            sort_atoms(YY, (sub_z & 1),
 +                       nbs->a+ash_z, na_z, x,
 +                       grid->c0[YY]+cy*grid->sy,
 +                       grid->inv_sy, subdiv_z,
 +                       sort_work);
 +#endif
 +
 +            for (sub_y = 0; sub_y < GPU_NSUBCELL_Y; sub_y++)
 +            {
 +                ash_y = ash_z + sub_y*subdiv_y;
 +                na_y  = min(subdiv_y, na-(ash_y-ash));
 +
 +#if GPU_NSUBCELL_X > 1
 +                /* Sort the atoms along x */
 +                sort_atoms(XX, ((cz*GPU_NSUBCELL_Y + sub_y) & 1),
 +                           nbs->a+ash_y, na_y, x,
 +                           grid->c0[XX]+cx*grid->sx,
 +                           grid->inv_sx, subdiv_y,
 +                           sort_work);
 +#endif
 +
 +                for (sub_x = 0; sub_x < GPU_NSUBCELL_X; sub_x++)
 +                {
 +                    ash_x = ash_y + sub_x*subdiv_x;
 +                    na_x  = min(subdiv_x, na-(ash_x-ash));
 +
 +                    fill_cell(nbs, grid, nbat,
 +                              ash_x, ash_x+na_x, atinfo, x,
 +                              grid->na_c*(cx*GPU_NSUBCELL_X+sub_x) + (dd_zone >> 2),
 +                              grid->na_c*(cy*GPU_NSUBCELL_Y+sub_y) + (dd_zone & 3),
 +                              grid->na_c*sub_z,
-     float        *bbcz, *bb;
++                              bb_work_aligned);
 +                }
 +            }
 +        }
 +
 +        /* Set the unused atom indices to -1 */
 +        for (ind = na; ind < ncz*grid->na_sc; ind++)
 +        {
 +            nbs->a[ash+ind] = -1;
 +        }
 +    }
 +}
 +
 +/* Determine in which grid column atoms should go */
 +static void calc_column_indices(nbnxn_grid_t *grid,
 +                                int a0, int a1,
 +                                rvec *x,
 +                                int dd_zone, const int *move,
 +                                int thread, int nthread,
 +                                int *cell,
 +                                int *cxy_na)
 +{
 +    int  n0, n1, i;
 +    int  cx, cy;
 +
 +    /* We add one extra cell for particles which moved during DD */
 +    for (i = 0; i < grid->ncx*grid->ncy+1; i++)
 +    {
 +        cxy_na[i] = 0;
 +    }
 +
 +    n0 = a0 + (int)((thread+0)*(a1 - a0))/nthread;
 +    n1 = a0 + (int)((thread+1)*(a1 - a0))/nthread;
 +    if (dd_zone == 0)
 +    {
 +        /* Home zone */
 +        for (i = n0; i < n1; i++)
 +        {
 +            if (move == NULL || move[i] >= 0)
 +            {
 +                /* We need to be careful with rounding;
 +                 * particles might be a few bits outside the local zone.
 +                 * The int cast takes care of the lower bound,
 +                 * while we explicitly take care of the upper bound.
 +                 */
 +                cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
 +                cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
 +
 +#ifndef NDEBUG
 +                if (cx < 0 || cx > grid->ncx ||
 +                    cy < 0 || cy > grid->ncy)
 +                {
 +                    gmx_fatal(FARGS,
 +                              "grid cell cx %d cy %d out of range (max %d %d)\n"
 +                              "atom %f %f %f, grid->c0 %f %f",
 +                              cx, cy, grid->ncx, grid->ncy,
 +                              x[i][XX], x[i][YY], x[i][ZZ], grid->c0[XX], grid->c0[YY]);
 +                }
 +#endif
 +                /* Take care of potential rounding issues */
 +                cx = min(cx, grid->ncx - 1);
 +                cy = min(cy, grid->ncy - 1);
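 +                /* Example: with c0[XX] = 0 and inv_sx = 1, an atom at
 +                 * x = -1e-7 gives (int)(-1e-7) = 0, so truncation toward
 +                 * zero handles the lower edge, while an atom exactly on
 +                 * the upper edge gives cx = ncx, which the clamp above
 +                 * maps back to the last column ncx-1.
 +                 */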
 +
 +                /* For the moment, cell will contain only the grid-local
 +                 * x and y indices, not z.
 +                 */
 +                cell[i] = cx*grid->ncy + cy;
 +            }
 +            else
 +            {
 +                /* Put this moved particle after the end of the grid,
 +                 * so we can process it later without using conditionals.
 +                 */
 +                cell[i] = grid->ncx*grid->ncy;
 +            }
 +
 +            cxy_na[cell[i]]++;
 +        }
 +    }
 +    else
 +    {
 +        /* Non-home zone */
 +        for (i = n0; i < n1; i++)
 +        {
 +            cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
 +            cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
 +
 +            /* For non-home zones there could be particles outside
 +             * the non-bonded cut-off range, which have been communicated
 +             * for bonded interactions only. For the result it doesn't
 +             * matter where these end up on the grid. For performance
 +             * we put them in an extra row at the border.
 +             */
 +            cx = max(cx, 0);
 +            cx = min(cx, grid->ncx - 1);
 +            cy = max(cy, 0);
 +            cy = min(cy, grid->ncy - 1);
 +
 +            /* For the moment, cell will contain only the grid-local
 +             * x and y indices, not z.
 +             */
 +            cell[i] = cx*grid->ncy + cy;
 +
 +            cxy_na[cell[i]]++;
 +        }
 +    }
 +}
 +
 +/* Determine in which grid cells the atoms should go */
 +static void calc_cell_indices(const nbnxn_search_t nbs,
 +                              int dd_zone,
 +                              nbnxn_grid_t *grid,
 +                              int a0, int a1,
 +                              const int *atinfo,
 +                              rvec *x,
 +                              const int *move,
 +                              nbnxn_atomdata_t *nbat)
 +{
 +    int   n0, n1, i;
 +    int   cx, cy, cxy, ncz_max, ncz;
 +    int   nthread, thread;
 +    int  *cxy_na, cxy_na_i;
 +
 +    nthread = gmx_omp_nthreads_get(emntPairsearch);
 +
 +#pragma omp parallel for num_threads(nthread) schedule(static)
 +    for (thread = 0; thread < nthread; thread++)
 +    {
 +        calc_column_indices(grid, a0, a1, x, dd_zone, move, thread, nthread,
 +                            nbs->cell, nbs->work[thread].cxy_na);
 +    }
 +
 +    /* Make the cell index as a function of x and y */
 +    ncz_max          = 0;
 +    ncz              = 0;
 +    grid->cxy_ind[0] = 0;
 +    for (i = 0; i < grid->ncx*grid->ncy+1; i++)
 +    {
 +        /* We set ncz_max at the beginning of the loop instead of at the
 +         * end, so that we skip i=grid->ncx*grid->ncy, which holds the
 +         * moved particles that do not need to be ordered on the grid.
 +         */
 +        if (ncz > ncz_max)
 +        {
 +            ncz_max = ncz;
 +        }
 +        cxy_na_i = nbs->work[0].cxy_na[i];
 +        for (thread = 1; thread < nthread; thread++)
 +        {
 +            cxy_na_i += nbs->work[thread].cxy_na[i];
 +        }
 +        ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc;
 +        if (nbat->XFormat == nbatX8)
 +        {
 +            /* Make the number of cells a multiple of 2 */
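 +            /* ((ncz + 1) & ~1) clears the lowest bit after adding one,
 +             * i.e. it rounds odd counts up: 3 -> 4, while 4 stays 4.
 +             */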
 +            ncz = (ncz + 1) & ~1;
 +        }
 +        grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
 +        /* Clear cxy_na, so we can reuse the array below */
 +        grid->cxy_na[i] = 0;
 +    }
 +    grid->nc = grid->cxy_ind[grid->ncx*grid->ncy] - grid->cxy_ind[0];
 +
 +    nbat->natoms = (grid->cell0 + grid->nc)*grid->na_sc;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
 +                grid->na_sc, grid->na_c, grid->nc,
 +                grid->ncx, grid->ncy, grid->nc/((double)(grid->ncx*grid->ncy)),
 +                ncz_max);
 +        if (gmx_debug_at)
 +        {
 +            i = 0;
 +            for (cy = 0; cy < grid->ncy; cy++)
 +            {
 +                for (cx = 0; cx < grid->ncx; cx++)
 +                {
 +                    fprintf(debug, " %2d", grid->cxy_ind[i+1]-grid->cxy_ind[i]);
 +                    i++;
 +                }
 +                fprintf(debug, "\n");
 +            }
 +        }
 +    }
 +
 +    /* Make sure the work array for sorting is large enough */
 +    if (ncz_max*grid->na_sc*SGSF > nbs->work[0].sort_work_nalloc)
 +    {
 +        for (thread = 0; thread < nbs->nthread_max; thread++)
 +        {
 +            nbs->work[thread].sort_work_nalloc =
 +                over_alloc_large(ncz_max*grid->na_sc*SGSF);
 +            srenew(nbs->work[thread].sort_work,
 +                   nbs->work[thread].sort_work_nalloc);
 +            /* When not in use, all elements should be -1 */
 +            for (i = 0; i < nbs->work[thread].sort_work_nalloc; i++)
 +            {
 +                nbs->work[thread].sort_work[i] = -1;
 +            }
 +        }
 +    }
 +
 +    /* Now that we know the dimensions, we can fill the grid.
 +     * This is the first, unsorted fill. We sort the columns after this.
 +     */
 +    for (i = a0; i < a1; i++)
 +    {
 +        /* At this point nbs->cell contains the local grid x,y indices */
 +        cxy = nbs->cell[i];
 +        nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
 +    }
 +
 +    if (dd_zone == 0)
 +    {
 +        /* Set the cell indices for the moved particles */
 +        n0 = grid->nc*grid->na_sc;
 +        n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
 +        for (i = n0; i < n1; i++)
 +        {
 +            nbs->cell[nbs->a[i]] = i;
 +        }
 +    }
 +
 +    /* Sort the super-cell columns along z into the sub-cells. */
 +#pragma omp parallel for num_threads(nbs->nthread_max) schedule(static)
 +    for (thread = 0; thread < nbs->nthread_max; thread++)
 +    {
 +        if (grid->bSimple)
 +        {
 +            sort_columns_simple(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
 +                                ((thread+0)*grid->ncx*grid->ncy)/nthread,
 +                                ((thread+1)*grid->ncx*grid->ncy)/nthread,
 +                                nbs->work[thread].sort_work);
 +        }
 +        else
 +        {
 +            sort_columns_supersub(nbs, dd_zone, grid, a0, a1, atinfo, x, nbat,
 +                                  ((thread+0)*grid->ncx*grid->ncy)/nthread,
 +                                  ((thread+1)*grid->ncx*grid->ncy)/nthread,
 +                                  nbs->work[thread].sort_work);
 +        }
 +    }
 +
 +    if (grid->bSimple && nbat->XFormat == nbatX8)
 +    {
 +        combine_bounding_box_pairs(grid, grid->bb);
 +    }
 +
 +    if (!grid->bSimple)
 +    {
 +        grid->nsubc_tot = 0;
 +        for (i = 0; i < grid->nc; i++)
 +        {
 +            grid->nsubc_tot += grid->nsubc[i];
 +        }
 +    }
 +
 +    if (debug)
 +    {
 +        if (grid->bSimple)
 +        {
 +            print_bbsizes_simple(debug, nbs, grid);
 +        }
 +        else
 +        {
 +            fprintf(debug, "ns non-zero sub-cells: %d average atoms %.2f\n",
 +                    grid->nsubc_tot, (a1-a0)/(double)grid->nsubc_tot);
 +
 +            print_bbsizes_supersub(debug, nbs, grid);
 +        }
 +    }
 +}
 +
 +static void init_buffer_flags(nbnxn_buffer_flags_t *flags,
 +                              int                   natoms)
 +{
 +    int b;
 +
 +    flags->nflag = (natoms + NBNXN_BUFFERFLAG_SIZE - 1)/NBNXN_BUFFERFLAG_SIZE;
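 +    /* Standard ceiling division: (natoms + size - 1)/size flag blocks
 +     * cover all atoms, e.g. with a block size of 64, 65 atoms need
 +     * 2 flag blocks.
 +     */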
 +    if (flags->nflag > flags->flag_nalloc)
 +    {
 +        flags->flag_nalloc = over_alloc_large(flags->nflag);
 +        srenew(flags->flag, flags->flag_nalloc);
 +    }
 +    for (b = 0; b < flags->nflag; b++)
 +    {
 +        flags->flag[b] = 0;
 +    }
 +}
 +
 +/* Sets up a grid and puts the atoms on the grid.
 + * This function only operates on one domain of the domain decomposition.
 + * Note that without domain decomposition there is only one domain.
 + */
 +void nbnxn_put_on_grid(nbnxn_search_t nbs,
 +                       int ePBC, matrix box,
 +                       int dd_zone,
 +                       rvec corner0, rvec corner1,
 +                       int a0, int a1,
 +                       real atom_density,
 +                       const int *atinfo,
 +                       rvec *x,
 +                       int nmoved, int *move,
 +                       int nb_kernel_type,
 +                       nbnxn_atomdata_t *nbat)
 +{
 +    nbnxn_grid_t *grid;
 +    int           n;
 +    int           nc_max_grid, nc_max;
 +
 +    grid = &nbs->grid[dd_zone];
 +
 +    nbs_cycle_start(&nbs->cc[enbsCCgrid]);
 +
 +    grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type);
 +
 +    grid->na_c      = nbnxn_kernel_to_ci_size(nb_kernel_type);
 +    grid->na_cj     = nbnxn_kernel_to_cj_size(nb_kernel_type);
 +    grid->na_sc     = (grid->bSimple ? 1 : GPU_NSUBCELL)*grid->na_c;
 +    grid->na_c_2log = get_2log(grid->na_c);
 +
 +    nbat->na_c = grid->na_c;
 +
 +    if (dd_zone == 0)
 +    {
 +        grid->cell0 = 0;
 +    }
 +    else
 +    {
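 +        /* Grids of different zones can use different cell sizes (na_sc),
 +         * so the end of the previous grid is converted from its cell
 +         * units into this grid's units via the ratio of the two na_sc
 +         * values.
 +         */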
 +        grid->cell0 =
 +            (nbs->grid[dd_zone-1].cell0 + nbs->grid[dd_zone-1].nc)*
 +            nbs->grid[dd_zone-1].na_sc/grid->na_sc;
 +    }
 +
 +    n = a1 - a0;
 +
 +    if (dd_zone == 0)
 +    {
 +        nbs->ePBC = ePBC;
 +        copy_mat(box, nbs->box);
 +
 +        if (atom_density >= 0)
 +        {
 +            grid->atom_density = atom_density;
 +        }
 +        else
 +        {
 +            grid->atom_density = grid_atom_density(n-nmoved, corner0, corner1);
 +        }
 +
 +        grid->cell0 = 0;
 +
 +        nbs->natoms_local    = a1 - nmoved;
 +        /* We assume that nbnxn_put_on_grid is called first
 +         * for the local atoms (dd_zone=0).
 +         */
 +        nbs->natoms_nonlocal = a1 - nmoved;
 +    }
 +    else
 +    {
 +        nbs->natoms_nonlocal = max(nbs->natoms_nonlocal, a1);
 +    }
 +
 +    nc_max_grid = set_grid_size_xy(nbs, grid,
 +                                   dd_zone, n-nmoved, corner0, corner1,
 +                                   nbs->grid[0].atom_density);
 +
 +    nc_max = grid->cell0 + nc_max_grid;
 +
 +    if (a1 > nbs->cell_nalloc)
 +    {
 +        nbs->cell_nalloc = over_alloc_large(a1);
 +        srenew(nbs->cell, nbs->cell_nalloc);
 +    }
 +
 +    /* To avoid conditionals we store the moved particles at the end of a;
 +     * make sure we have enough space.
 +     */
 +    if (nc_max*grid->na_sc + nmoved > nbs->a_nalloc)
 +    {
 +        nbs->a_nalloc = over_alloc_large(nc_max*grid->na_sc + nmoved);
 +        srenew(nbs->a, nbs->a_nalloc);
 +    }
 +
 +    /* We need padding up to a multiple of the buffer flag size: simply add */
 +    if (nc_max*grid->na_sc + NBNXN_BUFFERFLAG_SIZE > nbat->nalloc)
 +    {
 +        nbnxn_atomdata_realloc(nbat, nc_max*grid->na_sc+NBNXN_BUFFERFLAG_SIZE);
 +    }
 +
 +    calc_cell_indices(nbs, dd_zone, grid, a0, a1, atinfo, x, move, nbat);
 +
 +    if (dd_zone == 0)
 +    {
 +        nbat->natoms_local = nbat->natoms;
 +    }
 +
 +    nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
 +}
 +
 +/* Calls nbnxn_put_on_grid for all non-local domains */
 +void nbnxn_put_on_grid_nonlocal(nbnxn_search_t            nbs,
 +                                const gmx_domdec_zones_t *zones,
 +                                const int                *atinfo,
 +                                rvec                     *x,
 +                                int                       nb_kernel_type,
 +                                nbnxn_atomdata_t         *nbat)
 +{
 +    int  zone, d;
 +    rvec c0, c1;
 +
 +    for (zone = 1; zone < zones->n; zone++)
 +    {
 +        for (d = 0; d < DIM; d++)
 +        {
 +            c0[d] = zones->size[zone].bb_x0[d];
 +            c1[d] = zones->size[zone].bb_x1[d];
 +        }
 +
 +        nbnxn_put_on_grid(nbs, nbs->ePBC, NULL,
 +                          zone, c0, c1,
 +                          zones->cg_range[zone],
 +                          zones->cg_range[zone+1],
 +                          -1,
 +                          atinfo,
 +                          x,
 +                          0, NULL,
 +                          nb_kernel_type,
 +                          nbat);
 +    }
 +}
 +
 +/* Add simple grid type information to the local super/sub grid */
 +void nbnxn_grid_add_simple(nbnxn_search_t    nbs,
 +                           nbnxn_atomdata_t *nbat)
 +{
 +    nbnxn_grid_t *grid;
-         srenew(grid->bb_simple, grid->nc_nalloc_simple*NNBSBB_B);
++    float        *bbcz;
++    nbnxn_bb_t   *bb;
 +    int           ncd, sc;
 +
 +    grid = &nbs->grid[0];
 +
 +    if (grid->bSimple)
 +    {
 +        gmx_incons("nbnxn_grid_simple called with a simple grid");
 +    }
 +
 +    ncd = grid->na_sc/NBNXN_CPU_CLUSTER_I_SIZE;
 +
 +    if (grid->nc*ncd > grid->nc_nalloc_simple)
 +    {
 +        grid->nc_nalloc_simple = over_alloc_large(grid->nc*ncd);
 +        srenew(grid->bbcz_simple, grid->nc_nalloc_simple*NNBSBB_D);
-                                                bb+tx*NNBSBB_B);
++        srenew(grid->bb_simple, grid->nc_nalloc_simple);
 +        srenew(grid->flags_simple, grid->nc_nalloc_simple);
 +        if (nbat->XFormat)
 +        {
 +            sfree_aligned(grid->bbj);
 +            snew_aligned(grid->bbj, grid->nc_nalloc_simple/2, 16);
 +        }
 +    }
 +
 +    bbcz = grid->bbcz_simple;
 +    bb   = grid->bb_simple;
 +
 +#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
 +    for (sc = 0; sc < grid->nc; sc++)
 +    {
 +        int c, tx, na;
 +
 +        for (c = 0; c < ncd; c++)
 +        {
 +            tx = sc*ncd + c;
 +
 +            na = NBNXN_CPU_CLUSTER_I_SIZE;
 +            while (na > 0 &&
 +                   nbat->type[tx*NBNXN_CPU_CLUSTER_I_SIZE+na-1] == nbat->ntype-1)
 +            {
 +                na--;
 +            }
 +
 +            if (na > 0)
 +            {
 +                switch (nbat->XFormat)
 +                {
 +                    case nbatX4:
 +                        /* PACK_X4==NBNXN_CPU_CLUSTER_I_SIZE, so this is simple */
 +                        calc_bounding_box_x_x4(na, nbat->x+tx*STRIDE_P4,
-                                                bb+tx*NNBSBB_B);
++                                               bb+tx);
 +                        break;
 +                    case nbatX8:
 +                        /* PACK_X8>NBNXN_CPU_CLUSTER_I_SIZE, more complicated */
 +                        calc_bounding_box_x_x8(na, nbat->x+X8_IND_A(tx*NBNXN_CPU_CLUSTER_I_SIZE),
-                                           bb+tx*NNBSBB_B);
++                                               bb+tx);
 +                        break;
 +                    default:
 +                        calc_bounding_box(na, nbat->xstride,
 +                                          nbat->x+tx*NBNXN_CPU_CLUSTER_I_SIZE*nbat->xstride,
-                 bbcz[tx*NNBSBB_D+0] = bb[tx*NNBSBB_B         +ZZ];
-                 bbcz[tx*NNBSBB_D+1] = bb[tx*NNBSBB_B+NNBSBB_C+ZZ];
++                                          bb+tx);
 +                        break;
 +                }
-                        const float *bb)
++                bbcz[tx*NNBSBB_D+0] = bb[tx].lower[BB_Z];
++                bbcz[tx*NNBSBB_D+1] = bb[tx].upper[BB_Z];
 +
 +                /* No interaction optimization yet here */
 +                grid->flags_simple[tx] = NBNXN_CI_DO_LJ(0) | NBNXN_CI_DO_COUL(0);
 +            }
 +            else
 +            {
 +                grid->flags_simple[tx] = 0;
 +            }
 +        }
 +    }
 +
 +    if (grid->bSimple && nbat->XFormat == nbatX8)
 +    {
 +        combine_bounding_box_pairs(grid, grid->bb_simple);
 +    }
 +}
 +
 +void nbnxn_get_ncells(nbnxn_search_t nbs, int *ncx, int *ncy)
 +{
 +    *ncx = nbs->grid[0].ncx;
 +    *ncy = nbs->grid[0].ncy;
 +}
 +
 +void nbnxn_get_atomorder(nbnxn_search_t nbs, int **a, int *n)
 +{
 +    const nbnxn_grid_t *grid;
 +
 +    grid = &nbs->grid[0];
 +
 +    /* Return the atom order for the home cell (index 0) */
 +    *a  = nbs->a;
 +
 +    *n = grid->cxy_ind[grid->ncx*grid->ncy]*grid->na_sc;
 +}
 +
 +void nbnxn_set_atomorder(nbnxn_search_t nbs)
 +{
 +    nbnxn_grid_t *grid;
 +    int           ao, cx, cy, cxy, cz, j;
 +
 +    /* Set the atom order for the home cell (index 0) */
 +    grid = &nbs->grid[0];
 +
 +    ao = 0;
 +    for (cx = 0; cx < grid->ncx; cx++)
 +    {
 +        for (cy = 0; cy < grid->ncy; cy++)
 +        {
 +            cxy = cx*grid->ncy + cy;
 +            j   = grid->cxy_ind[cxy]*grid->na_sc;
 +            for (cz = 0; cz < grid->cxy_na[cxy]; cz++)
 +            {
 +                nbs->a[j]     = ao;
 +                nbs->cell[ao] = j;
 +                ao++;
 +                j++;
 +            }
 +        }
 +    }
 +}
 +
 +/* Determines the cell range along one dimension that
 + * the bounding box b0 - b1 sees.
 + */
 +static void get_cell_range(real b0, real b1,
 +                           int nc, real c0, real s, real invs,
 +                           real d2, real r2, int *cf, int *cl)
 +{
 +    *cf = max((int)((b0 - c0)*invs), 0);
 +
 +    while (*cf > 0 && d2 + sqr((b0 - c0) - (*cf-1+1)*s) < r2)
 +    {
 +        (*cf)--;
 +    }
 +
 +    *cl = min((int)((b1 - c0)*invs), nc-1);
 +    while (*cl < nc-1 && d2 + sqr((*cl+1)*s - (b1 - c0)) < r2)
 +    {
 +        (*cl)++;
 +    }
 +}
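 +/* Example with illustrative numbers: for b0 = 1.2, b1 = 1.8, c0 = 0,
 + * s = 0.5, invs = 2 and nc = 10, the initial range is cf = 2, cl = 3;
 + * the while loops then extend the range while the near edge of the
 + * neighboring cell is still within the interaction radius, i.e. while
 + * d2 plus the squared edge distance is below r2.
 + */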
 +
 +/* Reference code calculating the distance^2 between two bounding boxes */
 +static float box_dist2(float bx0, float bx1, float by0,
 +                       float by1, float bz0, float bz1,
-     dl  = bx0 - bb[BBU_X];
-     dh  = bb[BBL_X] - bx1;
++                       const nbnxn_bb_t *bb)
 +{
 +    float d2;
 +    float dl, dh, dm, dm0;
 +
 +    d2 = 0;
 +
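 +    /* Per dimension: dl and dh are the signed gaps between the two boxes
 +     * in both orders; max(dl, dh, 0) is the actual separation (zero when
 +     * the intervals overlap), which is squared and accumulated.
 +     */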
-     dl  = by0 - bb[BBU_Y];
-     dh  = bb[BBL_Y] - by1;
++    dl  = bx0 - bb->upper[BB_X];
++    dh  = bb->lower[BB_X] - bx1;
 +    dm  = max(dl, dh);
 +    dm0 = max(dm, 0);
 +    d2 += dm0*dm0;
 +
-     dl  = bz0 - bb[BBU_Z];
-     dh  = bb[BBL_Z] - bz1;
++    dl  = by0 - bb->upper[BB_Y];
++    dh  = bb->lower[BB_Y] - by1;
 +    dm  = max(dl, dh);
 +    dm0 = max(dm, 0);
 +    d2 += dm0*dm0;
 +
- static float subc_bb_dist2(int si, const float *bb_i_ci,
-                            int csj, const float *bb_j_all)
++    dl  = bz0 - bb->upper[BB_Z];
++    dh  = bb->lower[BB_Z] - bz1;
 +    dm  = max(dl, dh);
 +    dm0 = max(dm, 0);
 +    d2 += dm0*dm0;
 +
 +    return d2;
 +}
 +
 +/* Plain C code calculating the distance^2 between two bounding boxes */
-     const float *bb_i, *bb_j;
-     float        d2;
-     float        dl, dh, dm, dm0;
++static float subc_bb_dist2(int si, const nbnxn_bb_t *bb_i_ci,
++                           int csj, const nbnxn_bb_t *bb_j_all)
 +{
-     bb_i = bb_i_ci  +  si*NNBSBB_B;
-     bb_j = bb_j_all + csj*NNBSBB_B;
++    const nbnxn_bb_t *bb_i, *bb_j;
++    float             d2;
++    float             dl, dh, dm, dm0;
 +
-     dl  = bb_i[BBL_X] - bb_j[BBU_X];
-     dh  = bb_j[BBL_X] - bb_i[BBU_X];
++    bb_i = bb_i_ci  +  si;
++    bb_j = bb_j_all + csj;
 +
 +    d2 = 0;
 +
-     dl  = bb_i[BBL_Y] - bb_j[BBU_Y];
-     dh  = bb_j[BBL_Y] - bb_i[BBU_Y];
++    dl  = bb_i->lower[BB_X] - bb_j->upper[BB_X];
++    dh  = bb_j->lower[BB_X] - bb_i->upper[BB_X];
 +    dm  = max(dl, dh);
 +    dm0 = max(dm, 0);
 +    d2 += dm0*dm0;
 +
-     dl  = bb_i[BBL_Z] - bb_j[BBU_Z];
-     dh  = bb_j[BBL_Z] - bb_i[BBU_Z];
++    dl  = bb_i->lower[BB_Y] - bb_j->upper[BB_Y];
++    dh  = bb_j->lower[BB_Y] - bb_i->upper[BB_Y];
 +    dm  = max(dl, dh);
 +    dm0 = max(dm, 0);
 +    d2 += dm0*dm0;
 +
- static float subc_bb_dist2_sse(int si, const float *bb_i_ci,
-                                int csj, const float *bb_j_all)
++    dl  = bb_i->lower[BB_Z] - bb_j->upper[BB_Z];
++    dh  = bb_j->lower[BB_Z] - bb_i->upper[BB_Z];
 +    dm  = max(dl, dh);
 +    dm0 = max(dm, 0);
 +    d2 += dm0*dm0;
 +
 +    return d2;
 +}
 +
 +#ifdef NBNXN_SEARCH_BB_SSE
 +
 +/* SSE code for bb distance for bb format xyz0 */
-     const float *bb_i, *bb_j;
++static float subc_bb_dist2_sse(int si, const nbnxn_bb_t *bb_i_ci,
++                               int csj, const nbnxn_bb_t *bb_j_all)
 +{
-     bb_i = bb_i_ci  +  si*NNBSBB_B;
-     bb_j = bb_j_all + csj*NNBSBB_B;
-     bb_i_SSE0 = _mm_load_ps(bb_i);
-     bb_i_SSE1 = _mm_load_ps(bb_i+NNBSBB_C);
-     bb_j_SSE0 = _mm_load_ps(bb_j);
-     bb_j_SSE1 = _mm_load_ps(bb_j+NNBSBB_C);
 +    __m128       bb_i_SSE0, bb_i_SSE1;
 +    __m128       bb_j_SSE0, bb_j_SSE1;
 +    __m128       dl_SSE;
 +    __m128       dh_SSE;
 +    __m128       dm_SSE;
 +    __m128       dm0_SSE;
 +    __m128       d2_SSE;
 +#ifndef GMX_X86_SSE4_1
 +    float        d2_array[7], *d2_align;
 +
 +    d2_align = (float *)(((size_t)(d2_array+3)) & (~((size_t)15)));
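 +    /* d2_array carries 3 floats of padding so that masking the address
 +     * down to a 16-byte boundary still leaves room for the 4 aligned
 +     * floats that _mm_store_ps writes below.
 +     */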
 +#else
 +    float d2;
 +#endif
 +
-     snew_aligned(nbl->work->bb_ci, GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX, NBNXN_MEM_ALIGN);
++    bb_i_SSE0 = _mm_load_ps(&bb_i_ci[si].lower[0]);
++    bb_i_SSE1 = _mm_load_ps(&bb_i_ci[si].upper[0]);
++    bb_j_SSE0 = _mm_load_ps(&bb_j_all[csj].lower[0]);
++    bb_j_SSE1 = _mm_load_ps(&bb_j_all[csj].upper[0]);
 +
 +    dl_SSE    = _mm_sub_ps(bb_i_SSE0, bb_j_SSE1);
 +    dh_SSE    = _mm_sub_ps(bb_j_SSE0, bb_i_SSE1);
 +
 +    dm_SSE    = _mm_max_ps(dl_SSE, dh_SSE);
 +    dm0_SSE   = _mm_max_ps(dm_SSE, _mm_setzero_ps());
 +#ifndef GMX_X86_SSE4_1
 +    d2_SSE    = _mm_mul_ps(dm0_SSE, dm0_SSE);
 +
 +    _mm_store_ps(d2_align, d2_SSE);
 +
 +    return d2_align[0] + d2_align[1] + d2_align[2];
 +#else
 +    /* SSE4.1 dot product of components 0,1,2 */
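 +    /* In the 0x71 immediate, the high nibble 0x7 selects elements 0-2
 +     * for the multiply-and-sum and the low nibble 0x1 writes the result
 +     * to element 0 only, from which _mm_store_ss extracts it.
 +     */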
 +    d2_SSE    = _mm_dp_ps(dm0_SSE, dm0_SSE, 0x71);
 +
 +    _mm_store_ss(&d2, d2_SSE);
 +
 +    return d2;
 +#endif
 +}
 +
 +/* Calculate the squared bb distances of bb_i[si,...,si+3] and store them in d2 */
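 +/* In the packed-bb (xxxxyyyyzzzz) layout assumed here, bounding boxes
 + * are stored in groups of STRIDE_PBB: first the STRIDE_PBB lower-x
 + * corners, then lower-y, lower-z, upper-x, upper-y and upper-z, which
 + * is what the six strided loads below rely on.
 + */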
 +#define SUBC_BB_DIST2_SSE_XXXX_INNER(si, bb_i, d2) \
 +    {                                                \
 +        int    shi;                                  \
 +                                                 \
 +        __m128 dx_0, dy_0, dz_0;                       \
 +        __m128 dx_1, dy_1, dz_1;                       \
 +                                                 \
 +        __m128 mx, my, mz;                             \
 +        __m128 m0x, m0y, m0z;                          \
 +                                                 \
 +        __m128 d2x, d2y, d2z;                          \
 +        __m128 d2s, d2t;                              \
 +                                                 \
 +        shi = si*NNBSBB_D*DIM;                       \
 +                                                 \
 +        xi_l = _mm_load_ps(bb_i+shi+0*STRIDE_PBB);   \
 +        yi_l = _mm_load_ps(bb_i+shi+1*STRIDE_PBB);   \
 +        zi_l = _mm_load_ps(bb_i+shi+2*STRIDE_PBB);   \
 +        xi_h = _mm_load_ps(bb_i+shi+3*STRIDE_PBB);   \
 +        yi_h = _mm_load_ps(bb_i+shi+4*STRIDE_PBB);   \
 +        zi_h = _mm_load_ps(bb_i+shi+5*STRIDE_PBB);   \
 +                                                 \
 +        dx_0 = _mm_sub_ps(xi_l, xj_h);                \
 +        dy_0 = _mm_sub_ps(yi_l, yj_h);                \
 +        dz_0 = _mm_sub_ps(zi_l, zj_h);                \
 +                                                 \
 +        dx_1 = _mm_sub_ps(xj_l, xi_h);                \
 +        dy_1 = _mm_sub_ps(yj_l, yi_h);                \
 +        dz_1 = _mm_sub_ps(zj_l, zi_h);                \
 +                                                 \
 +        mx   = _mm_max_ps(dx_0, dx_1);                \
 +        my   = _mm_max_ps(dy_0, dy_1);                \
 +        mz   = _mm_max_ps(dz_0, dz_1);                \
 +                                                 \
 +        m0x  = _mm_max_ps(mx, zero);                  \
 +        m0y  = _mm_max_ps(my, zero);                  \
 +        m0z  = _mm_max_ps(mz, zero);                  \
 +                                                 \
 +        d2x  = _mm_mul_ps(m0x, m0x);                  \
 +        d2y  = _mm_mul_ps(m0y, m0y);                  \
 +        d2z  = _mm_mul_ps(m0z, m0z);                  \
 +                                                 \
 +        d2s  = _mm_add_ps(d2x, d2y);                  \
 +        d2t  = _mm_add_ps(d2s, d2z);                  \
 +                                                 \
 +        _mm_store_ps(d2+si, d2t);                     \
 +    }
 +
 +/* SSE code for nsi bb distances for bb format xxxxyyyyzzzz */
 +static void subc_bb_dist2_sse_xxxx(const float *bb_j,
 +                                   int nsi, const float *bb_i,
 +                                   float *d2)
 +{
 +    __m128 xj_l, yj_l, zj_l;
 +    __m128 xj_h, yj_h, zj_h;
 +    __m128 xi_l, yi_l, zi_l;
 +    __m128 xi_h, yi_h, zi_h;
 +
 +    __m128 zero;
 +
 +    zero = _mm_setzero_ps();
 +
 +    xj_l = _mm_set1_ps(bb_j[0*STRIDE_PBB]);
 +    yj_l = _mm_set1_ps(bb_j[1*STRIDE_PBB]);
 +    zj_l = _mm_set1_ps(bb_j[2*STRIDE_PBB]);
 +    xj_h = _mm_set1_ps(bb_j[3*STRIDE_PBB]);
 +    yj_h = _mm_set1_ps(bb_j[4*STRIDE_PBB]);
 +    zj_h = _mm_set1_ps(bb_j[5*STRIDE_PBB]);
 +
 +    /* Here we "loop" over si (0,STRIDE_PBB) from 0 to nsi with step STRIDE_PBB.
 +     * But as we know the number of iterations is 1 or 2, we unroll manually.
 +     */
 +    SUBC_BB_DIST2_SSE_XXXX_INNER(0, bb_i, d2);
 +    if (STRIDE_PBB < nsi)
 +    {
 +        SUBC_BB_DIST2_SSE_XXXX_INNER(STRIDE_PBB, bb_i, d2);
 +    }
 +}
 +
 +#endif /* NBNXN_SEARCH_BB_SSE */
 +
 +/* Plain C function which determines if any atom pair between two cells
 + * is within distance sqrt(rl2).
 + */
 +static gmx_bool subc_in_range_x(int na_c,
 +                                int si, const real *x_i,
 +                                int csj, int stride, const real *x_j,
 +                                real rl2)
 +{
 +    int  i, j, i0, j0;
 +    real d2;
 +
 +    for (i = 0; i < na_c; i++)
 +    {
 +        i0 = (si*na_c + i)*DIM;
 +        for (j = 0; j < na_c; j++)
 +        {
 +            j0 = (csj*na_c + j)*stride;
 +
 +            d2 = sqr(x_i[i0  ] - x_j[j0  ]) +
 +                sqr(x_i[i0+1] - x_j[j0+1]) +
 +                sqr(x_i[i0+2] - x_j[j0+2]);
 +
 +            if (d2 < rl2)
 +            {
 +                return TRUE;
 +            }
 +        }
 +    }
 +
 +    return FALSE;
 +}
 +
 +#ifdef NBNXN_SEARCH_SSE_SINGLE
 +/* When we make separate single/double precision SIMD vector operation
 + * include files, this function should be moved there (also using FMA).
 + */
 +static inline __m128
 +gmx_mm_calc_rsq_ps(__m128 x, __m128 y, __m128 z)
 +{
 +    return _mm_add_ps( _mm_add_ps( _mm_mul_ps(x, x), _mm_mul_ps(y, y) ), _mm_mul_ps(z, z) );
 +}
 +#endif
 +
 +/* SSE function which determines if any atom pair between two cells,
 + * both with 8 atoms, is within distance sqrt(rl2).
 + * Not performance critical, so only uses plain SSE.
 + */
 +static gmx_bool subc_in_range_sse8(int na_c,
 +                                   int si, const real *x_i,
 +                                   int csj, int stride, const real *x_j,
 +                                   real rl2)
 +{
 +#ifdef NBNXN_SEARCH_SSE_SINGLE
 +    __m128 ix_SSE0, iy_SSE0, iz_SSE0;
 +    __m128 ix_SSE1, iy_SSE1, iz_SSE1;
 +
 +    __m128 rc2_SSE;
 +
 +    int    na_c_sse;
 +    int    j0, j1;
 +
 +    rc2_SSE   = _mm_set1_ps(rl2);
 +
 +    na_c_sse = NBNXN_GPU_CLUSTER_SIZE/STRIDE_PBB;
 +    ix_SSE0  = _mm_load_ps(x_i+(si*na_c_sse*DIM+0)*STRIDE_PBB);
 +    iy_SSE0  = _mm_load_ps(x_i+(si*na_c_sse*DIM+1)*STRIDE_PBB);
 +    iz_SSE0  = _mm_load_ps(x_i+(si*na_c_sse*DIM+2)*STRIDE_PBB);
 +    ix_SSE1  = _mm_load_ps(x_i+(si*na_c_sse*DIM+3)*STRIDE_PBB);
 +    iy_SSE1  = _mm_load_ps(x_i+(si*na_c_sse*DIM+4)*STRIDE_PBB);
 +    iz_SSE1  = _mm_load_ps(x_i+(si*na_c_sse*DIM+5)*STRIDE_PBB);
 +
 +    /* We loop from the outer to the inner particles to maximize
 +     * the chance that we find a pair in range quickly and return.
 +     */
 +    j0 = csj*na_c;
 +    j1 = j0 + na_c - 1;
 +    while (j0 < j1)
 +    {
 +        __m128 jx0_SSE, jy0_SSE, jz0_SSE;
 +        __m128 jx1_SSE, jy1_SSE, jz1_SSE;
 +
 +        __m128 dx_SSE0, dy_SSE0, dz_SSE0;
 +        __m128 dx_SSE1, dy_SSE1, dz_SSE1;
 +        __m128 dx_SSE2, dy_SSE2, dz_SSE2;
 +        __m128 dx_SSE3, dy_SSE3, dz_SSE3;
 +
 +        __m128 rsq_SSE0;
 +        __m128 rsq_SSE1;
 +        __m128 rsq_SSE2;
 +        __m128 rsq_SSE3;
 +
 +        __m128 wco_SSE0;
 +        __m128 wco_SSE1;
 +        __m128 wco_SSE2;
 +        __m128 wco_SSE3;
 +        __m128 wco_any_SSE01, wco_any_SSE23, wco_any_SSE;
 +
 +        jx0_SSE = _mm_load1_ps(x_j+j0*stride+0);
 +        jy0_SSE = _mm_load1_ps(x_j+j0*stride+1);
 +        jz0_SSE = _mm_load1_ps(x_j+j0*stride+2);
 +
 +        jx1_SSE = _mm_load1_ps(x_j+j1*stride+0);
 +        jy1_SSE = _mm_load1_ps(x_j+j1*stride+1);
 +        jz1_SSE = _mm_load1_ps(x_j+j1*stride+2);
 +
 +        /* Calculate distance */
 +        dx_SSE0            = _mm_sub_ps(ix_SSE0, jx0_SSE);
 +        dy_SSE0            = _mm_sub_ps(iy_SSE0, jy0_SSE);
 +        dz_SSE0            = _mm_sub_ps(iz_SSE0, jz0_SSE);
 +        dx_SSE1            = _mm_sub_ps(ix_SSE1, jx0_SSE);
 +        dy_SSE1            = _mm_sub_ps(iy_SSE1, jy0_SSE);
 +        dz_SSE1            = _mm_sub_ps(iz_SSE1, jz0_SSE);
 +        dx_SSE2            = _mm_sub_ps(ix_SSE0, jx1_SSE);
 +        dy_SSE2            = _mm_sub_ps(iy_SSE0, jy1_SSE);
 +        dz_SSE2            = _mm_sub_ps(iz_SSE0, jz1_SSE);
 +        dx_SSE3            = _mm_sub_ps(ix_SSE1, jx1_SSE);
 +        dy_SSE3            = _mm_sub_ps(iy_SSE1, jy1_SSE);
 +        dz_SSE3            = _mm_sub_ps(iz_SSE1, jz1_SSE);
 +
 +        /* rsq = dx*dx+dy*dy+dz*dz */
 +        rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0, dy_SSE0, dz_SSE0);
 +        rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1, dy_SSE1, dz_SSE1);
 +        rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2, dy_SSE2, dz_SSE2);
 +        rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3, dy_SSE3, dz_SSE3);
 +
 +        wco_SSE0           = _mm_cmplt_ps(rsq_SSE0, rc2_SSE);
 +        wco_SSE1           = _mm_cmplt_ps(rsq_SSE1, rc2_SSE);
 +        wco_SSE2           = _mm_cmplt_ps(rsq_SSE2, rc2_SSE);
 +        wco_SSE3           = _mm_cmplt_ps(rsq_SSE3, rc2_SSE);
 +
 +        wco_any_SSE01      = _mm_or_ps(wco_SSE0, wco_SSE1);
 +        wco_any_SSE23      = _mm_or_ps(wco_SSE2, wco_SSE3);
 +        wco_any_SSE        = _mm_or_ps(wco_any_SSE01, wco_any_SSE23);
 +
 +        if (_mm_movemask_ps(wco_any_SSE))
 +        {
 +            return TRUE;
 +        }
 +
 +        j0++;
 +        j1--;
 +    }
 +    return FALSE;
 +
 +#else
 +    /* No SSE */
 +    gmx_incons("SSE function called without SSE support");
 +
 +    return TRUE;
 +#endif
 +}
 +
 +/* Returns the j sub-cell for index cj_ind */
 +static int nbl_cj(const nbnxn_pairlist_t *nbl, int cj_ind)
 +{
 +    return nbl->cj4[cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG].cj[cj_ind & (NBNXN_GPU_JGROUP_SIZE - 1)];
 +}
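 +/* Example, assuming NBNXN_GPU_JGROUP_SIZE == 4 (so the 2log is 2):
 + * cj_ind = 13 selects cj4-unit 13 >> 2 = 3 and within it slot
 + * 13 & 3 = 1.
 + */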
 +
 +/* Returns the i-interaction mask of the j sub-cell for index cj_ind */
 +static unsigned nbl_imask0(const nbnxn_pairlist_t *nbl, int cj_ind)
 +{
 +    return nbl->cj4[cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG].imei[0].imask;
 +}
 +
 +/* Ensures there is enough space for 'extra' additional exclusion masks */
 +static void check_excl_space(nbnxn_pairlist_t *nbl, int extra)
 +{
 +    if (nbl->nexcl+extra > nbl->excl_nalloc)
 +    {
 +        nbl->excl_nalloc = over_alloc_small(nbl->nexcl+extra);
 +        nbnxn_realloc_void((void **)&nbl->excl,
 +                           nbl->nexcl*sizeof(*nbl->excl),
 +                           nbl->excl_nalloc*sizeof(*nbl->excl),
 +                           nbl->alloc, nbl->free);
 +    }
 +}
 +
 +/* Ensures there is enough space for ncell extra j-cells in the list */
 +static void check_subcell_list_space_simple(nbnxn_pairlist_t *nbl,
 +                                            int               ncell)
 +{
 +    int cj_max;
 +
 +    cj_max = nbl->ncj + ncell;
 +
 +    if (cj_max > nbl->cj_nalloc)
 +    {
 +        nbl->cj_nalloc = over_alloc_small(cj_max);
 +        nbnxn_realloc_void((void **)&nbl->cj,
 +                           nbl->ncj*sizeof(*nbl->cj),
 +                           nbl->cj_nalloc*sizeof(*nbl->cj),
 +                           nbl->alloc, nbl->free);
 +    }
 +}
 +
 +/* Ensures there is enough space for the j-subcells of nsupercell extra super-cells in the list */
 +static void check_subcell_list_space_supersub(nbnxn_pairlist_t *nbl,
 +                                              int               nsupercell)
 +{
 +    int ncj4_max, j4, w;
 +
 +#define NWARP       2
 +#define WARP_SIZE  32
 +
 +    /* We can have maximally nsupercell*GPU_NSUBCELL sj lists */
 +    /* We can store 4 j-subcell/i-supercell pairs in one struct.
 +     * Since we round down, we need one extra entry.
 +     */
 +    ncj4_max = ((nbl->work->cj_ind + nsupercell*GPU_NSUBCELL + NBNXN_GPU_JGROUP_SIZE - 1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
 +
 +    if (ncj4_max > nbl->cj4_nalloc)
 +    {
 +        nbl->cj4_nalloc = over_alloc_small(ncj4_max);
 +        nbnxn_realloc_void((void **)&nbl->cj4,
 +                           nbl->work->cj4_init*sizeof(*nbl->cj4),
 +                           nbl->cj4_nalloc*sizeof(*nbl->cj4),
 +                           nbl->alloc, nbl->free);
 +    }
 +
 +    if (ncj4_max > nbl->work->cj4_init)
 +    {
 +        for (j4 = nbl->work->cj4_init; j4 < ncj4_max; j4++)
 +        {
 +            /* No i-subcells and no excl's in the list initially */
 +            for (w = 0; w < NWARP; w++)
 +            {
 +                nbl->cj4[j4].imei[w].imask    = 0U;
 +                nbl->cj4[j4].imei[w].excl_ind = 0;
 +
 +            }
 +        }
 +        nbl->work->cj4_init = ncj4_max;
 +    }
 +}
 +
 +/* Set all exclusion masks for one GPU warp to no exclusions */
 +static void set_no_excls(nbnxn_excl_t *excl)
 +{
 +    int t;
 +
 +    for (t = 0; t < WARP_SIZE; t++)
 +    {
 +        /* Turn all interaction bits on */
 +        excl->pair[t] = NBNXN_INTERACTION_MASK_ALL;
 +    }
 +}
 +
 +/* Initializes a single nbnxn_pairlist_t data structure */
 +static void nbnxn_init_pairlist(nbnxn_pairlist_t *nbl,
 +                                gmx_bool          bSimple,
 +                                nbnxn_alloc_t    *alloc,
 +                                nbnxn_free_t     *free)
 +{
 +    if (alloc == NULL)
 +    {
 +        nbl->alloc = nbnxn_alloc_aligned;
 +    }
 +    else
 +    {
 +        nbl->alloc = alloc;
 +    }
 +    if (free == NULL)
 +    {
 +        nbl->free = nbnxn_free_aligned;
 +    }
 +    else
 +    {
 +        nbl->free = free;
 +    }
 +
 +    nbl->bSimple     = bSimple;
 +    nbl->na_sc       = 0;
 +    nbl->na_ci       = 0;
 +    nbl->na_cj       = 0;
 +    nbl->nci         = 0;
 +    nbl->ci          = NULL;
 +    nbl->ci_nalloc   = 0;
 +    nbl->ncj         = 0;
 +    nbl->cj          = NULL;
 +    nbl->cj_nalloc   = 0;
 +    nbl->ncj4        = 0;
 +    /* We need one element extra in sj, so alloc initially with 1 */
 +    nbl->cj4_nalloc  = 0;
 +    nbl->cj4         = NULL;
 +    nbl->nci_tot     = 0;
 +
 +    if (!nbl->bSimple)
 +    {
 +        nbl->excl        = NULL;
 +        nbl->excl_nalloc = 0;
 +        nbl->nexcl       = 0;
 +        check_excl_space(nbl, 1);
 +        nbl->nexcl       = 1;
 +        set_no_excls(&nbl->excl[0]);
 +    }
 +
 +    snew(nbl->work, 1);
++    if (nbl->bSimple)
++    {
++        snew_aligned(nbl->work->bb_ci, 1, NBNXN_MEM_ALIGN);
++    }
++    else
++    {
 +#ifdef NBNXN_BBXXXX
-     snew_aligned(nbl->work->bb_ci, GPU_NSUBCELL*NNBSBB_B, NBNXN_MEM_ALIGN);
++        snew_aligned(nbl->work->pbb_ci, GPU_NSUBCELL/STRIDE_PBB*NNBSBB_XXXX, NBNXN_MEM_ALIGN);
 +#else
-     const float             *bb_ci;
++        snew_aligned(nbl->work->bb_ci, GPU_NSUBCELL, NBNXN_MEM_ALIGN);
 +#endif
++    }
 +    snew_aligned(nbl->work->x_ci, NBNXN_NA_SC_MAX*DIM, NBNXN_MEM_ALIGN);
 +#ifdef GMX_NBNXN_SIMD
 +    snew_aligned(nbl->work->x_ci_simd_4xn, 1, NBNXN_MEM_ALIGN);
 +    snew_aligned(nbl->work->x_ci_simd_2xnn, 1, NBNXN_MEM_ALIGN);
 +#endif
 +    snew_aligned(nbl->work->d2, GPU_NSUBCELL, NBNXN_MEM_ALIGN);
 +
 +    nbl->work->sort            = NULL;
 +    nbl->work->sort_nalloc     = 0;
 +    nbl->work->sci_sort        = NULL;
 +    nbl->work->sci_sort_nalloc = 0;
 +}
 +
 +void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
 +                             gmx_bool bSimple, gmx_bool bCombined,
 +                             nbnxn_alloc_t *alloc,
 +                             nbnxn_free_t  *free)
 +{
 +    int i;
 +
 +    nbl_list->bSimple   = bSimple;
 +    nbl_list->bCombined = bCombined;
 +
 +    nbl_list->nnbl = gmx_omp_nthreads_get(emntNonbonded);
 +
 +    if (!nbl_list->bCombined &&
 +        nbl_list->nnbl > NBNXN_BUFFERFLAG_MAX_THREADS)
 +    {
 +        gmx_fatal(FARGS, "%d OpenMP threads were requested. Since the non-bonded force buffer reduction is prohibitively slow with more than %d threads, we do not allow this. Use %d or less OpenMP threads.",
 +                  nbl_list->nnbl, NBNXN_BUFFERFLAG_MAX_THREADS, NBNXN_BUFFERFLAG_MAX_THREADS);
 +    }
 +
 +    snew(nbl_list->nbl, nbl_list->nnbl);
 +    /* Execute in order to avoid memory interleaving between threads */
 +#pragma omp parallel for num_threads(nbl_list->nnbl) schedule(static)
 +    for (i = 0; i < nbl_list->nnbl; i++)
 +    {
 +        /* Allocate the nblist data structure locally on each thread
 +         * to optimize memory access for NUMA architectures.
 +         */
 +        snew(nbl_list->nbl[i], 1);
 +
 +        /* Only list 0 is used on the GPU, use normal allocation for i>0 */
 +        if (i == 0)
 +        {
 +            nbnxn_init_pairlist(nbl_list->nbl[i], nbl_list->bSimple, alloc, free);
 +        }
 +        else
 +        {
 +            nbnxn_init_pairlist(nbl_list->nbl[i], nbl_list->bSimple, NULL, NULL);
 +        }
 +    }
 +}
 +
 +/* Print statistics of a pair list, used for debug output */
 +static void print_nblist_statistics_simple(FILE *fp, const nbnxn_pairlist_t *nbl,
 +                                           const nbnxn_search_t nbs, real rl)
 +{
 +    const nbnxn_grid_t *grid;
 +    int                 cs[SHIFTS];
 +    int                 s, i, j;
 +    int                 npexcl;
 +
 +    /* This code only produces correct statistics with domain decomposition */
 +    grid = &nbs->grid[0];
 +
 +    fprintf(fp, "nbl nci %d ncj %d\n",
 +            nbl->nci, nbl->ncj);
 +    fprintf(fp, "nbl na_sc %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
 +            nbl->na_sc, rl, nbl->ncj, nbl->ncj/(double)grid->nc,
 +            nbl->ncj/(double)grid->nc*grid->na_sc,
 +            nbl->ncj/(double)grid->nc*grid->na_sc/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nc*grid->na_sc/det(nbs->box)));
 +
 +    fprintf(fp, "nbl average j cell list length %.1f\n",
 +            0.25*nbl->ncj/(double)nbl->nci);
 +
 +    for (s = 0; s < SHIFTS; s++)
 +    {
 +        cs[s] = 0;
 +    }
 +    npexcl = 0;
 +    for (i = 0; i < nbl->nci; i++)
 +    {
 +        cs[nbl->ci[i].shift & NBNXN_CI_SHIFT] +=
 +            nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start;
 +
 +        j = nbl->ci[i].cj_ind_start;
 +        while (j < nbl->ci[i].cj_ind_end &&
 +               nbl->cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
 +        {
 +            npexcl++;
 +            j++;
 +        }
 +    }
 +    fprintf(fp, "nbl cell pairs, total: %d excl: %d %.1f%%\n",
 +            nbl->ncj, npexcl, 100*npexcl/(double)nbl->ncj);
 +    for (s = 0; s < SHIFTS; s++)
 +    {
 +        if (cs[s] > 0)
 +        {
 +            fprintf(fp, "nbl shift %2d ncj %3d\n", s, cs[s]);
 +        }
 +    }
 +}
 +
 +/* Print statistics of a pair lists, used for debug output */
 +static void print_nblist_statistics_supersub(FILE *fp, const nbnxn_pairlist_t *nbl,
 +                                             const nbnxn_search_t nbs, real rl)
 +{
 +    const nbnxn_grid_t *grid;
 +    int                 i, j4, j, si, b;
 +    int                 c[GPU_NSUBCELL+1];
 +
 +    /* This code only produces correct statistics with domain decomposition */
 +    grid = &nbs->grid[0];
 +
 +    fprintf(fp, "nbl nsci %d ncj4 %d nsi %d excl4 %d\n",
 +            nbl->nsci, nbl->ncj4, nbl->nci_tot, nbl->nexcl);
 +    fprintf(fp, "nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
 +            nbl->na_ci, rl, nbl->nci_tot, nbl->nci_tot/(double)grid->nsubc_tot,
 +            nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c,
 +            nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nsubc_tot*grid->na_c/det(nbs->box)));
 +
 +    fprintf(fp, "nbl average j super cell list length %.1f\n",
 +            0.25*nbl->ncj4/(double)nbl->nsci);
 +    fprintf(fp, "nbl average i sub cell list length %.1f\n",
 +            nbl->nci_tot/((double)nbl->ncj4));
 +
 +    for (si = 0; si <= GPU_NSUBCELL; si++)
 +    {
 +        c[si] = 0;
 +    }
 +    for (i = 0; i < nbl->nsci; i++)
 +    {
 +        for (j4 = nbl->sci[i].cj4_ind_start; j4 < nbl->sci[i].cj4_ind_end; j4++)
 +        {
 +            for (j = 0; j < NBNXN_GPU_JGROUP_SIZE; j++)
 +            {
 +                b = 0;
 +                for (si = 0; si < GPU_NSUBCELL; si++)
 +                {
 +                    if (nbl->cj4[j4].imei[0].imask & (1U << (j*GPU_NSUBCELL + si)))
 +                    {
 +                        b++;
 +                    }
 +                }
 +                c[b]++;
 +            }
 +        }
 +    }
 +    for (b = 0; b <= GPU_NSUBCELL; b++)
 +    {
 +        fprintf(fp, "nbl j-list #i-subcell %d %7d %4.1f\n",
 +                b, c[b], 100.0*c[b]/(double)(nbl->ncj4*NBNXN_GPU_JGROUP_SIZE));
 +    }
 +}
 +
 +/* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp */
 +static void low_get_nbl_exclusions(nbnxn_pairlist_t *nbl, int cj4,
 +                                   int warp, nbnxn_excl_t **excl)
 +{
 +    if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
 +    {
 +        /* No exclusions set, make a new list entry */
 +        nbl->cj4[cj4].imei[warp].excl_ind = nbl->nexcl;
 +        nbl->nexcl++;
 +        *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
 +        set_no_excls(*excl);
 +    }
 +    else
 +    {
 +        /* We already have some exclusions, new ones can be added to the list */
 +        *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
 +    }
 +}
 +
 +/* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp,
 + * allocates extra memory, if necessary.
 + */
 +static void get_nbl_exclusions_1(nbnxn_pairlist_t *nbl, int cj4,
 +                                 int warp, nbnxn_excl_t **excl)
 +{
 +    if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
 +    {
 +        /* We need to make a new list entry, check if we have space */
 +        check_excl_space(nbl, 1);
 +    }
 +    low_get_nbl_exclusions(nbl, cj4, warp, excl);
 +}
 +
 +/* Returns pointers to the exclusion mask for cj4-unit cj4 for both warps,
 + * allocates extra memory, if necessary.
 + */
 +static void get_nbl_exclusions_2(nbnxn_pairlist_t *nbl, int cj4,
 +                                 nbnxn_excl_t **excl_w0,
 +                                 nbnxn_excl_t **excl_w1)
 +{
 +    /* Check for space we might need */
 +    check_excl_space(nbl, 2);
 +
 +    low_get_nbl_exclusions(nbl, cj4, 0, excl_w0);
 +    low_get_nbl_exclusions(nbl, cj4, 1, excl_w1);
 +}
 +
 +/* Sets the self exclusions i=j and pair exclusions i>j */
 +static void set_self_and_newton_excls_supersub(nbnxn_pairlist_t *nbl,
 +                                               int cj4_ind, int sj_offset,
 +                                               int si)
 +{
 +    nbnxn_excl_t *excl[2];
 +    int           ei, ej, w;
 +
 +    /* Here we only set the self and double pair exclusions */
 +
 +    get_nbl_exclusions_2(nbl, cj4_ind, &excl[0], &excl[1]);
 +
 +    /* Only minor < major bits set */
 +    for (ej = 0; ej < nbl->na_ci; ej++)
 +    {
 +        w = (ej>>2);
 +        for (ei = ej; ei < nbl->na_ci; ei++)
 +        {
 +            excl[w]->pair[(ej & (NBNXN_GPU_JGROUP_SIZE-1))*nbl->na_ci + ei] &=
 +                ~(1U << (sj_offset*GPU_NSUBCELL + si));
 +        }
 +    }
 +}
 +
 +/* Returns a diagonal or off-diagonal interaction mask for plain C lists */
 +static unsigned int get_imask(gmx_bool rdiag, int ci, int cj)
 +{
 +    return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
 +}
 +
 +/* Returns a diagonal or off-diagonal interaction mask for cj-size=2 */
 +static unsigned int get_imask_simd_j2(gmx_bool rdiag, int ci, int cj)
 +{
 +    return (rdiag && ci*2 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_0 :
 +            (rdiag && ci*2+1 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_1 :
 +             NBNXN_INTERACTION_MASK_ALL));
 +}
 +
 +/* Returns a diagonal or off-diagonal interaction mask for cj-size=4 */
 +static unsigned int get_imask_simd_j4(gmx_bool rdiag, int ci, int cj)
 +{
 +    return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
 +}
 +
 +/* Returns a diagonal or off-diagonal interaction mask for cj-size=8 */
 +static unsigned int get_imask_simd_j8(gmx_bool rdiag, int ci, int cj)
 +{
 +    return (rdiag && ci == cj*2 ? NBNXN_INTERACTION_MASK_DIAG_J8_0 :
 +            (rdiag && ci == cj*2+1 ? NBNXN_INTERACTION_MASK_DIAG_J8_1 :
 +             NBNXN_INTERACTION_MASK_ALL));
 +}
 +
 +#ifdef GMX_NBNXN_SIMD
 +#if GMX_SIMD_WIDTH_HERE == 2
 +#define get_imask_simd_4xn  get_imask_simd_j2
 +#endif
 +#if GMX_SIMD_WIDTH_HERE == 4
 +#define get_imask_simd_4xn  get_imask_simd_j4
 +#endif
 +#if GMX_SIMD_WIDTH_HERE == 8
 +#define get_imask_simd_4xn  get_imask_simd_j8
 +#define get_imask_simd_2xnn get_imask_simd_j4
 +#endif
 +#if GMX_SIMD_WIDTH_HERE == 16
 +#define get_imask_simd_2xnn get_imask_simd_j8
 +#endif
 +#endif
 +
 +/* Plain C code for making a pair list of cell ci vs cell cjf-cjl.
 + * Checks bounding box distances and possibly atom pair distances.
 + */
 +static void make_cluster_list_simple(const nbnxn_grid_t *gridj,
 +                                     nbnxn_pairlist_t *nbl,
 +                                     int ci, int cjf, int cjl,
 +                                     gmx_bool remove_sub_diag,
 +                                     const real *x_j,
 +                                     real rl2, float rbb2,
 +                                     int *ndistc)
 +{
 +    const nbnxn_list_work_t *work;
 +
-     const float *bb_ci;
++    const nbnxn_bb_t        *bb_ci;
 +    const real              *x_ci;
 +
 +    gmx_bool                 InRange;
 +    real                     d2;
 +    int                      cjf_gl, cjl_gl, cj;
 +
 +    work = nbl->work;
 +
 +    bb_ci = nbl->work->bb_ci;
 +    x_ci  = nbl->work->x_ci;
 +
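 +    /* Trim the candidate range from the front: advance cjf past j-cells
 +     * whose bounding box, and if that is inconclusive whose atom pairs,
 +     * lie beyond the cut-off; the second loop below trims from the back
 +     * by decreasing cjl in the same way.
 +     */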
 +    InRange = FALSE;
 +    while (!InRange && cjf <= cjl)
 +    {
 +        d2       = subc_bb_dist2(0, bb_ci, cjf, gridj->bb);
 +        *ndistc += 2;
 +
 +        /* Check if the distance is within the distance where
 +         * we use only the bounding box distance rbb,
 +         * or within the cut-off and there is at least one atom pair
 +         * within the cut-off.
 +         */
 +        if (d2 < rbb2)
 +        {
 +            InRange = TRUE;
 +        }
 +        else if (d2 < rl2)
 +        {
 +            int i, j;
 +
 +            cjf_gl = gridj->cell0 + cjf;
 +            for (i = 0; i < NBNXN_CPU_CLUSTER_I_SIZE && !InRange; i++)
 +            {
 +                for (j = 0; j < NBNXN_CPU_CLUSTER_I_SIZE; j++)
 +                {
 +                    InRange = InRange ||
 +                        (sqr(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+XX]) +
 +                         sqr(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+YY]) +
 +                         sqr(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+ZZ]) < rl2);
 +                }
 +            }
 +            *ndistc += NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
 +        }
 +        if (!InRange)
 +        {
 +            cjf++;
 +        }
 +    }
 +    if (!InRange)
 +    {
 +        return;
 +    }
 +
 +    InRange = FALSE;
 +    while (!InRange && cjl > cjf)
 +    {
 +        d2       = subc_bb_dist2(0, bb_ci, cjl, gridj->bb);
 +        *ndistc += 2;
 +
 +        /* Check if the distance is within the distance where
 +         * we use only the bounding box distance rbb,
 +         * or within the cut-off and there is at least one atom pair
 +         * within the cut-off.
 +         */
 +        if (d2 < rbb2)
 +        {
 +            InRange = TRUE;
 +        }
 +        else if (d2 < rl2)
 +        {
 +            int i, j;
 +
 +            cjl_gl = gridj->cell0 + cjl;
 +            for (i = 0; i < NBNXN_CPU_CLUSTER_I_SIZE && !InRange; i++)
 +            {
 +                for (j = 0; j < NBNXN_CPU_CLUSTER_I_SIZE; j++)
 +                {
 +                    InRange = InRange ||
 +                        (sqr(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+XX]) +
 +                         sqr(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+YY]) +
 +                         sqr(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+ZZ]) < rl2);
 +                }
 +            }
 +            *ndistc += NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
 +        }
 +        if (!InRange)
 +        {
 +            cjl--;
 +        }
 +    }
 +
 +    if (cjf <= cjl)
 +    {
 +        for (cj = cjf; cj <= cjl; cj++)
 +        {
 +            /* Store cj and the interaction mask */
 +            nbl->cj[nbl->ncj].cj   = gridj->cell0 + cj;
 +            nbl->cj[nbl->ncj].excl = get_imask(remove_sub_diag, ci, cj);
 +            nbl->ncj++;
 +        }
 +        /* Increase the closing index in i super-cell list */
 +        nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
 +    }
 +}
 +
 +#ifdef GMX_NBNXN_SIMD_4XN
 +#include "nbnxn_search_simd_4xn.h"
 +#endif
 +#ifdef GMX_NBNXN_SIMD_2XNN
 +#include "nbnxn_search_simd_2xnn.h"
 +#endif
 +
 +/* Plain C or SSE code for making a pair list of super-cell sci vs scj.
 + * Checks bounding box distances and possibly atom pair distances.
 + */
 +static void make_cluster_list_supersub(const nbnxn_grid_t *gridi,
 +                                       const nbnxn_grid_t *gridj,
 +                                       nbnxn_pairlist_t *nbl,
 +                                       int sci, int scj,
 +                                       gmx_bool sci_equals_scj,
 +                                       int stride, const real *x,
 +                                       real rl2, float rbb2,
 +                                       int *ndistc)
 +{
 +    int          na_c;
 +    int          npair;
 +    int          cjo, ci1, ci, cj, cj_gl;
 +    int          cj4_ind, cj_offset;
 +    unsigned     imask;
 +    nbnxn_cj4_t *cj4;
-     bb_ci = nbl->work->bb_ci;
-     x_ci  = nbl->work->x_ci;
++#ifdef NBNXN_BBXXXX
++    const float      *pbb_ci;
++#else
++    const nbnxn_bb_t *bb_ci;
++#endif
 +    const real  *x_ci;
 +    float       *d2l, d2;
 +    int          w;
 +#define PRUNE_LIST_CPU_ONE
 +#ifdef PRUNE_LIST_CPU_ONE
 +    int  ci_last = -1;
 +#endif
 +
 +    d2l = nbl->work->d2;
 +
-         subc_bb_dist2_sse_xxxx(gridj->bb+(cj>>STRIDE_PBB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_PBB-1)),
-                                ci1, bb_ci, d2l);
++#ifdef NBNXN_BBXXXX
++    pbb_ci = nbl->work->pbb_ci;
++#else
++    bb_ci  = nbl->work->bb_ci;
++#endif
++    x_ci   = nbl->work->x_ci;
 +
 +    na_c = gridj->na_c;
 +
 +    for (cjo = 0; cjo < gridj->nsubc[scj]; cjo++)
 +    {
 +        cj4_ind   = (nbl->work->cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG);
 +        cj_offset = nbl->work->cj_ind - cj4_ind*NBNXN_GPU_JGROUP_SIZE;
 +        cj4       = &nbl->cj4[cj4_ind];
 +
 +        cj = scj*GPU_NSUBCELL + cjo;
 +
 +        cj_gl = gridj->cell0*GPU_NSUBCELL + cj;
 +
 +        /* Initialize this j-subcell i-subcell list */
 +        cj4->cj[cj_offset] = cj_gl;
 +        imask              = 0;
 +
 +        if (sci_equals_scj)
 +        {
 +            ci1 = cjo + 1;
 +        }
 +        else
 +        {
 +            ci1 = gridi->nsubc[sci];
 +        }
 +
 +#ifdef NBNXN_BBXXXX
 +        /* Determine all ci1 bb distances in one call with SSE */
- static void set_icell_bb_simple(const float *bb, int ci,
-                                 real shx, real shy, real shz,
-                                 float *bb_ci)
++        subc_bb_dist2_sse_xxxx(gridj->pbb+(cj>>STRIDE_PBB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_PBB-1)),
++                               ci1, pbb_ci, d2l);
 +        *ndistc += na_c*2;
 +#endif
 +
 +        npair = 0;
 +        /* We use a fixed upper-bound instead of ci1 to help optimization */
 +        for (ci = 0; ci < GPU_NSUBCELL; ci++)
 +        {
 +            if (ci == ci1)
 +            {
 +                break;
 +            }
 +
 +#ifndef NBNXN_BBXXXX
 +            /* Determine the bb distance between ci and cj */
 +            d2l[ci]  = subc_bb_dist2(ci, bb_ci, cj, gridj->bb);
 +            *ndistc += 2;
 +#endif
 +            d2 = d2l[ci];
 +
 +#ifdef PRUNE_LIST_CPU_ALL
 +            /* Check if the distance is within the distance where
 +             * we use only the bounding box distance rbb,
 +             * or within the cut-off and there is at least one atom pair
 +             * within the cut-off. This check is very costly.
 +             */
 +            *ndistc += na_c*na_c;
 +            if (d2 < rbb2 ||
 +                (d2 < rl2 &&
 +#ifdef NBNXN_PBB_SSE
 +                 subc_in_range_sse8
 +#else
 +                 subc_in_range_x
 +#endif
 +                     (na_c, ci, x_ci, cj_gl, stride, x, rl2)))
 +#else
 +            /* Check if the distance between the two bounding boxes
 +             * is within the pair-list cut-off.
 +             */
 +            if (d2 < rl2)
 +#endif
 +            {
 +                /* Flag this i-subcell to be taken into account */
 +                imask |= (1U << (cj_offset*GPU_NSUBCELL+ci));
 +
 +#ifdef PRUNE_LIST_CPU_ONE
 +                ci_last = ci;
 +#endif
 +
 +                npair++;
 +            }
 +        }
 +
 +#ifdef PRUNE_LIST_CPU_ONE
 +        /* If we only found 1 pair, check if any atoms are actually
 +         * within the cut-off, so we could get rid of it.
 +         */
 +        if (npair == 1 && d2l[ci_last] >= rbb2)
 +        {
 +            /* Avoid using function pointers here, as it's slower */
 +            if (
 +#ifdef NBNXN_PBB_SSE
 +                !subc_in_range_sse8
 +#else
 +                !subc_in_range_x
 +#endif
 +                    (na_c, ci_last, x_ci, cj_gl, stride, x, rl2))
 +            {
 +                imask &= ~(1U << (cj_offset*GPU_NSUBCELL+ci_last));
 +                npair--;
 +            }
 +        }
 +#endif
 +
 +        if (npair > 0)
 +        {
 +            /* We have a useful sj entry, close it now */
 +
 +            /* Set the exclusions for the ci == sj entry.
 +             * Here we don't bother to check if this entry is actually flagged,
 +             * as it will nearly always be in the list.
 +             */
 +            if (sci_equals_scj)
 +            {
 +                set_self_and_newton_excls_supersub(nbl, cj4_ind, cj_offset, cjo);
 +            }
 +
 +            /* Copy the cluster interaction mask to the list */
 +            for (w = 0; w < NWARP; w++)
 +            {
 +                cj4->imei[w].imask |= imask;
 +            }
 +
 +            nbl->work->cj_ind++;
 +
 +            /* Keep the count */
 +            nbl->nci_tot += npair;
 +
 +            /* Increase the closing index in i super-cell list */
 +            nbl->sci[nbl->nsci].cj4_ind_end =
 +                ((nbl->work->cj_ind+NBNXN_GPU_JGROUP_SIZE-1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
 +        }
 +    }
 +}
 +
 +/* Set all atom-pair exclusions from the topology stored in excl
 + * as masks in the pair-list for simple list i-entry nbl_ci
 + */
 +static void set_ci_top_excls(const nbnxn_search_t nbs,
 +                             nbnxn_pairlist_t    *nbl,
 +                             gmx_bool             diagRemoved,
 +                             int                  na_ci_2log,
 +                             int                  na_cj_2log,
 +                             const nbnxn_ci_t    *nbl_ci,
 +                             const t_blocka      *excl)
 +{
 +    const int    *cell;
 +    int           ci;
 +    int           cj_ind_first, cj_ind_last;
 +    int           cj_first, cj_last;
 +    int           ndirect;
 +    int           i, ai, aj, si, eind, ge, se;
 +    int           found, cj_ind_0, cj_ind_1, cj_ind_m;
 +    int           cj_m;
 +    gmx_bool      Found_si;
 +    int           si_ind;
 +    nbnxn_excl_t *nbl_excl;
 +    int           inner_i, inner_e;
 +
 +    cell = nbs->cell;
 +
 +    if (nbl_ci->cj_ind_end == nbl_ci->cj_ind_start)
 +    {
 +        /* Empty list */
 +        return;
 +    }
 +
 +    ci = nbl_ci->ci;
 +
 +    cj_ind_first = nbl_ci->cj_ind_start;
 +    cj_ind_last  = nbl->ncj - 1;
 +
 +    cj_first = nbl->cj[cj_ind_first].cj;
 +    cj_last  = nbl->cj[cj_ind_last].cj;
 +
 +    /* Determine how many contiguous j-cells we have starting
 +     * from the first i-cell. This number can be used to directly
 +     * calculate j-cell indices for excluded atoms.
 +     */
 +    ndirect = 0;
 +    if (na_ci_2log == na_cj_2log)
 +    {
 +        while (cj_ind_first + ndirect <= cj_ind_last &&
 +               nbl->cj[cj_ind_first+ndirect].cj == ci + ndirect)
 +        {
 +            ndirect++;
 +        }
 +    }
 +#ifdef NBNXN_SEARCH_BB_SSE
 +    else
 +    {
 +        while (cj_ind_first + ndirect <= cj_ind_last &&
 +               nbl->cj[cj_ind_first+ndirect].cj == ci_to_cj(na_cj_2log, ci) + ndirect)
 +        {
 +            ndirect++;
 +        }
 +    }
 +#endif
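 +
 +    /* Illustrative example, not part of the original change: if the list
 +     * starts with j-cells ci, ci+1, ci+2, ..., ndirect counts that run,
 +     * so an excluded atom whose j-cell se lies below cj_first + ndirect
 +     * is located at cj_ind_first + (se - cj_first), avoiding the
 +     * bisection search used below for the remaining entries.
 +     */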
 +
 +    /* Loop over the atoms in the i super-cell */
 +    for (i = 0; i < nbl->na_sc; i++)
 +    {
 +        ai = nbs->a[ci*nbl->na_sc+i];
 +        if (ai >= 0)
 +        {
 +            si  = (i>>na_ci_2log);
 +
 +            /* Loop over the topology-based exclusions for this i-atom */
 +            for (eind = excl->index[ai]; eind < excl->index[ai+1]; eind++)
 +            {
 +                aj = excl->a[eind];
 +
 +                if (aj == ai)
 +                {
 +                    /* The self exclusions are already set, save some time */
 +                    continue;
 +                }
 +
 +                ge = cell[aj];
 +
 +                /* Without shifts we only calculate interactions j>i
 +                 * for one-way pair-lists.
 +                 */
 +                if (diagRemoved && ge <= ci*nbl->na_sc + i)
 +                {
 +                    continue;
 +                }
 +
 +                se = (ge >> na_cj_2log);
 +
 +                /* Could the cluster se be in our list? */
 +                if (se >= cj_first && se <= cj_last)
 +                {
 +                    if (se < cj_first + ndirect)
 +                    {
 +                        /* We can calculate cj_ind directly from se */
 +                        found = cj_ind_first + se - cj_first;
 +                    }
 +                    else
 +                    {
 +                        /* Search for se using bisection */
 +                        found    = -1;
 +                        cj_ind_0 = cj_ind_first + ndirect;
 +                        cj_ind_1 = cj_ind_last + 1;
 +                        while (found == -1 && cj_ind_0 < cj_ind_1)
 +                        {
 +                            cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;
 +
 +                            cj_m = nbl->cj[cj_ind_m].cj;
 +
 +                            if (se == cj_m)
 +                            {
 +                                found = cj_ind_m;
 +                            }
 +                            else if (se < cj_m)
 +                            {
 +                                cj_ind_1 = cj_ind_m;
 +                            }
 +                            else
 +                            {
 +                                cj_ind_0 = cj_ind_m + 1;
 +                            }
 +                        }
 +                    }
 +
 +                    if (found >= 0)
 +                    {
 +                        inner_i = i  - (si << na_ci_2log);
 +                        inner_e = ge - (se << na_cj_2log);
 +
 +                        nbl->cj[found].excl &= ~(1U<<((inner_i<<na_cj_2log) + inner_e));
 +                    }
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +/* Set all atom-pair exclusions from the topology stored in excl
 + * as masks in the pair-list for i-super-cell entry nbl_sci
 + */
 +static void set_sci_top_excls(const nbnxn_search_t nbs,
 +                              nbnxn_pairlist_t    *nbl,
 +                              gmx_bool             diagRemoved,
 +                              int                  na_c_2log,
 +                              const nbnxn_sci_t   *nbl_sci,
 +                              const t_blocka      *excl)
 +{
 +    const int    *cell;
 +    int           na_c;
 +    int           sci;
 +    int           cj_ind_first, cj_ind_last;
 +    int           cj_first, cj_last;
 +    int           ndirect;
 +    int           i, ai, aj, si, eind, ge, se;
 +    int           found, cj_ind_0, cj_ind_1, cj_ind_m;
 +    int           cj_m;
 +    gmx_bool      Found_si;
 +    int           si_ind;
 +    nbnxn_excl_t *nbl_excl;
 +    int           inner_i, inner_e, w;
 +
 +    cell = nbs->cell;
 +
 +    na_c = nbl->na_ci;
 +
 +    if (nbl_sci->cj4_ind_end == nbl_sci->cj4_ind_start)
 +    {
 +        /* Empty list */
 +        return;
 +    }
 +
 +    sci = nbl_sci->sci;
 +
 +    cj_ind_first = nbl_sci->cj4_ind_start*NBNXN_GPU_JGROUP_SIZE;
 +    cj_ind_last  = nbl->work->cj_ind - 1;
 +
 +    cj_first = nbl->cj4[nbl_sci->cj4_ind_start].cj[0];
 +    cj_last  = nbl_cj(nbl, cj_ind_last);
 +
 +    /* Determine how many contiguous j-clusters we have starting
 +     * from the first i-cluster. This number can be used to directly
 +     * calculate j-cluster indices for excluded atoms.
 +     */
 +    ndirect = 0;
 +    while (cj_ind_first + ndirect <= cj_ind_last &&
 +           nbl_cj(nbl, cj_ind_first+ndirect) == sci*GPU_NSUBCELL + ndirect)
 +    {
 +        ndirect++;
 +    }
 +
 +    /* Loop over the atoms in the i super-cell */
 +    for (i = 0; i < nbl->na_sc; i++)
 +    {
 +        ai = nbs->a[sci*nbl->na_sc+i];
 +        if (ai >= 0)
 +        {
 +            si  = (i>>na_c_2log);
 +
 +            /* Loop over the topology-based exclusions for this i-atom */
 +            for (eind = excl->index[ai]; eind < excl->index[ai+1]; eind++)
 +            {
 +                aj = excl->a[eind];
 +
 +                if (aj == ai)
 +                {
 +                    /* The self exclusions are already set, save some time */
 +                    continue;
 +                }
 +
 +                ge = cell[aj];
 +
 +                /* Without shifts we only calculate interactions j>i
 +                 * for one-way pair-lists.
 +                 */
 +                if (diagRemoved && ge <= sci*nbl->na_sc + i)
 +                {
 +                    continue;
 +                }
 +
 +                se = ge>>na_c_2log;
 +                /* Could the cluster se be in our list? */
 +                if (se >= cj_first && se <= cj_last)
 +                {
 +                    if (se < cj_first + ndirect)
 +                    {
 +                        /* We can calculate cj_ind directly from se */
 +                        found = cj_ind_first + se - cj_first;
 +                    }
 +                    else
 +                    {
 +                        /* Search for se using bisection */
 +                        found    = -1;
 +                        cj_ind_0 = cj_ind_first + ndirect;
 +                        cj_ind_1 = cj_ind_last + 1;
 +                        while (found == -1 && cj_ind_0 < cj_ind_1)
 +                        {
 +                            cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;
 +
 +                            cj_m = nbl_cj(nbl, cj_ind_m);
 +
 +                            if (se == cj_m)
 +                            {
 +                                found = cj_ind_m;
 +                            }
 +                            else if (se < cj_m)
 +                            {
 +                                cj_ind_1 = cj_ind_m;
 +                            }
 +                            else
 +                            {
 +                                cj_ind_0 = cj_ind_m + 1;
 +                            }
 +                        }
 +                    }
 +
 +                    if (found >= 0)
 +                    {
 +                        inner_i = i  - si*na_c;
 +                        inner_e = ge - se*na_c;
 +
 +/* Macro for getting the position of a j-cluster within its cj4 group */
 +#define AMODCJ4(a)  ((a) & (NBNXN_GPU_JGROUP_SIZE - 1))
 +/* Macro for converting a j-cluster list index to a cj4 group index */
 +#define A2CJ4(a)    ((a) >> NBNXN_GPU_JGROUP_SIZE_2LOG)
 +/* Macro for getting the index of an atom within a warp-sized part of a cluster */
 +#define AMODWI(a)   ((a) & (NBNXN_GPU_CLUSTER_SIZE/2 - 1))
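 +
 +/* Worked example, illustrative only: with NBNXN_GPU_JGROUP_SIZE = 4, a
 + * j-cluster list index found = 13 gives A2CJ4(13) = 3 (its cj4 group)
 + * and AMODCJ4(13) = 1 (its slot within that group); with
 + * NBNXN_GPU_CLUSTER_SIZE = 8, AMODWI(inner_e) = inner_e & 3 picks the
 + * atom index within a 4-atom warp part.
 + */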
 +
 +                        if (nbl_imask0(nbl, found) & (1U << (AMODCJ4(found)*GPU_NSUBCELL + si)))
 +                        {
 +                            w       = (inner_e >> 2);
 +
 +                            get_nbl_exclusions_1(nbl, A2CJ4(found), w, &nbl_excl);
 +
 +                            nbl_excl->pair[AMODWI(inner_e)*nbl->na_ci+inner_i] &=
 +                                ~(1U << (AMODCJ4(found)*GPU_NSUBCELL + si));
 +                        }
 +
 +#undef AMODCJ4
 +#undef A2CJ4
 +#undef AMODWI
 +                    }
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +/* Reallocate the simple ci list for at least n entries */
 +static void nb_realloc_ci(nbnxn_pairlist_t *nbl, int n)
 +{
 +    nbl->ci_nalloc = over_alloc_small(n);
 +    nbnxn_realloc_void((void **)&nbl->ci,
 +                       nbl->nci*sizeof(*nbl->ci),
 +                       nbl->ci_nalloc*sizeof(*nbl->ci),
 +                       nbl->alloc, nbl->free);
 +}
 +
 +/* Reallocate the super-cell sci list for at least n entries */
 +static void nb_realloc_sci(nbnxn_pairlist_t *nbl, int n)
 +{
 +    nbl->sci_nalloc = over_alloc_small(n);
 +    nbnxn_realloc_void((void **)&nbl->sci,
 +                       nbl->nsci*sizeof(*nbl->sci),
 +                       nbl->sci_nalloc*sizeof(*nbl->sci),
 +                       nbl->alloc, nbl->free);
 +}
 +
 +/* Make a new ci entry at index nbl->nci */
 +static void new_ci_entry(nbnxn_pairlist_t *nbl, int ci, int shift, int flags)
 +{
 +    if (nbl->nci + 1 > nbl->ci_nalloc)
 +    {
 +        nb_realloc_ci(nbl, nbl->nci+1);
 +    }
 +    nbl->ci[nbl->nci].ci            = ci;
 +    nbl->ci[nbl->nci].shift         = shift;
 +    /* Store the interaction flags along with the shift */
 +    nbl->ci[nbl->nci].shift        |= flags;
 +    nbl->ci[nbl->nci].cj_ind_start  = nbl->ncj;
 +    nbl->ci[nbl->nci].cj_ind_end    = nbl->ncj;
 +}
 +
 +/* Make a new sci entry at index nbl->nsci */
 +static void new_sci_entry(nbnxn_pairlist_t *nbl, int sci, int shift)
 +{
 +    if (nbl->nsci + 1 > nbl->sci_nalloc)
 +    {
 +        nb_realloc_sci(nbl, nbl->nsci+1);
 +    }
 +    nbl->sci[nbl->nsci].sci           = sci;
 +    nbl->sci[nbl->nsci].shift         = shift;
 +    nbl->sci[nbl->nsci].cj4_ind_start = nbl->ncj4;
 +    nbl->sci[nbl->nsci].cj4_ind_end   = nbl->ncj4;
 +}
 +
 +/* Sort the simple j-list cj on exclusions.
 + * Entries with exclusions will all be sorted to the beginning of the list.
 + */
 +static void sort_cj_excl(nbnxn_cj_t *cj, int ncj,
 +                         nbnxn_list_work_t *work)
 +{
 +    int jnew, j;
 +
 +    if (ncj > work->cj_nalloc)
 +    {
 +        work->cj_nalloc = over_alloc_large(ncj);
 +        srenew(work->cj, work->cj_nalloc);
 +    }
 +
 +    /* Make a list of the j-cells involving exclusions */
 +    jnew = 0;
 +    for (j = 0; j < ncj; j++)
 +    {
 +        if (cj[j].excl != NBNXN_INTERACTION_MASK_ALL)
 +        {
 +            work->cj[jnew++] = cj[j];
 +        }
 +    }
 +    /* Reorder only when there are exclusions and they are not confined to the first entry */
 +    if (!((jnew == 0) ||
 +          (jnew == 1 && cj[0].excl != NBNXN_INTERACTION_MASK_ALL)))
 +    {
 +        for (j = 0; j < ncj; j++)
 +        {
 +            if (cj[j].excl == NBNXN_INTERACTION_MASK_ALL)
 +            {
 +                work->cj[jnew++] = cj[j];
 +            }
 +        }
 +        for (j = 0; j < ncj; j++)
 +        {
 +            cj[j] = work->cj[j];
 +        }
 +    }
 +}
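 +
 +/* Illustrative sketch, not part of the original change: for four entries
 + * with exclusion masks [ALL, E1, ALL, E2], the passes above first gather
 + * the entries carrying exclusions and then append the mask-ALL ones,
 + * giving [E1, E2, ALL, ALL], so kernels meet all exclusion-carrying
 + * entries in one contiguous block at the start of the list.
 + */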
 +
 +/* Close this simple list i entry */
 +static void close_ci_entry_simple(nbnxn_pairlist_t *nbl)
 +{
 +    int jlen;
 +
 +    /* All content of the new ci entry has already been filled correctly,
 +     * we only need to increase the count here (for non-empty lists).
 +     */
 +    jlen = nbl->ci[nbl->nci].cj_ind_end - nbl->ci[nbl->nci].cj_ind_start;
 +    if (jlen > 0)
 +    {
 +        sort_cj_excl(nbl->cj+nbl->ci[nbl->nci].cj_ind_start, jlen, nbl->work);
 +
 +        /* The counts below are used for non-bonded pair/flop counts
 +         * and should therefore match the available kernel setups.
 +         */
 +        if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
 +        {
 +            nbl->work->ncj_noq += jlen;
 +        }
 +        else if ((nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0)) ||
 +                 !(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_LJ(0)))
 +        {
 +            nbl->work->ncj_hlj += jlen;
 +        }
 +
 +        nbl->nci++;
 +    }
 +}
 +
 +/* Split sci entry for load balancing on the GPU.
 + * Splitting ensures we have enough lists to fully utilize the whole GPU.
 + * With progBal we generate progressively smaller lists, which improves
 + * load balancing. As we only know the current count on our own thread,
 + * we will need to estimate the current total amount of i-entries.
 + * As the lists get concatenated later, this estimate depends
 + * both on nthread and our own thread index.
 + */
 +static void split_sci_entry(nbnxn_pairlist_t *nbl,
 +                            int nsp_max_av, gmx_bool progBal, int nc_bal,
 +                            int thread, int nthread)
 +{
 +    int nsci_est;
 +    int nsp_max;
 +    int cj4_start, cj4_end, j4len, cj4;
 +    int sci;
 +    int nsp, nsp_sci, nsp_cj4, nsp_cj4_e, nsp_cj4_p;
 +    int p;
 +
 +    if (progBal)
 +    {
 +        /* Estimate the total numbers of ci's of the nblist combined
 +         * over all threads using the target number of ci's.
 +         */
 +        nsci_est = nc_bal*thread/nthread + nbl->nsci;
 +
 +        /* The first ci blocks should be larger, to avoid overhead.
 +         * The last ci blocks should be smaller, to improve load balancing.
 +         */
 +        nsp_max = max(1,
 +                      nsp_max_av*nc_bal*3/(2*(nsci_est - 1 + nc_bal)));
 +    }
 +    else
 +    {
 +        nsp_max = nsp_max_av;
 +    }
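 +
 +    /* Worked example with illustrative numbers, not from the original
 +     * change: with progBal set, nsp_max_av = 64 and nc_bal = 100, the
 +     * factor 3*nc_bal/(2*(nsci_est - 1 + nc_bal)) is ~1.5 while nsci_est
 +     * is still small and ~0.5 once nsci_est approaches 2*nc_bal, so the
 +     * generated lists shrink from ~96 to ~32 pairs as the search
 +     * proceeds.
 +     */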
 +
 +    cj4_start = nbl->sci[nbl->nsci-1].cj4_ind_start;
 +    cj4_end   = nbl->sci[nbl->nsci-1].cj4_ind_end;
 +    j4len     = cj4_end - cj4_start;
 +
 +    if (j4len > 1 && j4len*GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE > nsp_max)
 +    {
 +        /* Remove the last ci entry and process the cj4's again */
 +        nbl->nsci -= 1;
 +
 +        sci        = nbl->nsci;
 +        nsp        = 0;
 +        nsp_sci    = 0;
 +        nsp_cj4_e  = 0;
 +        nsp_cj4    = 0;
 +        for (cj4 = cj4_start; cj4 < cj4_end; cj4++)
 +        {
 +            nsp_cj4_p = nsp_cj4;
 +            /* Count the number of cluster pairs in this cj4 group */
 +            nsp_cj4   = 0;
 +            for (p = 0; p < GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE; p++)
 +            {
 +                nsp_cj4 += (nbl->cj4[cj4].imei[0].imask >> p) & 1;
 +            }
 +
 +            if (nsp_cj4 > 0 && nsp + nsp_cj4 > nsp_max)
 +            {
 +                /* Split the list at cj4 */
 +                nbl->sci[sci].cj4_ind_end = cj4;
 +                /* Create a new sci entry */
 +                sci++;
 +                nbl->nsci++;
 +                if (nbl->nsci+1 > nbl->sci_nalloc)
 +                {
 +                    nb_realloc_sci(nbl, nbl->nsci+1);
 +                }
 +                nbl->sci[sci].sci           = nbl->sci[nbl->nsci-1].sci;
 +                nbl->sci[sci].shift         = nbl->sci[nbl->nsci-1].shift;
 +                nbl->sci[sci].cj4_ind_start = cj4;
 +                nsp_sci                     = nsp;
 +                nsp_cj4_e                   = nsp_cj4_p;
 +                nsp                         = 0;
 +            }
 +            nsp += nsp_cj4;
 +        }
 +
 +        /* Put the remaining cj4's in the last sci entry */
 +        nbl->sci[sci].cj4_ind_end = cj4_end;
 +
 +        /* Possibly balance out the last two sci's
 +         * by moving the last cj4 of the second last sci.
 +         */
 +        if (nsp_sci - nsp_cj4_e >= nsp + nsp_cj4_e)
 +        {
 +            nbl->sci[sci-1].cj4_ind_end--;
 +            nbl->sci[sci].cj4_ind_start--;
 +        }
 +
 +        nbl->nsci++;
 +    }
 +}
 +
 +/* Close this super/sub list i entry */
 +static void close_ci_entry_supersub(nbnxn_pairlist_t *nbl,
 +                                    int nsp_max_av,
 +                                    gmx_bool progBal, int nc_bal,
 +                                    int thread, int nthread)
 +{
 +    int j4len, tlen;
 +    int nb, b;
 +
 +    /* All content of the new ci entry has already been filled correctly,
 +     * we only need to increase the count here (for non-empty lists).
 +     */
 +    j4len = nbl->sci[nbl->nsci].cj4_ind_end - nbl->sci[nbl->nsci].cj4_ind_start;
 +    if (j4len > 0)
 +    {
 +        /* We can only have complete blocks of 4 j-entries in a list,
 +         * so round the count up before closing.
 +         */
 +        nbl->ncj4         = ((nbl->work->cj_ind + NBNXN_GPU_JGROUP_SIZE - 1) >> NBNXN_GPU_JGROUP_SIZE_2LOG);
 +        nbl->work->cj_ind = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
 +
 +        nbl->nsci++;
 +
 +        if (nsp_max_av > 0)
 +        {
 +            /* Measure the size of the new entry and potentially split it */
 +            split_sci_entry(nbl, nsp_max_av, progBal, nc_bal, thread, nthread);
 +        }
 +    }
 +}
 +
 +/* Syncs the working array before adding another grid pair to the list */
 +static void sync_work(nbnxn_pairlist_t *nbl)
 +{
 +    if (!nbl->bSimple)
 +    {
 +        nbl->work->cj_ind   = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
 +        nbl->work->cj4_init = nbl->ncj4;
 +    }
 +}
 +
 +/* Clears an nbnxn_pairlist_t data structure */
 +static void clear_pairlist(nbnxn_pairlist_t *nbl)
 +{
 +    nbl->nci           = 0;
 +    nbl->nsci          = 0;
 +    nbl->ncj           = 0;
 +    nbl->ncj4          = 0;
 +    nbl->nci_tot       = 0;
 +    nbl->nexcl         = 1;
 +
 +    nbl->work->ncj_noq = 0;
 +    nbl->work->ncj_hlj = 0;
 +}
 +
 +/* Sets a simple list i-cell bounding box, including PBC shift */
-     int ia;
-     ia           = ci*NNBSBB_B;
-     bb_ci[BBL_X] = bb[ia+BBL_X] + shx;
-     bb_ci[BBL_Y] = bb[ia+BBL_Y] + shy;
-     bb_ci[BBL_Z] = bb[ia+BBL_Z] + shz;
-     bb_ci[BBU_X] = bb[ia+BBU_X] + shx;
-     bb_ci[BBU_Y] = bb[ia+BBU_Y] + shy;
-     bb_ci[BBU_Z] = bb[ia+BBU_Z] + shz;
++static gmx_inline void set_icell_bb_simple(const nbnxn_bb_t *bb, int ci,
++                                           real shx, real shy, real shz,
++                                           nbnxn_bb_t *bb_ci)
 +{
- static void set_icell_bb_supersub(const float *bb, int ci,
-                                   real shx, real shy, real shz,
-                                   float *bb_ci)
++    bb_ci->lower[BB_X] = bb[ci].lower[BB_X] + shx;
++    bb_ci->lower[BB_Y] = bb[ci].lower[BB_Y] + shy;
++    bb_ci->lower[BB_Z] = bb[ci].lower[BB_Z] + shz;
++    bb_ci->upper[BB_X] = bb[ci].upper[BB_X] + shx;
++    bb_ci->upper[BB_Y] = bb[ci].upper[BB_Y] + shy;
++    bb_ci->upper[BB_Z] = bb[ci].upper[BB_Z] + shz;
 +}
 +
++#ifdef NBNXN_BBXXXX
 +/* Sets the packed sub-cell bounding boxes of a super-cell, including the PBC shift */
- #ifdef NBNXN_BBXXXX
++static void set_icell_bbxxxx_supersub(const float *bb, int ci,
++                                      real shx, real shy, real shz,
++                                      float *bb_ci)
 +{
 +    int ia, m, i;
 +
- #else
-     ia = ci*GPU_NSUBCELL*NNBSBB_B;
-     for (i = 0; i < GPU_NSUBCELL*NNBSBB_B; i += NNBSBB_B)
 +    ia = ci*(GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX;
 +    for (m = 0; m < (GPU_NSUBCELL>>STRIDE_PBB_2LOG)*NNBSBB_XXXX; m += NNBSBB_XXXX)
 +    {
 +        for (i = 0; i < STRIDE_PBB; i++)
 +        {
 +            bb_ci[m+0*STRIDE_PBB+i] = bb[ia+m+0*STRIDE_PBB+i] + shx;
 +            bb_ci[m+1*STRIDE_PBB+i] = bb[ia+m+1*STRIDE_PBB+i] + shy;
 +            bb_ci[m+2*STRIDE_PBB+i] = bb[ia+m+2*STRIDE_PBB+i] + shz;
 +            bb_ci[m+3*STRIDE_PBB+i] = bb[ia+m+3*STRIDE_PBB+i] + shx;
 +            bb_ci[m+4*STRIDE_PBB+i] = bb[ia+m+4*STRIDE_PBB+i] + shy;
 +            bb_ci[m+5*STRIDE_PBB+i] = bb[ia+m+5*STRIDE_PBB+i] + shz;
 +        }
 +    }
-         bb_ci[i+BBL_X] = bb[ia+i+BBL_X] + shx;
-         bb_ci[i+BBL_Y] = bb[ia+i+BBL_Y] + shy;
-         bb_ci[i+BBL_Z] = bb[ia+i+BBL_Z] + shz;
-         bb_ci[i+BBU_X] = bb[ia+i+BBU_X] + shx;
-         bb_ci[i+BBU_Y] = bb[ia+i+BBU_Y] + shy;
-         bb_ci[i+BBU_Z] = bb[ia+i+BBU_Z] + shz;
++}
++#endif
++
++/* Sets the sub-cell bounding boxes of a super-cell, including the PBC shift */
++static void set_icell_bb_supersub(const nbnxn_bb_t *bb, int ci,
++                                  real shx, real shy, real shz,
++                                  nbnxn_bb_t *bb_ci)
++{
++    int i;
++
++    for (i = 0; i < GPU_NSUBCELL; i++)
 +    {
- #endif
++        set_icell_bb_simple(bb, ci*GPU_NSUBCELL+i,
++                            shx, shy, shz,
++                            &bb_ci[i]);
 +    }
-     const float *bb_i, *bbcz_i, *bbcz_j;
 +}
 +
 +/* Copies PBC shifted i-cell atom coordinates x,y,z to working array */
 +static void icell_set_x_simple(int ci,
 +                               real shx, real shy, real shz,
 +                               int gmx_unused na_c,
 +                               int stride, const real *x,
 +                               nbnxn_list_work_t *work)
 +{
 +    int  ia, i;
 +
 +    ia = ci*NBNXN_CPU_CLUSTER_I_SIZE;
 +
 +    for (i = 0; i < NBNXN_CPU_CLUSTER_I_SIZE; i++)
 +    {
 +        work->x_ci[i*STRIDE_XYZ+XX] = x[(ia+i)*stride+XX] + shx;
 +        work->x_ci[i*STRIDE_XYZ+YY] = x[(ia+i)*stride+YY] + shy;
 +        work->x_ci[i*STRIDE_XYZ+ZZ] = x[(ia+i)*stride+ZZ] + shz;
 +    }
 +}
 +
 +/* Copies PBC shifted super-cell atom coordinates x,y,z to working array */
 +static void icell_set_x_supersub(int ci,
 +                                 real shx, real shy, real shz,
 +                                 int na_c,
 +                                 int stride, const real *x,
 +                                 nbnxn_list_work_t *work)
 +{
 +    int  ia, i;
 +    real *x_ci;
 +
 +    x_ci = work->x_ci;
 +
 +    ia = ci*GPU_NSUBCELL*na_c;
 +    for (i = 0; i < GPU_NSUBCELL*na_c; i++)
 +    {
 +        x_ci[i*DIM + XX] = x[(ia+i)*stride + XX] + shx;
 +        x_ci[i*DIM + YY] = x[(ia+i)*stride + YY] + shy;
 +        x_ci[i*DIM + ZZ] = x[(ia+i)*stride + ZZ] + shz;
 +    }
 +}
 +
 +#ifdef NBNXN_SEARCH_BB_SSE
 +/* Copies PBC shifted super-cell packed atom coordinates to working array */
 +static void icell_set_x_supersub_sse8(int ci,
 +                                      real shx, real shy, real shz,
 +                                      int na_c,
 +                                      int stride, const real *x,
 +                                      nbnxn_list_work_t *work)
 +{
 +    int  si, io, ia, i, j;
 +    real *x_ci;
 +
 +    x_ci = work->x_ci;
 +
 +    for (si = 0; si < GPU_NSUBCELL; si++)
 +    {
 +        for (i = 0; i < na_c; i += STRIDE_PBB)
 +        {
 +            io = si*na_c + i;
 +            ia = ci*GPU_NSUBCELL*na_c + io;
 +            for (j = 0; j < STRIDE_PBB; j++)
 +            {
 +                x_ci[io*DIM + j + XX*STRIDE_PBB] = x[(ia+j)*stride+XX] + shx;
 +                x_ci[io*DIM + j + YY*STRIDE_PBB] = x[(ia+j)*stride+YY] + shy;
 +                x_ci[io*DIM + j + ZZ*STRIDE_PBB] = x[(ia+j)*stride+ZZ] + shz;
 +            }
 +        }
 +    }
 +}
 +#endif
 +
 +static real nbnxn_rlist_inc_nonloc_fac = 0.6;
 +
 +/* Due to the cluster size the effective pair-list is longer than
 + * that of a simple atom pair-list. This function gives the extra distance.
 + */
 +real nbnxn_get_rlist_effective_inc(int cluster_size, real atom_density)
 +{
 +    return ((0.5 + nbnxn_rlist_inc_nonloc_fac)*sqr(((cluster_size) - 1.0)/(cluster_size))*pow((cluster_size)/(atom_density), 1.0/3.0));
 +}
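 +
 +/* Worked example with illustrative numbers, not from the original change:
 + * cluster_size = 4 and atom_density = 100 nm^-3 give
 + * (0.5 + 0.6)*(3/4)^2*(4/100)^(1/3) = 1.1*0.5625*0.342 ~= 0.21 nm of
 + * extra effective pair-list radius.
 + */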
 +
 +/* Estimates the interaction volume^2 for non-local interactions */
 +static real nonlocal_vol2(const gmx_domdec_zones_t *zones, rvec ls, real r)
 +{
 +    int  z, d;
 +    real cl, ca, za;
 +    real vold_est;
 +    real vol2_est_tot;
 +
 +    vol2_est_tot = 0;
 +
 +    /* Here we simply add up the interaction volume^2 of the 1, 2 or 3
 +     * non-home zones that are shifted along a single dimension, as in a
 +     * 1D decomposition. As these volumes are not additive,
 +     * this is an overestimate, but it would only be significant in the limit
 +     * of small cells, where we anyhow need to split the lists into
 +     * as small parts as possible.
 +     */
 +
 +    for (z = 0; z < zones->n; z++)
 +    {
 +        if (zones->shift[z][XX] + zones->shift[z][YY] + zones->shift[z][ZZ] == 1)
 +        {
 +            cl = 0;
 +            ca = 1;
 +            za = 1;
 +            for (d = 0; d < DIM; d++)
 +            {
 +                if (zones->shift[z][d] == 0)
 +                {
 +                    cl += 0.5*ls[d];
 +                    ca *= ls[d];
 +                    za *= zones->size[z].x1[d] - zones->size[z].x0[d];
 +                }
 +            }
 +
 +            /* 4 octants of a sphere */
 +            vold_est  = 0.25*M_PI*r*r*r*r;
 +            /* 4 quarter pie slices on the edges */
 +            vold_est += 4*cl*M_PI/6.0*r*r*r;
 +            /* One rectangular volume on a face */
 +            vold_est += ca*0.5*r*r;
 +
 +            vol2_est_tot += vold_est*za;
 +        }
 +    }
 +
 +    return vol2_est_tot;
 +}
 +
 +/* Estimates the average size of a full j-list for super/sub setup */
 +static int get_nsubpair_max(const nbnxn_search_t nbs,
 +                            int                  iloc,
 +                            real                 rlist,
 +                            int                  min_ci_balanced)
 +{
 +    const nbnxn_grid_t *grid;
 +    rvec ls;
 +    real xy_diag2, r_eff_sup, vol_est, nsp_est, nsp_est_nl;
 +    int  nsubpair_max;
 +
 +    grid = &nbs->grid[0];
 +
 +    ls[XX] = (grid->c1[XX] - grid->c0[XX])/(grid->ncx*GPU_NSUBCELL_X);
 +    ls[YY] = (grid->c1[YY] - grid->c0[YY])/(grid->ncy*GPU_NSUBCELL_Y);
 +    ls[ZZ] = (grid->c1[ZZ] - grid->c0[ZZ])*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z);
 +
 +    /* The average squared length of the diagonal of a sub-cell */
 +    xy_diag2 = ls[XX]*ls[XX] + ls[YY]*ls[YY] + ls[ZZ]*ls[ZZ];
 +
 +    /* The formulas below are a heuristic estimate of the average nsj per si */
 +    r_eff_sup = rlist + nbnxn_rlist_inc_nonloc_fac*sqr((grid->na_c - 1.0)/grid->na_c)*sqrt(xy_diag2/3);
 +
 +    if (!nbs->DomDec || nbs->zones->n == 1)
 +    {
 +        nsp_est_nl = 0;
 +    }
 +    else
 +    {
 +        nsp_est_nl =
 +            sqr(grid->atom_density/grid->na_c)*
 +            nonlocal_vol2(nbs->zones, ls, r_eff_sup);
 +    }
 +
 +    if (LOCAL_I(iloc))
 +    {
 +        /* Sub-cell interacts with itself */
 +        vol_est  = ls[XX]*ls[YY]*ls[ZZ];
 +        /* 6/2 rectangular volume on the faces */
 +        vol_est += (ls[XX]*ls[YY] + ls[XX]*ls[ZZ] + ls[YY]*ls[ZZ])*r_eff_sup;
 +        /* 12/2 quarter pie slices on the edges */
 +        vol_est += 2*(ls[XX] + ls[YY] + ls[ZZ])*0.25*M_PI*sqr(r_eff_sup);
 +        /* 4 octants of a sphere */
 +        vol_est += 0.5*4.0/3.0*M_PI*pow(r_eff_sup, 3);
 +
 +        nsp_est = grid->nsubc_tot*vol_est*grid->atom_density/grid->na_c;
 +
 +        /* Subtract the non-local pair count */
 +        nsp_est -= nsp_est_nl;
 +
 +        if (debug)
 +        {
 +            fprintf(debug, "nsp_est local %5.1f non-local %5.1f\n",
 +                    nsp_est, nsp_est_nl);
 +        }
 +    }
 +    else
 +    {
 +        nsp_est = nsp_est_nl;
 +    }
 +
 +    if (min_ci_balanced <= 0 || grid->nc >= min_ci_balanced || grid->nc == 0)
 +    {
 +        /* We don't need to worry */
 +        nsubpair_max = -1;
 +    }
 +    else
 +    {
 +        /* Thus the (average) maximum j-list size should be as follows */
 +        nsubpair_max = max(1, (int)(nsp_est/min_ci_balanced+0.5));
 +
 +        /* Since the target value is a maximum, not an average (this avoids
 +         * high outliers, which lead to load imbalance), we add half the
 +         * number of pairs in a cj4 block to get the average about right.
 +         */
 +        nsubpair_max += GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE/2;
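 +
 +        /* Illustrative note, not part of the original change: with
 +         * GPU_NSUBCELL = 8 and NBNXN_GPU_JGROUP_SIZE = 4, this adds
 +         * 8*4/2 = 16 pairs to the target.
 +         */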
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "nbl nsp estimate %.1f, nsubpair_max %d\n",
 +                nsp_est, nsubpair_max);
 +    }
 +
 +    return nsubpair_max;
 +}
 +
 +/* Debug list print function */
 +static void print_nblist_ci_cj(FILE *fp, const nbnxn_pairlist_t *nbl)
 +{
 +    int i, j;
 +
 +    for (i = 0; i < nbl->nci; i++)
 +    {
 +        fprintf(fp, "ci %4d  shift %2d  ncj %3d\n",
 +                nbl->ci[i].ci, nbl->ci[i].shift,
 +                nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start);
 +
 +        for (j = nbl->ci[i].cj_ind_start; j < nbl->ci[i].cj_ind_end; j++)
 +        {
 +            fprintf(fp, "  cj %5d  imask %x\n",
 +                    nbl->cj[j].cj,
 +                    nbl->cj[j].excl);
 +        }
 +    }
 +}
 +
 +/* Debug list print function */
 +static void print_nblist_sci_cj(FILE *fp, const nbnxn_pairlist_t *nbl)
 +{
 +    int i, j4, j, ncp, si;
 +
 +    for (i = 0; i < nbl->nsci; i++)
 +    {
 +        fprintf(fp, "ci %4d  shift %2d  ncj4 %2d\n",
 +                nbl->sci[i].sci, nbl->sci[i].shift,
 +                nbl->sci[i].cj4_ind_end - nbl->sci[i].cj4_ind_start);
 +
 +        ncp = 0;
 +        for (j4 = nbl->sci[i].cj4_ind_start; j4 < nbl->sci[i].cj4_ind_end; j4++)
 +        {
 +            for (j = 0; j < NBNXN_GPU_JGROUP_SIZE; j++)
 +            {
 +                fprintf(fp, "  sj %5d  imask %x\n",
 +                        nbl->cj4[j4].cj[j],
 +                        nbl->cj4[j4].imei[0].imask);
 +                for (si = 0; si < GPU_NSUBCELL; si++)
 +                {
 +                    if (nbl->cj4[j4].imei[0].imask & (1U << (j*GPU_NSUBCELL + si)))
 +                    {
 +                        ncp++;
 +                    }
 +                }
 +            }
 +        }
 +        fprintf(fp, "ci %4d  shift %2d  ncj4 %2d ncp %3d\n",
 +                nbl->sci[i].sci, nbl->sci[i].shift,
 +                nbl->sci[i].cj4_ind_end - nbl->sci[i].cj4_ind_start,
 +                ncp);
 +    }
 +}
 +
 +/* Combine pair lists *nbl generated on multiple threads into nblc */
 +static void combine_nblists(int nnbl, nbnxn_pairlist_t **nbl,
 +                            nbnxn_pairlist_t *nblc)
 +{
 +    int nsci, ncj4, nexcl;
 +    int n, i;
 +
 +    if (nblc->bSimple)
 +    {
 +        gmx_incons("combine_nblists does not support simple lists");
 +    }
 +
 +    nsci  = nblc->nsci;
 +    ncj4  = nblc->ncj4;
 +    nexcl = nblc->nexcl;
 +    for (i = 0; i < nnbl; i++)
 +    {
 +        nsci  += nbl[i]->nsci;
 +        ncj4  += nbl[i]->ncj4;
 +        nexcl += nbl[i]->nexcl;
 +    }
 +
 +    if (nsci > nblc->sci_nalloc)
 +    {
 +        nb_realloc_sci(nblc, nsci);
 +    }
 +    if (ncj4 > nblc->cj4_nalloc)
 +    {
 +        nblc->cj4_nalloc = over_alloc_small(ncj4);
 +        nbnxn_realloc_void((void **)&nblc->cj4,
 +                           nblc->ncj4*sizeof(*nblc->cj4),
 +                           nblc->cj4_nalloc*sizeof(*nblc->cj4),
 +                           nblc->alloc, nblc->free);
 +    }
 +    if (nexcl > nblc->excl_nalloc)
 +    {
 +        nblc->excl_nalloc = over_alloc_small(nexcl);
 +        nbnxn_realloc_void((void **)&nblc->excl,
 +                           nblc->nexcl*sizeof(*nblc->excl),
 +                           nblc->excl_nalloc*sizeof(*nblc->excl),
 +                           nblc->alloc, nblc->free);
 +    }
 +
 +    /* Each thread should copy its own data to the combined arrays,
 +     * as otherwise data will go back and forth between different caches.
 +     */
 +#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
 +    for (n = 0; n < nnbl; n++)
 +    {
 +        int sci_offset;
 +        int cj4_offset;
 +        int ci_offset;
 +        int excl_offset;
 +        int i, j4;
 +        const nbnxn_pairlist_t *nbli;
 +
 +        /* Determine the offset in the combined data for our thread */
 +        sci_offset  = nblc->nsci;
 +        cj4_offset  = nblc->ncj4;
 +        ci_offset   = nblc->nci_tot;
 +        excl_offset = nblc->nexcl;
 +
 +        for (i = 0; i < n; i++)
 +        {
 +            sci_offset  += nbl[i]->nsci;
 +            cj4_offset  += nbl[i]->ncj4;
 +            ci_offset   += nbl[i]->nci_tot;
 +            excl_offset += nbl[i]->nexcl;
 +        }
 +
 +        nbli = nbl[n];
 +
 +        for (i = 0; i < nbli->nsci; i++)
 +        {
 +            nblc->sci[sci_offset+i]                = nbli->sci[i];
 +            nblc->sci[sci_offset+i].cj4_ind_start += cj4_offset;
 +            nblc->sci[sci_offset+i].cj4_ind_end   += cj4_offset;
 +        }
 +
 +        for (j4 = 0; j4 < nbli->ncj4; j4++)
 +        {
 +            nblc->cj4[cj4_offset+j4]                   = nbli->cj4[j4];
 +            nblc->cj4[cj4_offset+j4].imei[0].excl_ind += excl_offset;
 +            nblc->cj4[cj4_offset+j4].imei[1].excl_ind += excl_offset;
 +        }
 +
 +        for (j4 = 0; j4 < nbli->nexcl; j4++)
 +        {
 +            nblc->excl[excl_offset+j4] = nbli->excl[j4];
 +        }
 +    }
 +
 +    for (n = 0; n < nnbl; n++)
 +    {
 +        nblc->nsci    += nbl[n]->nsci;
 +        nblc->ncj4    += nbl[n]->ncj4;
 +        nblc->nci_tot += nbl[n]->nci_tot;
 +        nblc->nexcl   += nbl[n]->nexcl;
 +    }
 +}
 +
 +/* Returns the next ci to be processed by our thread */
 +static gmx_bool next_ci(const nbnxn_grid_t *grid,
 +                        int conv,
 +                        int nth, int ci_block,
 +                        int *ci_x, int *ci_y,
 +                        int *ci_b, int *ci)
 +{
 +    (*ci_b)++;
 +    (*ci)++;
 +
 +    if (*ci_b == ci_block)
 +    {
 +        /* Jump to the next block assigned to this task */
 +        *ci   += (nth - 1)*ci_block;
 +        *ci_b  = 0;
 +    }
 +
 +    if (*ci >= grid->nc*conv)
 +    {
 +        return FALSE;
 +    }
 +
 +    while (*ci >= grid->cxy_ind[*ci_x*grid->ncy + *ci_y + 1]*conv)
 +    {
 +        *ci_y += 1;
 +        if (*ci_y == grid->ncy)
 +        {
 +            *ci_x += 1;
 +            *ci_y  = 0;
 +        }
 +    }
 +
 +    return TRUE;
 +}
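 +
 +/* Illustrative example, not part of the original change: with nth = 3
 + * threads and ci_block = 2, thread 0 visits ci 0,1, then jumps by
 + * (nth - 1)*ci_block = 4 to visit 6,7, then 12,13, and so on, while
 + * threads 1 and 2 cover the interleaved blocks.
 + */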
 +
 +/* Returns the distance^2 for which we put cell pairs in the list
 + * without checking atom pair distances. This is usually < rlist^2.
 + */
 +static float boundingbox_only_distance2(const nbnxn_grid_t *gridi,
 +                                        const nbnxn_grid_t *gridj,
 +                                        real                rlist,
 +                                        gmx_bool            simple)
 +{
 +    /* If the distance between two sub-cell bounding boxes is less
 +     * than this distance, do not check the distance between
 +     * all particle pairs in the sub-cell, since then it is likely
 +     * that the box pair has atom pairs within the cut-off.
 +     * We use the nblist cut-off minus 0.5 times the average x/y diagonal
 +     * spacing of the sub-cells. Around 40% of the checked pairs are pruned.
 +     * Using more than 0.5 gains at most 0.5%.
 +     * If forces are calculated more than twice, the performance gain
 +     * in the force calculation outweighs the cost of checking.
 +     * Note that with sub-cell lists, the atom-pair distance check
 +     * is only performed when only 1 out of 8 sub-cells is within range;
 +     * this is because the GPU is much faster than the CPU.
 +     */
 +    real bbx, bby;
 +    real rbb2;
 +
 +    bbx = 0.5*(gridi->sx + gridj->sx);
 +    bby = 0.5*(gridi->sy + gridj->sy);
 +    if (!simple)
 +    {
 +        bbx /= GPU_NSUBCELL_X;
 +        bby /= GPU_NSUBCELL_Y;
 +    }
 +
 +    rbb2 = sqr(max(0, rlist - 0.5*sqrt(bbx*bbx + bby*bby)));
 +
 +#ifndef GMX_DOUBLE
 +    return rbb2;
 +#else
 +    return (float)((1+GMX_FLOAT_EPS)*rbb2);
 +#endif
 +}
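 +
 +/* Worked example with illustrative numbers, not from the original change:
 + * rlist = 1.0 nm and average sub-cell sizes bbx = bby = 0.29 nm give a
 + * diagonal term of 0.5*sqrt(0.29^2 + 0.29^2) ~= 0.21 nm, so box pairs
 + * with distance^2 below (1.0 - 0.21)^2 ~= 0.63 nm^2 are accepted on the
 + * bounding-box distance alone.
 + */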
 +
 +static int get_ci_block_size(const nbnxn_grid_t *gridi,
 +                             gmx_bool bDomDec, int nth)
 +{
 +    const int ci_block_enum      = 5;
 +    const int ci_block_denom     = 11;
 +    const int ci_block_min_atoms = 16;
 +    int ci_block;
 +
 +    /* Here we decide how to distribute the blocks over the threads.
 +     * We use prime numbers to try to avoid that the grid size becomes
 +     * a multiple of the number of threads, which would lead to some
 +     * threads getting "inner" pairs and others getting boundary pairs,
 +     * which in turn will lead to load imbalance between threads.
 +     * Set the block size as 5/11/ntask times the average number of cells
 +     * in a y,z slab. This should ensure a quite uniform distribution
 +     * of the grid parts of the different threads along all three grid
 +     * zone boundaries with 3D domain decomposition. At the same time
 +     * the blocks will not become too small.
 +     */
 +    ci_block = (gridi->nc*ci_block_enum)/(ci_block_denom*gridi->ncx*nth);
 +
 +    /* Ensure the blocks are not too small: avoids cache invalidation */
 +    if (ci_block*gridi->na_sc < ci_block_min_atoms)
 +    {
 +        ci_block = (ci_block_min_atoms + gridi->na_sc - 1)/gridi->na_sc;
 +    }
 +
 +    /* Without domain decomposition
 +     * or with less than 3 blocks per task, divide in nth blocks.
 +     */
 +    if (!bDomDec || ci_block*3*nth > gridi->nc)
 +    {
 +        ci_block = (gridi->nc + nth - 1)/nth;
 +    }
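 +
 +    /* Illustrative example, not from the original change: gridi->nc = 1100,
 +     * gridi->ncx = 10 and nth = 4 give ci_block = 1100*5/(11*10*4) = 12;
 +     * with na_sc = 4 that is 48 atoms per block (>= 16), and
 +     * 12*3*4 = 144 <= 1100, so neither fallback triggers.
 +     */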
 +
 +    return ci_block;
 +}
 +
 +/* Generates the part of pair-list nbl assigned to our thread */
 +static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
 +                                     const nbnxn_grid_t *gridi,
 +                                     const nbnxn_grid_t *gridj,
 +                                     nbnxn_search_work_t *work,
 +                                     const nbnxn_atomdata_t *nbat,
 +                                     const t_blocka *excl,
 +                                     real rlist,
 +                                     int nb_kernel_type,
 +                                     int ci_block,
 +                                     gmx_bool bFBufferFlag,
 +                                     int nsubpair_max,
 +                                     gmx_bool progBal,
 +                                     int min_ci_balanced,
 +                                     int th, int nth,
 +                                     nbnxn_pairlist_t *nbl)
 +{
 +    int  na_cj_2log;
 +    matrix box;
 +    real rl2;
 +    float rbb2;
 +    int  d;
 +    int  ci_b, ci, ci_x, ci_y, ci_xy, cj;
 +    ivec shp;
 +    int  tx, ty, tz;
 +    int  shift;
 +    gmx_bool bMakeList;
 +    real shx, shy, shz;
 +    int  conv_i, cell0_i;
-         bb_i    = gridi->bb;
++    const nbnxn_bb_t *bb_i  = NULL;
++#ifdef NBNXN_BBXXXX
++    const float      *pbb_i = NULL;
++#endif
++    const float      *bbcz_i, *bbcz_j;
 +    const int *flags_i;
 +    real bx0, bx1, by0, by1, bz0, bz1;
 +    real bz1_frac;
 +    real d2cx, d2z, d2z_cx, d2z_cy, d2zx, d2zxy, d2xy;
 +    int  cxf, cxl, cyf, cyf_x, cyl;
 +    int  cx, cy;
 +    int  c0, c1, cs, cf, cl;
 +    int  ndistc;
 +    int  ncpcheck;
 +    int  gridi_flag_shift = 0, gridj_flag_shift = 0;
 +    unsigned *gridj_flag  = NULL;
 +    int  ncj_old_i, ncj_old_j;
 +
 +    nbs_cycle_start(&work->cc[enbsCCsearch]);
 +
 +    if (gridj->bSimple != nbl->bSimple)
 +    {
 +        gmx_incons("Grid incompatible with pair-list");
 +    }
 +
 +    sync_work(nbl);
 +    nbl->na_sc = gridj->na_sc;
 +    nbl->na_ci = gridj->na_c;
 +    nbl->na_cj = nbnxn_kernel_to_cj_size(nb_kernel_type);
 +    na_cj_2log = get_2log(nbl->na_cj);
 +
 +    nbl->rlist  = rlist;
 +
 +    if (bFBufferFlag)
 +    {
 +        /* Determine conversion of clusters to flag blocks */
 +        gridi_flag_shift = 0;
 +        while ((nbl->na_ci<<gridi_flag_shift) < NBNXN_BUFFERFLAG_SIZE)
 +        {
 +            gridi_flag_shift++;
 +        }
 +        gridj_flag_shift = 0;
 +        while ((nbl->na_cj<<gridj_flag_shift) < NBNXN_BUFFERFLAG_SIZE)
 +        {
 +            gridj_flag_shift++;
 +        }
 +
 +        gridj_flag = work->buffer_flags.flag;
 +    }
 +
 +    copy_mat(nbs->box, box);
 +
 +    rl2 = nbl->rlist*nbl->rlist;
 +
 +    rbb2 = boundingbox_only_distance2(gridi, gridj, nbl->rlist, nbl->bSimple);
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "nbl bounding box only distance %f\n", sqrt(rbb2));
 +    }
 +
 +    /* Set the shift range */
 +    for (d = 0; d < DIM; d++)
 +    {
 +        /* Check if we need periodicity shifts.
 +         * Without PBC or with domain decomposition we don't need them.
 +         */
 +        if (d >= ePBC2npbcdim(nbs->ePBC) || nbs->dd_dim[d])
 +        {
 +            shp[d] = 0;
 +        }
 +        else
 +        {
 +            if (d == XX &&
 +                box[XX][XX] - fabs(box[YY][XX]) - fabs(box[ZZ][XX]) < sqrt(rl2))
 +            {
 +                shp[d] = 2;
 +            }
 +            else
 +            {
 +                shp[d] = 1;
 +            }
 +        }
 +    }
 +
 +    if (nbl->bSimple && !gridi->bSimple)
 +    {
 +        conv_i  = gridi->na_sc/gridj->na_sc;
 +        bb_i    = gridi->bb_simple;
 +        bbcz_i  = gridi->bbcz_simple;
 +        flags_i = gridi->flags_simple;
 +    }
 +    else
 +    {
 +        conv_i  = 1;
-                 bx1 = bb_i[ci*NNBSBB_B+NNBSBB_C+XX];
++#ifdef NBNXN_BBXXXX
++        if (gridi->bSimple)
++        {
++            bb_i  = gridi->bb;
++        }
++        else
++        {
++            pbb_i = gridi->pbb;
++        }
++#else
++        /* We use the normal bounding box format for both grid types */
++        bb_i  = gridi->bb;
++#endif
 +        bbcz_i  = gridi->bbcz;
 +        flags_i = gridi->flags;
 +    }
 +    cell0_i = gridi->cell0*conv_i;
 +
 +    bbcz_j = gridj->bbcz;
 +
 +    if (conv_i != 1)
 +    {
 +        /* Blocks of the conversion factor - 1 give a large repeat count
 +         * combined with a small block size. This should result in good
 +         * load balancing for both small and large domains.
 +         */
 +        ci_block = conv_i - 1;
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug, "nbl nc_i %d col.av. %.1f ci_block %d\n",
 +                gridi->nc, gridi->nc/(double)(gridi->ncx*gridi->ncy), ci_block);
 +    }
 +
 +    ndistc   = 0;
 +    ncpcheck = 0;
 +
 +    /* Initialize ci_b and ci to 1 before where we want them to start,
 +     * as they will both be incremented in next_ci.
 +     */
 +    ci_b = -1;
 +    ci   = th*ci_block - 1;
 +    ci_x = 0;
 +    ci_y = 0;
 +    while (next_ci(gridi, conv_i, nth, ci_block, &ci_x, &ci_y, &ci_b, &ci))
 +    {
 +        if (nbl->bSimple && flags_i[ci] == 0)
 +        {
 +            continue;
 +        }
 +
 +        ncj_old_i = nbl->ncj;
 +
 +        d2cx = 0;
 +        if (gridj != gridi && shp[XX] == 0)
 +        {
 +            if (nbl->bSimple)
 +            {
-                     by0 = bb_i[ci*NNBSBB_B         +YY] + shy;
-                     by1 = bb_i[ci*NNBSBB_B+NNBSBB_C+YY] + shy;
++                bx1 = bb_i[ci].upper[BB_X];
 +            }
 +            else
 +            {
 +                bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx;
 +            }
 +            if (bx1 < gridj->c0[XX])
 +            {
 +                d2cx = sqr(gridj->c0[XX] - bx1);
 +
 +                if (d2cx >= rl2)
 +                {
 +                    continue;
 +                }
 +            }
 +        }
 +
 +        ci_xy = ci_x*gridi->ncy + ci_y;
 +
 +        /* Loop over shift vectors in three dimensions */
 +        for (tz = -shp[ZZ]; tz <= shp[ZZ]; tz++)
 +        {
 +            shz = tz*box[ZZ][ZZ];
 +
 +            bz0 = bbcz_i[ci*NNBSBB_D  ] + shz;
 +            bz1 = bbcz_i[ci*NNBSBB_D+1] + shz;
 +
 +            if (tz == 0)
 +            {
 +                d2z = 0;
 +            }
 +            else if (tz < 0)
 +            {
 +                d2z = sqr(bz1);
 +            }
 +            else
 +            {
 +                d2z = sqr(bz0 - box[ZZ][ZZ]);
 +            }
 +
 +            d2z_cx = d2z + d2cx;
 +
 +            if (d2z_cx >= rl2)
 +            {
 +                continue;
 +            }
 +
 +            bz1_frac =
 +                bz1/((real)(gridi->cxy_ind[ci_xy+1] - gridi->cxy_ind[ci_xy]));
 +            if (bz1_frac < 0)
 +            {
 +                bz1_frac = 0;
 +            }
 +            /* The check with bz1_frac close to or larger than 1 comes later */
 +
 +            for (ty = -shp[YY]; ty <= shp[YY]; ty++)
 +            {
 +                shy = ty*box[YY][YY] + tz*box[ZZ][YY];
 +
 +                if (nbl->bSimple)
 +                {
-                         bx0 = bb_i[ci*NNBSBB_B         +XX] + shx;
-                         bx1 = bb_i[ci*NNBSBB_B+NNBSBB_C+XX] + shx;
++                    by0 = bb_i[ci].lower[BB_Y] + shy;
++                    by1 = bb_i[ci].upper[BB_Y] + shy;
 +                }
 +                else
 +                {
 +                    by0 = gridi->c0[YY] + (ci_y  )*gridi->sy + shy;
 +                    by1 = gridi->c0[YY] + (ci_y+1)*gridi->sy + shy;
 +                }
 +
 +                get_cell_range(by0, by1,
 +                               gridj->ncy, gridj->c0[YY], gridj->sy, gridj->inv_sy,
 +                               d2z_cx, rl2,
 +                               &cyf, &cyl);
 +
 +                if (cyf > cyl)
 +                {
 +                    continue;
 +                }
 +
 +                d2z_cy = d2z;
 +                if (by1 < gridj->c0[YY])
 +                {
 +                    d2z_cy += sqr(gridj->c0[YY] - by1);
 +                }
 +                else if (by0 > gridj->c1[YY])
 +                {
 +                    d2z_cy += sqr(by0 - gridj->c1[YY]);
 +                }
 +
 +                for (tx = -shp[XX]; tx <= shp[XX]; tx++)
 +                {
 +                    shift = XYZ2IS(tx, ty, tz);
 +
 +#ifdef NBNXN_SHIFT_BACKWARD
 +                    if (gridi == gridj && shift > CENTRAL)
 +                    {
 +                        continue;
 +                    }
 +#endif
 +
 +                    shx = tx*box[XX][XX] + ty*box[YY][XX] + tz*box[ZZ][XX];
 +
 +                    if (nbl->bSimple)
 +                    {
-                                         if (box_dist2(bx0, bx1, by0, by1, bz0, bz1,
-                                                       bb+k*NNBSBB_B) < rl2 &&
++                        bx0 = bb_i[ci].lower[BB_X] + shx;
++                        bx1 = bb_i[ci].upper[BB_X] + shx;
 +                    }
 +                    else
 +                    {
 +                        bx0 = gridi->c0[XX] + (ci_x  )*gridi->sx + shx;
 +                        bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx + shx;
 +                    }
 +
 +                    get_cell_range(bx0, bx1,
 +                                   gridj->ncx, gridj->c0[XX], gridj->sx, gridj->inv_sx,
 +                                   d2z_cy, rl2,
 +                                   &cxf, &cxl);
 +
 +                    if (cxf > cxl)
 +                    {
 +                        continue;
 +                    }
 +
 +                    if (nbl->bSimple)
 +                    {
 +                        new_ci_entry(nbl, cell0_i+ci, shift, flags_i[ci]);
 +                    }
 +                    else
 +                    {
 +                        new_sci_entry(nbl, cell0_i+ci, shift);
 +                    }
 +
 +#ifndef NBNXN_SHIFT_BACKWARD
 +                    if (cxf < ci_x)
 +#else
 +                    if (shift == CENTRAL && gridi == gridj &&
 +                        cxf < ci_x)
 +#endif
 +                    {
 +                        /* Leave the pairs with i > j.
 +                         * x is the major index, so skip half of it.
 +                         */
 +                        cxf = ci_x;
 +                    }
 +
 +                    if (nbl->bSimple)
 +                    {
 +                        set_icell_bb_simple(bb_i, ci, shx, shy, shz,
 +                                            nbl->work->bb_ci);
 +                    }
 +                    else
 +                    {
++#ifdef NBNXN_BBXXXX
++                        set_icell_bbxxxx_supersub(pbb_i, ci, shx, shy, shz,
++                                                  nbl->work->pbb_ci);
++#else
 +                        set_icell_bb_supersub(bb_i, ci, shx, shy, shz,
 +                                              nbl->work->bb_ci);
++#endif
 +                    }
 +
 +                    nbs->icell_set_x(cell0_i+ci, shx, shy, shz,
 +                                     gridi->na_c, nbat->xstride, nbat->x,
 +                                     nbl->work);
 +
 +                    for (cx = cxf; cx <= cxl; cx++)
 +                    {
 +                        d2zx = d2z;
 +                        if (gridj->c0[XX] + cx*gridj->sx > bx1)
 +                        {
 +                            d2zx += sqr(gridj->c0[XX] + cx*gridj->sx - bx1);
 +                        }
 +                        else if (gridj->c0[XX] + (cx+1)*gridj->sx < bx0)
 +                        {
 +                            d2zx += sqr(gridj->c0[XX] + (cx+1)*gridj->sx - bx0);
 +                        }
 +
 +#ifndef NBNXN_SHIFT_BACKWARD
 +                        if (gridi == gridj &&
 +                            cx == 0 && cyf < ci_y)
 +#else
 +                        if (gridi == gridj &&
 +                            cx == 0 && shift == CENTRAL && cyf < ci_y)
 +#endif
 +                        {
 +                            /* Leave the pairs with i > j.
 +                             * Skip half of y when i and j have the same x.
 +                             */
 +                            cyf_x = ci_y;
 +                        }
 +                        else
 +                        {
 +                            cyf_x = cyf;
 +                        }
 +
 +                        for (cy = cyf_x; cy <= cyl; cy++)
 +                        {
 +                            c0 = gridj->cxy_ind[cx*gridj->ncy+cy];
 +                            c1 = gridj->cxy_ind[cx*gridj->ncy+cy+1];
 +#ifdef NBNXN_SHIFT_BACKWARD
 +                            if (gridi == gridj &&
 +                                shift == CENTRAL && c0 < ci)
 +                            {
 +                                c0 = ci;
 +                            }
 +#endif
 +
 +                            d2zxy = d2zx;
 +                            if (gridj->c0[YY] + cy*gridj->sy > by1)
 +                            {
 +                                d2zxy += sqr(gridj->c0[YY] + cy*gridj->sy - by1);
 +                            }
 +                            else if (gridj->c0[YY] + (cy+1)*gridj->sy < by0)
 +                            {
 +                                d2zxy += sqr(gridj->c0[YY] + (cy+1)*gridj->sy - by0);
 +                            }
 +                            if (c1 > c0 && d2zxy < rl2)
 +                            {
 +                                cs = c0 + (int)(bz1_frac*(c1 - c0));
 +                                if (cs >= c1)
 +                                {
 +                                    cs = c1 - 1;
 +                                }
 +
 +                                d2xy = d2zxy - d2z;
 +
 +                                /* Find the lowest cell that can possibly
 +                                 * be within range.
 +                                 */
 +                                cf = cs;
 +                                while (cf > c0 &&
 +                                       (bbcz_j[cf*NNBSBB_D+1] >= bz0 ||
 +                                        d2xy + sqr(bbcz_j[cf*NNBSBB_D+1] - bz0) < rl2))
 +                                {
 +                                    cf--;
 +                                }
 +
 +                                /* Find the highest cell that can possibly
 +                                 * be within range.
 +                                 */
 +                                cl = cs;
 +                                while (cl < c1-1 &&
 +                                       (bbcz_j[cl*NNBSBB_D] <= bz1 ||
 +                                        d2xy + sqr(bbcz_j[cl*NNBSBB_D] - bz1) < rl2))
 +                                {
 +                                    cl++;
 +                                }
 +
 +#ifdef NBNXN_REFCODE
 +                                {
 +                                    /* Simple reference code, for debugging,
 +                                     * overrides the more complex code above.
 +                                     */
 +                                    int k;
 +                                    cf = c1;
 +                                    cl = -1;
 +                                    for (k = c0; k < c1; k++)
 +                                    {
-                                         if (box_dist2(bx0, bx1, by0, by1, bz0, bz1,
-                                                       bb+k*NNBSBB_B) < rl2 &&
++                                        if (box_dist2(bx0, bx1, by0, by1, bz0, bz1, bb+k) < rl2 &&
 +                                            k < cf)
 +                                        {
 +                                            cf = k;
 +                                        }
++                                        if (box_dist2(bx0, bx1, by0, by1, bz0, bz1, bb+k) < rl2 &&
 +                                            k > cl)
 +                                        {
 +                                            cl = k;
 +                                        }
 +                                    }
 +                                }
 +#endif
 +
 +                                if (gridi == gridj)
 +                                {
 +                                    /* We want each atom/cell pair only once,
 +                                     * only use cj >= ci.
 +                                     */
 +#ifndef NBNXN_SHIFT_BACKWARD
 +                                    cf = max(cf, ci);
 +#else
 +                                    if (shift == CENTRAL)
 +                                    {
 +                                        cf = max(cf, ci);
 +                                    }
 +#endif
 +                                }
 +
 +                                if (cf <= cl)
 +                                {
 +                                    /* For f buffer flags with simple lists */
 +                                    ncj_old_j = nbl->ncj;
 +
 +                                    switch (nb_kernel_type)
 +                                    {
 +                                        case nbnxnk4x4_PlainC:
 +                                            check_subcell_list_space_simple(nbl, cl-cf+1);
 +
 +                                            make_cluster_list_simple(gridj,
 +                                                                     nbl, ci, cf, cl,
 +                                                                     (gridi == gridj && shift == CENTRAL),
 +                                                                     nbat->x,
 +                                                                     rl2, rbb2,
 +                                                                     &ndistc);
 +                                            break;
 +#ifdef GMX_NBNXN_SIMD_4XN
 +                                        case nbnxnk4xN_SIMD_4xN:
 +                                            check_subcell_list_space_simple(nbl, ci_to_cj(na_cj_2log, cl-cf)+2);
 +                                            make_cluster_list_simd_4xn(gridj,
 +                                                                       nbl, ci, cf, cl,
 +                                                                       (gridi == gridj && shift == CENTRAL),
 +                                                                       nbat->x,
 +                                                                       rl2, rbb2,
 +                                                                       &ndistc);
 +                                            break;
 +#endif
 +#ifdef GMX_NBNXN_SIMD_2XNN
 +                                        case nbnxnk4xN_SIMD_2xNN:
 +                                            check_subcell_list_space_simple(nbl, ci_to_cj(na_cj_2log, cl-cf)+2);
 +                                            make_cluster_list_simd_2xnn(gridj,
 +                                                                        nbl, ci, cf, cl,
 +                                                                        (gridi == gridj && shift == CENTRAL),
 +                                                                        nbat->x,
 +                                                                        rl2, rbb2,
 +                                                                        &ndistc);
 +                                            break;
 +#endif
 +                                        case nbnxnk8x8x8_PlainC:
 +                                        case nbnxnk8x8x8_CUDA:
 +                                            check_subcell_list_space_supersub(nbl, cl-cf+1);
 +                                            for (cj = cf; cj <= cl; cj++)
 +                                            {
 +                                                make_cluster_list_supersub(gridi, gridj,
 +                                                                           nbl, ci, cj,
 +                                                                           (gridi == gridj && shift == CENTRAL && ci == cj),
 +                                                                           nbat->xstride, nbat->x,
 +                                                                           rl2, rbb2,
 +                                                                           &ndistc);
 +                                            }
 +                                            break;
 +                                    }
 +                                    ncpcheck += cl - cf + 1;
 +
 +                                    if (bFBufferFlag && nbl->ncj > ncj_old_j)
 +                                    {
 +                                        int cbf, cbl, cb;
 +
 +                                        cbf = nbl->cj[ncj_old_j].cj >> gridj_flag_shift;
 +                                        cbl = nbl->cj[nbl->ncj-1].cj >> gridj_flag_shift;
 +                                        for (cb = cbf; cb <= cbl; cb++)
 +                                        {
 +                                            gridj_flag[cb] = 1U<<th;
 +                                        }
 +                                    }
 +                                }
 +                            }
 +                        }
 +                    }
 +
 +                    /* Set the exclusions for this ci list */
 +                    if (nbl->bSimple)
 +                    {
 +                        set_ci_top_excls(nbs,
 +                                         nbl,
 +                                         shift == CENTRAL && gridi == gridj,
 +                                         gridj->na_c_2log,
 +                                         na_cj_2log,
 +                                         &(nbl->ci[nbl->nci]),
 +                                         excl);
 +                    }
 +                    else
 +                    {
 +                        set_sci_top_excls(nbs,
 +                                          nbl,
 +                                          shift == CENTRAL && gridi == gridj,
 +                                          gridj->na_c_2log,
 +                                          &(nbl->sci[nbl->nsci]),
 +                                          excl);
 +                    }
 +
 +                    /* Close this ci list */
 +                    if (nbl->bSimple)
 +                    {
 +                        close_ci_entry_simple(nbl);
 +                    }
 +                    else
 +                    {
 +                        close_ci_entry_supersub(nbl,
 +                                                nsubpair_max,
 +                                                progBal, min_ci_balanced,
 +                                                th, nth);
 +                    }
 +                }
 +            }
 +        }
 +
 +        if (bFBufferFlag && nbl->ncj > ncj_old_i)
 +        {
 +            work->buffer_flags.flag[(gridi->cell0+ci)>>gridi_flag_shift] = 1U<<th;
 +        }
 +    }
 +
 +    work->ndistc = ndistc;
 +
 +    nbs_cycle_stop(&work->cc[enbsCCsearch]);
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "number of distance checks %d\n", ndistc);
 +        fprintf(debug, "ncpcheck %s %d\n", gridi == gridj ? "local" : "non-local",
 +                ncpcheck);
 +
 +        if (nbl->bSimple)
 +        {
 +            print_nblist_statistics_simple(debug, nbl, nbs, rlist);
 +        }
 +        else
 +        {
 +            print_nblist_statistics_supersub(debug, nbl, nbs, rlist);
 +        }
 +    }
 +}
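The cf/cl z-range search above is easier to follow in scalar form. A minimal sketch, with hypothetical cell bounds, cut-off, and column layout, of how the two walks bracket the candidate cells in one grid column (the squared-distance test stands in for the bbcz_j arithmetic of the real code):

    #include <stdio.h>

    #define NCELL 8

    int main(void)
    {
        /* Hypothetical per-cell z bounds of one grid column (ascending),
         * i-cluster z range [bz0, bz1], accumulated x/y distance d2xy,
         * and squared list cut-off rl2.
         */
        double zlo[NCELL] = { 0, 1, 2, 3, 4, 5, 6, 7 };
        double zhi[NCELL] = { 1, 2, 3, 4, 5, 6, 7, 8 };
        double bz0 = 3.2, bz1 = 4.1, d2xy = 0.25, rl2 = 1.0;
        int    c0 = 0, c1 = NCELL, cs, cf, cl;

        /* Initial guess from the fractional z position, as with bz1_frac */
        cs = c0 + (int)((bz1/8.0)*(c1 - c0));
        if (cs >= c1)
        {
            cs = c1 - 1;
        }

        /* Walk down while the current cell can still be within range */
        cf = cs;
        while (cf > c0 && (zhi[cf] >= bz0 ||
                           d2xy + (zhi[cf] - bz0)*(zhi[cf] - bz0) < rl2))
        {
            cf--;
        }
        /* Walk up while the current cell can still be within range */
        cl = cs;
        while (cl < c1 - 1 && (zlo[cl] <= bz1 ||
                               d2xy + (zlo[cl] - bz1)*(zlo[cl] - bz1) < rl2))
        {
            cl++;
        }

        /* The range is conservative: exact pair checks happen later */
        printf("candidate cells %d..%d\n", cf, cl);
        return 0;
    }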
 +
 +static void reduce_buffer_flags(const nbnxn_search_t        nbs,
 +                                int                         nsrc,
 +                                const nbnxn_buffer_flags_t *dest)
 +{
 +    int s, b;
 +    const unsigned *flag;
 +
 +    for (s = 0; s < nsrc; s++)
 +    {
 +        flag = nbs->work[s].buffer_flags.flag;
 +
 +        for (b = 0; b < dest->nflag; b++)
 +        {
 +            dest->flag[b] |= flag[b];
 +        }
 +    }
 +}
 +
 +static void print_reduction_cost(const nbnxn_buffer_flags_t *flags, int nout)
 +{
 +    int nelem, nkeep, ncopy, nred, b, c, out;
 +
 +    nelem = 0;
 +    nkeep = 0;
 +    ncopy = 0;
 +    nred  = 0;
 +    for (b = 0; b < flags->nflag; b++)
 +    {
 +        if (flags->flag[b] == 1)
 +        {
 +            /* Only flag 0 is set, no copy or reduction required */
 +            nelem++;
 +            nkeep++;
 +        }
 +        else if (flags->flag[b] > 0)
 +        {
 +            c = 0;
 +            for (out = 0; out < nout; out++)
 +            {
 +                if (flags->flag[b] & (1U<<out))
 +                {
 +                    c++;
 +                }
 +            }
 +            nelem += c;
 +            if (c == 1)
 +            {
 +                ncopy++;
 +            }
 +            else
 +            {
 +                nred += c;
 +            }
 +        }
 +    }
 +
 +    fprintf(debug, "nbnxn reduction: #flag %d #list %d elem %4.2f, keep %4.2f copy %4.2f red %4.2f\n",
 +            flags->nflag, nout,
 +            nelem/(double)(flags->nflag),
 +            nkeep/(double)(flags->nflag),
 +            ncopy/(double)(flags->nflag),
 +            nred/(double)(flags->nflag));
 +}
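For illustration, the keep/copy/reduce classification above is per-block bit counting over the thread mask. A minimal standalone sketch with made-up flag words and four outputs:

    #include <stdio.h>

    /* Classify one flag word as in print_reduction_cost():
     * exactly bit 0 set  -> output 0 already holds the result (keep),
     * one other bit set  -> a single copy suffices,
     * several bits set   -> a real reduction over the set outputs.
     */
    static const char *classify(unsigned flag, int nout)
    {
        int c = 0, out;

        if (flag == 0)
        {
            return "untouched";
        }
        if (flag == 1)
        {
            return "keep";
        }
        for (out = 0; out < nout; out++)
        {
            if (flag & (1U << out))
            {
                c++;
            }
        }
        return (c == 1) ? "copy" : "reduce";
    }

    int main(void)
    {
        /* Hypothetical flags for 4 blocks with 4 thread outputs */
        unsigned flags[4] = { 0x1, 0x4, 0x5, 0xF };
        int      b;

        for (b = 0; b < 4; b++)
        {
            printf("block %d: flag 0x%x -> %s\n", b, flags[b], classify(flags[b], 4));
        }
        return 0;
    }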
 +
 +/* Perform a count (linear) sort that moves the smaller lists to the end.
 + * This avoids load imbalance on the GPU, as large lists will be
 + * scheduled and executed first and the smaller lists later.
 + * Load balancing between multi-processors only happens at the end,
 + * where smaller lists give more effective load balancing.
 + * The sorting is done on the cj4 count, not on the actual pair counts.
 + * Not only does this make the sort faster, but it also results in
 + * better load balancing than using a list sorted on exact load.
 + * This function swaps the pointer in the pair list to avoid a copy operation.
 + */
 +static void sort_sci(nbnxn_pairlist_t *nbl)
 +{
 +    nbnxn_list_work_t *work;
 +    int                m, i, s, s0, s1;
 +    nbnxn_sci_t       *sci_sort;
 +
 +    if (nbl->ncj4 <= nbl->nsci)
 +    {
 +        /* nsci = 0 or all sci have size 1, sorting won't change the order */
 +        return;
 +    }
 +
 +    work = nbl->work;
 +
 +    /* We will distinguish differences up to double the average */
 +    m = (2*nbl->ncj4)/nbl->nsci;
 +
 +    if (m + 1 > work->sort_nalloc)
 +    {
 +        work->sort_nalloc = over_alloc_large(m + 1);
 +        srenew(work->sort, work->sort_nalloc);
 +    }
 +
 +    if (work->sci_sort_nalloc != nbl->sci_nalloc)
 +    {
 +        work->sci_sort_nalloc = nbl->sci_nalloc;
 +        nbnxn_realloc_void((void **)&work->sci_sort,
 +                           0,
 +                           work->sci_sort_nalloc*sizeof(*work->sci_sort),
 +                           nbl->alloc, nbl->free);
 +    }
 +
 +    /* Count the entries of each size */
 +    for (i = 0; i <= m; i++)
 +    {
 +        work->sort[i] = 0;
 +    }
 +    for (s = 0; s < nbl->nsci; s++)
 +    {
 +        i = min(m, nbl->sci[s].cj4_ind_end - nbl->sci[s].cj4_ind_start);
 +        work->sort[i]++;
 +    }
 +    /* Calculate the offset for each count */
 +    s0            = work->sort[m];
 +    work->sort[m] = 0;
 +    for (i = m - 1; i >= 0; i--)
 +    {
 +        s1            = work->sort[i];
 +        work->sort[i] = work->sort[i + 1] + s0;
 +        s0            = s1;
 +    }
 +
 +    /* Sort entries directly into place */
 +    sci_sort = work->sci_sort;
 +    for (s = 0; s < nbl->nsci; s++)
 +    {
 +        i = min(m, nbl->sci[s].cj4_ind_end - nbl->sci[s].cj4_ind_start);
 +        sci_sort[work->sort[i]++] = nbl->sci[s];
 +    }
 +
 +    /* Swap the sci pointers so we use the new, sorted list */
 +    work->sci_sort = nbl->sci;
 +    nbl->sci       = sci_sort;
 +}
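The count sort is perhaps clearest with scalars. A minimal sketch of the same clamp/count/offset/scatter steps on hypothetical cj4 counts; entries tied within a clamped class keep their original order, which is why 4 precedes 9 in the output:

    #include <stdio.h>

    #define NLIST 6
    #define M     4   /* clamp: everything >= M is treated as "large" */

    int main(void)
    {
        int sizes[NLIST] = { 1, 4, 2, 9, 1, 3 };  /* hypothetical cj4 counts */
        int sorted[NLIST];
        int count[M + 1] = { 0 };
        int i, s, s0, s1;

        /* Count entries of each (clamped) size */
        for (s = 0; s < NLIST; s++)
        {
            i = sizes[s] < M ? sizes[s] : M;
            count[i]++;
        }
        /* Turn counts into offsets so larger sizes come first */
        s0       = count[M];
        count[M] = 0;
        for (i = M - 1; i >= 0; i--)
        {
            s1       = count[i];
            count[i] = count[i + 1] + s0;
            s0       = s1;
        }
        /* Scatter entries directly into place */
        for (s = 0; s < NLIST; s++)
        {
            i = sizes[s] < M ? sizes[s] : M;
            sorted[count[i]++] = sizes[s];
        }
        for (s = 0; s < NLIST; s++)
        {
            printf("%d ", sorted[s]);   /* prints: 4 9 3 2 1 1 */
        }
        printf("\n");
        return 0;
    }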
 +
 +/* Make a local or non-local pair-list, depending on iloc */
 +void nbnxn_make_pairlist(const nbnxn_search_t  nbs,
 +                         nbnxn_atomdata_t     *nbat,
 +                         const t_blocka       *excl,
 +                         real                  rlist,
 +                         int                   min_ci_balanced,
 +                         nbnxn_pairlist_set_t *nbl_list,
 +                         int                   iloc,
 +                         int                   nb_kernel_type,
 +                         t_nrnb               *nrnb)
 +{
 +    nbnxn_grid_t *gridi, *gridj;
 +    gmx_bool bGPUCPU;
 +    int nzi, zi, zj0, zj1, zj;
 +    int nsubpair_max;
 +    int th;
 +    int nnbl;
 +    nbnxn_pairlist_t **nbl;
 +    int ci_block;
 +    gmx_bool CombineNBLists;
 +    gmx_bool progBal;
 +    int np_tot, np_noq, np_hlj, nap;
 +
 +    /* Check if we are running hybrid GPU + CPU nbnxn mode */
 +    bGPUCPU = (!nbs->grid[0].bSimple && nbl_list->bSimple);
 +
 +    nnbl            = nbl_list->nnbl;
 +    nbl             = nbl_list->nbl;
 +    CombineNBLists  = nbl_list->bCombined;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "ns making %d nblists\n", nnbl);
 +    }
 +
 +    nbat->bUseBufferFlags = (nbat->nout > 1);
 +    /* We should re-init the flags before making the first list */
 +    if (nbat->bUseBufferFlags && (LOCAL_I(iloc) || bGPUCPU))
 +    {
 +        init_buffer_flags(&nbat->buffer_flags, nbat->natoms);
 +    }
 +
 +    if (nbl_list->bSimple)
 +    {
 +        switch (nb_kernel_type)
 +        {
 +#ifdef GMX_NBNXN_SIMD_4XN
 +            case nbnxnk4xN_SIMD_4xN:
 +                nbs->icell_set_x = icell_set_x_simd_4xn;
 +                break;
 +#endif
 +#ifdef GMX_NBNXN_SIMD_2XNN
 +            case nbnxnk4xN_SIMD_2xNN:
 +                nbs->icell_set_x = icell_set_x_simd_2xnn;
 +                break;
 +#endif
 +            default:
 +                nbs->icell_set_x = icell_set_x_simple;
 +                break;
 +        }
 +    }
 +    else
 +    {
 +#ifdef NBNXN_SEARCH_BB_SSE
 +        nbs->icell_set_x = icell_set_x_supersub_sse8;
 +#else
 +        nbs->icell_set_x = icell_set_x_supersub;
 +#endif
 +    }
 +
 +    if (LOCAL_I(iloc))
 +    {
 +        /* Only zone (grid) 0 vs 0 */
 +        nzi = 1;
 +        zj0 = 0;
 +        zj1 = 1;
 +    }
 +    else
 +    {
 +        nzi = nbs->zones->nizone;
 +    }
 +
 +    if (!nbl_list->bSimple && min_ci_balanced > 0)
 +    {
 +        nsubpair_max = get_nsubpair_max(nbs, iloc, rlist, min_ci_balanced);
 +    }
 +    else
 +    {
 +        nsubpair_max = 0;
 +    }
 +
 +    /* Clear all pair-lists */
 +    for (th = 0; th < nnbl; th++)
 +    {
 +        clear_pairlist(nbl[th]);
 +    }
 +
 +    for (zi = 0; zi < nzi; zi++)
 +    {
 +        gridi = &nbs->grid[zi];
 +
 +        if (NONLOCAL_I(iloc))
 +        {
 +            zj0 = nbs->zones->izone[zi].j0;
 +            zj1 = nbs->zones->izone[zi].j1;
 +            if (zi == 0)
 +            {
 +                zj0++;
 +            }
 +        }
 +        for (zj = zj0; zj < zj1; zj++)
 +        {
 +            gridj = &nbs->grid[zj];
 +
 +            if (debug)
 +            {
 +                fprintf(debug, "ns search grid %d vs %d\n", zi, zj);
 +            }
 +
 +            nbs_cycle_start(&nbs->cc[enbsCCsearch]);
 +
 +            if (nbl[0]->bSimple && !gridi->bSimple)
 +            {
 +                /* Hybrid list, determine blocking later */
 +                ci_block = 0;
 +            }
 +            else
 +            {
 +                ci_block = get_ci_block_size(gridi, nbs->DomDec, nnbl);
 +            }
 +
 +#pragma omp parallel for num_threads(nnbl) schedule(static)
 +            for (th = 0; th < nnbl; th++)
 +            {
 +                /* Re-init the thread-local work flag data before making
 +                 * the first list (not an elegant conditional).
 +                 */
 +                if (nbat->bUseBufferFlags && ((zi == 0 && zj == 0) ||
 +                                              (bGPUCPU && zi == 0 && zj == 1)))
 +                {
 +                    init_buffer_flags(&nbs->work[th].buffer_flags, nbat->natoms);
 +                }
 +
 +                if (CombineNBLists && th > 0)
 +                {
 +                    clear_pairlist(nbl[th]);
 +                }
 +
 +                /* With GPUs: generate progressively smaller lists for
 +                 * load balancing, for local lists or non-local with 2 zones.
 +                 */
 +                progBal = (LOCAL_I(iloc) || nbs->zones->n <= 2);
 +
 +                /* Divide the i super cell equally over the nblists */
 +                nbnxn_make_pairlist_part(nbs, gridi, gridj,
 +                                         &nbs->work[th], nbat, excl,
 +                                         rlist,
 +                                         nb_kernel_type,
 +                                         ci_block,
 +                                         nbat->bUseBufferFlags,
 +                                         nsubpair_max,
 +                                         progBal, min_ci_balanced,
 +                                         th, nnbl,
 +                                         nbl[th]);
 +            }
 +            nbs_cycle_stop(&nbs->cc[enbsCCsearch]);
 +
 +            np_tot = 0;
 +            np_noq = 0;
 +            np_hlj = 0;
 +            for (th = 0; th < nnbl; th++)
 +            {
 +                inc_nrnb(nrnb, eNR_NBNXN_DIST2, nbs->work[th].ndistc);
 +
 +                if (nbl_list->bSimple)
 +                {
 +                    np_tot += nbl[th]->ncj;
 +                    np_noq += nbl[th]->work->ncj_noq;
 +                    np_hlj += nbl[th]->work->ncj_hlj;
 +                }
 +                else
 +                {
 +                    /* This count ignores potential subsequent pair pruning */
 +                    np_tot += nbl[th]->nci_tot;
 +                }
 +            }
 +            nap                   = nbl[0]->na_ci*nbl[0]->na_cj;
 +            nbl_list->natpair_ljq = (np_tot - np_noq)*nap - np_hlj*nap/2;
 +            nbl_list->natpair_lj  = np_noq*nap;
 +            nbl_list->natpair_q   = np_hlj*nap/2;
 +
 +            if (CombineNBLists && nnbl > 1)
 +            {
 +                nbs_cycle_start(&nbs->cc[enbsCCcombine]);
 +
 +                combine_nblists(nnbl-1, nbl+1, nbl[0]);
 +
 +                nbs_cycle_stop(&nbs->cc[enbsCCcombine]);
 +            }
 +        }
 +    }
 +
 +    if (!nbl_list->bSimple)
 +    {
 +        /* Sort the entries on size, large ones first */
 +        if (CombineNBLists || nnbl == 1)
 +        {
 +            sort_sci(nbl[0]);
 +        }
 +        else
 +        {
 +#pragma omp parallel for num_threads(nnbl) schedule(static)
 +            for (th = 0; th < nnbl; th++)
 +            {
 +                sort_sci(nbl[th]);
 +            }
 +        }
 +    }
 +
 +    if (nbat->bUseBufferFlags)
 +    {
 +        reduce_buffer_flags(nbs, nnbl, &nbat->buffer_flags);
 +    }
 +
 +    /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
 +    if (LOCAL_I(iloc))
 +    {
 +        nbs->search_count++;
 +    }
 +    if (nbs->print_cycles &&
 +        (!nbs->DomDec || (nbs->DomDec && !LOCAL_I(iloc))) &&
 +        nbs->search_count % 100 == 0)
 +    {
 +        nbs_cycle_print(stderr, nbs);
 +    }
 +
 +    if (debug && (CombineNBLists && nnbl > 1))
 +    {
 +        if (nbl[0]->bSimple)
 +        {
 +            print_nblist_statistics_simple(debug, nbl[0], nbs, rlist);
 +        }
 +        else
 +        {
 +            print_nblist_statistics_supersub(debug, nbl[0], nbs, rlist);
 +        }
 +    }
 +
 +    if (debug)
 +    {
 +        if (gmx_debug_at)
 +        {
 +            if (nbl[0]->bSimple)
 +            {
 +                print_nblist_ci_cj(debug, nbl[0]);
 +            }
 +            else
 +            {
 +                print_nblist_sci_cj(debug, nbl[0]);
 +            }
 +        }
 +
 +        if (nbat->bUseBufferFlags)
 +        {
 +            print_reduction_cost(&nbat->buffer_flags, nnbl);
 +        }
 +    }
 +}
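As an aside, the zi/zj grid pairing used above can be shown in isolation. A minimal sketch with made-up zone ranges, reducing LOCAL_I to a boolean:

    #include <stdio.h>

    /* Hypothetical zone ranges mimicking nbs->zones->izone[]:
     * i-zone zi may interact with j-zones in [j0, j1).
     */
    typedef struct { int j0, j1; } izone_t;

    int main(void)
    {
        izone_t izone[2] = { { 0, 4 }, { 1, 3 } };  /* made-up values */
        int     local    = 0;                       /* stands in for LOCAL_I */
        int     nzi      = local ? 1 : 2;
        int     zi, zj, zj0, zj1;

        for (zi = 0; zi < nzi; zi++)
        {
            if (local)
            {
                /* Local search: only grid 0 vs grid 0 */
                zj0 = 0;
                zj1 = 1;
            }
            else
            {
                zj0 = izone[zi].j0;
                zj1 = izone[zi].j1;
                if (zi == 0)
                {
                    zj0++;  /* 0 vs 0 was already done in the local search */
                }
            }
            for (zj = zj0; zj < zj1; zj++)
            {
                printf("search grid %d vs %d\n", zi, zj);
            }
        }
        return 0;
    }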
index d6e4fea39f60a15f56f053fa94babb6cf43e1c01,0000000000000000000000000000000000000000..f872c0c04172a260f1c97cc5bff5f81bd6331ac7
mode 100644,000000..100644
--- /dev/null
@@@ -1,301 -1,0 +1,301 @@@
-     const float                  *bb_ci;
 +/*
 + * This file is part of the GROMACS molecular simulation package.
 + *
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + * Copyright (c) 2012, by the GROMACS development team, led by
 + * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 + * others, as listed in the AUTHORS file in the top-level source
 + * directory and at http://www.gromacs.org.
 + *
 + * GROMACS is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2.1
 + * of the License, or (at your option) any later version.
 + *
 + * GROMACS is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with GROMACS; if not, see
 + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 + *
 + * If you want to redistribute modifications to GROMACS, please
 + * consider that scientific software is very special. Version
 + * control is crucial - bugs must be traceable. We will be happy to
 + * consider code for inclusion in the official distribution, but
 + * derived work must not be called official GROMACS. Details are found
 + * in the README & COPYING files - if they are missing, get the
 + * official version at http://www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org.
 + */
 +
 +/* Get the half-width SIMD stuff from the kernel utils files */
 +#include "nbnxn_kernels/nbnxn_kernel_simd_utils.h"
 +
 +
 +#if GMX_SIMD_WIDTH_HERE >= 2*NBNXN_CPU_CLUSTER_I_SIZE
 +#define STRIDE_S  (GMX_SIMD_WIDTH_HERE/2)
 +#else
 +#define STRIDE_S  NBNXN_CPU_CLUSTER_I_SIZE
 +#endif
 +
 +static gmx_inline gmx_mm_pr gmx_load_hpr_hilo_pr(const real *a)
 +{
 +    gmx_mm_hpr a_S;
 +    gmx_mm_pr  a_a_S;
 +
 +    gmx_load_hpr(&a_S, a);
 +
 +    gmx_2hpr_to_pr(a_S, a_S, &a_a_S);
 +
 +    return a_a_S;
 +}
 +
 +static gmx_inline gmx_mm_pr gmx_set_2real_shift_pr(const real *a, real shift)
 +{
 +    gmx_mm_hpr a0_S, a1_S;
 +    gmx_mm_pr  a0_a1_S;
 +
 +    gmx_set1_hpr(&a0_S, a[0] + shift);
 +    gmx_set1_hpr(&a1_S, a[1] + shift);
 +
 +    gmx_2hpr_to_pr(a0_S, a1_S, &a0_a1_S);
 +
 +    return a0_a1_S;
 +}
 +
 +/* Copies PBC shifted i-cell packed atom coordinates to working array */
 +static gmx_inline void
 +icell_set_x_simd_2xnn(int ci,
 +                      real shx, real shy, real shz,
 +                      int gmx_unused na_c,
 +                      int gmx_unused stride, const real *x,
 +                      nbnxn_list_work_t *work)
 +{
 +    int                     ia;
 +    nbnxn_x_ci_simd_2xnn_t *x_ci;
 +
 +    x_ci = work->x_ci_simd_2xnn;
 +
 +    ia = X_IND_CI_SIMD_2XNN(ci);
 +
 +    x_ci->ix_SSE0 = gmx_set_2real_shift_pr(x + ia + 0*STRIDE_S + 0, shx);
 +    x_ci->iy_SSE0 = gmx_set_2real_shift_pr(x + ia + 1*STRIDE_S + 0, shy);
 +    x_ci->iz_SSE0 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 0, shz);
 +    x_ci->ix_SSE2 = gmx_set_2real_shift_pr(x + ia + 0*STRIDE_S + 2, shx);
 +    x_ci->iy_SSE2 = gmx_set_2real_shift_pr(x + ia + 1*STRIDE_S + 2, shy);
 +    x_ci->iz_SSE2 = gmx_set_2real_shift_pr(x + ia + 2*STRIDE_S + 2, shz);
 +}
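What the two helpers produce is easier to see in scalar form. A minimal model, assuming a width-8 register purely for illustration: gmx_load_hpr_hilo_pr duplicates one half-width j-coordinate load into both register halves, while gmx_set_2real_shift_pr broadcasts two shifted i-coordinates, one per half; this is how each full-width register serves two i-atoms in the 2xNN kernel:

    #include <stdio.h>

    #define W 8  /* assumed full SIMD width; half-width is W/2 */

    /* Scalar model of gmx_load_hpr_hilo_pr: one half-width load,
     * duplicated into both register halves.
     */
    static void load_hilo(double out[W], const double *a)
    {
        int j;
        for (j = 0; j < W; j++)
        {
            out[j] = a[j % (W/2)];
        }
    }

    /* Scalar model of gmx_set_2real_shift_pr: a[0]+shift fills the
     * low half, a[1]+shift fills the high half.
     */
    static void set_2real_shift(double out[W], const double *a, double shift)
    {
        int j;
        for (j = 0; j < W; j++)
        {
            out[j] = a[j < W/2 ? 0 : 1] + shift;
        }
    }

    int main(void)
    {
        double x[W/2] = { 10, 11, 12, 13 };  /* hypothetical j coordinates */
        double xi[2]  = { 1, 2 };            /* two i-atom coordinates */
        double r[W];
        int    j;

        load_hilo(r, x);
        for (j = 0; j < W; j++) { printf("%g ", r[j]); }  /* 10 11 12 13 10 11 12 13 */
        printf("\n");

        set_2real_shift(r, xi, 0.5);
        for (j = 0; j < W; j++) { printf("%g ", r[j]); }  /* 1.5 x4, then 2.5 x4 */
        printf("\n");
        return 0;
    }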
 +
 +#ifndef GMX_SIMD_HAVE_ANYTRUE
 +/* Fallback function in case gmx_anytrue_pr is not present */
 +static gmx_inline gmx_bool
 +gmx_anytrue_2xn_pb(gmx_mm_pb bool_S)
 +{
 +    real     bools_array[2*GMX_SIMD_WIDTH_HERE], *bools;
 +    gmx_bool any;
 +    int      s;
 +
 +    bools = gmx_simd_align_real(bools_array);
 +
 +    gmx_store_pb(bools, bool_S);
 +
 +    any = FALSE;
 +    for (s = 0; s < GMX_SIMD_WIDTH_HERE; s++)
 +    {
 +        if (GMX_SIMD_IS_TRUE(bools[s]))
 +        {
 +            any = TRUE;
 +        }
 +    }
 +
 +    return any;
 +}
 +#endif
 +
 +/* SIMD code for making a pair list of cell ci vs cell cjf-cjl
 + * for coordinates in packed format.
 + * Checks bounding box distances and possibly atom pair distances.
 + * This is an accelerated version of make_cluster_list_simple.
 + */
 +static gmx_inline void
 +make_cluster_list_simd_2xnn(const nbnxn_grid_t *gridj,
 +                            nbnxn_pairlist_t *nbl,
 +                            int ci, int cjf, int cjl,
 +                            gmx_bool remove_sub_diag,
 +                            const real *x_j,
 +                            real rl2, float rbb2,
 +                            int *ndistc)
 +{
 +    const nbnxn_x_ci_simd_2xnn_t *work;
++    const nbnxn_bb_t             *bb_ci;
 +
 +    gmx_mm_pr                     jx_SSE, jy_SSE, jz_SSE;
 +
 +    gmx_mm_pr                     dx_SSE0, dy_SSE0, dz_SSE0;
 +    gmx_mm_pr                     dx_SSE2, dy_SSE2, dz_SSE2;
 +
 +    gmx_mm_pr                     rsq_SSE0;
 +    gmx_mm_pr                     rsq_SSE2;
 +
 +    gmx_mm_pb                     wco_SSE0;
 +    gmx_mm_pb                     wco_SSE2;
 +    gmx_mm_pb                     wco_any_SSE;
 +
 +    gmx_mm_pr                     rc2_SSE;
 +
 +    gmx_bool                      InRange;
 +    float                         d2;
 +    int                           xind_f, xind_l, cj;
 +
 +    cjf = CI_TO_CJ_SIMD_2XNN(cjf);
 +    cjl = CI_TO_CJ_SIMD_2XNN(cjl+1) - 1;
 +
 +    work = nbl->work->x_ci_simd_2xnn;
 +
 +    bb_ci = nbl->work->bb_ci;
 +
 +    rc2_SSE   = gmx_set1_pr(rl2);
 +
 +    InRange = FALSE;
 +    while (!InRange && cjf <= cjl)
 +    {
 +#ifdef NBNXN_SEARCH_BB_SSE
 +        d2 = subc_bb_dist2_sse(0, bb_ci, cjf, gridj->bbj);
 +#else
 +        d2 = subc_bb_dist2(0, bb_ci, cjf, gridj->bbj);
 +#endif
 +        *ndistc += 2;
 +
 +        /* If the bounding-box distance is below rbb, accept the pair
 +         * on the bounding boxes alone. Otherwise, if it is within the
 +         * cut-off, check whether at least one atom pair is within the
 +         * cut-off.
 +         */
 +        if (d2 < rbb2)
 +        {
 +            InRange = TRUE;
 +        }
 +        else if (d2 < rl2)
 +        {
 +            xind_f  = X_IND_CJ_SIMD_2XNN(CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cjf);
 +
 +            jx_SSE  = gmx_load_hpr_hilo_pr(x_j+xind_f+0*STRIDE_S);
 +            jy_SSE  = gmx_load_hpr_hilo_pr(x_j+xind_f+1*STRIDE_S);
 +            jz_SSE  = gmx_load_hpr_hilo_pr(x_j+xind_f+2*STRIDE_S);
 +
 +            /* Calculate distance */
 +            dx_SSE0            = gmx_sub_pr(work->ix_SSE0, jx_SSE);
 +            dy_SSE0            = gmx_sub_pr(work->iy_SSE0, jy_SSE);
 +            dz_SSE0            = gmx_sub_pr(work->iz_SSE0, jz_SSE);
 +            dx_SSE2            = gmx_sub_pr(work->ix_SSE2, jx_SSE);
 +            dy_SSE2            = gmx_sub_pr(work->iy_SSE2, jy_SSE);
 +            dz_SSE2            = gmx_sub_pr(work->iz_SSE2, jz_SSE);
 +
 +            /* rsq = dx*dx+dy*dy+dz*dz */
 +            rsq_SSE0           = gmx_calc_rsq_pr(dx_SSE0, dy_SSE0, dz_SSE0);
 +            rsq_SSE2           = gmx_calc_rsq_pr(dx_SSE2, dy_SSE2, dz_SSE2);
 +
 +            wco_SSE0           = gmx_cmplt_pr(rsq_SSE0, rc2_SSE);
 +            wco_SSE2           = gmx_cmplt_pr(rsq_SSE2, rc2_SSE);
 +
 +            wco_any_SSE        = gmx_or_pb(wco_SSE0, wco_SSE2);
 +
 +#ifdef GMX_SIMD_HAVE_ANYTRUE
 +            InRange            = gmx_anytrue_pb(wco_any_SSE);
 +#else
 +            InRange            = gmx_anytrue_2xn_pb(wco_any_SSE);
 +#endif
 +
 +            *ndistc += 2*GMX_SIMD_WIDTH_HERE;
 +        }
 +        if (!InRange)
 +        {
 +            cjf++;
 +        }
 +    }
 +    if (!InRange)
 +    {
 +        return;
 +    }
 +
 +    InRange = FALSE;
 +    while (!InRange && cjl > cjf)
 +    {
 +#ifdef NBNXN_SEARCH_BB_SSE
 +        d2 = subc_bb_dist2_sse(0, bb_ci, cjl, gridj->bbj);
 +#else
 +        d2 = subc_bb_dist2(0, bb_ci, cjl, gridj->bbj);
 +#endif
 +        *ndistc += 2;
 +
 +        /* If the bounding-box distance is below rbb, accept the pair
 +         * on the bounding boxes alone. Otherwise, if it is within the
 +         * cut-off, check whether at least one atom pair is within the
 +         * cut-off.
 +         */
 +        if (d2 < rbb2)
 +        {
 +            InRange = TRUE;
 +        }
 +        else if (d2 < rl2)
 +        {
 +            xind_l  = X_IND_CJ_SIMD_2XNN(CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cjl);
 +
 +            jx_SSE  = gmx_load_hpr_hilo_pr(x_j+xind_l+0*STRIDE_S);
 +            jy_SSE  = gmx_load_hpr_hilo_pr(x_j+xind_l+1*STRIDE_S);
 +            jz_SSE  = gmx_load_hpr_hilo_pr(x_j+xind_l+2*STRIDE_S);
 +
 +            /* Calculate distance */
 +            dx_SSE0            = gmx_sub_pr(work->ix_SSE0, jx_SSE);
 +            dy_SSE0            = gmx_sub_pr(work->iy_SSE0, jy_SSE);
 +            dz_SSE0            = gmx_sub_pr(work->iz_SSE0, jz_SSE);
 +            dx_SSE2            = gmx_sub_pr(work->ix_SSE2, jx_SSE);
 +            dy_SSE2            = gmx_sub_pr(work->iy_SSE2, jy_SSE);
 +            dz_SSE2            = gmx_sub_pr(work->iz_SSE2, jz_SSE);
 +
 +            /* rsq = dx*dx+dy*dy+dz*dz */
 +            rsq_SSE0           = gmx_calc_rsq_pr(dx_SSE0, dy_SSE0, dz_SSE0);
 +            rsq_SSE2           = gmx_calc_rsq_pr(dx_SSE2, dy_SSE2, dz_SSE2);
 +
 +            wco_SSE0           = gmx_cmplt_pr(rsq_SSE0, rc2_SSE);
 +            wco_SSE2           = gmx_cmplt_pr(rsq_SSE2, rc2_SSE);
 +
 +            wco_any_SSE        = gmx_or_pb(wco_SSE0, wco_SSE2);
 +
 +#ifdef GMX_SIMD_HAVE_ANYTRUE
 +            InRange            = gmx_anytrue_pb(wco_any_SSE);
 +#else
 +            InRange            = gmx_anytrue_2xn_pb(wco_any_SSE);
 +#endif
 +
 +            *ndistc += 2*GMX_SIMD_WIDTH_HERE;
 +        }
 +        if (!InRange)
 +        {
 +            cjl--;
 +        }
 +    }
 +
 +    if (cjf <= cjl)
 +    {
 +        for (cj = cjf; cj <= cjl; cj++)
 +        {
 +            /* Store cj and the interaction mask */
 +            nbl->cj[nbl->ncj].cj   = CI_TO_CJ_SIMD_2XNN(gridj->cell0) + cj;
 +            nbl->cj[nbl->ncj].excl = get_imask_simd_2xnn(remove_sub_diag, ci, cj);
 +            nbl->ncj++;
 +        }
 +        /* Increase the closing index in the i super-cell list */
 +        nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
 +    }
 +}
 +
 +#undef STRIDE_S
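Stripped of SIMD, the cjf/cjl narrowing in make_cluster_list_simd_2xnn (and its 4xn sibling below) is a two-sided linear prune. A minimal scalar sketch, with a hypothetical in_range() test standing in for the bounding-box and atom-pair checks:

    #include <stdio.h>

    /* Hypothetical per-cluster range test; stands in for the bounding-box
     * plus atom-pair distance checks of the real kernel.
     */
    static int in_range(int cj)
    {
        return cj >= 3 && cj <= 7;  /* made-up: only clusters 3..7 interact */
    }

    int main(void)
    {
        int cjf = 0, cjl = 10, cj;

        /* Advance the front until the first in-range cluster */
        while (cjf <= cjl && !in_range(cjf))
        {
            cjf++;
        }
        if (cjf > cjl)
        {
            return 0;  /* nothing in range, no pairs stored */
        }
        /* Retreat the back until the last in-range cluster */
        while (cjl > cjf && !in_range(cjl))
        {
            cjl--;
        }
        /* Everything in [cjf, cjl] is stored; any interior out-of-range
         * clusters are harmless, since the kernel re-applies the cut-off
         * per atom pair.
         */
        for (cj = cjf; cj <= cjl; cj++)
        {
            printf("store cj %d\n", cj);
        }
        return 0;
    }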
index eb4252fbd9a0c019342f0a5f418404570c4272cb,0000000000000000000000000000000000000000..5a9c203553ee4ce551bb9a19d4206f276e1bc4af
mode 100644,000000..100644
--- /dev/null
@@@ -1,311 -1,0 +1,311 @@@
-     const float                 *bb_ci;
 +/*
 + * This file is part of the GROMACS molecular simulation package.
 + *
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + * Copyright (c) 2012, by the GROMACS development team, led by
 + * David van der Spoel, Berk Hess, Erik Lindahl, and including many
 + * others, as listed in the AUTHORS file in the top-level source
 + * directory and at http://www.gromacs.org.
 + *
 + * GROMACS is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2.1
 + * of the License, or (at your option) any later version.
 + *
 + * GROMACS is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with GROMACS; if not, see
 + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 + *
 + * If you want to redistribute modifications to GROMACS, please
 + * consider that scientific software is very special. Version
 + * control is crucial - bugs must be traceable. We will be happy to
 + * consider code for inclusion in the official distribution, but
 + * derived work must not be called official GROMACS. Details are found
 + * in the README & COPYING files - if they are missing, get the
 + * official version at http://www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org.
 + */
 +
 +
 +#if GMX_SIMD_WIDTH_HERE >= NBNXN_CPU_CLUSTER_I_SIZE
 +#define STRIDE_S  (GMX_SIMD_WIDTH_HERE)
 +#else
 +#define STRIDE_S  NBNXN_CPU_CLUSTER_I_SIZE
 +#endif
 +
 +/* Copies PBC shifted i-cell packed atom coordinates to working array */
 +static gmx_inline void
 +icell_set_x_simd_4xn(int ci,
 +                     real shx, real shy, real shz,
 +                     int gmx_unused na_c,
 +                     int gmx_unused stride, const real *x,
 +                     nbnxn_list_work_t *work)
 +{
 +    int                    ia;
 +    nbnxn_x_ci_simd_4xn_t *x_ci;
 +
 +    x_ci = work->x_ci_simd_4xn;
 +
 +    ia = X_IND_CI_SIMD_4XN(ci);
 +
 +    x_ci->ix_SSE0 = gmx_set1_pr(x[ia + 0*STRIDE_S    ] + shx);
 +    x_ci->iy_SSE0 = gmx_set1_pr(x[ia + 1*STRIDE_S    ] + shy);
 +    x_ci->iz_SSE0 = gmx_set1_pr(x[ia + 2*STRIDE_S    ] + shz);
 +    x_ci->ix_SSE1 = gmx_set1_pr(x[ia + 0*STRIDE_S + 1] + shx);
 +    x_ci->iy_SSE1 = gmx_set1_pr(x[ia + 1*STRIDE_S + 1] + shy);
 +    x_ci->iz_SSE1 = gmx_set1_pr(x[ia + 2*STRIDE_S + 1] + shz);
 +    x_ci->ix_SSE2 = gmx_set1_pr(x[ia + 0*STRIDE_S + 2] + shx);
 +    x_ci->iy_SSE2 = gmx_set1_pr(x[ia + 1*STRIDE_S + 2] + shy);
 +    x_ci->iz_SSE2 = gmx_set1_pr(x[ia + 2*STRIDE_S + 2] + shz);
 +    x_ci->ix_SSE3 = gmx_set1_pr(x[ia + 0*STRIDE_S + 3] + shx);
 +    x_ci->iy_SSE3 = gmx_set1_pr(x[ia + 1*STRIDE_S + 3] + shy);
 +    x_ci->iz_SSE3 = gmx_set1_pr(x[ia + 2*STRIDE_S + 3] + shz);
 +}
 +
 +#ifndef GMX_SIMD_HAVE_ANYTRUE
 +/* Fallback function in case gmx_anytrue_pr is not present */
 +static gmx_inline gmx_bool
 +gmx_anytrue_4xn_pb(gmx_mm_pb bool_S)
 +{
 +    real     bools_array[2*GMX_SIMD_WIDTH_HERE], *bools;
 +    gmx_bool any;
 +    int      s;
 +
 +    bools = gmx_simd_align_real(bools_array);
 +
 +    gmx_store_pb(bools, bool_S);
 +
 +    any = FALSE;
 +    for (s = 0; s < GMX_SIMD_WIDTH_HERE; s++)
 +    {
 +        if (GMX_SIMD_IS_TRUE(bools[s]))
 +        {
 +            any = TRUE;
 +        }
 +    }
 +
 +    return any;
 +}
 +#endif
 +
 +/* SIMD code for making a pair list of cell ci vs cell cjf-cjl
 + * for coordinates in packed format.
 + * Checks bounding box distances and possibly atom pair distances.
 + * This is an accelerated version of make_cluster_list_simple.
 + */
 +static gmx_inline void
 +make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj,
 +                           nbnxn_pairlist_t *nbl,
 +                           int ci, int cjf, int cjl,
 +                           gmx_bool remove_sub_diag,
 +                           const real *x_j,
 +                           real rl2, float rbb2,
 +                           int *ndistc)
 +{
 +    const nbnxn_x_ci_simd_4xn_t *work;
++    const nbnxn_bb_t            *bb_ci;
 +
 +    gmx_mm_pr                    jx_SSE, jy_SSE, jz_SSE;
 +
 +    gmx_mm_pr                    dx_SSE0, dy_SSE0, dz_SSE0;
 +    gmx_mm_pr                    dx_SSE1, dy_SSE1, dz_SSE1;
 +    gmx_mm_pr                    dx_SSE2, dy_SSE2, dz_SSE2;
 +    gmx_mm_pr                    dx_SSE3, dy_SSE3, dz_SSE3;
 +
 +    gmx_mm_pr                    rsq_SSE0;
 +    gmx_mm_pr                    rsq_SSE1;
 +    gmx_mm_pr                    rsq_SSE2;
 +    gmx_mm_pr                    rsq_SSE3;
 +
 +    gmx_mm_pb                    wco_SSE0;
 +    gmx_mm_pb                    wco_SSE1;
 +    gmx_mm_pb                    wco_SSE2;
 +    gmx_mm_pb                    wco_SSE3;
 +    gmx_mm_pb                    wco_any_SSE01, wco_any_SSE23, wco_any_SSE;
 +
 +    gmx_mm_pr                    rc2_SSE;
 +
 +    gmx_bool                     InRange;
 +    float                        d2;
 +    int                          xind_f, xind_l, cj;
 +
 +    cjf = CI_TO_CJ_SIMD_4XN(cjf);
 +    cjl = CI_TO_CJ_SIMD_4XN(cjl+1) - 1;
 +
 +    work = nbl->work->x_ci_simd_4xn;
 +
 +    bb_ci = nbl->work->bb_ci;
 +
 +    rc2_SSE   = gmx_set1_pr(rl2);
 +
 +    InRange = FALSE;
 +    while (!InRange && cjf <= cjl)
 +    {
 +#ifdef NBNXN_SEARCH_BB_SSE
 +        d2 = subc_bb_dist2_sse(0, bb_ci, cjf, gridj->bbj);
 +#else
 +        d2 = subc_bb_dist2(0, bb_ci, cjf, gridj->bbj);
 +#endif
 +        *ndistc += 2;
 +
 +        /* If the bounding-box distance is below rbb, accept the pair
 +         * on the bounding boxes alone. Otherwise, if it is within the
 +         * cut-off, check whether at least one atom pair is within the
 +         * cut-off.
 +         */
 +        if (d2 < rbb2)
 +        {
 +            InRange = TRUE;
 +        }
 +        else if (d2 < rl2)
 +        {
 +            xind_f  = X_IND_CJ_SIMD_4XN(CI_TO_CJ_SIMD_4XN(gridj->cell0) + cjf);
 +
 +            jx_SSE  = gmx_load_pr(x_j+xind_f+0*STRIDE_S);
 +            jy_SSE  = gmx_load_pr(x_j+xind_f+1*STRIDE_S);
 +            jz_SSE  = gmx_load_pr(x_j+xind_f+2*STRIDE_S);
 +
 +            /* Calculate distance */
 +            dx_SSE0            = gmx_sub_pr(work->ix_SSE0, jx_SSE);
 +            dy_SSE0            = gmx_sub_pr(work->iy_SSE0, jy_SSE);
 +            dz_SSE0            = gmx_sub_pr(work->iz_SSE0, jz_SSE);
 +            dx_SSE1            = gmx_sub_pr(work->ix_SSE1, jx_SSE);
 +            dy_SSE1            = gmx_sub_pr(work->iy_SSE1, jy_SSE);
 +            dz_SSE1            = gmx_sub_pr(work->iz_SSE1, jz_SSE);
 +            dx_SSE2            = gmx_sub_pr(work->ix_SSE2, jx_SSE);
 +            dy_SSE2            = gmx_sub_pr(work->iy_SSE2, jy_SSE);
 +            dz_SSE2            = gmx_sub_pr(work->iz_SSE2, jz_SSE);
 +            dx_SSE3            = gmx_sub_pr(work->ix_SSE3, jx_SSE);
 +            dy_SSE3            = gmx_sub_pr(work->iy_SSE3, jy_SSE);
 +            dz_SSE3            = gmx_sub_pr(work->iz_SSE3, jz_SSE);
 +
 +            /* rsq = dx*dx+dy*dy+dz*dz */
 +            rsq_SSE0           = gmx_calc_rsq_pr(dx_SSE0, dy_SSE0, dz_SSE0);
 +            rsq_SSE1           = gmx_calc_rsq_pr(dx_SSE1, dy_SSE1, dz_SSE1);
 +            rsq_SSE2           = gmx_calc_rsq_pr(dx_SSE2, dy_SSE2, dz_SSE2);
 +            rsq_SSE3           = gmx_calc_rsq_pr(dx_SSE3, dy_SSE3, dz_SSE3);
 +
 +            wco_SSE0           = gmx_cmplt_pr(rsq_SSE0, rc2_SSE);
 +            wco_SSE1           = gmx_cmplt_pr(rsq_SSE1, rc2_SSE);
 +            wco_SSE2           = gmx_cmplt_pr(rsq_SSE2, rc2_SSE);
 +            wco_SSE3           = gmx_cmplt_pr(rsq_SSE3, rc2_SSE);
 +
 +            wco_any_SSE01      = gmx_or_pb(wco_SSE0, wco_SSE1);
 +            wco_any_SSE23      = gmx_or_pb(wco_SSE2, wco_SSE3);
 +            wco_any_SSE        = gmx_or_pb(wco_any_SSE01, wco_any_SSE23);
 +
 +#ifdef GMX_SIMD_HAVE_ANYTRUE
 +            InRange            = gmx_anytrue_pb(wco_any_SSE);
 +#else
 +            InRange            = gmx_anytrue_4xn_pb(wco_any_SSE);
 +#endif
 +
 +            *ndistc += 4*GMX_SIMD_WIDTH_HERE;
 +        }
 +        if (!InRange)
 +        {
 +            cjf++;
 +        }
 +    }
 +    if (!InRange)
 +    {
 +        return;
 +    }
 +
 +    InRange = FALSE;
 +    while (!InRange && cjl > cjf)
 +    {
 +#ifdef NBNXN_SEARCH_BB_SSE
 +        d2 = subc_bb_dist2_sse(0, bb_ci, cjl, gridj->bbj);
 +#else
 +        d2 = subc_bb_dist2(0, bb_ci, cjl, gridj->bbj);
 +#endif
 +        *ndistc += 2;
 +
 +        /* If the bounding-box distance is below rbb, accept the pair
 +         * on the bounding boxes alone. Otherwise, if it is within the
 +         * cut-off, check whether at least one atom pair is within the
 +         * cut-off.
 +         */
 +        if (d2 < rbb2)
 +        {
 +            InRange = TRUE;
 +        }
 +        else if (d2 < rl2)
 +        {
 +            xind_l  = X_IND_CJ_SIMD_4XN(CI_TO_CJ_SIMD_4XN(gridj->cell0) + cjl);
 +
 +            jx_SSE  = gmx_load_pr(x_j+xind_l+0*STRIDE_S);
 +            jy_SSE  = gmx_load_pr(x_j+xind_l+1*STRIDE_S);
 +            jz_SSE  = gmx_load_pr(x_j+xind_l+2*STRIDE_S);
 +
 +            /* Calculate distance */
 +            dx_SSE0            = gmx_sub_pr(work->ix_SSE0, jx_SSE);
 +            dy_SSE0            = gmx_sub_pr(work->iy_SSE0, jy_SSE);
 +            dz_SSE0            = gmx_sub_pr(work->iz_SSE0, jz_SSE);
 +            dx_SSE1            = gmx_sub_pr(work->ix_SSE1, jx_SSE);
 +            dy_SSE1            = gmx_sub_pr(work->iy_SSE1, jy_SSE);
 +            dz_SSE1            = gmx_sub_pr(work->iz_SSE1, jz_SSE);
 +            dx_SSE2            = gmx_sub_pr(work->ix_SSE2, jx_SSE);
 +            dy_SSE2            = gmx_sub_pr(work->iy_SSE2, jy_SSE);
 +            dz_SSE2            = gmx_sub_pr(work->iz_SSE2, jz_SSE);
 +            dx_SSE3            = gmx_sub_pr(work->ix_SSE3, jx_SSE);
 +            dy_SSE3            = gmx_sub_pr(work->iy_SSE3, jy_SSE);
 +            dz_SSE3            = gmx_sub_pr(work->iz_SSE3, jz_SSE);
 +
 +            /* rsq = dx*dx+dy*dy+dz*dz */
 +            rsq_SSE0           = gmx_calc_rsq_pr(dx_SSE0, dy_SSE0, dz_SSE0);
 +            rsq_SSE1           = gmx_calc_rsq_pr(dx_SSE1, dy_SSE1, dz_SSE1);
 +            rsq_SSE2           = gmx_calc_rsq_pr(dx_SSE2, dy_SSE2, dz_SSE2);
 +            rsq_SSE3           = gmx_calc_rsq_pr(dx_SSE3, dy_SSE3, dz_SSE3);
 +
 +            wco_SSE0           = gmx_cmplt_pr(rsq_SSE0, rc2_SSE);
 +            wco_SSE1           = gmx_cmplt_pr(rsq_SSE1, rc2_SSE);
 +            wco_SSE2           = gmx_cmplt_pr(rsq_SSE2, rc2_SSE);
 +            wco_SSE3           = gmx_cmplt_pr(rsq_SSE3, rc2_SSE);
 +
 +            wco_any_SSE01      = gmx_or_pb(wco_SSE0, wco_SSE1);
 +            wco_any_SSE23      = gmx_or_pb(wco_SSE2, wco_SSE3);
 +            wco_any_SSE        = gmx_or_pb(wco_any_SSE01, wco_any_SSE23);
 +
 +#ifdef GMX_SIMD_HAVE_ANYTRUE
 +            InRange            = gmx_anytrue_pb(wco_any_SSE);
 +#else
 +            InRange            = gmx_anytrue_4xn_pb(wco_any_SSE);
 +#endif
 +
 +            *ndistc += 4*GMX_SIMD_WIDTH_HERE;
 +        }
 +        if (!InRange)
 +        {
 +            cjl--;
 +        }
 +    }
 +
 +    if (cjf <= cjl)
 +    {
 +        for (cj = cjf; cj <= cjl; cj++)
 +        {
 +            /* Store cj and the interaction mask */
 +            nbl->cj[nbl->ncj].cj   = CI_TO_CJ_SIMD_4XN(gridj->cell0) + cj;
 +            nbl->cj[nbl->ncj].excl = get_imask_simd_4xn(remove_sub_diag, ci, cj);
 +            nbl->ncj++;
 +        }
 +        /* Increase the closing index in the i super-cell list */
 +        nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
 +    }
 +}
 +
 +#undef STRIDE_S
 +
index 372f34a5759d615c35791c0029ad2f16d2db2098,0000000000000000000000000000000000000000..ecc064a99fdf75c93146d8b0394389bc6664e200
mode 100644,000000..100644
--- /dev/null
@@@ -1,1659 -1,0 +1,1660 @@@
-         /* Steps are divided over the nodes iso splitting the atoms */
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +#include <signal.h>
 +#include <stdlib.h>
 +#ifdef HAVE_UNISTD_H
 +#include <unistd.h>
 +#endif
 +#include <string.h>
 +#include <assert.h>
 +
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "sysstuff.h"
 +#include "statutil.h"
 +#include "force.h"
 +#include "mdrun.h"
 +#include "md_logging.h"
 +#include "md_support.h"
 +#include "network.h"
 +#include "pull.h"
 +#include "pull_rotation.h"
 +#include "names.h"
 +#include "disre.h"
 +#include "orires.h"
 +#include "pme.h"
 +#include "mdatoms.h"
 +#include "repl_ex.h"
 +#include "qmmm.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "coulomb.h"
 +#include "constr.h"
 +#include "mvdata.h"
 +#include "checkpoint.h"
 +#include "mtop_util.h"
 +#include "sighandler.h"
 +#include "tpxio.h"
 +#include "txtdump.h"
 +#include "gmx_detect_hardware.h"
 +#include "gmx_omp_nthreads.h"
 +#include "pull_rotation.h"
 +#include "calc_verletbuf.h"
 +#include "../mdlib/nbnxn_search.h"
 +#include "../mdlib/nbnxn_consts.h"
 +#include "gmx_fatal_collective.h"
 +#include "membed.h"
 +#include "macros.h"
 +#include "gmx_omp.h"
 +#include "gmx_thread_affinity.h"
 +
 +#include "gromacs/utility/gmxmpi.h"
 +
 +#ifdef GMX_FAHCORE
 +#include "corewrap.h"
 +#endif
 +
 +#include "gpu_utils.h"
 +#include "nbnxn_cuda_data_mgmt.h"
 +
 +typedef struct {
 +    gmx_integrator_t *func;
 +} gmx_intp_t;
 +
 +/* The array should match the eI array in include/types/enums.h */
 +const gmx_intp_t    integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md}, {do_md}};
 +
 +gmx_large_int_t     deform_init_init_step_tpx;
 +matrix              deform_init_box_tpx;
 +#ifdef GMX_THREAD_MPI
 +tMPI_Thread_mutex_t deform_init_box_mutex = TMPI_THREAD_MUTEX_INITIALIZER;
 +#endif
 +
 +
 +#ifdef GMX_THREAD_MPI
 +struct mdrunner_arglist
 +{
 +    gmx_hw_opt_t   *hw_opt;
 +    FILE           *fplog;
 +    t_commrec      *cr;
 +    int             nfile;
 +    const t_filenm *fnm;
 +    output_env_t    oenv;
 +    gmx_bool        bVerbose;
 +    gmx_bool        bCompact;
 +    int             nstglobalcomm;
 +    ivec            ddxyz;
 +    int             dd_node_order;
 +    real            rdd;
 +    real            rconstr;
 +    const char     *dddlb_opt;
 +    real            dlb_scale;
 +    const char     *ddcsx;
 +    const char     *ddcsy;
 +    const char     *ddcsz;
 +    const char     *nbpu_opt;
 +    gmx_large_int_t nsteps_cmdline;
 +    int             nstepout;
 +    int             resetstep;
 +    int             nmultisim;
 +    int             repl_ex_nst;
 +    int             repl_ex_nex;
 +    int             repl_ex_seed;
 +    real            pforce;
 +    real            cpt_period;
 +    real            max_hours;
 +    const char     *deviceOptions;
 +    unsigned long   Flags;
 +    int             ret; /* return value */
 +};
 +
 +
 +/* The function used for spawning threads. Extracts the mdrunner()
 +   arguments from its one argument and calls mdrunner(), after making
 +   a commrec. */
 +static void mdrunner_start_fn(void *arg)
 +{
 +    struct mdrunner_arglist *mda = (struct mdrunner_arglist*)arg;
 +    struct mdrunner_arglist  mc  = *mda; /* copy the arg list to make sure
 +                                            that it's thread-local. This doesn't
 +                                            copy pointed-to items, of course,
 +                                            but those are all const. */
 +    t_commrec *cr;                       /* we need a local version of this */
 +    FILE      *fplog = NULL;
 +    t_filenm  *fnm;
 +
 +    fnm = dup_tfn(mc.nfile, mc.fnm);
 +
 +    cr = init_par_threads(mc.cr);
 +
 +    if (MASTER(cr))
 +    {
 +        fplog = mc.fplog;
 +    }
 +
 +    mda->ret = mdrunner(mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv,
 +                        mc.bVerbose, mc.bCompact, mc.nstglobalcomm,
 +                        mc.ddxyz, mc.dd_node_order, mc.rdd,
 +                        mc.rconstr, mc.dddlb_opt, mc.dlb_scale,
 +                        mc.ddcsx, mc.ddcsy, mc.ddcsz,
 +                        mc.nbpu_opt,
 +                        mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
 +                        mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce,
 +                        mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.Flags);
 +}
 +
 +/* called by mdrunner() to start a specific number of threads (including
 +   the main thread) for thread-parallel runs. This in turn calls mdrunner()
 +   for each thread.
 +   All options besides nthreads are the same as for mdrunner(). */
 +static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
 +                                         FILE *fplog, t_commrec *cr, int nfile,
 +                                         const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
 +                                         gmx_bool bCompact, int nstglobalcomm,
 +                                         ivec ddxyz, int dd_node_order, real rdd, real rconstr,
 +                                         const char *dddlb_opt, real dlb_scale,
 +                                         const char *ddcsx, const char *ddcsy, const char *ddcsz,
 +                                         const char *nbpu_opt,
 +                                         gmx_large_int_t nsteps_cmdline,
 +                                         int nstepout, int resetstep,
 +                                         int nmultisim, int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
 +                                         real pforce, real cpt_period, real max_hours,
 +                                         const char *deviceOptions, unsigned long Flags)
 +{
 +    int                      ret;
 +    struct mdrunner_arglist *mda;
 +    t_commrec               *crn; /* the new commrec */
 +    t_filenm                *fnmn;
 +
 +    /* first check whether we even need to start tMPI */
 +    if (hw_opt->nthreads_tmpi < 2)
 +    {
 +        return cr;
 +    }
 +
 +    /* a few small, one-time, almost unavoidable memory leaks: */
 +    snew(mda, 1);
 +    fnmn = dup_tfn(nfile, fnm);
 +
 +    /* fill the data structure to pass as void pointer to thread start fn */
 +    mda->hw_opt         = hw_opt;
 +    mda->fplog          = fplog;
 +    mda->cr             = cr;
 +    mda->nfile          = nfile;
 +    mda->fnm            = fnmn;
 +    mda->oenv           = oenv;
 +    mda->bVerbose       = bVerbose;
 +    mda->bCompact       = bCompact;
 +    mda->nstglobalcomm  = nstglobalcomm;
 +    mda->ddxyz[XX]      = ddxyz[XX];
 +    mda->ddxyz[YY]      = ddxyz[YY];
 +    mda->ddxyz[ZZ]      = ddxyz[ZZ];
 +    mda->dd_node_order  = dd_node_order;
 +    mda->rdd            = rdd;
 +    mda->rconstr        = rconstr;
 +    mda->dddlb_opt      = dddlb_opt;
 +    mda->dlb_scale      = dlb_scale;
 +    mda->ddcsx          = ddcsx;
 +    mda->ddcsy          = ddcsy;
 +    mda->ddcsz          = ddcsz;
 +    mda->nbpu_opt       = nbpu_opt;
 +    mda->nsteps_cmdline = nsteps_cmdline;
 +    mda->nstepout       = nstepout;
 +    mda->resetstep      = resetstep;
 +    mda->nmultisim      = nmultisim;
 +    mda->repl_ex_nst    = repl_ex_nst;
 +    mda->repl_ex_nex    = repl_ex_nex;
 +    mda->repl_ex_seed   = repl_ex_seed;
 +    mda->pforce         = pforce;
 +    mda->cpt_period     = cpt_period;
 +    mda->max_hours      = max_hours;
 +    mda->deviceOptions  = deviceOptions;
 +    mda->Flags          = Flags;
 +
 +    /* now spawn new threads that start mdrunner_start_fn() while the
 +       main thread returns; we set thread affinity later */
 +    ret = tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi, TMPI_AFFINITY_NONE,
 +                       mdrunner_start_fn, (void*)(mda) );
 +    if (ret != TMPI_SUCCESS)
 +    {
 +        return NULL;
 +    }
 +
 +    /* make a new comm_rec to reflect the new situation */
 +    crn = init_par_threads(cr);
 +    return crn;
 +}
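The spawn pattern above, one shared arglist copied thread-locally with the return value travelling back through the struct, is worth a tiny standalone model. Plain pthreads stands in for tMPI here, and the fields are hypothetical:

    #include <pthread.h>
    #include <stdio.h>

    /* The same pattern as mdrunner_start_fn: pass one struct pointer,
     * copy it to a thread-local value first thing, then use the copy.
     */
    typedef struct { int nfile; const char *nbpu_opt; int ret; } arglist_t;

    static void *start_fn(void *arg)
    {
        arglist_t *shared = (arglist_t *)arg;
        arglist_t  local  = *shared;  /* thread-local copy, as in mdrunner_start_fn */

        printf("thread sees nfile=%d nbpu_opt=%s\n", local.nfile, local.nbpu_opt);
        shared->ret = 0;              /* return value travels back via the struct */
        return NULL;
    }

    int main(void)
    {
        arglist_t mda = { 3, "auto", -1 };  /* hypothetical option values */
        pthread_t th;

        pthread_create(&th, NULL, start_fn, &mda);
        pthread_join(th, NULL);
        printf("ret=%d\n", mda.ret);
        return 0;
    }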
 +
 +
 +static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
 +                                        const gmx_hw_opt_t  *hw_opt,
 +                                        int                  nthreads_tot,
 +                                        int                  ngpu)
 +{
 +    int nthreads_tmpi;
 +
 +    /* There are no separate PME nodes here: check_and_update_hw_opt
 +     * ensured that nthreads_tmpi>0 when PME nodes are in use, and a
 +     * conditional ensures we would not have ended up here otherwise.
 +     * Note that separate PME nodes might be switched on later.
 +     */
 +    if (ngpu > 0)
 +    {
 +        nthreads_tmpi = ngpu;
 +        if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
 +        {
 +            nthreads_tmpi = nthreads_tot;
 +        }
 +    }
 +    else if (hw_opt->nthreads_omp > 0)
 +    {
 +        /* Here we could oversubscribe; when we do, we issue a warning later */
 +        nthreads_tmpi = max(1, nthreads_tot/hw_opt->nthreads_omp);
 +    }
 +    else
 +    {
 +        /* TODO choose nthreads_omp based on hardware topology
 +           when we have a hardware topology detection library */
 +        /* In general, when running up to 4 threads, OpenMP should be faster.
 +         * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
 +         * On Intel CPUs from Nehalem on, running OpenMP on a single CPU is
 +         * always faster, and even on two CPUs it's usually faster (with many
 +         * OpenMP threads it could be faster not to use HT, but currently we
 +         * always use HT).
 +         * On Nehalem/Westmere we want to avoid running 16 threads over
 +         * two CPUs with HT, so we need a limit < 16; thus we use 12.
 +         * A reasonable limit for Intel Sandy Bridge and Ivy Bridge,
 +         * not knowing the topology, is 16 threads.
 +         */
 +        const int nthreads_omp_always_faster             =  4;
 +        const int nthreads_omp_always_faster_Nehalem     = 12;
 +        const int nthreads_omp_always_faster_SandyBridge = 16;
 +        const int first_model_Nehalem                    = 0x1A;
 +        const int first_model_SandyBridge                = 0x2A;
 +        gmx_bool  bIntel_Family6;
 +
 +        bIntel_Family6 =
 +            (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
 +             gmx_cpuid_family(hwinfo->cpuid_info) == 6);
 +
 +        if (nthreads_tot <= nthreads_omp_always_faster ||
 +            (bIntel_Family6 &&
 +             ((gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_Nehalem && nthreads_tot <= nthreads_omp_always_faster_Nehalem) ||
 +              (gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_SandyBridge && nthreads_tot <= nthreads_omp_always_faster_SandyBridge))))
 +        {
 +            /* Use pure OpenMP parallelization */
 +            nthreads_tmpi = 1;
 +        }
 +        else
 +        {
 +            /* Don't use OpenMP parallelization */
 +            nthreads_tmpi = nthreads_tot;
 +        }
 +    }
 +
 +    return nthreads_tmpi;
 +}
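 +/* Illustrative outcomes of the heuristic above (example numbers, not taken
 + * from any particular machine):
 + * - ngpu=2, nthreads_tot=16: 2 tMPI ranks, one per GPU;
 + * - nthreads_omp=4, nthreads_tot=16: max(1, 16/4) = 4 tMPI ranks;
 + * - Intel Family 6, model >= 0x2A, nthreads_tot=16: 1 tMPI rank with pure
 + *   OpenMP parallelization.
 + */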
 +
 +
 +/* Get the number of threads to use for thread-MPI based on how many
 + * were requested, which algorithms we're using,
 + * and how many particles there are.
 + * At this point we have already called check_and_update_hw_opt,
 + * so all options should be internally consistent and consistent
 + * with the hardware, except that ntmpi could be larger than #GPU.
 + */
 +static int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
 +                            gmx_hw_opt_t *hw_opt,
 +                            t_inputrec *inputrec, gmx_mtop_t *mtop,
 +                            const t_commrec *cr,
 +                            FILE *fplog)
 +{
 +    int      nthreads_hw, nthreads_tot_max, nthreads_tmpi, nthreads_new, ngpu;
 +    int      min_atoms_per_mpi_thread;
 +    char    *env;
 +    char     sbuf[STRLEN];
 +    gmx_bool bCanUseGPU;
 +
 +    if (hw_opt->nthreads_tmpi > 0)
 +    {
 +        /* Trivial, return right away */
 +        return hw_opt->nthreads_tmpi;
 +    }
 +
 +    nthreads_hw = hwinfo->nthreads_hw_avail;
 +
 +    /* How many total (#tMPI*#OpenMP) threads can we start? */
 +    if (hw_opt->nthreads_tot > 0)
 +    {
 +        nthreads_tot_max = hw_opt->nthreads_tot;
 +    }
 +    else
 +    {
 +        nthreads_tot_max = nthreads_hw;
 +    }
 +
 +    bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU);
 +    if (bCanUseGPU)
 +    {
 +        ngpu = hwinfo->gpu_info.ncuda_dev_use;
 +    }
 +    else
 +    {
 +        ngpu = 0;
 +    }
 +
 +    nthreads_tmpi =
 +        get_tmpi_omp_thread_division(hwinfo, hw_opt, nthreads_tot_max, ngpu);
 +
 +    if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
 +    {
-         if (!EI_TPI(inputrec->eI))
++        /* Dimensions/steps are divided over the nodes instead of splitting the atoms */
 +        min_atoms_per_mpi_thread = 0;
 +    }
 +    else
 +    {
 +        if (bCanUseGPU)
 +        {
 +            min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU;
 +        }
 +        else
 +        {
 +            min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD;
 +        }
 +    }
 +
 +    /* Check if the chosen algorithm does not support parallel simulation */
 +    if (nthreads_tmpi != 1 &&
 +        ( inputrec->eI == eiLBFGS ||
 +          inputrec->coulombtype == eelEWALD ) )
 +    {
 +        nthreads_tmpi = 1;
 +
 +        md_print_warn(cr, fplog, "The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
 +        if (hw_opt->nthreads_tmpi > nthreads_tmpi)
 +        {
 +            gmx_fatal(FARGS, "You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
 +        }
 +    }
 +    else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread)
 +    {
 +        /* the thread number was chosen automatically, but there are too many
 +           threads (too few atoms per thread) */
 +        nthreads_new = max(1, mtop->natoms/min_atoms_per_mpi_thread);
 +
 +        /* Avoid partial use of Hyper-Threading */
 +        if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
 +            nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw)
 +        {
 +            nthreads_new = nthreads_hw/2;
 +        }
 +
 +        /* Avoid large prime numbers in the thread count */
 +        if (nthreads_new >= 6)
 +        {
 +            /* Use only 6,8,10 with additional factors of 2 */
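 +            /* Worked example (illustrative): for nthreads_new = 14, fac
 +             * doubles while 3*fac*2 <= 14, giving fac = 4 (12 <= 14 but
 +             * 24 > 14), so 14 is rounded down to (14/4)*4 = 12. */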
 +            int fac;
 +
 +            fac = 2;
 +            while (3*fac*2 <= nthreads_new)
 +            {
 +                fac *= 2;
 +            }
 +
 +            nthreads_new = (nthreads_new/fac)*fac;
 +        }
 +        else
 +        {
 +            /* Avoid 5 */
 +            if (nthreads_new == 5)
 +            {
 +                nthreads_new = 4;
 +            }
 +        }
 +
 +        nthreads_tmpi = nthreads_new;
 +
 +        fprintf(stderr, "\n");
 +        fprintf(stderr, "NOTE: Parallelization is limited by the small number of atoms,\n");
 +        fprintf(stderr, "      only starting %d thread-MPI threads.\n", nthreads_tmpi);
 +        fprintf(stderr, "      You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
 +    }
 +
 +    return nthreads_tmpi;
 +}
 +#endif /* GMX_THREAD_MPI */
 +
 +
 +/* Environment variable for setting nstlist */
 +static const char*  NSTLIST_ENVVAR          =  "GMX_NSTLIST";
 +/* Try to increase nstlist when using a GPU with nstlist less than this */
 +static const int    NSTLIST_GPU_ENOUGH      = 20;
 +/* Increase nstlist until the non-bonded cost increases more than this factor */
 +static const float  NBNXN_GPU_LIST_OK_FAC   = 1.25;
 +/* Don't increase nstlist beyond a non-bonded cost increase of this factor */
 +static const float  NBNXN_GPU_LIST_MAX_FAC  = 1.40;
 +
 +/* Try to increase nstlist when running on a GPU */
 +static void increase_nstlist(FILE *fp, t_commrec *cr,
 +                             t_inputrec *ir, const gmx_mtop_t *mtop, matrix box)
 +{
 +    char                  *env;
 +    int                    nstlist_orig, nstlist_prev;
 +    verletbuf_list_setup_t ls;
 +    real                   rlist_inc, rlist_ok, rlist_max, rlist_new, rlist_prev;
 +    int                    i;
 +    t_state                state_tmp;
 +    gmx_bool               bBox, bDD, bCont;
 +    const char            *nstl_fmt = "\nFor optimal performance with a GPU, nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
 +    const char            *vbd_err  = "Cannot increase nstlist for GPU run because verlet-buffer-drift is not set or used";
 +    const char            *box_err  = "Cannot increase nstlist for GPU run because the box is too small";
 +    const char            *dd_err   = "Cannot increase nstlist for GPU run because of domain decomposition limitations";
 +    char                   buf[STRLEN];
 +
 +    /* Alternative nstlist values to try when switching; NNSTL is their count */
 +    const int nstl[] = { 20, 25, 40, 50 };
 +#define NNSTL  (sizeof(nstl)/sizeof(nstl[0]))
 +
 +    env = getenv(NSTLIST_ENVVAR);
 +    if (env == NULL)
 +    {
 +        if (fp != NULL)
 +        {
 +            fprintf(fp, nstl_fmt, ir->nstlist);
 +        }
 +    }
 +
 +    if (ir->verletbuf_drift == 0)
 +    {
 +        gmx_fatal(FARGS, "You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
 +    }
 +
 +    if (ir->verletbuf_drift < 0)
 +    {
 +        if (MASTER(cr))
 +        {
 +            fprintf(stderr, "%s\n", vbd_err);
 +        }
 +        if (fp != NULL)
 +        {
 +            fprintf(fp, "%s\n", vbd_err);
 +        }
 +
 +        return;
 +    }
 +
 +    nstlist_orig = ir->nstlist;
 +    if (env != NULL)
 +    {
 +        sprintf(buf, "Getting nstlist from environment variable GMX_NSTLIST=%s", env);
 +        if (MASTER(cr))
 +        {
 +            fprintf(stderr, "%s\n", buf);
 +        }
 +        if (fp != NULL)
 +        {
 +            fprintf(fp, "%s\n", buf);
 +        }
 +        sscanf(env, "%d", &ir->nstlist);
 +    }
 +
 +    verletbuf_get_list_setup(TRUE, &ls);
 +
 +    /* Allow rlist to make the list double the size of the cut-off sphere */
 +    rlist_inc = nbnxn_get_rlist_effective_inc(NBNXN_GPU_CLUSTER_SIZE, mtop->natoms/det(box));
 +    rlist_ok  = (max(ir->rvdw, ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_OK_FAC, 1.0/3.0) - rlist_inc;
 +    rlist_max = (max(ir->rvdw, ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_MAX_FAC, 1.0/3.0) - rlist_inc;
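 +    /* Sketch of the reasoning behind the factors above: the pair-list cost
 +     * scales roughly with the volume of a sphere of radius rlist+rlist_inc,
 +     * so allowing the cost to grow by a factor F corresponds to a radius of
 +     * (max(rvdw,rcoulomb) + rlist_inc)*F^(1/3) - rlist_inc, with F set to
 +     * NBNXN_GPU_LIST_OK_FAC and NBNXN_GPU_LIST_MAX_FAC respectively. */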
 +    if (debug)
 +    {
 +        fprintf(debug, "GPU nstlist tuning: rlist_inc %.3f rlist_max %.3f\n",
 +                rlist_inc, rlist_max);
 +    }
 +
 +    i            = 0;
 +    nstlist_prev = nstlist_orig;
 +    rlist_prev   = ir->rlist;
 +    do
 +    {
 +        if (env == NULL)
 +        {
 +            ir->nstlist = nstl[i];
 +        }
 +
 +        /* Set the pair-list buffer size in ir */
 +        calc_verlet_buffer_size(mtop, det(box), ir, ir->verletbuf_drift, &ls,
 +                                NULL, &rlist_new);
 +
 +        /* Does rlist fit in the box? */
 +        bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC, box));
 +        bDD  = TRUE;
 +        if (bBox && DOMAINDECOMP(cr))
 +        {
 +            /* Check if rlist fits in the domain decomposition */
 +            if (inputrec2nboundeddim(ir) < DIM)
 +            {
 +                gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
 +            }
 +            copy_mat(box, state_tmp.box);
 +            bDD = change_dd_cutoff(cr, &state_tmp, ir, rlist_new);
 +        }
 +
 +        bCont = FALSE;
 +
 +        if (env == NULL)
 +        {
 +            if (bBox && bDD && rlist_new <= rlist_max)
 +            {
 +                /* Increase nstlist */
 +                nstlist_prev = ir->nstlist;
 +                rlist_prev   = rlist_new;
 +                bCont        = (i+1 < NNSTL && rlist_new < rlist_ok);
 +            }
 +            else
 +            {
 +                /* Stick with the previous nstlist */
 +                ir->nstlist = nstlist_prev;
 +                rlist_new   = rlist_prev;
 +                bBox        = TRUE;
 +                bDD         = TRUE;
 +            }
 +        }
 +
 +        i++;
 +    }
 +    while (bCont);
 +
 +    if (!bBox || !bDD)
 +    {
 +        gmx_warning("%s", !bBox ? box_err : dd_err);
 +        if (fp != NULL)
 +        {
 +            fprintf(fp, "\n%s\n", bBox ? box_err : dd_err);
 +        }
 +        ir->nstlist = nstlist_orig;
 +    }
 +    else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
 +    {
 +        sprintf(buf, "Changing nstlist from %d to %d, rlist from %g to %g",
 +                nstlist_orig, ir->nstlist,
 +                ir->rlist, rlist_new);
 +        if (MASTER(cr))
 +        {
 +            fprintf(stderr, "%s\n\n", buf);
 +        }
 +        if (fp != NULL)
 +        {
 +            fprintf(fp, "%s\n\n", buf);
 +        }
 +        ir->rlist     = rlist_new;
 +        ir->rlistlong = rlist_new;
 +    }
 +}
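 +/* Illustrative trace of increase_nstlist (made-up numbers): starting from
 + * nstlist=10, the loop above tries 20, 25, 40, 50 in turn; it keeps
 + * increasing nstlist while the buffered rlist stays below rlist_ok, and it
 + * reverts to the previous value when rlist exceeds rlist_max or the box/DD
 + * checks fail. */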
 +
 +static void prepare_verlet_scheme(FILE                           *fplog,
 +                                  const gmx_hw_info_t            *hwinfo,
 +                                  t_commrec                      *cr,
 +                                  t_inputrec                     *ir,
 +                                  const gmx_mtop_t               *mtop,
 +                                  matrix                          box,
 +                                  gmx_bool                       *bUseGPU)
 +{
 +    /* Here we only check for GPU usage on the MPI master process,
 +     * as we don't know yet how many GPUs we will use.
 +     * We check for a GPU on all processes later.
 +     */
 +    *bUseGPU = hwinfo->bCanUseGPU || (getenv("GMX_EMULATE_GPU") != NULL);
 +
 +    if (ir->verletbuf_drift > 0)
 +    {
 +        /* Update the Verlet buffer size for the current run setup */
 +        verletbuf_list_setup_t ls;
 +        real                   rlist_new;
 +
 +        /* Here we assume CPU acceleration is on. This is OK because
 +         * currently calc_verlet_buffer_size gives the same results for
 +         * 4x8 and 4x4, and 4x2 gives a larger buffer than 4x4.
 +         */
 +        verletbuf_get_list_setup(*bUseGPU, &ls);
 +
 +        calc_verlet_buffer_size(mtop, det(box), ir,
 +                                ir->verletbuf_drift, &ls,
 +                                NULL, &rlist_new);
 +        if (rlist_new != ir->rlist)
 +        {
 +            if (fplog != NULL)
 +            {
 +                fprintf(fplog, "\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
 +                        ir->rlist, rlist_new,
 +                        ls.cluster_size_i, ls.cluster_size_j);
 +            }
 +            ir->rlist     = rlist_new;
 +            ir->rlistlong = rlist_new;
 +        }
 +    }
 +
 +    /* With GPU or emulation we should check nstlist for performance */
 +    if ((EI_DYNAMICS(ir->eI) &&
 +         *bUseGPU &&
 +         ir->nstlist < NSTLIST_GPU_ENOUGH) ||
 +        getenv(NSTLIST_ENVVAR) != NULL)
 +    {
 +        /* Choose a better nstlist */
 +        increase_nstlist(fplog, cr, ir, mtop, box);
 +    }
 +}
 +
 +static void convert_to_verlet_scheme(FILE *fplog,
 +                                     t_inputrec *ir,
 +                                     gmx_mtop_t *mtop, real box_vol)
 +{
 +    const char *conv_mesg = "Converting input file with group cut-off scheme to the Verlet cut-off scheme";
 +
 +    md_print_warn(NULL, fplog, "%s\n", conv_mesg);
 +
 +    ir->cutoff_scheme   = ecutsVERLET;
 +    ir->verletbuf_drift = 0.005;
 +
 +    if (ir->rcoulomb != ir->rvdw)
 +    {
 +        gmx_fatal(FARGS, "The VdW and Coulomb cut-offs are different, whereas the Verlet scheme only supports equal cut-offs");
 +    }
 +
 +    if (ir->vdwtype == evdwUSER || EEL_USER(ir->coulombtype))
 +    {
 +        gmx_fatal(FARGS, "User non-bonded potentials are not (yet) supported with the Verlet scheme");
 +    }
 +    else if (EVDW_SWITCHED(ir->vdwtype) || EEL_SWITCHED(ir->coulombtype))
 +    {
 +        md_print_warn(NULL, fplog, "Converting switched or shifted interactions to a shifted potential (without force shift), this will lead to slightly different interaction potentials");
 +
 +        if (EVDW_SWITCHED(ir->vdwtype))
 +        {
 +            ir->vdwtype = evdwCUT;
 +        }
 +        if (EEL_SWITCHED(ir->coulombtype))
 +        {
 +            if (EEL_FULL(ir->coulombtype))
 +            {
 +                /* With full electrostatic only PME can be switched */
 +                ir->coulombtype = eelPME;
 +            }
 +            else
 +            {
 +                md_print_warn(NULL, fplog, "NOTE: Replacing %s electrostatics with reaction-field with epsilon-rf=inf\n", eel_names[ir->coulombtype]);
 +                ir->coulombtype = eelRF;
 +                ir->epsilon_rf  = 0.0;
 +            }
 +        }
 +
 +        /* We set the target energy drift to a small number.
 +         * Note that this is only for testing. For production the user
 +         * should think about this and set the mdp options.
 +         */
 +        ir->verletbuf_drift = 1e-4;
 +    }
 +
 +    if (inputrec2nboundeddim(ir) != 3)
 +    {
 +        gmx_fatal(FARGS, "Can only convert old tpr files to the Verlet cut-off scheme with 3D pbc");
 +    }
 +
 +    if (ir->efep != efepNO || ir->implicit_solvent != eisNO)
 +    {
 +        gmx_fatal(FARGS, "Will not convert old tpr files to the Verlet cut-off scheme with free-energy calculations or implicit solvent");
 +    }
 +
 +    if (EI_DYNAMICS(ir->eI) && !(EI_MD(ir->eI) && ir->etc == etcNO))
 +    {
 +        verletbuf_list_setup_t ls;
 +
 +        verletbuf_get_list_setup(FALSE, &ls);
 +        calc_verlet_buffer_size(mtop, box_vol, ir, ir->verletbuf_drift, &ls,
 +                                NULL, &ir->rlist);
 +    }
 +    else
 +    {
 +        ir->verletbuf_drift = -1;
 +        ir->rlist           = 1.05*max(ir->rvdw, ir->rcoulomb);
 +    }
 +
 +    gmx_mtop_remove_chargegroups(mtop);
 +}
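 +/* Example of the conversion above (hypothetical mdp settings): an old tpr
 + * with vdwtype=Switch and coulombtype=PME-Switch would end up with
 + * vdwtype=Cut-off, coulombtype=PME and verletbuf_drift=1e-4, plus the
 + * warning about the changed interaction potentials. */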
 +
 +static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt,
 +                                    int           cutoff_scheme,
 +                                    gmx_bool      bIsSimMaster)
 +{
 +    gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp, bIsSimMaster);
 +
 +#ifndef GMX_THREAD_MPI
 +    if (hw_opt->nthreads_tot > 0)
 +    {
 +        gmx_fatal(FARGS, "Setting the total number of threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
 +    }
 +    if (hw_opt->nthreads_tmpi > 0)
 +    {
 +        gmx_fatal(FARGS, "Setting the number of thread-MPI threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
 +    }
 +#endif
 +
 +    if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
 +    {
 +        /* We have the same number of OpenMP threads for PP and PME processes;
 +         * thus we can perform several consistency checks.
 +         */
 +        if (hw_opt->nthreads_tmpi > 0 &&
 +            hw_opt->nthreads_omp > 0 &&
 +            hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
 +        {
 +            gmx_fatal(FARGS, "The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
 +                      hw_opt->nthreads_tot, hw_opt->nthreads_tmpi, hw_opt->nthreads_omp);
 +        }
 +
 +        if (hw_opt->nthreads_tmpi > 0 &&
 +            hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
 +        {
 +            gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
 +                      hw_opt->nthreads_tot, hw_opt->nthreads_tmpi);
 +        }
 +
 +        if (hw_opt->nthreads_omp > 0 &&
 +            hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
 +        {
 +            gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
 +                      hw_opt->nthreads_tot, hw_opt->nthreads_omp);
 +        }
 +
 +        if (hw_opt->nthreads_tmpi > 0 &&
 +            hw_opt->nthreads_omp <= 0)
 +        {
 +            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
 +        }
 +    }
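 +    /* Illustrative outcomes of the checks above (example option values):
 +     * -nt 12 -ntmpi 3          -> nthreads_omp is derived as 12/3 = 4;
 +     * -nt 12 -ntmpi 5          -> fatal, 12 is not divisible by 5;
 +     * -nt 12 -ntmpi 3 -ntomp 5 -> fatal, 3*5 != 12. */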
 +
 +#ifndef GMX_OPENMP
 +    if (hw_opt->nthreads_omp > 1)
 +    {
 +        gmx_fatal(FARGS, "OpenMP threads are requested, but Gromacs was compiled without OpenMP support");
 +    }
 +#endif
 +
 +    if (cutoff_scheme == ecutsGROUP)
 +    {
 +        /* We only have OpenMP support for PME-only nodes */
 +        if (hw_opt->nthreads_omp > 1)
 +        {
 +            gmx_fatal(FARGS, "OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
 +                      ecutscheme_names[cutoff_scheme],
 +                      ecutscheme_names[ecutsVERLET]);
 +        }
 +        hw_opt->nthreads_omp = 1;
 +    }
 +
 +    if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
 +    {
 +        gmx_fatal(FARGS, "You need to specify -ntomp in addition to -ntomp_pme");
 +    }
 +
 +    if (hw_opt->nthreads_tot == 1)
 +    {
 +        hw_opt->nthreads_tmpi = 1;
 +
 +        if (hw_opt->nthreads_omp > 1)
 +        {
 +            gmx_fatal(FARGS, "You requested %d OpenMP threads with %d total threads",
 +                      hw_opt->nthreads_omp, hw_opt->nthreads_tot);
 +        }
 +        hw_opt->nthreads_omp = 1;
 +    }
 +
 +    if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
 +    {
 +        hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
 +                hw_opt->nthreads_tot,
 +                hw_opt->nthreads_tmpi,
 +                hw_opt->nthreads_omp,
 +                hw_opt->nthreads_omp_pme,
 +                hw_opt->gpu_id != NULL ? hw_opt->gpu_id : "");
 +
 +    }
 +}
 +
 +
 +/* Override the value in inputrec with the value passed on the command line (if any) */
 +static void override_nsteps_cmdline(FILE            *fplog,
 +                                    gmx_large_int_t  nsteps_cmdline,
 +                                    t_inputrec      *ir,
 +                                    const t_commrec *cr)
 +{
 +    char sbuf[STEPSTRSIZE];
 +
 +    assert(ir);
 +    assert(cr);
 +
 +    /* Override with anything other than the default -2 */
 +    if (nsteps_cmdline > -2)
 +    {
 +        char stmp[STRLEN];
 +
 +        ir->nsteps = nsteps_cmdline;
 +        if (EI_DYNAMICS(ir->eI))
 +        {
 +            sprintf(stmp, "Overriding nsteps with value passed on the command line: %s steps, %.3f ps",
 +                    gmx_step_str(nsteps_cmdline, sbuf),
 +                    nsteps_cmdline*ir->delta_t);
 +        }
 +        else
 +        {
 +            sprintf(stmp, "Overriding nsteps with value passed on the command line: %s steps",
 +                    gmx_step_str(nsteps_cmdline, sbuf));
 +        }
 +
 +        md_print_warn(cr, fplog, "%s\n", stmp);
 +    }
 +}
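 +/* Example (hypothetical command line): mdrun -nsteps 1000 with
 + * delta_t = 0.002 would print "Overriding nsteps with value passed on the
 + * command line: 1000 steps, 2.000 ps". */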
 +
 +/* Data structure set by SIMMASTER which needs to be passed to all nodes
 + * before the other nodes have read the tpx file and called gmx_detect_hardware.
 + */
 +typedef struct {
 +    int      cutoff_scheme; /* The cutoff scheme from inputrec_t */
 +    gmx_bool bUseGPU;       /* Use GPU or GPU emulation          */
 +} master_inf_t;
 +
 +int mdrunner(gmx_hw_opt_t *hw_opt,
 +             FILE *fplog, t_commrec *cr, int nfile,
 +             const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
 +             gmx_bool bCompact, int nstglobalcomm,
 +             ivec ddxyz, int dd_node_order, real rdd, real rconstr,
 +             const char *dddlb_opt, real dlb_scale,
 +             const char *ddcsx, const char *ddcsy, const char *ddcsz,
 +             const char *nbpu_opt,
 +             gmx_large_int_t nsteps_cmdline, int nstepout, int resetstep,
 +             int nmultisim, int repl_ex_nst, int repl_ex_nex,
 +             int repl_ex_seed, real pforce, real cpt_period, real max_hours,
 +             const char *deviceOptions, unsigned long Flags)
 +{
 +    gmx_bool        bForceUseGPU, bTryUseGPU;
 +    double          nodetime = 0, realtime;
 +    t_inputrec     *inputrec;
 +    t_state        *state = NULL;
 +    matrix          box;
 +    gmx_ddbox_t     ddbox = {0};
 +    int             npme_major, npme_minor;
 +    real            tmpr1, tmpr2;
 +    t_nrnb         *nrnb;
 +    gmx_mtop_t     *mtop       = NULL;
 +    t_mdatoms      *mdatoms    = NULL;
 +    t_forcerec     *fr         = NULL;
 +    t_fcdata       *fcd        = NULL;
 +    real            ewaldcoeff = 0;
 +    gmx_pme_t      *pmedata    = NULL;
 +    gmx_vsite_t    *vsite      = NULL;
 +    gmx_constr_t    constr;
 +    int             i, m, nChargePerturbed = -1, status, nalloc;
 +    char           *gro;
 +    gmx_wallcycle_t wcycle;
 +    gmx_bool        bReadRNG, bReadEkin;
 +    int             list;
 +    gmx_runtime_t   runtime;
 +    int             rc;
 +    gmx_large_int_t reset_counters;
 +    gmx_edsam_t     ed           = NULL;
 +    t_commrec      *cr_old       = cr;
 +    int             nthreads_pme = 1;
 +    int             nthreads_pp  = 1;
 +    gmx_membed_t    membed       = NULL;
 +    gmx_hw_info_t  *hwinfo       = NULL;
 +    master_inf_t    minf         = {-1, FALSE};
 +
 +    /* CAUTION: threads may be started later on in this function, so
 +       cr doesn't reflect the final parallel state right now */
 +    snew(inputrec, 1);
 +    snew(mtop, 1);
 +
 +    if (Flags & MD_APPENDFILES)
 +    {
 +        fplog = NULL;
 +    }
 +
 +    bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
 +    bTryUseGPU   = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
 +
 +    /* Detect hardware, gather information. This is an operation that is
 +     * global for this process (MPI rank). */
 +    hwinfo = gmx_detect_hardware(fplog, cr,
 +                                 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
 +
 +
 +    snew(state, 1);
 +    if (SIMMASTER(cr))
 +    {
 +        /* Read (nearly) all data required for the simulation */
 +        read_tpx_state(ftp2fn(efTPX, nfile, fnm), inputrec, state, NULL, mtop);
 +
 +        if (inputrec->cutoff_scheme != ecutsVERLET &&
 +            ((Flags & MD_TESTVERLET) || getenv("GMX_VERLET_SCHEME") != NULL))
 +        {
 +            convert_to_verlet_scheme(fplog, inputrec, mtop, det(state->box));
 +        }
 +
 +
 +        minf.cutoff_scheme = inputrec->cutoff_scheme;
 +        minf.bUseGPU       = FALSE;
 +
 +        if (inputrec->cutoff_scheme == ecutsVERLET)
 +        {
 +            prepare_verlet_scheme(fplog, hwinfo, cr,
 +                                  inputrec, mtop, state->box,
 +                                  &minf.bUseGPU);
 +        }
 +        else if (hwinfo->bCanUseGPU)
 +        {
 +            md_print_warn(cr, fplog,
 +                          "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
 +                          "      To use a GPU, set the mdp option: cutoff-scheme = Verlet\n"
 +                          "      (for quick performance testing you can use the -testverlet option)\n");
 +
 +            if (bForceUseGPU)
 +            {
 +                gmx_fatal(FARGS, "GPU requested, but can't be used without cutoff-scheme=Verlet");
 +            }
 +        }
 +    }
 +#ifndef GMX_THREAD_MPI
 +    if (PAR(cr))
 +    {
 +        gmx_bcast_sim(sizeof(minf), &minf, cr);
 +    }
 +#endif
 +    if (minf.bUseGPU && cr->npmenodes == -1)
 +    {
 +        /* Don't automatically use PME-only nodes with GPUs */
 +        cr->npmenodes = 0;
 +    }
 +
 +    /* Check for externally set OpenMP affinity and turn off internal
 +     * pinning if any is found. We need to do this check early to tell
 +     * thread-MPI whether it should do pinning when spawning threads.
 +     * TODO: the above no longer holds; we should move these checks down
 +     */
 +    gmx_omp_check_thread_affinity(fplog, cr, hw_opt);
 +
 +#ifdef GMX_THREAD_MPI
 +    /* With thread-MPI inputrec is only set here on the master thread */
 +    if (SIMMASTER(cr))
 +#endif
 +    {
 +        check_and_update_hw_opt(hw_opt, minf.cutoff_scheme, SIMMASTER(cr));
 +
 +#ifdef GMX_THREAD_MPI
 +        /* Early check for externally set process affinity. Can't do over all
 +         * MPI processes because hwinfo is not available everywhere, but with
 +         * thread-MPI it's needed, as pinning might get turned off, which
 +         * needs to be known before starting thread-MPI. */
 +        gmx_check_thread_affinity_set(fplog,
 +                                      NULL,
 +                                      hw_opt, hwinfo->nthreads_hw_avail, FALSE);
 +#endif
 +
 +#ifdef GMX_THREAD_MPI
 +        if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
 +        {
 +            gmx_fatal(FARGS, "You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME nodes");
 +        }
 +#endif
 +
 +        if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
 +            cr->npmenodes <= 0)
 +        {
 +            gmx_fatal(FARGS, "You need to explicitly specify the number of PME nodes (-npme) when using different number of OpenMP threads for PP and PME nodes");
 +        }
 +    }
 +
 +#ifdef GMX_THREAD_MPI
 +    if (SIMMASTER(cr))
 +    {
 +        /* NOW the threads will be started: */
 +        hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
 +                                                 hw_opt,
 +                                                 inputrec, mtop,
 +                                                 cr, fplog);
 +        if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
 +        {
 +            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
 +        }
 +
 +        if (hw_opt->nthreads_tmpi > 1)
 +        {
 +            /* now start the threads. */
 +            cr = mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm,
 +                                        oenv, bVerbose, bCompact, nstglobalcomm,
 +                                        ddxyz, dd_node_order, rdd, rconstr,
 +                                        dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
 +                                        nbpu_opt,
 +                                        nsteps_cmdline, nstepout, resetstep, nmultisim,
 +                                        repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
 +                                        cpt_period, max_hours, deviceOptions,
 +                                        Flags);
 +            /* the main thread continues here with a new cr. We don't deallocate
 +               the old cr because other threads may still be reading it. */
 +            if (cr == NULL)
 +            {
 +                gmx_comm("Failed to spawn threads");
 +            }
 +        }
 +    }
 +#endif
 +    /* END OF CAUTION: cr is now reliable */
 +
 +    /* g_membed initialisation.
 +     * Because we change the mtop, init_membed is called before init_parallel
 +     * (in case we ever want to make it run in parallel). */
 +    if (opt2bSet("-membed", nfile, fnm))
 +    {
 +        if (MASTER(cr))
 +        {
 +            fprintf(stderr, "Initializing membed");
 +        }
 +        membed = init_membed(fplog, nfile, fnm, mtop, inputrec, state, cr, &cpt_period);
 +    }
 +
 +    if (PAR(cr))
 +    {
 +        /* now broadcast everything to the non-master nodes/threads: */
 +        init_parallel(cr, inputrec, mtop);
 +
 +        /* This check needs to happen after get_nthreads_mpi() */
 +        if (inputrec->cutoff_scheme == ecutsVERLET && (Flags & MD_PARTDEC))
 +        {
 +            gmx_fatal_collective(FARGS, cr, NULL,
 +                                 "The Verlet cut-off scheme is not supported with particle decomposition.\n"
 +                                 "You can achieve the same effect as particle decomposition by running in parallel using only OpenMP threads.");
 +        }
 +    }
 +    if (fplog != NULL)
 +    {
 +        pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE);
 +    }
 +
 +    /* now make sure the state is initialized and propagated */
 +    set_state_entries(state, inputrec, cr->nnodes);
 +
 +    /* A parallel command line option consistency check that we can
 +       only do after any threads have started. */
 +    if (!PAR(cr) &&
 +        (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
 +    {
 +        gmx_fatal(FARGS,
 +                  "The -dd or -npme option request a parallel simulation, "
 +#ifndef GMX_MPI
 +                  "but %s was compiled without threads or MPI enabled"
 +#else
 +#ifdef GMX_THREAD_MPI
 +                  "but the number of threads (option -nt) is 1"
 +#else
 +                  "but %s was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec"
 +#endif
 +#endif
 +                  , ShortProgram()
 +                  );
 +    }
 +
 +    if ((Flags & MD_RERUN) &&
 +        (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
 +    {
 +        gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
 +    }
 +
 +    if (can_use_allvsall(inputrec, TRUE, cr, fplog) && PAR(cr))
 +    {
 +        /* Simple neighbour searching, and likewise all-vs-all loops,
 +         * do not work with domain decomposition. */
 +        Flags |= MD_PARTDEC;
 +    }
 +
 +    if (!EEL_PME(inputrec->coulombtype) || (Flags & MD_PARTDEC))
 +    {
 +        if (cr->npmenodes > 0)
 +        {
 +            if (!EEL_PME(inputrec->coulombtype))
 +            {
 +                gmx_fatal_collective(FARGS, cr, NULL,
 +                                     "PME nodes are requested, but the system does not use PME electrostatics");
 +            }
 +            if (Flags & MD_PARTDEC)
 +            {
 +                gmx_fatal_collective(FARGS, cr, NULL,
 +                                     "PME nodes are requested, but particle decomposition does not support separate PME nodes");
 +            }
 +        }
 +
 +        cr->npmenodes = 0;
 +    }
 +
 +#ifdef GMX_FAHCORE
 +    fcRegisterSteps(inputrec->nsteps, inputrec->init_step);
 +#endif
 +
 +    /* NMR restraints must be initialized before load_checkpoint,
 +     * since with time averaging the history is added to t_state.
 +     * For proper consistency check we therefore need to extend
 +     * t_state here.
 +     * So the PME-only nodes (if present) will also initialize
 +     * the distance restraints.
 +     */
 +    snew(fcd, 1);
 +
 +    /* This needs to be called before read_checkpoint to extend the state */
 +    init_disres(fplog, mtop, inputrec, cr, Flags & MD_PARTDEC, fcd, state, repl_ex_nst > 0);
 +
 +    if (gmx_mtop_ftype_count(mtop, F_ORIRES) > 0)
 +    {
 +        if (PAR(cr) && !(Flags & MD_PARTDEC))
 +        {
 +            gmx_fatal(FARGS, "Orientation restraints do not work (yet) with domain decomposition, use particle decomposition (mdrun option -pd)");
 +        }
 +        /* Orientation restraints */
 +        if (MASTER(cr))
 +        {
 +            init_orires(fplog, mtop, state->x, inputrec, cr->ms, &(fcd->orires),
 +                        state);
 +        }
 +    }
 +
 +    if (DEFORM(*inputrec))
 +    {
 +        /* Store the deform reference box before reading the checkpoint */
 +        if (SIMMASTER(cr))
 +        {
 +            copy_mat(state->box, box);
 +        }
 +        if (PAR(cr))
 +        {
 +            gmx_bcast(sizeof(box), box, cr);
 +        }
 +        /* Because we do not have the update struct available yet
 +         * in which the reference values should be stored,
 +         * we store them temporarily in static variables.
 +         * This should be thread safe, since they are only written once
 +         * and with identical values.
 +         */
 +#ifdef GMX_THREAD_MPI
 +        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
 +#endif
 +        deform_init_init_step_tpx = inputrec->init_step;
 +        copy_mat(box, deform_init_box_tpx);
 +#ifdef GMX_THREAD_MPI
 +        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
 +#endif
 +    }
 +
 +    if (opt2bSet("-cpi", nfile, fnm))
 +    {
 +        /* Check if checkpoint file exists before doing continuation.
 +         * This way we can use identical input options for the first and subsequent runs...
 +         */
 +        if (gmx_fexist_master(opt2fn_master("-cpi", nfile, fnm, cr), cr) )
 +        {
 +            load_checkpoint(opt2fn_master("-cpi", nfile, fnm, cr), &fplog,
 +                            cr, Flags & MD_PARTDEC, ddxyz,
 +                            inputrec, state, &bReadRNG, &bReadEkin,
 +                            (Flags & MD_APPENDFILES),
 +                            (Flags & MD_APPENDFILESSET));
 +
 +            if (bReadRNG)
 +            {
 +                Flags |= MD_READ_RNG;
 +            }
 +            if (bReadEkin)
 +            {
 +                Flags |= MD_READ_EKIN;
 +            }
 +        }
 +    }
 +
 +    if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES))
 +#ifdef GMX_THREAD_MPI
 +        /* With thread-MPI, only the master node/thread exists in mdrun.c;
 +         * therefore non-master nodes need to open the "seppot" log file here.
 +         */
 +        || (!MASTER(cr) && (Flags & MD_SEPPOT))
 +#endif
 +        )
 +    {
 +        gmx_log_open(ftp2fn(efLOG, nfile, fnm), cr, !(Flags & MD_SEPPOT),
 +                     Flags, &fplog);
 +    }
 +
 +    /* override nsteps with value from cmdline */
 +    override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);
 +
 +    if (SIMMASTER(cr))
 +    {
 +        copy_mat(state->box, box);
 +    }
 +
 +    if (PAR(cr))
 +    {
 +        gmx_bcast(sizeof(box), box, cr);
 +    }
 +
 +    /* Essential dynamics */
 +    if (opt2bSet("-ei", nfile, fnm))
 +    {
 +        /* Open input and output files, allocate space for ED data structure */
 +        ed = ed_open(mtop->natoms, &state->edsamstate, nfile, fnm, Flags, oenv, cr);
 +    }
 +
 +    if (PAR(cr) && !((Flags & MD_PARTDEC) ||
 +                     EI_TPI(inputrec->eI) ||
 +                     inputrec->eI == eiNM))
 +    {
 +        cr->dd = init_domain_decomposition(fplog, cr, Flags, ddxyz, rdd, rconstr,
 +                                           dddlb_opt, dlb_scale,
 +                                           ddcsx, ddcsy, ddcsz,
 +                                           mtop, inputrec,
 +                                           box, state->x,
 +                                           &ddbox, &npme_major, &npme_minor);
 +
 +        make_dd_communicators(fplog, cr, dd_node_order);
 +
 +        /* Set overallocation to avoid frequent reallocation of arrays */
 +        set_over_alloc_dd(TRUE);
 +    }
 +    else
 +    {
 +        /* PME, if used, is done on all nodes with 1D decomposition */
 +        cr->npmenodes = 0;
 +        cr->duty      = (DUTY_PP | DUTY_PME);
 +        npme_major    = 1;
 +        npme_minor    = 1;
++        /* NM and TPI perform single-node energy calculations in parallel */
++        if (!(inputrec->eI == eiNM || EI_TPI(inputrec->eI)))
 +        {
 +            npme_major = cr->nnodes;
 +        }
 +
 +        if (inputrec->ePBC == epbcSCREW)
 +        {
 +            gmx_fatal(FARGS,
 +                      "pbc=%s is only implemented with domain decomposition",
 +                      epbc_names[inputrec->ePBC]);
 +        }
 +    }
 +
 +    if (PAR(cr))
 +    {
 +        /* After possible communicator splitting in make_dd_communicators,
 +         * we can set up the intra/inter-node communication.
 +         */
 +        gmx_setup_nodecomm(fplog, cr);
 +    }
 +
 +    /* Initialize per-physical-node MPI process/thread ID and counters. */
 +    gmx_init_intranode_counters(cr);
 +
 +#ifdef GMX_MPI
 +    md_print_info(cr, fplog, "Using %d MPI %s\n",
 +                  cr->nnodes,
 +#ifdef GMX_THREAD_MPI
 +                  cr->nnodes == 1 ? "thread" : "threads"
 +#else
 +                  cr->nnodes == 1 ? "process" : "processes"
 +#endif
 +                  );
 +    fflush(stderr);
 +#endif
 +
 +    gmx_omp_nthreads_init(fplog, cr,
 +                          hwinfo->nthreads_hw_avail,
 +                          hw_opt->nthreads_omp,
 +                          hw_opt->nthreads_omp_pme,
 +                          (cr->duty & DUTY_PP) == 0,
 +                          inputrec->cutoff_scheme == ecutsVERLET);
 +
 +    /* check consistency and decide on the number of gpus to use. */
 +    gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi,
 +                                     minf.bUseGPU);
 +
 +    /* Get the number of PP/PME threads.
 +       PME: the env variable should be read on only one node to make sure
 +       it is identical everywhere.
 +     */
 +    /* TODO nthreads_pp is only used for pinning threads.
 +     * This is a temporary solution until we have a hw topology library.
 +     */
 +    nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
 +    nthreads_pme = gmx_omp_nthreads_get(emntPME);
 +
 +    wcycle = wallcycle_init(fplog, resetstep, cr, nthreads_pp, nthreads_pme);
 +
 +    if (PAR(cr))
 +    {
 +        /* Master synchronizes its value of reset_counters with all nodes
 +         * including PME only nodes */
 +        reset_counters = wcycle_get_reset_counters(wcycle);
 +        gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr);
 +        wcycle_set_reset_counters(wcycle, reset_counters);
 +    }
 +
 +    snew(nrnb, 1);
 +    if (cr->duty & DUTY_PP)
 +    {
 +        /* For domain decomposition we allocate dynamically
 +         * in dd_partition_system.
 +         */
 +        if (DOMAINDECOMP(cr))
 +        {
 +            bcast_state_setup(cr, state);
 +        }
 +        else
 +        {
 +            if (PAR(cr))
 +            {
 +                bcast_state(cr, state, TRUE);
 +            }
 +        }
 +
 +        /* Initiate the force record */
 +        fr         = mk_forcerec();
 +        fr->hwinfo = hwinfo;
 +        init_forcerec(fplog, oenv, fr, fcd, inputrec, mtop, cr, box,
 +                      opt2fn("-table", nfile, fnm),
 +                      opt2fn("-tabletf", nfile, fnm),
 +                      opt2fn("-tablep", nfile, fnm),
 +                      opt2fn("-tableb", nfile, fnm),
 +                      nbpu_opt,
 +                      FALSE, pforce);
 +
 +        /* version for PCA_NOT_READ_NODE (see md.c) */
 +        /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
 +           "nofile","nofile","nofile","nofile",FALSE,pforce);
 +         */
 +        fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);
 +
 +        /* Initialize QM-MM */
 +        if (fr->bQMMM)
 +        {
 +            init_QMMMrec(cr, mtop, inputrec, fr);
 +        }
 +
 +        /* Initialize the mdatoms structure.
 +         * mdatoms is not filled with atom data,
 +         * as this cannot be done yet with domain decomposition.
 +         */
 +        mdatoms = init_mdatoms(fplog, mtop, inputrec->efep != efepNO);
 +
 +        if (mdatoms->nPerturbed > 0 && inputrec->cutoff_scheme == ecutsVERLET)
 +        {
 +            gmx_fatal(FARGS, "The Verlet cut-off scheme does not (yet) support free-energy calculations with perturbed atoms, only perturbed interactions. This will be implemented soon. Use the group scheme for now.");
 +        }
 +
 +        /* Initialize the virtual site communication */
 +        vsite = init_vsite(mtop, cr, FALSE);
 +
 +        calc_shifts(box, fr->shift_vec);
 +
 +        /* With periodic molecules the charge groups should be whole at start-up
 +         * and the virtual sites should not be far from their proper positions.
 +         */
 +        if (!inputrec->bContinuation && MASTER(cr) &&
 +            !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
 +        {
 +            /* Make molecules whole at start of run */
 +            if (fr->ePBC != epbcNONE)
 +            {
 +                do_pbc_first_mtop(fplog, inputrec->ePBC, box, mtop, state->x);
 +            }
 +            if (vsite)
 +            {
 +                /* Correct initial vsite positions are required
 +                 * for the initial distribution in the domain decomposition
 +                 * and for the initial shell prediction.
 +                 */
 +                construct_vsites_mtop(vsite, mtop, state->x);
 +            }
 +        }
 +
 +        if (EEL_PME(fr->eeltype))
 +        {
 +            ewaldcoeff = fr->ewaldcoeff;
 +            pmedata    = &fr->pmedata;
 +        }
 +        else
 +        {
 +            pmedata = NULL;
 +        }
 +    }
 +    else
 +    {
 +        /* This is a PME only node */
 +
 +        /* We don't need the state */
 +        done_state(state);
 +
 +        ewaldcoeff = calc_ewaldcoeff(inputrec->rcoulomb, inputrec->ewald_rtol);
 +        snew(pmedata, 1);
 +    }
 +
 +    if (hw_opt->thread_affinity != threadaffOFF)
 +    {
 +        /* Before setting affinity, check whether the affinity has changed,
 +         * which would indicate that probably the OpenMP library has
 +         * changed it since we first checked.
 +         */
 +        gmx_check_thread_affinity_set(fplog, cr,
 +                                      hw_opt, hwinfo->nthreads_hw_avail, TRUE);
 +
 +        /* Set the CPU affinity */
 +        gmx_set_thread_affinity(fplog, cr, hw_opt, hwinfo);
 +    }
 +
 +    /* Initiate PME if necessary,
 +     * either on all nodes or on dedicated PME nodes only. */
 +    if (EEL_PME(inputrec->coulombtype))
 +    {
 +        if (mdatoms)
 +        {
 +            nChargePerturbed = mdatoms->nChargePerturbed;
 +        }
 +        if (cr->npmenodes > 0)
 +        {
 +            /* The PME only nodes need to know nChargePerturbed */
 +            gmx_bcast_sim(sizeof(nChargePerturbed), &nChargePerturbed, cr);
 +        }
 +
 +        if (cr->duty & DUTY_PME)
 +        {
 +            status = gmx_pme_init(pmedata, cr, npme_major, npme_minor, inputrec,
 +                                  mtop ? mtop->natoms : 0, nChargePerturbed,
 +                                  (Flags & MD_REPRODUCIBLE), nthreads_pme);
 +            if (status != 0)
 +            {
 +                gmx_fatal(FARGS, "Error %d initializing PME", status);
 +            }
 +        }
 +    }
 +
 +
 +    if (integrator[inputrec->eI].func == do_md)
 +    {
 +        /* Turn on signal handling on all nodes */
 +        /*
 +         * A user signal from the PME nodes (if any)
 +         * is communicated to the PP nodes.
 +         */
 +        signal_handler_install();
 +    }
 +
 +    if (cr->duty & DUTY_PP)
 +    {
 +        if (inputrec->ePull != epullNO)
 +        {
 +            /* Initialize pull code */
 +            init_pull(fplog, inputrec, nfile, fnm, mtop, cr, oenv, inputrec->fepvals->init_lambda,
 +                      EI_DYNAMICS(inputrec->eI) && MASTER(cr), Flags);
 +        }
 +
 +        if (inputrec->bRot)
 +        {
 +            /* Initialize enforced rotation code */
 +            init_rot(fplog, inputrec, nfile, fnm, cr, state->x, box, mtop, oenv,
 +                     bVerbose, Flags);
 +        }
 +
 +        constr = init_constraints(fplog, mtop, inputrec, ed, state, cr);
 +
 +        if (DOMAINDECOMP(cr))
 +        {
 +            dd_init_bondeds(fplog, cr->dd, mtop, vsite, inputrec,
 +                            Flags & MD_DDBONDCHECK, fr->cginfo_mb);
 +
 +            set_dd_parameters(fplog, cr->dd, dlb_scale, inputrec, &ddbox);
 +
 +            setup_dd_grid(fplog, cr->dd);
 +        }
 +
 +        /* Now do whatever the user wants us to do (how flexible...) */
 +        integrator[inputrec->eI].func(fplog, cr, nfile, fnm,
 +                                      oenv, bVerbose, bCompact,
 +                                      nstglobalcomm,
 +                                      vsite, constr,
 +                                      nstepout, inputrec, mtop,
 +                                      fcd, state,
 +                                      mdatoms, nrnb, wcycle, ed, fr,
 +                                      repl_ex_nst, repl_ex_nex, repl_ex_seed,
 +                                      membed,
 +                                      cpt_period, max_hours,
 +                                      deviceOptions,
 +                                      Flags,
 +                                      &runtime);
 +
 +        if (inputrec->ePull != epullNO)
 +        {
 +            finish_pull(inputrec->pull);
 +        }
 +
 +        if (inputrec->bRot)
 +        {
 +            finish_rot(inputrec->rot);
 +        }
 +
 +    }
 +    else
 +    {
 +        /* do PME only */
 +        gmx_pmeonly(*pmedata, cr, nrnb, wcycle, ewaldcoeff, inputrec);
 +    }
 +
 +    if (EI_DYNAMICS(inputrec->eI) || EI_TPI(inputrec->eI))
 +    {
 +        /* Some timing stats */
 +        if (SIMMASTER(cr))
 +        {
 +            if (runtime.proc == 0)
 +            {
 +                runtime.proc = runtime.real;
 +            }
 +        }
 +        else
 +        {
 +            runtime.real = 0;
 +        }
 +    }
 +
 +    wallcycle_stop(wcycle, ewcRUN);
 +
 +    /* Finish up, write some stuff;
 +     * if rerunMD, don't write the last frame again.
 +     */
 +    finish_run(fplog, cr,
 +               inputrec, nrnb, wcycle, &runtime,
 +               fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ?
 +               nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL,
 +               EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
 +
 +    if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU)
 +    {
 +        char gpu_err_str[STRLEN];
 +
 +        /* free GPU memory and uninitialize GPU (by destroying the context) */
 +        nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
 +
 +        if (!free_gpu(gpu_err_str))
 +        {
 +            gmx_warning("On node %d failed to free GPU #%d: %s",
 +                        cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
 +        }
 +    }
 +
 +    if (opt2bSet("-membed", nfile, fnm))
 +    {
 +        sfree(membed);
 +    }
 +
 +    gmx_hardware_info_free(hwinfo);
 +
 +    /* Does what it says */
 +    print_date_and_time(fplog, cr->nodeid, "Finished mdrun", &runtime);
 +
 +    /* Close logfile already here if we were appending to it */
 +    if (MASTER(cr) && (Flags & MD_APPENDFILES))
 +    {
 +        gmx_log_close(fplog);
 +    }
 +
 +    rc = (int)gmx_get_stop_condition();
 +
 +#ifdef GMX_THREAD_MPI
 +    /* we need to join all threads. The sub-threads join when they
 +       exit this function, but the master thread needs to be told to
 +       wait for that. */
 +    if (PAR(cr) && MASTER(cr))
 +    {
 +        tMPI_Finalize();
 +    }
 +#endif
 +
 +    return rc;
 +}