Merge branch 'release-4-6'
authorMark Abraham <mark.j.abraham@gmail.com>
Mon, 2 Dec 2013 15:01:51 +0000 (16:01 +0100)
committerMark Abraham <mark.j.abraham@gmail.com>
Mon, 2 Dec 2013 15:01:51 +0000 (16:01 +0100)
Conflicts:
src/gromacs/legacyheaders/topsort.h
Resolved in favour of comment change in release-4-6

Change-Id: Ib930277635cdd260c136ab9b3e02809682587252

1  2 
admin/installguide/installguide.tex
src/gromacs/gmxlib/gpu_utils/gpu_utils.cu
src/gromacs/gmxlib/topsort.c
src/gromacs/legacyheaders/topsort.h
src/gromacs/mdlib/domdec.c
src/gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_outer.h
src/programs/mdrun/runner.c

Simple merge
index dcb701f4ac31731c8bd25e33e0fca3441ca56049,0000000000000000000000000000000000000000..eb3e7dd7813f227af57b597337265613e43edf23
mode 100644,000000..100644
--- /dev/null
@@@ -1,892 -1,0 +1,889 @@@
-     if (idx < 0 && idx >= gpu_opt->ncuda_dev_use)
-     {
-         return -1;
-     }
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2010,2012 The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +
 +#include <stdio.h>
 +#include <stdlib.h>
 +#include <assert.h>
 +
 +#include "smalloc.h"
 +#include "string2.h"
 +#include "types/hw_info.h"
 +
 +#include "gpu_utils.h"
 +#include "../cuda_tools/cudautils.cuh"
 +#include "memtestG80_core.h"
 +
 +
 +#define QUICK_MEM       250 /*!< Amount of memory to be used in quick memtest. */
 +#define QUICK_TESTS     (MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS) /*!< Bit flag with the types of tests
 +                                                                              to run in the quick memtest. */
 +#define QUICK_ITER      3 /*!< Number of iterations in quick memtest. */
 +
 +#define FULL_TESTS      0x3FFF /*!< Bit flag with all tests set on for the full memtest. */
 +#define FULL_ITER       25 /*!< Number of iterations in full memtest. */
 +
 +#define TIMED_TESTS     (MOD_20_32BIT | LOGIC_4_ITER_SHMEM | RANDOM_BLOCKS) /*!< Bit flag with the types of tests to
 +                                                                              run in the time-constrained memtest. */
 +
 +static int cuda_max_device_count = 32; /*! Max number of devices supported by CUDA (for consistency checking).
 +                                           In reality it is 16 with CUDA <= v5.0, but let's stay on the safe side. */
 +
 +/*! Dummy kernel used for sanity checking. */
 +__global__ void k_dummy_test(){}
 +
 +
 +/*! Bit-flags which refer to memtestG80 test types and are used in do_memtest to specify which tests to run. */
 +enum memtest_G80_test_types {
 +    MOVING_INVERSIONS_10 =      0x1,
 +    MOVING_INVERSIONS_RAND =    0x2,
 +    WALKING_8BIT_M86 =          0x4,
 +    WALKING_0_8BIT =            0x8,
 +    WALKING_1_8BIT =            0x10,
 +    WALKING_0_32BIT =           0x20,
 +    WALKING_1_32BIT =           0x40,
 +    RANDOM_BLOCKS =             0x80,
 +    MOD_20_32BIT =              0x100,
 +    LOGIC_1_ITER =              0x200,
 +    LOGIC_4_ITER =              0x400,
 +    LOGIC_1_ITER_SHMEM =        0x800,
 +    LOGIC_4_ITER_SHMEM =        0x1000
 +};
 +
 +
 +/*! 
 +  * \brief Runs GPU sanity checks.
 +  *
 +  * Runs a series of checks to determine whether the given GPU and the
 +  * underlying CUDA driver/runtime function properly.
 +  * Returns the properties of the device with the given ID, or of the device
 +  * that has already been initialized earlier if \p dev_id == -1.
 +  *
 +  * \param[in]  dev_id      the device ID of the GPU or -1 if the device has already been initialized
 +  * \param[out] dev_prop    pointer to the structure in which the device properties will be returned
 +  * \returns                0 if the device looks OK
 +  *
 +  * TODO: introduce error codes and handle errors more smoothly.
 +  */
 +static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop)
 +{
 +    cudaError_t cu_err;
 +    int         dev_count, id;
 +
 +    cu_err = cudaGetDeviceCount(&dev_count);
 +    if (cu_err != cudaSuccess)
 +    {
 +        fprintf(stderr, "Error %d while querying device count: %s\n", cu_err,
 +                cudaGetErrorString(cu_err));
 +        return -1;
 +    }
 +
 +    /* no CUDA compatible device at all */
 +    if (dev_count == 0)
 +        return -1;
 +
 +    /* things might go horribly wrong if cudart is not compatible with the driver */
 +    if (dev_count < 0 || dev_count > cuda_max_device_count)
 +        return -1;
 +
 +    if (dev_id == -1) /* device already selected let's not destroy the context */
 +    {
 +        cu_err = cudaGetDevice(&id);
 +        if (cu_err != cudaSuccess)
 +        {
 +            fprintf(stderr, "Error %d while querying device id: %s\n", cu_err,
 +                    cudaGetErrorString(cu_err));
 +            return -1;
 +        }
 +    }
 +    else
 +    {
 +        id = dev_id;
 +        if (id > dev_count - 1) /* pfff there's no such device */
 +        {
 +            fprintf(stderr, "The requested device with id %d does not seem to exist (device count=%d)\n",
 +                    dev_id, dev_count);
 +            return -1;
 +        }
 +    }
 +
 +    memset(dev_prop, 0, sizeof(cudaDeviceProp));
 +    cu_err = cudaGetDeviceProperties(dev_prop, id);
 +    if (cu_err != cudaSuccess)
 +    {
 +        fprintf(stderr, "Error %d while querying device properties: %s\n", cu_err,
 +                cudaGetErrorString(cu_err));
 +        return -1;
 +    }
 +
 +    /* both major & minor is 9999 if no CUDA capable devices are present */
 +    if (dev_prop->major == 9999 && dev_prop->minor == 9999)
 +        return -1;
 +    /* we don't care about emulation mode */
 +    if (dev_prop->major == 0)
 +        return -1;
 +
 +    if (id != -1)
 +    {
 +        cu_err = cudaSetDevice(id);
 +        if (cu_err != cudaSuccess)
 +        {
 +            fprintf(stderr, "Error %d while switching to device #%d: %s\n",
 +                    cu_err, id, cudaGetErrorString(cu_err));
 +            return -1;
 +        }
 +    }
 +
 +    /* try to execute a dummy kernel */
 +    k_dummy_test<<<1, 512>>>();
 +    if (cudaThreadSynchronize() != cudaSuccess)
 +    {
 +        return -1;
 +    }
 +
 +    /* destroy context if we created one */
 +    if (id != -1)
 +    {
 +#if CUDA_VERSION < 4000
 +        cu_err = cudaThreadExit();
 +        CU_RET_ERR(cu_err, "cudaThreadExit failed");
 +#else
 +        cu_err = cudaDeviceReset();
 +        CU_RET_ERR(cu_err, "cudaDeviceReset failed");
 +#endif
 +    }
 +
 +    return 0;
 +}
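 +
 +/* Illustrative sketch: a hypothetical helper (example_check_device0 is not
 + * part of this change) showing how do_sanity_checks() would typically be
 + * used to verify a specific device before doing any real work on it.
 + */
 +static int example_check_device0(void)
 +{
 +    cudaDeviceProp prop;
 +
 +    /* 0 means the device and the CUDA driver/runtime look usable */
 +    return do_sanity_checks(0, &prop);
 +}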
 +
 +
 +/*!
 + * \brief Runs a set of memory tests specified by the given bit-flags.
 + * Tries to allocate and run the tests on \p megs MB of memory, or on the
 + * largest amount that can be allocated (>10 MB).
 + * If an error is detected, it stops without finishing the remaining
 + * steps/iterations and returns a value greater than zero.
 + * In case of other errors (e.g. kernel launch errors, device querying errors)
 + * -1 is returned.
 + *
 + * \param[in] which_tests   variable with bit-flags of the requested tests
 + * \param[in] megs          amount of memory that will be tested in MB
 + * \param[in] iter          number of iterations
 + * \returns                 0 if no error was detected, otherwise >0
 + */
 +static int do_memtest(unsigned int which_tests, int megs, int iter)
 +{
 +    memtestState    tester;
 +    int             i;
 +    uint            err_count; //, err_iter;
 +
 +    // no parameter check as this fn won't be called externally
 +
 +    // let's try to allocate the mem
 +    while (!tester.allocate(megs) && (megs - 10 > 0))
 +        { megs -= 10; tester.deallocate(); }
 +
 +    if (megs <= 10)
 +    {
 +        fprintf(stderr, "Unable to allocate GPU memory!\n");
 +        return -1;
 +    }
 +
 +    // keep only the lowest 14 test bits (i.e. clear the upper 18 bits)
 +    which_tests &= 0x3FFF;
 +    for (i = 0; i < iter; i++)
 +    {
 +        // Moving Inversions (ones and zeros)
 +        if ((MOVING_INVERSIONS_10 & which_tests) == MOVING_INVERSIONS_10)
 +        {
 +            tester.gpuMovingInversionsOnesZeros(err_count);
 +            if (err_count > 0)
 +                return MOVING_INVERSIONS_10;
 +        }
 +        // Moving Inversions (random)
 +        if ((MOVING_INVERSIONS_RAND & which_tests) == MOVING_INVERSIONS_RAND)
 +        {
 +            tester.gpuMovingInversionsRandom(err_count);
 +            if (err_count > 0)
 +                return MOVING_INVERSIONS_RAND;
 +        }
 +        // Memtest86 Walking 8-bit
 +        if ((WALKING_8BIT_M86 & which_tests) == WALKING_8BIT_M86)
 +        {
 +            for (uint shift = 0; shift < 8; shift++)
 +            {
 +                tester.gpuWalking8BitM86(err_count, shift);
 +                if (err_count > 0)
 +                    return WALKING_8BIT_M86;
 +            }
 +        }
 +        // True Walking zeros (8-bit)
 +        if ((WALKING_0_8BIT & which_tests) == WALKING_0_8BIT)
 +        {
 +            for (uint shift = 0; shift < 8; shift++)
 +            {
 +                tester.gpuWalking8Bit(err_count, false, shift);
 +                if (err_count > 0)
 +                    return WALKING_0_8BIT;
 +            }
 +        }
 +        // True Walking ones (8-bit)
 +        if ((WALKING_1_8BIT & which_tests) == WALKING_1_8BIT)
 +        {
 +            for (uint shift = 0; shift < 8; shift++)
 +            {
 +                tester.gpuWalking8Bit(err_count, true, shift);
 +                if (err_count > 0)
 +                    return WALKING_1_8BIT;
 +            }
 +        }
 +        // Memtest86 Walking zeros (32-bit)
 +        if ((WALKING_0_32BIT & which_tests) == WALKING_0_32BIT)
 +        {
 +            for (uint shift = 0; shift < 32; shift++)
 +            {
 +                tester.gpuWalking32Bit(err_count, false, shift);
 +                if (err_count > 0)
 +                    return WALKING_0_32BIT;
 +            }
 +        }
 +        // Memtest86 Walking ones (32-bit)
 +        if ((WALKING_1_32BIT & which_tests) == WALKING_1_32BIT)
 +        {
 +            for (uint shift = 0; shift < 32; shift++)
 +            {
 +                tester.gpuWalking32Bit(err_count, true, shift);
 +                if (err_count > 0)
 +                    return WALKING_1_32BIT;
 +            }
 +        }
 +        // Random blocks
 +        if ((RANDOM_BLOCKS & which_tests) == RANDOM_BLOCKS)
 +        {
 +            tester.gpuRandomBlocks(err_count,rand());
 +            if (err_count > 0)
 +                return RANDOM_BLOCKS;
 +
 +        }
 +
 +        // Memtest86 Modulo-20
 +        if ((MOD_20_32BIT & which_tests) == MOD_20_32BIT)
 +        {
 +            for (uint shift = 0; shift < 20; shift++)
 +            {
 +                tester.gpuModuloX(err_count, shift, rand(), 20, 2);
 +                if (err_count > 0)
 +                    return MOD_20_32BIT;
 +            }
 +        }
 +        // Logic (one iteration)
 +        if ((LOGIC_1_ITER & which_tests) == LOGIC_1_ITER)
 +        {
 +            tester.gpuShortLCG0(err_count,1);
 +            if (err_count > 0)
 +                return LOGIC_1_ITER;
 +        }
 +        // Logic (4 iterations)
 +        if ((LOGIC_4_ITER & which_tests) == LOGIC_4_ITER)
 +        {
 +            tester.gpuShortLCG0(err_count,4);
 +            if (err_count > 0)
 +                return LOGIC_4_ITER;
 +
 +        }
 +        // Logic (shared memory, one iteration)
 +        if ((LOGIC_1_ITER_SHMEM & which_tests) == LOGIC_1_ITER_SHMEM)
 +        {
 +            tester.gpuShortLCG0Shmem(err_count,1);
 +            if (err_count > 0)
 +                return LOGIC_1_ITER_SHMEM;
 +        }
 +        // Logic (shared-memory, 4 iterations)
 +        if ((LOGIC_4_ITER_SHMEM & which_tests) == LOGIC_4_ITER_SHMEM)
 +        {
 +            tester.gpuShortLCG0Shmem(err_count,4);
 +            if (err_count > 0)
 +                return LOGIC_4_ITER_SHMEM;
 +        }
 +    }
 +
 +    tester.deallocate();
 +    return err_count;
 +}
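 +
 +/* Illustrative sketch: a hypothetical helper (not part of this change)
 + * running the quick test selection with assumed parameters: 100 MB, 5 iterations.
 + */
 +static int example_custom_memtest(void)
 +{
 +    /* 0: no error; >0: bit-flag of the failing test; -1: allocation/API error */
 +    return do_memtest(QUICK_TESTS, 100, 5);
 +}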
 +
 +/*! \brief Runs a quick memory test and returns 0 if no error is detected.
 + * If an error is detected, it stops before completing the test and returns a
 + * value greater than 0. In case of other errors (e.g. kernel launch errors,
 + * device querying errors) -1 is returned.
 + *
 + * \param[in] dev_id    the device id of the GPU or -1 if the device has already been selected
 + * \returns             0 if no error was detected, otherwise >0
 + */
 +int do_quick_memtest(int dev_id)
 +{
 +    cudaDeviceProp  dev_prop;
 +    int             devmem, res, time=0;
 +
 +    if (debug) { time = getTimeMilliseconds(); }
 +
 +    if (do_sanity_checks(dev_id, &dev_prop) != 0)
 +    {
 +        // something went wrong
 +        return -1;
 +    }
 +
 +    if (debug)
 +    {
 +        devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
 +        fprintf(debug, ">> Running QUICK memtests on %d MiB (out of total %d MiB), %d iterations\n",
 +            QUICK_MEM, devmem, QUICK_ITER);
 +    }
 +
 +    res = do_memtest(QUICK_TESTS, QUICK_MEM, QUICK_ITER);
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Q-RES = %d\n", res);
 +        fprintf(debug, "Q-runtime: %d ms\n", getTimeMilliseconds() - time);
 +    }
 +
 +    /* destroy context only if we created it */
 +    if (dev_id != -1) cudaThreadExit();
 +    return res;
 +}
 +
 +/*! \brief Runs a full memory test and returns 0 if no error is detected.
 + * If an error is detected, it stops before completing the test and returns a
 + * value greater than 0. In case of other errors (e.g. kernel launch errors,
 + * device querying errors) -1 is returned.
 + *
 + * \param[in] dev_id    the device id of the GPU or -1 if the device has already been selected
 + * \returns             0 if no error was detected, otherwise >0
 + */
 +
 +int do_full_memtest(int dev_id)
 +{
 +    cudaDeviceProp  dev_prop;
 +    int             devmem, res, time=0;
 +
 +    if (debug) { time = getTimeMilliseconds(); }
 +
 +    if (do_sanity_checks(dev_id, &dev_prop) != 0)
 +    {
 +        // something went wrong
 +        return -1;
 +    }
 +
 +    devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
 +
 +    if (debug) 
 +    { 
 +        fprintf(debug, ">> Running FULL memtests on %d MiB (out of total %d MiB), %d iterations\n",
 +            devmem, devmem, FULL_ITER); 
 +    }
 +
 +    /* do all test on the entire memory */
 +    res = do_memtest(FULL_TESTS, devmem, FULL_ITER);
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "F-RES = %d\n", res);
 +        fprintf(debug, "F-runtime: %d ms\n", getTimeMilliseconds() - time);
 +    }
 +
 +    /* destroy context only if we created it */
 +    if (dev_id != -1) cudaThreadExit();
 +    return res;
 +}
 +
 +/*! \brief Runs a time-constrained memory test and returns 0 if no error is detected.
 + * If an error is detected, it stops before completing the test and returns a value greater
 + * than zero. In case of other errors (e.g. kernel launch errors, device querying errors) -1
 + * is returned. Note that test iterations are not interrupted, therefore the total runtime of
 + * the test will always be a multiple of one iteration's runtime.
 + *
 + * \param[in] dev_id        the device id of the GPU or -1 if the device has already been selected
 + * \param[in] time_constr   the time limit of the testing
 + * \returns                 0 if no error was detected, otherwise >0
 + */
 +int do_timed_memtest(int dev_id, int time_constr)
 +{
 +    cudaDeviceProp  dev_prop;
 +    int             devmem, res=0, time=0, startt;
 +
 +    if (debug) { time = getTimeMilliseconds(); }
 +
 +    time_constr *= 1000;  /* convert to ms for convenience */
 +    startt = getTimeMilliseconds();
 +
 +    if (do_sanity_checks(dev_id, &dev_prop) != 0)
 +    {
 +        // something went wrong
 +        return -1;
 +    }
 +
 +    devmem = dev_prop.totalGlobalMem/(1024*1024); // in MiB
 +
 +    if (debug) 
 +    { 
 +        fprintf(debug, ">> Running time constrained memtests on %d MiB (out of total %d MiB), time limit of %d ms\n",
 +                devmem, devmem, time_constr);
 +    }
 +
 +    /* do the TIMED_TESTS set, one step at a time on the entire memory 
 +       that can be allocated, and stop when the given time is exceeded */
 +    while ( ((int)getTimeMilliseconds() - startt) < time_constr)
 +    {        
 +        res = do_memtest(TIMED_TESTS, devmem, 1);
 +        if (res != 0) break;
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "T-RES = %d\n", res);
 +        fprintf(debug, "T-runtime: %d ms\n", getTimeMilliseconds() - time);
 +    }
 +
 +    /* destroy context only if we created it */
 +    if (dev_id != -1) cudaThreadExit();
 +    return res;
 +}
 +
 +/*! \brief Initializes the GPU with the given index.
 + *
 + * The variable \p mygpu is the index of the GPU to initialize in the
 + * gpu_opt->cuda_dev_use array.
 + *
 + * \param[in]  mygpu        index of the GPU to initialize
 + * \param[out] result_str   the message related to the error that occurred
 + *                          during the initialization (if there was any).
 + * \param[in] gpu_info      GPU info of all detected devices in the system.
 + * \param[in] gpu_opt       options for using the GPUs in gpu_info
 + * \returns                 true if no error occurs during initialization.
 + */
 +gmx_bool init_gpu(int mygpu, char *result_str,
 +                  const gmx_gpu_info_t *gpu_info,
 +                  const gmx_gpu_opt_t *gpu_opt)
 +{
 +    cudaError_t stat;
 +    char sbuf[STRLEN];
 +    int gpuid;
 +
 +    assert(gpu_info);
 +    assert(result_str);
 +
 +    if (mygpu < 0 || mygpu >= gpu_opt->ncuda_dev_use)
 +    {
 +        sprintf(sbuf, "Trying to initialize a non-existent GPU: "
 +                "there are %d %s-selected GPU(s), but #%d was requested.",
 +                 gpu_opt->ncuda_dev_use, gpu_opt->bUserSet ? "user" : "auto", mygpu);
 +        gmx_incons(sbuf);
 +    }
 +
 +    gpuid = gpu_info->cuda_dev[gpu_opt->cuda_dev_use[mygpu]].id;
 +
 +    stat = cudaSetDevice(gpuid);
 +    strncpy(result_str, cudaGetErrorString(stat), STRLEN);
 +
 +    if (debug)
 +    {
 +        fprintf(stderr, "Initialized GPU ID #%d: %s\n", gpuid, gpu_info->cuda_dev[gpuid].prop.name);
 +    }
 +
 +    return (stat == cudaSuccess);
 +}
 +
 +/*! \brief Frees up the CUDA GPU used by the active context at the time of calling.
 + *
 + * The context is explicitly destroyed and therefore all data uploaded to the GPU
 + * is lost. This should only be called when none of this data is required anymore.
 + *
 + * \param[out] result_str   the message related to the error that occurred
 + *                          during the cleanup (if there was any).
 + * \returns                 true if no error occurs during the freeing.
 + */
 +gmx_bool free_gpu(char *result_str)
 +{
 +    cudaError_t stat;
 +
 +    assert(result_str);
 +
 +    if (debug)
 +    {
 +        int gpuid;
 +        stat = cudaGetDevice(&gpuid);
 +        CU_RET_ERR(stat, "cudaGetDevice failed");
 +        fprintf(stderr, "Cleaning up context on GPU ID #%d\n", gpuid);
 +    }
 +
 +#if CUDA_VERSION < 4000
 +    stat = cudaThreadExit();
 +#else
 +    stat = cudaDeviceReset();
 +#endif
 +    strncpy(result_str, cudaGetErrorString(stat), STRLEN);
 +
 +    return (stat == cudaSuccess);
 +}
 +
 +/*! \brief Returns true if the GPU characterized by the device properties is
 + *  supported by the native GPU acceleration.
 + *
 + * \param[in] dev_prop  the CUDA device properties of the GPU to test.
 + * \returns             true if the GPU properties passed indicate a compatible
 + *                      GPU, otherwise false.
 + */
 +static bool is_gmx_supported_gpu(const cudaDeviceProp *dev_prop)
 +{
 +    return (dev_prop->major >= 2);
 +}
 +
 +/*! \brief Helper function that checks whether a given GPU status indicates a compatible GPU.
 + *
 + * \param[in] stat  GPU status.
 + * \returns         true if the provided status is egpuCompatible, otherwise false.
 + */
 +static bool is_compatible_gpu(int stat)
 +{
 +    return (stat == egpuCompatible);
 +}
 +
 +/*! \brief Checks if a GPU with a given ID is supported by the native GROMACS acceleration.
 + *
 + *  Returns a status value which indicates compatibility or one of the following
 + *  errors: incompatibility, non-existence, or insanity (=unexpected behavior).
 + *  It also returns the respective device's properties in \p dev_prop (if applicable).
 + *
 + *  \param[in]  dev_id   the ID of the GPU to check.
 + *  \param[out] dev_prop the CUDA device properties of the device checked.
 + *  \returns             the status of the requested device
 + */
 +static int is_gmx_supported_gpu_id(int dev_id, cudaDeviceProp *dev_prop)
 +{
 +    cudaError_t stat;
 +    int         ndev;
 +
 +    stat = cudaGetDeviceCount(&ndev);
 +    if (stat != cudaSuccess)
 +    {
 +        return egpuInsane;
 +    }
 +
 +    if (dev_id > ndev - 1)
 +    {
 +        return egpuNonexistent;
 +    }
 +
 +    /* TODO: currently we do not make a distinction between the types of errors
 +     * that can appear during sanity checks. This needs to be improved, e.g. if
 +     * the dummy test kernel fails to execute with a "device busy message" we
 +     * should appropriately report that the device is busy instead of insane.
 +     */
 +    if (do_sanity_checks(dev_id, dev_prop) == 0)
 +    {
 +        if (is_gmx_supported_gpu(dev_prop))
 +        {
 +            return egpuCompatible;
 +        }
 +        else
 +        {
 +            return egpuIncompatible;
 +        }
 +    }
 +    else
 +    {
 +        return egpuInsane;
 +    }
 +}
 +
 +
 +/*! \brief Detect all NVIDIA GPUs in the system.
 + *
 + *  Will detect every NVIDIA GPU supported by the device driver in use. Also
 + *  check for the compatibility of each and fill the gpu_info->cuda_dev array
 + *  with the required information on each device: ID, device properties,
 + *  status.
 + *
 + *  \param[in] gpu_info    pointer to structure holding GPU information.
 + *  \param[out] err_str    The error message of any CUDA API error that caused
 + *                         the detection to fail (if there was any). The memory
 + *                         the pointer points to should be managed externally.
 + *  \returns               non-zero if the detection encountered a failure, zero otherwise.
 + */
 +int detect_cuda_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
 +{
 +    int             i, ndev, checkres, retval;
 +    cudaError_t     stat;
 +    cudaDeviceProp  prop;
 +    cuda_dev_info_t *devs;
 +
 +    assert(gpu_info);
 +    assert(err_str);
 +
 +    gpu_info->ncuda_dev_compatible = 0;
 +
 +    ndev    = 0;
 +    devs    = NULL;
 +
 +    stat = cudaGetDeviceCount(&ndev);
 +    if (stat != cudaSuccess)
 +    {
 +        const char *s;
 +
 +        /* cudaGetDeviceCount failed which means that there is something
 +         * wrong with the machine: driver-runtime mismatch, all GPUs being
 +         * busy in exclusive mode, or some other condition which should
 +         * result in us issuing a warning and falling back to CPUs. */
 +        retval = -1;
 +        s = cudaGetErrorString(stat);
 +        strncpy(err_str, s, STRLEN*sizeof(err_str[0]));
 +    }
 +    else
 +    {
 +        snew(devs, ndev);
 +        for (i = 0; i < ndev; i++)
 +        {
 +            checkres = is_gmx_supported_gpu_id(i, &prop);
 +
 +            devs[i].id   = i;
 +            devs[i].prop = prop;
 +            devs[i].stat = checkres;
 +
 +            if (checkres == egpuCompatible)
 +            {
 +                gpu_info->ncuda_dev_compatible++;
 +            }
 +        }
 +        retval = 0;
 +    }
 +
 +    gpu_info->ncuda_dev = ndev;
 +    gpu_info->cuda_dev  = devs;
 +
 +    return retval;
 +}
 +
 +/*! \brief Select the GPUs compatible with the native GROMACS acceleration.
 + *
 + * This function selects the compatible gpus and initializes
 + * gpu_info->cuda_dev_use and gpu_info->ncuda_dev_use.
 + *
 + * Given the list of GPUs available in the system, check each device in
 + * gpu_info->cuda_dev and place the indices of the compatible GPUs into
 + * cuda_dev_use, thereby marking the respective GPUs as "available for use."
 + * Note that \p detect_cuda_gpus must have been called before.
 + *
 + * \param[in]     gpu_info    pointer to structure holding GPU information
 + * \param[in,out] gpu_opt     pointer to structure holding GPU options
 + */
 +void pick_compatible_gpus(const gmx_gpu_info_t *gpu_info,
 +                          gmx_gpu_opt_t *gpu_opt)
 +{
 +    int i, ncompat;
 +    int *compat;
 +
 +    assert(gpu_info);
 +    /* cuda_dev and ncuda_dev must either both be set or both be unset (NULL/0) */
 +    assert((gpu_info->ncuda_dev != 0 ? 0 : 1) ^ (gpu_info->cuda_dev == NULL ? 0 : 1));
 +
 +    snew(compat, gpu_info->ncuda_dev);
 +    ncompat = 0;
 +    for (i = 0; i < gpu_info->ncuda_dev; i++)
 +    {
 +        if (is_compatible_gpu(gpu_info->cuda_dev[i].stat))
 +        {
 +            ncompat++;
 +            compat[ncompat - 1] = i;
 +        }
 +    }
 +
 +    gpu_opt->ncuda_dev_use = ncompat;
 +    snew(gpu_opt->cuda_dev_use, ncompat);
 +    memcpy(gpu_opt->cuda_dev_use, compat, ncompat*sizeof(*compat));
 +    sfree(compat);
 +}
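 +
 +/* Illustrative sketch: a hypothetical driver routine (not part of this
 + * change, error handling trimmed) showing the intended call sequence:
 + * detect devices, filter for compatibility, then initialize the first
 + * usable GPU.
 + */
 +static gmx_bool example_init_first_gpu(gmx_gpu_info_t *gpu_info,
 +                                       gmx_gpu_opt_t  *gpu_opt)
 +{
 +    char err_str[STRLEN], result_str[STRLEN];
 +
 +    if (detect_cuda_gpus(gpu_info, err_str) != 0)
 +    {
 +        return FALSE;
 +    }
 +    pick_compatible_gpus(gpu_info, gpu_opt);
 +
 +    return (gpu_opt->ncuda_dev_use > 0 &&
 +            init_gpu(0, result_str, gpu_info, gpu_opt));
 +}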
 +
 +/*! \brief Check the existence/compatibility of a set of GPUs specified by their device IDs.
 + *
 + * Given a list of gpu_opt->ncuda_dev_use GPU device IDs stored in
 + * gpu_opt->cuda_dev_use, check the existence and compatibility
 + * of the respective GPUs. Also provide the caller with an array containing
 + * the result of the checks in \p checkres.
 + *
 + * \param[out]  checkres    check result for each ID passed in \p gpu_opt->cuda_dev_use
 + * \param[in]   gpu_info    pointer to structure holding GPU information
 + * \param[out]  gpu_opt     pointer to structure holding GPU options
 + * \returns                 TRUE if all requested GPUs are compatible
 + */
 +gmx_bool check_selected_cuda_gpus(int *checkres,
 +                                  const gmx_gpu_info_t *gpu_info,
 +                                  gmx_gpu_opt_t *gpu_opt)
 +{
 +    int i, id;
 +    bool bAllOk;
 +
 +    assert(checkres);
 +    assert(gpu_info);
 +    assert(gpu_opt->ncuda_dev_use >= 0);
 +
 +    if (gpu_opt->ncuda_dev_use == 0)
 +    {
 +        return TRUE;
 +    }
 +
 +    assert(gpu_opt->cuda_dev_use);
 +
 +    /* we will assume that all GPUs requested are valid IDs,
 +       otherwise we'll bail anyway */
 +
 +    bAllOk = true;
 +    for (i = 0; i < gpu_opt->ncuda_dev_use; i++)
 +    {
 +        id = gpu_opt->cuda_dev_use[i];
 +
 +        /* devices are stored in increasing order of IDs in cuda_dev */
 +        gpu_opt->cuda_dev_use[i] = id;
 +
 +        checkres[i] = (id >= gpu_info->ncuda_dev) ?
 +            egpuNonexistent : gpu_info->cuda_dev[id].stat;
 +
 +        bAllOk = bAllOk && is_compatible_gpu(checkres[i]);
 +    }
 +
 +    return bAllOk;
 +}
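 +
 +/* Illustrative sketch: a hypothetical caller (not part of this change)
 + * validating a user GPU selection and reporting the per-device results
 + * when the check fails.
 + */
 +static void example_report_selection(const gmx_gpu_info_t *gpu_info,
 +                                     gmx_gpu_opt_t *gpu_opt)
 +{
 +    int *checkres;
 +    int  i;
 +
 +    snew(checkres, gpu_opt->ncuda_dev_use);
 +    if (!check_selected_cuda_gpus(checkres, gpu_info, gpu_opt))
 +    {
 +        for (i = 0; i < gpu_opt->ncuda_dev_use; i++)
 +        {
 +            fprintf(stderr, "GPU #%d: %s\n", gpu_opt->cuda_dev_use[i],
 +                    gpu_detect_res_str[checkres[i]]);
 +        }
 +    }
 +    sfree(checkres);
 +}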
 +
 +/*! \brief Frees the cuda_dev array field of \p gpu_info.
 + *
 + * \param[in]    gpu_info    pointer to structure holding GPU information
 + */
 +void free_gpu_info(const gmx_gpu_info_t *gpu_info)
 +{
 +    if (gpu_info == NULL)
 +    {
 +        return;
 +    }
 +
 +    sfree(gpu_info->cuda_dev);
 +}
 +
 +/*! \brief Formats and returns a device information string for a given GPU.
 + *
 + * Given an index *directly* into the array of available GPUs (cuda_dev)
 + * returns a formatted info string for the respective GPU which includes
 + * ID, name, compute capability, and detection status.
 + *
 + * \param[out]  s           pointer to output string (has to be allocated externally)
 + * \param[in]   gpu_info    pointer to structure holding GPU information
 + * \param[in]   index       an index *directly* into the array of available GPUs
 + */
 +void get_gpu_device_info_string(char *s, const gmx_gpu_info_t *gpu_info, int index)
 +{
 +    assert(s);
 +    assert(gpu_info);
 +
 +    if (index < 0 || index >= gpu_info->ncuda_dev)
 +    {
 +        return;
 +    }
 +
 +    cuda_dev_info_t *dinfo = &gpu_info->cuda_dev[index];
 +
 +    bool bGpuExists =
 +        dinfo->stat == egpuCompatible ||
 +        dinfo->stat == egpuIncompatible;
 +
 +    if (!bGpuExists)
 +    {
 +        sprintf(s, "#%d: %s, stat: %s",
 +                dinfo->id, "N/A",
 +                gpu_detect_res_str[dinfo->stat]);
 +    }
 +    else
 +    {
 +        sprintf(s, "#%d: NVIDIA %s, compute cap.: %d.%d, ECC: %3s, stat: %s",
 +                dinfo->id, dinfo->prop.name,
 +                dinfo->prop.major, dinfo->prop.minor,
 +                dinfo->prop.ECCEnabled ? "yes" : " no",
 +                gpu_detect_res_str[dinfo->stat]);
 +    }
 +}
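 +
 +/* Illustrative sketch: a hypothetical helper (not part of this change)
 + * printing one info line per detected device.
 + */
 +static void example_print_all_gpus(const gmx_gpu_info_t *gpu_info)
 +{
 +    char stmp[STRLEN];
 +    int  i;
 +
 +    for (i = 0; i < gpu_info->ncuda_dev; i++)
 +    {
 +        get_gpu_device_info_string(stmp, gpu_info, i);
 +        fprintf(stderr, "  %s\n", stmp);
 +    }
 +}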
 +
 +/*! \brief Returns the device ID of the GPU with a given index into the array of used GPUs.
 + *
 + * Getter function which, given an index into the array of GPUs in use
 + * (cuda_dev_use), typically a tMPI/MPI rank, returns the device ID of the
 + * respective CUDA GPU.
 + *
 + * \param[in]    gpu_info   pointer to structure holding GPU information
 + * \param[in]    gpu_opt    pointer to structure holding GPU options
 + * \param[in]    idx        index into the array of used GPUs
 + * \returns                 device ID of the requested GPU
 + */
 +int get_gpu_device_id(const gmx_gpu_info_t *gpu_info,
 +                      const gmx_gpu_opt_t *gpu_opt,
 +                      int idx)
 +{
 +    assert(gpu_info);
 +    assert(gpu_opt);
++    assert(idx >= 0 && idx < gpu_opt->ncuda_dev_use);
 +
 +    return gpu_info->cuda_dev[gpu_opt->cuda_dev_use[idx]].id;
 +}
 +
 +/*! \brief Returns the device ID of the GPU currently in use.
 + *
 + * The GPU used is the one that is active in the current context at the time of the call.
 + *
 + * \returns                 device ID of the GPU in use at the time of the call
 + */
 +int get_current_gpu_device_id(void)
 +{
 +    int gpuid;
 +    CU_RET_ERR(cudaGetDevice(&gpuid), "cudaGetDevice failed");
 +
 +    return gpuid;
 +}
 +
 +/*! \brief Returns the size of the cuda_dev_info struct.
 + *
 + * The size of cuda_dev_info can be used for allocation and communication.
 + *
 + * \returns                 size in bytes of cuda_dev_info
 + */
 +size_t sizeof_cuda_dev_info(void)
 +{
 +    return sizeof(cuda_dev_info);
 +}
index 07858aaafbf6259427fc5ddce5c2353a45e5fcd6,0000000000000000000000000000000000000000..c6c976ed5d74242bbeddb172f57beb63cd0f8d1f
mode 100644,000000..100644
--- /dev/null
@@@ -1,272 -1,0 +1,272 @@@
-     return (bPert ? ilsortFE_UNSORTED : ilsortNO_FE);
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-  */
 +
 +/*
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2008, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include "typedefs.h"
 +#include "topsort.h"
 +#include "smalloc.h"
 +#include "gmx_fatal.h"
 +
 +static gmx_bool ip_pert(int ftype, const t_iparams *ip)
 +{
 +    gmx_bool bPert;
 +    int      i;
 +
 +    if (NRFPB(ftype) == 0)
 +    {
 +        return FALSE;
 +    }
 +
 +    switch (ftype)
 +    {
 +        case F_BONDS:
 +        case F_G96BONDS:
 +        case F_HARMONIC:
 +        case F_ANGLES:
 +        case F_G96ANGLES:
 +        case F_IDIHS:
 +            bPert = (ip->harmonic.rA  != ip->harmonic.rB ||
 +                     ip->harmonic.krA != ip->harmonic.krB);
 +            break;
 +        case F_MORSE:
 +            bPert = (ip->morse.b0A  != ip->morse.b0B ||
 +                     ip->morse.cbA  != ip->morse.cbB ||
 +                     ip->morse.betaA  != ip->morse.betaB);
 +            break;
 +        case F_RESTRBONDS:
 +            bPert = (ip->restraint.lowA  != ip->restraint.lowB ||
 +                     ip->restraint.up1A  != ip->restraint.up1B ||
 +                     ip->restraint.up2A  != ip->restraint.up2B ||
 +                     ip->restraint.kA    != ip->restraint.kB);
 +            break;
 +        case F_UREY_BRADLEY:
 +            bPert = (ip->u_b.thetaA  != ip->u_b.thetaB  ||
 +                     ip->u_b.kthetaA != ip->u_b.kthetaB ||
 +                     ip->u_b.r13A    != ip->u_b.r13B    ||
 +                     ip->u_b.kUBA    != ip->u_b.kUBB);
 +            break;
 +        case F_PDIHS:
 +        case F_PIDIHS:
 +        case F_ANGRES:
 +        case F_ANGRESZ:
 +            bPert = (ip->pdihs.phiA != ip->pdihs.phiB ||
 +                     ip->pdihs.cpA  != ip->pdihs.cpB);
 +            break;
 +        case F_RBDIHS:
 +            bPert = FALSE;
 +            for (i = 0; i < NR_RBDIHS; i++)
 +            {
 +                if (ip->rbdihs.rbcA[i] != ip->rbdihs.rbcB[i])
 +                {
 +                    bPert = TRUE;
 +                }
 +            }
 +            break;
 +        case F_TABBONDS:
 +        case F_TABBONDSNC:
 +        case F_TABANGLES:
 +        case F_TABDIHS:
 +            bPert = (ip->tab.kA != ip->tab.kB);
 +            break;
 +        case F_POSRES:
 +            bPert = FALSE;
 +            for (i = 0; i < DIM; i++)
 +            {
 +                if (ip->posres.pos0A[i] != ip->posres.pos0B[i] ||
 +                    ip->posres.fcA[i]   != ip->posres.fcB[i])
 +                {
 +                    bPert = TRUE;
 +                }
 +            }
 +            break;
 +        case F_DIHRES:
 +            bPert = ((ip->dihres.phiA != ip->dihres.phiB) ||
 +                     (ip->dihres.dphiA != ip->dihres.dphiB) ||
 +                     (ip->dihres.kfacA != ip->dihres.kfacB));
 +            break;
 +        case F_LJ14:
 +            bPert = (ip->lj14.c6A  != ip->lj14.c6B ||
 +                     ip->lj14.c12A != ip->lj14.c12B);
 +            break;
 +        case F_CMAP:
 +            bPert = FALSE;
 +            break;
 +        default:
 +            bPert = FALSE;
 +            gmx_fatal(FARGS, "Function type %s not implemented in ip_pert",
 +                      interaction_function[ftype].longname);
 +    }
 +
 +    return bPert;
 +}
 +
 +static gmx_bool ip_q_pert(int ftype, const t_iatom *ia,
 +                          const t_iparams *ip, const real *qA, const real *qB)
 +{
 +    /* 1-4 interactions do not have the charges stored in the iparams list,
 +     * so we need a separate check for those.
 +     */
 +    return (ip_pert(ftype, ip+ia[0]) ||
 +            (ftype == F_LJ14 && (qA[ia[1]] != qB[ia[1]] ||
 +                                 qA[ia[2]] != qB[ia[2]])));
 +}
 +
 +gmx_bool gmx_mtop_bondeds_free_energy(const gmx_mtop_t *mtop)
 +{
 +    const gmx_ffparams_t *ffparams;
 +    int                   i, ftype;
 +    int                   mb;
 +    t_atom               *atom;
 +    t_ilist              *il;
 +    t_iatom              *ia;
 +    gmx_bool              bPert;
 +
 +    ffparams = &mtop->ffparams;
 +
 +    /* Loop over all the function types and compare the A/B parameters */
 +    bPert = FALSE;
 +    for (i = 0; i < ffparams->ntypes; i++)
 +    {
 +        ftype = ffparams->functype[i];
 +        if (interaction_function[ftype].flags & IF_BOND)
 +        {
 +            if (ip_pert(ftype, &ffparams->iparams[i]))
 +            {
 +                bPert = TRUE;
 +            }
 +        }
 +    }
 +
 +    /* Check perturbed charges for 1-4 interactions */
 +    for (mb = 0; mb < mtop->nmolblock; mb++)
 +    {
 +        atom = mtop->moltype[mtop->molblock[mb].type].atoms.atom;
 +        il   = &mtop->moltype[mtop->molblock[mb].type].ilist[F_LJ14];
 +        ia   = il->iatoms;
 +        for (i = 0; i < il->nr; i += 3)
 +        {
 +            if (atom[ia[i+1]].q != atom[ia[i+1]].qB ||
 +                atom[ia[i+2]].q != atom[ia[i+2]].qB)
 +            {
 +                bPert = TRUE;
 +            }
 +        }
 +    }
 +
++    return bPert;
 +}
 +
 +void gmx_sort_ilist_fe(t_idef *idef, const real *qA, const real *qB)
 +{
 +    int        ftype, nral, i, ic, ib, a;
 +    t_iparams *iparams;
 +    t_ilist   *ilist;
 +    t_iatom   *iatoms;
 +    gmx_bool   bPert;
 +    t_iatom   *iabuf;
 +    int        iabuf_nalloc;
 +
 +    if (qB == NULL)
 +    {
 +        qB = qA;
 +    }
 +
 +    iabuf_nalloc = 0;
 +    iabuf        = NULL;
 +
 +    iparams = idef->iparams;
 +
 +    for (ftype = 0; ftype < F_NRE; ftype++)
 +    {
 +        if (interaction_function[ftype].flags & IF_BOND)
 +        {
 +            ilist  = &idef->il[ftype];
 +            iatoms = ilist->iatoms;
 +            nral   = NRAL(ftype);
 +            ic     = 0;
 +            ib     = 0;
 +            i      = 0;
 +            while (i < ilist->nr)
 +            {
 +                /* Check if this interaction is perturbed */
 +                if (ip_q_pert(ftype, iatoms+i, iparams, qA, qB))
 +                {
 +                    /* Copy to the perturbed buffer */
 +                    if (ib + 1 + nral > iabuf_nalloc)
 +                    {
 +                        iabuf_nalloc = over_alloc_large(ib+1+nral);
 +                        srenew(iabuf, iabuf_nalloc);
 +                    }
 +                    for (a = 0; a < 1+nral; a++)
 +                    {
 +                        iabuf[ib++] = iatoms[i++];
 +                    }
 +                }
 +                else
 +                {
 +                    /* Copy in place */
 +                    for (a = 0; a < 1+nral; a++)
 +                    {
 +                        iatoms[ic++] = iatoms[i++];
 +                    }
 +                }
 +            }
 +            /* Now we know the number of non-perturbed interactions */
 +            ilist->nr_nonperturbed = ic;
 +
 +            /* Copy the buffer with perturbed interactions to the ilist */
 +            for (a = 0; a < ib; a++)
 +            {
 +                iatoms[ic++] = iabuf[a];
 +            }
 +
 +            if (debug)
 +            {
 +                fprintf(debug, "%s non-pert %d pert %d\n",
 +                        interaction_function[ftype].longname,
 +                        ilist->nr_nonperturbed,
 +                        ilist->nr-ilist->nr_nonperturbed);
 +            }
 +        }
 +    }
 +
 +    sfree(iabuf);
 +
 +    idef->ilsort = ilsortFE_SORTED;
 +}
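 +
 +/* Illustrative sketch: a hypothetical caller (not part of this change)
 + * combining the two entry points of this file: sort the interaction lists
 + * only when the topology actually has perturbed bonded interactions.
 + */
 +static void example_sort_if_perturbed(const gmx_mtop_t *mtop, t_idef *idef,
 +                                      const real *qA, const real *qB)
 +{
 +    if (gmx_mtop_bondeds_free_energy(mtop))
 +    {
 +        /* qB == NULL makes the sort fall back to qA for the B-state charges */
 +        gmx_sort_ilist_fe(idef, qA, qB);
 +    }
 +}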
index 6179b34150193f7200c9cfc3359e798f1c4ddd4d,0000000000000000000000000000000000000000..649e4b7b14ec91c2f8a00530ce593d673987edd6
mode 100644,000000..100644
--- /dev/null
@@@ -1,58 -1,0 +1,58 @@@
- /* Returns if the are bonded interactions for free energy calculations */
 +/*
 + * This file is part of the GROMACS molecular simulation package.
 + *
 + * Copyright (c) 2008,2009,2010, by the GROMACS development team, led by
 + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 + * and including many others, as listed in the AUTHORS file in the
 + * top-level source directory and at http://www.gromacs.org.
 + *
 + * GROMACS is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2.1
 + * of the License, or (at your option) any later version.
 + *
 + * GROMACS is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with GROMACS; if not, see
 + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 + *
 + * If you want to redistribute modifications to GROMACS, please
 + * consider that scientific software is very special. Version
 + * control is crucial - bugs must be traceable. We will be happy to
 + * consider code for inclusion in the official distribution, but
 + * derived work must not be called official GROMACS. Details are found
 + * in the README & COPYING files - if they are missing, get the
 + * official version at http://www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org.
 + */
 +#ifndef _topsort_h
 +#define _topsort_h
 +
 +
 +#include "typedefs.h"
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +
 +
++/* Returns if there are perturbed bonded interactions */
 +gmx_bool gmx_mtop_bondeds_free_energy(const gmx_mtop_t *mtop);
 +
 +/* Sort all the bonded ilists in idef to have the perturbed ones at the end
 + * and set nr_nonperturbed in ilist.
 + */
 +void gmx_sort_ilist_fe(t_idef *idef, const real *qA, const real *qB);
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +#endif
index 0e58da49b30a2bd92b53aed199f876352733b2fb,0000000000000000000000000000000000000000..886bbcf813d3be36a9f4d7a36f95f68b014a22db
mode 100644,000000..100644
--- /dev/null
@@@ -1,9846 -1,0 +1,9846 @@@
-     gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->nodeid);
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + * This file is part of Gromacs        Copyright (c) 1991-2008
 + * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org
 + *
 + * And Hey:
 + * Gnomes, ROck Monsters And Chili Sauce
 + */
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <stdio.h>
 +#include <time.h>
 +#include <math.h>
 +#include <string.h>
 +#include <stdlib.h>
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "gmx_fatal.h"
 +#include "gmx_fatal_collective.h"
 +#include "vec.h"
 +#include "domdec.h"
 +#include "domdec_network.h"
 +#include "nrnb.h"
 +#include "pbc.h"
 +#include "chargegroup.h"
 +#include "constr.h"
 +#include "mdatoms.h"
 +#include "names.h"
 +#include "force.h"
 +#include "pme.h"
 +#include "pull.h"
 +#include "pull_rotation.h"
 +#include "mdrun.h"
 +#include "nsgrid.h"
 +#include "shellfc.h"
 +#include "mtop_util.h"
 +#include "gmx_ga2la.h"
 +#include "gmx_sort.h"
 +#include "macros.h"
 +#include "nbnxn_search.h"
 +#include "bondf.h"
 +#include "gmx_omp_nthreads.h"
 +#include "gpu_utils.h"
 +
 +#include "gromacs/fileio/futil.h"
 +#include "gromacs/fileio/gmxfio.h"
 +#include "gromacs/fileio/pdbio.h"
 +#include "gromacs/timing/wallcycle.h"
 +#include "gromacs/utility/gmxmpi.h"
 +
 +#define DDRANK(dd, rank)    (rank)
 +#define DDMASTERRANK(dd)   (dd->masterrank)
 +
 +typedef struct gmx_domdec_master
 +{
 +    /* The cell boundaries */
 +    real **cell_x;
 +    /* The global charge group division */
 +    int   *ncg;    /* Number of home charge groups for each node */
 +    int   *index;  /* Index of nnodes+1 into cg */
 +    int   *cg;     /* Global charge group index */
 +    int   *nat;    /* Number of home atoms for each node. */
 +    int   *ibuf;   /* Buffer for communication */
 +    rvec  *vbuf;   /* Buffer for state scattering and gathering */
 +} gmx_domdec_master_t;
 +
 +typedef struct
 +{
 +    /* The numbers of charge groups to send and receive for each cell
 +     * that requires communication, the last entry contains the total
 +     * number of atoms that needs to be communicated.
 +     */
 +    int  nsend[DD_MAXIZONE+2];
 +    int  nrecv[DD_MAXIZONE+2];
 +    /* The charge groups to send */
 +    int *index;
 +    int  nalloc;
 +    /* The atom range for non-in-place communication */
 +    int  cell2at0[DD_MAXIZONE];
 +    int  cell2at1[DD_MAXIZONE];
 +} gmx_domdec_ind_t;
 +
 +typedef struct
 +{
 +    int               np;       /* Number of grid pulses in this dimension */
 +    int               np_dlb;   /* For dlb, for use with edlbAUTO          */
 +    gmx_domdec_ind_t *ind;      /* The indices to communicate, size np     */
 +    int               np_nalloc;
 +    gmx_bool          bInPlace; /* Can we communicate in place?            */
 +} gmx_domdec_comm_dim_t;
 +
 +typedef struct
 +{
 +    gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
 +    real     *cell_f;      /* State var.: cell boundaries, box relative      */
 +    real     *old_cell_f;  /* Temp. var.: old cell size                      */
 +    real     *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
 +    real     *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
 +    real     *bound_min;   /* Temp. var.: lower limit for cell boundary      */
 +    real     *bound_max;   /* Temp. var.: upper limit for cell boundary      */
 +    gmx_bool  bLimited;    /* State var.: is DLB limited in this dim and row */
 +    real     *buf_ncd;     /* Temp. var.                                     */
 +} gmx_domdec_root_t;
 +
 +#define DD_NLOAD_MAX 9
 +
 +/* Here floats are accurate enough, since these variables
 + * only influence the load balancing, not the actual MD results.
 + */
 +typedef struct
 +{
 +    int    nload;
 +    float *load;
 +    float  sum;
 +    float  max;
 +    float  sum_m;
 +    float  cvol_min;
 +    float  mdf;
 +    float  pme;
 +    int    flags;
 +} gmx_domdec_load_t;
 +
 +typedef struct
 +{
 +    int  nsc;
 +    int  ind_gl;
 +    int  ind;
 +} gmx_cgsort_t;
 +
 +typedef struct
 +{
 +    gmx_cgsort_t *sort;
 +    gmx_cgsort_t *sort2;
 +    int           sort_nalloc;
 +    gmx_cgsort_t *sort_new;
 +    int           sort_new_nalloc;
 +    int          *ibuf;
 +    int           ibuf_nalloc;
 +} gmx_domdec_sort_t;
 +
 +typedef struct
 +{
 +    rvec *v;
 +    int   nalloc;
 +} vec_rvec_t;
 +
 +/* This enum determines the order of the coordinates.
 + * ddnatHOME and ddnatZONE should be first and second,
 + * the others can be ordered as wanted.
 + */
 +enum {
 +    ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR
 +};
 +
 +enum {
 +    edlbAUTO, edlbNO, edlbYES, edlbNR
 +};
 +const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
 +
 +typedef struct
 +{
 +    int      dim;       /* The dimension                                          */
 +    gmx_bool dim_match; /* Tells if DD and PME dims match                         */
 +    int      nslab;     /* The number of PME slabs in this dimension              */
 +    real    *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
 +    int     *pp_min;    /* The minimum pp node location, size nslab               */
 +    int     *pp_max;    /* The maximum pp node location, size nslab               */
 +    int      maxshift;  /* The maximum shift for coordinate redistribution in PME */
 +} gmx_ddpme_t;
 +
 +typedef struct
 +{
 +    real min0;    /* The minimum bottom of this zone                        */
 +    real max1;    /* The maximum top of this zone                           */
 +    real min1;    /* The minimum top of this zone                           */
 +    real mch0;    /* The maximum bottom communication height for this zone  */
 +    real mch1;    /* The maximum top communication height for this zone     */
 +    real p1_0;    /* The bottom value of the first cell in this zone        */
 +    real p1_1;    /* The top value of the first cell in this zone           */
 +} gmx_ddzone_t;
 +
 +typedef struct
 +{
 +    gmx_domdec_ind_t ind;
 +    int             *ibuf;
 +    int              ibuf_nalloc;
 +    vec_rvec_t       vbuf;
 +    int              nsend;
 +    int              nat;
 +    int              nsend_zone;
 +} dd_comm_setup_work_t;
 +
 +typedef struct gmx_domdec_comm
 +{
 +    /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
 +     * unless stated otherwise.
 +     */
 +
 +    /* The number of decomposition dimensions for PME, 0: no PME */
 +    int         npmedecompdim;
 +    /* The number of nodes doing PME (PP/PME or only PME) */
 +    int         npmenodes;
 +    int         npmenodes_x;
 +    int         npmenodes_y;
 +    /* The communication setup including the PME only nodes */
 +    gmx_bool    bCartesianPP_PME;
 +    ivec        ntot;
 +    int         cartpmedim;
 +    int        *pmenodes;          /* size npmenodes                         */
 +    int        *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
 +                                    * but with bCartesianPP_PME              */
 +    gmx_ddpme_t ddpme[2];
 +
 +    /* The DD particle-particle nodes only */
 +    gmx_bool bCartesianPP;
 +    int     *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
 +
 +    /* The global charge groups */
 +    t_block cgs_gl;
 +
 +    /* Should we sort the cgs */
 +    int                nstSortCG;
 +    gmx_domdec_sort_t *sort;
 +
 +    /* Are there charge groups? */
 +    gmx_bool bCGs;
 +
 +    /* Are there bonded and multi-body interactions between charge groups? */
 +    gmx_bool bInterCGBondeds;
 +    gmx_bool bInterCGMultiBody;
 +
 +    /* Data for the optional bonded interaction atom communication range */
 +    gmx_bool  bBondComm;
 +    t_blocka *cglink;
 +    char     *bLocalCG;
 +
 +    /* The DLB option */
 +    int      eDLB;
 +    /* Are we actually using DLB? */
 +    gmx_bool bDynLoadBal;
 +
 +    /* Cell sizes for static load balancing, first index cartesian */
 +    real **slb_frac;
 +
 +    /* The width of the communicated boundaries */
 +    real     cutoff_mbody;
 +    real     cutoff;
 +    /* The minimum cell size (including triclinic correction) */
 +    rvec     cellsize_min;
 +    /* For dlb, for use with edlbAUTO */
 +    rvec     cellsize_min_dlb;
 +    /* The lower limit for the DD cell size with DLB */
 +    real     cellsize_limit;
 +    /* Effectively no NB cut-off limit with DLB for systems without PBC? */
 +    gmx_bool bVacDLBNoLimit;
 +
 +    /* With PME load balancing we set limits on DLB */
 +    gmx_bool bPMELoadBalDLBLimits;
 +    /* DLB needs to take into account that we want to allow this maximum
 +     * cut-off (for PME load balancing), this could limit cell boundaries.
 +     */
 +    real PMELoadBal_max_cutoff;
 +
 +    /* tric_dir is only stored here because dd_get_ns_ranges needs it */
 +    ivec tric_dir;
 +    /* box0 and box_size are required with dim's without pbc and -gcom */
 +    rvec box0;
 +    rvec box_size;
 +
 +    /* The cell boundaries */
 +    rvec cell_x0;
 +    rvec cell_x1;
 +
 +    /* The old location of the cell boundaries, to check cg displacements */
 +    rvec old_cell_x0;
 +    rvec old_cell_x1;
 +
 +    /* The communication setup and charge group boundaries for the zones */
 +    gmx_domdec_zones_t zones;
 +
 +    /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
 +     * cell boundaries of neighboring cells for dynamic load balancing.
 +     */
 +    gmx_ddzone_t zone_d1[2];
 +    gmx_ddzone_t zone_d2[2][2];
 +
 +    /* The coordinate/force communication setup and indices */
 +    gmx_domdec_comm_dim_t cd[DIM];
 +    /* The maximum number of cells to communicate with in one dimension */
 +    int                   maxpulse;
 +
 +    /* Which cg distribution is stored on the master node */
 +    int master_cg_ddp_count;
 +
 +    /* The number of cg's received from the direct neighbors */
 +    int  zone_ncg1[DD_MAXZONE];
 +
 +    /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
 +    int  nat[ddnatNR];
 +
 +    /* Array for signalling if atoms have moved to another domain */
 +    int  *moved;
 +    int   moved_nalloc;
 +
 +    /* Communication buffer for general use */
 +    int  *buf_int;
 +    int   nalloc_int;
 +
 +    /* Communication buffer for general use */
 +    vec_rvec_t vbuf;
 +
 +    /* Temporary storage for thread parallel communication setup */
 +    int                   nth;
 +    dd_comm_setup_work_t *dth;
 +
 +    /* Communication buffers only used with multiple grid pulses */
 +    int       *buf_int2;
 +    int        nalloc_int2;
 +    vec_rvec_t vbuf2;
 +
 +    /* Communication buffers for local redistribution */
 +    int  **cggl_flag;
 +    int    cggl_flag_nalloc[DIM*2];
 +    rvec **cgcm_state;
 +    int    cgcm_state_nalloc[DIM*2];
 +
 +    /* Cell sizes for dynamic load balancing */
 +    gmx_domdec_root_t **root;
 +    real               *cell_f_row;
 +    real                cell_f0[DIM];
 +    real                cell_f1[DIM];
 +    real                cell_f_max0[DIM];
 +    real                cell_f_min1[DIM];
 +
 +    /* Stuff for load communication */
 +    gmx_bool           bRecordLoad;
 +    gmx_domdec_load_t *load;
 +    int                nrank_gpu_shared;
 +#ifdef GMX_MPI
 +    MPI_Comm          *mpi_comm_load;
 +    MPI_Comm           mpi_comm_gpu_shared;
 +#endif
 +
 +    /* Maximum DLB scaling per load balancing step in percent */
 +    int dlb_scale_lim;
 +
 +    /* Cycle counters */
 +    float  cycl[ddCyclNr];
 +    int    cycl_n[ddCyclNr];
 +    float  cycl_max[ddCyclNr];
 +    /* Flop counter (0=no, 1=yes, 2=with (eFlop-1)*5% noise) */
 +    int    eFlop;
 +    double flop;
 +    int    flop_n;
 +    /* How often we have had load measurements */
 +    int    n_load_have;
 +    /* How often we have collected the load measurements */
 +    int    n_load_collect;
 +
 +    /* Statistics */
 +    double sum_nat[ddnatNR-ddnatZONE];
 +    int    ndecomp;
 +    int    nload;
 +    double load_step;
 +    double load_sum;
 +    double load_max;
 +    ivec   load_lim;
 +    double load_mdf;
 +    double load_pme;
 +
 +    /* The last partition step */
 +    gmx_large_int_t partition_step;
 +
 +    /* Debugging */
 +    int  nstDDDump;
 +    int  nstDDDumpGrid;
 +    int  DD_debug;
 +} gmx_domdec_comm_t;
 +
 +/* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
 +#define DD_CGIBS 2
 +
 +/* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 +#define DD_FLAG_NRCG  65535
 +#define DD_FLAG_FW(d) (1<<(16+(d)*2))
 +#define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 +
 +/* Zone permutation required to obtain consecutive charge groups
 + * for neighbor searching.
 + */
 +static const int zone_perm[3][4] = { {0, 0, 0, 0}, {1, 0, 0, 0}, {3, 0, 1, 2} };
 +
 +/* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
 + * components see only j zones with that component 0.
 + */
 +
 +/* The DD zone order */
 +static const ivec dd_zo[DD_MAXZONE] =
 +{{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
 +
 +/* The 3D setup */
 +#define dd_z3n  8
 +#define dd_zp3n 4
 +static const ivec dd_zp3[dd_zp3n] = {{0, 0, 8}, {1, 3, 6}, {2, 5, 6}, {3, 5, 7}};
 +
 +/* The 2D setup */
 +#define dd_z2n  4
 +#define dd_zp2n 2
 +static const ivec dd_zp2[dd_zp2n] = {{0, 0, 4}, {1, 3, 4}};
 +
 +/* The 1D setup */
 +#define dd_z1n  2
 +#define dd_zp1n 1
 +static const ivec dd_zp1[dd_zp1n] = {{0, 0, 2}};
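 +
 +/* Note: each dd_zpXn triplet above appears to encode {i-zone, first j-zone,
 + * last j-zone + 1}, i.e. the half-open range of j zones searched against
 + * that i zone; e.g. {1, 3, 6} pairs i-zone 1 with j zones 3, 4 and 5.
 + */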
 +
 +/* Factors used to avoid problems due to rounding issues */
 +#define DD_CELL_MARGIN       1.0001
 +#define DD_CELL_MARGIN2      1.00005
 +/* Factor to account for pressure scaling during nstlist steps */
 +#define DD_PRES_SCALE_MARGIN 1.02
 +
 +/* Allowed performance loss before we DLB or warn */
 +#define DD_PERF_LOSS 0.05
 +
 +#define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
 +
 +/* Use separate MPI send and receive commands
 + * when nnodes <= GMX_DD_NNODES_SENDRECV.
 + * This saves memory (and some copying for small nnodes).
 + * For high parallelization scatter and gather calls are used.
 + */
 +#define GMX_DD_NNODES_SENDRECV 4
 +
 +
 +/*
 +   #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 +
 +   static void index2xyz(ivec nc,int ind,ivec xyz)
 +   {
 +   xyz[XX] = ind % nc[XX];
 +   xyz[YY] = (ind / nc[XX]) % nc[YY];
 +   xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 +   }
 + */
 +
 +/* This order is required to minimize the coordinate communication in PME
 + * which uses decomposition in the x direction.
 + */
 +#define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
 +
 +static void ddindex2xyz(ivec nc, int ind, ivec xyz)
 +{
 +    xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 +    xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 +    xyz[ZZ] = ind % nc[ZZ];
 +}
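 +
 +/* Illustrative example (not part of the build): with nc = {4, 3, 2},
 + * dd_index(nc, {1, 2, 0}) = ((1*3 + 2)*2) + 0 = 10, and ddindex2xyz
 + * inverts this: xyz[XX] = 10/(3*2) = 1, xyz[YY] = (10/2) % 3 = 2,
 + * xyz[ZZ] = 10 % 2 = 0.
 + */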
 +
 +static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
 +{
 +    int ddindex;
 +    int ddnodeid = -1;
 +
 +    ddindex = dd_index(dd->nc, c);
 +    if (dd->comm->bCartesianPP_PME)
 +    {
 +        ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 +    }
 +    else if (dd->comm->bCartesianPP)
 +    {
 +#ifdef GMX_MPI
 +        MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
 +#endif
 +    }
 +    else
 +    {
 +        ddnodeid = ddindex;
 +    }
 +
 +    return ddnodeid;
 +}
 +
 +static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox, t_inputrec *ir)
 +{
 +    return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
 +}
 +
 +int ddglatnr(gmx_domdec_t *dd, int i)
 +{
 +    int atnr;
 +
 +    if (dd == NULL)
 +    {
 +        atnr = i + 1;
 +    }
 +    else
 +    {
 +        if (i >= dd->comm->nat[ddnatNR-1])
 +        {
 +            gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]);
 +        }
 +        atnr = dd->gatindex[i] + 1;
 +    }
 +
 +    return atnr;
 +}
 +
 +t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 +{
 +    return &dd->comm->cgs_gl;
 +}
 +
 +static void vec_rvec_init(vec_rvec_t *v)
 +{
 +    v->nalloc = 0;
 +    v->v      = NULL;
 +}
 +
 +static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
 +{
 +    if (n > v->nalloc)
 +    {
 +        v->nalloc = over_alloc_dd(n);
 +        srenew(v->v, v->nalloc);
 +    }
 +}
 +
 +void dd_store_state(gmx_domdec_t *dd, t_state *state)
 +{
 +    int i;
 +
 +    if (state->ddp_count != dd->ddp_count)
 +    {
 +        gmx_incons("The state does not match the domain decomposition state");
 +    }
 +
 +    state->ncg_gl = dd->ncg_home;
 +    if (state->ncg_gl > state->cg_gl_nalloc)
 +    {
 +        state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
 +        srenew(state->cg_gl, state->cg_gl_nalloc);
 +    }
 +    for (i = 0; i < state->ncg_gl; i++)
 +    {
 +        state->cg_gl[i] = dd->index_gl[i];
 +    }
 +
 +    state->ddp_count_cg_gl = dd->ddp_count;
 +}
 +
 +gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 +{
 +    return &dd->comm->zones;
 +}
 +
 +void dd_get_ns_ranges(gmx_domdec_t *dd, int icg,
 +                      int *jcg0, int *jcg1, ivec shift0, ivec shift1)
 +{
 +    gmx_domdec_zones_t *zones;
 +    int                 izone, d, dim;
 +
 +    zones = &dd->comm->zones;
 +
 +    izone = 0;
 +    while (icg >= zones->izone[izone].cg1)
 +    {
 +        izone++;
 +    }
 +
 +    if (izone == 0)
 +    {
 +        *jcg0 = icg;
 +    }
 +    else if (izone < zones->nizone)
 +    {
 +        *jcg0 = zones->izone[izone].jcg0;
 +    }
 +    else
 +    {
 +        gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
 +                  icg, izone, zones->nizone);
 +    }
 +
 +    *jcg1 = zones->izone[izone].jcg1;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim         = dd->dim[d];
 +        shift0[dim] = zones->izone[izone].shift0[dim];
 +        shift1[dim] = zones->izone[izone].shift1[dim];
 +        if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
 +        {
 +            /* A conservative approach, this can be optimized */
 +            shift0[dim] -= 1;
 +            shift1[dim] += 1;
 +        }
 +    }
 +}
 +
 +int dd_natoms_vsite(gmx_domdec_t *dd)
 +{
 +    return dd->comm->nat[ddnatVSITE];
 +}
 +
 +void dd_get_constraint_range(gmx_domdec_t *dd, int *at_start, int *at_end)
 +{
 +    *at_start = dd->comm->nat[ddnatCON-1];
 +    *at_end   = dd->comm->nat[ddnatCON];
 +}
 +
 +void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[])
 +{
 +    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 +    int                   *index, *cgindex;
 +    gmx_domdec_comm_t     *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t      *ind;
 +    rvec                   shift = {0, 0, 0}, *buf, *rbuf;
 +    gmx_bool               bPBC, bScrew;
 +
 +    comm = dd->comm;
 +
 +    cgindex = dd->cgindex;
 +
 +    buf = comm->vbuf.v;
 +
 +    nzone   = 1;
 +    nat_tot = dd->nat_home;
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        bPBC   = (dd->ci[dd->dim[d]] == 0);
 +        bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 +        if (bPBC)
 +        {
 +            copy_rvec(box[dd->dim[d]], shift);
 +        }
 +        cd = &comm->cd[d];
 +        for (p = 0; p < cd->np; p++)
 +        {
 +            ind   = &cd->ind[p];
 +            index = ind->index;
 +            n     = 0;
 +            if (!bPBC)
 +            {
 +                for (i = 0; i < ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for (j = at0; j < at1; j++)
 +                    {
 +                        copy_rvec(x[j], buf[n]);
 +                        n++;
 +                    }
 +                }
 +            }
 +            else if (!bScrew)
 +            {
 +                for (i = 0; i < ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for (j = at0; j < at1; j++)
 +                    {
 +                        /* We need to shift the coordinates */
 +                        rvec_add(x[j], shift, buf[n]);
 +                        n++;
 +                    }
 +                }
 +            }
 +            else
 +            {
 +                for (i = 0; i < ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for (j = at0; j < at1; j++)
 +                    {
 +                        /* Shift x */
 +                        buf[n][XX] = x[j][XX] + shift[XX];
 +                        /* Rotate y and z.
 +                         * This operation requires a special shift force
 +                         * treatment, which is performed in calc_vir.
 +                         */
 +                        buf[n][YY] = box[YY][YY] - x[j][YY];
 +                        buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 +                        n++;
 +                    }
 +                }
 +            }
 +
 +            if (cd->bInPlace)
 +            {
 +                rbuf = x + nat_tot;
 +            }
 +            else
 +            {
 +                rbuf = comm->vbuf2.v;
 +            }
 +            /* Send and receive the coordinates */
 +            dd_sendrecv_rvec(dd, d, dddirBackward,
 +                             buf,  ind->nsend[nzone+1],
 +                             rbuf, ind->nrecv[nzone+1]);
 +            if (!cd->bInPlace)
 +            {
 +                j = 0;
 +                for (zone = 0; zone < nzone; zone++)
 +                {
 +                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 +                    {
 +                        copy_rvec(rbuf[j], x[i]);
 +                        j++;
 +                    }
 +                }
 +            }
 +            nat_tot += ind->nrecv[nzone+1];
 +        }
 +        nzone += nzone;
 +    }
 +}
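 +
 +/* Sketch of the pattern in dd_move_x above: for each DD dimension in
 + * increasing order, the coordinates of the charge groups to be sent are
 + * packed into a buffer (shifted, or rotated for screw PBC, when the cell
 + * lies at a periodic boundary), exchanged backward pulse by pulse, and the
 + * received coordinates are appended after nat_tot; the number of zones
 + * handled doubles with each dimension.
 + */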
 +
 +void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
 +{
 +    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 +    int                   *index, *cgindex;
 +    gmx_domdec_comm_t     *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t      *ind;
 +    rvec                  *buf, *sbuf;
 +    ivec                   vis;
 +    int                    is;
 +    gmx_bool               bPBC, bScrew;
 +
 +    comm = dd->comm;
 +
 +    cgindex = dd->cgindex;
 +
 +    buf = comm->vbuf.v;
 +
 +    n       = 0;
 +    nzone   = comm->zones.n/2;
 +    nat_tot = dd->nat_tot;
 +    for (d = dd->ndim-1; d >= 0; d--)
 +    {
 +        bPBC   = (dd->ci[dd->dim[d]] == 0);
 +        bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 +        if (fshift == NULL && !bScrew)
 +        {
 +            bPBC = FALSE;
 +        }
 +        /* Determine which shift vector we need */
 +        clear_ivec(vis);
 +        vis[dd->dim[d]] = 1;
 +        is              = IVEC2IS(vis);
 +
 +        cd = &comm->cd[d];
 +        for (p = cd->np-1; p >= 0; p--)
 +        {
 +            ind      = &cd->ind[p];
 +            nat_tot -= ind->nrecv[nzone+1];
 +            if (cd->bInPlace)
 +            {
 +                sbuf = f + nat_tot;
 +            }
 +            else
 +            {
 +                sbuf = comm->vbuf2.v;
 +                j    = 0;
 +                for (zone = 0; zone < nzone; zone++)
 +                {
 +                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 +                    {
 +                        copy_rvec(f[i], sbuf[j]);
 +                        j++;
 +                    }
 +                }
 +            }
 +            /* Communicate the forces */
 +            dd_sendrecv_rvec(dd, d, dddirForward,
 +                             sbuf, ind->nrecv[nzone+1],
 +                             buf,  ind->nsend[nzone+1]);
 +            index = ind->index;
 +            /* Add the received forces */
 +            n = 0;
 +            if (!bPBC)
 +            {
 +                for (i = 0; i < ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for (j = at0; j < at1; j++)
 +                    {
 +                        rvec_inc(f[j], buf[n]);
 +                        n++;
 +                    }
 +                }
 +            }
 +            else if (!bScrew)
 +            {
 +                for (i = 0; i < ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for (j = at0; j < at1; j++)
 +                    {
 +                        rvec_inc(f[j], buf[n]);
 +                        /* Add this force to the shift force */
 +                        rvec_inc(fshift[is], buf[n]);
 +                        n++;
 +                    }
 +                }
 +            }
 +            else
 +            {
 +                for (i = 0; i < ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for (j = at0; j < at1; j++)
 +                    {
 +                        /* Rotate the force */
 +                        f[j][XX] += buf[n][XX];
 +                        f[j][YY] -= buf[n][YY];
 +                        f[j][ZZ] -= buf[n][ZZ];
 +                        if (fshift)
 +                        {
 +                            /* Add this force to the shift force */
 +                            rvec_inc(fshift[is], buf[n]);
 +                        }
 +                        n++;
 +                    }
 +                }
 +            }
 +        }
 +        nzone /= 2;
 +    }
 +}
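 +
 +/* dd_move_f reverses dd_move_x: dimensions and pulses are traversed in the
 + * opposite order, the forces on the received atoms are sent forward, and
 + * the results are accumulated onto the atoms that were originally sent,
 + * with the periodic contribution optionally added to fshift[is].
 + */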
 +
 +void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
 +{
 +    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 +    int                   *index, *cgindex;
 +    gmx_domdec_comm_t     *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t      *ind;
 +    real                  *buf, *rbuf;
 +
 +    comm = dd->comm;
 +
 +    cgindex = dd->cgindex;
 +
 +    buf = &comm->vbuf.v[0][0];
 +
 +    nzone   = 1;
 +    nat_tot = dd->nat_home;
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        cd = &comm->cd[d];
 +        for (p = 0; p < cd->np; p++)
 +        {
 +            ind   = &cd->ind[p];
 +            index = ind->index;
 +            n     = 0;
 +            for (i = 0; i < ind->nsend[nzone]; i++)
 +            {
 +                at0 = cgindex[index[i]];
 +                at1 = cgindex[index[i]+1];
 +                for (j = at0; j < at1; j++)
 +                {
 +                    buf[n] = v[j];
 +                    n++;
 +                }
 +            }
 +
 +            if (cd->bInPlace)
 +            {
 +                rbuf = v + nat_tot;
 +            }
 +            else
 +            {
 +                rbuf = &comm->vbuf2.v[0][0];
 +            }
 +            /* Send and receive the values */
 +            dd_sendrecv_real(dd, d, dddirBackward,
 +                             buf,  ind->nsend[nzone+1],
 +                             rbuf, ind->nrecv[nzone+1]);
 +            if (!cd->bInPlace)
 +            {
 +                j = 0;
 +                for (zone = 0; zone < nzone; zone++)
 +                {
 +                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 +                    {
 +                        v[i] = rbuf[j];
 +                        j++;
 +                    }
 +                }
 +            }
 +            nat_tot += ind->nrecv[nzone+1];
 +        }
 +        nzone += nzone;
 +    }
 +}
 +
 +void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
 +{
 +    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 +    int                   *index, *cgindex;
 +    gmx_domdec_comm_t     *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t      *ind;
 +    real                  *buf, *sbuf;
 +
 +    comm = dd->comm;
 +
 +    cgindex = dd->cgindex;
 +
 +    buf = &comm->vbuf.v[0][0];
 +
 +    n       = 0;
 +    nzone   = comm->zones.n/2;
 +    nat_tot = dd->nat_tot;
 +    for (d = dd->ndim-1; d >= 0; d--)
 +    {
 +        cd = &comm->cd[d];
 +        for (p = cd->np-1; p >= 0; p--)
 +        {
 +            ind      = &cd->ind[p];
 +            nat_tot -= ind->nrecv[nzone+1];
 +            if (cd->bInPlace)
 +            {
 +                sbuf = v + nat_tot;
 +            }
 +            else
 +            {
 +                sbuf = &comm->vbuf2.v[0][0];
 +                j    = 0;
 +                for (zone = 0; zone < nzone; zone++)
 +                {
 +                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 +                    {
 +                        sbuf[j] = v[i];
 +                        j++;
 +                    }
 +                }
 +            }
 +            /* Communicate the values */
 +            dd_sendrecv_real(dd, d, dddirForward,
 +                             sbuf, ind->nrecv[nzone+1],
 +                             buf,  ind->nsend[nzone+1]);
 +            index = ind->index;
 +            /* Add the received values */
 +            n = 0;
 +            for (i = 0; i < ind->nsend[nzone]; i++)
 +            {
 +                at0 = cgindex[index[i]];
 +                at1 = cgindex[index[i]+1];
 +                for (j = at0; j < at1; j++)
 +                {
 +                    v[j] += buf[n];
 +                    n++;
 +                }
 +            }
 +        }
 +        nzone /= 2;
 +    }
 +}
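 +
 +/* dd_atom_spread_real and dd_atom_sum_real mirror dd_move_x and dd_move_f
 + * for a single real per atom, reusing the same index and pulse structure
 + * but without any PBC shift or rotation.
 + */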
 +
 +static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
 +{
 +    fprintf(fp, "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
 +            d, i, j,
 +            zone->min0, zone->max1,
 +            zone->mch0, zone->mch1,
 +            zone->p1_0, zone->p1_1);
 +}
 +
 +
 +#define DDZONECOMM_MAXZONE  5
 +#define DDZONECOMM_BUFSIZE  3
 +
 +static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
 +                               int ddimind, int direction,
 +                               gmx_ddzone_t *buf_s, int n_s,
 +                               gmx_ddzone_t *buf_r, int n_r)
 +{
 +#define ZBS  DDZONECOMM_BUFSIZE
 +    rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
 +    rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
 +    int  i;
 +
 +    for (i = 0; i < n_s; i++)
 +    {
 +        vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
 +        vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
 +        vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
 +        vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
 +        vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
 +        vbuf_s[i*ZBS+1][2] = 0;
 +        vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
 +        vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
 +        vbuf_s[i*ZBS+2][2] = 0;
 +    }
 +
 +    dd_sendrecv_rvec(dd, ddimind, direction,
 +                     vbuf_s, n_s*ZBS,
 +                     vbuf_r, n_r*ZBS);
 +
 +    for (i = 0; i < n_r; i++)
 +    {
 +        buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
 +        buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
 +        buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
 +        buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
 +        buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
 +        buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
 +        buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
 +    }
 +
 +#undef ZBS
 +}
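 +
 +/* Packing note: the 7 reals of a gmx_ddzone_t are stored in
 + * DDZONECOMM_BUFSIZE = 3 rvecs (two slots are left zero), which allows
 + * reuse of the existing dd_sendrecv_rvec path for this exchange.
 + */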
 +
 +static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
 +                          rvec cell_ns_x0, rvec cell_ns_x1)
 +{
 +    int                d, d1, dim, dim1, pos, buf_size, i, j, k, p, npulse, npulse_min;
 +    gmx_ddzone_t      *zp;
 +    gmx_ddzone_t       buf_s[DDZONECOMM_MAXZONE];
 +    gmx_ddzone_t       buf_r[DDZONECOMM_MAXZONE];
 +    gmx_ddzone_t       buf_e[DDZONECOMM_MAXZONE];
 +    rvec               extr_s[2], extr_r[2];
 +    rvec               dh;
 +    real               dist_d, c = 0, det;
 +    gmx_domdec_comm_t *comm;
 +    gmx_bool           bPBC, bUse;
 +
 +    comm = dd->comm;
 +
 +    for (d = 1; d < dd->ndim; d++)
 +    {
 +        dim      = dd->dim[d];
 +        zp       = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
 +        zp->min0 = cell_ns_x0[dim];
 +        zp->max1 = cell_ns_x1[dim];
 +        zp->min1 = cell_ns_x1[dim];
 +        zp->mch0 = cell_ns_x0[dim];
 +        zp->mch1 = cell_ns_x1[dim];
 +        zp->p1_0 = cell_ns_x0[dim];
 +        zp->p1_1 = cell_ns_x1[dim];
 +    }
 +
 +    for (d = dd->ndim-2; d >= 0; d--)
 +    {
 +        dim  = dd->dim[d];
 +        bPBC = (dim < ddbox->npbcdim);
 +
 +        /* Use an rvec to store three reals */
 +        extr_s[d][0] = comm->cell_f0[d+1];
 +        extr_s[d][1] = comm->cell_f1[d+1];
 +        extr_s[d][2] = comm->cell_f1[d+1];
 +
 +        pos = 0;
 +        /* Store the extremes in the backward sending buffer,
 +         * so they get updated separately from the forward communication.
 +         */
 +        for (d1 = d; d1 < dd->ndim-1; d1++)
 +        {
 +            /* We invert the order to be able to use the same loop for buf_e */
 +            buf_s[pos].min0 = extr_s[d1][1];
 +            buf_s[pos].max1 = extr_s[d1][0];
 +            buf_s[pos].min1 = extr_s[d1][2];
 +            buf_s[pos].mch0 = 0;
 +            buf_s[pos].mch1 = 0;
 +            /* Store the cell corner of the dimension we communicate along */
 +            buf_s[pos].p1_0 = comm->cell_x0[dim];
 +            buf_s[pos].p1_1 = 0;
 +            pos++;
 +        }
 +
 +        buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
 +        pos++;
 +
 +        if (dd->ndim == 3 && d == 0)
 +        {
 +            buf_s[pos] = comm->zone_d2[0][1];
 +            pos++;
 +            buf_s[pos] = comm->zone_d1[0];
 +            pos++;
 +        }
 +
 +        /* We only need to communicate the extremes
 +         * in the forward direction
 +         */
 +        npulse = comm->cd[d].np;
 +        if (bPBC)
 +        {
 +            /* Take the minimum to avoid double communication */
 +            npulse_min = min(npulse, dd->nc[dim]-1-npulse);
 +        }
 +        else
 +        {
 +            /* Without PBC we should really not communicate over
 +             * the boundaries, but implementing that complicates
 +             * the communication setup and therefore we simply
 +             * do all communication, but ignore some data.
 +             */
 +            npulse_min = npulse;
 +        }
 +        for (p = 0; p < npulse_min; p++)
 +        {
 +            /* Communicate the extremes forward */
 +            bUse = (bPBC || dd->ci[dim] > 0);
 +
 +            dd_sendrecv_rvec(dd, d, dddirForward,
 +                             extr_s+d, dd->ndim-d-1,
 +                             extr_r+d, dd->ndim-d-1);
 +
 +            if (bUse)
 +            {
 +                for (d1 = d; d1 < dd->ndim-1; d1++)
 +                {
 +                    extr_s[d1][0] = max(extr_s[d1][0], extr_r[d1][0]);
 +                    extr_s[d1][1] = min(extr_s[d1][1], extr_r[d1][1]);
 +                    extr_s[d1][2] = min(extr_s[d1][2], extr_r[d1][2]);
 +                }
 +            }
 +        }
 +
 +        buf_size = pos;
 +        for (p = 0; p < npulse; p++)
 +        {
 +            /* Communicate all the zone information backward */
 +            bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
 +
 +            dd_sendrecv_ddzone(dd, d, dddirBackward,
 +                               buf_s, buf_size,
 +                               buf_r, buf_size);
 +
 +            clear_rvec(dh);
 +            if (p > 0)
 +            {
 +                for (d1 = d+1; d1 < dd->ndim; d1++)
 +                {
 +                    /* Determine the decrease of the maximum required
 +                     * communication height along d1 due to the distance
 +                     * along d; this avoids a lot of useless atom
 +                     * communication.
 +                     */
 +                    dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
 +
 +                    if (ddbox->tric_dir[dim])
 +                    {
 +                        /* c is the off-diagonal coupling between the cell planes
 +                         * along directions d and d1.
 +                         */
 +                        c = ddbox->v[dim][dd->dim[d1]][dim];
 +                    }
 +                    else
 +                    {
 +                        c = 0;
 +                    }
 +                    det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
 +                    if (det > 0)
 +                    {
 +                        dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
 +                    }
 +                    else
 +                    {
 +                        /* A negative value signals out of range */
 +                        dh[d1] = -1;
 +                    }
 +                }
 +            }
 +
 +            /* Accumulate the extremes over all pulses */
 +            for (i = 0; i < buf_size; i++)
 +            {
 +                if (p == 0)
 +                {
 +                    buf_e[i] = buf_r[i];
 +                }
 +                else
 +                {
 +                    if (bUse)
 +                    {
 +                        buf_e[i].min0 = min(buf_e[i].min0, buf_r[i].min0);
 +                        buf_e[i].max1 = max(buf_e[i].max1, buf_r[i].max1);
 +                        buf_e[i].min1 = min(buf_e[i].min1, buf_r[i].min1);
 +                    }
 +
 +                    if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
 +                    {
 +                        d1 = 1;
 +                    }
 +                    else
 +                    {
 +                        d1 = d + 1;
 +                    }
 +                    if (bUse && dh[d1] >= 0)
 +                    {
 +                        buf_e[i].mch0 = max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
 +                        buf_e[i].mch1 = max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
 +                    }
 +                }
 +                /* Copy the received buffer to the send buffer,
 +                 * to pass the data through with the next pulse.
 +                 */
 +                buf_s[i] = buf_r[i];
 +            }
 +            if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
 +                (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
 +            {
 +                /* Store the extremes */
 +                pos = 0;
 +
 +                for (d1 = d; d1 < dd->ndim-1; d1++)
 +                {
 +                    extr_s[d1][1] = min(extr_s[d1][1], buf_e[pos].min0);
 +                    extr_s[d1][0] = max(extr_s[d1][0], buf_e[pos].max1);
 +                    extr_s[d1][2] = min(extr_s[d1][2], buf_e[pos].min1);
 +                    pos++;
 +                }
 +
 +                if (d == 1 || (d == 0 && dd->ndim == 3))
 +                {
 +                    for (i = d; i < 2; i++)
 +                    {
 +                        comm->zone_d2[1-d][i] = buf_e[pos];
 +                        pos++;
 +                    }
 +                }
 +                if (d == 0)
 +                {
 +                    comm->zone_d1[1] = buf_e[pos];
 +                    pos++;
 +                }
 +            }
 +        }
 +    }
 +
 +    if (dd->ndim >= 2)
 +    {
 +        dim = dd->dim[1];
 +        for (i = 0; i < 2; i++)
 +        {
 +            if (debug)
 +            {
 +                print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
 +            }
 +            cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d1[i].min0);
 +            cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d1[i].max1);
 +        }
 +    }
 +    if (dd->ndim >= 3)
 +    {
 +        dim = dd->dim[2];
 +        for (i = 0; i < 2; i++)
 +        {
 +            for (j = 0; j < 2; j++)
 +            {
 +                if (debug)
 +                {
 +                    print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
 +                }
 +                cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
 +                cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
 +            }
 +        }
 +    }
 +    for (d = 1; d < dd->ndim; d++)
 +    {
 +        comm->cell_f_max0[d] = extr_s[d-1][0];
 +        comm->cell_f_min1[d] = extr_s[d-1][1];
 +        if (debug)
 +        {
 +            fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
 +                    d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
 +        }
 +    }
 +}
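 +
 +/* Overview of dd_move_cellx above: the extremes of the cell fractions are
 + * first combined with forward pulses; the per-zone boundary data in buf_s
 + * is then chained backward pulse by pulse, with incoming mch0/mch1 reduced
 + * by dh[] where the accumulated distance along the communicated dimension
 + * already covers part of the cut-off.
 + */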
 +
 +static void dd_collect_cg(gmx_domdec_t *dd,
 +                          t_state      *state_local)
 +{
 +    gmx_domdec_master_t *ma = NULL;
 +    int                  buf2[2], *ibuf, i, ncg_home = 0, *cg = NULL, nat_home = 0;
 +    t_block             *cgs_gl;
 +
 +    if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
 +    {
 +        /* The master has the correct distribution */
 +        return;
 +    }
 +
 +    if (state_local->ddp_count == dd->ddp_count)
 +    {
 +        ncg_home = dd->ncg_home;
 +        cg       = dd->index_gl;
 +        nat_home = dd->nat_home;
 +    }
 +    else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
 +    {
 +        cgs_gl = &dd->comm->cgs_gl;
 +
 +        ncg_home = state_local->ncg_gl;
 +        cg       = state_local->cg_gl;
 +        nat_home = 0;
 +        for (i = 0; i < ncg_home; i++)
 +        {
 +            nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
 +        }
 +    }
 +    else
 +    {
 +        gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
 +    }
 +
 +    buf2[0] = ncg_home;
 +    buf2[1] = nat_home;
 +    if (DDMASTER(dd))
 +    {
 +        ma   = dd->ma;
 +        ibuf = ma->ibuf;
 +    }
 +    else
 +    {
 +        ibuf = NULL;
 +    }
 +    /* Collect the charge group and atom counts on the master */
 +    dd_gather(dd, 2*sizeof(int), buf2, ibuf);
 +
 +    if (DDMASTER(dd))
 +    {
 +        ma->index[0] = 0;
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            ma->ncg[i]     = ma->ibuf[2*i];
 +            ma->nat[i]     = ma->ibuf[2*i+1];
 +            ma->index[i+1] = ma->index[i] + ma->ncg[i];
 +        }
 +        /* Make byte counts and indices */
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
 +            ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug, "Initial charge group distribution: ");
 +            for (i = 0; i < dd->nnodes; i++)
 +            {
 +                fprintf(debug, " %d", ma->ncg[i]);
 +            }
 +            fprintf(debug, "\n");
 +        }
 +    }
 +
 +    /* Collect the charge group indices on the master */
 +    dd_gatherv(dd,
 +               ncg_home*sizeof(int), cg,
 +               DDMASTER(dd) ? ma->ibuf : NULL,
 +               DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
 +               DDMASTER(dd) ? ma->cg : NULL);
 +
 +    dd->comm->master_cg_ddp_count = state_local->ddp_count;
 +}
 +
 +static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
 +                                    rvec *lv, rvec *v)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                  n, i, c, a, nalloc = 0;
 +    rvec                *buf = NULL;
 +    t_block             *cgs_gl;
 +
 +    ma = dd->ma;
 +
 +    if (!DDMASTER(dd))
 +    {
 +#ifdef GMX_MPI
 +        MPI_Send(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
 +                 dd->rank, dd->mpi_comm_all);
 +#endif
 +    }
 +    else
 +    {
 +        /* Copy the master coordinates to the global array */
 +        cgs_gl = &dd->comm->cgs_gl;
 +
 +        n = DDMASTERRANK(dd);
 +        a = 0;
 +        for (i = ma->index[n]; i < ma->index[n+1]; i++)
 +        {
 +            for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
 +            {
 +                copy_rvec(lv[a++], v[c]);
 +            }
 +        }
 +
 +        for (n = 0; n < dd->nnodes; n++)
 +        {
 +            if (n != dd->rank)
 +            {
 +                if (ma->nat[n] > nalloc)
 +                {
 +                    nalloc = over_alloc_dd(ma->nat[n]);
 +                    srenew(buf, nalloc);
 +                }
 +#ifdef GMX_MPI
 +                MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
 +                         n, dd->mpi_comm_all, MPI_STATUS_IGNORE);
 +#endif
 +                a = 0;
 +                for (i = ma->index[n]; i < ma->index[n+1]; i++)
 +                {
 +                    for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
 +                    {
 +                        copy_rvec(buf[a++], v[c]);
 +                    }
 +                }
 +            }
 +        }
 +        sfree(buf);
 +    }
 +}
 +
 +static void get_commbuffer_counts(gmx_domdec_t *dd,
 +                                  int **counts, int **disps)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                  n;
 +
 +    ma = dd->ma;
 +
 +    /* Make the rvec count and displacement arrays */
 +    *counts  = ma->ibuf;
 +    *disps   = ma->ibuf + dd->nnodes;
 +    for (n = 0; n < dd->nnodes; n++)
 +    {
 +        (*counts)[n] = ma->nat[n]*sizeof(rvec);
 +        (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
 +    }
 +}
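 +
 +/* Illustrative example (hypothetical sizes): with 3 nodes owning
 + * ma->nat = {10, 8, 12} atoms, the byte counts become
 + * {10, 8, 12}*sizeof(rvec) and the displacements
 + * {0, 10, 18}*sizeof(rvec), the layout dd_gatherv and dd_scatterv expect.
 + */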
 +
 +static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
 +                                   rvec *lv, rvec *v)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                 *rcounts = NULL, *disps = NULL;
 +    int                  n, i, c, a;
 +    rvec                *buf = NULL;
 +    t_block             *cgs_gl;
 +
 +    ma = dd->ma;
 +
 +    if (DDMASTER(dd))
 +    {
 +        get_commbuffer_counts(dd, &rcounts, &disps);
 +
 +        buf = ma->vbuf;
 +    }
 +
 +    dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv, rcounts, disps, buf);
 +
 +    if (DDMASTER(dd))
 +    {
 +        cgs_gl = &dd->comm->cgs_gl;
 +
 +        a = 0;
 +        for (n = 0; n < dd->nnodes; n++)
 +        {
 +            for (i = ma->index[n]; i < ma->index[n+1]; i++)
 +            {
 +                for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
 +                {
 +                    copy_rvec(buf[a++], v[c]);
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +void dd_collect_vec(gmx_domdec_t *dd,
 +                    t_state *state_local, rvec *lv, rvec *v)
 +{
 +    dd_collect_cg(dd, state_local);
 +
 +    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
 +    {
 +        dd_collect_vec_sendrecv(dd, lv, v);
 +    }
 +    else
 +    {
 +        dd_collect_vec_gatherv(dd, lv, v);
 +    }
 +}
 +
 +
 +void dd_collect_state(gmx_domdec_t *dd,
 +                      t_state *state_local, t_state *state)
 +{
 +    int est, i, j, nh;
 +
 +    nh = state->nhchainlength;
 +
 +    if (DDMASTER(dd))
 +    {
 +        for (i = 0; i < efptNR; i++)
 +        {
 +            state->lambda[i] = state_local->lambda[i];
 +        }
 +        state->fep_state = state_local->fep_state;
 +        state->veta      = state_local->veta;
 +        state->vol0      = state_local->vol0;
 +        copy_mat(state_local->box, state->box);
 +        copy_mat(state_local->boxv, state->boxv);
 +        copy_mat(state_local->svir_prev, state->svir_prev);
 +        copy_mat(state_local->fvir_prev, state->fvir_prev);
 +        copy_mat(state_local->pres_prev, state->pres_prev);
 +
 +        for (i = 0; i < state_local->ngtc; i++)
 +        {
 +            for (j = 0; j < nh; j++)
 +            {
 +                state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
 +                state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
 +            }
 +            state->therm_integral[i] = state_local->therm_integral[i];
 +        }
 +        for (i = 0; i < state_local->nnhpres; i++)
 +        {
 +            for (j = 0; j < nh; j++)
 +            {
 +                state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
 +                state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
 +            }
 +        }
 +    }
 +    for (est = 0; est < estNR; est++)
 +    {
 +        if (EST_DISTR(est) && (state_local->flags & (1<<est)))
 +        {
 +            switch (est)
 +            {
 +                case estX:
 +                    dd_collect_vec(dd, state_local, state_local->x, state->x);
 +                    break;
 +                case estV:
 +                    dd_collect_vec(dd, state_local, state_local->v, state->v);
 +                    break;
 +                case estSDX:
 +                    dd_collect_vec(dd, state_local, state_local->sd_X, state->sd_X);
 +                    break;
 +                case estCGP:
 +                    dd_collect_vec(dd, state_local, state_local->cg_p, state->cg_p);
 +                    break;
 +                case estLD_RNG:
 +                    if (state->nrngi == 1)
 +                    {
 +                        if (DDMASTER(dd))
 +                        {
 +                            for (i = 0; i < state_local->nrng; i++)
 +                            {
 +                                state->ld_rng[i] = state_local->ld_rng[i];
 +                            }
 +                        }
 +                    }
 +                    else
 +                    {
 +                        dd_gather(dd, state_local->nrng*sizeof(state->ld_rng[0]),
 +                                  state_local->ld_rng, state->ld_rng);
 +                    }
 +                    break;
 +                case estLD_RNGI:
 +                    if (state->nrngi == 1)
 +                    {
 +                        if (DDMASTER(dd))
 +                        {
 +                            state->ld_rngi[0] = state_local->ld_rngi[0];
 +                        }
 +                    }
 +                    else
 +                    {
 +                        dd_gather(dd, sizeof(state->ld_rngi[0]),
 +                                  state_local->ld_rngi, state->ld_rngi);
 +                    }
 +                    break;
 +                case estDISRE_INITF:
 +                case estDISRE_RM3TAV:
 +                case estORIRE_INITF:
 +                case estORIRE_DTAV:
 +                    break;
 +                default:
 +                    gmx_incons("Unknown state entry encountered in dd_collect_state");
 +            }
 +        }
 +    }
 +}
 +
 +static void dd_realloc_state(t_state *state, rvec **f, int nalloc)
 +{
 +    int est;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Reallocating state: currently %d, required %d, allocating %d\n", state->nalloc, nalloc, over_alloc_dd(nalloc));
 +    }
 +
 +    state->nalloc = over_alloc_dd(nalloc);
 +
 +    for (est = 0; est < estNR; est++)
 +    {
 +        if (EST_DISTR(est) && (state->flags & (1<<est)))
 +        {
 +            switch (est)
 +            {
 +                case estX:
 +                    srenew(state->x, state->nalloc);
 +                    break;
 +                case estV:
 +                    srenew(state->v, state->nalloc);
 +                    break;
 +                case estSDX:
 +                    srenew(state->sd_X, state->nalloc);
 +                    break;
 +                case estCGP:
 +                    srenew(state->cg_p, state->nalloc);
 +                    break;
 +                case estLD_RNG:
 +                case estLD_RNGI:
 +                case estDISRE_INITF:
 +                case estDISRE_RM3TAV:
 +                case estORIRE_INITF:
 +                case estORIRE_DTAV:
 +                    /* No reallocation required */
 +                    break;
 +                default:
 +                    gmx_incons("Unknown state entry encountered in dd_realloc_state");
 +            }
 +        }
 +    }
 +
 +    if (f != NULL)
 +    {
 +        srenew(*f, state->nalloc);
 +    }
 +}
 +
 +static void dd_check_alloc_ncg(t_forcerec *fr, t_state *state, rvec **f,
 +                               int nalloc)
 +{
 +    if (nalloc > fr->cg_nalloc)
 +    {
 +        if (debug)
 +        {
 +            fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, nalloc, over_alloc_dd(nalloc));
 +        }
 +        fr->cg_nalloc = over_alloc_dd(nalloc);
 +        srenew(fr->cginfo, fr->cg_nalloc);
 +        if (fr->cutoff_scheme == ecutsGROUP)
 +        {
 +            srenew(fr->cg_cm, fr->cg_nalloc);
 +        }
 +    }
 +    if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
 +    {
 +        /* We don't use charge groups, we use x in state to set up
 +         * the atom communication.
 +         */
 +        dd_realloc_state(state, f, nalloc);
 +    }
 +}
 +
 +static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
 +                                       rvec *v, rvec *lv)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                  n, i, c, a, nalloc = 0;
 +    rvec                *buf = NULL;
 +
 +    if (DDMASTER(dd))
 +    {
 +        ma  = dd->ma;
 +
 +        for (n = 0; n < dd->nnodes; n++)
 +        {
 +            if (n != dd->rank)
 +            {
 +                if (ma->nat[n] > nalloc)
 +                {
 +                    nalloc = over_alloc_dd(ma->nat[n]);
 +                    srenew(buf, nalloc);
 +                }
 +                /* Use lv as a temporary buffer */
 +                a = 0;
 +                for (i = ma->index[n]; i < ma->index[n+1]; i++)
 +                {
 +                    for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
 +                    {
 +                        copy_rvec(v[c], buf[a++]);
 +                    }
 +                }
 +                if (a != ma->nat[n])
 +                {
 +                    gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
 +                              a, ma->nat[n]);
 +                }
 +
 +#ifdef GMX_MPI
 +                MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
 +                         DDRANK(dd, n), n, dd->mpi_comm_all);
 +#endif
 +            }
 +        }
 +        sfree(buf);
 +        n = DDMASTERRANK(dd);
 +        a = 0;
 +        for (i = ma->index[n]; i < ma->index[n+1]; i++)
 +        {
 +            for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
 +            {
 +                copy_rvec(v[c], lv[a++]);
 +            }
 +        }
 +    }
 +    else
 +    {
 +#ifdef GMX_MPI
 +        MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
 +                 MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
 +#endif
 +    }
 +}
 +
 +static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
 +                                       rvec *v, rvec *lv)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                 *scounts = NULL, *disps = NULL;
 +    int                  n, i, c, a, nalloc = 0;
 +    rvec                *buf = NULL;
 +
 +    if (DDMASTER(dd))
 +    {
 +        ma  = dd->ma;
 +
 +        get_commbuffer_counts(dd, &scounts, &disps);
 +
 +        buf = ma->vbuf;
 +        a   = 0;
 +        for (n = 0; n < dd->nnodes; n++)
 +        {
 +            for (i = ma->index[n]; i < ma->index[n+1]; i++)
 +            {
 +                for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
 +                {
 +                    copy_rvec(v[c], buf[a++]);
 +                }
 +            }
 +        }
 +    }
 +
 +    dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
 +}
 +
 +static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs, rvec *v, rvec *lv)
 +{
 +    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
 +    {
 +        dd_distribute_vec_sendrecv(dd, cgs, v, lv);
 +    }
 +    else
 +    {
 +        dd_distribute_vec_scatterv(dd, cgs, v, lv);
 +    }
 +}
 +
 +static void dd_distribute_dfhist(gmx_domdec_t *dd, df_history_t *dfhist)
 +{
 +    int i;
 +    dd_bcast(dd, sizeof(int), &dfhist->bEquil);
 +    dd_bcast(dd, sizeof(int), &dfhist->nlambda);
 +    dd_bcast(dd, sizeof(real), &dfhist->wl_delta);
 +
 +    if (dfhist->nlambda > 0)
 +    {
 +        int nlam = dfhist->nlambda;
 +        dd_bcast(dd, sizeof(int)*nlam, dfhist->n_at_lam);
 +        dd_bcast(dd, sizeof(real)*nlam, dfhist->wl_histo);
 +        dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_weights);
 +        dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_dg);
 +        dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_minvar);
 +        dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_variance);
 +
 +        for (i = 0; i < nlam; i++)
 +        {
 +            dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p[i]);
 +            dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m[i]);
 +            dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p2[i]);
 +            dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m2[i]);
 +            dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij[i]);
 +            dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij_empirical[i]);
 +        }
 +    }
 +}
 +
 +static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
 +                                t_state *state, t_state *state_local,
 +                                rvec **f)
 +{
 +    int  i, j, nh;
 +
 +    nh = state->nhchainlength;
 +
 +    if (DDMASTER(dd))
 +    {
 +        for (i = 0; i < efptNR; i++)
 +        {
 +            state_local->lambda[i] = state->lambda[i];
 +        }
 +        state_local->fep_state = state->fep_state;
 +        state_local->veta      = state->veta;
 +        state_local->vol0      = state->vol0;
 +        copy_mat(state->box, state_local->box);
 +        copy_mat(state->box_rel, state_local->box_rel);
 +        copy_mat(state->boxv, state_local->boxv);
 +        copy_mat(state->svir_prev, state_local->svir_prev);
 +        copy_mat(state->fvir_prev, state_local->fvir_prev);
 +        copy_df_history(&state_local->dfhist, &state->dfhist);
 +        for (i = 0; i < state_local->ngtc; i++)
 +        {
 +            for (j = 0; j < nh; j++)
 +            {
 +                state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
 +                state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
 +            }
 +            state_local->therm_integral[i] = state->therm_integral[i];
 +        }
 +        for (i = 0; i < state_local->nnhpres; i++)
 +        {
 +            for (j = 0; j < nh; j++)
 +            {
 +                state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
 +                state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
 +            }
 +        }
 +    }
 +    dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda);
 +    dd_bcast(dd, sizeof(int), &state_local->fep_state);
 +    dd_bcast(dd, sizeof(real), &state_local->veta);
 +    dd_bcast(dd, sizeof(real), &state_local->vol0);
 +    dd_bcast(dd, sizeof(state_local->box), state_local->box);
 +    dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
 +    dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
 +    dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
 +    dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
 +    dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi);
 +    dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi);
 +    dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral);
 +    dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi);
 +    dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi);
 +
 +    /* communicate df_history -- required for restarting from checkpoint */
 +    dd_distribute_dfhist(dd, &state_local->dfhist);
 +
 +    if (dd->nat_home > state_local->nalloc)
 +    {
 +        dd_realloc_state(state_local, f, dd->nat_home);
 +    }
 +    for (i = 0; i < estNR; i++)
 +    {
 +        if (EST_DISTR(i) && (state_local->flags & (1<<i)))
 +        {
 +            switch (i)
 +            {
 +                case estX:
 +                    dd_distribute_vec(dd, cgs, state->x, state_local->x);
 +                    break;
 +                case estV:
 +                    dd_distribute_vec(dd, cgs, state->v, state_local->v);
 +                    break;
 +                case estSDX:
 +                    dd_distribute_vec(dd, cgs, state->sd_X, state_local->sd_X);
 +                    break;
 +                case estCGP:
 +                    dd_distribute_vec(dd, cgs, state->cg_p, state_local->cg_p);
 +                    break;
 +                case estLD_RNG:
 +                    if (state->nrngi == 1)
 +                    {
 +                        dd_bcastc(dd,
 +                                  state_local->nrng*sizeof(state_local->ld_rng[0]),
 +                                  state->ld_rng, state_local->ld_rng);
 +                    }
 +                    else
 +                    {
 +                        dd_scatter(dd,
 +                                   state_local->nrng*sizeof(state_local->ld_rng[0]),
 +                                   state->ld_rng, state_local->ld_rng);
 +                    }
 +                    break;
 +                case estLD_RNGI:
 +                    if (state->nrngi == 1)
 +                    {
 +                        dd_bcastc(dd, sizeof(state_local->ld_rngi[0]),
 +                                  state->ld_rngi, state_local->ld_rngi);
 +                    }
 +                    else
 +                    {
 +                        dd_scatter(dd, sizeof(state_local->ld_rngi[0]),
 +                                   state->ld_rngi, state_local->ld_rngi);
 +                    }
 +                    break;
 +                case estDISRE_INITF:
 +                case estDISRE_RM3TAV:
 +                case estORIRE_INITF:
 +                case estORIRE_DTAV:
 +                    /* Not implemented yet */
 +                    break;
 +                default:
 +                    gmx_incons("Unknown state entry encountered in dd_distribute_state");
 +            }
 +        }
 +    }
 +}
 +
 +static char dim2char(int dim)
 +{
 +    char c = '?';
 +
 +    switch (dim)
 +    {
 +        case XX: c = 'X'; break;
 +        case YY: c = 'Y'; break;
 +        case ZZ: c = 'Z'; break;
 +        default: gmx_fatal(FARGS, "Unknown dim %d", dim);
 +    }
 +
 +    return c;
 +}
 +
 +static void write_dd_grid_pdb(const char *fn, gmx_large_int_t step,
 +                              gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
 +{
 +    rvec   grid_s[2], *grid_r = NULL, cx, r;
 +    char   fname[STRLEN], format[STRLEN], buf[22];
 +    FILE  *out;
 +    int    a, i, d, z, y, x;
 +    matrix tric;
 +    real   vol;
 +
 +    copy_rvec(dd->comm->cell_x0, grid_s[0]);
 +    copy_rvec(dd->comm->cell_x1, grid_s[1]);
 +
 +    if (DDMASTER(dd))
 +    {
 +        snew(grid_r, 2*dd->nnodes);
 +    }
 +
 +    dd_gather(dd, 2*sizeof(rvec), grid_s[0], DDMASTER(dd) ? grid_r[0] : NULL);
 +
 +    if (DDMASTER(dd))
 +    {
 +        for (d = 0; d < DIM; d++)
 +        {
 +            for (i = 0; i < DIM; i++)
 +            {
 +                if (d == i)
 +                {
 +                    tric[d][i] = 1;
 +                }
 +                else
 +                {
 +                    if (d < ddbox->npbcdim && dd->nc[d] > 1)
 +                    {
 +                        tric[d][i] = box[i][d]/box[i][i];
 +                    }
 +                    else
 +                    {
 +                        tric[d][i] = 0;
 +                    }
 +                }
 +            }
 +        }
 +        sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
 +        sprintf(format, "%s%s\n", get_pdbformat(), "%6.2f%6.2f");
 +        out = gmx_fio_fopen(fname, "w");
 +        gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
 +        a = 1;
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
 +            for (d = 0; d < DIM; d++)
 +            {
 +                vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
 +            }
 +            for (z = 0; z < 2; z++)
 +            {
 +                for (y = 0; y < 2; y++)
 +                {
 +                    for (x = 0; x < 2; x++)
 +                    {
 +                        cx[XX] = grid_r[i*2+x][XX];
 +                        cx[YY] = grid_r[i*2+y][YY];
 +                        cx[ZZ] = grid_r[i*2+z][ZZ];
 +                        mvmul(tric, cx, r);
 +                        fprintf(out, format, "ATOM", a++, "CA", "GLY", ' ', 1+i,
 +                                ' ', 10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol);
 +                    }
 +                }
 +            }
 +            for (d = 0; d < DIM; d++)
 +            {
 +                for (x = 0; x < 4; x++)
 +                {
 +                    switch (d)
 +                    {
 +                        case 0: y = 1 + i*8 + 2*x; break;
 +                        case 1: y = 1 + i*8 + 2*x - (x % 2); break;
 +                        case 2: y = 1 + i*8 + x; break;
 +                    }
 +                    fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
 +                }
 +            }
 +        }
 +        gmx_fio_fclose(out);
 +        sfree(grid_r);
 +    }
 +}
 +
 +void write_dd_pdb(const char *fn, gmx_large_int_t step, const char *title,
 +                  gmx_mtop_t *mtop, t_commrec *cr,
 +                  int natoms, rvec x[], matrix box)
 +{
 +    char          fname[STRLEN], format[STRLEN], format4[STRLEN], buf[22];
 +    FILE         *out;
 +    int           i, ii, resnr, c;
 +    char         *atomname, *resname;
 +    real          b;
 +    gmx_domdec_t *dd;
 +
 +    dd = cr->dd;
 +    if (natoms == -1)
 +    {
 +        natoms = dd->comm->nat[ddnatVSITE];
 +    }
 +
 +    sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
 +
 +    sprintf(format, "%s%s\n", get_pdbformat(), "%6.2f%6.2f");
 +    sprintf(format4, "%s%s\n", get_pdbformat4(), "%6.2f%6.2f");
 +
 +    out = gmx_fio_fopen(fname, "w");
 +
 +    fprintf(out, "TITLE     %s\n", title);
 +    gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
 +    for (i = 0; i < natoms; i++)
 +    {
 +        ii = dd->gatindex[i];
 +        gmx_mtop_atominfo_global(mtop, ii, &atomname, &resnr, &resname);
 +        if (i < dd->comm->nat[ddnatZONE])
 +        {
 +            c = 0;
 +            while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
 +            {
 +                c++;
 +            }
 +            b = c;
 +        }
 +        else if (i < dd->comm->nat[ddnatVSITE])
 +        {
 +            b = dd->comm->zones.n;
 +        }
 +        else
 +        {
 +            b = dd->comm->zones.n + 1;
 +        }
 +        fprintf(out, strlen(atomname) < 4 ? format : format4,
 +                "ATOM", (ii+1)%100000,
 +                atomname, resname, ' ', resnr%10000, ' ',
 +                10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b);
 +    }
 +    fprintf(out, "TER\n");
 +
 +    gmx_fio_fclose(out);
 +}
 +
 +real dd_cutoff_mbody(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                di;
 +    real               r;
 +
 +    comm = dd->comm;
 +
 +    r = -1;
 +    if (comm->bInterCGBondeds)
 +    {
 +        if (comm->cutoff_mbody > 0)
 +        {
 +            r = comm->cutoff_mbody;
 +        }
 +        else
 +        {
 +            /* cutoff_mbody=0 means we do not have DLB */
 +            r = comm->cellsize_min[dd->dim[0]];
 +            for (di = 1; di < dd->ndim; di++)
 +            {
 +                r = min(r, comm->cellsize_min[dd->dim[di]]);
 +            }
 +            if (comm->bBondComm)
 +            {
 +                r = max(r, comm->cutoff_mbody);
 +            }
 +            else
 +            {
 +                r = min(r, comm->cutoff);
 +            }
 +        }
 +    }
 +
 +    return r;
 +}
 +
 +real dd_cutoff_twobody(gmx_domdec_t *dd)
 +{
 +    real r_mb;
 +
 +    r_mb = dd_cutoff_mbody(dd);
 +
 +    return max(dd->comm->cutoff, r_mb);
 +}
 +
 +
 +static void dd_cart_coord2pmecoord(gmx_domdec_t *dd, ivec coord, ivec coord_pme)
 +{
 +    int nc, ntot;
 +
 +    nc   = dd->nc[dd->comm->cartpmedim];
 +    ntot = dd->comm->ntot[dd->comm->cartpmedim];
 +    copy_ivec(coord, coord_pme);
 +    coord_pme[dd->comm->cartpmedim] =
 +        nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
 +}
 +
 +static int low_ddindex2pmeindex(int ndd, int npme, int ddindex)
 +{
 +    /* Here we assign a PME node to communicate with this DD node
 +     * by assuming that the major index of both is x.
 +     * We add cr->npmenodes/2 to obtain an even distribution.
 +     */
 +    return (ddindex*npme + npme/2)/ndd;
 +}
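 +
 +/* Illustrative example: with ndd = 6 DD nodes and npme = 2 PME nodes,
 + * (ddindex*2 + 1)/6 maps ddindex 0, 1, 2 to PME node 0 and 3, 4, 5 to
 + * PME node 1, an even split centered by the npme/2 offset.
 + */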
 +
 +static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
 +{
 +    return low_ddindex2pmeindex(dd->nnodes, dd->comm->npmenodes, ddindex);
 +}
 +
 +static int cr_ddindex2pmeindex(const t_commrec *cr, int ddindex)
 +{
 +    return low_ddindex2pmeindex(cr->dd->nnodes, cr->npmenodes, ddindex);
 +}
 +
 +static int *dd_pmenodes(t_commrec *cr)
 +{
 +    int *pmenodes;
 +    int  n, i, p0, p1;
 +
 +    snew(pmenodes, cr->npmenodes);
 +    n = 0;
 +    for (i = 0; i < cr->dd->nnodes; i++)
 +    {
 +        p0 = cr_ddindex2pmeindex(cr, i);
 +        p1 = cr_ddindex2pmeindex(cr, i+1);
 +        if (i+1 == cr->dd->nnodes || p1 > p0)
 +        {
 +            if (debug)
 +            {
 +                fprintf(debug, "pmenode[%d] = %d\n", n, i+1+n);
 +            }
 +            pmenodes[n] = i + 1 + n;
 +            n++;
 +        }
 +    }
 +
 +    return pmenodes;
 +}
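 +/* Continuing the illustrative 6 DD / 2 PME example with interleaved
 + * ranks: the loop above yields pmenodes[] = {3, 7}, i.e. simulation
 + * ranks PP 0-2, PME 3, PP 4-6, PME 7, with each PME rank placed
 + * directly after the PP ranks it serves.
 + */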
 +
 +static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
 +{
 +    gmx_domdec_t *dd;
 +    ivec          coords, coords_pme, nc;
 +    int           slab;
 +
 +    dd = cr->dd;
 +    /*
 +       if (dd->comm->bCartesian) {
 +       gmx_ddindex2xyz(dd->nc,ddindex,coords);
 +       dd_coords2pmecoords(dd,coords,coords_pme);
 +       copy_ivec(dd->ntot,nc);
 +       nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
 +       coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
 +
 +       slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
 +       } else {
 +       slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
 +       }
 +     */
 +    coords[XX] = x;
 +    coords[YY] = y;
 +    coords[ZZ] = z;
 +    slab       = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
 +
 +    return slab;
 +}
 +
 +static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z)
 +{
 +    gmx_domdec_comm_t *comm;
 +    ivec               coords;
 +    int                ddindex, nodeid = -1;
 +
 +    comm = cr->dd->comm;
 +
 +    coords[XX] = x;
 +    coords[YY] = y;
 +    coords[ZZ] = z;
 +    if (comm->bCartesianPP_PME)
 +    {
 +#ifdef GMX_MPI
 +        MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
 +#endif
 +    }
 +    else
 +    {
 +        ddindex = dd_index(cr->dd->nc, coords);
 +        if (comm->bCartesianPP)
 +        {
 +            nodeid = comm->ddindex2simnodeid[ddindex];
 +        }
 +        else
 +        {
 +            if (comm->pmenodes)
 +            {
 +                nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
 +            }
 +            else
 +            {
 +                nodeid = ddindex;
 +            }
 +        }
 +    }
 +
 +    return nodeid;
 +}
 +
 +static int dd_simnode2pmenode(t_commrec *cr, int sim_nodeid)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    ivec               coord, coord_pme;
 +    int                i;
 +    int                pmenode = -1;
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +    /* This assumes a uniform x domain decomposition grid cell size */
 +    if (comm->bCartesianPP_PME)
 +    {
 +#ifdef GMX_MPI
 +        MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
 +        if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
 +        {
 +            /* This is a PP node */
 +            dd_cart_coord2pmecoord(dd, coord, coord_pme);
 +            MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
 +        }
 +#endif
 +    }
 +    else if (comm->bCartesianPP)
 +    {
 +        if (sim_nodeid < dd->nnodes)
 +        {
 +            pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
 +        }
 +    }
 +    else
 +    {
 +        /* This assumes DD cells with identical x coordinates
 +         * are numbered sequentially.
 +         */
 +        if (dd->comm->pmenodes == NULL)
 +        {
 +            if (sim_nodeid < dd->nnodes)
 +            {
 +                /* The DD index equals the nodeid */
 +                pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
 +            }
 +        }
 +        else
 +        {
 +            i = 0;
 +            while (sim_nodeid > dd->comm->pmenodes[i])
 +            {
 +                i++;
 +            }
 +            if (sim_nodeid < dd->comm->pmenodes[i])
 +            {
 +                pmenode = dd->comm->pmenodes[i];
 +            }
 +        }
 +    }
 +
 +    return pmenode;
 +}
 +
 +void get_pme_nnodes(const gmx_domdec_t *dd,
 +                    int *npmenodes_x, int *npmenodes_y)
 +{
 +    if (dd != NULL)
 +    {
 +        *npmenodes_x = dd->comm->npmenodes_x;
 +        *npmenodes_y = dd->comm->npmenodes_y;
 +    }
 +    else
 +    {
 +        *npmenodes_x = 1;
 +        *npmenodes_y = 1;
 +    }
 +}
 +
 +gmx_bool gmx_pmeonlynode(t_commrec *cr, int sim_nodeid)
 +{
 +    gmx_bool bPMEOnlyNode;
 +
 +    if (DOMAINDECOMP(cr))
 +    {
 +        bPMEOnlyNode = (dd_simnode2pmenode(cr, sim_nodeid) == -1);
 +    }
 +    else
 +    {
 +        bPMEOnlyNode = FALSE;
 +    }
 +
 +    return bPMEOnlyNode;
 +}
 +
 +void get_pme_ddnodes(t_commrec *cr, int pmenodeid,
 +                     int *nmy_ddnodes, int **my_ddnodes, int *node_peer)
 +{
 +    gmx_domdec_t *dd;
 +    int           x, y, z;
 +    ivec          coord, coord_pme;
 +
 +    dd = cr->dd;
 +
 +    snew(*my_ddnodes, (dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
 +
 +    *nmy_ddnodes = 0;
 +    for (x = 0; x < dd->nc[XX]; x++)
 +    {
 +        for (y = 0; y < dd->nc[YY]; y++)
 +        {
 +            for (z = 0; z < dd->nc[ZZ]; z++)
 +            {
 +                if (dd->comm->bCartesianPP_PME)
 +                {
 +                    coord[XX] = x;
 +                    coord[YY] = y;
 +                    coord[ZZ] = z;
 +                    dd_cart_coord2pmecoord(dd, coord, coord_pme);
 +                    if (dd->ci[XX] == coord_pme[XX] &&
 +                        dd->ci[YY] == coord_pme[YY] &&
 +                        dd->ci[ZZ] == coord_pme[ZZ])
 +                    {
 +                        (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
 +                    }
 +                }
 +                else
 +                {
 +                    /* The slab corresponds to the nodeid in the PME group */
 +                    if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
 +                    {
 +                        (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    /* The last PP-only node is the peer node */
 +    *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Receive coordinates from PP nodes:");
 +        for (x = 0; x < *nmy_ddnodes; x++)
 +        {
 +            fprintf(debug, " %d", (*my_ddnodes)[x]);
 +        }
 +        fprintf(debug, "\n");
 +    }
 +}
 +
 +static gmx_bool receive_vir_ener(t_commrec *cr)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                pmenode, coords[DIM], rank;
 +    gmx_bool           bReceive;
 +
 +    bReceive = TRUE;
 +    if (cr->npmenodes < cr->dd->nnodes)
 +    {
 +        comm = cr->dd->comm;
 +        if (comm->bCartesianPP_PME)
 +        {
 +            pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
 +#ifdef GMX_MPI
 +            MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
 +            coords[comm->cartpmedim]++;
 +            if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
 +            {
 +                MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
 +                if (dd_simnode2pmenode(cr, rank) == pmenode)
 +                {
 +                    /* This is not the last PP node for pmenode */
 +                    bReceive = FALSE;
 +                }
 +            }
 +#endif
 +        }
 +        else
 +        {
 +            pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
 +            if (cr->sim_nodeid+1 < cr->nnodes &&
 +                dd_simnode2pmenode(cr, cr->sim_nodeid+1) == pmenode)
 +            {
 +                /* This is not the last PP node for pmenode */
 +                bReceive = FALSE;
 +            }
 +        }
 +    }
 +
 +    return bReceive;
 +}
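 +/* Illustrative consequence: if PP ranks 0-2 all map to PME rank 3,
 + * only PP rank 2 (the last one for that PME rank) returns TRUE here,
 + * so the virial and energy are received exactly once per PME rank.
 + */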
 +
 +static void set_zones_ncg_home(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_zones_t *zones;
 +    int                 i;
 +
 +    zones = &dd->comm->zones;
 +
 +    zones->cg_range[0] = 0;
 +    for (i = 1; i < zones->n+1; i++)
 +    {
 +        zones->cg_range[i] = dd->ncg_home;
 +    }
 +    /* zone_ncg1[0] should always be equal to ncg_home */
 +    dd->comm->zone_ncg1[0] = dd->ncg_home;
 +}
 +
 +static void rebuild_cgindex(gmx_domdec_t *dd,
 +                            const int *gcgs_index, t_state *state)
 +{
 +    int nat, i, *ind, *dd_cg_gl, *cgindex, cg_gl;
 +
 +    ind        = state->cg_gl;
 +    dd_cg_gl   = dd->index_gl;
 +    cgindex    = dd->cgindex;
 +    nat        = 0;
 +    cgindex[0] = nat;
 +    for (i = 0; i < state->ncg_gl; i++)
 +    {
 +        cgindex[i]  = nat;
 +        cg_gl       = ind[i];
 +        dd_cg_gl[i] = cg_gl;
 +        nat        += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
 +    }
 +    cgindex[i] = nat;
 +
 +    dd->ncg_home = state->ncg_gl;
 +    dd->nat_home = nat;
 +
 +    set_zones_ncg_home(dd);
 +}
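 +/* Illustrative example: with three home charge groups of 2, 1 and 3
 + * atoms, the loop above yields cgindex = {0, 2, 3, 6}, so that
 + * cgindex[i+1] - cgindex[i] is the atom count of charge group i and
 + * nat_home = 6.
 + */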
 +
 +static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
 +{
 +    while (cg >= cginfo_mb->cg_end)
 +    {
 +        cginfo_mb++;
 +    }
 +
 +    return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
 +}
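 +/* Illustrative example: for a block of 100 identical molecules with
 + * 3 charge groups each, cg_start = 0, cg_end = 300 and cg_mod = 3;
 + * charge group 157 then uses cginfo[(157 - 0) % 3] = cginfo[1].
 + */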
 +
 +static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
 +                          t_forcerec *fr, char *bLocalCG)
 +{
 +    cginfo_mb_t *cginfo_mb;
 +    int         *cginfo;
 +    int          cg;
 +
 +    if (fr != NULL)
 +    {
 +        cginfo_mb = fr->cginfo_mb;
 +        cginfo    = fr->cginfo;
 +
 +        for (cg = cg0; cg < cg1; cg++)
 +        {
 +            cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
 +        }
 +    }
 +
 +    if (bLocalCG != NULL)
 +    {
 +        for (cg = cg0; cg < cg1; cg++)
 +        {
 +            bLocalCG[index_gl[cg]] = TRUE;
 +        }
 +    }
 +}
 +
 +static void make_dd_indices(gmx_domdec_t *dd,
 +                            const int *gcgs_index, int cg_start)
 +{
 +    int          nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
 +    int         *zone2cg, *zone_ncg1, *index_gl, *gatindex;
 +    gmx_ga2la_t *ga2la;
 +    char        *bLocalCG;
 +    gmx_bool     bCGs;
 +
 +    bLocalCG = dd->comm->bLocalCG;
 +
 +    if (dd->nat_tot > dd->gatindex_nalloc)
 +    {
 +        dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
 +        srenew(dd->gatindex, dd->gatindex_nalloc);
 +    }
 +
 +    nzone      = dd->comm->zones.n;
 +    zone2cg    = dd->comm->zones.cg_range;
 +    zone_ncg1  = dd->comm->zone_ncg1;
 +    index_gl   = dd->index_gl;
 +    gatindex   = dd->gatindex;
 +    bCGs       = dd->comm->bCGs;
 +
 +    if (zone2cg[1] != dd->ncg_home)
 +    {
 +        gmx_incons("dd->ncg_zone is not up to date");
 +    }
 +
 +    /* Make the local to global and global to local atom index */
 +    a = dd->cgindex[cg_start];
 +    for (zone = 0; zone < nzone; zone++)
 +    {
 +        if (zone == 0)
 +        {
 +            cg0 = cg_start;
 +        }
 +        else
 +        {
 +            cg0 = zone2cg[zone];
 +        }
 +        cg1    = zone2cg[zone+1];
 +        cg1_p1 = cg0 + zone_ncg1[zone];
 +
 +        for (cg = cg0; cg < cg1; cg++)
 +        {
 +            zone1 = zone;
 +            if (cg >= cg1_p1)
 +            {
 +                /* Signal that this cg is from more than one pulse away */
 +                zone1 += nzone;
 +            }
 +            cg_gl = index_gl[cg];
 +            if (bCGs)
 +            {
 +                for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
 +                {
 +                    gatindex[a] = a_gl;
 +                    ga2la_set(dd->ga2la, a_gl, a, zone1);
 +                    a++;
 +                }
 +            }
 +            else
 +            {
 +                gatindex[a] = cg_gl;
 +                ga2la_set(dd->ga2la, cg_gl, a, zone1);
 +                a++;
 +            }
 +        }
 +    }
 +}
 +
 +static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
 +                          const char *where)
 +{
 +    int ncg, i, ngl, nerr;
 +
 +    nerr = 0;
 +    if (bLocalCG == NULL)
 +    {
 +        return nerr;
 +    }
 +    for (i = 0; i < dd->ncg_tot; i++)
 +    {
 +        if (!bLocalCG[dd->index_gl[i]])
 +        {
 +            fprintf(stderr,
 +                    "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
 +            nerr++;
 +        }
 +    }
 +    ngl = 0;
 +    for (i = 0; i < ncg_sys; i++)
 +    {
 +        if (bLocalCG[i])
 +        {
 +            ngl++;
 +        }
 +    }
 +    if (ngl != dd->ncg_tot)
 +    {
 +        fprintf(stderr, "DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot);
 +        nerr++;
 +    }
 +
 +    return nerr;
 +}
 +
 +static void check_index_consistency(gmx_domdec_t *dd,
 +                                    int natoms_sys, int ncg_sys,
 +                                    const char *where)
 +{
 +    int   nerr, ngl, i, a, cell;
 +    int  *have;
 +
 +    nerr = 0;
 +
 +    if (dd->comm->DD_debug > 1)
 +    {
 +        snew(have, natoms_sys);
 +        for (a = 0; a < dd->nat_tot; a++)
 +        {
 +            if (have[dd->gatindex[a]] > 0)
 +            {
 +                fprintf(stderr, "DD node %d: global atom %d occurs twice: index %d and %d\n", dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
 +            }
 +            else
 +            {
 +                have[dd->gatindex[a]] = a + 1;
 +            }
 +        }
 +        sfree(have);
 +    }
 +
 +    snew(have, dd->nat_tot);
 +
 +    ngl  = 0;
 +    for (i = 0; i < natoms_sys; i++)
 +    {
 +        if (ga2la_get(dd->ga2la, i, &a, &cell))
 +        {
 +            if (a >= dd->nat_tot)
 +            {
 +                fprintf(stderr, "DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot);
 +                nerr++;
 +            }
 +            else
 +            {
 +                have[a] = 1;
 +                if (dd->gatindex[a] != i)
 +                {
 +                    fprintf(stderr, "DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1);
 +                    nerr++;
 +                }
 +            }
 +            ngl++;
 +        }
 +    }
 +    if (ngl != dd->nat_tot)
 +    {
 +        fprintf(stderr,
 +                "DD node %d, %s: %d global atom indices, %d local atoms\n",
 +                dd->rank, where, ngl, dd->nat_tot);
 +    }
 +    for (a = 0; a < dd->nat_tot; a++)
 +    {
 +        if (have[a] == 0)
 +        {
 +            fprintf(stderr,
 +                    "DD node %d, %s: local atom %d, global %d is missing from the global to local index\n",
 +                    dd->rank, where, a+1, dd->gatindex[a]+1);
 +        }
 +    }
 +    sfree(have);
 +
 +    nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
 +
 +    if (nerr > 0)
 +    {
 +        gmx_fatal(FARGS, "DD node %d, %s: %d atom/cg index inconsistencies",
 +                  dd->rank, where, nerr);
 +    }
 +}
 +
 +static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
 +{
 +    int   i;
 +    char *bLocalCG;
 +
 +    if (a_start == 0)
 +    {
 +        /* Clear the whole list without searching */
 +        ga2la_clear(dd->ga2la);
 +    }
 +    else
 +    {
 +        for (i = a_start; i < dd->nat_tot; i++)
 +        {
 +            ga2la_del(dd->ga2la, dd->gatindex[i]);
 +        }
 +    }
 +
 +    bLocalCG = dd->comm->bLocalCG;
 +    if (bLocalCG)
 +    {
 +        for (i = cg_start; i < dd->ncg_tot; i++)
 +        {
 +            bLocalCG[dd->index_gl[i]] = FALSE;
 +        }
 +    }
 +
 +    dd_clear_local_vsite_indices(dd);
 +
 +    if (dd->constraints)
 +    {
 +        dd_clear_local_constraint_indices(dd);
 +    }
 +}
 +
 +/* This function should be used when moving the domain boundaries during
 + * DLB, to obtain the minimum allowed cell size. It checks the initially
 + * set limit comm->cellsize_min, set for the bonded and initial non-bonded
 + * cut-offs, and, possibly, a longer cut-off limit set for PME load balancing.
 + */
 +static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
 +{
 +    real cellsize_min;
 +
 +    cellsize_min = comm->cellsize_min[dim];
 +
 +    if (!comm->bVacDLBNoLimit)
 +    {
 +        /* The cut-off might have changed, e.g. by PME load balancing,
 +         * from the value used to set comm->cellsize_min, so check it.
 +         */
 +        cellsize_min = max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
 +
 +        if (comm->bPMELoadBalDLBLimits)
 +        {
 +            /* Check for the cut-off limit set by the PME load balancing */
 +            cellsize_min = max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
 +        }
 +    }
 +
 +    return cellsize_min;
 +}
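 +/* Example with assumed numbers: for cellsize_min[dim] = 1.2 nm, a
 + * current cut-off of 1.0 nm and np_dlb = 2 pulses, the limit is
 + * max(1.2, 1.0/2) = 1.2 nm; if PME load balancing raised the maximum
 + * cut-off to 3.0 nm, it becomes max(1.2, 3.0/2) = 1.5 nm.
 + */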
 +
 +static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
 +                            int dim_ind)
 +{
 +    real grid_jump_limit;
 +
 +    /* The distance between the boundaries of cells at distance
 +     * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
 +     * and by the fact that cells should not be shifted by more than
 +     * half their size, such that charge groups shift by at most
 +     * one cell at redecomposition.
 +     */
 +    grid_jump_limit = comm->cellsize_limit;
 +    if (!comm->bVacDLBNoLimit)
 +    {
 +        if (comm->bPMELoadBalDLBLimits)
 +        {
 +            cutoff = max(cutoff, comm->PMELoadBal_max_cutoff);
 +        }
 +        grid_jump_limit = max(grid_jump_limit,
 +                              cutoff/comm->cd[dim_ind].np);
 +    }
 +
 +    return grid_jump_limit;
 +}
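 +/* Example with assumed numbers (ignoring the PME load balancing limit):
 + * with cellsize_limit = 0.9 nm, a cut-off of 1.2 nm and np = 2 pulses,
 + * the jump limit is max(0.9, 1.2/2) = 0.9 nm; with a single pulse it
 + * would be max(0.9, 1.2/1) = 1.2 nm.
 + */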
 +
 +static gmx_bool check_grid_jump(gmx_large_int_t step,
 +                                gmx_domdec_t   *dd,
 +                                real            cutoff,
 +                                gmx_ddbox_t    *ddbox,
 +                                gmx_bool        bFatal)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d, dim;
 +    real               limit, bfac;
 +    gmx_bool           bInvalid;
 +
 +    bInvalid = FALSE;
 +
 +    comm = dd->comm;
 +
 +    for (d = 1; d < dd->ndim; d++)
 +    {
 +        dim   = dd->dim[d];
 +        limit = grid_jump_limit(comm, cutoff, d);
 +        bfac  = ddbox->box_size[dim];
 +        if (ddbox->tric_dir[dim])
 +        {
 +            bfac *= ddbox->skew_fac[dim];
 +        }
 +        if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
 +            (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
 +        {
 +            bInvalid = TRUE;
 +
 +            if (bFatal)
 +            {
 +                char buf[22];
 +
 +                /* This error should never be triggered under normal
 +                 * circumstances, but you never know ...
 +                 */
 +                gmx_fatal(FARGS, "Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer nodes might avoid this issue.",
 +                          gmx_step_str(step, buf),
 +                          dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 +            }
 +        }
 +    }
 +
 +    return bInvalid;
 +}
 +
 +static int dd_load_count(gmx_domdec_comm_t *comm)
 +{
 +    return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
 +}
 +
 +static float dd_force_load(gmx_domdec_comm_t *comm)
 +{
 +    float load;
 +
 +    if (comm->eFlop)
 +    {
 +        load = comm->flop;
 +        if (comm->eFlop > 1)
 +        {
 +            load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
 +        }
 +    }
 +    else
 +    {
 +        load = comm->cycl[ddCyclF];
 +        if (comm->cycl_n[ddCyclF] > 1)
 +        {
 +            /* Subtract the maximum of the last n cycle counts
 +             * to get rid of possible high counts due to other sources,
 +             * for instance system activity, that would otherwise
 +             * affect the dynamic load balancing.
 +             */
 +            load -= comm->cycl_max[ddCyclF];
 +        }
 +
 +#ifdef GMX_MPI
 +        if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
 +        {
 +            float gpu_wait, gpu_wait_sum;
 +
 +            gpu_wait = comm->cycl[ddCyclWaitGPU];
 +            if (comm->cycl_n[ddCyclF] > 1)
 +            {
 +                /* We should remove the WaitGPU time of the same MD step
 +                 * as the one with the maximum F time, since the F time
 +                 * and the wait time are not independent.
 +                 * Furthermore, the step for the max F time should be chosen
 +                 * the same on all ranks that share the same GPU.
 +                 * But to keep the code simple, we remove the average instead.
 +                 * The main reason for artificially long times at some steps
 +                 * is spurious CPU activity or MPI time, so we don't expect
 +                 * that changes in the GPU wait time matter a lot here.
 +                 */
 +                gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/(float)comm->cycl_n[ddCyclF];
 +            }
 +            /* Sum the wait times over the ranks that share the same GPU */
 +            MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
 +                          comm->mpi_comm_gpu_shared);
 +            /* Replace the wait time by the average over the ranks */
 +            load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
 +        }
 +#endif
 +    }
 +
 +    return load;
 +}
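 +/* Sketch of the GPU-wait averaging above (illustrative): if two ranks
 + * sharing a GPU measured wait times of 10 and 30 cycles, the Allreduce
 + * sums them to 40 and each rank replaces its own value by 40/2 = 20,
 + * so all sharing ranks report the same, averaged wait contribution.
 + */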
 +
 +static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                i;
 +
 +    comm = dd->comm;
 +
 +    snew(*dim_f, dd->nc[dim]+1);
 +    (*dim_f)[0] = 0;
 +    for (i = 1; i < dd->nc[dim]; i++)
 +    {
 +        if (comm->slb_frac[dim])
 +        {
 +            (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
 +        }
 +        else
 +        {
 +            (*dim_f)[i] = (real)i/(real)dd->nc[dim];
 +        }
 +    }
 +    (*dim_f)[dd->nc[dim]] = 1;
 +}
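 +/* Illustrative example: for nc[dim] = 4 with slb_frac = {0.3, 0.3, 0.2,
 + * 0.2} this produces dim_f = {0, 0.3, 0.6, 0.8, 1}; without slb_frac the
 + * boundaries are uniform: {0, 0.25, 0.5, 0.75, 1}.
 + */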
 +
 +static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
 +{
 +    int  pmeindex, slab, nso, i;
 +    ivec xyz;
 +
 +    if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
 +    {
 +        ddpme->dim = YY;
 +    }
 +    else
 +    {
 +        ddpme->dim = dimind;
 +    }
 +    ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
 +
 +    ddpme->nslab = (ddpme->dim == 0 ?
 +                    dd->comm->npmenodes_x :
 +                    dd->comm->npmenodes_y);
 +
 +    if (ddpme->nslab <= 1)
 +    {
 +        return;
 +    }
 +
 +    nso = dd->comm->npmenodes/ddpme->nslab;
 +    /* Determine for each PME slab the PP location range for dimension dim */
 +    snew(ddpme->pp_min, ddpme->nslab);
 +    snew(ddpme->pp_max, ddpme->nslab);
 +    for (slab = 0; slab < ddpme->nslab; slab++)
 +    {
 +        ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
 +        ddpme->pp_max[slab] = 0;
 +    }
 +    for (i = 0; i < dd->nnodes; i++)
 +    {
 +        ddindex2xyz(dd->nc, i, xyz);
 +        /* For y only use our y/z slab.
 +         * This assumes that the PME x grid size matches the DD grid size.
 +         */
 +        if (dimind == 0 || xyz[XX] == dd->ci[XX])
 +        {
 +            pmeindex = ddindex2pmeindex(dd, i);
 +            if (dimind == 0)
 +            {
 +                slab = pmeindex/nso;
 +            }
 +            else
 +            {
 +                slab = pmeindex % ddpme->nslab;
 +            }
 +            ddpme->pp_min[slab] = min(ddpme->pp_min[slab], xyz[dimind]);
 +            ddpme->pp_max[slab] = max(ddpme->pp_max[slab], xyz[dimind]);
 +        }
 +    }
 +
 +    set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
 +}
 +
 +int dd_pme_maxshift_x(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->ddpme[0].dim == XX)
 +    {
 +        return dd->comm->ddpme[0].maxshift;
 +    }
 +    else
 +    {
 +        return 0;
 +    }
 +}
 +
 +int dd_pme_maxshift_y(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->ddpme[0].dim == YY)
 +    {
 +        return dd->comm->ddpme[0].maxshift;
 +    }
 +    else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
 +    {
 +        return dd->comm->ddpme[1].maxshift;
 +    }
 +    else
 +    {
 +        return 0;
 +    }
 +}
 +
 +static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
 +                             gmx_bool bUniform, gmx_ddbox_t *ddbox, real *cell_f)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                nc, ns, s;
 +    int               *xmin, *xmax;
 +    real               range, pme_boundary;
 +    int                sh;
 +
 +    comm = dd->comm;
 +    nc   = dd->nc[ddpme->dim];
 +    ns   = ddpme->nslab;
 +
 +    if (!ddpme->dim_match)
 +    {
 +        /* PP decomposition is not along dim: the worst situation */
 +        sh = ns/2;
 +    }
 +    else if (ns <= 3 || (bUniform && ns == nc))
 +    {
 +        /* The optimal situation */
 +        sh = 1;
 +    }
 +    else
 +    {
 +        /* For each PME node we need to determine which PP nodes
 +         * it could possibly need to communicate with.
 +         */
 +        xmin = ddpme->pp_min;
 +        xmax = ddpme->pp_max;
 +        /* Allow for atoms to be maximally 2/3 times the cut-off
 +         * out of their DD cell. This is a reasonable balance
 +         * between performance and support for most charge-group/cut-off
 +         * combinations.
 +         */
 +        range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
 +        /* Avoid extra communication when we are exactly at a boundary */
 +        range *= 0.999;
 +
 +        sh = 1;
 +        for (s = 0; s < ns; s++)
 +        {
 +            /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
 +            pme_boundary = (real)s/ns;
 +            while (sh+1 < ns &&
 +                   ((s-(sh+1) >= 0 &&
 +                     cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
 +                    (s-(sh+1) <  0 &&
 +                     cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
 +            {
 +                sh++;
 +            }
 +            pme_boundary = (real)(s+1)/ns;
 +            while (sh+1 < ns &&
 +                   ((s+(sh+1) <  ns &&
 +                     cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
 +                    (s+(sh+1) >= ns &&
 +                     cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
 +            {
 +                sh++;
 +            }
 +        }
 +    }
 +
 +    ddpme->maxshift = sh;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "PME slab communication range for dim %d is %d\n",
 +                ddpme->dim, ddpme->maxshift);
 +    }
 +}
 +
 +static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
 +{
 +    int d, dim;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        if (dim < ddbox->nboundeddim &&
 +            ddbox->box_size[dim]*ddbox->skew_fac[dim] <
 +            dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
 +        {
 +            gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
 +                      dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
 +                      dd->nc[dim], dd->comm->cellsize_limit);
 +        }
 +    }
 +}
 +
 +static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
 +                                  gmx_bool bMaster, ivec npulse)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d, j;
 +    rvec               cellsize_min;
 +    real              *cell_x, cell_dx, cellsize;
 +
 +    comm = dd->comm;
 +
 +    for (d = 0; d < DIM; d++)
 +    {
 +        cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
 +        npulse[d]       = 1;
 +        if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
 +        {
 +            /* Uniform grid */
 +            cell_dx = ddbox->box_size[d]/dd->nc[d];
 +            if (bMaster)
 +            {
 +                for (j = 0; j < dd->nc[d]+1; j++)
 +                {
 +                    dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
 +                }
 +            }
 +            else
 +            {
 +                comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
 +                comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
 +            }
 +            cellsize = cell_dx*ddbox->skew_fac[d];
 +            while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
 +            {
 +                npulse[d]++;
 +            }
 +            cellsize_min[d] = cellsize;
 +        }
 +        else
 +        {
 +            /* Statically load balanced grid */
 +            /* Also when we are not doing a master distribution we determine
 +             * all cell borders in a loop, to obtain values identical to the
 +             * master distribution case and to determine npulse.
 +             */
 +            if (bMaster)
 +            {
 +                cell_x = dd->ma->cell_x[d];
 +            }
 +            else
 +            {
 +                snew(cell_x, dd->nc[d]+1);
 +            }
 +            cell_x[0] = ddbox->box0[d];
 +            for (j = 0; j < dd->nc[d]; j++)
 +            {
 +                cell_dx     = ddbox->box_size[d]*comm->slb_frac[d][j];
 +                cell_x[j+1] = cell_x[j] + cell_dx;
 +                cellsize    = cell_dx*ddbox->skew_fac[d];
 +                while (cellsize*npulse[d] < comm->cutoff &&
 +                       npulse[d] < dd->nc[d]-1)
 +                {
 +                    npulse[d]++;
 +                }
 +                cellsize_min[d] = min(cellsize_min[d], cellsize);
 +            }
 +            if (!bMaster)
 +            {
 +                comm->cell_x0[d] = cell_x[dd->ci[d]];
 +                comm->cell_x1[d] = cell_x[dd->ci[d]+1];
 +                sfree(cell_x);
 +            }
 +        }
 +        /* The following limitation prevents a cell from receiving
 +         * some of its own home charge groups back over the periodic boundary.
 +         * Duplicate charge groups cause trouble with the global indices.
 +         */
 +        if (d < ddbox->npbcdim &&
 +            dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
 +        {
 +            gmx_fatal_collective(FARGS, NULL, dd,
 +                                 "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
 +                                 dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
 +                                 comm->cutoff,
 +                                 dd->nc[d], dd->nc[d],
 +                                 dd->nnodes > dd->nc[d] ? "cells" : "processors");
 +        }
 +    }
 +
 +    if (!comm->bDynLoadBal)
 +    {
 +        copy_rvec(cellsize_min, comm->cellsize_min);
 +    }
 +
 +    for (d = 0; d < comm->npmedecompdim; d++)
 +    {
 +        set_pme_maxshift(dd, &comm->ddpme[d],
 +                         comm->slb_frac[dd->dim[d]] == NULL, ddbox,
 +                         comm->ddpme[d].slb_dim_f);
 +    }
 +}
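 +/* Illustrative npulse example: with a box of 8 nm, nc[d] = 8 (1 nm cells)
 + * and a 1.5 nm cut-off, npulse[d] becomes 2, since two 1-nm pulses are
 + * needed to cover the cut-off distance.
 + */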
 +
 +
 +static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
 +                                                  int d, int dim, gmx_domdec_root_t *root,
 +                                                  gmx_ddbox_t *ddbox,
 +                                                  gmx_bool bUniform, gmx_large_int_t step, real cellsize_limit_f, int range[])
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                ncd, i, j, nmin, nmin_old;
 +    gmx_bool           bLimLo, bLimHi;
 +    real              *cell_size;
 +    real               fac, halfway, cellsize_limit_f_i, region_size;
 +    gmx_bool           bPBC, bLastHi = FALSE;
 +    int                nrange[] = {range[0], range[1]};
 +
 +    region_size = root->cell_f[range[1]]-root->cell_f[range[0]];
 +
 +    comm = dd->comm;
 +
 +    ncd = dd->nc[dim];
 +
 +    bPBC = (dim < ddbox->npbcdim);
 +
 +    cell_size = root->buf_ncd;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]);
 +    }
 +
 +    /* First we need to check that the scaling does not make cells
 +     * smaller than the smallest allowed size.
 +     * We need to do this iteratively, since if a cell is too small,
 +     * it needs to be enlarged, which makes all the other cells smaller,
 +     * which could in turn make another cell smaller than allowed.
 +     */
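 +    /* Illustrative trace (assumed numbers): three cells with relative
 +     * sizes {0.5, 0.3, 0.2} and a limit of 0.25 first clamp the last
 +     * cell to 0.25; the remaining 0.75 is then redistributed over the
 +     * first two cells, (0.5, 0.3)*0.75/0.8 = (0.469, 0.281), and the
 +     * loop repeats until no further cell drops below the limit.
 +     */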
 +    for (i = range[0]; i < range[1]; i++)
 +    {
 +        root->bCellMin[i] = FALSE;
 +    }
 +    nmin = 0;
 +    do
 +    {
 +        nmin_old = nmin;
 +        /* We need the total for normalization */
 +        fac = 0;
 +        for (i = range[0]; i < range[1]; i++)
 +        {
 +            if (root->bCellMin[i] == FALSE)
 +            {
 +                fac += cell_size[i];
 +            }
 +        }
 +        fac = (region_size - nmin*cellsize_limit_f)/fac; /* subtracting cells already set to cellsize_limit_f */
 +        /* Determine the cell boundaries */
 +        for (i = range[0]; i < range[1]; i++)
 +        {
 +            if (root->bCellMin[i] == FALSE)
 +            {
 +                cell_size[i] *= fac;
 +                if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
 +                {
 +                    cellsize_limit_f_i = 0;
 +                }
 +                else
 +                {
 +                    cellsize_limit_f_i = cellsize_limit_f;
 +                }
 +                if (cell_size[i] < cellsize_limit_f_i)
 +                {
 +                    root->bCellMin[i] = TRUE;
 +                    cell_size[i]      = cellsize_limit_f_i;
 +                    nmin++;
 +                }
 +            }
 +            root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
 +        }
 +    }
 +    while (nmin > nmin_old);
 +
 +    i            = range[1]-1;
 +    cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
 +    /* For this check we should not use DD_CELL_MARGIN,
 +     * but a slightly smaller factor,
 +     * since rounding could get us below the limit.
 +     */
 +    if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
 +    {
 +        char buf[22];
 +        gmx_fatal(FARGS, "Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
 +                  gmx_step_str(step, buf),
 +                  dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
 +                  ncd, comm->cellsize_min[dim]);
 +    }
 +
 +    root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd);
 +
 +    if (!bUniform)
 +    {
 +        /* Check that each boundary has not moved more than halfway
 +         * across either of the cells it bounds, as this could cause problems,
 +         * especially when the differences between cell sizes are large.
 +         * If changes are applied, they will not make cells smaller
 +         * than the cut-off, as we check all the boundaries that
 +         * might be affected by a change, and if the old state was ok,
 +         * the cells will at most be shrunk back to their old size.
 +         */
 +        for (i = range[0]+1; i < range[1]; i++)
 +        {
 +            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
 +            if (root->cell_f[i] < halfway)
 +            {
 +                root->cell_f[i] = halfway;
 +                /* Check if the change also causes shifts of the next boundaries */
 +                for (j = i+1; j < range[1]; j++)
 +                {
 +                    if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
 +                    {
 +                        root->cell_f[j] = root->cell_f[j-1] + cellsize_limit_f;
 +                    }
 +                }
 +            }
 +            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
 +            if (root->cell_f[i] > halfway)
 +            {
 +                root->cell_f[i] = halfway;
 +                /* Check if the change also causes shifts of the next boundaries */
 +                for (j = i-1; j >= range[0]+1; j--)
 +                {
 +                    if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
 +                    {
 +                        root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    /* nrange is defined as the [lower, upper) range for the next call to enforce_limits */
 +    /* Find the highest violation of LimLo (a) and the following violation
 +     * of LimHi (thus the lowest following one) (b), then call enforce_limits
 +     * for (oldb,a) and (a,b). In the next step: (b,nexta); oldb and nexta
 +     * can be the range boundaries. nrange is used to hold a and b.
 +     */
 +    if (d > 0)
 +    {
 +        /* Take care of the staggering of the cell boundaries */
 +        if (bUniform)
 +        {
 +            for (i = range[0]; i < range[1]; i++)
 +            {
 +                root->cell_f_max0[i] = root->cell_f[i];
 +                root->cell_f_min1[i] = root->cell_f[i+1];
 +            }
 +        }
 +        else
 +        {
 +            for (i = range[0]+1; i < range[1]; i++)
 +            {
 +                bLimLo = (root->cell_f[i] < root->bound_min[i]);
 +                bLimHi = (root->cell_f[i] > root->bound_max[i]);
 +                if (bLimLo && bLimHi)
 +                {
 +                    /* Both limits violated, try the best we can */
 +                    /* For this case we split the original range (range) in two parts and handle the other limitations in the next iteration. */
 +                    root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
 +                    nrange[0]       = range[0];
 +                    nrange[1]       = i;
 +                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +
 +                    nrange[0] = i;
 +                    nrange[1] = range[1];
 +                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +
 +                    return;
 +                }
 +                else if (bLimLo)
 +                {
 +                    /* root->cell_f[i] = root->bound_min[i]; */
 +                    nrange[1] = i;  /* Only store the violation location; there could be a following LimLo violation with a higher index */
 +                    bLastHi   = FALSE;
 +                }
 +                else if (bLimHi && !bLastHi)
 +                {
 +                    bLastHi = TRUE;
 +                    if (nrange[1] < range[1])   /* found a LimLo before */
 +                    {
 +                        root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
 +                        dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +                        nrange[0] = nrange[1];
 +                    }
 +                    root->cell_f[i] = root->bound_max[i];
 +                    nrange[1]       = i;
 +                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +                    nrange[0] = i;
 +                    nrange[1] = range[1];
 +                }
 +            }
 +            if (nrange[1] < range[1])   /* the last violation found was a LimLo */
 +            {
 +                root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
 +                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +                nrange[0] = nrange[1];
 +                nrange[1] = range[1];
 +                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +            }
 +            else if (nrange[0] > range[0]) /* found at least one LimHi */
 +            {
 +                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +            }
 +        }
 +    }
 +}
 +
 +
 +static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
 +                                       int d, int dim, gmx_domdec_root_t *root,
 +                                       gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
 +                                       gmx_bool bUniform, gmx_large_int_t step)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                ncd, d1, i, j, pos;
 +    real              *cell_size;
 +    real               load_aver, load_i, imbalance, change, change_max, sc;
 +    real               cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
 +    real               change_limit;
 +    real               relax = 0.5;
 +    gmx_bool           bPBC;
 +    int                range[] = { 0, 0 };
 +
 +    comm = dd->comm;
 +
 +    /* Convert the maximum change from the input percentage to a fraction */
 +    change_limit = comm->dlb_scale_lim*0.01;
 +
 +    ncd = dd->nc[dim];
 +
 +    bPBC = (dim < ddbox->npbcdim);
 +
 +    cell_size = root->buf_ncd;
 +
 +    /* Store the original boundaries */
 +    for (i = 0; i < ncd+1; i++)
 +    {
 +        root->old_cell_f[i] = root->cell_f[i];
 +    }
 +    if (bUniform)
 +    {
 +        for (i = 0; i < ncd; i++)
 +        {
 +            cell_size[i] = 1.0/ncd;
 +        }
 +    }
 +    else if (dd_load_count(comm))
 +    {
 +        load_aver  = comm->load[d].sum_m/ncd;
 +        change_max = 0;
 +        for (i = 0; i < ncd; i++)
 +        {
 +            /* Determine the relative imbalance of cell i */
 +            load_i    = comm->load[d].load[i*comm->load[d].nload+2];
 +            imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
 +            /* Determine the change of the cell size using underrelaxation */
 +            change     = -relax*imbalance;
 +            change_max = max(change_max, max(change, -change));
 +        }
 +        /* Limit the amount of scaling.
 +         * We need to use the same rescaling for all cells in one row,
 +         * otherwise the load balancing might not converge.
 +         */
 +        sc = relax;
 +        if (change_max > change_limit)
 +        {
 +            sc *= change_limit/change_max;
 +        }
 +        for (i = 0; i < ncd; i++)
 +        {
 +            /* Determine the relative imbalance of cell i */
 +            load_i    = comm->load[d].load[i*comm->load[d].nload+2];
 +            imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
 +            /* Determine the change of the cell size using underrelaxation */
 +            change       = -sc*imbalance;
 +            cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
 +        }
 +    }
 +
 +    cellsize_limit_f  = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim];
 +    cellsize_limit_f *= DD_CELL_MARGIN;
 +    dist_min_f_hard   = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim];
 +    dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
 +    if (ddbox->tric_dir[dim])
 +    {
 +        cellsize_limit_f /= ddbox->skew_fac[dim];
 +        dist_min_f       /= ddbox->skew_fac[dim];
 +    }
 +    if (bDynamicBox && d > 0)
 +    {
 +        dist_min_f *= DD_PRES_SCALE_MARGIN;
 +    }
 +    if (d > 0 && !bUniform)
 +    {
 +        /* Make sure that the grid is not shifted too much */
 +        for (i = 1; i < ncd; i++)
 +        {
 +            if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
 +            {
 +                gmx_incons("Inconsistent DD boundary staggering limits!");
 +            }
 +            root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
 +            space              = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
 +            if (space > 0)
 +            {
 +                root->bound_min[i] += 0.5*space;
 +            }
 +            root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
 +            space              = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
 +            if (space < 0)
 +            {
 +                root->bound_max[i] += 0.5*space;
 +            }
 +            if (debug)
 +            {
 +                fprintf(debug,
 +                        "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
 +                        d, i,
 +                        root->cell_f_max0[i-1] + dist_min_f,
 +                        root->bound_min[i], root->cell_f[i], root->bound_max[i],
 +                        root->cell_f_min1[i] - dist_min_f);
 +            }
 +        }
 +    }
 +    range[1]          = ncd;
 +    root->cell_f[0]   = 0;
 +    root->cell_f[ncd] = 1;
 +    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
 +
 +
 +    /* After the checks above, the cells should obey the cut-off
 +     * restrictions, but it does not hurt to check.
 +     */
 +    for (i = 0; i < ncd; i++)
 +    {
 +        if (debug)
 +        {
 +            fprintf(debug, "Relative bounds dim %d  cell %d: %f %f\n",
 +                    dim, i, root->cell_f[i], root->cell_f[i+1]);
 +        }
 +
 +        if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
 +            root->cell_f[i+1] - root->cell_f[i] <
 +            cellsize_limit_f/DD_CELL_MARGIN)
 +        {
 +            char buf[22];
 +            fprintf(stderr,
 +                    "\nWARNING step %s: direction %c, cell %d too small: %f\n",
 +                    gmx_step_str(step, buf), dim2char(dim), i,
 +                    (root->cell_f[i+1] - root->cell_f[i])
 +                    *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
 +        }
 +    }
 +
 +    pos = ncd + 1;
 +    /* Store the cell boundaries of the lower dimensions at the end */
 +    for (d1 = 0; d1 < d; d1++)
 +    {
 +        root->cell_f[pos++] = comm->cell_f0[d1];
 +        root->cell_f[pos++] = comm->cell_f1[d1];
 +    }
 +
 +    if (d < comm->npmedecompdim)
 +    {
 +        /* The master determines the maximum shift for
 +         * the coordinate communication between separate PME nodes.
 +         */
 +        set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f);
 +    }
 +    root->cell_f[pos++] = comm->ddpme[0].maxshift;
 +    if (d >= 1)
 +    {
 +        root->cell_f[pos++] = comm->ddpme[1].maxshift;
 +    }
 +}
 +
 +static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
 +                                             gmx_ddbox_t *ddbox, int dimind)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                dim;
 +
 +    comm = dd->comm;
 +
 +    /* Set the cell dimensions */
 +    dim                = dd->dim[dimind];
 +    comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
 +    comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
 +    if (dim >= ddbox->nboundeddim)
 +    {
 +        comm->cell_x0[dim] += ddbox->box0[dim];
 +        comm->cell_x1[dim] += ddbox->box0[dim];
 +    }
 +}
 +
 +static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
 +                                         int d, int dim, real *cell_f_row,
 +                                         gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d1, dim1, pos;
 +
 +    comm = dd->comm;
 +
 +#ifdef GMX_MPI
 +    /* Each node would only need to know two fractions,
 +     * but it is probably cheaper to broadcast the whole array.
 +     */
 +    MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
 +              0, comm->mpi_comm_load[d]);
 +#endif
 +    /* Copy the fractions for this dimension from the buffer */
 +    comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
 +    comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
 +    /* The whole array was communicated, so set the buffer position */
 +    pos = dd->nc[dim] + 1;
 +    for (d1 = 0; d1 <= d; d1++)
 +    {
 +        if (d1 < d)
 +        {
 +            /* Copy the cell fractions of the lower dimensions */
 +            comm->cell_f0[d1] = cell_f_row[pos++];
 +            comm->cell_f1[d1] = cell_f_row[pos++];
 +        }
 +        relative_to_absolute_cell_bounds(dd, ddbox, d1);
 +    }
 +    /* Convert the communicated shift from float to int */
 +    comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
 +    if (d >= 1)
 +    {
 +        comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
 +    }
 +}
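 +/* Layout of cell_f_row for the broadcast above (illustrative, for
 + * dd->nc[dim] = 4 and d = 1): {f0..f4, cell_f0[0], cell_f1[0],
 + * maxshift0, maxshift1}, i.e. the 5 boundary fractions, the bounds of
 + * the lower dimension, and the PME shifts converted back to int.
 + */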
 +
 +static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
 +                                         gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
 +                                         gmx_bool bUniform, gmx_large_int_t step)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d, dim, d1;
 +    gmx_bool           bRowMember, bRowRoot;
 +    real              *cell_f_row;
 +
 +    comm = dd->comm;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim        = dd->dim[d];
 +        bRowMember = TRUE;
 +        bRowRoot   = TRUE;
 +        for (d1 = d; d1 < dd->ndim; d1++)
 +        {
 +            if (dd->ci[dd->dim[d1]] > 0)
 +            {
 +                if (d1 != d)
 +                {
 +                    bRowMember = FALSE;
 +                }
 +                bRowRoot = FALSE;
 +            }
 +        }
 +        if (bRowMember)
 +        {
 +            if (bRowRoot)
 +            {
 +                set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
 +                                           ddbox, bDynamicBox, bUniform, step);
 +                cell_f_row = comm->root[d]->cell_f;
 +            }
 +            else
 +            {
 +                cell_f_row = comm->cell_f_row;
 +            }
 +            distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
 +        }
 +    }
 +}
 +
 +static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
 +{
 +    int d;
 +
 +    /* This function assumes the box is static and should therefore
 +     * not be called when the box has changed since the last
 +     * call to dd_partition_system.
 +     */
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        relative_to_absolute_cell_bounds(dd, ddbox, d);
 +    }
 +}
 +
 +
 +
 +static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
 +                                  gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
 +                                  gmx_bool bUniform, gmx_bool bDoDLB, gmx_large_int_t step,
 +                                  gmx_wallcycle_t wcycle)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                dim;
 +
 +    comm = dd->comm;
 +
 +    if (bDoDLB)
 +    {
 +        wallcycle_start(wcycle, ewcDDCOMMBOUND);
 +        set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
 +        wallcycle_stop(wcycle, ewcDDCOMMBOUND);
 +    }
 +    else if (bDynamicBox)
 +    {
 +        set_dd_cell_sizes_dlb_nochange(dd, ddbox);
 +    }
 +
 +    /* Set the dimensions for which no DD is used */
 +    for (dim = 0; dim < DIM; dim++)
 +    {
 +        if (dd->nc[dim] == 1)
 +        {
 +            comm->cell_x0[dim] = 0;
 +            comm->cell_x1[dim] = ddbox->box_size[dim];
 +            if (dim >= ddbox->nboundeddim)
 +            {
 +                comm->cell_x0[dim] += ddbox->box0[dim];
 +                comm->cell_x1[dim] += ddbox->box0[dim];
 +            }
 +        }
 +    }
 +}
 +
 +static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
 +{
 +    int                    d, np, i;
 +    gmx_domdec_comm_dim_t *cd;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        cd = &dd->comm->cd[d];
 +        np = npulse[dd->dim[d]];
 +        if (np > cd->np_nalloc)
 +        {
 +            if (debug)
 +            {
 +                fprintf(debug, "(Re)allocating cd for %c to %d pulses\n",
 +                        dim2char(dd->dim[d]), np);
 +            }
 +            if (DDMASTER(dd) && cd->np_nalloc > 0)
 +            {
 +                fprintf(stderr, "\nIncreasing the number of cells to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
 +            }
 +            srenew(cd->ind, np);
 +            for (i = cd->np_nalloc; i < np; i++)
 +            {
 +                cd->ind[i].index  = NULL;
 +                cd->ind[i].nalloc = 0;
 +            }
 +            cd->np_nalloc = np;
 +        }
 +        cd->np = np;
 +    }
 +}
 +
 +
 +static void set_dd_cell_sizes(gmx_domdec_t *dd,
 +                              gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
 +                              gmx_bool bUniform, gmx_bool bDoDLB, gmx_large_int_t step,
 +                              gmx_wallcycle_t wcycle)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d;
 +    ivec               npulse;
 +
 +    comm = dd->comm;
 +
 +    /* Copy the old cell boundaries for the cg displacement check */
 +    copy_rvec(comm->cell_x0, comm->old_cell_x0);
 +    copy_rvec(comm->cell_x1, comm->old_cell_x1);
 +
 +    if (comm->bDynLoadBal)
 +    {
 +        if (DDMASTER(dd))
 +        {
 +            check_box_size(dd, ddbox);
 +        }
 +        set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
 +    }
 +    else
 +    {
 +        set_dd_cell_sizes_slb(dd, ddbox, FALSE, npulse);
 +        realloc_comm_ind(dd, npulse);
 +    }
 +
 +    if (debug)
 +    {
 +        for (d = 0; d < DIM; d++)
 +        {
 +            fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
 +                    d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
 +        }
 +    }
 +}
 +
 +static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
 +                                  gmx_ddbox_t *ddbox,
 +                                  rvec cell_ns_x0, rvec cell_ns_x1,
 +                                  gmx_large_int_t step)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                dim_ind, dim;
 +
 +    comm = dd->comm;
 +
 +    for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
 +    {
 +        dim = dd->dim[dim_ind];
 +
 +        /* Without PBC we don't have restrictions on the outer cells */
 +        if (!(dim >= ddbox->npbcdim &&
 +              (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
 +            comm->bDynLoadBal &&
 +            (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
 +            comm->cellsize_min[dim])
 +        {
 +            char buf[22];
 +            gmx_fatal(FARGS, "Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
 +                      gmx_step_str(step, buf), dim2char(dim),
 +                      comm->cell_x1[dim] - comm->cell_x0[dim],
 +                      ddbox->skew_fac[dim],
 +                      dd->comm->cellsize_min[dim],
 +                      dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 +        }
 +    }
 +
 +    if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
 +    {
 +        /* Communicate the boundaries and update cell_ns_x0/1 */
 +        dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
 +        if (dd->bGridJump && dd->ndim > 1)
 +        {
 +            check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
 +        }
 +    }
 +}
 +
 +static void make_tric_corr_matrix(int npbcdim, matrix box, matrix tcm)
 +{
 +    if (YY < npbcdim)
 +    {
 +        tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
 +    }
 +    else
 +    {
 +        tcm[YY][XX] = 0;
 +    }
 +    if (ZZ < npbcdim)
 +    {
 +        tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
 +        tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
 +    }
 +    else
 +    {
 +        tcm[ZZ][XX] = 0;
 +        tcm[ZZ][YY] = 0;
 +    }
 +}
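 +/* Illustrative check of the correction matrix: for a box with shear
 + * component box[YY][XX] = b_yx, tcm[YY][XX] = -b_yx/b_yy, so that the
 + * corrected coordinate x + y*tcm[YY][XX] removes the x-shear contributed
 + * by y; the ZZ row does the same for z, including the shear already
 + * corrected via the YY row.
 + */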
 +
 +static void check_screw_box(matrix box)
 +{
 +    /* Mathematical limitation */
 +    if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
 +    {
 +        gmx_fatal(FARGS, "With screw pbc the unit cell cannot have non-zero off-diagonal x-components");
 +    }
 +
 +    /* Limitation due to the asymmetry of the eighth shell method */
 +    if (box[ZZ][YY] != 0)
 +    {
 +        gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
 +    }
 +}
 +
 +static void distribute_cg(FILE *fplog, gmx_large_int_t step,
 +                          matrix box, ivec tric_dir, t_block *cgs, rvec pos[],
 +                          gmx_domdec_t *dd)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                **tmp_ind = NULL, *tmp_nalloc = NULL;
 +    int                  i, icg, j, k, k0, k1, d, npbcdim;
 +    matrix               tcm;
 +    rvec                 box_size, cg_cm;
 +    ivec                 ind;
 +    real                 nrcg, inv_ncg, pos_d;
 +    atom_id             *cgindex;
 +    gmx_bool             bUnbounded, bScrew;
 +
 +    ma = dd->ma;
 +
 +    if (tmp_ind == NULL)
 +    {
 +        snew(tmp_nalloc, dd->nnodes);
 +        snew(tmp_ind, dd->nnodes);
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
 +            snew(tmp_ind[i], tmp_nalloc[i]);
 +        }
 +    }
 +
 +    /* Clear the count */
 +    for (i = 0; i < dd->nnodes; i++)
 +    {
 +        ma->ncg[i] = 0;
 +        ma->nat[i] = 0;
 +    }
 +
 +    make_tric_corr_matrix(dd->npbcdim, box, tcm);
 +
 +    cgindex = cgs->index;
 +
 +    /* Compute the center of geometry for all charge groups */
 +    for (icg = 0; icg < cgs->nr; icg++)
 +    {
 +        k0      = cgindex[icg];
 +        k1      = cgindex[icg+1];
 +        nrcg    = k1 - k0;
 +        if (nrcg == 1)
 +        {
 +            copy_rvec(pos[k0], cg_cm);
 +        }
 +        else
 +        {
 +            inv_ncg = 1.0/nrcg;
 +
 +            clear_rvec(cg_cm);
 +            for (k = k0; (k < k1); k++)
 +            {
 +                rvec_inc(cg_cm, pos[k]);
 +            }
 +            for (d = 0; (d < DIM); d++)
 +            {
 +                cg_cm[d] *= inv_ncg;
 +            }
 +        }
 +        /* Put the charge group in the box and determine the cell index */
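 +        /* Process the dimensions from high to low, so that the triclinic
 +         * correction for dimension d uses higher-dimension coordinates
 +         * that have already been put in the box.
 +         */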
 +        for (d = DIM-1; d >= 0; d--)
 +        {
 +            pos_d = cg_cm[d];
 +            if (d < dd->npbcdim)
 +            {
 +                bScrew = (dd->bScrewPBC && d == XX);
 +                if (tric_dir[d] && dd->nc[d] > 1)
 +                {
 +                    /* Use triclinic coordinates for this dimension */
 +                    for (j = d+1; j < DIM; j++)
 +                    {
 +                        pos_d += cg_cm[j]*tcm[j][d];
 +                    }
 +                }
 +                while (pos_d >= box[d][d])
 +                {
 +                    pos_d -= box[d][d];
 +                    rvec_dec(cg_cm, box[d]);
 +                    if (bScrew)
 +                    {
 +                        cg_cm[YY] = box[YY][YY] - cg_cm[YY];
 +                        cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
 +                    }
 +                    for (k = k0; (k < k1); k++)
 +                    {
 +                        rvec_dec(pos[k], box[d]);
 +                        if (bScrew)
 +                        {
 +                            pos[k][YY] = box[YY][YY] - pos[k][YY];
 +                            pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
 +                        }
 +                    }
 +                }
 +                while (pos_d < 0)
 +                {
 +                    pos_d += box[d][d];
 +                    rvec_inc(cg_cm, box[d]);
 +                    if (bScrew)
 +                    {
 +                        cg_cm[YY] = box[YY][YY] - cg_cm[YY];
 +                        cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
 +                    }
 +                    for (k = k0; (k < k1); k++)
 +                    {
 +                        rvec_inc(pos[k], box[d]);
 +                        if (bScrew)
 +                        {
 +                            pos[k][YY] = box[YY][YY] - pos[k][YY];
 +                            pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
 +                        }
 +                    }
 +                }
 +            }
 +            /* This could be done more efficiently */
 +            ind[d] = 0;
 +            while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
 +            {
 +                ind[d]++;
 +            }
 +        }
 +        i = dd_index(dd->nc, ind);
 +        if (ma->ncg[i] == tmp_nalloc[i])
 +        {
 +            tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
 +            srenew(tmp_ind[i], tmp_nalloc[i]);
 +        }
 +        tmp_ind[i][ma->ncg[i]] = icg;
 +        ma->ncg[i]++;
 +        ma->nat[i] += cgindex[icg+1] - cgindex[icg];
 +    }
 +
 +    k1 = 0;
 +    for (i = 0; i < dd->nnodes; i++)
 +    {
 +        ma->index[i] = k1;
 +        for (k = 0; k < ma->ncg[i]; k++)
 +        {
 +            ma->cg[k1++] = tmp_ind[i][k];
 +        }
 +    }
 +    ma->index[dd->nnodes] = k1;
 +
 +    for (i = 0; i < dd->nnodes; i++)
 +    {
 +        sfree(tmp_ind[i]);
 +    }
 +    sfree(tmp_ind);
 +    sfree(tmp_nalloc);
 +
 +    if (fplog)
 +    {
 +        char buf[22];
 +        fprintf(fplog, "Charge group distribution at step %s:",
 +                gmx_step_str(step, buf));
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            fprintf(fplog, " %d", ma->ncg[i]);
 +        }
 +        fprintf(fplog, "\n");
 +    }
 +}
 +
 +static void get_cg_distribution(FILE *fplog, gmx_large_int_t step, gmx_domdec_t *dd,
 +                                t_block *cgs, matrix box, gmx_ddbox_t *ddbox,
 +                                rvec pos[])
 +{
 +    gmx_domdec_master_t *ma = NULL;
 +    ivec                 npulse;
 +    int                  i, cg_gl;
 +    int                 *ibuf, buf2[2] = { 0, 0 };
 +    gmx_bool             bMaster = DDMASTER(dd);
 +    if (bMaster)
 +    {
 +        ma = dd->ma;
 +
 +        if (dd->bScrewPBC)
 +        {
 +            check_screw_box(box);
 +        }
 +
 +        set_dd_cell_sizes_slb(dd, ddbox, TRUE, npulse);
 +
 +        distribute_cg(fplog, step, box, ddbox->tric_dir, cgs, pos, dd);
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            ma->ibuf[2*i]   = ma->ncg[i];
 +            ma->ibuf[2*i+1] = ma->nat[i];
 +        }
 +        ibuf = ma->ibuf;
 +    }
 +    else
 +    {
 +        ibuf = NULL;
 +    }
 +    dd_scatter(dd, 2*sizeof(int), ibuf, buf2);
 +
 +    dd->ncg_home = buf2[0];
 +    dd->nat_home = buf2[1];
 +    dd->ncg_tot  = dd->ncg_home;
 +    dd->nat_tot  = dd->nat_home;
 +    if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
 +    {
 +        dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
 +        srenew(dd->index_gl, dd->cg_nalloc);
 +        srenew(dd->cgindex, dd->cg_nalloc+1);
 +    }
 +    if (bMaster)
 +    {
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
 +            ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
 +        }
 +    }
 +
 +    dd_scatterv(dd,
 +                DDMASTER(dd) ? ma->ibuf : NULL,
 +                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
 +                DDMASTER(dd) ? ma->cg : NULL,
 +                dd->ncg_home*sizeof(int), dd->index_gl);
 +
 +    /* Determine the home charge group sizes */
 +    dd->cgindex[0] = 0;
 +    for (i = 0; i < dd->ncg_home; i++)
 +    {
 +        cg_gl            = dd->index_gl[i];
 +        dd->cgindex[i+1] =
 +            dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Home charge groups:\n");
 +        for (i = 0; i < dd->ncg_home; i++)
 +        {
 +            fprintf(debug, " %d", dd->index_gl[i]);
 +            if (i % 10 == 9)
 +            {
 +                fprintf(debug, "\n");
 +            }
 +        }
 +        fprintf(debug, "\n");
 +    }
 +}
 +
 +static int compact_and_copy_vec_at(int ncg, int *move,
 +                                   int *cgindex,
 +                                   int nvec, int vec,
 +                                   rvec *src, gmx_domdec_comm_t *comm,
 +                                   gmx_bool bCompact)
 +{
 +    int m, icg, i, i0, i1, nrcg;
 +    int home_pos;
 +    int pos_vec[DIM*2];
 +
 +    home_pos = 0;
 +
 +    for (m = 0; m < DIM*2; m++)
 +    {
 +        pos_vec[m] = 0;
 +    }
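 +
 +    /* Layout of cgcm_state[m] per moved charge group: 1 rvec with the
 +     * charge group center, followed by nvec blocks of nrcg rvecs, one
 +     * block per state vector. pos_vec[m] skips over the blocks of the
 +     * other vectors so that vector 'vec' lands in its own slot.
 +     */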
 +
 +    i0 = 0;
 +    for (icg = 0; icg < ncg; icg++)
 +    {
 +        i1 = cgindex[icg+1];
 +        m  = move[icg];
 +        if (m == -1)
 +        {
 +            if (bCompact)
 +            {
 +                /* Compact the home array in place */
 +                for (i = i0; i < i1; i++)
 +                {
 +                    copy_rvec(src[i], src[home_pos++]);
 +                }
 +            }
 +        }
 +        else
 +        {
 +            /* Copy to the communication buffer */
 +            nrcg        = i1 - i0;
 +            pos_vec[m] += 1 + vec*nrcg;
 +            for (i = i0; i < i1; i++)
 +            {
 +                copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]);
 +            }
 +            pos_vec[m] += (nvec - vec - 1)*nrcg;
 +        }
 +        if (!bCompact)
 +        {
 +            home_pos += i1 - i0;
 +        }
 +        i0 = i1;
 +    }
 +
 +    return home_pos;
 +}
 +
 +static int compact_and_copy_vec_cg(int ncg, int *move,
 +                                   int *cgindex,
 +                                   int nvec, rvec *src, gmx_domdec_comm_t *comm,
 +                                   gmx_bool bCompact)
 +{
 +    int m, icg, i0, i1, nrcg;
 +    int home_pos;
 +    int pos_vec[DIM*2];
 +
 +    home_pos = 0;
 +
 +    for (m = 0; m < DIM*2; m++)
 +    {
 +        pos_vec[m] = 0;
 +    }
 +
 +    i0 = 0;
 +    for (icg = 0; icg < ncg; icg++)
 +    {
 +        i1 = cgindex[icg+1];
 +        m  = move[icg];
 +        if (m == -1)
 +        {
 +            if (bCompact)
 +            {
 +                /* Compact the home array in place */
 +                copy_rvec(src[icg], src[home_pos++]);
 +            }
 +        }
 +        else
 +        {
 +            nrcg = i1 - i0;
 +            /* Copy to the communication buffer */
 +            copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]);
 +            pos_vec[m] += 1 + nrcg*nvec;
 +        }
 +        i0 = i1;
 +    }
 +    if (!bCompact)
 +    {
 +        home_pos = ncg;
 +    }
 +
 +    return home_pos;
 +}
 +
 +static int compact_ind(int ncg, int *move,
 +                       int *index_gl, int *cgindex,
 +                       int *gatindex,
 +                       gmx_ga2la_t ga2la, char *bLocalCG,
 +                       int *cginfo)
 +{
 +    int cg, nat, a0, a1, a, a_gl;
 +    int home_pos;
 +
 +    home_pos = 0;
 +    nat      = 0;
 +    for (cg = 0; cg < ncg; cg++)
 +    {
 +        a0 = cgindex[cg];
 +        a1 = cgindex[cg+1];
 +        if (move[cg] == -1)
 +        {
 +            /* Compact the home arrays in place.
 +             * Anything that can be done here avoids access to global arrays.
 +             */
 +            cgindex[home_pos] = nat;
 +            for (a = a0; a < a1; a++)
 +            {
 +                a_gl          = gatindex[a];
 +                gatindex[nat] = a_gl;
 +                /* The cell number stays 0, so we don't need to set it */
 +                ga2la_change_la(ga2la, a_gl, nat);
 +                nat++;
 +            }
 +            index_gl[home_pos] = index_gl[cg];
 +            cginfo[home_pos]   = cginfo[cg];
 +            /* The charge group remains local, so bLocalCG does not change */
 +            home_pos++;
 +        }
 +        else
 +        {
 +            /* Clear the global indices */
 +            for (a = a0; a < a1; a++)
 +            {
 +                ga2la_del(ga2la, gatindex[a]);
 +            }
 +            if (bLocalCG)
 +            {
 +                bLocalCG[index_gl[cg]] = FALSE;
 +            }
 +        }
 +    }
 +    cgindex[home_pos] = nat;
 +
 +    return home_pos;
 +}
 +
 +static void clear_and_mark_ind(int ncg, int *move,
 +                               int *index_gl, int *cgindex, int *gatindex,
 +                               gmx_ga2la_t ga2la, char *bLocalCG,
 +                               int *cell_index)
 +{
 +    int cg, a0, a1, a;
 +
 +    for (cg = 0; cg < ncg; cg++)
 +    {
 +        if (move[cg] >= 0)
 +        {
 +            a0 = cgindex[cg];
 +            a1 = cgindex[cg+1];
 +            /* Clear the global indices */
 +            for (a = a0; a < a1; a++)
 +            {
 +                ga2la_del(ga2la, gatindex[a]);
 +            }
 +            if (bLocalCG)
 +            {
 +                bLocalCG[index_gl[cg]] = FALSE;
 +            }
 +            /* Signal that this cg has moved using the ns cell index.
 +             * Here we set it to -1. fill_grid will change it
 +             * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
 +             */
 +            cell_index[cg] = -1;
 +        }
 +    }
 +}
 +
 +static void print_cg_move(FILE *fplog,
 +                          gmx_domdec_t *dd,
 +                          gmx_large_int_t step, int cg, int dim, int dir,
 +                          gmx_bool bHaveLimitdAndCMOld, real limitd,
 +                          rvec cm_old, rvec cm_new, real pos_d)
 +{
 +    gmx_domdec_comm_t *comm;
 +    char               buf[22];
 +
 +    comm = dd->comm;
 +
 +    fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
 +    if (bHaveLimitdAndCMOld)
 +    {
 +        fprintf(fplog, "The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
 +                ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
 +    }
 +    else
 +    {
 +        fprintf(fplog, "The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
 +                ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
 +    }
 +    fprintf(fplog, "distance out of cell %f\n",
 +            dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
 +    if (bHaveLimitdAndCMOld)
 +    {
 +        fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
 +                cm_old[XX], cm_old[YY], cm_old[ZZ]);
 +    }
 +    fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
 +            cm_new[XX], cm_new[YY], cm_new[ZZ]);
 +    fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
 +            dim2char(dim),
 +            comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
 +    fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
 +            dim2char(dim),
 +            comm->cell_x0[dim], comm->cell_x1[dim]);
 +}
 +
 +static void cg_move_error(FILE *fplog,
 +                          gmx_domdec_t *dd,
 +                          gmx_large_int_t step, int cg, int dim, int dir,
 +                          gmx_bool bHaveLimitdAndCMOld, real limitd,
 +                          rvec cm_old, rvec cm_new, real pos_d)
 +{
 +    if (fplog)
 +    {
 +        print_cg_move(fplog, dd, step, cg, dim, dir,
 +                      bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
 +    }
 +    print_cg_move(stderr, dd, step, cg, dim, dir,
 +                  bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
 +    gmx_fatal(FARGS,
 +              "A charge group moved too far between two domain decomposition steps\n"
 +              "This usually means that your system is not well equilibrated");
 +}
 +
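 +/* Screw pbc combines a translation along x with a 180 degree rotation
 + * around the x-axis, so the distributed state entries need their y and z
 + * components mirrored in the box (positions) or negated (velocities and
 + * similar vectors).
 + */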
 +static void rotate_state_atom(t_state *state, int a)
 +{
 +    int est;
 +
 +    for (est = 0; est < estNR; est++)
 +    {
 +        if (EST_DISTR(est) && (state->flags & (1<<est)))
 +        {
 +            switch (est)
 +            {
 +                case estX:
 +                    /* Rotate the complete state; for a rectangular box only */
 +                    state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
 +                    state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
 +                    break;
 +                case estV:
 +                    state->v[a][YY] = -state->v[a][YY];
 +                    state->v[a][ZZ] = -state->v[a][ZZ];
 +                    break;
 +                case estSDX:
 +                    state->sd_X[a][YY] = -state->sd_X[a][YY];
 +                    state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
 +                    break;
 +                case estCGP:
 +                    state->cg_p[a][YY] = -state->cg_p[a][YY];
 +                    state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
 +                    break;
 +                case estDISRE_INITF:
 +                case estDISRE_RM3TAV:
 +                case estORIRE_INITF:
 +                case estORIRE_DTAV:
 +                    /* These are distances, so not affected by rotation */
 +                    break;
 +                default:
 +                    gmx_incons("Unknown state entry encountered in rotate_state_atom");
 +            }
 +        }
 +    }
 +}
 +
 +static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
 +{
 +    if (natoms > comm->moved_nalloc)
 +    {
 +        /* Contents should be preserved here */
 +        comm->moved_nalloc = over_alloc_dd(natoms);
 +        srenew(comm->moved, comm->moved_nalloc);
 +    }
 +
 +    return comm->moved;
 +}
 +
 +static void calc_cg_move(FILE *fplog, gmx_large_int_t step,
 +                         gmx_domdec_t *dd,
 +                         t_state *state,
 +                         ivec tric_dir, matrix tcm,
 +                         rvec cell_x0, rvec cell_x1,
 +                         rvec limitd, rvec limit0, rvec limit1,
 +                         const int *cgindex,
 +                         int cg_start, int cg_end,
 +                         rvec *cg_cm,
 +                         int *move)
 +{
 +    int      npbcdim;
 +    int      c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
 +    int      mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
 +    int      flag;
 +    gmx_bool bScrew;
 +    ivec     dev;
 +    real     inv_ncg, pos_d;
 +    rvec     cm_new;
 +
 +    npbcdim = dd->npbcdim;
 +
 +    for (cg = cg_start; cg < cg_end; cg++)
 +    {
 +        k0   = cgindex[cg];
 +        k1   = cgindex[cg+1];
 +        nrcg = k1 - k0;
 +        if (nrcg == 1)
 +        {
 +            copy_rvec(state->x[k0], cm_new);
 +        }
 +        else
 +        {
 +            inv_ncg = 1.0/nrcg;
 +
 +            clear_rvec(cm_new);
 +            for (k = k0; (k < k1); k++)
 +            {
 +                rvec_inc(cm_new, state->x[k]);
 +            }
 +            for (d = 0; (d < DIM); d++)
 +            {
 +                cm_new[d] = inv_ncg*cm_new[d];
 +            }
 +        }
 +
 +        clear_ivec(dev);
 +        /* Do pbc and check DD cell boundary crossings */
 +        for (d = DIM-1; d >= 0; d--)
 +        {
 +            if (dd->nc[d] > 1)
 +            {
 +                bScrew = (dd->bScrewPBC && d == XX);
 +                /* Determine the location of this cg in lattice coordinates */
 +                pos_d = cm_new[d];
 +                if (tric_dir[d])
 +                {
 +                    for (d2 = d+1; d2 < DIM; d2++)
 +                    {
 +                        pos_d += cm_new[d2]*tcm[d2][d];
 +                    }
 +                }
 +                /* Put the charge group in the triclinic unit-cell */
 +                if (pos_d >= cell_x1[d])
 +                {
 +                    if (pos_d >= limit1[d])
 +                    {
 +                        cg_move_error(fplog, dd, step, cg, d, 1, TRUE, limitd[d],
 +                                      cg_cm[cg], cm_new, pos_d);
 +                    }
 +                    dev[d] = 1;
 +                    if (dd->ci[d] == dd->nc[d] - 1)
 +                    {
 +                        rvec_dec(cm_new, state->box[d]);
 +                        if (bScrew)
 +                        {
 +                            cm_new[YY] = state->box[YY][YY] - cm_new[YY];
 +                            cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
 +                        }
 +                        for (k = k0; (k < k1); k++)
 +                        {
 +                            rvec_dec(state->x[k], state->box[d]);
 +                            if (bScrew)
 +                            {
 +                                rotate_state_atom(state, k);
 +                            }
 +                        }
 +                    }
 +                }
 +                else if (pos_d < cell_x0[d])
 +                {
 +                    if (pos_d < limit0[d])
 +                    {
 +                        cg_move_error(fplog, dd, step, cg, d, -1, TRUE, limitd[d],
 +                                      cg_cm[cg], cm_new, pos_d);
 +                    }
 +                    dev[d] = -1;
 +                    if (dd->ci[d] == 0)
 +                    {
 +                        rvec_inc(cm_new, state->box[d]);
 +                        if (bScrew)
 +                        {
 +                            cm_new[YY] = state->box[YY][YY] - cm_new[YY];
 +                            cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
 +                        }
 +                        for (k = k0; (k < k1); k++)
 +                        {
 +                            rvec_inc(state->x[k], state->box[d]);
 +                            if (bScrew)
 +                            {
 +                                rotate_state_atom(state, k);
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +            else if (d < npbcdim)
 +            {
 +                /* Put the charge group in the rectangular unit-cell */
 +                while (cm_new[d] >= state->box[d][d])
 +                {
 +                    rvec_dec(cm_new, state->box[d]);
 +                    for (k = k0; (k < k1); k++)
 +                    {
 +                        rvec_dec(state->x[k], state->box[d]);
 +                    }
 +                }
 +                while (cm_new[d] < 0)
 +                {
 +                    rvec_inc(cm_new, state->box[d]);
 +                    for (k = k0; (k < k1); k++)
 +                    {
 +                        rvec_inc(state->x[k], state->box[d]);
 +                    }
 +                }
 +            }
 +        }
 +
 +        copy_rvec(cm_new, cg_cm[cg]);
 +
 +        /* Determine where this cg should go */
 +        flag = 0;
 +        mc   = -1;
 +        for (d = 0; d < dd->ndim; d++)
 +        {
 +            dim = dd->dim[d];
 +            if (dev[dim] == 1)
 +            {
 +                flag |= DD_FLAG_FW(d);
 +                if (mc == -1)
 +                {
 +                    mc = d*2;
 +                }
 +            }
 +            else if (dev[dim] == -1)
 +            {
 +                flag |= DD_FLAG_BW(d);
 +                if (mc == -1)
 +                {
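 +                    /* With only two cells along this dimension the
 +                     * forward and backward neighbor are the same rank,
 +                     * so both directions share the buffer for d*2.
 +                     */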
 +                    if (dd->nc[dim] > 2)
 +                    {
 +                        mc = d*2 + 1;
 +                    }
 +                    else
 +                    {
 +                        mc = d*2;
 +                    }
 +                }
 +            }
 +        }
 +        /* Temporarily store the flag in move */
 +        move[cg] = mc + flag;
 +    }
 +}
 +
 +static void dd_redistribute_cg(FILE *fplog, gmx_large_int_t step,
 +                               gmx_domdec_t *dd, ivec tric_dir,
 +                               t_state *state, rvec **f,
 +                               t_forcerec *fr,
 +                               gmx_bool bCompact,
 +                               t_nrnb *nrnb,
 +                               int *ncg_stay_home,
 +                               int *ncg_moved)
 +{
 +    int               *move;
 +    int                npbcdim;
 +    int                ncg[DIM*2], nat[DIM*2];
 +    int                c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
 +    int                mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
 +    int                sbuf[2], rbuf[2];
 +    int                home_pos_cg, home_pos_at, buf_pos;
 +    int                flag;
 +    gmx_bool           bV = FALSE, bSDX = FALSE, bCGP = FALSE;
 +    gmx_bool           bScrew;
 +    ivec               dev;
 +    real               inv_ncg, pos_d;
 +    matrix             tcm;
 +    rvec              *cg_cm = NULL, cell_x0, cell_x1, limitd, limit0, limit1, cm_new;
 +    atom_id           *cgindex;
 +    cginfo_mb_t       *cginfo_mb;
 +    gmx_domdec_comm_t *comm;
 +    int               *moved;
 +    int                nthread, thread;
 +
 +    if (dd->bScrewPBC)
 +    {
 +        check_screw_box(state->box);
 +    }
 +
 +    comm  = dd->comm;
 +    if (fr->cutoff_scheme == ecutsGROUP)
 +    {
 +        cg_cm = fr->cg_cm;
 +    }
 +
 +    for (i = 0; i < estNR; i++)
 +    {
 +        if (EST_DISTR(i))
 +        {
 +            switch (i)
 +            {
 +                case estX: /* Always present */ break;
 +                case estV:   bV   = (state->flags & (1<<i)); break;
 +                case estSDX: bSDX = (state->flags & (1<<i)); break;
 +                case estCGP: bCGP = (state->flags & (1<<i)); break;
 +                case estLD_RNG:
 +                case estLD_RNGI:
 +                case estDISRE_INITF:
 +                case estDISRE_RM3TAV:
 +                case estORIRE_INITF:
 +                case estORIRE_DTAV:
 +                    /* No processing required */
 +                    break;
 +                default:
 +                    gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
 +            }
 +        }
 +    }
 +
 +    if (dd->ncg_tot > comm->nalloc_int)
 +    {
 +        comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
 +        srenew(comm->buf_int, comm->nalloc_int);
 +    }
 +    move = comm->buf_int;
 +
 +    /* Clear the count */
 +    for (c = 0; c < dd->ndim*2; c++)
 +    {
 +        ncg[c] = 0;
 +        nat[c] = 0;
 +    }
 +
 +    npbcdim = dd->npbcdim;
 +
 +    for (d = 0; (d < DIM); d++)
 +    {
 +        limitd[d] = dd->comm->cellsize_min[d];
 +        if (d >= npbcdim && dd->ci[d] == 0)
 +        {
 +            cell_x0[d] = -GMX_FLOAT_MAX;
 +        }
 +        else
 +        {
 +            cell_x0[d] = comm->cell_x0[d];
 +        }
 +        if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
 +        {
 +            cell_x1[d] = GMX_FLOAT_MAX;
 +        }
 +        else
 +        {
 +            cell_x1[d] = comm->cell_x1[d];
 +        }
 +        if (d < npbcdim)
 +        {
 +            limit0[d] = comm->old_cell_x0[d] - limitd[d];
 +            limit1[d] = comm->old_cell_x1[d] + limitd[d];
 +        }
 +        else
 +        {
 +            /* We check after communication if a charge group moved
 +             * more than one cell. Set the pre-comm check limit to float_max.
 +             */
 +            limit0[d] = -GMX_FLOAT_MAX;
 +            limit1[d] =  GMX_FLOAT_MAX;
 +        }
 +    }
 +
 +    make_tric_corr_matrix(npbcdim, state->box, tcm);
 +
 +    cgindex = dd->cgindex;
 +
 +    nthread = gmx_omp_nthreads_get(emntDomdec);
 +
 +    /* Compute the center of geometry for all home charge groups
 +     * and put them in the box and determine where they should go.
 +     */
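 +    /* Each thread handles an equal-size, contiguous block of the home
 +     * charge groups.
 +     */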
 +#pragma omp parallel for num_threads(nthread) schedule(static)
 +    for (thread = 0; thread < nthread; thread++)
 +    {
 +        calc_cg_move(fplog, step, dd, state, tric_dir, tcm,
 +                     cell_x0, cell_x1, limitd, limit0, limit1,
 +                     cgindex,
 +                     ( thread   *dd->ncg_home)/nthread,
 +                     ((thread+1)*dd->ncg_home)/nthread,
 +                     fr->cutoff_scheme == ecutsGROUP ? cg_cm : state->x,
 +                     move);
 +    }
 +
 +    for (cg = 0; cg < dd->ncg_home; cg++)
 +    {
 +        if (move[cg] >= 0)
 +        {
 +            mc       = move[cg];
 +            flag     = mc & ~DD_FLAG_NRCG;
 +            mc       = mc & DD_FLAG_NRCG;
 +            move[cg] = mc;
 +
 +            if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
 +            {
 +                comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
 +                srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
 +            }
 +            comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
 +            /* We store the cg size in the lower 16 bits
 +             * and the place where the charge group should go
 +             * in the next 6 bits. This saves some communication volume.
 +             */
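 +            /* Illustration of the packing, assuming the DD_FLAG_*
 +             * definitions earlier in this file:
 +             *     bits  0..15  nrcg  (charge group size, DD_FLAG_NRCG)
 +             *     bits 16..21  flags (DD_FLAG_FW/DD_FLAG_BW per dim)
 +             */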
 +            nrcg = cgindex[cg+1] - cgindex[cg];
 +            comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
 +            ncg[mc] += 1;
 +            nat[mc] += nrcg;
 +        }
 +    }
 +
 +    inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
 +    inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home);
 +
 +    *ncg_moved = 0;
 +    for (i = 0; i < dd->ndim*2; i++)
 +    {
 +        *ncg_moved += ncg[i];
 +    }
 +
 +    nvec = 1;
 +    if (bV)
 +    {
 +        nvec++;
 +    }
 +    if (bSDX)
 +    {
 +        nvec++;
 +    }
 +    if (bCGP)
 +    {
 +        nvec++;
 +    }
 +
 +    /* Make sure the communication buffers are large enough */
 +    for (mc = 0; mc < dd->ndim*2; mc++)
 +    {
 +        nvr = ncg[mc] + nat[mc]*nvec;
 +        if (nvr > comm->cgcm_state_nalloc[mc])
 +        {
 +            comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
 +            srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
 +        }
 +    }
 +
 +    switch (fr->cutoff_scheme)
 +    {
 +        case ecutsGROUP:
 +            /* Recalculating cg_cm might be cheaper than communicating,
 +             * but that could give rise to rounding issues.
 +             */
 +            home_pos_cg =
 +                compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
 +                                        nvec, cg_cm, comm, bCompact);
 +            break;
 +        case ecutsVERLET:
 +            /* Without charge groups we send the moved atom coordinates
 +             * over twice. This way the code below can be used both with
 +             * and without charge groups, without many conditionals.
 +             */
 +            home_pos_cg =
 +                compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
 +                                        nvec, state->x, comm, FALSE);
 +            if (bCompact)
 +            {
 +                home_pos_cg -= *ncg_moved;
 +            }
 +            break;
 +        default:
 +            gmx_incons("unimplemented");
 +            home_pos_cg = 0;
 +    }
 +
 +    vec         = 0;
 +    home_pos_at =
 +        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
 +                                nvec, vec++, state->x, comm, bCompact);
 +    if (bV)
 +    {
 +        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
 +                                nvec, vec++, state->v, comm, bCompact);
 +    }
 +    if (bSDX)
 +    {
 +        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
 +                                nvec, vec++, state->sd_X, comm, bCompact);
 +    }
 +    if (bCGP)
 +    {
 +        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
 +                                nvec, vec++, state->cg_p, comm, bCompact);
 +    }
 +
 +    if (bCompact)
 +    {
 +        compact_ind(dd->ncg_home, move,
 +                    dd->index_gl, dd->cgindex, dd->gatindex,
 +                    dd->ga2la, comm->bLocalCG,
 +                    fr->cginfo);
 +    }
 +    else
 +    {
 +        if (fr->cutoff_scheme == ecutsVERLET)
 +        {
 +            moved = get_moved(comm, dd->ncg_home);
 +
 +            for (k = 0; k < dd->ncg_home; k++)
 +            {
 +                moved[k] = 0;
 +            }
 +        }
 +        else
 +        {
 +            moved = fr->ns.grid->cell_index;
 +        }
 +
 +        clear_and_mark_ind(dd->ncg_home, move,
 +                           dd->index_gl, dd->cgindex, dd->gatindex,
 +                           dd->ga2la, comm->bLocalCG,
 +                           moved);
 +    }
 +
 +    cginfo_mb = fr->cginfo_mb;
 +
 +    *ncg_stay_home = home_pos_cg;
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim      = dd->dim[d];
 +        ncg_recv = 0;
 +        nat_recv = 0;
 +        nvr      = 0;
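 +        /* With only two cells along this dimension the forward and
 +         * backward neighbor coincide and a single pulse suffices.
 +         */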
 +        for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
 +        {
 +            cdd = d*2 + dir;
 +            /* Communicate the cg and atom counts */
 +            sbuf[0] = ncg[cdd];
 +            sbuf[1] = nat[cdd];
 +            if (debug)
 +            {
 +                fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n",
 +                        d, dir, sbuf[0], sbuf[1]);
 +            }
 +            dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
 +
 +            if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
 +            {
 +                comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
 +                srenew(comm->buf_int, comm->nalloc_int);
 +            }
 +
 +            /* Communicate the charge group indices, sizes and flags */
 +            dd_sendrecv_int(dd, d, dir,
 +                            comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
 +                            comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
 +
 +            nvs = ncg[cdd] + nat[cdd]*nvec;
 +            i   = rbuf[0]  + rbuf[1] *nvec;
 +            vec_rvec_check_alloc(&comm->vbuf, nvr+i);
 +
 +            /* Communicate cgcm and state */
 +            dd_sendrecv_rvec(dd, d, dir,
 +                             comm->cgcm_state[cdd], nvs,
 +                             comm->vbuf.v+nvr, i);
 +            ncg_recv += rbuf[0];
 +            nat_recv += rbuf[1];
 +            nvr      += i;
 +        }
 +
 +        /* Process the received charge groups */
 +        buf_pos = 0;
 +        for (cg = 0; cg < ncg_recv; cg++)
 +        {
 +            flag = comm->buf_int[cg*DD_CGIBS+1];
 +
 +            if (dim >= npbcdim && dd->nc[dim] > 2)
 +            {
 +                /* No pbc in this dimension and more than one domain boundary.
 +                 * Here we check separately whether a charge group moved too far.
 +                 */
 +                if (((flag & DD_FLAG_FW(d)) &&
 +                     comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
 +                    ((flag & DD_FLAG_BW(d)) &&
 +                     comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
 +                {
 +                    cg_move_error(fplog, dd, step, cg, dim,
 +                                  (flag & DD_FLAG_FW(d)) ? 1 : 0,
 +                                  FALSE, 0,
 +                                  comm->vbuf.v[buf_pos],
 +                                  comm->vbuf.v[buf_pos],
 +                                  comm->vbuf.v[buf_pos][dim]);
 +                }
 +            }
 +
 +            mc = -1;
 +            if (d < dd->ndim-1)
 +            {
 +                /* Check which direction this cg should go */
 +                for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
 +                {
 +                    if (dd->bGridJump)
 +                    {
 +                        /* The cell boundaries for dimension d2 are not equal
 +                         * for each cell row of the lower dimension(s),
 +                         * therefore we might need to redetermine where
 +                         * this cg should go.
 +                         */
 +                        dim2 = dd->dim[d2];
 +                        /* If this cg crosses the box boundary in dimension d2
 +                         * we can use the communicated flag, so we do not
 +                         * have to worry about pbc.
 +                         */
 +                        if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
 +                               (flag & DD_FLAG_FW(d2))) ||
 +                              (dd->ci[dim2] == 0 &&
 +                               (flag & DD_FLAG_BW(d2)))))
 +                        {
 +                            /* Clear the two flags for this dimension */
 +                            flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
 +                            /* Determine the location of this cg
 +                             * in lattice coordinates
 +                             */
 +                            pos_d = comm->vbuf.v[buf_pos][dim2];
 +                            if (tric_dir[dim2])
 +                            {
 +                                for (d3 = dim2+1; d3 < DIM; d3++)
 +                                {
 +                                    pos_d +=
 +                                        comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
 +                                }
 +                            }
 +                            /* Check that we are not at the box edge.
 +                             * pbc is only handled in the first step above,
 +                             * but due to different rounding this check
 +                             * could move the cg over pbc while the first
 +                             * step did not.
 +                             */
 +                            if (pos_d >= cell_x1[dim2] &&
 +                                dd->ci[dim2] != dd->nc[dim2]-1)
 +                            {
 +                                flag |= DD_FLAG_FW(d2);
 +                            }
 +                            else if (pos_d < cell_x0[dim2] &&
 +                                     dd->ci[dim2] != 0)
 +                            {
 +                                flag |= DD_FLAG_BW(d2);
 +                            }
 +                            comm->buf_int[cg*DD_CGIBS+1] = flag;
 +                        }
 +                    }
 +                    /* Set to which neighboring cell this cg should go */
 +                    if (flag & DD_FLAG_FW(d2))
 +                    {
 +                        mc = d2*2;
 +                    }
 +                    else if (flag & DD_FLAG_BW(d2))
 +                    {
 +                        if (dd->nc[dd->dim[d2]] > 2)
 +                        {
 +                            mc = d2*2+1;
 +                        }
 +                        else
 +                        {
 +                            mc = d2*2;
 +                        }
 +                    }
 +                }
 +            }
 +
 +            nrcg = flag & DD_FLAG_NRCG;
 +            if (mc == -1)
 +            {
 +                if (home_pos_cg+1 > dd->cg_nalloc)
 +                {
 +                    dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
 +                    srenew(dd->index_gl, dd->cg_nalloc);
 +                    srenew(dd->cgindex, dd->cg_nalloc+1);
 +                }
 +                /* Set the global charge group index and size */
 +                dd->index_gl[home_pos_cg]  = comm->buf_int[cg*DD_CGIBS];
 +                dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
 +                /* Copy the state from the buffer */
 +                dd_check_alloc_ncg(fr, state, f, home_pos_cg+1);
 +                if (fr->cutoff_scheme == ecutsGROUP)
 +                {
 +                    cg_cm = fr->cg_cm;
 +                    copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]);
 +                }
 +                buf_pos++;
 +
 +                /* Set the cginfo */
 +                fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
 +                                                   dd->index_gl[home_pos_cg]);
 +                if (comm->bLocalCG)
 +                {
 +                    comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
 +                }
 +
 +                if (home_pos_at+nrcg > state->nalloc)
 +                {
 +                    dd_realloc_state(state, f, home_pos_at+nrcg);
 +                }
 +                for (i = 0; i < nrcg; i++)
 +                {
 +                    copy_rvec(comm->vbuf.v[buf_pos++],
 +                              state->x[home_pos_at+i]);
 +                }
 +                if (bV)
 +                {
 +                    for (i = 0; i < nrcg; i++)
 +                    {
 +                        copy_rvec(comm->vbuf.v[buf_pos++],
 +                                  state->v[home_pos_at+i]);
 +                    }
 +                }
 +                if (bSDX)
 +                {
 +                    for (i = 0; i < nrcg; i++)
 +                    {
 +                        copy_rvec(comm->vbuf.v[buf_pos++],
 +                                  state->sd_X[home_pos_at+i]);
 +                    }
 +                }
 +                if (bCGP)
 +                {
 +                    for (i = 0; i < nrcg; i++)
 +                    {
 +                        copy_rvec(comm->vbuf.v[buf_pos++],
 +                                  state->cg_p[home_pos_at+i]);
 +                    }
 +                }
 +                home_pos_cg += 1;
 +                home_pos_at += nrcg;
 +            }
 +            else
 +            {
 +                /* Reallocate the buffers if necessary */
 +                if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
 +                {
 +                    comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
 +                    srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
 +                }
 +                nvr = ncg[mc] + nat[mc]*nvec;
 +                if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
 +                {
 +                    comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
 +                    srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
 +                }
 +                /* Copy from the receive to the send buffers */
 +                memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
 +                       comm->buf_int + cg*DD_CGIBS,
 +                       DD_CGIBS*sizeof(int));
 +                memcpy(comm->cgcm_state[mc][nvr],
 +                       comm->vbuf.v[buf_pos],
 +                       (1+nrcg*nvec)*sizeof(rvec));
 +                buf_pos += 1 + nrcg*nvec;
 +                ncg[mc] += 1;
 +                nat[mc] += nrcg;
 +            }
 +        }
 +    }
 +
 +    /* With sorting (!bCompact) the indices are now only partially up to date
 +     * and ncg_home and nat_home are not the real count, since there are
 +     * "holes" in the arrays for the charge groups that moved to neighbors.
 +     */
 +    if (fr->cutoff_scheme == ecutsVERLET)
 +    {
 +        moved = get_moved(comm, home_pos_cg);
 +
 +        for (i = dd->ncg_home; i < home_pos_cg; i++)
 +        {
 +            moved[i] = 0;
 +        }
 +    }
 +    dd->ncg_home = home_pos_cg;
 +    dd->nat_home = home_pos_at;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,
 +                "Finished repartitioning: cgs moved out %d, new home %d\n",
 +                *ncg_moved, dd->ncg_home-*ncg_moved);
 +
 +    }
 +}
 +
 +void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl)
 +{
 +    dd->comm->cycl[ddCycl] += cycles;
 +    dd->comm->cycl_n[ddCycl]++;
 +    if (cycles > dd->comm->cycl_max[ddCycl])
 +    {
 +        dd->comm->cycl_max[ddCycl] = cycles;
 +    }
 +}
 +
 +static double force_flop_count(t_nrnb *nrnb)
 +{
 +    int         i;
 +    double      sum;
 +    const char *name;
 +
 +    sum = 0;
 +    for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++)
 +    {
 +        /* To get closer to the real timings, we halve the count
 +         * for the normal loops and halve it again for water loops.
 +         */
 +        name = nrnb_str(i);
 +        if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
 +        {
 +            sum += nrnb->n[i]*0.25*cost_nrnb(i);
 +        }
 +        else
 +        {
 +            sum += nrnb->n[i]*0.50*cost_nrnb(i);
 +        }
 +    }
 +    for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++)
 +    {
 +        name = nrnb_str(i);
 +        if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
 +        {
 +            sum += nrnb->n[i]*cost_nrnb(i);
 +        }
 +    }
 +    for (i = eNR_BONDS; i <= eNR_WALLS; i++)
 +    {
 +        sum += nrnb->n[i]*cost_nrnb(i);
 +    }
 +
 +    return sum;
 +}
 +
 +void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb)
 +{
 +    if (dd->comm->eFlop)
 +    {
 +        dd->comm->flop -= force_flop_count(nrnb);
 +    }
 +}
 +void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb)
 +{
 +    if (dd->comm->eFlop)
 +    {
 +        dd->comm->flop += force_flop_count(nrnb);
 +        dd->comm->flop_n++;
 +    }
 +}
 +
 +static void clear_dd_cycle_counts(gmx_domdec_t *dd)
 +{
 +    int i;
 +
 +    for (i = 0; i < ddCyclNr; i++)
 +    {
 +        dd->comm->cycl[i]     = 0;
 +        dd->comm->cycl_n[i]   = 0;
 +        dd->comm->cycl_max[i] = 0;
 +    }
 +    dd->comm->flop   = 0;
 +    dd->comm->flop_n = 0;
 +}
 +
 +static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
 +{
 +    gmx_domdec_comm_t *comm;
 +    gmx_domdec_load_t *load;
 +    gmx_domdec_root_t *root = NULL;
 +    int                d, dim, cid, i, pos;
 +    float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
 +    gmx_bool           bSepPME;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "get_load_distribution start\n");
 +    }
 +
 +    wallcycle_start(wcycle, ewcDDCOMMLOAD);
 +
 +    comm = dd->comm;
 +
 +    bSepPME = (dd->pme_nodeid >= 0);
 +
 +    for (d = dd->ndim-1; d >= 0; d--)
 +    {
 +        dim = dd->dim[d];
 +        /* Check if we participate in the communication in this dimension */
 +        if (d == dd->ndim-1 ||
 +            (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
 +        {
 +            load = &comm->load[d];
 +            if (dd->bGridJump)
 +            {
 +                cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
 +            }
 +            pos = 0;
 +            if (d == dd->ndim-1)
 +            {
 +                sbuf[pos++] = dd_force_load(comm);
 +                sbuf[pos++] = sbuf[0];
 +                if (dd->bGridJump)
 +                {
 +                    sbuf[pos++] = sbuf[0];
 +                    sbuf[pos++] = cell_frac;
 +                    if (d > 0)
 +                    {
 +                        sbuf[pos++] = comm->cell_f_max0[d];
 +                        sbuf[pos++] = comm->cell_f_min1[d];
 +                    }
 +                }
 +                if (bSepPME)
 +                {
 +                    sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
 +                    sbuf[pos++] = comm->cycl[ddCyclPME];
 +                }
 +            }
 +            else
 +            {
 +                sbuf[pos++] = comm->load[d+1].sum;
 +                sbuf[pos++] = comm->load[d+1].max;
 +                if (dd->bGridJump)
 +                {
 +                    sbuf[pos++] = comm->load[d+1].sum_m;
 +                    sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
 +                    sbuf[pos++] = comm->load[d+1].flags;
 +                    if (d > 0)
 +                    {
 +                        sbuf[pos++] = comm->cell_f_max0[d];
 +                        sbuf[pos++] = comm->cell_f_min1[d];
 +                    }
 +                }
 +                if (bSepPME)
 +                {
 +                    sbuf[pos++] = comm->load[d+1].mdf;
 +                    sbuf[pos++] = comm->load[d+1].pme;
 +                }
 +            }
 +            load->nload = pos;
 +            /* Communicate a row in DD direction d.
 +             * The communicators are set up such that the root always has rank 0.
 +             */
 +#ifdef GMX_MPI
 +            MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
 +                       load->load, load->nload*sizeof(float), MPI_BYTE,
 +                       0, comm->mpi_comm_load[d]);
 +#endif
 +            if (dd->ci[dim] == dd->master_ci[dim])
 +            {
 +                /* We are the root, process this row */
 +                if (comm->bDynLoadBal)
 +                {
 +                    root = comm->root[d];
 +                }
 +                load->sum      = 0;
 +                load->max      = 0;
 +                load->sum_m    = 0;
 +                load->cvol_min = 1;
 +                load->flags    = 0;
 +                load->mdf      = 0;
 +                load->pme      = 0;
 +                pos            = 0;
 +                for (i = 0; i < dd->nc[dim]; i++)
 +                {
 +                    load->sum += load->load[pos++];
 +                    load->max  = max(load->max, load->load[pos]);
 +                    pos++;
 +                    if (dd->bGridJump)
 +                    {
 +                        if (root->bLimited)
 +                        {
 +                            /* This direction could not be load balanced properly,
 +                             * therefore we need to use the maximum instead of the average load.
 +                             */
 +                            load->sum_m = max(load->sum_m, load->load[pos]);
 +                        }
 +                        else
 +                        {
 +                            load->sum_m += load->load[pos];
 +                        }
 +                        pos++;
 +                        load->cvol_min = min(load->cvol_min, load->load[pos]);
 +                        pos++;
 +                        if (d < dd->ndim-1)
 +                        {
 +                            load->flags = (int)(load->load[pos++] + 0.5);
 +                        }
 +                        if (d > 0)
 +                        {
 +                            root->cell_f_max0[i] = load->load[pos++];
 +                            root->cell_f_min1[i] = load->load[pos++];
 +                        }
 +                    }
 +                    if (bSepPME)
 +                    {
 +                        load->mdf = max(load->mdf, load->load[pos]);
 +                        pos++;
 +                        load->pme = max(load->pme, load->load[pos]);
 +                        pos++;
 +                    }
 +                }
 +                if (comm->bDynLoadBal && root->bLimited)
 +                {
 +                    load->sum_m *= dd->nc[dim];
 +                    load->flags |= (1<<d);
 +                }
 +            }
 +        }
 +    }
 +
 +    if (DDMASTER(dd))
 +    {
 +        comm->nload      += dd_load_count(comm);
 +        comm->load_step  += comm->cycl[ddCyclStep];
 +        comm->load_sum   += comm->load[0].sum;
 +        comm->load_max   += comm->load[0].max;
 +        if (comm->bDynLoadBal)
 +        {
 +            for (d = 0; d < dd->ndim; d++)
 +            {
 +                if (comm->load[0].flags & (1<<d))
 +                {
 +                    comm->load_lim[d]++;
 +                }
 +            }
 +        }
 +        if (bSepPME)
 +        {
 +            comm->load_mdf += comm->load[0].mdf;
 +            comm->load_pme += comm->load[0].pme;
 +        }
 +    }
 +
 +    wallcycle_stop(wcycle, ewcDDCOMMLOAD);
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "get_load_distribution finished\n");
 +    }
 +}
 +
 +static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
 +{
 +    /* Return the relative performance loss on the total run time
 +     * due to the force calculation load imbalance.
 +     */
 +    if (dd->comm->nload > 0)
 +    {
 +        return
 +            (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
 +            (dd->comm->load_step*dd->nnodes);
 +    }
 +    else
 +    {
 +        return 0;
 +    }
 +}
 +
 +static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
 +{
 +    char               buf[STRLEN];
 +    int                npp, npme, nnodes, d, limp;
 +    float              imbal, pme_f_ratio, lossf, lossp = 0;
 +    gmx_bool           bLim;
 +    gmx_domdec_comm_t *comm;
 +
 +    comm = dd->comm;
 +    if (DDMASTER(dd) && comm->nload > 0)
 +    {
 +        npp    = dd->nnodes;
 +        npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
 +        nnodes = npp + npme;
 +        imbal  = comm->load_max*npp/comm->load_sum - 1;
 +        lossf  = dd_force_imb_perf_loss(dd);
 +        sprintf(buf, " Average load imbalance: %.1f %%\n", imbal*100);
 +        fprintf(fplog, "%s", buf);
 +        fprintf(stderr, "\n");
 +        fprintf(stderr, "%s", buf);
 +        sprintf(buf, " Part of the total run time spent waiting due to load imbalance: %.1f %%\n", lossf*100);
 +        fprintf(fplog, "%s", buf);
 +        fprintf(stderr, "%s", buf);
 +        bLim = FALSE;
 +        if (comm->bDynLoadBal)
 +        {
 +            sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
 +            for (d = 0; d < dd->ndim; d++)
 +            {
 +                limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
 +                sprintf(buf+strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limp);
 +                if (limp >= 50)
 +                {
 +                    bLim = TRUE;
 +                }
 +            }
 +            sprintf(buf+strlen(buf), "\n");
 +            fprintf(fplog, "%s", buf);
 +            fprintf(stderr, "%s", buf);
 +        }
 +        if (npme > 0)
 +        {
 +            pme_f_ratio = comm->load_pme/comm->load_mdf;
 +            lossp       = (comm->load_pme - comm->load_mdf)/comm->load_step;
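 +            /* A negative lossp means the PME nodes wait for the PP nodes,
 +             * so the loss is weighted with the PME node fraction;
 +             * otherwise the PP nodes wait and their fraction is used.
 +             */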
 +            if (lossp <= 0)
 +            {
 +                lossp *= (float)npme/(float)nnodes;
 +            }
 +            else
 +            {
 +                lossp *= (float)npp/(float)nnodes;
 +            }
 +            sprintf(buf, " Average PME mesh/force load: %5.3f\n", pme_f_ratio);
 +            fprintf(fplog, "%s", buf);
 +            fprintf(stderr, "%s", buf);
 +            sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossp)*100);
 +            fprintf(fplog, "%s", buf);
 +            fprintf(stderr, "%s", buf);
 +        }
 +        fprintf(fplog, "\n");
 +        fprintf(stderr, "\n");
 +
 +        if (lossf >= DD_PERF_LOSS)
 +        {
 +            sprintf(buf,
 +                    "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
 +                    "      in the domain decomposition.\n", lossf*100);
 +            if (!comm->bDynLoadBal)
 +            {
 +                sprintf(buf+strlen(buf), "      You might want to use dynamic load balancing (option -dlb.)\n");
 +            }
 +            else if (bLim)
 +            {
 +                sprintf(buf+strlen(buf), "      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
 +            }
 +            fprintf(fplog, "%s\n", buf);
 +            fprintf(stderr, "%s\n", buf);
 +        }
 +        if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
 +        {
 +            sprintf(buf,
 +                    "NOTE: %.1f %% performance was lost because the PME nodes\n"
 +                    "      had %s work to do than the PP nodes.\n"
 +                    "      You might want to %s the number of PME nodes\n"
 +                    "      or %s the cut-off and the grid spacing.\n",
 +                    fabs(lossp*100),
 +                    (lossp < 0) ? "less"     : "more",
 +                    (lossp < 0) ? "decrease" : "increase",
 +                    (lossp < 0) ? "decrease" : "increase");
 +            fprintf(fplog, "%s\n", buf);
 +            fprintf(stderr, "%s\n", buf);
 +        }
 +    }
 +}
 +
 +static float dd_vol_min(gmx_domdec_t *dd)
 +{
 +    return dd->comm->load[0].cvol_min*dd->nnodes;
 +}
 +
 +static gmx_bool dd_load_flags(gmx_domdec_t *dd)
 +{
 +    return dd->comm->load[0].flags;
 +}
 +
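 +/* The relative force-load imbalance: the maximum load over the ranks
 + * divided by the average load, minus one; 0 means perfectly balanced.
 + */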
 +static float dd_f_imbal(gmx_domdec_t *dd)
 +{
 +    return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
 +}
 +
 +float dd_pme_f_ratio(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->cycl_n[ddCyclPME] > 0)
 +    {
 +        return dd->comm->load[0].pme/dd->comm->load[0].mdf;
 +    }
 +    else
 +    {
 +        return -1.0;
 +    }
 +}
 +
 +static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_large_int_t step)
 +{
 +    int  flags, d;
 +    char buf[22];
 +
 +    flags = dd_load_flags(dd);
 +    if (flags)
 +    {
 +        fprintf(fplog,
 +                "DD  load balancing is limited by minimum cell size in dimension");
 +        for (d = 0; d < dd->ndim; d++)
 +        {
 +            if (flags & (1<<d))
 +            {
 +                fprintf(fplog, " %c", dim2char(dd->dim[d]));
 +            }
 +        }
 +        fprintf(fplog, "\n");
 +    }
 +    fprintf(fplog, "DD  step %s", gmx_step_str(step, buf));
 +    if (dd->comm->bDynLoadBal)
 +    {
 +        fprintf(fplog, "  vol min/aver %5.3f%c",
 +                dd_vol_min(dd), flags ? '!' : ' ');
 +    }
 +    fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
 +    if (dd->comm->cycl_n[ddCyclPME])
 +    {
 +        fprintf(fplog, "  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
 +    }
 +    fprintf(fplog, "\n\n");
 +}
 +
 +static void dd_print_load_verbose(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->bDynLoadBal)
 +    {
 +        fprintf(stderr, "vol %4.2f%c ",
 +                dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
 +    }
 +    fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
 +    if (dd->comm->cycl_n[ddCyclPME])
 +    {
 +        fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
 +    }
 +}
 +
 +#ifdef GMX_MPI
 +static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
 +{
 +    MPI_Comm           c_row;
 +    int                dim, i, rank;
 +    ivec               loc_c;
 +    gmx_domdec_root_t *root;
 +    gmx_bool           bPartOfGroup = FALSE;
 +
 +    dim = dd->dim[dim_ind];
 +    copy_ivec(loc, loc_c);
 +    for (i = 0; i < dd->nc[dim]; i++)
 +    {
 +        loc_c[dim] = i;
 +        rank       = dd_index(dd->nc, loc_c);
 +        if (rank == dd->rank)
 +        {
 +            /* This process is part of the group */
 +            bPartOfGroup = TRUE;
 +        }
 +    }
 +    MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
 +                   &c_row);
 +    if (bPartOfGroup)
 +    {
 +        dd->comm->mpi_comm_load[dim_ind] = c_row;
 +        if (dd->comm->eDLB != edlbNO)
 +        {
 +            if (dd->ci[dim] == dd->master_ci[dim])
 +            {
 +                /* This is the root process of this row */
 +                snew(dd->comm->root[dim_ind], 1);
 +                root = dd->comm->root[dim_ind];
 +                snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind));
 +                snew(root->old_cell_f, dd->nc[dim]+1);
 +                snew(root->bCellMin, dd->nc[dim]);
 +                if (dim_ind > 0)
 +                {
 +                    snew(root->cell_f_max0, dd->nc[dim]);
 +                    snew(root->cell_f_min1, dd->nc[dim]);
 +                    snew(root->bound_min, dd->nc[dim]);
 +                    snew(root->bound_max, dd->nc[dim]);
 +                }
 +                snew(root->buf_ncd, dd->nc[dim]);
 +            }
 +            else
 +            {
 +                /* This is not a root process, we only need to receive cell_f */
 +                snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind));
 +            }
 +        }
 +        if (dd->ci[dim] == dd->master_ci[dim])
 +        {
 +            snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
 +        }
 +    }
 +}
 +#endif
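 +
 +/* A note on the MPI_Comm_split() call above: all ranks passing the same
 + * color (here 0 for row members) end up in one new communicator, ordered
 + * by the key (here the DD rank); ranks passing MPI_UNDEFINED receive
 + * MPI_COMM_NULL and are left out of any group.
 + */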
 +
 +void dd_setup_dlb_resource_sharing(t_commrec           gmx_unused *cr,
 +                                   const gmx_hw_info_t gmx_unused *hwinfo,
 +                                   const gmx_hw_opt_t  gmx_unused *hw_opt)
 +{
 +#ifdef GMX_MPI
 +    int           physicalnode_id_hash;
 +    int           gpu_id;
 +    gmx_domdec_t *dd;
 +    MPI_Comm      mpi_comm_pp_physicalnode;
 +
 +    if (!(cr->duty & DUTY_PP) ||
 +        hw_opt->gpu_opt.ncuda_dev_use == 0)
 +    {
 +        /* Only PP nodes (currently) use GPUs.
 +         * If we don't have GPUs, there are no resources to share.
 +         */
 +        return;
 +    }
 +
 +    physicalnode_id_hash = gmx_physicalnode_id_hash();
 +
++    gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->rank_pp_intranode);
 +
 +    dd = cr->dd;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
 +        fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
 +                dd->rank, physicalnode_id_hash, gpu_id);
 +    }
 +    /* Split the PP communicator over the physical nodes */
 +    /* TODO: See if we should store this (before), as it's also used
 +     * for the nodecomm summation.
 +     */
 +    MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
 +                   &mpi_comm_pp_physicalnode);
 +    MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
 +                   &dd->comm->mpi_comm_gpu_shared);
 +    MPI_Comm_free(&mpi_comm_pp_physicalnode);
 +    MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
 +    }
 +
 +    /* Note that some ranks could share a GPU, while others don't */
 +
 +    if (dd->comm->nrank_gpu_shared == 1)
 +    {
 +        MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
 +    }
 +#endif
 +}
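 +
 +/* Illustrative example: on a physical node running 4 PP ranks on 2 GPUs
 + * with gpu_id's {0, 0, 1, 1}, the first split above groups the 4 ranks
 + * by node hash and the second by gpu_id, giving two mpi_comm_gpu_shared
 + * communicators with nrank_gpu_shared = 2 each.
 + */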
 +
 +static void make_load_communicators(gmx_domdec_t gmx_unused *dd)
 +{
 +#ifdef GMX_MPI
 +    int  dim0, dim1, i, j;
 +    ivec loc;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Making load communicators\n");
 +    }
 +
 +    snew(dd->comm->load, dd->ndim);
 +    snew(dd->comm->mpi_comm_load, dd->ndim);
 +
 +    clear_ivec(loc);
 +    make_load_communicator(dd, 0, loc);
 +    if (dd->ndim > 1)
 +    {
 +        dim0 = dd->dim[0];
 +        for (i = 0; i < dd->nc[dim0]; i++)
 +        {
 +            loc[dim0] = i;
 +            make_load_communicator(dd, 1, loc);
 +        }
 +    }
 +    if (dd->ndim > 2)
 +    {
 +        dim0 = dd->dim[0];
 +        for (i = 0; i < dd->nc[dim0]; i++)
 +        {
 +            loc[dim0] = i;
 +            dim1      = dd->dim[1];
 +            for (j = 0; j < dd->nc[dim1]; j++)
 +            {
 +                loc[dim1] = j;
 +                make_load_communicator(dd, 2, loc);
 +            }
 +        }
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Finished making load communicators\n");
 +    }
 +#endif
 +}
 +
 +void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd)
 +{
 +    gmx_bool                bZYX;
 +    int                     d, dim, i, j, m;
 +    ivec                    tmp, s;
 +    int                     nzone, nzonep;
 +    ivec                    dd_zp[DD_MAXIZONE];
 +    gmx_domdec_zones_t     *zones;
 +    gmx_domdec_ns_ranges_t *izone;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        copy_ivec(dd->ci, tmp);
 +        tmp[dim]           = (tmp[dim] + 1) % dd->nc[dim];
 +        dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
 +        copy_ivec(dd->ci, tmp);
 +        tmp[dim]           = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
 +        dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
 +        if (debug)
 +        {
 +            fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
 +                    dd->rank, dim,
 +                    dd->neighbor[d][0],
 +                    dd->neighbor[d][1]);
 +        }
 +    }
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog, "\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
 +                dd->ndim,
 +                dd->nc[XX], dd->nc[YY], dd->nc[ZZ],
 +                dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 +    }
 +    switch (dd->ndim)
 +    {
 +        case 3:
 +            nzone  = dd_z3n;
 +            nzonep = dd_zp3n;
 +            for (i = 0; i < nzonep; i++)
 +            {
 +                copy_ivec(dd_zp3[i], dd_zp[i]);
 +            }
 +            break;
 +        case 2:
 +            nzone  = dd_z2n;
 +            nzonep = dd_zp2n;
 +            for (i = 0; i < nzonep; i++)
 +            {
 +                copy_ivec(dd_zp2[i], dd_zp[i]);
 +            }
 +            break;
 +        case 1:
 +            nzone  = dd_z1n;
 +            nzonep = dd_zp1n;
 +            for (i = 0; i < nzonep; i++)
 +            {
 +                copy_ivec(dd_zp1[i], dd_zp[i]);
 +            }
 +            break;
 +        default:
 +            gmx_fatal(FARGS, "Can only do 1, 2 or 3D domain decomposition");
 +            nzone  = 0;
 +            nzonep = 0;
 +    }
 +
 +    zones = &dd->comm->zones;
 +
 +    for (i = 0; i < nzone; i++)
 +    {
 +        m = 0;
 +        clear_ivec(zones->shift[i]);
 +        for (d = 0; d < dd->ndim; d++)
 +        {
 +            zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
 +        }
 +    }
 +
 +    zones->n = nzone;
 +    for (i = 0; i < nzone; i++)
 +    {
 +        for (d = 0; d < DIM; d++)
 +        {
 +            s[d] = dd->ci[d] - zones->shift[i][d];
 +            if (s[d] < 0)
 +            {
 +                s[d] += dd->nc[d];
 +            }
 +            else if (s[d] >= dd->nc[d])
 +            {
 +                s[d] -= dd->nc[d];
 +            }
 +        }
 +    }
 +    zones->nizone = nzonep;
 +    for (i = 0; i < zones->nizone; i++)
 +    {
 +        if (dd_zp[i][0] != i)
 +        {
 +            gmx_fatal(FARGS, "Internal inconsistency in the dd grid setup");
 +        }
 +        izone     = &zones->izone[i];
 +        izone->j0 = dd_zp[i][1];
 +        izone->j1 = dd_zp[i][2];
 +        for (dim = 0; dim < DIM; dim++)
 +        {
 +            if (dd->nc[dim] == 1)
 +            {
 +                /* All shifts should be allowed */
 +                izone->shift0[dim] = -1;
 +                izone->shift1[dim] = 1;
 +            }
 +            else
 +            {
 +                /*
 +                   izone->shift0[d] = 0;
 +                   izone->shift1[d] = 0;
 +                   for(j=izone->j0; j<izone->j1; j++) {
 +                   if (dd->shift[j][d] > dd->shift[i][d])
 +                   izone->shift0[d] = -1;
 +                   if (dd->shift[j][d] < dd->shift[i][d])
 +                   izone->shift1[d] = 1;
 +                   }
 +                 */
 +
 +                int shift_diff;
 +
 +                /* Assume the shifts are not more than 1 cell */
 +                izone->shift0[dim] = 1;
 +                izone->shift1[dim] = -1;
 +                for (j = izone->j0; j < izone->j1; j++)
 +                {
 +                    shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
 +                    if (shift_diff < izone->shift0[dim])
 +                    {
 +                        izone->shift0[dim] = shift_diff;
 +                    }
 +                    if (shift_diff > izone->shift1[dim])
 +                    {
 +                        izone->shift1[dim] = shift_diff;
 +                    }
 +                }
 +            }
 +        }
 +    }
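 +
 +    /* Illustrative example: if zone i has shift 0 along a dimension and
 +     * its j-zones have shifts {0, 1}, the loop above narrows the initial
 +     * bounds (1, -1) to shift0 = 0 and shift1 = 1 for that dimension.
 +     */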
 +
 +    if (dd->comm->eDLB != edlbNO)
 +    {
 +        snew(dd->comm->root, dd->ndim);
 +    }
 +
 +    if (dd->comm->bRecordLoad)
 +    {
 +        make_load_communicators(dd);
 +    }
 +}
 +
 +static void make_pp_communicator(FILE *fplog, t_commrec *cr, int gmx_unused reorder)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    int                i, rank, *buf;
 +    ivec               periods;
 +#ifdef GMX_MPI
 +    MPI_Comm           comm_cart;
 +#endif
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +#ifdef GMX_MPI
 +    if (comm->bCartesianPP)
 +    {
 +        /* Set up cartesian communication for the particle-particle part */
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n",
 +                    dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
 +        }
 +
 +        for (i = 0; i < DIM; i++)
 +        {
 +            periods[i] = TRUE;
 +        }
 +        MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder,
 +                        &comm_cart);
 +        /* We overwrite the old communicator with the new cartesian one */
 +        cr->mpi_comm_mygroup = comm_cart;
 +    }
 +
 +    dd->mpi_comm_all = cr->mpi_comm_mygroup;
 +    MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
 +
 +    if (comm->bCartesianPP_PME)
 +    {
 +        /* Since we want to use the original cartesian setup for the simulation,
 +         * and not the one after the split, we need to make an index.
 +         */
 +        snew(comm->ddindex2ddnodeid, dd->nnodes);
 +        comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
 +        gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
 +        /* Get the rank of the DD master,
 +         * above we made sure that the master node is a PP node.
 +         */
 +        if (MASTER(cr))
 +        {
 +            rank = dd->rank;
 +        }
 +        else
 +        {
 +            rank = 0;
 +        }
 +        MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
 +    }
 +    else if (comm->bCartesianPP)
 +    {
 +        if (cr->npmenodes == 0)
 +        {
 +            /* The PP communicator is also
 +             * the communicator for this simulation
 +             */
 +            cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
 +        }
 +        cr->nodeid = dd->rank;
 +
 +        MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
 +
 +        /* We need to make an index to go from the coordinates
 +         * to the nodeid of this simulation.
 +         */
 +        snew(comm->ddindex2simnodeid, dd->nnodes);
 +        snew(buf, dd->nnodes);
 +        if (cr->duty & DUTY_PP)
 +        {
 +            buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
 +        }
 +        /* Communicate the ddindex to simulation nodeid index */
 +        MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
 +                      cr->mpi_comm_mysim);
 +        sfree(buf);
 +
 +        /* Determine the master coordinates and rank.
 +         * The DD master should be the same node as the master of this sim.
 +         */
 +        for (i = 0; i < dd->nnodes; i++)
 +        {
 +            if (comm->ddindex2simnodeid[i] == 0)
 +            {
 +                ddindex2xyz(dd->nc, i, dd->master_ci);
 +                MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
 +            }
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug, "The master rank is %d\n", dd->masterrank);
 +        }
 +    }
 +    else
 +    {
 +        /* No Cartesian communicators */
 +        /* We use the rank in dd->mpi_comm_all as the DD index */
 +        ddindex2xyz(dd->nc, dd->rank, dd->ci);
 +        /* The simulation master nodeid is 0, so the DD master rank is also 0 */
 +        dd->masterrank = 0;
 +        clear_ivec(dd->master_ci);
 +    }
 +#endif
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog,
 +                "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
 +                dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug,
 +                "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
 +                dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 +    }
 +}
 +
 +static void receive_ddindex2simnodeid(t_commrec *cr)
 +{
 +    gmx_domdec_t      *dd;
 +
 +    gmx_domdec_comm_t *comm;
 +    int               *buf;
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +#ifdef GMX_MPI
 +    if (!comm->bCartesianPP_PME && comm->bCartesianPP)
 +    {
 +        snew(comm->ddindex2simnodeid, dd->nnodes);
 +        snew(buf, dd->nnodes);
 +        if (cr->duty & DUTY_PP)
 +        {
 +            buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
 +        }
 +        /* Communicate the ddindex to simulation nodeid index */
 +        MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
 +                      cr->mpi_comm_mysim);
 +        sfree(buf);
 +    }
 +#endif
 +}
 +
 +static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
 +                                                     int ncg, int natoms)
 +{
 +    gmx_domdec_master_t *ma;
 +    int                  i;
 +
 +    snew(ma, 1);
 +
 +    snew(ma->ncg, dd->nnodes);
 +    snew(ma->index, dd->nnodes+1);
 +    snew(ma->cg, ncg);
 +    snew(ma->nat, dd->nnodes);
 +    snew(ma->ibuf, dd->nnodes*2);
 +    snew(ma->cell_x, DIM);
 +    for (i = 0; i < DIM; i++)
 +    {
 +        snew(ma->cell_x[i], dd->nc[i]+1);
 +    }
 +
 +    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
 +    {
 +        ma->vbuf = NULL;
 +    }
 +    else
 +    {
 +        snew(ma->vbuf, natoms);
 +    }
 +
 +    return ma;
 +}
 +
 +static void split_communicator(FILE *fplog, t_commrec *cr, int gmx_unused dd_node_order,
 +                               int gmx_unused reorder)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    int                i, rank;
 +    gmx_bool           bDiv[DIM];
 +    ivec               periods;
 +#ifdef GMX_MPI
 +    MPI_Comm           comm_cart;
 +#endif
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +    if (comm->bCartesianPP)
 +    {
 +        for (i = 1; i < DIM; i++)
 +        {
 +            bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
 +        }
 +        if (bDiv[YY] || bDiv[ZZ])
 +        {
 +            comm->bCartesianPP_PME = TRUE;
 +            /* If we have 2D PME decomposition, which is always in x+y,
 +             * we stack the PME only nodes in z.
 +             * Otherwise we choose the direction that provides the thinnest slab
 +             * of PME only nodes as this will have the least effect
 +             * on the PP communication.
 +             * But for the PME communication the opposite might be better.
 +             */
 +            if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
 +                             !bDiv[YY] ||
 +                             dd->nc[YY] > dd->nc[ZZ]))
 +            {
 +                comm->cartpmedim = ZZ;
 +            }
 +            else
 +            {
 +                comm->cartpmedim = YY;
 +            }
 +            comm->ntot[comm->cartpmedim]
 +                += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
 +        }
 +        else if (fplog)
 +        {
 +            fprintf(fplog, "#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
 +            fprintf(fplog,
 +                    "Will not use a Cartesian communicator for PP <-> PME\n\n");
 +        }
 +    }
 +
 +#ifdef GMX_MPI
 +    if (comm->bCartesianPP_PME)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
 +        }
 +
 +        for (i = 0; i < DIM; i++)
 +        {
 +            periods[i] = TRUE;
 +        }
 +        MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
 +                        &comm_cart);
 +
 +        MPI_Comm_rank(comm_cart, &rank);
 +        if (MASTERNODE(cr) && rank != 0)
 +        {
 +            gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
 +        }
 +
 +        /* With this assignment we lose the link to the original communicator,
 +         * which will usually be MPI_COMM_WORLD, unless we have multisim.
 +         */
 +        cr->mpi_comm_mysim = comm_cart;
 +        cr->sim_nodeid     = rank;
 +
 +        MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
 +
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Cartesian nodeid %d, coordinates %d %d %d\n\n",
 +                    cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
 +        }
 +
 +        if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
 +        {
 +            cr->duty = DUTY_PP;
 +        }
 +        if (cr->npmenodes == 0 ||
 +            dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
 +        {
 +            cr->duty = DUTY_PME;
 +        }
 +
 +        /* Split the sim communicator into PP and PME only nodes */
 +        MPI_Comm_split(cr->mpi_comm_mysim,
 +                       cr->duty,
 +                       dd_index(comm->ntot, dd->ci),
 +                       &cr->mpi_comm_mygroup);
 +    }
 +    else
 +    {
 +        switch (dd_node_order)
 +        {
 +            case ddnoPP_PME:
 +                if (fplog)
 +                {
 +                    fprintf(fplog, "Order of the nodes: PP first, PME last\n");
 +                }
 +                break;
 +            case ddnoINTERLEAVE:
 +                /* Interleave the PP-only and PME-only nodes,
 +                 * as on clusters with dual-core machines this will double
 +                 * the communication bandwidth of the PME processes
 +                 * and thus speed up the PP <-> PME and inter PME communication.
 +                 */
 +                if (fplog)
 +                {
 +                    fprintf(fplog, "Interleaving PP and PME nodes\n");
 +                }
 +                comm->pmenodes = dd_pmenodes(cr);
 +                break;
 +            case ddnoCARTESIAN:
 +                break;
 +            default:
 +                gmx_fatal(FARGS, "Unknown dd_node_order=%d", dd_node_order);
 +        }
 +
 +        if (dd_simnode2pmenode(cr, cr->sim_nodeid) == -1)
 +        {
 +            cr->duty = DUTY_PME;
 +        }
 +        else
 +        {
 +            cr->duty = DUTY_PP;
 +        }
 +
 +        /* Split the sim communicator into PP and PME only nodes */
 +        MPI_Comm_split(cr->mpi_comm_mysim,
 +                       cr->duty,
 +                       cr->nodeid,
 +                       &cr->mpi_comm_mygroup);
 +        MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
 +    }
 +#endif
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog, "This is a %s only node\n\n",
 +                (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
 +    }
 +}
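 +
 +/* Illustrative example of the node-order choice: with 8 ranks of which
 + * 2 do PME, ddnoPP_PME puts PP on ranks 0-5 and PME on ranks 6-7, so a
 + * dual-core node can end up running only PME ranks; ddnoINTERLEAVE
 + * spreads the PME ranks over the nodes (placement computed by
 + * dd_pmenodes(cr)), so each node's network link serves PME traffic.
 + */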
 +
 +void make_dd_communicators(FILE *fplog, t_commrec *cr, int dd_node_order)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    int                CartReorder;
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +    copy_ivec(dd->nc, comm->ntot);
 +
 +    comm->bCartesianPP     = (dd_node_order == ddnoCARTESIAN);
 +    comm->bCartesianPP_PME = FALSE;
 +
 +    /* Reorder the nodes by default. This might change the MPI ranks.
 +     * Real reordering is only supported on very few architectures;
 +     * Blue Gene is one of them.
 +     */
 +    CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
 +
 +    if (cr->npmenodes > 0)
 +    {
 +        /* Split the communicator into a PP and PME part */
 +        split_communicator(fplog, cr, dd_node_order, CartReorder);
 +        if (comm->bCartesianPP_PME)
 +        {
 +            /* We (possibly) reordered the nodes in split_communicator,
 +             * so it is no longer required in make_pp_communicator.
 +             */
 +            CartReorder = FALSE;
 +        }
 +    }
 +    else
 +    {
 +        /* All nodes do PP and PME */
 +#ifdef GMX_MPI
 +        /* We do not require separate communicators */
 +        cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
 +#endif
 +    }
 +
 +    if (cr->duty & DUTY_PP)
 +    {
 +        /* Copy or make a new PP communicator */
 +        make_pp_communicator(fplog, cr, CartReorder);
 +    }
 +    else
 +    {
 +        receive_ddindex2simnodeid(cr);
 +    }
 +
 +    if (!(cr->duty & DUTY_PME))
 +    {
 +        /* Set up the communication to our PME node */
 +        dd->pme_nodeid           = dd_simnode2pmenode(cr, cr->sim_nodeid);
 +        dd->pme_receive_vir_ener = receive_vir_ener(cr);
 +        if (debug)
 +        {
 +            fprintf(debug, "My pme_nodeid %d receive ener %d\n",
 +                    dd->pme_nodeid, dd->pme_receive_vir_ener);
 +        }
 +    }
 +    else
 +    {
 +        dd->pme_nodeid = -1;
 +    }
 +
 +    if (DDMASTER(dd))
 +    {
 +        dd->ma = init_gmx_domdec_master_t(dd,
 +                                          comm->cgs_gl.nr,
 +                                          comm->cgs_gl.index[comm->cgs_gl.nr]);
 +    }
 +}
 +
 +static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string)
 +{
 +    real  *slb_frac, tot;
 +    int    i, n;
 +    double dbl;
 +
 +    slb_frac = NULL;
 +    if (nc > 1 && size_string != NULL)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Using static load balancing for the %s direction\n",
 +                    dir);
 +        }
 +        snew(slb_frac, nc);
 +        tot = 0;
 +        for (i = 0; i < nc; i++)
 +        {
 +            dbl = 0;
 +            sscanf(size_string, "%lf%n", &dbl, &n);
 +            if (dbl == 0)
 +            {
 +                gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
 +            }
 +            slb_frac[i]  = dbl;
 +            size_string += n;
 +            tot         += slb_frac[i];
 +        }
 +        /* Normalize */
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Relative cell sizes:");
 +        }
 +        for (i = 0; i < nc; i++)
 +        {
 +            slb_frac[i] /= tot;
 +            if (fplog)
 +            {
 +                fprintf(fplog, " %5.3f", slb_frac[i]);
 +            }
 +        }
 +        if (fplog)
 +        {
 +            fprintf(fplog, "\n");
 +        }
 +    }
 +
 +    return slb_frac;
 +}
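 +
 +/* Illustrative example: for nc = 3 and size_string = "1 2 1", as given
 + * with mdrun's static load-balancing cell-size options, the entries are
 + * normalized to relative cell sizes 0.25, 0.50 and 0.25.
 + */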
 +
 +static int multi_body_bondeds_count(gmx_mtop_t *mtop)
 +{
 +    int                  n, nmol, ftype;
 +    gmx_mtop_ilistloop_t iloop;
 +    t_ilist             *il;
 +
 +    n     = 0;
 +    iloop = gmx_mtop_ilistloop_init(mtop);
 +    while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
 +    {
 +        for (ftype = 0; ftype < F_NRE; ftype++)
 +        {
 +            if ((interaction_function[ftype].flags & IF_BOND) &&
 +                NRAL(ftype) >  2)
 +            {
 +                n += nmol*il[ftype].nr/(1 + NRAL(ftype));
 +            }
 +        }
 +    }
 +
 +    return n;
 +}
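 +
 +/* Illustrative example: each interaction in a t_ilist takes up
 + * 1 + NRAL(ftype) entries (the parameter-type index plus the atom
 + * indices), so a molecule type with il[F_ANGLES].nr = 40 contributes
 + * nmol*40/4 = nmol*10 angles, all multi-body since NRAL = 3 > 2.
 + */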
 +
 +static int dd_nst_env(FILE *fplog, const char *env_var, int def)
 +{
 +    char *val;
 +    int   nst;
 +
 +    nst = def;
 +    val = getenv(env_var);
 +    if (val)
 +    {
 +        if (sscanf(val, "%d", &nst) <= 0)
 +        {
 +            nst = 1;
 +        }
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
 +                    env_var, val, nst);
 +        }
 +    }
 +
 +    return nst;
 +}
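 +
 +/* Illustrative example: setting GMX_DD_DEBUG=1 in the environment makes
 + * the call above return 1 instead of the default 0; a value that does
 + * not parse as an integer also falls back to 1.
 + */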
 +
 +static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string)
 +{
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr, "\n%s\n", warn_string);
 +    }
 +    if (fplog)
 +    {
 +        fprintf(fplog, "\n%s\n", warn_string);
 +    }
 +}
 +
 +static void check_dd_restrictions(t_commrec *cr, gmx_domdec_t *dd,
 +                                  t_inputrec *ir, FILE *fplog)
 +{
 +    if (ir->ePBC == epbcSCREW &&
 +        (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
 +    {
 +        gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
 +    }
 +
 +    if (ir->ns_type == ensSIMPLE)
 +    {
 +        gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
 +    }
 +
 +    if (ir->nstlist == 0)
 +    {
 +        gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
 +    }
 +
 +    if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
 +    {
 +        dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
 +    }
 +}
 +
 +static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
 +{
 +    int  di, d;
 +    real r;
 +
 +    r = ddbox->box_size[XX];
 +    for (di = 0; di < dd->ndim; di++)
 +    {
 +        d = dd->dim[di];
 +        /* Check using the initial average cell size */
 +        r = min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
 +    }
 +
 +    return r;
 +}
 +
 +static int check_dlb_support(FILE *fplog, t_commrec *cr,
 +                             const char *dlb_opt, gmx_bool bRecordLoad,
 +                             unsigned long Flags, t_inputrec *ir)
 +{
 +    gmx_domdec_t *dd;
 +    int           eDLB = -1;
 +    char          buf[STRLEN];
 +
 +    switch (dlb_opt[0])
 +    {
 +        case 'a': eDLB = edlbAUTO; break;
 +        case 'n': eDLB = edlbNO;   break;
 +        case 'y': eDLB = edlbYES;  break;
 +        default: gmx_incons("Unknown dlb_opt");
 +    }
 +
 +    if (Flags & MD_RERUN)
 +    {
 +        return edlbNO;
 +    }
 +
 +    if (!EI_DYNAMICS(ir->eI))
 +    {
 +        if (eDLB == edlbYES)
 +        {
 +            sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
 +            dd_warning(cr, fplog, buf);
 +        }
 +
 +        return edlbNO;
 +    }
 +
 +    if (!bRecordLoad)
 +    {
 +        dd_warning(cr, fplog, "NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
 +
 +        return edlbNO;
 +    }
 +
 +    if (Flags & MD_REPRODUCIBLE)
 +    {
 +        switch (eDLB)
 +        {
 +            case edlbNO:
 +                break;
 +            case edlbAUTO:
 +                dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
 +                eDLB = edlbNO;
 +                break;
 +            case edlbYES:
 +                dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
 +                break;
 +            default:
 +                gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", eDLB);
 +                break;
 +        }
 +    }
 +
 +    return eDLB;
 +}
 +
 +static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
 +{
 +    int dim;
 +
 +    dd->ndim = 0;
 +    if (getenv("GMX_DD_ORDER_ZYX") != NULL)
 +    {
 +        /* Decomposition order z,y,x */
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Using domain decomposition order z, y, x\n");
 +        }
 +        for (dim = DIM-1; dim >= 0; dim--)
 +        {
 +            if (dd->nc[dim] > 1)
 +            {
 +                dd->dim[dd->ndim++] = dim;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        /* Decomposition order x,y,z */
 +        for (dim = 0; dim < DIM; dim++)
 +        {
 +            if (dd->nc[dim] > 1)
 +            {
 +                dd->dim[dd->ndim++] = dim;
 +            }
 +        }
 +    }
 +}
 +
 +static gmx_domdec_comm_t *init_dd_comm()
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                i;
 +
 +    snew(comm, 1);
 +    snew(comm->cggl_flag, DIM*2);
 +    snew(comm->cgcm_state, DIM*2);
 +    for (i = 0; i < DIM*2; i++)
 +    {
 +        comm->cggl_flag_nalloc[i]  = 0;
 +        comm->cgcm_state_nalloc[i] = 0;
 +    }
 +
 +    comm->nalloc_int = 0;
 +    comm->buf_int    = NULL;
 +
 +    vec_rvec_init(&comm->vbuf);
 +
 +    comm->n_load_have    = 0;
 +    comm->n_load_collect = 0;
 +
 +    for (i = 0; i < ddnatNR-ddnatZONE; i++)
 +    {
 +        comm->sum_nat[i] = 0;
 +    }
 +    comm->ndecomp   = 0;
 +    comm->nload     = 0;
 +    comm->load_step = 0;
 +    comm->load_sum  = 0;
 +    comm->load_max  = 0;
 +    clear_ivec(comm->load_lim);
 +    comm->load_mdf  = 0;
 +    comm->load_pme  = 0;
 +
 +    return comm;
 +}
 +
 +gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
 +                                        unsigned long Flags,
 +                                        ivec nc,
 +                                        real comm_distance_min, real rconstr,
 +                                        const char *dlb_opt, real dlb_scale,
 +                                        const char *sizex, const char *sizey, const char *sizez,
 +                                        gmx_mtop_t *mtop, t_inputrec *ir,
 +                                        matrix box, rvec *x,
 +                                        gmx_ddbox_t *ddbox,
 +                                        int *npme_x, int *npme_y)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    int                recload;
 +    int                d, i, j;
 +    real               r_2b, r_mb, r_bonded = -1, r_bonded_limit = -1, limit, acs;
 +    gmx_bool           bC;
 +    char               buf[STRLEN];
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog,
 +                "\nInitializing Domain Decomposition on %d nodes\n", cr->nnodes);
 +    }
 +
 +    snew(dd, 1);
 +
 +    dd->comm = init_dd_comm();
 +    comm     = dd->comm;
 +
 +    dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
 +    dd->bScrewPBC = (ir->ePBC == epbcSCREW);
 +
 +    dd->bSendRecv2      = dd_nst_env(fplog, "GMX_DD_SENDRECV2", 0);
 +    comm->dlb_scale_lim = dd_nst_env(fplog, "GMX_DLB_MAX", 10);
 +    comm->eFlop         = dd_nst_env(fplog, "GMX_DLB_FLOP", 0);
 +    recload             = dd_nst_env(fplog, "GMX_DD_LOAD", 1);
 +    comm->nstSortCG     = dd_nst_env(fplog, "GMX_DD_SORT", 1);
 +    comm->nstDDDump     = dd_nst_env(fplog, "GMX_DD_DUMP", 0);
 +    comm->nstDDDumpGrid = dd_nst_env(fplog, "GMX_DD_DUMP_GRID", 0);
 +    comm->DD_debug      = dd_nst_env(fplog, "GMX_DD_DEBUG", 0);
 +
 +    dd->pme_recv_f_alloc = 0;
 +    dd->pme_recv_f_buf   = NULL;
 +
 +    if (dd->bSendRecv2 && fplog)
 +    {
 +        fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
 +    }
 +    if (comm->eFlop)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Will load balance based on FLOP count\n");
 +        }
 +        if (comm->eFlop > 1)
 +        {
 +            srand(1+cr->nodeid);
 +        }
 +        comm->bRecordLoad = TRUE;
 +    }
 +    else
 +    {
 +        comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
 +    }
 +
 +    /* Initialize the GPU share count to 0; it might change later */
 +    comm->nrank_gpu_shared = 0;
 +
 +    comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
 +
 +    comm->bDynLoadBal = (comm->eDLB == edlbYES);
 +    if (fplog)
 +    {
 +        fprintf(fplog, "Dynamic load balancing: %s\n", edlb_names[comm->eDLB]);
 +    }
 +    dd->bGridJump              = comm->bDynLoadBal;
 +    comm->bPMELoadBalDLBLimits = FALSE;
 +
 +    if (comm->nstSortCG)
 +    {
 +        if (fplog)
 +        {
 +            if (comm->nstSortCG == 1)
 +            {
 +                fprintf(fplog, "Will sort the charge groups at every domain (re)decomposition\n");
 +            }
 +            else
 +            {
 +                fprintf(fplog, "Will sort the charge groups every %d steps\n",
 +                        comm->nstSortCG);
 +            }
 +        }
 +        snew(comm->sort, 1);
 +    }
 +    else
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog, "Will not sort the charge groups\n");
 +        }
 +    }
 +
 +    comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
 +
 +    comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
 +    if (comm->bInterCGBondeds)
 +    {
 +        comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
 +    }
 +    else
 +    {
 +        comm->bInterCGMultiBody = FALSE;
 +    }
 +
 +    dd->bInterCGcons    = inter_charge_group_constraints(mtop);
 +    dd->bInterCGsettles = inter_charge_group_settles(mtop);
 +
 +    if (ir->rlistlong == 0)
 +    {
 +        /* Set the cut-off to some very large value,
 +         * so we don't need if statements everywhere in the code.
 +         * We use sqrt, since the cut-off is squared in some places.
 +         */
 +        comm->cutoff   = GMX_CUTOFF_INF;
 +    }
 +    else
 +    {
 +        comm->cutoff   = ir->rlistlong;
 +    }
 +    comm->cutoff_mbody = 0;
 +
 +    comm->cellsize_limit = 0;
 +    comm->bBondComm      = FALSE;
 +
 +    if (comm->bInterCGBondeds)
 +    {
 +        if (comm_distance_min > 0)
 +        {
 +            comm->cutoff_mbody = comm_distance_min;
 +            if (Flags & MD_DDBONDCOMM)
 +            {
 +                comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
 +            }
 +            else
 +            {
 +                comm->cutoff = max(comm->cutoff, comm->cutoff_mbody);
 +            }
 +            r_bonded_limit = comm->cutoff_mbody;
 +        }
 +        else if (ir->bPeriodicMols)
 +        {
 +            /* Can not easily determine the required cut-off */
 +            dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
 +            comm->cutoff_mbody = comm->cutoff/2;
 +            r_bonded_limit     = comm->cutoff_mbody;
 +        }
 +        else
 +        {
 +            if (MASTER(cr))
 +            {
 +                dd_bonded_cg_distance(fplog, mtop, ir, x, box,
 +                                      Flags & MD_DDBONDCHECK, &r_2b, &r_mb);
 +            }
 +            gmx_bcast(sizeof(r_2b), &r_2b, cr);
 +            gmx_bcast(sizeof(r_mb), &r_mb, cr);
 +
 +            /* We use an initial margin of 10% for the minimum cell size,
 +             * except when we are just below the non-bonded cut-off.
 +             */
 +            if (Flags & MD_DDBONDCOMM)
 +            {
 +                if (max(r_2b, r_mb) > comm->cutoff)
 +                {
 +                    r_bonded        = max(r_2b, r_mb);
 +                    r_bonded_limit  = 1.1*r_bonded;
 +                    comm->bBondComm = TRUE;
 +                }
 +                else
 +                {
 +                    r_bonded       = r_mb;
 +                    r_bonded_limit = min(1.1*r_bonded, comm->cutoff);
 +                }
 +                /* We determine cutoff_mbody later */
 +            }
 +            else
 +            {
 +                /* No special bonded communication,
 +                 * simply increase the DD cut-off.
 +                 */
 +                r_bonded_limit     = 1.1*max(r_2b, r_mb);
 +                comm->cutoff_mbody = r_bonded_limit;
 +                comm->cutoff       = max(comm->cutoff, comm->cutoff_mbody);
 +            }
 +        }
 +        comm->cellsize_limit = max(comm->cellsize_limit, r_bonded_limit);
 +        if (fplog)
 +        {
 +            fprintf(fplog,
 +                    "Minimum cell size due to bonded interactions: %.3f nm\n",
 +                    comm->cellsize_limit);
 +        }
 +    }
 +
 +    if (dd->bInterCGcons && rconstr <= 0)
 +    {
 +        /* There is a cell size limit due to the constraints (P-LINCS) */
 +        rconstr = constr_r_max(fplog, mtop, ir);
 +        if (fplog)
 +        {
 +            fprintf(fplog,
 +                    "Estimated maximum distance required for P-LINCS: %.3f nm\n",
 +                    rconstr);
 +            if (rconstr > comm->cellsize_limit)
 +            {
 +                fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n");
 +            }
 +        }
 +    }
 +    else if (rconstr > 0 && fplog)
 +    {
 +        /* Here we do not check for dd->bInterCGcons,
 +         * because one can also set a cell size limit for virtual sites only
 +         * and at this point we don't know yet if there are intercg v-sites.
 +         */
 +        fprintf(fplog,
 +                "User supplied maximum distance required for P-LINCS: %.3f nm\n",
 +                rconstr);
 +    }
 +    comm->cellsize_limit = max(comm->cellsize_limit, rconstr);
 +
 +    comm->cgs_gl = gmx_mtop_global_cgs(mtop);
 +
 +    if (nc[XX] > 0)
 +    {
 +        copy_ivec(nc, dd->nc);
 +        set_dd_dim(fplog, dd);
 +        set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, x, ddbox);
 +
 +        if (cr->npmenodes == -1)
 +        {
 +            cr->npmenodes = 0;
 +        }
 +        acs = average_cellsize_min(dd, ddbox);
 +        if (acs < comm->cellsize_limit)
 +        {
 +            if (fplog)
 +            {
 +                fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit);
 +            }
 +            gmx_fatal_collective(FARGS, cr, NULL,
 +                                 "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
 +                                 acs, comm->cellsize_limit);
 +        }
 +    }
 +    else
 +    {
 +        set_ddbox_cr(cr, NULL, ir, box, &comm->cgs_gl, x, ddbox);
 +
 +        /* We need to choose the optimal DD grid and possibly PME nodes */
 +        limit = dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
 +                               comm->eDLB != edlbNO, dlb_scale,
 +                               comm->cellsize_limit, comm->cutoff,
 +                               comm->bInterCGBondeds);
 +
 +        if (dd->nc[XX] == 0)
 +        {
 +            bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
 +            sprintf(buf, "Change the number of nodes or mdrun option %s%s%s",
 +                    !bC ? "-rdd" : "-rcon",
 +                    comm->eDLB != edlbNO ? " or -dds" : "",
 +                    bC ? " or your LINCS settings" : "");
 +
 +            gmx_fatal_collective(FARGS, cr, NULL,
 +                                 "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
 +                                 "%s\n"
 +                                 "Look in the log file for details on the domain decomposition",
 +                                 cr->nnodes-cr->npmenodes, limit, buf);
 +        }
 +        set_dd_dim(fplog, dd);
 +    }
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog,
 +                "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
 +                dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
 +    }
 +
 +    dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
 +    if (cr->nnodes - dd->nnodes != cr->npmenodes)
 +    {
 +        gmx_fatal_collective(FARGS, cr, NULL,
 +                             "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
 +                             dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
 +    }
 +    if (cr->npmenodes > dd->nnodes)
 +    {
 +        gmx_fatal_collective(FARGS, cr, NULL,
 +                             "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.", cr->npmenodes, dd->nnodes);
 +    }
 +    if (cr->npmenodes > 0)
 +    {
 +        comm->npmenodes = cr->npmenodes;
 +    }
 +    else
 +    {
 +        comm->npmenodes = dd->nnodes;
 +    }
 +
 +    if (EEL_PME(ir->coulombtype))
 +    {
 +        /* The following choices should match those
 +         * in comm_cost_est in domdec_setup.c.
 +         * Note that here the checks have to take into account
 +         * that the decomposition might occur in a different order than xyz
 +         * (for instance through the env.var. GMX_DD_ORDER_ZYX),
 +         * in which case they will not match those in comm_cost_est,
 +         * but since that is mainly for testing purposes that's fine.
 +         */
 +        if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
 +            comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
 +            getenv("GMX_PMEONEDD") == NULL)
 +        {
 +            comm->npmedecompdim = 2;
 +            comm->npmenodes_x   = dd->nc[XX];
 +            comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
 +        }
 +        else
 +        {
 +            /* In case nc is 1 in both x and y we could still choose to
 +             * decompose pme in y instead of x, but we use x for simplicity.
 +             */
 +            comm->npmedecompdim = 1;
 +            if (dd->dim[0] == YY)
 +            {
 +                comm->npmenodes_x = 1;
 +                comm->npmenodes_y = comm->npmenodes;
 +            }
 +            else
 +            {
 +                comm->npmenodes_x = comm->npmenodes;
 +                comm->npmenodes_y = 1;
 +            }
 +        }
 +        if (fplog)
 +        {
 +            fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
 +                    comm->npmenodes_x, comm->npmenodes_y, 1);
 +        }
 +    }
 +    else
 +    {
 +        comm->npmedecompdim = 0;
 +        comm->npmenodes_x   = 0;
 +        comm->npmenodes_y   = 0;
 +    }
 +
 +    /* Technically we don't need both of these,
 +     * but keeping both simplifies the code by avoiding recalculation.
 +     */
 +    *npme_x = comm->npmenodes_x;
 +    *npme_y = comm->npmenodes_y;
 +
 +    snew(comm->slb_frac, DIM);
 +    if (comm->eDLB == edlbNO)
 +    {
 +        comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
 +        comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
 +        comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], sizez);
 +    }
 +
 +    if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
 +    {
 +        if (comm->bBondComm || comm->eDLB != edlbNO)
 +        {
 +            /* Set the bonded communication distance to halfway
 +             * the minimum and the maximum,
 +             * since the extra communication cost is nearly zero.
 +             */
 +            acs                = average_cellsize_min(dd, ddbox);
 +            comm->cutoff_mbody = 0.5*(r_bonded + acs);
 +            if (comm->eDLB != edlbNO)
 +            {
 +                /* Check if this does not limit the scaling */
 +                comm->cutoff_mbody = min(comm->cutoff_mbody, dlb_scale*acs);
 +            }
 +            if (!comm->bBondComm)
 +            {
 +                /* Without bBondComm do not go beyond the n.b. cut-off */
 +                comm->cutoff_mbody = min(comm->cutoff_mbody, comm->cutoff);
 +                if (comm->cellsize_limit >= comm->cutoff)
 +                {
 +                    /* We don't lose a lot of efficiency
 +                     * when increasing it to the n.b. cut-off.
 +                     * It can even be slightly faster, because we need
 +                     * less checks for the communication setup.
 +                     */
 +                    comm->cutoff_mbody = comm->cutoff;
 +                }
 +            }
 +            /* Check if we did not end up below our original limit */
 +            comm->cutoff_mbody = max(comm->cutoff_mbody, r_bonded_limit);
 +
 +            if (comm->cutoff_mbody > comm->cellsize_limit)
 +            {
 +                comm->cellsize_limit = comm->cutoff_mbody;
 +            }
 +        }
 +        /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
 +                "cellsize limit %f\n",
 +                comm->bBondComm, comm->cellsize_limit);
 +    }
 +
 +    if (MASTER(cr))
 +    {
 +        check_dd_restrictions(cr, dd, ir, fplog);
 +    }
 +
 +    comm->partition_step = INT_MIN;
 +    dd->ddp_count        = 0;
 +
 +    clear_dd_cycle_counts(dd);
 +
 +    return dd;
 +}
 +
 +static void set_dlb_limits(gmx_domdec_t *dd)
 +
 +{
 +    int d;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dd->comm->cd[d].np                 = dd->comm->cd[d].np_dlb;
 +        dd->comm->cellsize_min[dd->dim[d]] =
 +            dd->comm->cellsize_min_dlb[dd->dim[d]];
 +    }
 +}
 +
 +
 +static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_large_int_t step)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    real               cellsize_min;
 +    int                d, nc, i;
 +    char               buf[STRLEN];
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog, "At step %s the performance loss due to force load imbalance is %.1f %%\n", gmx_step_str(step, buf), dd_force_imb_perf_loss(dd)*100);
 +    }
 +
 +    cellsize_min = comm->cellsize_min[dd->dim[0]];
 +    for (d = 1; d < dd->ndim; d++)
 +    {
 +        cellsize_min = min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
 +    }
 +
 +    if (cellsize_min < comm->cellsize_limit*1.05)
 +    {
 +        dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
 +
 +        /* Change DLB from "auto" to "no". */
 +        comm->eDLB = edlbNO;
 +
 +        return;
 +    }
 +
 +    dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n");
 +    comm->bDynLoadBal = TRUE;
 +    dd->bGridJump     = TRUE;
 +
 +    set_dlb_limits(dd);
 +
 +    /* We can set the required cell size info here,
 +     * so we do not need to communicate this.
 +     * The grid is completely uniform.
 +     */
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        if (comm->root[d])
 +        {
 +            comm->load[d].sum_m = comm->load[d].sum;
 +
 +            nc = dd->nc[dd->dim[d]];
 +            for (i = 0; i < nc; i++)
 +            {
 +                comm->root[d]->cell_f[i]    = i/(real)nc;
 +                if (d > 0)
 +                {
 +                    comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
 +                    comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
 +                }
 +            }
 +            comm->root[d]->cell_f[nc] = 1.0;
 +        }
 +    }
 +}
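 +
 +/* Illustrative example: for nc = 4 cells along a dimension, the uniform
 + * boundaries set above are cell_f = {0, 0.25, 0.5, 0.75}, with
 + * cell_f[nc] = 1.0 closing the last cell.
 + */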
 +
 +static char *init_bLocalCG(gmx_mtop_t *mtop)
 +{
 +    int   ncg, cg;
 +    char *bLocalCG;
 +
 +    ncg = ncg_mtop(mtop);
 +    snew(bLocalCG, ncg);
 +    for (cg = 0; cg < ncg; cg++)
 +    {
 +        bLocalCG[cg] = FALSE;
 +    }
 +
 +    return bLocalCG;
 +}
 +
 +void dd_init_bondeds(FILE *fplog,
 +                     gmx_domdec_t *dd, gmx_mtop_t *mtop,
 +                     gmx_vsite_t *vsite,
 +                     t_inputrec *ir, gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
 +{
 +    gmx_domdec_comm_t *comm;
 +    gmx_bool           bBondComm;
 +    int                d;
 +
 +    dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
 +
 +    comm = dd->comm;
 +
 +    if (comm->bBondComm)
 +    {
 +        /* Communicate atoms beyond the cut-off for bonded interactions */
 +        comm = dd->comm;
 +
 +        comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
 +
 +        comm->bLocalCG = init_bLocalCG(mtop);
 +    }
 +    else
 +    {
 +        /* Only communicate atoms based on cut-off */
 +        comm->cglink   = NULL;
 +        comm->bLocalCG = NULL;
 +    }
 +}
 +
 +static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
 +                              t_inputrec *ir,
 +                              gmx_bool bDynLoadBal, real dlb_scale,
 +                              gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d;
 +    ivec               np;
 +    real               limit, shrink;
 +    char               buf[64];
 +
 +    if (fplog == NULL)
 +    {
 +        return;
 +    }
 +
 +    comm = dd->comm;
 +
 +    if (bDynLoadBal)
 +    {
 +        fprintf(fplog, "The maximum number of communication pulses is:");
 +        for (d = 0; d < dd->ndim; d++)
 +        {
 +            fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
 +        }
 +        fprintf(fplog, "\n");
 +        fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit);
 +        fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale);
 +        fprintf(fplog, "The allowed shrink of domain decomposition cells is:");
 +        for (d = 0; d < DIM; d++)
 +        {
 +            if (dd->nc[d] > 1)
 +            {
 +                if (d >= ddbox->npbcdim && dd->nc[d] == 2)
 +                {
 +                    shrink = 0;
 +                }
 +                else
 +                {
 +                    shrink =
 +                        comm->cellsize_min_dlb[d]/
 +                        (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
 +                }
 +                fprintf(fplog, " %c %.2f", dim2char(d), shrink);
 +            }
 +        }
 +        fprintf(fplog, "\n");
 +    }
 +    else
 +    {
 +        set_dd_cell_sizes_slb(dd, ddbox, FALSE, np);
 +        fprintf(fplog, "The initial number of communication pulses is:");
 +        for (d = 0; d < dd->ndim; d++)
 +        {
 +            fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
 +        }
 +        fprintf(fplog, "\n");
 +        fprintf(fplog, "The initial domain decomposition cell size is:");
 +        for (d = 0; d < DIM; d++)
 +        {
 +            if (dd->nc[d] > 1)
 +            {
 +                fprintf(fplog, " %c %.2f nm",
 +                        dim2char(d), dd->comm->cellsize_min[d]);
 +            }
 +        }
 +        fprintf(fplog, "\n\n");
 +    }
 +
 +    if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
 +    {
 +        fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n");
 +        fprintf(fplog, "%40s  %-7s %6.3f nm\n",
 +                "non-bonded interactions", "", comm->cutoff);
 +
 +        if (bDynLoadBal)
 +        {
 +            limit = dd->comm->cellsize_limit;
 +        }
 +        else
 +        {
 +            if (dynamic_dd_box(ddbox, ir))
 +            {
 +                fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n");
 +            }
 +            limit = dd->comm->cellsize_min[XX];
 +            for (d = 1; d < DIM; d++)
 +            {
 +                limit = min(limit, dd->comm->cellsize_min[d]);
 +            }
 +        }
 +
 +        if (comm->bInterCGBondeds)
 +        {
 +            fprintf(fplog, "%40s  %-7s %6.3f nm\n",
 +                    "two-body bonded interactions", "(-rdd)",
 +                    max(comm->cutoff, comm->cutoff_mbody));
 +            fprintf(fplog, "%40s  %-7s %6.3f nm\n",
 +                    "multi-body bonded interactions", "(-rdd)",
 +                    (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff, limit));
 +        }
 +        if (dd->vsite_comm)
 +        {
 +            fprintf(fplog, "%40s  %-7s %6.3f nm\n",
 +                    "virtual site constructions", "(-rcon)", limit);
 +        }
 +        if (dd->constraint_comm)
 +        {
 +            sprintf(buf, "atoms separated by up to %d constraints",
 +                    1+ir->nProjOrder);
 +            fprintf(fplog, "%40s  %-7s %6.3f nm\n",
 +                    buf, "(-rcon)", limit);
 +        }
 +        fprintf(fplog, "\n");
 +    }
 +
 +    fflush(fplog);
 +}
 +
 +static void set_cell_limits_dlb(gmx_domdec_t      *dd,
 +                                real               dlb_scale,
 +                                const t_inputrec  *ir,
 +                                const gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                d, dim, npulse, npulse_d_max, npulse_d;
 +    gmx_bool           bNoCutOff;
 +
 +    comm = dd->comm;
 +
 +    bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
 +
 +    /* Determine the maximum number of comm. pulses in one dimension */
 +
 +    comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
 +
 +    /* Determine the maximum required number of grid pulses */
 +    if (comm->cellsize_limit >= comm->cutoff)
 +    {
 +        /* Only a single pulse is required */
 +        npulse = 1;
 +    }
 +    else if (!bNoCutOff && comm->cellsize_limit > 0)
 +    {
 +        /* We round down slightly here to avoid overhead due to the latency
 +         * of extra communication calls when the cut-off
 +         * would be only slightly longer than the cell size.
 +         * Later cellsize_limit is redetermined,
 +         * so we can not miss interactions due to this rounding.
 +         */
 +        npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
 +    }
 +    else
 +    {
 +        /* There is no cell size limit */
 +        npulse = max(dd->nc[XX]-1, max(dd->nc[YY]-1, dd->nc[ZZ]-1));
 +    }
 +
 +    if (!bNoCutOff && npulse > 1)
 +    {
 +        /* See if we can do with less pulses, based on dlb_scale */
 +        npulse_d_max = 0;
 +        for (d = 0; d < dd->ndim; d++)
 +        {
 +            dim      = dd->dim[d];
 +            npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
 +                             /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
 +            npulse_d_max = max(npulse_d_max, npulse_d);
 +        }
 +        npulse = min(npulse, npulse_d_max);
 +    }
 +
 +    /* This env var can override npulse */
 +    d = dd_nst_env(debug, "GMX_DD_NPULSE", 0);
 +    if (d > 0)
 +    {
 +        npulse = d;
 +    }
 +
 +    comm->maxpulse       = 1;
 +    comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        comm->cd[d].np_dlb    = min(npulse, dd->nc[dd->dim[d]]-1);
 +        comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
 +        snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
 +        comm->maxpulse = max(comm->maxpulse, comm->cd[d].np_dlb);
 +        if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
 +        {
 +            comm->bVacDLBNoLimit = FALSE;
 +        }
 +    }
 +
 +    /* cellsize_limit is set for LINCS in init_domain_decomposition */
 +    if (!comm->bVacDLBNoLimit)
 +    {
 +        comm->cellsize_limit = max(comm->cellsize_limit,
 +                                   comm->cutoff/comm->maxpulse);
 +    }
 +    comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
 +    /* Set the minimum cell size for each DD dimension */
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        if (comm->bVacDLBNoLimit ||
 +            comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
 +        {
 +            comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
 +        }
 +        else
 +        {
 +            comm->cellsize_min_dlb[dd->dim[d]] =
 +                comm->cutoff/comm->cd[d].np_dlb;
 +        }
 +    }
 +    if (comm->cutoff_mbody <= 0)
 +    {
 +        comm->cutoff_mbody = min(comm->cutoff, comm->cellsize_limit);
 +    }
 +    if (comm->bDynLoadBal)
 +    {
 +        set_dlb_limits(dd);
 +    }
 +}
 +
 +gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd, int ePBC)
 +{
 +    /* If each molecule is a single charge group
 +     * or we use domain decomposition for each periodic dimension,
 +     * we do not need to take pbc into account for the bonded interactions.
 +     */
 +    return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
 +            !(dd->nc[XX] > 1 &&
 +              dd->nc[YY] > 1 &&
 +              (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
 +}
 +
 +void set_dd_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
 +                       t_inputrec *ir, gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                natoms_tot;
 +    real               vol_frac;
 +
 +    comm = dd->comm;
 +
 +    /* Initialize the thread data.
 +     * This cannot be done in init_domain_decomposition,
 +     * as the number of threads is determined later.
 +     */
 +    comm->nth = gmx_omp_nthreads_get(emntDomdec);
 +    if (comm->nth > 1)
 +    {
 +        snew(comm->dth, comm->nth);
 +    }
 +
 +    if (EEL_PME(ir->coulombtype))
 +    {
 +        init_ddpme(dd, &comm->ddpme[0], 0);
 +        if (comm->npmedecompdim >= 2)
 +        {
 +            init_ddpme(dd, &comm->ddpme[1], 1);
 +        }
 +    }
 +    else
 +    {
 +        comm->npmenodes = 0;
 +        if (dd->pme_nodeid >= 0)
 +        {
 +            gmx_fatal_collective(FARGS, NULL, dd,
 +                                 "Can not have separate PME nodes without PME electrostatics");
 +        }
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
 +    }
 +    if (comm->eDLB != edlbNO)
 +    {
 +        set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
 +    }
 +
 +    print_dd_settings(fplog, dd, ir, comm->bDynLoadBal, dlb_scale, ddbox);
 +    if (comm->eDLB == edlbAUTO)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n");
 +        }
 +        print_dd_settings(fplog, dd, ir, TRUE, dlb_scale, ddbox);
 +    }
 +
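 +    /* Estimate the fraction of the total volume that, with the
 +     * communicated zones included, is assigned to this rank; it is
 +     * used below as a size estimate for the global to local atom
 +     * index structure.
 +     */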
 +    if (ir->ePBC == epbcNONE)
 +    {
 +        vol_frac = 1 - 1/(double)dd->nnodes;
 +    }
 +    else
 +    {
 +        vol_frac =
 +            (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes;
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
 +    }
 +    natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
 +
 +    dd->ga2la = ga2la_init(natoms_tot, vol_frac*natoms_tot);
 +}
 +
 +static gmx_bool test_dd_cutoff(t_commrec *cr,
 +                               t_state *state, t_inputrec *ir,
 +                               real cutoff_req)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_ddbox_t   ddbox;
 +    int           d, dim, np;
 +    real          inv_cell_size;
 +    int           LocallyLimited;
 +
 +    dd = cr->dd;
 +
 +    set_ddbox(dd, FALSE, cr, ir, state->box,
 +              TRUE, &dd->comm->cgs_gl, state->x, &ddbox);
 +
 +    LocallyLimited = 0;
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +
 +        inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
 +        if (dynamic_dd_box(&ddbox, ir))
 +        {
 +            inv_cell_size *= DD_PRES_SCALE_MARGIN;
 +        }
 +
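 +        /* The number of pulses this cut-off would require with the
 +         * current cell size.
 +         */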
 +        np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
 +
 +        if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
 +            dd->comm->cd[d].np_dlb > 0)
 +        {
 +            if (np > dd->comm->cd[d].np_dlb)
 +            {
 +                return FALSE;
 +            }
 +
 +            /* If a current local cell size is smaller than the requested
 +             * cut-off, we could still fix it, but this gets very complicated.
 +             * Without fixing it here, we might actually need more checks.
 +             */
 +            if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
 +            {
 +                LocallyLimited = 1;
 +            }
 +        }
 +    }
 +
 +    if (dd->comm->eDLB != edlbNO)
 +    {
 +        /* If DLB is not active yet, we don't need to check the grid jumps.
 +         * Actually we shouldn't, because then the grid jump data is not set.
 +         */
 +        if (dd->comm->bDynLoadBal &&
 +            check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
 +        {
 +            LocallyLimited = 1;
 +        }
 +
 +        gmx_sumi(1, &LocallyLimited, cr);
 +
 +        if (LocallyLimited > 0)
 +        {
 +            return FALSE;
 +        }
 +    }
 +
 +    return TRUE;
 +}
 +
 +gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, t_inputrec *ir,
 +                          real cutoff_req)
 +{
 +    gmx_bool bCutoffAllowed;
 +
 +    bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req);
 +
 +    if (bCutoffAllowed)
 +    {
 +        cr->dd->comm->cutoff = cutoff_req;
 +    }
 +
 +    return bCutoffAllowed;
 +}
 +
 +void change_dd_dlb_cutoff_limit(t_commrec *cr)
 +{
 +    gmx_domdec_comm_t *comm;
 +
 +    comm = cr->dd->comm;
 +
 +    /* Turn on the DLB limiting (might have been on already) */
 +    comm->bPMELoadBalDLBLimits = TRUE;
 +
 +    /* Change the cut-off limit */
 +    comm->PMELoadBal_max_cutoff = comm->cutoff;
 +}
 +
 +static void merge_cg_buffers(int ncell,
 +                             gmx_domdec_comm_dim_t *cd, int pulse,
 +                             int  *ncg_cell,
 +                             int  *index_gl, int  *recv_i,
 +                             rvec *cg_cm,    rvec *recv_vr,
 +                             int *cgindex,
 +                             cginfo_mb_t *cginfo_mb, int *cginfo)
 +{
 +    gmx_domdec_ind_t *ind, *ind_p;
 +    int               p, cell, c, cg, cg0, cg1, cg_gl, nat;
 +    int               shift, shift_at;
 +
 +    ind = &cd->ind[pulse];
 +
 +    /* First correct the already stored data */
 +    shift = ind->nrecv[ncell];
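 +    /* After subtracting nrecv[cell] below, shift is the number of new
 +     * charge groups arriving for lower cells, i.e. how far the stored
 +     * data of this cell has to be moved up.
 +     */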
 +    for (cell = ncell-1; cell >= 0; cell--)
 +    {
 +        shift -= ind->nrecv[cell];
 +        if (shift > 0)
 +        {
 +            /* Move the cg's present from previous grid pulses */
 +            cg0                = ncg_cell[ncell+cell];
 +            cg1                = ncg_cell[ncell+cell+1];
 +            cgindex[cg1+shift] = cgindex[cg1];
 +            for (cg = cg1-1; cg >= cg0; cg--)
 +            {
 +                index_gl[cg+shift] = index_gl[cg];
 +                copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
 +                cgindex[cg+shift] = cgindex[cg];
 +                cginfo[cg+shift]  = cginfo[cg];
 +            }
 +            /* Correct the already stored send indices for the shift */
 +            for (p = 1; p <= pulse; p++)
 +            {
 +                ind_p = &cd->ind[p];
 +                cg0   = 0;
 +                for (c = 0; c < cell; c++)
 +                {
 +                    cg0 += ind_p->nsend[c];
 +                }
 +                cg1 = cg0 + ind_p->nsend[cell];
 +                for (cg = cg0; cg < cg1; cg++)
 +                {
 +                    ind_p->index[cg] += shift;
 +                }
 +            }
 +        }
 +    }
 +
 +    /* Merge in the communicated buffers */
 +    shift    = 0;
 +    shift_at = 0;
 +    cg0      = 0;
 +    for (cell = 0; cell < ncell; cell++)
 +    {
 +        cg1 = ncg_cell[ncell+cell+1] + shift;
 +        if (shift_at > 0)
 +        {
 +            /* Correct the old cg indices */
 +            for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
 +            {
 +                cgindex[cg+1] += shift_at;
 +            }
 +        }
 +        for (cg = 0; cg < ind->nrecv[cell]; cg++)
 +        {
 +            /* Copy this charge group from the buffer */
 +            index_gl[cg1] = recv_i[cg0];
 +            copy_rvec(recv_vr[cg0], cg_cm[cg1]);
 +            /* Add it to the cgindex */
 +            cg_gl          = index_gl[cg1];
 +            cginfo[cg1]    = ddcginfo(cginfo_mb, cg_gl);
 +            nat            = GET_CGINFO_NATOMS(cginfo[cg1]);
 +            cgindex[cg1+1] = cgindex[cg1] + nat;
 +            cg0++;
 +            cg1++;
 +            shift_at += nat;
 +        }
 +        shift                 += ind->nrecv[cell];
 +        ncg_cell[ncell+cell+1] = cg1;
 +    }
 +}
 +
 +static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
 +                               int nzone, int cg0, const int *cgindex)
 +{
 +    int cg, zone, p;
 +
 +    /* Store the atom block boundaries for easy copying of communication buffers
 +     */
 +    cg = cg0;
 +    for (zone = 0; zone < nzone; zone++)
 +    {
 +        for (p = 0; p < cd->np; p++)
 +        {
 +            cd->ind[p].cell2at0[zone] = cgindex[cg];
 +            cg += cd->ind[p].nrecv[zone];
 +            cd->ind[p].cell2at1[zone] = cgindex[cg];
 +        }
 +    }
 +}
 +
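 +/* Returns TRUE when charge group cg_gl is linked through bonded
 + * interactions to at least one charge group that is not local.
 + */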
 +static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
 +{
 +    int      i;
 +    gmx_bool bMiss;
 +
 +    bMiss = FALSE;
 +    for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
 +    {
 +        if (!bLocalCG[link->a[i]])
 +        {
 +            bMiss = TRUE;
 +        }
 +    }
 +
 +    return bMiss;
 +}
 +
 +/* Domain corners for communication, a maximum of 4 i-zones see a j domain */
 +typedef struct {
 +    real c[DIM][4]; /* the corners for the non-bonded communication */
 +    real cr0;       /* corner for rounding */
 +    real cr1[4];    /* corners for rounding */
 +    real bc[DIM];   /* corners for bounded communication */
 +    real bcr1;      /* corner for rounding for bonded communication */
 +} dd_corners_t;
 +
 +/* Determine the corners of the domain(s) we are communicating with */
 +static void
 +set_dd_corners(const gmx_domdec_t *dd,
 +               int dim0, int dim1, int dim2,
 +               gmx_bool bDistMB,
 +               dd_corners_t *c)
 +{
 +    const gmx_domdec_comm_t  *comm;
 +    const gmx_domdec_zones_t *zones;
 +    int i, j;
 +
 +    comm = dd->comm;
 +
 +    zones = &comm->zones;
 +
 +    /* Keep the compiler happy */
 +    c->cr0  = 0;
 +    c->bcr1 = 0;
 +
 +    /* The first dimension is equal for all cells */
 +    c->c[0][0] = comm->cell_x0[dim0];
 +    if (bDistMB)
 +    {
 +        c->bc[0] = c->c[0][0];
 +    }
 +    if (dd->ndim >= 2)
 +    {
 +        dim1 = dd->dim[1];
 +        /* This cell row is only seen from the first row */
 +        c->c[1][0] = comm->cell_x0[dim1];
 +        /* All rows can see this row */
 +        c->c[1][1] = comm->cell_x0[dim1];
 +        if (dd->bGridJump)
 +        {
 +            c->c[1][1] = max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
 +            if (bDistMB)
 +            {
 +                /* For the multi-body distance we need the maximum */
 +                c->bc[1] = max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
 +            }
 +        }
 +        /* Set the upper-right corner for rounding */
 +        c->cr0 = comm->cell_x1[dim0];
 +
 +        if (dd->ndim >= 3)
 +        {
 +            dim2 = dd->dim[2];
 +            for (j = 0; j < 4; j++)
 +            {
 +                c->c[2][j] = comm->cell_x0[dim2];
 +            }
 +            if (dd->bGridJump)
 +            {
 +                /* Use the maximum of the i-cells that see a j-cell */
 +                for (i = 0; i < zones->nizone; i++)
 +                {
 +                    for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
 +                    {
 +                        if (j >= 4)
 +                        {
 +                            c->c[2][j-4] =
 +                                max(c->c[2][j-4],
 +                                    comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
 +                        }
 +                    }
 +                }
 +                if (bDistMB)
 +                {
 +                    /* For the multi-body distance we need the maximum */
 +                    c->bc[2] = comm->cell_x0[dim2];
 +                    for (i = 0; i < 2; i++)
 +                    {
 +                        for (j = 0; j < 2; j++)
 +                        {
 +                            c->bc[2] = max(c->bc[2], comm->zone_d2[i][j].p1_0);
 +                        }
 +                    }
 +                }
 +            }
 +
 +            /* Set the upper-right corner for rounding */
 +            /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
 +             * Only cell (0,0,0) can see cell 7 (1,1,1)
 +             */
 +            c->cr1[0] = comm->cell_x1[dim1];
 +            c->cr1[3] = comm->cell_x1[dim1];
 +            if (dd->bGridJump)
 +            {
 +                c->cr1[0] = max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
 +                if (bDistMB)
 +                {
 +                    /* For the multi-body distance we need the maximum */
 +                    c->bcr1 = max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +/* Determine which cg's we need to send in this pulse from this zone */
 +static void
 +get_zone_pulse_cgs(gmx_domdec_t *dd,
 +                   int zonei, int zone,
 +                   int cg0, int cg1,
 +                   const int *index_gl,
 +                   const int *cgindex,
 +                   int dim, int dim_ind,
 +                   int dim0, int dim1, int dim2,
 +                   real r_comm2, real r_bcomm2,
 +                   matrix box,
 +                   ivec tric_dist,
 +                   rvec *normal,
 +                   real skew_fac2_d, real skew_fac_01,
 +                   rvec *v_d, rvec *v_0, rvec *v_1,
 +                   const dd_corners_t *c,
 +                   rvec sf2_round,
 +                   gmx_bool bDistBonded,
 +                   gmx_bool bBondComm,
 +                   gmx_bool bDist2B,
 +                   gmx_bool bDistMB,
 +                   rvec *cg_cm,
 +                   int *cginfo,
 +                   gmx_domdec_ind_t *ind,
 +                   int **ibuf, int *ibuf_nalloc,
 +                   vec_rvec_t *vbuf,
 +                   int *nsend_ptr,
 +                   int *nat_ptr,
 +                   int *nsend_z_ptr)
 +{
 +    gmx_domdec_comm_t *comm;
 +    gmx_bool           bScrew;
 +    gmx_bool           bDistMB_pulse;
 +    int                cg, i;
 +    real               r2, rb2, r, tric_sh;
 +    rvec               rn, rb;
 +    int                dimd;
 +    int                nsend_z, nsend, nat;
 +
 +    comm = dd->comm;
 +
 +    bScrew = (dd->bScrewPBC && dim == XX);
 +
 +    bDistMB_pulse = (bDistMB && bDistBonded);
 +
 +    nsend_z = 0;
 +    nsend   = *nsend_ptr;
 +    nat     = *nat_ptr;
 +
 +    for (cg = cg0; cg < cg1; cg++)
 +    {
 +        r2  = 0;
 +        rb2 = 0;
 +        if (tric_dist[dim_ind] == 0)
 +        {
 +            /* Rectangular direction, easy */
 +            r = cg_cm[cg][dim] - c->c[dim_ind][zone];
 +            if (r > 0)
 +            {
 +                r2 += r*r;
 +            }
 +            if (bDistMB_pulse)
 +            {
 +                r = cg_cm[cg][dim] - c->bc[dim_ind];
 +                if (r > 0)
 +                {
 +                    rb2 += r*r;
 +                }
 +            }
 +            /* Rounding gives at most a 16% reduction
 +             * in communicated atoms
 +             */
 +            if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
 +            {
 +                r = cg_cm[cg][dim0] - c->cr0;
 +                /* This is the first dimension, so always r >= 0 */
 +                r2 += r*r;
 +                if (bDistMB_pulse)
 +                {
 +                    rb2 += r*r;
 +                }
 +            }
 +            if (dim_ind == 2 && (zonei == 2 || zonei == 3))
 +            {
 +                r = cg_cm[cg][dim1] - c->cr1[zone];
 +                if (r > 0)
 +                {
 +                    r2 += r*r;
 +                }
 +                if (bDistMB_pulse)
 +                {
 +                    r = cg_cm[cg][dim1] - c->bcr1;
 +                    if (r > 0)
 +                    {
 +                        rb2 += r*r;
 +                    }
 +                }
 +            }
 +        }
 +        else
 +        {
 +            /* Triclinic direction, more complicated */
 +            clear_rvec(rn);
 +            clear_rvec(rb);
 +            /* Rounding, conservative as the skew_fac multiplication
 +             * will slightly underestimate the distance.
 +             */
 +            if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
 +            {
 +                rn[dim0] = cg_cm[cg][dim0] - c->cr0;
 +                for (i = dim0+1; i < DIM; i++)
 +                {
 +                    rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
 +                }
 +                r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
 +                if (bDistMB_pulse)
 +                {
 +                    rb[dim0] = rn[dim0];
 +                    rb2      = r2;
 +                }
 +                /* Take care that the cell planes along dim0 might not
 +                 * be orthogonal to those along dim1 and dim2.
 +                 */
 +                for (i = 1; i <= dim_ind; i++)
 +                {
 +                    dimd = dd->dim[i];
 +                    if (normal[dim0][dimd] > 0)
 +                    {
 +                        rn[dimd] -= rn[dim0]*normal[dim0][dimd];
 +                        if (bDistMB_pulse)
 +                        {
 +                            rb[dimd] -= rb[dim0]*normal[dim0][dimd];
 +                        }
 +                    }
 +                }
 +            }
 +            if (dim_ind == 2 && (zonei == 2 || zonei == 3))
 +            {
 +                rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
 +                tric_sh   = 0;
 +                for (i = dim1+1; i < DIM; i++)
 +                {
 +                    tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
 +                }
 +                rn[dim1] += tric_sh;
 +                if (rn[dim1] > 0)
 +                {
 +                    r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
 +                    /* Take care of coupling of the distances
 +                     * to the planes along dim0 and dim1 through dim2.
 +                     */
 +                    r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
 +                    /* Take care that the cell planes along dim1
 +                     * might not be orthogonal to that along dim2.
 +                     */
 +                    if (normal[dim1][dim2] > 0)
 +                    {
 +                        rn[dim2] -= rn[dim1]*normal[dim1][dim2];
 +                    }
 +                }
 +                if (bDistMB_pulse)
 +                {
 +                    rb[dim1] +=
 +                        cg_cm[cg][dim1] - c->bcr1 + tric_sh;
 +                    if (rb[dim1] > 0)
 +                    {
 +                        rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
 +                        /* Take care of coupling of the distances
 +                         * to the planes along dim0 and dim1 through dim2.
 +                         */
 +                        rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
 +                        /* Take care that the cell planes along dim1
 +                         * might not be orthogonal to that along dim2.
 +                         */
 +                        if (normal[dim1][dim2] > 0)
 +                        {
 +                            rb[dim2] -= rb[dim1]*normal[dim1][dim2];
 +                        }
 +                    }
 +                }
 +            }
 +            /* The distance along the communication direction */
 +            rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
 +            tric_sh  = 0;
 +            for (i = dim+1; i < DIM; i++)
 +            {
 +                tric_sh -= cg_cm[cg][i]*v_d[i][dim];
 +            }
 +            rn[dim] += tric_sh;
 +            if (rn[dim] > 0)
 +            {
 +                r2 += rn[dim]*rn[dim]*skew_fac2_d;
 +                /* Take care of coupling of the distances
 +                 * to the planes along dim0 and dim1 through dim2.
 +                 */
 +                if (dim_ind == 1 && zonei == 1)
 +                {
 +                    r2 -= rn[dim0]*rn[dim]*skew_fac_01;
 +                }
 +            }
 +            if (bDistMB_pulse)
 +            {
 +                clear_rvec(rb);
 +                rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
 +                if (rb[dim] > 0)
 +                {
 +                    rb2 += rb[dim]*rb[dim]*skew_fac2_d;
 +                    /* Take care of coupling of the distances
 +                     * to the planes along dim0 and dim1 through dim2.
 +                     */
 +                    if (dim_ind == 1 && zonei == 1)
 +                    {
 +                        rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
 +                    }
 +                }
 +            }
 +        }
 +
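 +        /* Send the cg when it is within the non-bonded cut-off, or,
 +         * for the first pulse, within the bonded cut-off and, with
 +         * bBondComm, only when it has a bonded link to a non-local cg.
 +         */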
 +        if (r2 < r_comm2 ||
 +            (bDistBonded &&
 +             ((bDistMB && rb2 < r_bcomm2) ||
 +              (bDist2B && r2  < r_bcomm2)) &&
 +             (!bBondComm ||
 +              (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
 +               missing_link(comm->cglink, index_gl[cg],
 +                            comm->bLocalCG)))))
 +        {
 +            /* Make an index to the local charge groups */
 +            if (nsend+1 > ind->nalloc)
 +            {
 +                ind->nalloc = over_alloc_large(nsend+1);
 +                srenew(ind->index, ind->nalloc);
 +            }
 +            if (nsend+1 > *ibuf_nalloc)
 +            {
 +                *ibuf_nalloc = over_alloc_large(nsend+1);
 +                srenew(*ibuf, *ibuf_nalloc);
 +            }
 +            ind->index[nsend] = cg;
 +            (*ibuf)[nsend]    = index_gl[cg];
 +            nsend_z++;
 +            vec_rvec_check_alloc(vbuf, nsend+1);
 +
 +            if (dd->ci[dim] == 0)
 +            {
 +                /* Correct cg_cm for pbc */
 +                rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
 +                if (bScrew)
 +                {
 +                    vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
 +                    vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
 +                }
 +            }
 +            else
 +            {
 +                copy_rvec(cg_cm[cg], vbuf->v[nsend]);
 +            }
 +            nsend++;
 +            nat += cgindex[cg+1] - cgindex[cg];
 +        }
 +    }
 +
 +    *nsend_ptr   = nsend;
 +    *nat_ptr     = nat;
 +    *nsend_z_ptr = nsend_z;
 +}
 +
 +static void setup_dd_communication(gmx_domdec_t *dd,
 +                                   matrix box, gmx_ddbox_t *ddbox,
 +                                   t_forcerec *fr, t_state *state, rvec **f)
 +{
 +    int                    dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
 +    int                    nzone, nzone_send, zone, zonei, cg0, cg1;
 +    int                    c, i, j, cg, cg_gl, nrcg;
 +    int                   *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
 +    gmx_domdec_comm_t     *comm;
 +    gmx_domdec_zones_t    *zones;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t      *ind;
 +    cginfo_mb_t           *cginfo_mb;
 +    gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
 +    real                   r_mb, r_comm2, r_scomm2, r_bcomm2, r_0, r_1, r2inc, inv_ncg;
 +    dd_corners_t           corners;
 +    ivec                   tric_dist;
 +    rvec                  *cg_cm, *normal, *v_d, *v_0 = NULL, *v_1 = NULL, *recv_vr;
 +    real                   skew_fac2_d, skew_fac_01;
 +    rvec                   sf2_round;
 +    int                    nsend, nat;
 +    int                    th;
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Setting up DD communication\n");
 +    }
 +
 +    comm  = dd->comm;
 +
 +    switch (fr->cutoff_scheme)
 +    {
 +        case ecutsGROUP:
 +            cg_cm = fr->cg_cm;
 +            break;
 +        case ecutsVERLET:
 +            cg_cm = state->x;
 +            break;
 +        default:
 +            gmx_incons("unimplemented");
 +            cg_cm = NULL;
 +    }
 +
 +    for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
 +    {
 +        dim = dd->dim[dim_ind];
 +
 +        /* Check if we need to use triclinic distances */
 +        tric_dist[dim_ind] = 0;
 +        for (i = 0; i <= dim_ind; i++)
 +        {
 +            if (ddbox->tric_dir[dd->dim[i]])
 +            {
 +                tric_dist[dim_ind] = 1;
 +            }
 +        }
 +    }
 +
 +    bBondComm = comm->bBondComm;
 +
 +    /* Do we need to determine extra distances for multi-body bondeds? */
 +    bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
 +
 +    /* Do we need to determine extra distances for only two-body bondeds? */
 +    bDist2B = (bBondComm && !bDistMB);
 +
 +    r_comm2  = sqr(comm->cutoff);
 +    r_bcomm2 = sqr(comm->cutoff_mbody);
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, sqrt(r_bcomm2));
 +    }
 +
 +    zones = &comm->zones;
 +
 +    dim0 = dd->dim[0];
 +    dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
 +    dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
 +
 +    set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
 +
 +    /* Triclinic stuff */
 +    normal      = ddbox->normal;
 +    skew_fac_01 = 0;
 +    if (dd->ndim >= 2)
 +    {
 +        v_0 = ddbox->v[dim0];
 +        if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
 +        {
 +            /* Determine the coupling coefficient for the distances
 +             * to the cell planes along dim0 and dim1 through dim2.
 +             * This is required for correct rounding.
 +             */
 +            skew_fac_01 =
 +                ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
 +            if (debug)
 +            {
 +                fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
 +            }
 +        }
 +    }
 +    if (dd->ndim >= 3)
 +    {
 +        v_1 = ddbox->v[dim1];
 +    }
 +
 +    zone_cg_range = zones->cg_range;
 +    index_gl      = dd->index_gl;
 +    cgindex       = dd->cgindex;
 +    cginfo_mb     = fr->cginfo_mb;
 +
 +    zone_cg_range[0]   = 0;
 +    zone_cg_range[1]   = dd->ncg_home;
 +    comm->zone_ncg1[0] = dd->ncg_home;
 +    pos_cg             = dd->ncg_home;
 +
 +    nat_tot = dd->nat_home;
 +    nzone   = 1;
 +    for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
 +    {
 +        dim = dd->dim[dim_ind];
 +        cd  = &comm->cd[dim_ind];
 +
 +        if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
 +        {
 +            /* No pbc in this dimension, the first node should not comm. */
 +            nzone_send = 0;
 +        }
 +        else
 +        {
 +            nzone_send = nzone;
 +        }
 +
 +        v_d         = ddbox->v[dim];
 +        skew_fac2_d = sqr(ddbox->skew_fac[dim]);
 +
 +        cd->bInPlace = TRUE;
 +        for (p = 0; p < cd->np; p++)
 +        {
 +            /* Only atoms communicated in the first pulse are used
 +             * for multi-body bonded interactions or for bBondComm.
 +             */
 +            bDistBonded = ((bDistMB || bDist2B) && p == 0);
 +
 +            ind   = &cd->ind[p];
 +            nsend = 0;
 +            nat   = 0;
 +            for (zone = 0; zone < nzone_send; zone++)
 +            {
 +                if (tric_dist[dim_ind] && dim_ind > 0)
 +                {
 +                    /* Determine slightly more optimized skew_fac's
 +                     * for rounding.
 +                     * This reduces the number of communicated atoms
 +                     * by about 10% for 3D DD of rhombic dodecahedra.
 +                     */
 +                    for (dimd = 0; dimd < dim; dimd++)
 +                    {
 +                        sf2_round[dimd] = 1;
 +                        if (ddbox->tric_dir[dimd])
 +                        {
 +                            for (i = dd->dim[dimd]+1; i < DIM; i++)
 +                            {
 +                                /* If we are shifted in dimension i
 +                                 * and the cell plane is tilted forward
 +                                 * in dimension i, skip this coupling.
 +                                 */
 +                                if (!(zones->shift[nzone+zone][i] &&
 +                                      ddbox->v[dimd][i][dimd] >= 0))
 +                                {
 +                                    sf2_round[dimd] +=
 +                                        sqr(ddbox->v[dimd][i][dimd]);
 +                                }
 +                            }
 +                            sf2_round[dimd] = 1/sf2_round[dimd];
 +                        }
 +                    }
 +                }
 +
 +                zonei = zone_perm[dim_ind][zone];
 +                if (p == 0)
 +                {
 +                    /* Here we permute the zones to obtain a convenient order
 +                     * for neighbor searching
 +                     */
 +                    cg0 = zone_cg_range[zonei];
 +                    cg1 = zone_cg_range[zonei+1];
 +                }
 +                else
 +                {
 +                    /* Look only at the cg's received in the previous grid pulse
 +                     */
 +                    cg1 = zone_cg_range[nzone+zone+1];
 +                    cg0 = cg1 - cd->ind[p-1].nrecv[zone];
 +                }
 +
 +#pragma omp parallel for num_threads(comm->nth) schedule(static)
 +                for (th = 0; th < comm->nth; th++)
 +                {
 +                    gmx_domdec_ind_t *ind_p;
 +                    int             **ibuf_p, *ibuf_nalloc_p;
 +                    vec_rvec_t       *vbuf_p;
 +                    int              *nsend_p, *nat_p;
 +                    int              *nsend_zone_p;
 +                    int               cg0_th, cg1_th;
 +
 +                    if (th == 0)
 +                    {
 +                        /* Thread 0 writes in the comm buffers */
 +                        ind_p         = ind;
 +                        ibuf_p        = &comm->buf_int;
 +                        ibuf_nalloc_p = &comm->nalloc_int;
 +                        vbuf_p        = &comm->vbuf;
 +                        nsend_p       = &nsend;
 +                        nat_p         = &nat;
 +                        nsend_zone_p  = &ind->nsend[zone];
 +                    }
 +                    else
 +                    {
 +                        /* Other threads write into temp buffers */
 +                        ind_p         = &comm->dth[th].ind;
 +                        ibuf_p        = &comm->dth[th].ibuf;
 +                        ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
 +                        vbuf_p        = &comm->dth[th].vbuf;
 +                        nsend_p       = &comm->dth[th].nsend;
 +                        nat_p         = &comm->dth[th].nat;
 +                        nsend_zone_p  = &comm->dth[th].nsend_zone;
 +
 +                        comm->dth[th].nsend      = 0;
 +                        comm->dth[th].nat        = 0;
 +                        comm->dth[th].nsend_zone = 0;
 +                    }
 +
 +                    if (comm->nth == 1)
 +                    {
 +                        cg0_th = cg0;
 +                        cg1_th = cg1;
 +                    }
 +                    else
 +                    {
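 +                        /* Divide the cg range statically over the threads */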
 +                        cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
 +                        cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
 +                    }
 +
 +                    /* Get the cg's for this pulse in this zone */
 +                    get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
 +                                       index_gl, cgindex,
 +                                       dim, dim_ind, dim0, dim1, dim2,
 +                                       r_comm2, r_bcomm2,
 +                                       box, tric_dist,
 +                                       normal, skew_fac2_d, skew_fac_01,
 +                                       v_d, v_0, v_1, &corners, sf2_round,
 +                                       bDistBonded, bBondComm,
 +                                       bDist2B, bDistMB,
 +                                       cg_cm, fr->cginfo,
 +                                       ind_p,
 +                                       ibuf_p, ibuf_nalloc_p,
 +                                       vbuf_p,
 +                                       nsend_p, nat_p,
 +                                       nsend_zone_p);
 +                }
 +
 +                /* Append data of threads>=1 to the communication buffers */
 +                for (th = 1; th < comm->nth; th++)
 +                {
 +                    dd_comm_setup_work_t *dth;
 +                    int                   i, ns1;
 +
 +                    dth = &comm->dth[th];
 +
 +                    ns1 = nsend + dth->nsend_zone;
 +                    if (ns1 > ind->nalloc)
 +                    {
 +                        ind->nalloc = over_alloc_dd(ns1);
 +                        srenew(ind->index, ind->nalloc);
 +                    }
 +                    if (ns1 > comm->nalloc_int)
 +                    {
 +                        comm->nalloc_int = over_alloc_dd(ns1);
 +                        srenew(comm->buf_int, comm->nalloc_int);
 +                    }
 +                    if (ns1 > comm->vbuf.nalloc)
 +                    {
 +                        comm->vbuf.nalloc = over_alloc_dd(ns1);
 +                        srenew(comm->vbuf.v, comm->vbuf.nalloc);
 +                    }
 +
 +                    for (i = 0; i < dth->nsend_zone; i++)
 +                    {
 +                        ind->index[nsend]    = dth->ind.index[i];
 +                        comm->buf_int[nsend] = dth->ibuf[i];
 +                        copy_rvec(dth->vbuf.v[i],
 +                                  comm->vbuf.v[nsend]);
 +                        nsend++;
 +                    }
 +                    nat              += dth->nat;
 +                    ind->nsend[zone] += dth->nsend_zone;
 +                }
 +            }
 +            /* Clear the counts in case we do not have pbc */
 +            for (zone = nzone_send; zone < nzone; zone++)
 +            {
 +                ind->nsend[zone] = 0;
 +            }
 +            ind->nsend[nzone]   = nsend;
 +            ind->nsend[nzone+1] = nat;
 +            /* Communicate the number of cg's and atoms to receive */
 +            dd_sendrecv_int(dd, dim_ind, dddirBackward,
 +                            ind->nsend, nzone+2,
 +                            ind->nrecv, nzone+2);
 +
 +            /* The rvec buffer is also required for atom buffers of size nsend
 +             * in dd_move_x and dd_move_f.
 +             */
 +            vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);
 +
 +            if (p > 0)
 +            {
 +                /* We can receive in place if only the last zone is not empty */
 +                for (zone = 0; zone < nzone-1; zone++)
 +                {
 +                    if (ind->nrecv[zone] > 0)
 +                    {
 +                        cd->bInPlace = FALSE;
 +                    }
 +                }
 +                if (!cd->bInPlace)
 +                {
 +                    /* The int buffer is only required here for the cg indices */
 +                    if (ind->nrecv[nzone] > comm->nalloc_int2)
 +                    {
 +                        comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
 +                        srenew(comm->buf_int2, comm->nalloc_int2);
 +                    }
 +                    /* The rvec buffer is also required for atom buffers
 +                     * of size nrecv in dd_move_x and dd_move_f.
 +                     */
 +                    i = max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
 +                    vec_rvec_check_alloc(&comm->vbuf2, i);
 +                }
 +            }
 +
 +            /* Make space for the global cg indices */
 +            if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
 +                || dd->cg_nalloc == 0)
 +            {
 +                dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
 +                srenew(index_gl, dd->cg_nalloc);
 +                srenew(cgindex, dd->cg_nalloc+1);
 +            }
 +            /* Communicate the global cg indices */
 +            if (cd->bInPlace)
 +            {
 +                recv_i = index_gl + pos_cg;
 +            }
 +            else
 +            {
 +                recv_i = comm->buf_int2;
 +            }
 +            dd_sendrecv_int(dd, dim_ind, dddirBackward,
 +                            comm->buf_int, nsend,
 +                            recv_i,        ind->nrecv[nzone]);
 +
 +            /* Make space for cg_cm */
 +            dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
 +            if (fr->cutoff_scheme == ecutsGROUP)
 +            {
 +                cg_cm = fr->cg_cm;
 +            }
 +            else
 +            {
 +                cg_cm = state->x;
 +            }
 +            /* Communicate cg_cm */
 +            if (cd->bInPlace)
 +            {
 +                recv_vr = cg_cm + pos_cg;
 +            }
 +            else
 +            {
 +                recv_vr = comm->vbuf2.v;
 +            }
 +            dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
 +                             comm->vbuf.v, nsend,
 +                             recv_vr,      ind->nrecv[nzone]);
 +
 +            /* Make the charge group index */
 +            if (cd->bInPlace)
 +            {
 +                zone = (p == 0 ? 0 : nzone - 1);
 +                while (zone < nzone)
 +                {
 +                    for (cg = 0; cg < ind->nrecv[zone]; cg++)
 +                    {
 +                        cg_gl              = index_gl[pos_cg];
 +                        fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
 +                        nrcg               = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
 +                        cgindex[pos_cg+1]  = cgindex[pos_cg] + nrcg;
 +                        if (bBondComm)
 +                        {
 +                            /* Update the charge group presence,
 +                             * so we can use it in the next pass of the loop.
 +                             */
 +                            comm->bLocalCG[cg_gl] = TRUE;
 +                        }
 +                        pos_cg++;
 +                    }
 +                    if (p == 0)
 +                    {
 +                        comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
 +                    }
 +                    zone++;
 +                    zone_cg_range[nzone+zone] = pos_cg;
 +                }
 +            }
 +            else
 +            {
 +                /* This part of the code is never executed with bBondComm. */
 +                merge_cg_buffers(nzone, cd, p, zone_cg_range,
 +                                 index_gl, recv_i, cg_cm, recv_vr,
 +                                 cgindex, fr->cginfo_mb, fr->cginfo);
 +                pos_cg += ind->nrecv[nzone];
 +            }
 +            nat_tot += ind->nrecv[nzone+1];
 +        }
 +        if (!cd->bInPlace)
 +        {
 +            /* Store the atom block for easy copying of communication buffers */
 +            make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
 +        }
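 +        /* The number of zones doubles with each communicated dimension */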
 +        nzone += nzone;
 +    }
 +    dd->index_gl = index_gl;
 +    dd->cgindex  = cgindex;
 +
 +    dd->ncg_tot          = zone_cg_range[zones->n];
 +    dd->nat_tot          = nat_tot;
 +    comm->nat[ddnatHOME] = dd->nat_home;
 +    for (i = ddnatZONE; i < ddnatNR; i++)
 +    {
 +        comm->nat[i] = dd->nat_tot;
 +    }
 +
 +    if (!bBondComm)
 +    {
 +        /* We don't need to update cginfo, since that was already done above.
 +         * So we pass NULL for the forcerec.
 +         */
 +        dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
 +                      NULL, comm->bLocalCG);
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Finished setting up DD communication, zones:");
 +        for (c = 0; c < zones->n; c++)
 +        {
 +            fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
 +        }
 +        fprintf(debug, "\n");
 +    }
 +}
 +
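 +/* Set the charge group ranges of the j-zones seen by each i-zone */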
 +static void set_cg_boundaries(gmx_domdec_zones_t *zones)
 +{
 +    int c;
 +
 +    for (c = 0; c < zones->nizone; c++)
 +    {
 +        zones->izone[c].cg1  = zones->cg_range[c+1];
 +        zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
 +        zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
 +    }
 +}
 +
 +static void set_zones_size(gmx_domdec_t *dd,
 +                           matrix box, const gmx_ddbox_t *ddbox,
 +                           int zone_start, int zone_end)
 +{
 +    gmx_domdec_comm_t  *comm;
 +    gmx_domdec_zones_t *zones;
 +    gmx_bool            bDistMB;
 +    int                 z, zi, zj0, zj1, d, dim;
 +    real                rcs, rcmbs;
 +    int                 i, j;
 +    real                size_j, add_tric;
 +    real                vol;
 +
 +    comm = dd->comm;
 +
 +    zones = &comm->zones;
 +
 +    /* Do we need to determine extra distances for multi-body bondeds? */
 +    bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
 +
 +    for (z = zone_start; z < zone_end; z++)
 +    {
 +        /* Copy cell limits to zone limits.
 +         * Valid for non-DD dims and non-shifted dims.
 +         */
 +        copy_rvec(comm->cell_x0, zones->size[z].x0);
 +        copy_rvec(comm->cell_x1, zones->size[z].x1);
 +    }
 +
 +    for (d = 0; d < dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +
 +        for (z = 0; z < zones->n; z++)
 +        {
 +            /* With a staggered grid we have different sizes
 +             * for non-shifted dimensions.
 +             */
 +            if (dd->bGridJump && zones->shift[z][dim] == 0)
 +            {
 +                if (d == 1)
 +                {
 +                    zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
 +                    zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
 +                }
 +                else if (d == 2)
 +                {
 +                    zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
 +                    zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
 +                }
 +            }
 +        }
 +
 +        rcs   = comm->cutoff;
 +        rcmbs = comm->cutoff_mbody;
 +        if (ddbox->tric_dir[dim])
 +        {
 +            rcs   /= ddbox->skew_fac[dim];
 +            rcmbs /= ddbox->skew_fac[dim];
 +        }
 +
 +        /* Set the lower limit for the shifted zone dimensions */
 +        for (z = zone_start; z < zone_end; z++)
 +        {
 +            if (zones->shift[z][dim] > 0)
 +            {
 +                dim = dd->dim[d];
 +                if (!dd->bGridJump || d == 0)
 +                {
 +                    zones->size[z].x0[dim] = comm->cell_x1[dim];
 +                    zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
 +                }
 +                else
 +                {
 +                    /* Here we take the lower limit of the zone from
 +                     * the lowest domain of the zone below.
 +                     */
 +                    if (z < 4)
 +                    {
 +                        zones->size[z].x0[dim] =
 +                            comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
 +                    }
 +                    else
 +                    {
 +                        if (d == 1)
 +                        {
 +                            zones->size[z].x0[dim] =
 +                                zones->size[zone_perm[2][z-4]].x0[dim];
 +                        }
 +                        else
 +                        {
 +                            zones->size[z].x0[dim] =
 +                                comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
 +                        }
 +                    }
 +                    /* A temporary limit; it is updated below */
 +                    zones->size[z].x1[dim] = zones->size[z].x0[dim];
 +
 +                    if (bDistMB)
 +                    {
 +                        for (zi = 0; zi < zones->nizone; zi++)
 +                        {
 +                            if (zones->shift[zi][dim] == 0)
 +                            {
 +                                /* This takes the whole zone into account.
 +                                 * With multiple pulses this will lead
 +                                 * to a larger zone than strictly necessary.
 +                                 */
 +                                zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
 +                                                             zones->size[zi].x1[dim]+rcmbs);
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +
 +        /* Loop over the i-zones to set the upper limit of each
 +         * j-zone they see.
 +         */
 +        for (zi = 0; zi < zones->nizone; zi++)
 +        {
 +            if (zones->shift[zi][dim] == 0)
 +            {
 +                for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
 +                {
 +                    if (zones->shift[z][dim] > 0)
 +                    {
 +                        zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
 +                                                     zones->size[zi].x1[dim]+rcs);
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    for (z = zone_start; z < zone_end; z++)
 +    {
 +        /* Initialization only required to keep the compiler happy */
 +        rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
 +        int  nc, c;
 +
 +        /* To determine the bounding box for a zone we need to find
 +         * the extreme corners of 4, 2 or 1 corners.
 +         */
 +        nc = 1 << (ddbox->npbcdim - 1);
 +
 +        for (c = 0; c < nc; c++)
 +        {
 +            /* Set up a zone corner at x=0, ignoring triclinic couplings */
 +            corner[XX] = 0;
 +            if ((c & 1) == 0)
 +            {
 +                corner[YY] = zones->size[z].x0[YY];
 +            }
 +            else
 +            {
 +                corner[YY] = zones->size[z].x1[YY];
 +            }
 +            if ((c & 2) == 0)
 +            {
 +                corner[ZZ] = zones->size[z].x0[ZZ];
 +            }
 +            else
 +            {
 +                corner[ZZ] = zones->size[z].x1[ZZ];
 +            }
 +            if (dd->ndim == 1 && box[ZZ][YY] != 0)
 +            {
 +                /* With 1D domain decomposition the cg's are not in
 +                 * the triclinic box, but in a box triclinic in x-y and rectangular in y-z.
 +                 * Shift y back, so it will later end up at 0.
 +                 */
 +                corner[YY] -= corner[ZZ]*box[ZZ][YY]/box[ZZ][ZZ];
 +            }
 +            /* Apply the triclinic couplings */
 +            for (i = YY; i < ddbox->npbcdim; i++)
 +            {
 +                for (j = XX; j < i; j++)
 +                {
 +                    corner[j] += corner[i]*box[i][j]/box[i][i];
 +                }
 +            }
 +            if (c == 0)
 +            {
 +                copy_rvec(corner, corner_min);
 +                copy_rvec(corner, corner_max);
 +            }
 +            else
 +            {
 +                for (i = 0; i < DIM; i++)
 +                {
 +                    corner_min[i] = min(corner_min[i], corner[i]);
 +                    corner_max[i] = max(corner_max[i], corner[i]);
 +                }
 +            }
 +        }
 +        /* Copy the extreme corners without offset along x */
 +        for (i = 0; i < DIM; i++)
 +        {
 +            zones->size[z].bb_x0[i] = corner_min[i];
 +            zones->size[z].bb_x1[i] = corner_max[i];
 +        }
 +        /* Add the offset along x */
 +        zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
 +        zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
 +    }
 +
 +    if (zone_start == 0)
 +    {
 +        vol = 1;
 +        for (dim = 0; dim < DIM; dim++)
 +        {
 +            vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
 +        }
 +        zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
 +    }
 +
 +    if (debug)
 +    {
 +        for (z = zone_start; z < zone_end; z++)
 +        {
 +            fprintf(debug, "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
 +                    z,
 +                    zones->size[z].x0[XX], zones->size[z].x1[XX],
 +                    zones->size[z].x0[YY], zones->size[z].x1[YY],
 +                    zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
 +            fprintf(debug, "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
 +                    z,
 +                    zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
 +                    zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
 +                    zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
 +        }
 +    }
 +}
 +
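 +/* qsort comparator: order charge groups by ns grid cell index,
 + * then by global topology index.
 + */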
 +static int comp_cgsort(const void *a, const void *b)
 +{
 +    int           comp;
 +    gmx_cgsort_t *cga, *cgb;
 +
 +    cga = (gmx_cgsort_t *)a;
 +    cgb = (gmx_cgsort_t *)b;
 +
 +    comp = cga->nsc - cgb->nsc;
 +    if (comp == 0)
 +    {
 +        comp = cga->ind_gl - cgb->ind_gl;
 +    }
 +
 +    return comp;
 +}
 +
 +static void order_int_cg(int n, const gmx_cgsort_t *sort,
 +                         int *a, int *buf)
 +{
 +    int i;
 +
 +    /* Order the data */
 +    for (i = 0; i < n; i++)
 +    {
 +        buf[i] = a[sort[i].ind];
 +    }
 +
 +    /* Copy back to the original array */
 +    for (i = 0; i < n; i++)
 +    {
 +        a[i] = buf[i];
 +    }
 +}
 +
 +static void order_vec_cg(int n, const gmx_cgsort_t *sort,
 +                         rvec *v, rvec *buf)
 +{
 +    int i;
 +
 +    /* Order the data */
 +    for (i = 0; i < n; i++)
 +    {
 +        copy_rvec(v[sort[i].ind], buf[i]);
 +    }
 +
 +    /* Copy back to the original array */
 +    for (i = 0; i < n; i++)
 +    {
 +        copy_rvec(buf[i], v[i]);
 +    }
 +}
 +
 +static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
 +                           rvec *v, rvec *buf)
 +{
 +    int a, atot, cg, cg0, cg1, i;
 +
 +    if (cgindex == NULL)
 +    {
 +        /* Avoid the useless loop over the atoms within a cg */
 +        order_vec_cg(ncg, sort, v, buf);
 +
 +        return;
 +    }
 +
 +    /* Order the data */
 +    a = 0;
 +    for (cg = 0; cg < ncg; cg++)
 +    {
 +        cg0 = cgindex[sort[cg].ind];
 +        cg1 = cgindex[sort[cg].ind+1];
 +        for (i = cg0; i < cg1; i++)
 +        {
 +            copy_rvec(v[i], buf[a]);
 +            a++;
 +        }
 +    }
 +    atot = a;
 +
 +    /* Copy back to the original array */
 +    for (a = 0; a < atot; a++)
 +    {
 +        copy_rvec(buf[a], v[a]);
 +    }
 +}
 +
 +static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
 +                         int nsort_new, gmx_cgsort_t *sort_new,
 +                         gmx_cgsort_t *sort1)
 +{
 +    int i1, i2, i_new;
 +
 +    /* The new indices are not very ordered, so we qsort them */
 +    qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);
 +
 +    /* sort2 is already ordered, so now we can merge the two arrays */
 +    i1    = 0;
 +    i2    = 0;
 +    i_new = 0;
 +    while (i2 < nsort2 || i_new < nsort_new)
 +    {
 +        if (i2 == nsort2)
 +        {
 +            sort1[i1++] = sort_new[i_new++];
 +        }
 +        else if (i_new == nsort_new)
 +        {
 +            sort1[i1++] = sort2[i2++];
 +        }
 +        else if (sort2[i2].nsc < sort_new[i_new].nsc ||
 +                 (sort2[i2].nsc == sort_new[i_new].nsc &&
 +                  sort2[i2].ind_gl < sort_new[i_new].ind_gl))
 +        {
 +            sort1[i1++] = sort2[i2++];
 +        }
 +        else
 +        {
 +            sort1[i1++] = sort_new[i_new++];
 +        }
 +    }
 +}
 +
 +static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
 +{
 +    gmx_domdec_sort_t *sort;
 +    gmx_cgsort_t      *cgsort, *sort_i;
 +    int                ncg_new, nsort2, nsort_new, i, *a, moved, *ibuf;
 +    int                sort_last, sort_skip;
 +
 +    sort = dd->comm->sort;
 +
 +    a = fr->ns.grid->cell_index;
 +
 +    moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;
 +
 +    if (ncg_home_old >= 0)
 +    {
 +        /* The charge groups that remained in the same ns grid cell
 +         * are already completely ordered. So we can sort efficiently
 +         * by sorting only the charge groups that moved and merging
 +         * them into the stationary list.
 +         */
 +        ncg_new   = 0;
 +        nsort2    = 0;
 +        nsort_new = 0;
 +        for (i = 0; i < dd->ncg_home; i++)
 +        {
 +            /* Check if this cg did not move to another node */
 +            if (a[i] < moved)
 +            {
 +                if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
 +                {
 +                    /* This cg is new on this node or moved to another ns grid cell */
 +                    if (nsort_new >= sort->sort_new_nalloc)
 +                    {
 +                        sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
 +                        srenew(sort->sort_new, sort->sort_new_nalloc);
 +                    }
 +                    sort_i = &(sort->sort_new[nsort_new++]);
 +                }
 +                else
 +                {
 +                    /* This cg did not move */
 +                    sort_i = &(sort->sort2[nsort2++]);
 +                }
 +                /* Sort on the ns grid cell indices
 +                 * and the global topology index.
 +                 * index_gl is irrelevant with cell ns,
 +                 * but we set it here anyhow to avoid a conditional.
 +                 */
 +                sort_i->nsc    = a[i];
 +                sort_i->ind_gl = dd->index_gl[i];
 +                sort_i->ind    = i;
 +                ncg_new++;
 +            }
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
 +                    nsort2, nsort_new);
 +        }
 +        /* Sort efficiently */
 +        ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
 +                     sort->sort);
 +    }
 +    else
 +    {
 +        cgsort  = sort->sort;
 +        ncg_new = 0;
 +        for (i = 0; i < dd->ncg_home; i++)
 +        {
 +            /* Sort on the ns grid cell indices
 +             * and the global topology index
 +             */
 +            cgsort[i].nsc    = a[i];
 +            cgsort[i].ind_gl = dd->index_gl[i];
 +            cgsort[i].ind    = i;
 +            if (cgsort[i].nsc < moved)
 +            {
 +                ncg_new++;
 +            }
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
 +        }
 +        /* Determine the order of the charge groups using qsort */
 +        qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
 +    }
 +
 +    return ncg_new;
 +}
 +
 +static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
 +{
 +    gmx_cgsort_t *sort;
 +    int           ncg_new, i, *a, na;
 +
 +    sort = dd->comm->sort->sort;
 +
 +    nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);
 +
 +    ncg_new = 0;
 +    for (i = 0; i < na; i++)
 +    {
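 +        /* Negative entries do not correspond to local atoms;
 +         * keep only the valid indices, which already come in
 +         * nbnxn grid order.
 +         */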
 +        if (a[i] >= 0)
 +        {
 +            sort[ncg_new].ind = a[i];
 +            ncg_new++;
 +        }
 +    }
 +
 +    return ncg_new;
 +}
 +
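 +/* Sorts the local state, the charge group indices and, with the
 + * group scheme, the charge group centers of mass into the new order.
 + */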
 +static void dd_sort_state(gmx_domdec_t *dd, rvec *cgcm, t_forcerec *fr, t_state *state,
 +                          int ncg_home_old)
 +{
 +    gmx_domdec_sort_t *sort;
 +    gmx_cgsort_t      *cgsort, *sort_i;
 +    int               *cgindex;
 +    int                ncg_new, i, *ibuf, cgsize;
 +    rvec              *vbuf;
 +
 +    sort = dd->comm->sort;
 +
 +    if (dd->ncg_home > sort->sort_nalloc)
 +    {
 +        sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
 +        srenew(sort->sort, sort->sort_nalloc);
 +        srenew(sort->sort2, sort->sort_nalloc);
 +    }
 +    cgsort = sort->sort;
 +
 +    switch (fr->cutoff_scheme)
 +    {
 +        case ecutsGROUP:
 +            ncg_new = dd_sort_order(dd, fr, ncg_home_old);
 +            break;
 +        case ecutsVERLET:
 +            ncg_new = dd_sort_order_nbnxn(dd, fr);
 +            break;
 +        default:
 +            gmx_incons("unimplemented");
 +            ncg_new = 0;
 +    }
 +
 +    /* We alloc with the old size, since cgindex is still old */
 +    vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
 +    vbuf = dd->comm->vbuf.v;
 +
 +    if (dd->comm->bCGs)
 +    {
 +        cgindex = dd->cgindex;
 +    }
 +    else
 +    {
 +        cgindex = NULL;
 +    }
 +
 +    /* Remove the charge groups which are no longer at home here */
 +    dd->ncg_home = ncg_new;
 +    if (debug)
 +    {
 +        fprintf(debug, "Set the new home charge group count to %d\n",
 +                dd->ncg_home);
 +    }
 +
 +    /* Reorder the state */
 +    for (i = 0; i < estNR; i++)
 +    {
 +        if (EST_DISTR(i) && (state->flags & (1<<i)))
 +        {
 +            switch (i)
 +            {
 +                case estX:
 +                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->x, vbuf);
 +                    break;
 +                case estV:
 +                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->v, vbuf);
 +                    break;
 +                case estSDX:
 +                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->sd_X, vbuf);
 +                    break;
 +                case estCGP:
 +                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->cg_p, vbuf);
 +                    break;
 +                case estLD_RNG:
 +                case estLD_RNGI:
 +                case estDISRE_INITF:
 +                case estDISRE_RM3TAV:
 +                case estORIRE_INITF:
 +                case estORIRE_DTAV:
 +                    /* No ordering required */
 +                    break;
 +                default:
 +                    gmx_incons("Unknown state entry encountered in dd_sort_state");
 +                    break;
 +            }
 +        }
 +    }
 +    if (fr->cutoff_scheme == ecutsGROUP)
 +    {
 +        /* Reorder cgcm */
 +        order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
 +    }
 +
 +    if (dd->ncg_home+1 > sort->ibuf_nalloc)
 +    {
 +        sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
 +        srenew(sort->ibuf, sort->ibuf_nalloc);
 +    }
 +    ibuf = sort->ibuf;
 +    /* Reorder the global cg index */
 +    order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
 +    /* Reorder the cginfo */
 +    order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
 +    /* Rebuild the local cg index */
 +    if (dd->comm->bCGs)
 +    {
 +        ibuf[0] = 0;
 +        for (i = 0; i < dd->ncg_home; i++)
 +        {
 +            cgsize    = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
 +            ibuf[i+1] = ibuf[i] + cgsize;
 +        }
 +        for (i = 0; i < dd->ncg_home+1; i++)
 +        {
 +            dd->cgindex[i] = ibuf[i];
 +        }
 +    }
 +    else
 +    {
 +        for (i = 0; i < dd->ncg_home+1; i++)
 +        {
 +            dd->cgindex[i] = i;
 +        }
 +    }
 +    /* Set the home atom number */
 +    dd->nat_home = dd->cgindex[dd->ncg_home];
 +
 +    if (fr->cutoff_scheme == ecutsVERLET)
 +    {
 +        /* The atoms are now exactly in grid order, update the grid order */
 +        nbnxn_set_atomorder(fr->nbv->nbs);
 +    }
 +    else
 +    {
 +        /* Copy the sorted ns cell indices back to the ns grid struct */
 +        for (i = 0; i < dd->ncg_home; i++)
 +        {
 +            fr->ns.grid->cell_index[i] = cgsort[i].nsc;
 +        }
 +        fr->ns.grid->nr = dd->ncg_home;
 +    }
 +}
 +
 +static void add_dd_statistics(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                ddnat;
 +
 +    comm = dd->comm;
 +
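 +    /* comm->nat[] holds cumulative atom counts per communication
 +     * stage, so each difference below is the count added by one stage.
 +     */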
 +    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
 +    {
 +        comm->sum_nat[ddnat-ddnatZONE] +=
 +            comm->nat[ddnat] - comm->nat[ddnat-1];
 +    }
 +    comm->ndecomp++;
 +}
 +
 +void reset_dd_statistics_counters(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                ddnat;
 +
 +    comm = dd->comm;
 +
 +    /* Reset all the statistics and counters for total run counting */
 +    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
 +    {
 +        comm->sum_nat[ddnat-ddnatZONE] = 0;
 +    }
 +    comm->ndecomp   = 0;
 +    comm->nload     = 0;
 +    comm->load_step = 0;
 +    comm->load_sum  = 0;
 +    comm->load_max  = 0;
 +    clear_ivec(comm->load_lim);
 +    comm->load_mdf = 0;
 +    comm->load_pme = 0;
 +}
 +
 +void print_dd_statistics(t_commrec *cr, t_inputrec *ir, FILE *fplog)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int                ddnat;
 +    double             av;
 +
 +    comm = cr->dd->comm;
 +
 +    gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
 +
 +    if (fplog == NULL)
 +    {
 +        return;
 +    }
 +
 +    fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
 +
 +    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
 +    {
 +        av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
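 +        /* The leading factor printed below is the number of
 +         * communication passes per step, e.g. coordinates forward
 +         * plus forces back for the force calculation.
 +         */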
 +        switch (ddnat)
 +        {
 +            case ddnatZONE:
 +                fprintf(fplog,
 +                        " av. #atoms communicated per step for force:  %d x %.1f\n",
 +                        2, av);
 +                break;
 +            case ddnatVSITE:
 +                if (cr->dd->vsite_comm)
 +                {
 +                    fprintf(fplog,
 +                            " av. #atoms communicated per step for vsites: %d x %.1f\n",
 +                            (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
 +                            av);
 +                }
 +                break;
 +            case ddnatCON:
 +                if (cr->dd->constraint_comm)
 +                {
 +                    fprintf(fplog,
 +                            " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
 +                            1 + ir->nLincsIter, av);
 +                }
 +                break;
 +            default:
 +                gmx_incons(" Unknown type for DD statistics");
 +        }
 +    }
 +    fprintf(fplog, "\n");
 +
 +    if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
 +    {
 +        print_dd_load_av(fplog, cr->dd);
 +    }
 +}
 +
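 +/* Repartitions the system over the DD nodes: redistributes the
 + * charge groups, rebuilds the local topology, sets up the
 + * communication and updates the local state and mdatoms.
 + */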
 +void dd_partition_system(FILE                *fplog,
 +                         gmx_large_int_t      step,
 +                         t_commrec           *cr,
 +                         gmx_bool             bMasterState,
 +                         int                  nstglobalcomm,
 +                         t_state             *state_global,
 +                         gmx_mtop_t          *top_global,
 +                         t_inputrec          *ir,
 +                         t_state             *state_local,
 +                         rvec               **f,
 +                         t_mdatoms           *mdatoms,
 +                         gmx_localtop_t      *top_local,
 +                         t_forcerec          *fr,
 +                         gmx_vsite_t         *vsite,
 +                         gmx_shellfc_t        shellfc,
 +                         gmx_constr_t         constr,
 +                         t_nrnb              *nrnb,
 +                         gmx_wallcycle_t      wcycle,
 +                         gmx_bool             bVerbose)
 +{
 +    gmx_domdec_t      *dd;
 +    gmx_domdec_comm_t *comm;
 +    gmx_ddbox_t        ddbox = {0};
 +    t_block           *cgs_gl;
 +    gmx_large_int_t    step_pcoupl;
 +    rvec               cell_ns_x0, cell_ns_x1;
 +    int                i, j, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
 +    gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckDLB, bTurnOnDLB, bLogLoad;
 +    gmx_bool           bRedist, bSortCG, bResortAll;
 +    ivec               ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
 +    real               grid_density;
 +    char               sbuf[22];
 +
 +    dd   = cr->dd;
 +    comm = dd->comm;
 +
 +    bBoxChanged = (bMasterState || DEFORM(*ir));
 +    if (ir->epc != epcNO)
 +    {
 +        /* With nstpcouple > 1 pressure coupling happens
 +         * one step after calculating the pressure.
 +         * Box scaling happens at the end of the MD step,
 +         * after the DD partitioning.
 +         * We therefore have to do DLB in the first partitioning
 +         * after an MD step where P-coupling occurred.
 +         * We need to determine the last step in which p-coupling occurred.
 +         * MRS -- need to validate this for vv?
 +         */
 +        n = ir->nstpcouple;
 +        if (n == 1)
 +        {
 +            step_pcoupl = step - 1;
 +        }
 +        else
 +        {
 +            step_pcoupl = ((step - 1)/n)*n + 1;
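 +            /* Integer division rounds down, e.g. n = 5, step = 12
 +             * gives ((12-1)/5)*5 + 1 = 11.
 +             */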
 +        }
 +        if (step_pcoupl >= comm->partition_step)
 +        {
 +            bBoxChanged = TRUE;
 +        }
 +    }
 +
 +    bNStGlobalComm = (step % nstglobalcomm == 0);
 +
 +    if (!comm->bDynLoadBal)
 +    {
 +        bDoDLB = FALSE;
 +    }
 +    else
 +    {
 +        /* Should we do dynamic load balancing this step?
 +         * Since it requires (possibly expensive) global communication,
 +         * we might want to do DLB less frequently.
 +         */
 +        if (bBoxChanged || ir->epc != epcNO)
 +        {
 +            bDoDLB = bBoxChanged;
 +        }
 +        else
 +        {
 +            bDoDLB = bNStGlobalComm;
 +        }
 +    }
 +
 +    /* Check if we have recorded loads on the nodes */
 +    if (comm->bRecordLoad && dd_load_count(comm))
 +    {
 +        if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
 +        {
 +            /* Check if we should use DLB at the second partitioning
 +             * and every 100 partitionings,
 +             * so the extra communication cost is negligible.
 +             */
 +            n         = max(100, nstglobalcomm);
 +            bCheckDLB = (comm->n_load_collect == 0 ||
 +                         comm->n_load_have % n == n-1);
 +        }
 +        else
 +        {
 +            bCheckDLB = FALSE;
 +        }
 +
 +        /* Print the load to the log file every nstlog steps and at the first and last step */
 +        bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
 +                    comm->n_load_collect == 0 ||
 +                    (ir->nsteps >= 0 &&
 +                     (step + ir->nstlist > ir->init_step + ir->nsteps)));
 +
 +        /* Avoid extra communication due to verbose screen output
 +         * when nstglobalcomm is set.
 +         */
 +        if (bDoDLB || bLogLoad || bCheckDLB ||
 +            (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
 +        {
 +            get_load_distribution(dd, wcycle);
 +            if (DDMASTER(dd))
 +            {
 +                if (bLogLoad)
 +                {
 +                    dd_print_load(fplog, dd, step-1);
 +                }
 +                if (bVerbose)
 +                {
 +                    dd_print_load_verbose(dd);
 +                }
 +            }
 +            comm->n_load_collect++;
 +
 +            if (bCheckDLB)
 +            {
 +                /* Since the timings are node dependent, the master decides */
 +                if (DDMASTER(dd))
 +                {
 +                    bTurnOnDLB =
 +                        (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
 +                    if (debug)
 +                    {
 +                        fprintf(debug, "step %s, imb loss %f\n",
 +                                gmx_step_str(step, sbuf),
 +                                dd_force_imb_perf_loss(dd));
 +                    }
 +                }
 +                dd_bcast(dd, sizeof(bTurnOnDLB), &bTurnOnDLB);
 +                if (bTurnOnDLB)
 +                {
 +                    turn_on_dlb(fplog, cr, step);
 +                    bDoDLB = TRUE;
 +                }
 +            }
 +        }
 +        comm->n_load_have++;
 +    }
 +
 +    cgs_gl = &comm->cgs_gl;
 +
 +    bRedist = FALSE;
 +    if (bMasterState)
 +    {
 +        /* Clear the old state */
 +        clear_dd_indices(dd, 0, 0);
 +        ncgindex_set = 0;
 +
 +        set_ddbox(dd, bMasterState, cr, ir, state_global->box,
 +                  TRUE, cgs_gl, state_global->x, &ddbox);
 +
 +        get_cg_distribution(fplog, step, dd, cgs_gl,
 +                            state_global->box, &ddbox, state_global->x);
 +
 +        dd_distribute_state(dd, cgs_gl,
 +                            state_global, state_local, f);
 +
 +        dd_make_local_cgs(dd, &top_local->cgs);
 +
 +        /* Ensure that we have space for the new distribution */
 +        dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
 +
 +        if (fr->cutoff_scheme == ecutsGROUP)
 +        {
 +            calc_cgcm(fplog, 0, dd->ncg_home,
 +                      &top_local->cgs, state_local->x, fr->cg_cm);
 +        }
 +
 +        inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
 +
 +        dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
 +    }
 +    else if (state_local->ddp_count != dd->ddp_count)
 +    {
 +        if (state_local->ddp_count > dd->ddp_count)
 +        {
 +            gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
 +        }
 +
 +        if (state_local->ddp_count_cg_gl != state_local->ddp_count)
 +        {
 +            gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
 +        }
 +
 +        /* Clear the old state */
 +        clear_dd_indices(dd, 0, 0);
 +
 +        /* Build the new indices */
 +        rebuild_cgindex(dd, cgs_gl->index, state_local);
 +        make_dd_indices(dd, cgs_gl->index, 0);
 +        ncgindex_set = dd->ncg_home;
 +
 +        if (fr->cutoff_scheme == ecutsGROUP)
 +        {
 +            /* Redetermine the cg COMs */
 +            calc_cgcm(fplog, 0, dd->ncg_home,
 +                      &top_local->cgs, state_local->x, fr->cg_cm);
 +        }
 +
 +        inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
 +
 +        dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
 +
 +        set_ddbox(dd, bMasterState, cr, ir, state_local->box,
 +                  TRUE, &top_local->cgs, state_local->x, &ddbox);
 +
 +        bRedist = comm->bDynLoadBal;
 +    }
 +    else
 +    {
 +        /* We have the full state, only redistribute the cgs */
 +
 +        /* Clear the non-home indices */
 +        clear_dd_indices(dd, dd->ncg_home, dd->nat_home);
 +        ncgindex_set = 0;
 +
 +        /* Avoid global communication for dims without pbc when -gcom is used */
 +        if (!bNStGlobalComm)
 +        {
 +            copy_rvec(comm->box0, ddbox.box0    );
 +            copy_rvec(comm->box_size, ddbox.box_size);
 +        }
 +        set_ddbox(dd, bMasterState, cr, ir, state_local->box,
 +                  bNStGlobalComm, &top_local->cgs, state_local->x, &ddbox);
 +
 +        bBoxChanged = TRUE;
 +        bRedist     = TRUE;
 +    }
 +    /* For dims without pbc and -gcom */
 +    copy_rvec(ddbox.box0, comm->box0    );
 +    copy_rvec(ddbox.box_size, comm->box_size);
 +
 +    set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB,
 +                      step, wcycle);
 +
 +    if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
 +    {
 +        write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
 +    }
 +
 +    /* Check if we should sort the charge groups */
 +    if (comm->nstSortCG > 0)
 +    {
 +        bSortCG = (bMasterState ||
 +                   (bRedist && (step % comm->nstSortCG == 0)));
 +    }
 +    else
 +    {
 +        bSortCG = FALSE;
 +    }
 +
 +    ncg_home_old = dd->ncg_home;
 +
 +    ncg_moved = 0;
 +    if (bRedist)
 +    {
 +        wallcycle_sub_start(wcycle, ewcsDD_REDIST);
 +
 +        dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
 +                           state_local, f, fr,
 +                           !bSortCG, nrnb, &ncgindex_set, &ncg_moved);
 +
 +        wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
 +    }
 +
 +    get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
 +                          dd, &ddbox,
 +                          &comm->cell_x0, &comm->cell_x1,
 +                          dd->ncg_home, fr->cg_cm,
 +                          cell_ns_x0, cell_ns_x1, &grid_density);
 +
 +    if (bBoxChanged)
 +    {
 +        comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
 +    }
 +
 +    switch (fr->cutoff_scheme)
 +    {
 +        case ecutsGROUP:
 +            copy_ivec(fr->ns.grid->n, ncells_old);
 +            grid_first(fplog, fr->ns.grid, dd, &ddbox,
 +                       state_local->box, cell_ns_x0, cell_ns_x1,
 +                       fr->rlistlong, grid_density);
 +            break;
 +        case ecutsVERLET:
 +            nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
 +            break;
 +        default:
 +            gmx_incons("unimplemented");
 +    }
 +    /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
 +    copy_ivec(ddbox.tric_dir, comm->tric_dir);
 +
 +    if (bSortCG)
 +    {
 +        wallcycle_sub_start(wcycle, ewcsDD_GRID);
 +
 +        /* Sort the state on charge group position.
 +         * This enables exact restarts from this step.
 +         * It also improves performance by about 15% with larger numbers
 +         * of atoms per node.
 +         */
 +
 +        /* Fill the ns grid with the charge groups of the home cell,
 +         * so we can sort with the indices.
 +         */
 +        set_zones_ncg_home(dd);
 +
 +        switch (fr->cutoff_scheme)
 +        {
 +            case ecutsVERLET:
 +                set_zones_size(dd, state_local->box, &ddbox, 0, 1);
 +
 +                nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
 +                                  0,
 +                                  comm->zones.size[0].bb_x0,
 +                                  comm->zones.size[0].bb_x1,
 +                                  0, dd->ncg_home,
 +                                  comm->zones.dens_zone0,
 +                                  fr->cginfo,
 +                                  state_local->x,
 +                                  ncg_moved, bRedist ? comm->moved : NULL,
 +                                  fr->nbv->grp[eintLocal].kernel_type,
 +                                  fr->nbv->grp[eintLocal].nbat);
 +
 +                nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
 +                break;
 +            case ecutsGROUP:
 +                fill_grid(&comm->zones, fr->ns.grid, dd->ncg_home,
 +                          0, dd->ncg_home, fr->cg_cm);
 +
 +                copy_ivec(fr->ns.grid->n, ncells_new);
 +                break;
 +            default:
 +                gmx_incons("unimplemented");
 +        }
 +
 +        bResortAll = bMasterState;
 +
 +        /* Check if we can use the old order and ns grid cell indices
 +         * of the charge groups to sort the charge groups efficiently.
 +         */
 +        if (ncells_new[XX] != ncells_old[XX] ||
 +            ncells_new[YY] != ncells_old[YY] ||
 +            ncells_new[ZZ] != ncells_old[ZZ])
 +        {
 +            bResortAll = TRUE;
 +        }
 +
 +        if (debug)
 +        {
 +            fprintf(debug, "Step %s, sorting the %d home charge groups\n",
 +                    gmx_step_str(step, sbuf), dd->ncg_home);
 +        }
 +        dd_sort_state(dd, fr->cg_cm, fr, state_local,
 +                      bResortAll ? -1 : ncg_home_old);
 +        /* Rebuild all the indices */
 +        ga2la_clear(dd->ga2la);
 +        ncgindex_set = 0;
 +
 +        wallcycle_sub_stop(wcycle, ewcsDD_GRID);
 +    }
 +
 +    wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
 +
 +    /* Set up the communication and communicate the coordinates */
 +    setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
 +
 +    /* Set the indices */
 +    make_dd_indices(dd, cgs_gl->index, ncgindex_set);
 +
 +    /* Set the charge group boundaries for neighbor searching */
 +    set_cg_boundaries(&comm->zones);
 +
 +    if (fr->cutoff_scheme == ecutsVERLET)
 +    {
 +        set_zones_size(dd, state_local->box, &ddbox,
 +                       bSortCG ? 1 : 0, comm->zones.n);
 +    }
 +
 +    wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
 +
 +    /*
 +       write_dd_pdb("dd_home",step,"dump",top_global,cr,
 +                 -1,state_local->x,state_local->box);
 +     */
 +
 +    wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
 +
 +    /* Extract a local topology from the global topology */
 +    for (i = 0; i < dd->ndim; i++)
 +    {
 +        np[dd->dim[i]] = comm->cd[i].np;
 +    }
 +    dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box,
 +                      comm->cellsize_min, np,
 +                      fr,
 +                      fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : state_local->x,
 +                      vsite, top_global, top_local);
 +
 +    wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
 +
 +    wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
 +
 +    /* Set up the special atom communication */
 +    n = comm->nat[ddnatZONE];
 +    for (i = ddnatZONE+1; i < ddnatNR; i++)
 +    {
 +        switch (i)
 +        {
 +            case ddnatVSITE:
 +                if (vsite && vsite->n_intercg_vsite)
 +                {
 +                    n = dd_make_local_vsites(dd, n, top_local->idef.il);
 +                }
 +                break;
 +            case ddnatCON:
 +                if (dd->bInterCGcons || dd->bInterCGsettles)
 +                {
 +                    /* Only for inter-cg constraints do we need special code */
 +                    n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
 +                                                  constr, ir->nProjOrder,
 +                                                  top_local->idef.il);
 +                }
 +                break;
 +            default:
 +                gmx_incons("Unknown special atom type setup");
 +        }
 +        comm->nat[i] = n;
 +    }
 +
 +    wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
 +
 +    wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
 +
 +    /* Make space for the extra coordinates for virtual site
 +     * or constraint communication.
 +     */
 +    state_local->natoms = comm->nat[ddnatNR-1];
 +    if (state_local->natoms > state_local->nalloc)
 +    {
 +        dd_realloc_state(state_local, f, state_local->natoms);
 +    }
 +
 +    if (fr->bF_NoVirSum)
 +    {
 +        if (vsite && vsite->n_intercg_vsite)
 +        {
 +            nat_f_novirsum = comm->nat[ddnatVSITE];
 +        }
 +        else
 +        {
 +            if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
 +            {
 +                nat_f_novirsum = dd->nat_tot;
 +            }
 +            else
 +            {
 +                nat_f_novirsum = dd->nat_home;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        nat_f_novirsum = 0;
 +    }
 +
 +    /* Set the number of atoms required for the force calculation.
 +     * Forces need to be constrained when using a twin-range setup
 +     * or with energy minimization. For simple simulations we could
 +     * avoid some allocation, zeroing and copying, but this is
 +     * probably not worth the complications and checking.
 +     */
 +    forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
 +                        dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);
 +
 +    /* We make all mdatoms up to nat_tot_con.
 +     * We could save some work by only setting invmass
 +     * between nat_tot and nat_tot_con.
 +     */
 +    /* This call also sets the new number of home particles to dd->nat_home */
 +    atoms2md(top_global, ir,
 +             comm->nat[ddnatCON], dd->gatindex, 0, dd->nat_home, mdatoms);
 +
 +    /* Now we have the charges we can sort the FE interactions */
 +    dd_sort_local_top(dd, mdatoms, top_local);
 +
 +    if (vsite != NULL)
 +    {
 +        /* Now we have updated mdatoms, we can do the last vsite bookkeeping */
 +        split_vsites_over_threads(top_local->idef.il, mdatoms, FALSE, vsite);
 +    }
 +
 +    if (shellfc)
 +    {
 +        /* Make the local shell stuff, currently no communication is done */
 +        make_local_shells(cr, mdatoms, shellfc);
 +    }
 +
 +    if (ir->implicit_solvent)
 +    {
 +        make_local_gb(cr, fr->born, ir->gb_algorithm);
 +    }
 +
 +    setup_bonded_threading(fr, &top_local->idef);
 +
 +    if (!(cr->duty & DUTY_PME))
 +    {
 +        /* Send the charges to our PME-only node */
 +        gmx_pme_send_q(cr, mdatoms->nChargePerturbed,
 +                       mdatoms->chargeA, mdatoms->chargeB,
 +                       dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
 +    }
 +
 +    if (constr)
 +    {
 +        set_constraints(constr, top_local, ir, mdatoms, cr);
 +    }
 +
 +    if (ir->ePull != epullNO)
 +    {
 +        /* Update the local pull groups */
 +        dd_make_local_pull_groups(dd, ir->pull, mdatoms);
 +    }
 +
 +    if (ir->bRot)
 +    {
 +        /* Update the local rotation groups */
 +        dd_make_local_rotation_groups(dd, ir->rot);
 +    }
 +
 +
 +    add_dd_statistics(dd);
 +
 +    /* Make sure we only count the cycles for this DD partitioning */
 +    clear_dd_cycle_counts(dd);
 +
 +    /* Because the order of the atoms might have changed since
 +     * the last vsite construction, we need to communicate the constructing
 +     * atom coordinates again (for spreading the forces this MD step).
 +     */
 +    dd_move_x_vsites(dd, state_local->box, state_local->x);
 +
 +    wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
 +
 +    if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
 +    {
 +        dd_move_x(dd, state_local->box, state_local->x);
 +        write_dd_pdb("dd_dump", step, "dump", top_global, cr,
 +                     -1, state_local->x, state_local->box);
 +    }
 +
 +    /* Store the partitioning step */
 +    comm->partition_step = step;
 +
 +    /* Increase the DD partitioning counter */
 +    dd->ddp_count++;
 +    /* The state currently matches this DD partitioning count, store it */
 +    state_local->ddp_count = dd->ddp_count;
 +    if (bMasterState)
 +    {
 +        /* The DD master node knows the complete cg distribution,
 +         * store the count so we can possibly skip the cg info communication.
 +         */
 +        comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
 +    }
 +
 +    if (comm->DD_debug > 0)
 +    {
 +        /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
 +        check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
 +                                "after partitioning");
 +    }
 +}
index b7e6ac8786444c1f2ebd84a20b56c403f15b02d5,0000000000000000000000000000000000000000..9e22d6c653a30ef462f69325f5ab917f80f13d04
mode 100644,000000..100644
--- /dev/null
@@@ -1,633 -1,0 +1,633 @@@
-     int        ti0_array[2*GMX_SIMD_WIDTH_HERE-1], *ti0 = NULL;
-     int        ti1_array[2*GMX_SIMD_WIDTH_HERE-1], *ti1 = NULL;
-     int        ti2_array[2*GMX_SIMD_WIDTH_HERE-1], *ti2 = NULL;
-     int        ti3_array[2*GMX_SIMD_WIDTH_HERE-1], *ti3 = NULL;
 +/*
 + * This file is part of the GROMACS molecular simulation package.
 + *
 + * Copyright (c) 2012,2013, by the GROMACS development team, led by
 + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 + * and including many others, as listed in the AUTHORS file in the
 + * top-level source directory and at http://www.gromacs.org.
 + *
 + * GROMACS is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2.1
 + * of the License, or (at your option) any later version.
 + *
 + * GROMACS is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with GROMACS; if not, see
 + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 + *
 + * If you want to redistribute modifications to GROMACS, please
 + * consider that scientific software is very special. Version
 + * control is crucial - bugs must be traceable. We will be happy to
 + * consider code for inclusion in the official distribution, but
 + * derived work must not be called official GROMACS. Details are found
 + * in the README & COPYING files - if they are missing, get the
 + * official version at http://www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org.
 + */
 +
 +{
 +    const nbnxn_ci_t   *nbln;
 +    const nbnxn_cj_t   *l_cj;
 +    const int          *type;
 +    const real         *q;
 +    const real         *shiftvec;
 +    const real         *x;
 +    const real         *nbfp0, *nbfp1, *nbfp2 = NULL, *nbfp3 = NULL;
 +    real                facel;
 +    real               *nbfp_ptr;
 +    int                 n, ci, ci_sh;
 +    int                 ish, ish3;
 +    gmx_bool            do_LJ, half_LJ, do_coul;
 +    int                 sci, scix, sciy, sciz, sci2;
 +    int                 cjind0, cjind1, cjind;
 +    int                 ip, jp;
 +
 +#ifdef ENERGY_GROUPS
 +    int         Vstride_i;
 +    int         egps_ishift, egps_imask;
 +    int         egps_jshift, egps_jmask, egps_jstride;
 +    int         egps_i;
 +    real       *vvdwtp[UNROLLI];
 +    real       *vctp[UNROLLI];
 +#endif
 +
 +    gmx_mm_pr  shX_S;
 +    gmx_mm_pr  shY_S;
 +    gmx_mm_pr  shZ_S;
 +    gmx_mm_pr  ix_S0, iy_S0, iz_S0;
 +    gmx_mm_pr  ix_S1, iy_S1, iz_S1;
 +    gmx_mm_pr  ix_S2, iy_S2, iz_S2;
 +    gmx_mm_pr  ix_S3, iy_S3, iz_S3;
 +    gmx_mm_pr  fix_S0, fiy_S0, fiz_S0;
 +    gmx_mm_pr  fix_S1, fiy_S1, fiz_S1;
 +    gmx_mm_pr  fix_S2, fiy_S2, fiz_S2;
 +    gmx_mm_pr  fix_S3, fiy_S3, fiz_S3;
 +#if UNROLLJ >= 4
 +    /* We use an i-force SIMD register width of 4 */
 +    gmx_mm_pr4 fix_S, fiy_S, fiz_S;
 +#else
 +    /* We use an i-force SIMD register width of 2 */
 +    gmx_mm_pr  fix0_S, fiy0_S, fiz0_S;
 +    gmx_mm_pr  fix2_S, fiy2_S, fiz2_S;
 +#endif
 +
 +    gmx_mm_pr  diagonal_jmi_S;
 +#if UNROLLI == UNROLLJ
 +    gmx_mm_pb  diagonal_mask_S0, diagonal_mask_S1, diagonal_mask_S2, diagonal_mask_S3;
 +#else
 +    gmx_mm_pb  diagonal_mask0_S0, diagonal_mask0_S1, diagonal_mask0_S2, diagonal_mask0_S3;
 +    gmx_mm_pb  diagonal_mask1_S0, diagonal_mask1_S1, diagonal_mask1_S2, diagonal_mask1_S3;
 +#endif
 +
 +    unsigned      *exclusion_filter;
 +    gmx_exclfilter filter_S0, filter_S1, filter_S2, filter_S3;
 +
 +    gmx_mm_pr      zero_S = gmx_set1_pr(0.0);
 +
 +    gmx_mm_pr      one_S  = gmx_set1_pr(1.0);
 +    gmx_mm_pr      iq_S0  = gmx_setzero_pr();
 +    gmx_mm_pr      iq_S1  = gmx_setzero_pr();
 +    gmx_mm_pr      iq_S2  = gmx_setzero_pr();
 +    gmx_mm_pr      iq_S3  = gmx_setzero_pr();
 +    gmx_mm_pr      mrc_3_S;
 +#ifdef CALC_ENERGIES
 +    gmx_mm_pr      hrc_3_S, moh_rc_S;
 +#endif
 +
 +#ifdef CALC_COUL_TAB
 +    /* Coulomb table variables */
 +    gmx_mm_pr   invtsp_S;
 +    const real *tab_coul_F;
 +#ifndef TAB_FDV0
 +    const real *tab_coul_V;
 +#endif
 +    /* Thread-local working buffers for force and potential lookups */
-     gmx_mm_pr  mhalfsp_S;
++    int         ti0_array[2*GMX_SIMD_WIDTH_HERE], *ti0 = NULL;
++    int         ti1_array[2*GMX_SIMD_WIDTH_HERE], *ti1 = NULL;
++    int         ti2_array[2*GMX_SIMD_WIDTH_HERE], *ti2 = NULL;
++    int         ti3_array[2*GMX_SIMD_WIDTH_HERE], *ti3 = NULL;
 +#ifdef CALC_ENERGIES
++    gmx_mm_pr   mhalfsp_S;
 +#endif
 +#endif
 +
 +#ifdef CALC_COUL_EWALD
 +    gmx_mm_pr beta2_S, beta_S;
 +#endif
 +
 +#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
 +    gmx_mm_pr  sh_ewald_S;
 +#endif
 +
 +#ifdef LJ_COMB_LB
 +    const real *ljc;
 +
 +    gmx_mm_pr   hsig_i_S0, seps_i_S0;
 +    gmx_mm_pr   hsig_i_S1, seps_i_S1;
 +    gmx_mm_pr   hsig_i_S2, seps_i_S2;
 +    gmx_mm_pr   hsig_i_S3, seps_i_S3;
 +#else
 +#ifdef FIX_LJ_C
 +    real        pvdw_array[2*UNROLLI*UNROLLJ+3];
 +    real       *pvdw_c6, *pvdw_c12;
 +    gmx_mm_pr   c6_S0, c12_S0;
 +    gmx_mm_pr   c6_S1, c12_S1;
 +    gmx_mm_pr   c6_S2, c12_S2;
 +    gmx_mm_pr   c6_S3, c12_S3;
 +#endif
 +
 +#ifdef LJ_COMB_GEOM
 +    const real *ljc;
 +
 +    gmx_mm_pr   c6s_S0, c12s_S0;
 +    gmx_mm_pr   c6s_S1, c12s_S1;
 +    gmx_mm_pr   c6s_S2 = gmx_setzero_pr(), c12s_S2 = gmx_setzero_pr();
 +    gmx_mm_pr   c6s_S3 = gmx_setzero_pr(), c12s_S3 = gmx_setzero_pr();
 +#endif
 +#endif /* LJ_COMB_LB */
 +
 +    gmx_mm_pr  vctot_S, Vvdwtot_S;
 +    gmx_mm_pr  sixth_S, twelveth_S;
 +
 +    gmx_mm_pr  avoid_sing_S;
 +    gmx_mm_pr  rc2_S;
 +#ifdef VDW_CUTOFF_CHECK
 +    gmx_mm_pr  rcvdw2_S;
 +#endif
 +
 +#ifdef CALC_ENERGIES
 +    gmx_mm_pr  sh_invrc6_S, sh_invrc12_S;
 +
 +    /* cppcheck-suppress unassignedVariable */
 +    real       tmpsum_array[15], *tmpsum;
 +#endif
 +#ifdef CALC_SHIFTFORCES
 +    /* cppcheck-suppress unassignedVariable */
 +    real       shf_array[15], *shf;
 +#endif
 +
 +    int ninner;
 +
 +#ifdef COUNT_PAIRS
 +    int npair = 0;
 +#endif
 +
 +#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
 +    ljc = nbat->lj_comb;
 +#else
 +    /* No combination rule used */
 +    nbfp_ptr    = (4 == nbfp_stride) ? nbat->nbfp_s4 : nbat->nbfp;
 +#endif
 +
 +    /* Load j-i for the first i */
 +    diagonal_jmi_S    = gmx_load_pr(nbat->simd_4xn_diagonal_j_minus_i);
 +    /* Generate all the diagonal masks as comparison results */
 +#if UNROLLI == UNROLLJ
 +    diagonal_mask_S0  = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 +    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 +    diagonal_mask_S1  = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 +    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 +    diagonal_mask_S2  = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 +    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 +    diagonal_mask_S3  = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 +#else
 +#if UNROLLI == 2*UNROLLJ || 2*UNROLLI == UNROLLJ
 +    diagonal_mask0_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 +    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 +    diagonal_mask0_S1 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 +    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 +    diagonal_mask0_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 +    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 +    diagonal_mask0_S3 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 +    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 +
 +#if UNROLLI == 2*UNROLLJ
 +    /* Load j-i for the second half of the j-cluster */
 +    diagonal_jmi_S    = gmx_load_pr(nbat->simd_4xn_diagonal_j_minus_i + UNROLLJ);
 +#endif
 +
 +    diagonal_mask1_S0 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 +    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 +    diagonal_mask1_S1 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 +    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 +    diagonal_mask1_S2 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 +    diagonal_jmi_S    = gmx_sub_pr(diagonal_jmi_S, one_S);
 +    diagonal_mask1_S3 = gmx_cmplt_pr(zero_S, diagonal_jmi_S);
 +#endif
 +#endif
 +
 +    /* Load masks for topology exclusion masking. filter_stride is
 +       static const, so the conditional will be optimized away. */
 +    if (1 == filter_stride)
 +    {
 +        exclusion_filter = nbat->simd_exclusion_filter1;
 +    }
 +    else /* (2 == filter_stride) */
 +    {
 +        exclusion_filter = nbat->simd_exclusion_filter2;
 +    }
 +
 +    /* Here we cast the exclusion filters from unsigned * to int * or real *.
 +     * Since we only check bits, the actual value they represent does not
 +     * matter, as long as both filter and mask data are treated the same way.
 +     */
 +    filter_S0    = gmx_load_exclusion_filter(exclusion_filter + 0*UNROLLJ*filter_stride);
 +    filter_S1    = gmx_load_exclusion_filter(exclusion_filter + 1*UNROLLJ*filter_stride);
 +    filter_S2    = gmx_load_exclusion_filter(exclusion_filter + 2*UNROLLJ*filter_stride);
 +    filter_S3    = gmx_load_exclusion_filter(exclusion_filter + 3*UNROLLJ*filter_stride);
 +
 +#ifdef CALC_COUL_TAB
 +    /* Generate aligned table index pointers */
 +    ti0 = prepare_table_load_buffer(ti0_array);
 +    ti1 = prepare_table_load_buffer(ti1_array);
 +    ti2 = prepare_table_load_buffer(ti2_array);
 +    ti3 = prepare_table_load_buffer(ti3_array);
 +
 +    invtsp_S  = gmx_set1_pr(ic->tabq_scale);
 +#ifdef CALC_ENERGIES
 +    mhalfsp_S = gmx_set1_pr(-0.5/ic->tabq_scale);
 +#endif
 +
 +#ifdef TAB_FDV0
 +    tab_coul_F = ic->tabq_coul_FDV0;
 +#else
 +    tab_coul_F = ic->tabq_coul_F;
 +    tab_coul_V = ic->tabq_coul_V;
 +#endif
 +#endif /* CALC_COUL_TAB */
 +
 +#ifdef CALC_COUL_EWALD
 +    beta2_S = gmx_set1_pr(ic->ewaldcoeff*ic->ewaldcoeff);
 +    beta_S  = gmx_set1_pr(ic->ewaldcoeff);
 +#endif
 +
 +#if (defined CALC_COUL_TAB || defined CALC_COUL_EWALD) && defined CALC_ENERGIES
 +    sh_ewald_S = gmx_set1_pr(ic->sh_ewald);
 +#endif
 +
 +    q                   = nbat->q;
 +    type                = nbat->type;
 +    facel               = ic->epsfac;
 +    shiftvec            = shift_vec[0];
 +    x                   = nbat->x;
 +
 +    avoid_sing_S = gmx_set1_pr(NBNXN_AVOID_SING_R2_INC);
 +
 +    /* The kernel either supports rcoulomb = rvdw or rcoulomb >= rvdw */
 +    rc2_S    = gmx_set1_pr(ic->rcoulomb*ic->rcoulomb);
 +#ifdef VDW_CUTOFF_CHECK
 +    rcvdw2_S = gmx_set1_pr(ic->rvdw*ic->rvdw);
 +#endif
 +
 +#ifdef CALC_ENERGIES
 +    sixth_S      = gmx_set1_pr(1.0/6.0);
 +    twelveth_S   = gmx_set1_pr(1.0/12.0);
 +
 +    sh_invrc6_S  = gmx_set1_pr(ic->sh_invrc6);
 +    sh_invrc12_S = gmx_set1_pr(ic->sh_invrc6*ic->sh_invrc6);
 +#endif
 +
 +    mrc_3_S  = gmx_set1_pr(-2*ic->k_rf);
 +
 +#ifdef CALC_ENERGIES
 +    hrc_3_S  = gmx_set1_pr(ic->k_rf);
 +
 +    moh_rc_S = gmx_set1_pr(-ic->c_rf);
 +#endif
 +
 +#ifdef CALC_ENERGIES
 +    tmpsum   = gmx_simd_align_real(tmpsum_array);
 +#endif
 +#ifdef CALC_SHIFTFORCES
 +    shf      = gmx_simd_align_real(shf_array);
 +#endif
 +
 +#ifdef FIX_LJ_C
 +    pvdw_c6  = gmx_simd_align_real(pvdw_array+3);
 +    pvdw_c12 = pvdw_c6 + UNROLLI*UNROLLJ;
 +
 +    for (jp = 0; jp < UNROLLJ; jp++)
 +    {
 +        pvdw_c6 [0*UNROLLJ+jp] = nbat->nbfp[0*2];
 +        pvdw_c6 [1*UNROLLJ+jp] = nbat->nbfp[0*2];
 +        pvdw_c6 [2*UNROLLJ+jp] = nbat->nbfp[0*2];
 +        pvdw_c6 [3*UNROLLJ+jp] = nbat->nbfp[0*2];
 +
 +        pvdw_c12[0*UNROLLJ+jp] = nbat->nbfp[0*2+1];
 +        pvdw_c12[1*UNROLLJ+jp] = nbat->nbfp[0*2+1];
 +        pvdw_c12[2*UNROLLJ+jp] = nbat->nbfp[0*2+1];
 +        pvdw_c12[3*UNROLLJ+jp] = nbat->nbfp[0*2+1];
 +    }
 +    c6_S0            = gmx_load_pr(pvdw_c6 +0*UNROLLJ);
 +    c6_S1            = gmx_load_pr(pvdw_c6 +1*UNROLLJ);
 +    c6_S2            = gmx_load_pr(pvdw_c6 +2*UNROLLJ);
 +    c6_S3            = gmx_load_pr(pvdw_c6 +3*UNROLLJ);
 +
 +    c12_S0           = gmx_load_pr(pvdw_c12+0*UNROLLJ);
 +    c12_S1           = gmx_load_pr(pvdw_c12+1*UNROLLJ);
 +    c12_S2           = gmx_load_pr(pvdw_c12+2*UNROLLJ);
 +    c12_S3           = gmx_load_pr(pvdw_c12+3*UNROLLJ);
 +#endif /* FIX_LJ_C */
 +
 +#ifdef ENERGY_GROUPS
 +    egps_ishift  = nbat->neg_2log;
 +    egps_imask   = (1<<egps_ishift) - 1;
 +    egps_jshift  = 2*nbat->neg_2log;
 +    egps_jmask   = (1<<egps_jshift) - 1;
 +    egps_jstride = (UNROLLJ>>1)*UNROLLJ;
 +    /* Major division is over i-particle energy groups, determine the stride */
 +    Vstride_i    = nbat->nenergrp*(1<<nbat->neg_2log)*egps_jstride;
 +#endif
 +
 +    l_cj = nbl->cj;
 +
 +    ninner = 0;
 +    for (n = 0; n < nbl->nci; n++)
 +    {
 +        nbln = &nbl->ci[n];
 +
 +        ish              = (nbln->shift & NBNXN_CI_SHIFT);
 +        ish3             = ish*3;
 +        cjind0           = nbln->cj_ind_start;
 +        cjind1           = nbln->cj_ind_end;
 +        ci               = nbln->ci;
 +        ci_sh            = (ish == CENTRAL ? ci : -1);
 +
 +        shX_S = gmx_load1_pr(shiftvec+ish3);
 +        shY_S = gmx_load1_pr(shiftvec+ish3+1);
 +        shZ_S = gmx_load1_pr(shiftvec+ish3+2);
 +
 +#if UNROLLJ <= 4
 +        sci              = ci*STRIDE;
 +        scix             = sci*DIM;
 +        sci2             = sci*2;
 +#else
 +        sci              = (ci>>1)*STRIDE;
 +        scix             = sci*DIM + (ci & 1)*(STRIDE>>1);
 +        sci2             = sci*2 + (ci & 1)*(STRIDE>>1);
 +        sci             += (ci & 1)*(STRIDE>>1);
 +#endif
 +
 +        /* We have 5 LJ/C combinations, but use only three inner loops,
 +         * as the other combinations are unlikely and/or not much faster:
 +         * inner half-LJ + C for half-LJ + C / no-LJ + C
 +         * inner LJ + C      for full-LJ + C
 +         * inner LJ          for full-LJ + no-C / half-LJ + no-C
 +         */
 +        do_LJ   = (nbln->shift & NBNXN_CI_DO_LJ(0));
 +        do_coul = (nbln->shift & NBNXN_CI_DO_COUL(0));
 +        half_LJ = ((nbln->shift & NBNXN_CI_HALF_LJ(0)) || !do_LJ) && do_coul;
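 +        /* With half_LJ only the first half of the i atoms have LJ
 +         * interactions, e.g. water with LJ on the oxygen only.
 +         */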
 +
 +#ifdef ENERGY_GROUPS
 +        egps_i = nbat->energrp[ci];
 +        {
 +            int ia, egp_ia;
 +
 +            for (ia = 0; ia < UNROLLI; ia++)
 +            {
 +                egp_ia     = (egps_i >> (ia*egps_ishift)) & egps_imask;
 +                vvdwtp[ia] = Vvdw + egp_ia*Vstride_i;
 +                vctp[ia]   = Vc   + egp_ia*Vstride_i;
 +            }
 +        }
 +#endif
 +#if defined CALC_ENERGIES
 +#if UNROLLJ == 4
 +        if (do_coul && l_cj[nbln->cj_ind_start].cj == ci_sh)
 +#endif
 +#if UNROLLJ == 2
 +        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh<<1))
 +#endif
 +#if UNROLLJ == 8
 +        if (do_coul && l_cj[nbln->cj_ind_start].cj == (ci_sh>>1))
 +#endif
 +        {
 +            int  ia;
 +            real Vc_sub_self;
 +
 +#ifdef CALC_COUL_RF
 +            Vc_sub_self = 0.5*ic->c_rf;
 +#endif
 +#ifdef CALC_COUL_TAB
 +#ifdef TAB_FDV0
 +            Vc_sub_self = 0.5*tab_coul_F[2];
 +#else
 +            Vc_sub_self = 0.5*tab_coul_V[0];
 +#endif
 +#endif
 +#ifdef CALC_COUL_EWALD
 +            /* beta/sqrt(pi) */
 +            Vc_sub_self = 0.5*ic->ewaldcoeff*M_2_SQRTPI;
 +#endif
 +
 +            for (ia = 0; ia < UNROLLI; ia++)
 +            {
 +                real qi;
 +
 +                qi = q[sci+ia];
 +#ifdef ENERGY_GROUPS
 +                vctp[ia][((egps_i>>(ia*egps_ishift)) & egps_imask)*egps_jstride]
 +#else
 +                Vc[0]
 +#endif
 +                    -= facel*qi*qi*Vc_sub_self;
 +            }
 +        }
 +#endif
 +
 +        /* Load i atom data */
 +        sciy             = scix + STRIDE;
 +        sciz             = sciy + STRIDE;
 +        ix_S0            = gmx_add_pr(gmx_load1_pr(x+scix), shX_S);
 +        ix_S1            = gmx_add_pr(gmx_load1_pr(x+scix+1), shX_S);
 +        ix_S2            = gmx_add_pr(gmx_load1_pr(x+scix+2), shX_S);
 +        ix_S3            = gmx_add_pr(gmx_load1_pr(x+scix+3), shX_S);
 +        iy_S0            = gmx_add_pr(gmx_load1_pr(x+sciy), shY_S);
 +        iy_S1            = gmx_add_pr(gmx_load1_pr(x+sciy+1), shY_S);
 +        iy_S2            = gmx_add_pr(gmx_load1_pr(x+sciy+2), shY_S);
 +        iy_S3            = gmx_add_pr(gmx_load1_pr(x+sciy+3), shY_S);
 +        iz_S0            = gmx_add_pr(gmx_load1_pr(x+sciz), shZ_S);
 +        iz_S1            = gmx_add_pr(gmx_load1_pr(x+sciz+1), shZ_S);
 +        iz_S2            = gmx_add_pr(gmx_load1_pr(x+sciz+2), shZ_S);
 +        iz_S3            = gmx_add_pr(gmx_load1_pr(x+sciz+3), shZ_S);
 +
 +        if (do_coul)
 +        {
 +            iq_S0      = gmx_set1_pr(facel*q[sci]);
 +            iq_S1      = gmx_set1_pr(facel*q[sci+1]);
 +            iq_S2      = gmx_set1_pr(facel*q[sci+2]);
 +            iq_S3      = gmx_set1_pr(facel*q[sci+3]);
 +        }
 +
 +#ifdef LJ_COMB_LB
 +        hsig_i_S0      = gmx_load1_pr(ljc+sci2+0);
 +        hsig_i_S1      = gmx_load1_pr(ljc+sci2+1);
 +        hsig_i_S2      = gmx_load1_pr(ljc+sci2+2);
 +        hsig_i_S3      = gmx_load1_pr(ljc+sci2+3);
 +        seps_i_S0      = gmx_load1_pr(ljc+sci2+STRIDE+0);
 +        seps_i_S1      = gmx_load1_pr(ljc+sci2+STRIDE+1);
 +        seps_i_S2      = gmx_load1_pr(ljc+sci2+STRIDE+2);
 +        seps_i_S3      = gmx_load1_pr(ljc+sci2+STRIDE+3);
 +#else
 +#ifdef LJ_COMB_GEOM
 +        c6s_S0         = gmx_load1_pr(ljc+sci2+0);
 +        c6s_S1         = gmx_load1_pr(ljc+sci2+1);
 +        if (!half_LJ)
 +        {
 +            c6s_S2     = gmx_load1_pr(ljc+sci2+2);
 +            c6s_S3     = gmx_load1_pr(ljc+sci2+3);
 +        }
 +        c12s_S0        = gmx_load1_pr(ljc+sci2+STRIDE+0);
 +        c12s_S1        = gmx_load1_pr(ljc+sci2+STRIDE+1);
 +        if (!half_LJ)
 +        {
 +            c12s_S2    = gmx_load1_pr(ljc+sci2+STRIDE+2);
 +            c12s_S3    = gmx_load1_pr(ljc+sci2+STRIDE+3);
 +        }
 +#else
 +        nbfp0     = nbfp_ptr + type[sci  ]*nbat->ntype*nbfp_stride;
 +        nbfp1     = nbfp_ptr + type[sci+1]*nbat->ntype*nbfp_stride;
 +        if (!half_LJ)
 +        {
 +            nbfp2 = nbfp_ptr + type[sci+2]*nbat->ntype*nbfp_stride;
 +            nbfp3 = nbfp_ptr + type[sci+3]*nbat->ntype*nbfp_stride;
 +        }
 +#endif
 +#endif
 +
 +        /* Zero the potential energy for this list */
 +        Vvdwtot_S        = gmx_setzero_pr();
 +        vctot_S          = gmx_setzero_pr();
 +
 +        /* Clear i atom forces */
 +        fix_S0           = gmx_setzero_pr();
 +        fix_S1           = gmx_setzero_pr();
 +        fix_S2           = gmx_setzero_pr();
 +        fix_S3           = gmx_setzero_pr();
 +        fiy_S0           = gmx_setzero_pr();
 +        fiy_S1           = gmx_setzero_pr();
 +        fiy_S2           = gmx_setzero_pr();
 +        fiy_S3           = gmx_setzero_pr();
 +        fiz_S0           = gmx_setzero_pr();
 +        fiz_S1           = gmx_setzero_pr();
 +        fiz_S2           = gmx_setzero_pr();
 +        fiz_S3           = gmx_setzero_pr();
 +
 +        cjind = cjind0;
 +
 +        /* Currently all kernels use (at least half) LJ */
 +#define CALC_LJ
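 +        /* In each branch the leading j-clusters, which can carry
 +         * topology exclusions, are processed with CHECK_EXCLS
 +         * defined; the remaining clusters then run through a
 +         * cheaper loop without the exclusion test.
 +         */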
 +        if (half_LJ)
 +        {
 +#define CALC_COULOMB
 +#define HALF_LJ
 +#define CHECK_EXCLS
 +            while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
 +            {
 +#include "nbnxn_kernel_simd_4xn_inner.h"
 +                cjind++;
 +            }
 +#undef CHECK_EXCLS
 +            for (; (cjind < cjind1); cjind++)
 +            {
 +#include "nbnxn_kernel_simd_4xn_inner.h"
 +            }
 +#undef HALF_LJ
 +#undef CALC_COULOMB
 +        }
 +        else if (do_coul)
 +        {
 +#define CALC_COULOMB
 +#define CHECK_EXCLS
 +            while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
 +            {
 +#include "nbnxn_kernel_simd_4xn_inner.h"
 +                cjind++;
 +            }
 +#undef CHECK_EXCLS
 +            for (; (cjind < cjind1); cjind++)
 +            {
 +#include "nbnxn_kernel_simd_4xn_inner.h"
 +            }
 +#undef CALC_COULOMB
 +        }
 +        else
 +        {
 +#define CHECK_EXCLS
 +            while (cjind < cjind1 && nbl->cj[cjind].excl != NBNXN_INTERACTION_MASK_ALL)
 +            {
 +#include "nbnxn_kernel_simd_4xn_inner.h"
 +                cjind++;
 +            }
 +#undef CHECK_EXCLS
 +            for (; (cjind < cjind1); cjind++)
 +            {
 +#include "nbnxn_kernel_simd_4xn_inner.h"
 +            }
 +        }
 +#undef CALC_LJ
 +        ninner += cjind1 - cjind0;
 +
 +        /* Add accumulated i-forces to the force array */
 +#if UNROLLJ >= 4
 +        fix_S = gmx_mm_transpose_sum4_pr(fix_S0, fix_S1, fix_S2, fix_S3);
 +        gmx_store_pr4(f+scix, gmx_add_pr4(fix_S, gmx_load_pr4(f+scix)));
 +
 +        fiy_S = gmx_mm_transpose_sum4_pr(fiy_S0, fiy_S1, fiy_S2, fiy_S3);
 +        gmx_store_pr4(f+sciy, gmx_add_pr4(fiy_S, gmx_load_pr4(f+sciy)));
 +
 +        fiz_S = gmx_mm_transpose_sum4_pr(fiz_S0, fiz_S1, fiz_S2, fiz_S3);
 +        gmx_store_pr4(f+sciz, gmx_add_pr4(fiz_S, gmx_load_pr4(f+sciz)));
 +
 +#ifdef CALC_SHIFTFORCES
 +        fshift[ish3+0] += gmx_sum_simd4(fix_S, shf);
 +        fshift[ish3+1] += gmx_sum_simd4(fiy_S, shf);
 +        fshift[ish3+2] += gmx_sum_simd4(fiz_S, shf);
 +#endif
 +#else
 +        fix0_S = gmx_mm_transpose_sum2_pr(fix_S0, fix_S1);
 +        gmx_store_pr(f+scix, gmx_add_pr(fix0_S, gmx_load_pr(f+scix)));
 +        fix2_S = gmx_mm_transpose_sum2_pr(fix_S2, fix_S3);
 +        gmx_store_pr(f+scix+2, gmx_add_pr(fix2_S, gmx_load_pr(f+scix+2)));
 +
 +        fiy0_S = gmx_mm_transpose_sum2_pr(fiy_S0, fiy_S1);
 +        gmx_store_pr(f+sciy, gmx_add_pr(fiy0_S, gmx_load_pr(f+sciy)));
 +        fiy2_S = gmx_mm_transpose_sum2_pr(fiy_S2, fiy_S3);
 +        gmx_store_pr(f+sciy+2, gmx_add_pr(fiy2_S, gmx_load_pr(f+sciy+2)));
 +
 +        fiz0_S = gmx_mm_transpose_sum2_pr(fiz_S0, fiz_S1);
 +        gmx_store_pr(f+sciz, gmx_add_pr(fiz0_S, gmx_load_pr(f+sciz)));
 +        fiz2_S = gmx_mm_transpose_sum2_pr(fiz_S2, fiz_S3);
 +        gmx_store_pr(f+sciz+2, gmx_add_pr(fiz2_S, gmx_load_pr(f+sciz+2)));
 +
 +#ifdef CALC_SHIFTFORCES
 +        fshift[ish3+0] += gmx_sum_simd2(gmx_add_pr(fix0_S, fix2_S), shf);
 +        fshift[ish3+1] += gmx_sum_simd2(gmx_add_pr(fiy0_S, fiy2_S), shf);
 +        fshift[ish3+2] += gmx_sum_simd2(gmx_add_pr(fiz0_S, fiz2_S), shf);
 +#endif
 +#endif
 +
 +#ifdef CALC_ENERGIES
 +        if (do_coul)
 +        {
 +            *Vc += gmx_sum_simd(vctot_S, tmpsum);
 +        }
 +
 +        *Vvdw += gmx_sum_simd(Vvdwtot_S, tmpsum);
 +#endif
 +
 +        /* Outer loop uses 6 flops/iteration */
 +    }
 +
 +#ifdef COUNT_PAIRS
 +    printf("atom pairs %d\n", npair);
 +#endif
 +}
index f43fa0fb34cbb8ed8f304aa0190fdb53f1909190,0000000000000000000000000000000000000000..6d0e85df592da48876de61975e5affb18f6dc79e
mode 100644,000000..100644
--- /dev/null
@@@ -1,1767 -1,0 +1,1772 @@@
 +/*
 + * This file is part of the GROMACS molecular simulation package.
 + *
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team.
 + * Copyright (c) 2011,2012,2013, by the GROMACS development team, led by
 + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 + * and including many others, as listed in the AUTHORS file in the
 + * top-level source directory and at http://www.gromacs.org.
 + *
 + * GROMACS is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2.1
 + * of the License, or (at your option) any later version.
 + *
 + * GROMACS is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 + * Lesser General Public License for more details.
 + *
 + * You should have received a copy of the GNU Lesser General Public
 + * License along with GROMACS; if not, see
 + * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 + *
 + * If you want to redistribute modifications to GROMACS, please
 + * consider that scientific software is very special. Version
 + * control is crucial - bugs must be traceable. We will be happy to
 + * consider code for inclusion in the official distribution, but
 + * derived work must not be called official GROMACS. Details are found
 + * in the README & COPYING files - if they are missing, get the
 + * official version at http://www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org.
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +#include <signal.h>
 +#include <stdlib.h>
 +#ifdef HAVE_UNISTD_H
 +#include <unistd.h>
 +#endif
 +#include <string.h>
 +#include <assert.h>
 +
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "sysstuff.h"
 +#include "statutil.h"
 +#include "force.h"
 +#include "mdrun.h"
 +#include "md_logging.h"
 +#include "md_support.h"
 +#include "network.h"
 +#include "pull.h"
 +#include "pull_rotation.h"
 +#include "names.h"
 +#include "disre.h"
 +#include "orires.h"
 +#include "pme.h"
 +#include "mdatoms.h"
 +#include "repl_ex.h"
 +#include "qmmm.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "coulomb.h"
 +#include "constr.h"
 +#include "mvdata.h"
 +#include "checkpoint.h"
 +#include "mtop_util.h"
 +#include "sighandler.h"
 +#include "txtdump.h"
 +#include "gmx_detect_hardware.h"
 +#include "gmx_omp_nthreads.h"
 +#include "pull_rotation.h"
 +#include "calc_verletbuf.h"
 +#include "gmx_fatal_collective.h"
 +#include "membed.h"
 +#include "macros.h"
 +#include "gmx_thread_affinity.h"
 +
 +#include "gromacs/fileio/tpxio.h"
 +#include "gromacs/mdlib/nbnxn_search.h"
 +#include "gromacs/mdlib/nbnxn_consts.h"
 +#include "gromacs/timing/wallcycle.h"
 +#include "gromacs/utility/gmxmpi.h"
 +#include "gromacs/utility/gmxomp.h"
 +
 +#ifdef GMX_FAHCORE
 +#include "corewrap.h"
 +#endif
 +
 +#include "gpu_utils.h"
 +#include "nbnxn_cuda_data_mgmt.h"
 +
 +typedef struct {
 +    gmx_integrator_t *func;
 +} gmx_intp_t;
 +
 +/* The array should match the eI array in include/types/enums.h */
 +const gmx_intp_t    integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md}, {do_md}};
 +
 +gmx_large_int_t     deform_init_init_step_tpx;
 +matrix              deform_init_box_tpx;
 +tMPI_Thread_mutex_t deform_init_box_mutex = TMPI_THREAD_MUTEX_INITIALIZER;
 +
 +
 +#ifdef GMX_THREAD_MPI
 +struct mdrunner_arglist
 +{
 +    gmx_hw_opt_t    hw_opt;
 +    FILE           *fplog;
 +    t_commrec      *cr;
 +    int             nfile;
 +    const t_filenm *fnm;
 +    output_env_t    oenv;
 +    gmx_bool        bVerbose;
 +    gmx_bool        bCompact;
 +    int             nstglobalcomm;
 +    ivec            ddxyz;
 +    int             dd_node_order;
 +    real            rdd;
 +    real            rconstr;
 +    const char     *dddlb_opt;
 +    real            dlb_scale;
 +    const char     *ddcsx;
 +    const char     *ddcsy;
 +    const char     *ddcsz;
 +    const char     *nbpu_opt;
 +    gmx_large_int_t nsteps_cmdline;
 +    int             nstepout;
 +    int             resetstep;
 +    int             nmultisim;
 +    int             repl_ex_nst;
 +    int             repl_ex_nex;
 +    int             repl_ex_seed;
 +    real            pforce;
 +    real            cpt_period;
 +    real            max_hours;
 +    const char     *deviceOptions;
 +    unsigned long   Flags;
 +    int             ret; /* return value */
 +};
 +
 +
 +/* The function used for spawning threads. Extracts the mdrunner()
 +   arguments from its one argument and calls mdrunner(), after making
 +   a commrec. */
 +static void mdrunner_start_fn(void *arg)
 +{
 +    struct mdrunner_arglist *mda = (struct mdrunner_arglist*)arg;
 +    struct mdrunner_arglist  mc  = *mda; /* copy the arg list to make sure
 +                                            that it's thread-local. This doesn't
 +                                            copy pointed-to items, of course,
 +                                            but those are all const. */
 +    t_commrec *cr;                       /* we need a local version of this */
 +    FILE      *fplog = NULL;
 +    t_filenm  *fnm;
 +
 +    fnm = dup_tfn(mc.nfile, mc.fnm);
 +
 +    cr = reinitialize_commrec_for_this_thread(mc.cr);
 +
 +    if (MASTER(cr))
 +    {
 +        fplog = mc.fplog;
 +    }
 +
 +    mda->ret = mdrunner(&mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv,
 +                        mc.bVerbose, mc.bCompact, mc.nstglobalcomm,
 +                        mc.ddxyz, mc.dd_node_order, mc.rdd,
 +                        mc.rconstr, mc.dddlb_opt, mc.dlb_scale,
 +                        mc.ddcsx, mc.ddcsy, mc.ddcsz,
 +                        mc.nbpu_opt,
 +                        mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
 +                        mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce,
 +                        mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.Flags);
 +}
 +
 +/* called by mdrunner() to start a specific number of threads (including
 +   the main thread) for thread-parallel runs. This in turn calls mdrunner()
 +   for each thread.
 +   All options besides nthreads are the same as for mdrunner(). */
 +static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
 +                                         FILE *fplog, t_commrec *cr, int nfile,
 +                                         const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
 +                                         gmx_bool bCompact, int nstglobalcomm,
 +                                         ivec ddxyz, int dd_node_order, real rdd, real rconstr,
 +                                         const char *dddlb_opt, real dlb_scale,
 +                                         const char *ddcsx, const char *ddcsy, const char *ddcsz,
 +                                         const char *nbpu_opt,
 +                                         gmx_large_int_t nsteps_cmdline,
 +                                         int nstepout, int resetstep,
 +                                         int nmultisim, int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
 +                                         real pforce, real cpt_period, real max_hours,
 +                                         const char *deviceOptions, unsigned long Flags)
 +{
 +    int                      ret;
 +    struct mdrunner_arglist *mda;
 +    t_commrec               *crn; /* the new commrec */
 +    t_filenm                *fnmn;
 +
 +    /* first check whether we even need to start tMPI */
 +    if (hw_opt->nthreads_tmpi < 2)
 +    {
 +        return cr;
 +    }
 +
 +    /* a few small, one-time, almost unavoidable memory leaks: */
 +    snew(mda, 1);
 +    fnmn = dup_tfn(nfile, fnm);
 +
 +    /* fill the data structure to pass as void pointer to thread start fn */
 +    /* hw_opt contains pointers, which should all be NULL at this stage */
 +    mda->hw_opt         = *hw_opt;
 +    mda->fplog          = fplog;
 +    mda->cr             = cr;
 +    mda->nfile          = nfile;
 +    mda->fnm            = fnmn;
 +    mda->oenv           = oenv;
 +    mda->bVerbose       = bVerbose;
 +    mda->bCompact       = bCompact;
 +    mda->nstglobalcomm  = nstglobalcomm;
 +    mda->ddxyz[XX]      = ddxyz[XX];
 +    mda->ddxyz[YY]      = ddxyz[YY];
 +    mda->ddxyz[ZZ]      = ddxyz[ZZ];
 +    mda->dd_node_order  = dd_node_order;
 +    mda->rdd            = rdd;
 +    mda->rconstr        = rconstr;
 +    mda->dddlb_opt      = dddlb_opt;
 +    mda->dlb_scale      = dlb_scale;
 +    mda->ddcsx          = ddcsx;
 +    mda->ddcsy          = ddcsy;
 +    mda->ddcsz          = ddcsz;
 +    mda->nbpu_opt       = nbpu_opt;
 +    mda->nsteps_cmdline = nsteps_cmdline;
 +    mda->nstepout       = nstepout;
 +    mda->resetstep      = resetstep;
 +    mda->nmultisim      = nmultisim;
 +    mda->repl_ex_nst    = repl_ex_nst;
 +    mda->repl_ex_nex    = repl_ex_nex;
 +    mda->repl_ex_seed   = repl_ex_seed;
 +    mda->pforce         = pforce;
 +    mda->cpt_period     = cpt_period;
 +    mda->max_hours      = max_hours;
 +    mda->deviceOptions  = deviceOptions;
 +    mda->Flags          = Flags;
 +
 +    /* Now spawn new threads that start mdrunner_start_fn() while the
 +       main thread returns; we set thread affinity later. */
 +    ret = tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi, TMPI_AFFINITY_NONE,
 +                       mdrunner_start_fn, (void*)(mda) );
 +    if (ret != TMPI_SUCCESS)
 +    {
 +        return NULL;
 +    }
 +
 +    crn = reinitialize_commrec_for_this_thread(cr);
 +    return crn;
 +}
 +
 +
 +static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
 +                                        const gmx_hw_opt_t  *hw_opt,
 +                                        int                  nthreads_tot,
 +                                        int                  ngpu)
 +{
 +    int nthreads_tmpi;
 +
 +    /* There are no separate PME nodes here: check_and_update_hw_opt
 +     * ensured that nthreads_tmpi>0 when separate PME nodes are requested,
 +     * and in that case a conditional prevents us from reaching this code.
 +     * Note that separate PME nodes might be switched on later.
 +     */
 +    if (ngpu > 0)
 +    {
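 +        /* With GPUs we start one thread-MPI rank per compatible GPU;
 +         * e.g. on a 16-core node with 2 GPUs this gives 2 ranks, each
 +         * presumably running 8 OpenMP threads (set up later).
 +         */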
 +        nthreads_tmpi = ngpu;
 +        if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
 +        {
 +            nthreads_tmpi = nthreads_tot;
 +        }
 +    }
 +    else if (hw_opt->nthreads_omp > 0)
 +    {
 +        /* Here we could oversubscribe; when we do, we issue a warning later */
 +        nthreads_tmpi = max(1, nthreads_tot/hw_opt->nthreads_omp);
 +    }
 +    else
 +    {
 +        /* TODO choose nthreads_omp based on hardware topology
 +           when we have a hardware topology detection library */
 +        /* In general, when running up to 4 threads, OpenMP should be faster.
 +         * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
 +         * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
 +         * even on two CPUs it's usually faster (but with many OpenMP threads
 +         * it could be faster not to use HT; currently we always use HT).
 +         * On Nehalem/Westmere we want to avoid running 16 threads over
 +         * two CPUs with HT, so we need a limit<16; thus we use 12.
 +         * A reasonable limit for Intel Sandy and Ivy bridge,
 +         * not knowing the topology, is 16 threads.
 +         */
 +        const int nthreads_omp_always_faster             =  4;
 +        const int nthreads_omp_always_faster_Nehalem     = 12;
 +        const int nthreads_omp_always_faster_SandyBridge = 16;
 +        const int first_model_Nehalem                    = 0x1A;
 +        const int first_model_SandyBridge                = 0x2A;
 +        gmx_bool  bIntel_Family6;
 +
 +        bIntel_Family6 =
 +            (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
 +             gmx_cpuid_family(hwinfo->cpuid_info) == 6);
 +
 +        if (nthreads_tot <= nthreads_omp_always_faster ||
 +            (bIntel_Family6 &&
 +             ((gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_Nehalem && nthreads_tot <= nthreads_omp_always_faster_Nehalem) ||
 +              (gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_SandyBridge && nthreads_tot <= nthreads_omp_always_faster_SandyBridge))))
 +        {
 +            /* Use pure OpenMP parallelization */
 +            nthreads_tmpi = 1;
 +        }
 +        else
 +        {
 +            /* Don't use OpenMP parallelization */
 +            nthreads_tmpi = nthreads_tot;
 +        }
 +    }
 +
 +    return nthreads_tmpi;
 +}
 +
 +
 +/* Get the number of threads to use for thread-MPI based on how many
 + * were requested, which algorithms we're using,
 + * and how many particles there are.
 + * At this point we have already called check_and_update_hw_opt.
 + * Thus all options should be internally consistent and consistent
 + * with the hardware, except that ntmpi could be larger than #GPU.
 + */
 +static int get_nthreads_mpi(const gmx_hw_info_t *hwinfo,
 +                            gmx_hw_opt_t *hw_opt,
 +                            t_inputrec *inputrec, gmx_mtop_t *mtop,
 +                            const t_commrec *cr,
 +                            FILE *fplog)
 +{
 +    int      nthreads_hw, nthreads_tot_max, nthreads_tmpi, nthreads_new, ngpu;
 +    int      min_atoms_per_mpi_thread;
 +    char    *env;
 +    char     sbuf[STRLEN];
 +    gmx_bool bCanUseGPU;
 +
 +    if (hw_opt->nthreads_tmpi > 0)
 +    {
 +        /* Trivial, return right away */
 +        return hw_opt->nthreads_tmpi;
 +    }
 +
 +    nthreads_hw = hwinfo->nthreads_hw_avail;
 +
 +    /* How many total (#tMPI*#OpenMP) threads can we start? */
 +    if (hw_opt->nthreads_tot > 0)
 +    {
 +        nthreads_tot_max = hw_opt->nthreads_tot;
 +    }
 +    else
 +    {
 +        nthreads_tot_max = nthreads_hw;
 +    }
 +
 +    bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET &&
 +                  hwinfo->gpu_info.ncuda_dev_compatible > 0);
 +    if (bCanUseGPU)
 +    {
 +        ngpu = hwinfo->gpu_info.ncuda_dev_compatible;
 +    }
 +    else
 +    {
 +        ngpu = 0;
 +    }
 +
 +    if (inputrec->cutoff_scheme == ecutsGROUP)
 +    {
 +        /* We checked this before, but it doesn't hurt to do it once more */
 +        assert(hw_opt->nthreads_omp == 1);
 +    }
 +
 +    nthreads_tmpi =
 +        get_tmpi_omp_thread_division(hwinfo, hw_opt, nthreads_tot_max, ngpu);
 +
 +    if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
 +    {
 +        /* Dims/steps are divided over the nodes instead of splitting the atoms */
 +        min_atoms_per_mpi_thread = 0;
 +    }
 +    else
 +    {
 +        if (bCanUseGPU)
 +        {
 +            min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU;
 +        }
 +        else
 +        {
 +            min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD;
 +        }
 +    }
 +
 +    /* Check if the chosen algorithm does not support parallel simulation. */
 +    if (nthreads_tmpi != 1 &&
 +        ( inputrec->eI == eiLBFGS ||
 +          inputrec->coulombtype == eelEWALD ) )
 +    {
 +        nthreads_tmpi = 1;
 +
 +        md_print_warn(cr, fplog, "The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
 +        if (hw_opt->nthreads_tmpi > nthreads_tmpi)
 +        {
 +            gmx_fatal(FARGS, "You asked for more than 1 thread-MPI thread, but the chosen algorithm doesn't support that");
 +        }
 +    }
 +    else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread)
 +    {
 +        /* the thread number was chosen automatically, but there are too many
 +           threads (too few atoms per thread) */
 +        nthreads_new = max(1, mtop->natoms/min_atoms_per_mpi_thread);
 +
 +        /* Avoid partial use of Hyper-Threading */
 +        if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
 +            nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw)
 +        {
 +            nthreads_new = nthreads_hw/2;
 +        }
 +
 +        /* Avoid large prime numbers in the thread count */
 +        if (nthreads_new >= 6)
 +        {
 +            /* Use only 6,8,10 with additional factors of 2 */
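 +            /* e.g. 7 -> 6, 11 -> 10, 13 -> 12, 22 -> 20 */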
 +            int fac;
 +
 +            fac = 2;
 +            while (3*fac*2 <= nthreads_new)
 +            {
 +                fac *= 2;
 +            }
 +
 +            nthreads_new = (nthreads_new/fac)*fac;
 +        }
 +        else
 +        {
 +            /* Avoid 5 */
 +            if (nthreads_new == 5)
 +            {
 +                nthreads_new = 4;
 +            }
 +        }
 +
 +        nthreads_tmpi = nthreads_new;
 +
 +        fprintf(stderr, "\n");
 +        fprintf(stderr, "NOTE: Parallelization is limited by the small number of atoms,\n");
 +        fprintf(stderr, "      only starting %d thread-MPI threads.\n", nthreads_tmpi);
 +        fprintf(stderr, "      You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
 +    }
 +
 +    return nthreads_tmpi;
 +}
 +#endif /* GMX_THREAD_MPI */
 +
 +
 +/* Environment variable for setting nstlist */
 +static const char*  NSTLIST_ENVVAR          =  "GMX_NSTLIST";
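 +/* e.g. running with GMX_NSTLIST=25 makes the tuning below try only nstlist=25 */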
 +/* Try to increase nstlist when using a GPU with nstlist less than this */
 +static const int    NSTLIST_GPU_ENOUGH      = 20;
 +/* Increase nstlist until the non-bonded cost increases more than this factor */
 +static const float  NBNXN_GPU_LIST_OK_FAC   = 1.20;
 +/* Don't increase nstlist beyond a non-bonded cost increase of this factor.
 + * A standard (protein+)water system at 300K with PME ewald_rtol=1e-5
 + * needs 1.28 at rcoulomb=0.9 and 1.24 at rcoulomb=1.0 to get to nstlist=40.
 + */
 +static const float  NBNXN_GPU_LIST_MAX_FAC  = 1.30;
 +
 +/* Try to increase nstlist when running on a GPU */
 +static void increase_nstlist(FILE *fp, t_commrec *cr,
 +                             t_inputrec *ir, const gmx_mtop_t *mtop, matrix box)
 +{
 +    char                  *env;
 +    int                    nstlist_orig, nstlist_prev;
 +    verletbuf_list_setup_t ls;
 +    real                   rlist_nstlist10, rlist_inc, rlist_ok, rlist_max;
 +    real                   rlist_new, rlist_prev;
 +    int                    i;
 +    t_state                state_tmp;
 +    gmx_bool               bBox, bDD, bCont;
 +    const char            *nstl_fmt = "\nFor optimal performance with a GPU, nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
 +    const char            *vbd_err  = "Cannot increase nstlist for GPU run because verlet-buffer-drift is not set or used";
 +    const char            *box_err  = "Cannot increase nstlist for GPU run because the box is too small";
 +    const char            *dd_err   = "Cannot increase nstlist for GPU run because of domain decomposition limitations";
 +    char                   buf[STRLEN];
 +
 +    /* Alternative nstlist values to try when switching */
 +    const int nstl[] = { 20, 25, 40 };
 +#define NNSTL  (sizeof(nstl)/sizeof(nstl[0]))
 +
 +    env = getenv(NSTLIST_ENVVAR);
 +    if (env == NULL)
 +    {
 +        if (fp != NULL)
 +        {
 +            fprintf(fp, nstl_fmt, ir->nstlist);
 +        }
 +    }
 +
 +    if (ir->verletbuf_drift == 0)
 +    {
 +        gmx_fatal(FARGS, "You are using an old tpr file with a GPU, please generate a new tpr file with an up-to-date version of grompp");
 +    }
 +
 +    if (ir->verletbuf_drift < 0)
 +    {
 +        if (MASTER(cr))
 +        {
 +            fprintf(stderr, "%s\n", vbd_err);
 +        }
 +        if (fp != NULL)
 +        {
 +            fprintf(fp, "%s\n", vbd_err);
 +        }
 +
 +        return;
 +    }
 +
 +    nstlist_orig = ir->nstlist;
 +    if (env != NULL)
 +    {
 +        sprintf(buf, "Getting nstlist from environment variable GMX_NSTLIST=%s", env);
 +        if (MASTER(cr))
 +        {
 +            fprintf(stderr, "%s\n", buf);
 +        }
 +        if (fp != NULL)
 +        {
 +            fprintf(fp, "%s\n", buf);
 +        }
 +        sscanf(env, "%d", &ir->nstlist);
 +    }
 +
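 +    /* TRUE here presumably selects the GPU pair-list setup;
 +     * this tuning is aimed at GPU (or GPU-emulation) runs.
 +     */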
 +    verletbuf_get_list_setup(TRUE, &ls);
 +
 +    /* Allow rlist to make the list a given factor larger than the list
 +     * would be with nstlist=10.
 +     */
 +    nstlist_prev = ir->nstlist;
 +    ir->nstlist  = 10;
 +    calc_verlet_buffer_size(mtop, det(box), ir, ir->verletbuf_drift, &ls,
 +                            NULL, &rlist_nstlist10);
 +    ir->nstlist  = nstlist_prev;
 +
 +    /* Determine the pair list size increase due to zero interactions */
 +    rlist_inc = nbnxn_get_rlist_effective_inc(NBNXN_GPU_CLUSTER_SIZE, mtop->natoms/det(box));
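 +    /* The allowed cost factors act on the effective pair-list volume,
 +     * so they translate into radii through a cube root.
 +     */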
 +    rlist_ok  = (rlist_nstlist10 + rlist_inc)*pow(NBNXN_GPU_LIST_OK_FAC, 1.0/3.0) - rlist_inc;
 +    rlist_max = (rlist_nstlist10 + rlist_inc)*pow(NBNXN_GPU_LIST_MAX_FAC, 1.0/3.0) - rlist_inc;
 +    if (debug)
 +    {
 +        fprintf(debug, "GPU nstlist tuning: rlist_inc %.3f rlist_max %.3f\n",
 +                rlist_inc, rlist_max);
 +    }
 +
 +    i            = 0;
 +    nstlist_prev = nstlist_orig;
 +    rlist_prev   = ir->rlist;
 +    do
 +    {
 +        if (env == NULL)
 +        {
 +            ir->nstlist = nstl[i];
 +        }
 +
 +        /* Set the pair-list buffer size in ir */
 +        calc_verlet_buffer_size(mtop, det(box), ir, ir->verletbuf_drift, &ls,
 +                                NULL, &rlist_new);
 +
 +        /* Does rlist fit in the box? */
 +        bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC, box));
 +        bDD  = TRUE;
 +        if (bBox && DOMAINDECOMP(cr))
 +        {
 +            /* Check if rlist fits in the domain decomposition */
 +            if (inputrec2nboundeddim(ir) < DIM)
 +            {
 +                gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
 +            }
 +            copy_mat(box, state_tmp.box);
 +            bDD = change_dd_cutoff(cr, &state_tmp, ir, rlist_new);
 +        }
 +
 +        bCont = FALSE;
 +
 +        if (env == NULL)
 +        {
 +            if (bBox && bDD && rlist_new <= rlist_max)
 +            {
 +                /* Increase nstlist */
 +                nstlist_prev = ir->nstlist;
 +                rlist_prev   = rlist_new;
 +                bCont        = (i+1 < NNSTL && rlist_new < rlist_ok);
 +            }
 +            else
 +            {
 +                /* Stick with the previous nstlist */
 +                ir->nstlist = nstlist_prev;
 +                rlist_new   = rlist_prev;
 +                bBox        = TRUE;
 +                bDD         = TRUE;
 +            }
 +        }
 +
 +        i++;
 +    }
 +    while (bCont);
 +
 +    if (!bBox || !bDD)
 +    {
 +        gmx_warning(!bBox ? box_err : dd_err);
 +        if (fp != NULL)
 +        {
 +            fprintf(fp, "\n%s\n", bBox ? box_err : dd_err);
 +        }
 +        ir->nstlist = nstlist_orig;
 +    }
 +    else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
 +    {
 +        sprintf(buf, "Changing nstlist from %d to %d, rlist from %g to %g",
 +                nstlist_orig, ir->nstlist,
 +                ir->rlist, rlist_new);
 +        if (MASTER(cr))
 +        {
 +            fprintf(stderr, "%s\n\n", buf);
 +        }
 +        if (fp != NULL)
 +        {
 +            fprintf(fp, "%s\n\n", buf);
 +        }
 +        ir->rlist     = rlist_new;
 +        ir->rlistlong = rlist_new;
 +    }
 +}
 +
 +static void prepare_verlet_scheme(FILE                           *fplog,
 +                                  t_commrec                      *cr,
 +                                  t_inputrec                     *ir,
 +                                  const gmx_mtop_t               *mtop,
 +                                  matrix                          box,
 +                                  gmx_bool                        bUseGPU)
 +{
 +    if (ir->verletbuf_drift > 0)
 +    {
 +        /* Update the Verlet buffer size for the current run setup */
 +        verletbuf_list_setup_t ls;
 +        real                   rlist_new;
 +
 +        /* Here we assume CPU acceleration is on. This is currently OK,
 +         * since calc_verlet_buffer_size gives the same results for 4x8
 +         * and 4x4, while 4x2 gives a larger buffer than 4x4.
 +         */
 +        verletbuf_get_list_setup(bUseGPU, &ls);
 +
 +        calc_verlet_buffer_size(mtop, det(box), ir,
 +                                ir->verletbuf_drift, &ls,
 +                                NULL, &rlist_new);
 +        if (rlist_new != ir->rlist)
 +        {
 +            if (fplog != NULL)
 +            {
 +                fprintf(fplog, "\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
 +                        ir->rlist, rlist_new,
 +                        ls.cluster_size_i, ls.cluster_size_j);
 +            }
 +            ir->rlist     = rlist_new;
 +            ir->rlistlong = rlist_new;
 +        }
 +    }
 +
 +    /* With GPU or emulation we should check nstlist for performance */
 +    if ((EI_DYNAMICS(ir->eI) &&
 +         bUseGPU &&
 +         ir->nstlist < NSTLIST_GPU_ENOUGH) ||
 +        getenv(NSTLIST_ENVVAR) != NULL)
 +    {
 +        /* Choose a better nstlist */
 +        increase_nstlist(fplog, cr, ir, mtop, box);
 +    }
 +}
 +
 +static void convert_to_verlet_scheme(FILE *fplog,
 +                                     t_inputrec *ir,
 +                                     gmx_mtop_t *mtop, real box_vol)
 +{
 +    const char *conv_mesg = "Converting input file with group cut-off scheme to the Verlet cut-off scheme";
 +
 +    md_print_warn(NULL, fplog, "%s\n", conv_mesg);
 +
 +    ir->cutoff_scheme   = ecutsVERLET;
 +    ir->verletbuf_drift = 0.005;
 +
 +    if (ir->rcoulomb != ir->rvdw)
 +    {
 +        gmx_fatal(FARGS, "The VdW and Coulomb cut-offs are different, whereas the Verlet scheme only supports equal cut-offs");
 +    }
 +
 +    if (ir->vdwtype == evdwUSER || EEL_USER(ir->coulombtype))
 +    {
 +        gmx_fatal(FARGS, "User non-bonded potentials are not (yet) supported with the Verlet scheme");
 +    }
 +    else if (EVDW_SWITCHED(ir->vdwtype) || EEL_SWITCHED(ir->coulombtype))
 +    {
 +        md_print_warn(NULL, fplog, "Converting switched or shifted interactions to a shifted potential (without force shift); this will lead to slightly different interaction potentials");
 +
 +        if (EVDW_SWITCHED(ir->vdwtype))
 +        {
 +            ir->vdwtype = evdwCUT;
 +        }
 +        if (EEL_SWITCHED(ir->coulombtype))
 +        {
 +            if (EEL_FULL(ir->coulombtype))
 +            {
 +                /* With full electrostatic only PME can be switched */
 +                ir->coulombtype = eelPME;
 +            }
 +            else
 +            {
 +                md_print_warn(NULL, fplog, "NOTE: Replacing %s electrostatics with reaction-field with epsilon-rf=inf\n", eel_names[ir->coulombtype]);
 +                ir->coulombtype = eelRF;
 +                ir->epsilon_rf  = 0.0;
 +            }
 +        }
 +
 +        /* We set the target energy drift to a small number.
 +         * Note that this is only for testing. For production the user
 +         * should think about this and set the mdp options.
 +         */
 +        ir->verletbuf_drift = 1e-4;
 +    }
 +
 +    if (inputrec2nboundeddim(ir) != 3)
 +    {
 +        gmx_fatal(FARGS, "Can only convert old tpr files to the Verlet cut-off scheme with 3D pbc");
 +    }
 +
 +    if (ir->efep != efepNO || ir->implicit_solvent != eisNO)
 +    {
 +        gmx_fatal(FARGS, "Will not convert old tpr files to the Verlet cut-off scheme with free-energy calculations or implicit solvent");
 +    }
 +
 +    if (EI_DYNAMICS(ir->eI) && !(EI_MD(ir->eI) && ir->etc == etcNO))
 +    {
 +        verletbuf_list_setup_t ls;
 +
 +        verletbuf_get_list_setup(FALSE, &ls);
 +        calc_verlet_buffer_size(mtop, box_vol, ir, ir->verletbuf_drift, &ls,
 +                                NULL, &ir->rlist);
 +    }
 +    else
 +    {
 +        ir->verletbuf_drift = -1;
 +        ir->rlist           = 1.05*max(ir->rvdw, ir->rcoulomb);
 +    }
 +
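 +    /* The Verlet scheme does not use charge groups, so drop them from the topology */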
 +    gmx_mtop_remove_chargegroups(mtop);
 +}
 +
 +static void print_hw_opt(FILE *fp, const gmx_hw_opt_t *hw_opt)
 +{
 +    fprintf(fp, "hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
 +            hw_opt->nthreads_tot,
 +            hw_opt->nthreads_tmpi,
 +            hw_opt->nthreads_omp,
 +            hw_opt->nthreads_omp_pme,
 +            hw_opt->gpu_opt.gpu_id != NULL ? hw_opt->gpu_opt.gpu_id : "");
 +}
 +
 +/* Checks we can do when we don't (yet) know the cut-off scheme */
 +static void check_and_update_hw_opt_1(gmx_hw_opt_t *hw_opt,
 +                                      gmx_bool      bIsSimMaster)
 +{
 +    gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp, bIsSimMaster);
 +
 +#ifndef GMX_THREAD_MPI
 +    if (hw_opt->nthreads_tot > 0)
 +    {
 +        gmx_fatal(FARGS, "Setting the total number of threads is only supported with thread-MPI, but Gromacs was compiled without thread-MPI");
 +    }
 +    if (hw_opt->nthreads_tmpi > 0)
 +    {
 +        gmx_fatal(FARGS, "Setting the number of thread-MPI threads is only supported with thread-MPI, but Gromacs was compiled without thread-MPI");
 +    }
 +#endif
 +
 +#ifndef GMX_OPENMP
 +    if (hw_opt->nthreads_omp > 1)
 +    {
 +        gmx_fatal(FARGS, "More than 1 OpenMP thread requested, but Gromacs was compiled without OpenMP support");
 +    }
 +    hw_opt->nthreads_omp = 1;
 +#endif
 +
 +    if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
 +    {
 +        /* We have the same number of OpenMP threads for PP and PME processes,
 +         * thus we can perform several consistency checks.
 +         */
 +        if (hw_opt->nthreads_tmpi > 0 &&
 +            hw_opt->nthreads_omp > 0 &&
 +            hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
 +        {
 +            gmx_fatal(FARGS, "The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
 +                      hw_opt->nthreads_tot, hw_opt->nthreads_tmpi, hw_opt->nthreads_omp);
 +        }
 +
 +        if (hw_opt->nthreads_tmpi > 0 &&
 +            hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
 +        {
 +            gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
 +                      hw_opt->nthreads_tot, hw_opt->nthreads_tmpi);
 +        }
 +
 +        if (hw_opt->nthreads_omp > 0 &&
 +            hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
 +        {
 +            gmx_fatal(FARGS, "The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
 +                      hw_opt->nthreads_tot, hw_opt->nthreads_omp);
 +        }
 +
 +        if (hw_opt->nthreads_tmpi > 0 &&
 +            hw_opt->nthreads_omp <= 0)
 +        {
 +            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
 +        }
 +    }
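 +    /* Example: with -nt 12 -ntmpi 3 and no -ntomp, the block above
 +     * derives nthreads_omp = 12/3 = 4.
 +     */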
 +
 +#ifndef GMX_OPENMP
 +    if (hw_opt->nthreads_omp > 1)
 +    {
 +        gmx_fatal(FARGS, "OpenMP threads are requested, but Gromacs was compiled without OpenMP support");
 +    }
 +#endif
 +
 +    if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
 +    {
 +        gmx_fatal(FARGS, "You need to specify -ntomp in addition to -ntomp_pme");
 +    }
 +
 +    if (hw_opt->nthreads_tot == 1)
 +    {
 +        hw_opt->nthreads_tmpi = 1;
 +
 +        if (hw_opt->nthreads_omp > 1)
 +        {
 +            gmx_fatal(FARGS, "You requested %d OpenMP threads with %d total threads",
 +                      hw_opt->nthreads_omp, hw_opt->nthreads_tot);
 +        }
 +        hw_opt->nthreads_omp = 1;
 +    }
 +
 +    if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
 +    {
 +        hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
 +    }
 +
 +    /* Parse GPU IDs, if provided.
 +     * We check consistency with the tMPI thread count later.
 +     */
 +    gmx_parse_gpu_ids(&hw_opt->gpu_opt);
 +
 +#ifdef GMX_THREAD_MPI
 +    if (hw_opt->gpu_opt.ncuda_dev_use > 0 && hw_opt->nthreads_tmpi == 0)
 +    {
 +        /* Set the number of MPI threads equal to the number of GPUs */
 +        hw_opt->nthreads_tmpi = hw_opt->gpu_opt.ncuda_dev_use;
 +
 +        if (hw_opt->nthreads_tot > 0 &&
 +            hw_opt->nthreads_tmpi > hw_opt->nthreads_tot)
 +        {
 +            /* We have more GPUs than total threads requested.
 +             * We choose to (later) generate a mismatch error,
 +             * instead of launching more threads than requested.
 +             */
 +            hw_opt->nthreads_tmpi = hw_opt->nthreads_tot;
 +        }
 +    }
 +#endif
 +
 +    if (debug)
 +    {
 +        print_hw_opt(debug, hw_opt);
 +    }
 +}
 +
 +/* Checks we can do when we know the cut-off scheme */
 +static void check_and_update_hw_opt_2(gmx_hw_opt_t *hw_opt,
 +                                      int           cutoff_scheme)
 +{
 +    if (cutoff_scheme == ecutsGROUP)
 +    {
 +        /* We only have OpenMP support for PME only nodes */
 +        if (hw_opt->nthreads_omp > 1)
 +        {
 +            gmx_fatal(FARGS, "OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
 +                      ecutscheme_names[cutoff_scheme],
 +                      ecutscheme_names[ecutsVERLET]);
 +        }
 +        hw_opt->nthreads_omp = 1;
 +    }
 +
 +    if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
 +    {
 +        hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
 +    }
 +
 +    if (debug)
 +    {
 +        print_hw_opt(debug, hw_opt);
 +    }
 +}
 +
 +
 +/* Override the value in inputrec with the value passed on the command line (if any) */
 +static void override_nsteps_cmdline(FILE            *fplog,
 +                                    gmx_large_int_t  nsteps_cmdline,
 +                                    t_inputrec      *ir,
 +                                    const t_commrec *cr)
 +{
 +    char sbuf[STEPSTRSIZE];
 +
 +    assert(ir);
 +    assert(cr);
 +
 +    /* Override with any value other than the default -2 */
 +    if (nsteps_cmdline > -2)
 +    {
 +        char stmp[STRLEN];
 +
 +        ir->nsteps = nsteps_cmdline;
 +        if (EI_DYNAMICS(ir->eI))
 +        {
 +            sprintf(stmp, "Overriding nsteps with value passed on the command line: %s steps, %.3f ps",
 +                    gmx_step_str(nsteps_cmdline, sbuf),
 +                    nsteps_cmdline*ir->delta_t);
 +        }
 +        else
 +        {
 +            sprintf(stmp, "Overriding nsteps with value passed on the command line: %s steps",
 +                    gmx_step_str(nsteps_cmdline, sbuf));
 +        }
 +
 +        md_print_warn(cr, fplog, "%s\n", stmp);
 +    }
 +}
 +
 +/* Frees GPU memory and destroys the CUDA context.
 + *
 + * This is called on all ranks, but only PP ranks that use a GPU free
 + * resources; with thread-MPI those ranks synchronize through a barrier
 + * before the (potentially shared) context is destroyed.
 + */
 +static void free_gpu_resources(FILE             *fplog,
 +                               const t_forcerec *fr,
 +                               const t_commrec  *cr)
 +{
 +    gmx_bool bIsPPrankUsingGPU;
 +    char     gpu_err_str[STRLEN];
 +
 +    bIsPPrankUsingGPU = (cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU;
 +
 +    if (bIsPPrankUsingGPU)
 +    {
 +        /* free nbnxn data in GPU memory */
 +        nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
 +
 +        /* With tMPI we need to wait for all ranks to finish deallocation before
 +         * destroying the context in free_gpu() as some ranks may be sharing
 +         * GPU and context.
 +         * Note: only PP ranks need to free GPU resources, so it is safe
 +         * not to call the barrier on PME ranks.
 +         */
 +#ifdef GMX_THREAD_MPI
 +        if (PAR(cr))
 +        {
 +            gmx_barrier(cr);
 +        }
 +#endif  /* GMX_THREAD_MPI */
 +
 +        /* uninitialize GPU (by destroying the context) */
 +        if (!free_gpu(gpu_err_str))
 +        {
 +            gmx_warning("On node %d failed to free GPU #%d: %s",
 +                        cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
 +        }
 +    }
 +}
 +
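 +/* Top-level simulation runner: detects the hardware, sets up parallelism
 + * (thread-MPI, domain decomposition, separate PME ranks) and the force and
 + * data structures, runs the chosen integrator, and returns the stop condition.
 + */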
 +int mdrunner(gmx_hw_opt_t *hw_opt,
 +             FILE *fplog, t_commrec *cr, int nfile,
 +             const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
 +             gmx_bool bCompact, int nstglobalcomm,
 +             ivec ddxyz, int dd_node_order, real rdd, real rconstr,
 +             const char *dddlb_opt, real dlb_scale,
 +             const char *ddcsx, const char *ddcsy, const char *ddcsz,
 +             const char *nbpu_opt,
 +             gmx_large_int_t nsteps_cmdline, int nstepout, int resetstep,
 +             int gmx_unused nmultisim, int repl_ex_nst, int repl_ex_nex,
 +             int repl_ex_seed, real pforce, real cpt_period, real max_hours,
 +             const char *deviceOptions, unsigned long Flags)
 +{
 +    gmx_bool                  bForceUseGPU, bTryUseGPU;
 +    double                    nodetime = 0, realtime;
 +    t_inputrec               *inputrec;
 +    t_state                  *state = NULL;
 +    matrix                    box;
 +    gmx_ddbox_t               ddbox = {0};
 +    int                       npme_major, npme_minor;
 +    real                      tmpr1, tmpr2;
 +    t_nrnb                   *nrnb;
 +    gmx_mtop_t               *mtop       = NULL;
 +    t_mdatoms                *mdatoms    = NULL;
 +    t_forcerec               *fr         = NULL;
 +    t_fcdata                 *fcd        = NULL;
 +    real                      ewaldcoeff = 0;
 +    gmx_pme_t                *pmedata    = NULL;
 +    gmx_vsite_t              *vsite      = NULL;
 +    gmx_constr_t              constr;
 +    int                       i, m, nChargePerturbed = -1, status, nalloc;
 +    char                     *gro;
 +    gmx_wallcycle_t           wcycle;
 +    gmx_bool                  bReadRNG, bReadEkin;
 +    int                       list;
 +    gmx_walltime_accounting_t walltime_accounting = NULL;
 +    int                       rc;
 +    gmx_large_int_t           reset_counters;
 +    gmx_edsam_t               ed           = NULL;
 +    t_commrec                *cr_old       = cr;
 +    int                       nthreads_pme = 1;
 +    int                       nthreads_pp  = 1;
 +    gmx_membed_t              membed       = NULL;
 +    gmx_hw_info_t            *hwinfo       = NULL;
 +    /* The master rank decides early on bUseGPU and broadcasts this later */
 +    gmx_bool                  bUseGPU      = FALSE;
 +
 +    /* CAUTION: threads may be started later on in this function, so
 +       cr doesn't reflect the final parallel state right now */
 +    snew(inputrec, 1);
 +    snew(mtop, 1);
 +
 +    if (Flags & MD_APPENDFILES)
 +    {
 +        fplog = NULL;
 +    }
 +
 +    bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
 +    bTryUseGPU   = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
 +
 +    /* Detect hardware, gather information. This is an operation that is
 +     * global for this process (MPI rank). */
 +    hwinfo = gmx_detect_hardware(fplog, cr, bTryUseGPU);
 +
 +
 +    snew(state, 1);
 +    if (SIMMASTER(cr))
 +    {
 +        /* Read (nearly) all data required for the simulation */
 +        read_tpx_state(ftp2fn(efTPX, nfile, fnm), inputrec, state, NULL, mtop);
 +
 +        if (inputrec->cutoff_scheme != ecutsVERLET &&
 +            ((Flags & MD_TESTVERLET) || getenv("GMX_VERLET_SCHEME") != NULL))
 +        {
 +            convert_to_verlet_scheme(fplog, inputrec, mtop, det(state->box));
 +        }
 +
 +        if (inputrec->cutoff_scheme == ecutsVERLET)
 +        {
 +            /* Here the master rank decides if all ranks will use GPUs */
 +            bUseGPU = (hwinfo->gpu_info.ncuda_dev_compatible > 0 ||
 +                       getenv("GMX_EMULATE_GPU") != NULL);
 +
 +            prepare_verlet_scheme(fplog, cr,
 +                                  inputrec, mtop, state->box,
 +                                  bUseGPU);
 +        }
 +        else if (hwinfo->gpu_info.ncuda_dev_compatible > 0)
 +        {
 +            md_print_warn(cr, fplog,
 +                          "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
 +                          "      To use a GPU, set the mdp option: cutoff-scheme = Verlet\n"
 +                          "      (for quick performance testing you can use the -testverlet option)\n");
 +
 +            if (bForceUseGPU)
 +            {
 +                gmx_fatal(FARGS, "GPU requested, but can't be used without cutoff-scheme=Verlet");
 +            }
 +        }
 +#ifdef GMX_TARGET_BGQ
 +        else
 +        {
 +            md_print_warn(cr, fplog,
 +                          "NOTE: There is no SIMD implementation of the group scheme kernels on\n"
 +                          "      BlueGene/Q. You will observe better performance from using the\n"
 +                          "      Verlet cut-off scheme.\n");
 +        }
 +#endif
 +    }
 +
 +    /* Check for externally set OpenMP affinity and turn off internal
 +     * pinning if any is found. We need to do this check early to tell
 +     * thread-MPI whether it should do pinning when spawning threads.
 +     * TODO: the above no longer holds; we should move these checks down
 +     */
 +    gmx_omp_check_thread_affinity(fplog, cr, hw_opt);
 +
 +    /* Check and update the hardware options for internal consistency */
 +    check_and_update_hw_opt_1(hw_opt, SIMMASTER(cr));
 +
 +    if (SIMMASTER(cr))
 +    {
 +#ifdef GMX_THREAD_MPI
 +        /* Early check for externally set process affinity.
 +         * With thread-MPI this is needed as pinning might get turned off,
 +         * which needs to be known before starting thread-MPI.
 +         * With thread-MPI hw_opt is processed here on the master rank
 +         * and passed to the other ranks later, so we only do this on master.
 +         */
 +        gmx_check_thread_affinity_set(fplog,
 +                                      NULL,
 +                                      hw_opt, hwinfo->nthreads_hw_avail, FALSE);
 +
 +        if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
 +        {
 +            gmx_fatal(FARGS, "You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME nodes");
 +        }
 +#endif
 +
 +        if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
 +            cr->npmenodes <= 0)
 +        {
 +            gmx_fatal(FARGS, "You need to explicitly specify the number of PME nodes (-npme) when using different numbers of OpenMP threads for PP and PME nodes");
 +        }
 +    }
 +
 +#ifdef GMX_THREAD_MPI
 +    if (SIMMASTER(cr))
 +    {
 +        /* Since the master knows the cut-off scheme, update hw_opt for this.
 +         * This is done later for normal MPI and also once more with tMPI
 +         * for all tMPI ranks.
 +         */
 +        check_and_update_hw_opt_2(hw_opt, inputrec->cutoff_scheme);
 +
 +        /* NOW the threads will be started: */
 +        hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
 +                                                 hw_opt,
 +                                                 inputrec, mtop,
 +                                                 cr, fplog);
 +        if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
 +        {
 +            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
 +        }
 +
 +        if (hw_opt->nthreads_tmpi > 1)
 +        {
 +            /* now start the threads. */
 +            cr = mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm,
 +                                        oenv, bVerbose, bCompact, nstglobalcomm,
 +                                        ddxyz, dd_node_order, rdd, rconstr,
 +                                        dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
 +                                        nbpu_opt,
 +                                        nsteps_cmdline, nstepout, resetstep, nmultisim,
 +                                        repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
 +                                        cpt_period, max_hours, deviceOptions,
 +                                        Flags);
 +            /* the main thread continues here with a new cr. We don't deallocate
 +               the old cr because other threads may still be reading it. */
 +            if (cr == NULL)
 +            {
 +                gmx_comm("Failed to spawn threads");
 +            }
 +        }
 +    }
 +#endif
 +    /* END OF CAUTION: cr is now reliable */
 +
 +    /* g_membed initialisation *
 +     * Because we change the mtop, init_membed is called before the init_parallel *
 +     * (in case we ever want to make it run in parallel) */
 +    if (opt2bSet("-membed", nfile, fnm))
 +    {
 +        if (MASTER(cr))
 +        {
 +            fprintf(stderr, "Initializing membed");
 +        }
 +        membed = init_membed(fplog, nfile, fnm, mtop, inputrec, state, cr, &cpt_period);
 +    }
 +
 +    if (PAR(cr))
 +    {
 +        /* now broadcast everything to the non-master nodes/threads: */
 +        init_parallel(cr, inputrec, mtop);
 +
 +        /* This check needs to happen after get_nthreads_mpi() */
 +        if (inputrec->cutoff_scheme == ecutsVERLET && (Flags & MD_PARTDEC))
 +        {
 +            gmx_fatal_collective(FARGS, cr, NULL,
 +                                 "The Verlet cut-off scheme is not supported with particle decomposition.\n"
 +                                 "You can achieve the same effect as particle decomposition by running in parallel using only OpenMP threads.");
 +        }
 +    }
 +    if (fplog != NULL)
 +    {
 +        pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE);
 +    }
 +
 +    /* now make sure the state is initialized and propagated */
 +    set_state_entries(state, inputrec, cr->nnodes);
 +
 +    /* A parallel command line option consistency check that we can
 +       only do after any threads have started. */
 +    if (!PAR(cr) &&
 +        (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
 +    {
 +        gmx_fatal(FARGS,
 +                  "The -dd or -npme options request a parallel simulation, "
 +#ifndef GMX_MPI
 +                  "but %s was compiled without threads or MPI enabled"
 +#else
 +#ifdef GMX_THREAD_MPI
 +                  "but the number of threads (option -nt) is 1"
 +#else
 +                  "but %s was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec"
 +#endif
 +#endif
 +                  , ShortProgram()
 +                  );
 +    }
 +
 +    if ((Flags & MD_RERUN) &&
 +        (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
 +    {
 +        gmx_fatal(FARGS, "The .mdp file specified an energy minimization or normal mode algorithm, and these are not compatible with mdrun -rerun");
 +    }
 +
 +    if (can_use_allvsall(inputrec, TRUE, cr, fplog) && PAR(cr))
 +    {
 +        /* Simple neighbour searching (and possibly also all-vs-all loops)
 +         * does not work with domain decomposition. */
 +        Flags |= MD_PARTDEC;
 +    }
 +
 +    if (!EEL_PME(inputrec->coulombtype) || (Flags & MD_PARTDEC))
 +    {
 +        if (cr->npmenodes > 0)
 +        {
 +            if (!EEL_PME(inputrec->coulombtype))
 +            {
 +                gmx_fatal_collective(FARGS, cr, NULL,
 +                                     "PME nodes are requested, but the system does not use PME electrostatics");
 +            }
 +            if (Flags & MD_PARTDEC)
 +            {
 +                gmx_fatal_collective(FARGS, cr, NULL,
 +                                     "PME nodes are requested, but particle decomposition does not support separate PME nodes");
 +            }
 +        }
 +
 +        cr->npmenodes = 0;
 +    }
 +
 +#ifdef GMX_FAHCORE
 +    fcRegisterSteps(inputrec->nsteps, inputrec->init_step);
 +#endif
 +
 +    /* NMR restraints must be initialized before load_checkpoint,
 +     * since with time averaging the history is added to t_state.
 +     * For a proper consistency check we therefore need to extend
 +     * t_state here.
 +     * So the PME-only nodes (if present) will also initialize
 +     * the distance restraints.
 +     */
 +    snew(fcd, 1);
 +
 +    /* This needs to be called before read_checkpoint to extend the state */
 +    init_disres(fplog, mtop, inputrec, cr, Flags & MD_PARTDEC, fcd, state, repl_ex_nst > 0);
 +
 +    if (gmx_mtop_ftype_count(mtop, F_ORIRES) > 0)
 +    {
 +        if (PAR(cr) && !(Flags & MD_PARTDEC))
 +        {
 +            gmx_fatal(FARGS, "Orientation restraints do not work (yet) with domain decomposition; use particle decomposition (mdrun option -pd)");
 +        }
 +        /* Orientation restraints */
 +        if (MASTER(cr))
 +        {
 +            init_orires(fplog, mtop, state->x, inputrec, cr->ms, &(fcd->orires),
 +                        state);
 +        }
 +    }
 +
 +    if (DEFORM(*inputrec))
 +    {
 +        /* Store the deform reference box before reading the checkpoint */
 +        if (SIMMASTER(cr))
 +        {
 +            copy_mat(state->box, box);
 +        }
 +        if (PAR(cr))
 +        {
 +            gmx_bcast(sizeof(box), box, cr);
 +        }
 +        /* Because we do not have the update struct available yet
 +         * in which the reference values should be stored,
 +         * we store them temporarily in static variables.
 +         * This should be thread safe, since they are only written once
 +         * and with identical values.
 +         */
 +        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
 +        deform_init_init_step_tpx = inputrec->init_step;
 +        copy_mat(box, deform_init_box_tpx);
 +        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
 +    }
 +
 +    if (opt2bSet("-cpi", nfile, fnm))
 +    {
 +        /* Check if checkpoint file exists before doing continuation.
 +         * This way we can use identical input options for the first and subsequent runs...
 +         */
 +        if (gmx_fexist_master(opt2fn_master("-cpi", nfile, fnm, cr), cr) )
 +        {
 +            load_checkpoint(opt2fn_master("-cpi", nfile, fnm, cr), &fplog,
 +                            cr, Flags & MD_PARTDEC, ddxyz,
 +                            inputrec, state, &bReadRNG, &bReadEkin,
 +                            (Flags & MD_APPENDFILES),
 +                            (Flags & MD_APPENDFILESSET));
 +
 +            if (bReadRNG)
 +            {
 +                Flags |= MD_READ_RNG;
 +            }
 +            if (bReadEkin)
 +            {
 +                Flags |= MD_READ_EKIN;
 +            }
 +        }
 +    }
 +
 +    if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES))
 +#ifdef GMX_THREAD_MPI
 +        /* With thread-MPI, only the master thread passes through mdrun.c,
 +         * therefore the non-master threads need to open the "seppot" log file here.
 +         */
 +        || (!MASTER(cr) && (Flags & MD_SEPPOT))
 +#endif
 +        )
 +    {
 +        gmx_log_open(ftp2fn(efLOG, nfile, fnm), cr, !(Flags & MD_SEPPOT),
 +                     Flags, &fplog);
 +    }
 +
 +    /* override nsteps with value from cmdline */
 +    override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);
 +
 +    if (SIMMASTER(cr))
 +    {
 +        copy_mat(state->box, box);
 +    }
 +
 +    if (PAR(cr))
 +    {
 +        gmx_bcast(sizeof(box), box, cr);
 +    }
 +
 +    /* Essential dynamics */
 +    if (opt2bSet("-ei", nfile, fnm))
 +    {
 +        /* Open input and output files, allocate space for ED data structure */
 +        ed = ed_open(mtop->natoms, &state->edsamstate, nfile, fnm, Flags, oenv, cr);
 +    }
 +
 +    if (PAR(cr) && !((Flags & MD_PARTDEC) ||
 +                     EI_TPI(inputrec->eI) ||
 +                     inputrec->eI == eiNM))
 +    {
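 +        /* Set up the domain decomposition grid and its communicators */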
 +        cr->dd = init_domain_decomposition(fplog, cr, Flags, ddxyz, rdd, rconstr,
 +                                           dddlb_opt, dlb_scale,
 +                                           ddcsx, ddcsy, ddcsz,
 +                                           mtop, inputrec,
 +                                           box, state->x,
 +                                           &ddbox, &npme_major, &npme_minor);
 +
 +        make_dd_communicators(fplog, cr, dd_node_order);
 +
 +        /* Set overallocation to avoid frequent reallocation of arrays */
 +        set_over_alloc_dd(TRUE);
 +    }
 +    else
 +    {
 +        /* PME, if used, is done on all nodes with 1D decomposition */
 +        cr->npmenodes = 0;
 +        cr->duty      = (DUTY_PP | DUTY_PME);
 +        npme_major    = 1;
 +        npme_minor    = 1;
 +        /* NM and TPI perform single node energy calculations in parallel */
 +        if (!(inputrec->eI == eiNM || EI_TPI(inputrec->eI)))
 +        {
 +            npme_major = cr->nnodes;
 +        }
 +
 +        if (inputrec->ePBC == epbcSCREW)
 +        {
 +            gmx_fatal(FARGS,
 +                      "pbc=%s is only implemented with domain decomposition",
 +                      epbc_names[inputrec->ePBC]);
 +        }
 +    }
 +
 +    if (PAR(cr))
 +    {
 +        /* After the possible communicator splitting in make_dd_communicators,
 +         * we can set up the intra/inter-node communication.
 +         */
 +        gmx_setup_nodecomm(fplog, cr);
 +    }
 +
 +    /* Initialize per-physical-node MPI process/thread ID and counters. */
 +    gmx_init_intranode_counters(cr);
 +
 +#ifdef GMX_MPI
 +    md_print_info(cr, fplog, "Using %d MPI %s\n",
 +                  cr->nnodes,
 +#ifdef GMX_THREAD_MPI
 +                  cr->nnodes == 1 ? "thread" : "threads"
 +#else
 +                  cr->nnodes == 1 ? "process" : "processes"
 +#endif
 +                  );
 +    fflush(stderr);
 +#endif
 +
 +    /* Check and update hw_opt for the cut-off scheme */
 +    check_and_update_hw_opt_2(hw_opt, inputrec->cutoff_scheme);
 +
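 +    /* Initialize the OpenMP thread counts for PP and PME; the last two
 +     * arguments flag whether this is a PME-only rank and whether the
 +     * Verlet scheme is in use.
 +     */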
 +    gmx_omp_nthreads_init(fplog, cr,
 +                          hwinfo->nthreads_hw_avail,
 +                          hw_opt->nthreads_omp,
 +                          hw_opt->nthreads_omp_pme,
 +                          (cr->duty & DUTY_PP) == 0,
 +                          inputrec->cutoff_scheme == ecutsVERLET);
 +
 +    if (PAR(cr))
 +    {
 +        /* The master rank decided on the use of GPUs,
 +         * broadcast this information to all ranks.
 +         */
 +        gmx_bcast_sim(sizeof(bUseGPU), &bUseGPU, cr);
 +    }
 +
 +    if (bUseGPU)
 +    {
 +        if (cr->npmenodes == -1)
 +        {
 +            /* Don't automatically use PME-only nodes with GPUs */
 +            cr->npmenodes = 0;
 +        }
 +
 +        /* Select GPU id's to use */
 +        gmx_select_gpu_ids(fplog, cr, &hwinfo->gpu_info, bForceUseGPU,
 +                           &hw_opt->gpu_opt);
 +    }
++    else
++    {
++        /* Ignore (potentially) manually selected GPUs */
++        hw_opt->gpu_opt.ncuda_dev_use = 0;
++    }
 +
 +    /* check consistency of CPU acceleration and number of GPUs selected */
 +    gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt, bUseGPU);
 +
 +    if (DOMAINDECOMP(cr))
 +    {
 +        /* When we share GPUs over ranks, we need to know this for the DLB */
 +        dd_setup_dlb_resource_sharing(cr, hwinfo, hw_opt);
 +    }
 +
 +    /* Get the number of PP and PME threads.
 +     * The PME environment variable should be read on only one node,
 +     * to make sure it is identical everywhere.
 +     */
 +    /* TODO nthreads_pp is only used for pinning threads.
 +     * This is a temporary solution until we have a hw topology library.
 +     */
 +    nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
 +    nthreads_pme = gmx_omp_nthreads_get(emntPME);
 +
 +    wcycle = wallcycle_init(fplog, resetstep, cr, nthreads_pp, nthreads_pme);
 +
 +    if (PAR(cr))
 +    {
 +        /* Master synchronizes its value of reset_counters with all nodes
 +         * including PME only nodes */
 +        reset_counters = wcycle_get_reset_counters(wcycle);
 +        gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr);
 +        wcycle_set_reset_counters(wcycle, reset_counters);
 +    }
 +
 +    snew(nrnb, 1);
 +    if (cr->duty & DUTY_PP)
 +    {
 +        /* For domain decomposition we allocate dynamically
 +         * in dd_partition_system.
 +         */
 +        if (DOMAINDECOMP(cr))
 +        {
 +            bcast_state_setup(cr, state);
 +        }
 +        else
 +        {
 +            if (PAR(cr))
 +            {
 +                bcast_state(cr, state, TRUE);
 +            }
 +        }
 +
 +        /* Initialize the force record */
 +        fr          = mk_forcerec();
 +        fr->hwinfo  = hwinfo;
 +        fr->gpu_opt = &hw_opt->gpu_opt;
 +        init_forcerec(fplog, oenv, fr, fcd, inputrec, mtop, cr, box,
 +                      opt2fn("-table", nfile, fnm),
 +                      opt2fn("-tabletf", nfile, fnm),
 +                      opt2fn("-tablep", nfile, fnm),
 +                      opt2fn("-tableb", nfile, fnm),
 +                      nbpu_opt,
 +                      FALSE,
 +                      pforce);
 +
 +        /* version for PCA_NOT_READ_NODE (see md.c) */
 +        /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
 +           "nofile","nofile","nofile","nofile",FALSE,pforce);
 +         */
 +        fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);
 +
 +        /* Initialize QM-MM */
 +        if (fr->bQMMM)
 +        {
 +            init_QMMMrec(cr, mtop, inputrec, fr);
 +        }
 +
 +        /* Initialize the mdatoms structure.
 +         * mdatoms is not filled with atom data,
 +         * as this cannot be done yet with domain decomposition.
 +         */
 +        mdatoms = init_mdatoms(fplog, mtop, inputrec->efep != efepNO);
 +
 +        if (mdatoms->nPerturbed > 0 && inputrec->cutoff_scheme == ecutsVERLET)
 +        {
 +            gmx_fatal(FARGS, "The Verlet cut-off scheme does not (yet) support free-energy calculations with perturbed atoms, only perturbed interactions. This will be implemented soon. Use the group scheme for now.");
 +        }
 +
 +        /* Initialize the virtual site communication */
 +        vsite = init_vsite(mtop, cr, FALSE);
 +
 +        calc_shifts(box, fr->shift_vec);
 +
 +        /* With periodic molecules the charge groups should be whole at start up
 +         * and the virtual sites should not be far from their proper positions.
 +         */
 +        if (!inputrec->bContinuation && MASTER(cr) &&
 +            !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
 +        {
 +            /* Make molecules whole at start of run */
 +            if (fr->ePBC != epbcNONE)
 +            {
 +                do_pbc_first_mtop(fplog, inputrec->ePBC, box, mtop, state->x);
 +            }
 +            if (vsite)
 +            {
 +                /* Correct initial vsite positions are required
 +                 * for the initial distribution in the domain decomposition
 +                 * and for the initial shell prediction.
 +                 */
 +                construct_vsites_mtop(vsite, mtop, state->x);
 +            }
 +        }
 +
 +        if (EEL_PME(fr->eeltype))
 +        {
 +            ewaldcoeff = fr->ewaldcoeff;
 +            pmedata    = &fr->pmedata;
 +        }
 +        else
 +        {
 +            pmedata = NULL;
 +        }
 +    }
 +    else
 +    {
 +        /* This is a PME only node */
 +
 +        /* We don't need the state */
 +        done_state(state);
 +
 +        ewaldcoeff = calc_ewaldcoeff(inputrec->rcoulomb, inputrec->ewald_rtol);
 +        snew(pmedata, 1);
 +    }
 +
 +    if (hw_opt->thread_affinity != threadaffOFF)
 +    {
 +        /* Before setting affinity, check whether the affinity has changed
 +         * since we first checked, which would indicate that the OpenMP
 +         * library has probably changed it.
 +         */
 +        gmx_check_thread_affinity_set(fplog, cr,
 +                                      hw_opt, hwinfo->nthreads_hw_avail, TRUE);
 +
 +        /* Set the CPU affinity */
 +        gmx_set_thread_affinity(fplog, cr, hw_opt, hwinfo);
 +    }
 +
 +    /* Initiate PME if necessary,
 +     * either on all nodes or on dedicated PME nodes only. */
 +    if (EEL_PME(inputrec->coulombtype))
 +    {
 +        if (mdatoms)
 +        {
 +            nChargePerturbed = mdatoms->nChargePerturbed;
 +        }
 +        if (cr->npmenodes > 0)
 +        {
 +            /* The PME only nodes need to know nChargePerturbed */
 +            gmx_bcast_sim(sizeof(nChargePerturbed), &nChargePerturbed, cr);
 +        }
 +
 +        if (cr->duty & DUTY_PME)
 +        {
 +            status = gmx_pme_init(pmedata, cr, npme_major, npme_minor, inputrec,
 +                                  mtop ? mtop->natoms : 0, nChargePerturbed,
 +                                  (Flags & MD_REPRODUCIBLE), nthreads_pme);
 +            if (status != 0)
 +            {
 +                gmx_fatal(FARGS, "Error %d initializing PME", status);
 +            }
 +        }
 +    }
 +
 +
 +    if (integrator[inputrec->eI].func == do_md)
 +    {
 +        /* Turn on signal handling on all nodes.
 +         * A user signal from the PME nodes (if any)
 +         * is communicated to the PP nodes.
 +         */
 +        signal_handler_install();
 +    }
 +
 +    if (cr->duty & DUTY_PP)
 +    {
 +        /* Assumes uniform use of the number of OpenMP threads */
 +        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntDefault));
 +
 +        if (inputrec->ePull != epullNO)
 +        {
 +            /* Initialize pull code */
 +            init_pull(fplog, inputrec, nfile, fnm, mtop, cr, oenv, inputrec->fepvals->init_lambda,
 +                      EI_DYNAMICS(inputrec->eI) && MASTER(cr), Flags);
 +        }
 +
 +        if (inputrec->bRot)
 +        {
 +            /* Initialize enforced rotation code */
 +            init_rot(fplog, inputrec, nfile, fnm, cr, state->x, box, mtop, oenv,
 +                     bVerbose, Flags);
 +        }
 +
 +        constr = init_constraints(fplog, mtop, inputrec, ed, state, cr);
 +
 +        if (DOMAINDECOMP(cr))
 +        {
 +            dd_init_bondeds(fplog, cr->dd, mtop, vsite, inputrec,
 +                            Flags & MD_DDBONDCHECK, fr->cginfo_mb);
 +
 +            set_dd_parameters(fplog, cr->dd, dlb_scale, inputrec, &ddbox);
 +
 +            setup_dd_grid(fplog, cr->dd);
 +        }
 +
 +        /* Now do whatever the user wants us to do (how flexible...) */
 +        integrator[inputrec->eI].func(fplog, cr, nfile, fnm,
 +                                      oenv, bVerbose, bCompact,
 +                                      nstglobalcomm,
 +                                      vsite, constr,
 +                                      nstepout, inputrec, mtop,
 +                                      fcd, state,
 +                                      mdatoms, nrnb, wcycle, ed, fr,
 +                                      repl_ex_nst, repl_ex_nex, repl_ex_seed,
 +                                      membed,
 +                                      cpt_period, max_hours,
 +                                      deviceOptions,
 +                                      Flags,
 +                                      walltime_accounting);
 +
 +        if (inputrec->ePull != epullNO)
 +        {
 +            finish_pull(inputrec->pull);
 +        }
 +
 +        if (inputrec->bRot)
 +        {
 +            finish_rot(inputrec->rot);
 +        }
 +
 +    }
 +    else
 +    {
 +        /* do PME only */
 +        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
 +        gmx_pmeonly(*pmedata, cr, nrnb, wcycle, walltime_accounting, ewaldcoeff, inputrec);
 +    }
 +
 +    wallcycle_stop(wcycle, ewcRUN);
 +
 +    /* Finish up and write some final output.
 +     * If rerunMD, don't write the last frame again.
 +     */
 +    finish_run(fplog, cr,
 +               inputrec, nrnb, wcycle, walltime_accounting,
 +               fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ?
 +               nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL,
 +               EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
 +
 +
 +    /* Free GPU memory and context */
 +    free_gpu_resources(fplog, fr, cr);
 +
 +    if (opt2bSet("-membed", nfile, fnm))
 +    {
 +        sfree(membed);
 +    }
 +
 +    gmx_hardware_info_free(hwinfo);
 +
 +    /* Does what it says */
 +    print_date_and_time(fplog, cr->nodeid, "Finished mdrun", walltime_accounting);
 +    walltime_accounting_destroy(walltime_accounting);
 +
 +    /* Close logfile already here if we were appending to it */
 +    if (MASTER(cr) && (Flags & MD_APPENDFILES))
 +    {
 +        gmx_log_close(fplog);
 +    }
 +
 +    rc = (int)gmx_get_stop_condition();
 +
 +#ifdef GMX_THREAD_MPI
 +    /* we need to join all threads. The sub-threads join when they
 +       exit this function, but the master thread needs to be told to
 +       wait for that. */
 +    if (PAR(cr) && MASTER(cr))
 +    {
 +        tMPI_Finalize();
 +    }
 +#endif
 +
 +    return rc;
 +}