Merge release-4-6 into master
author Roland Schulz <roland@utk.edu>
Wed, 12 Dec 2012 23:07:08 +0000 (18:07 -0500)
committer Roland Schulz <roland@utk.edu>
Wed, 12 Dec 2012 23:07:08 +0000 (18:07 -0500)
Conflicts:
CMakeLists.txt (applied OpenMM changes to new location)
Deleted:
src/gromacs/gmxlib/nonbonded/nb_kernel_adress_c/nb_kernel[0-9]*
src/gromacs/gmxlib/nonbonded/nb_kernel_adress_c/nb_kernel_c_adress.c
src/gromacs/gmxlib/nonbonded/nb_kernel_adress_c/nb_kernel_c_adress.h
Trivial:
src/gromacs/legacyheaders/network.h
src/gromacs/mdlib/adress.c
visibility.h not needed for master yet:
src/gromacs/legacyheaders/types/ifunc.h
src/gromacs/legacyheaders/types/iteratedconstraints.h
src/gromacs/legacyheaders/types/nlistheuristics.h

This commit reverts e6cd064af2032c (Copyright). Will be done by Mark after 4.6
release using the script.

Change-Id: Id7371c4a727213159c1c481d300c5b5bd4e290b4

47 files changed:
1  2 
CMakeLists.txt
cmake/FindFFTW.cmake
cmake/ThreadMPI.cmake
cmake/gmxManageGPU.cmake
cmake/gmxManageNvccConfig.cmake
src/gromacs/gmxlib/gmx_detect_hardware.c
src/gromacs/gmxlib/gmx_omp_nthreads.c
src/gromacs/gmxlib/ifunc.c
src/gromacs/gmxlib/network.c
src/gromacs/gmxlib/nonbonded/nb_generic_adress.c
src/gromacs/gmxlib/nonbonded/nb_generic_adress.h
src/gromacs/gmxlib/nonbonded/nb_kernel_adress_c/make_nb_kernel_adress_c.py
src/gromacs/gmxlib/nonbonded/nb_kernel_adress_c/nb_kernel_adress_template_c.pre
src/gromacs/gmxlib/nonbonded/nonbonded.c
src/gromacs/gmxlib/tpxio.c
src/gromacs/gmxpreprocess/readir.c
src/gromacs/legacyheaders/network.h
src/gromacs/legacyheaders/types/commrec.h
src/gromacs/legacyheaders/types/enums.h
src/gromacs/legacyheaders/types/idef.h
src/gromacs/legacyheaders/types/mdatom.h
src/gromacs/legacyheaders/types/nblist.h
src/gromacs/legacyheaders/types/nbnxn_pairlist.h
src/gromacs/mdlib/adress.c
src/gromacs/mdlib/domdec.c
src/gromacs/mdlib/ebin.c
src/gromacs/mdlib/forcerec.c
src/gromacs/mdlib/md_support.c
src/gromacs/mdlib/mdatom.c
src/gromacs/mdlib/mdebin.c
src/gromacs/mdlib/nbnxn_atomdata.c
src/gromacs/mdlib/nbnxn_internal.h
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_common.c
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_common.h
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.c
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.c
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd128.c
src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_x86_simd256.c
src/gromacs/mdlib/nbnxn_search.c
src/gromacs/mdlib/nbnxn_search.h
src/gromacs/mdlib/ns.c
src/gromacs/mdlib/stat.c
src/gromacs/mdlib/vsite.c
src/gromacs/mdlib/wnblist.c
src/programs/mdrun/runner.c
src/tools/gmx_hbond.c
src/tools/gmx_membed.c

diff --cc CMakeLists.txt
index bc0f5e3941510093d264fc79fe707b8f886677d2,b9f3dfca9bf8e1ece05cbe3ff160b9c1fecb4945..6c53546c67c0b91cdd17c98d69c434f156c83005
@@@ -162,11 -214,6 +162,13 @@@ mark_as_advanced(GMX_POWERPC_INVSQRT
  option(GMX_FAHCORE "Build a library with mdrun functionality" OFF)
  mark_as_advanced(GMX_FAHCORE)
  
 +# decide on GPU settings based on user-settings and GPU/CUDA detection
 +include(gmxManageGPU)
 +
++# TODO: move OpenMM to contrib
 +option(GMX_OPENMM "Accelerated execution on GPUs through the OpenMM library (rerun cmake after changing to see relevant options)" OFF)
++mark_as_advanced(GMX_OPENMM)
 +
  include(gmxDetectAcceleration)
  if(NOT DEFINED GMX_CPU_ACCELERATION)
      if(CMAKE_CROSSCOMPILING)
@@@ -707,16 -736,18 +709,16 @@@ elseif(${GMX_CPU_ACCELERATION} STREQUA
          GMX_TEST_CFLAG(MSVC_SSE2_CFLAG "/arch:SSE2" GROMACS_C_FLAGS)
      endif(NOT GNU_SSE4_CFLAG AND NOT MSVC_SSE4_CFLAG)
  
 -    if (CMAKE_CXX_COMPILER_LOADED)
 -        GMX_TEST_CXXFLAG(GNU_SSE4_CXXFLAG "-msse4.1" GROMACS_CXX_FLAG)
 -        if (NOT GNU_SSE4_CXXFLAG)
 -            GMX_TEST_CXXFLAG(MSVC_SSE4_CXXFLAG "/arch:SSE4.1" GROMACS_CXX_FLAGS)
 -        endif(NOT GNU_SSE4_CXXFLAG)
 -        if (NOT GNU_SSE4_CXXFLAG AND NOT MSVC_SSE4_CXXFLAG) 
 -            message(WARNING "No C++ SSE4.1 flag found. Consider a newer compiler, or use SSE2 for slightly lower performance.")
 -            # Not surprising if we end up here! MSVC current does not support the SSE4.1 flag. However, it appears to accept SSE4.1
 -            # intrinsics when SSE2 support is enabled, so we try that instead.
 -            GMX_TEST_CXXFLAG(MSVC_SSE2_CXXFLAG "/arch:SSE2" GROMACS_CXX_FLAGS)
 -        endif(NOT GNU_SSE4_CXXFLAG AND NOT MSVC_SSE4_CXXFLAG)
 -    endif()
 +    GMX_TEST_CXXFLAG(GNU_SSE4_CXXFLAG "-msse4.1" GROMACS_CXX_FLAG)
 +    if (NOT GNU_SSE4_CXXFLAG)
 +       GMX_TEST_CXXFLAG(MSVC_SSE4_CXXFLAG "/arch:SSE4.1" GROMACS_CXX_FLAGS)
 +    endif(NOT GNU_SSE4_CXXFLAG)
 +    if (NOT GNU_SSE4_CXXFLAG AND NOT MSVC_SSE4_CXXFLAG)
-         message(WARNING "No C++ SSE4.1 flag found. Consider a newer compiler, or disable SSE4.1 for slightly lower performance.")
++        message(WARNING "No C++ SSE4.1 flag found. Consider a newer compiler, or use SSE2 for slightly lower performance.")
 +        # Not surprising if we end up here! MSVC current does not support the SSE4.1 flag. However, it appears to accept SSE4.1
 +        # intrinsics when SSE2 support is enabled, so we try that instead.
 +        GMX_TEST_CXXFLAG(MSVC_SSE2_CXXFLAG "/arch:SSE2" GROMACS_CXX_FLAGS)
 +    endif(NOT GNU_SSE4_CXXFLAG AND NOT MSVC_SSE4_CXXFLAG)
  
      # This must come after we have added the -msse4.1 flag on some platforms.
      check_include_file(smmintrin.h  HAVE_SMMINTRIN_H ${GROMACS_C_FLAGS})
Simple merge
Simple merge
Simple merge
index b7d5998147a8746146fa03df288f7f8b0fbfd76a,f12d07de6702818dc5df7925172a9871dccd0e36..24a3e1275edcb3ff65c7215c23d451f99bc63870
@@@ -1,6 -1,40 +1,6 @@@
 -#
 -# This file is part of the GROMACS molecular simulation package.
 -#
 -# Copyright (c) 2012, by the GROMACS development team, led by
 -# David van der Spoel, Berk Hess, Erik Lindahl, and including many
 -# others, as listed in the AUTHORS file in the top-level source
 -# directory and at http://www.gromacs.org.
 -#
 -# GROMACS is free software; you can redistribute it and/or
 -# modify it under the terms of the GNU Lesser General Public License
 -# as published by the Free Software Foundation; either version 2.1
 -# of the License, or (at your option) any later version.
 -#
 -# GROMACS is distributed in the hope that it will be useful,
 -# but WITHOUT ANY WARRANTY; without even the implied warranty of
 -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 -# Lesser General Public License for more details.
 -#
 -# You should have received a copy of the GNU Lesser General Public
 -# License along with GROMACS; if not, see
 -# http://www.gnu.org/licenses, or write to the Free Software Foundation,
 -# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 -#
 -# If you want to redistribute modifications to GROMACS, please
 -# consider that scientific software is very special. Version
 -# control is crucial - bugs must be traceable. We will be happy to
 -# consider code for inclusion in the official distribution, but
 -# derived work must not be called official GROMACS. Details are found
 -# in the README & COPYING files - if they are missing, get the
 -# official version at http://www.gromacs.org.
 -#
 -# To help us fund GROMACS development, we humbly ask that you cite
 -# the research papers on the package. Check out http://www.gromacs.org.
 -#
  # Manage CUDA nvcc compilation configuration, try to be smart to ease the users'
  # pain as much as possible:
- # - use the CUDA_NVCC_HOST_COMPILER if defined by the user, otherwise
+ # - use the CUDA_HOST_COMPILER if defined by the user, otherwise
  # - auto-detect compatible nvcc host compiler and set nvcc -ccbin (if not MPI wrapper)
  # - set icc compatibility mode to gcc 4.4/4.5 (CUDA 4.0 is not compatible with gcc >v4.4)
  # - (advanced) variables set:
index 655169cfa5a644bcae76f9003ce35be96d3702c3,0000000000000000000000000000000000000000..80491d43352a0f94af6e54452e1b44e86c19d5be
mode 100644,000000..100644
--- /dev/null
@@@ -1,610 -1,0 +1,610 @@@
-     npppn = cr->nnodes_pp_intra;
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + * This file is part of GROMACS.
 + * Copyright (c) 2012-  
 + *
 + * Written by the Gromacs development team under coordination of
 + * David van der Spoel, Berk Hess, and Erik Lindahl.
 + *
 + * This library is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU Lesser General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROup of MAchos and Cynical Suckers
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <stdlib.h>
 +#include <assert.h>
 +#include <string.h>
 +
 +#include "types/enums.h"
 +#include "types/hw_info.h"
 +#include "types/commrec.h"
 +#include "gmx_fatal.h"
 +#include "gmx_fatal_collective.h"
 +#include "smalloc.h"
 +#include "gpu_utils.h"
 +#include "statutil.h"
 +#include "gmx_detect_hardware.h"
 +#include "main.h"
 +#include "md_logging.h"
 +
 +#if ((defined(WIN32) || defined( _WIN32 ) || defined(WIN64) || defined( _WIN64 )) && !(defined (__CYGWIN__) || defined (__CYGWIN32__)))
 +#include "windows.h"
 +#endif
 +
 +/* Although we can't have more than 10 GPU different ID-s passed by the user as
 + * the id-s are assumed to be represented by single digits, as multiple
 + * processes can share a GPU, we can end up with more than 10 IDs.
 + * To account for potential extreme cases we'll set the limit to a pretty
 + * ridiculous number. */
 +static unsigned int max_gpu_ids_user = 64;
 +
 +/* FW decl. */
 +void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count);
 +
 +static void sprint_gpus(char *sbuf, const gmx_gpu_info_t *gpu_info, gmx_bool bPrintAll)
 +{
 +    int      i, ndev;
 +    char     stmp[STRLEN];
 +
 +    ndev = gpu_info->ncuda_dev;
 +
 +    sbuf[0] = '\0';
 +    for (i = 0; i < ndev; i++)
 +    {
 +        get_gpu_device_info_string(stmp, gpu_info, i);
 +        strcat(sbuf, "  ");
 +        strcat(sbuf, stmp);
 +        if (i < ndev - 1)
 +        {
 +            strcat(sbuf, "\n");
 +        }
 +    }
 +}
 +
 +static void print_gpu_detection_stats(FILE *fplog,
 +                                      const gmx_gpu_info_t *gpu_info,
 +                                      const t_commrec *cr)
 +{
 +    char onhost[266],stmp[STRLEN];
 +    int  ngpu;
 +
 +    ngpu = gpu_info->ncuda_dev;
 +
 +#if defined GMX_MPI && !defined GMX_THREAD_MPI
 +    /* We only print the detection on one, of possibly multiple, nodes */
 +    strncpy(onhost," on host ",10);
 +    gmx_gethostname(onhost+9,256);
 +#else
 +    /* We detect all relevant GPUs */
 +    strncpy(onhost,"",1);
 +#endif
 +
 +    if (ngpu > 0)
 +    {
 +        sprint_gpus(stmp, gpu_info, TRUE);
 +        md_print_warn(cr, fplog, "%d GPU%s detected%s:\n%s\n",
 +                      ngpu, (ngpu > 1) ? "s" : "", onhost, stmp);
 +    }
 +    else
 +    {
 +        md_print_warn(cr, fplog, "No GPUs detected%s\n", onhost);
 +    }
 +}
 +
 +static void print_gpu_use_stats(FILE *fplog,
 +                                const gmx_gpu_info_t *gpu_info,
 +                                const t_commrec *cr)
 +{
 +    char sbuf[STRLEN], stmp[STRLEN];
 +    int  i, ngpu, ngpu_all;
 +
 +    ngpu     = gpu_info->ncuda_dev_use;
 +    ngpu_all = gpu_info->ncuda_dev;
 +
 +    /* Issue note if GPUs are available but not used */
 +    if (ngpu_all > 0 && ngpu < 1)
 +    {
 +        sprintf(sbuf,
 +                "%d compatible GPU%s detected in the system, but none will be used.\n"
 +                "Consider trying GPU acceleration with the Verlet scheme!",
 +                ngpu_all, (ngpu_all > 1) ? "s" : "");
 +    }
 +    else
 +    {
 +        sprintf(sbuf, "%d GPU%s %sselected to be used for this run: ",
 +                ngpu, (ngpu > 1) ? "s" : "",
 +                gpu_info->bUserSet ? "user-" : "auto-");
 +        for (i = 0; i < ngpu; i++)
 +        {
 +            sprintf(stmp, "#%d", get_gpu_device_id(gpu_info, i));
 +            if (i < ngpu - 1)
 +            {
 +                strcat(stmp, ", ");
 +            }
 +            strcat(sbuf, stmp);
 +        }
 +    }
 +    md_print_info(cr, fplog, "%s\n\n", sbuf);
 +}
 +
 +/* Parse a "plain" GPU ID string which contains a sequence of digits corresponding
 + * to GPU IDs; the order will indicate the process/tMPI thread - GPU assignment. */
 +static void parse_gpu_id_plain_string(const char *idstr, int *nid, int *idlist)
 +{
 +    int  i;
 +    size_t len_idstr;
 +
 +    len_idstr = strlen(idstr);
 +
 +    if (len_idstr > max_gpu_ids_user)
 +    {
 +        gmx_fatal(FARGS,"%d GPU IDs provided, but only at most %d are supported",
 +                  len_idstr, max_gpu_ids_user);
 +    }
 +
 +    *nid = len_idstr;
 +
 +    for (i = 0; i < *nid; i++)
 +    {
 +        if (idstr[i] < '0' || idstr[i] > '9')
 +        {
 +            gmx_fatal(FARGS, "Invalid character in GPU ID string: '%c'\n", idstr[i]);
 +        }
 +        idlist[i] = idstr[i] - '0';
 +    }
 +}
 +
 +static void parse_gpu_id_csv_string(const char *idstr, int *nid, int *idlist)
 +{
 +    /* XXX implement cvs format to support more than 10 different GPUs in a box. */
 +    gmx_incons("Not implemented yet");
 +}
 +
 +void gmx_check_hw_runconf_consistency(FILE *fplog, gmx_hw_info_t *hwinfo,
 +                                      const t_commrec *cr, int ntmpi_requested,
 +                                      gmx_bool bUseGPU)
 +{
 +    int      npppn, ntmpi_pp, ngpu;
 +    char     sbuf[STRLEN], th_or_proc[STRLEN], th_or_proc_plural[STRLEN], pernode[STRLEN];
 +    char     gpu_plural[2];
 +    gmx_bool bGPUBin, btMPI, bMPI, bMaxMpiThreadsSet, bNthreadsAuto, bEmulateGPU;
 +
 +    assert(hwinfo);
 +    assert(cr);
 +
 +    btMPI = bMPI = FALSE;
 +    bNthreadsAuto = FALSE;
 +#if defined(GMX_THREAD_MPI)
 +    btMPI = TRUE;
 +    bNthreadsAuto = (ntmpi_requested < 1);
 +#elif defined(GMX_LIB_MPI)
 +    bMPI  = TRUE;
 +#endif
 +
 +#ifdef GMX_GPU
 +    bGPUBin      = TRUE;
 +#else
 +    bGPUBin      = FALSE;
 +#endif
 +
 +    /* GPU emulation detection is done later, but we need here as well
 +     * -- uncool, but there's no elegant workaround */
 +    bEmulateGPU       = (getenv("GMX_EMULATE_GPU") != NULL);
 +    bMaxMpiThreadsSet = (getenv("GMX_MAX_MPI_THREADS") != NULL);
 +
 +    if (SIMMASTER(cr))
 +    {
 +        /* check the acceleration mdrun is compiled with against hardware capabilities */
 +        /* TODO: Here we assume homogeneous hardware which is not necessarily the case!
 +         *       Might not hurt to add an extra check over MPI. */
 +        gmx_cpuid_acceleration_check(hwinfo->cpuid_info, fplog);
 +    }
 +
 +    /* Below we only do consistency checks for PP and GPUs,
 +     * this is irrelevant for PME only nodes, so in that case we return here.
 +     */
 +    if (!(cr->duty & DUTY_PP))
 +    {
 +        return;
 +    }
 +
 +    /* Need to ensure that we have enough GPUs:
 +     * - need one GPU per PP node
 +     * - no GPU oversubscription with tMPI
 +     * => keep on the GPU support, otherwise turn off (or bail if forced)
 +     * */
 +    /* number of PP processes per node */
-                     if (cr->nodeid_intra == 0)
++    npppn = cr->nrank_pp_intranode;
 +
 +    pernode[0] = '\0';
 +    th_or_proc_plural[0] = '\0';
 +    if (btMPI)
 +    {
 +        sprintf(th_or_proc, "thread-MPI thread");
 +        if (npppn > 1)
 +        {
 +            sprintf(th_or_proc_plural, "s");
 +        }
 +    }
 +    else if (bMPI)
 +    {
 +        sprintf(th_or_proc, "MPI process");
 +        if (npppn > 1)
 +        {
 +            sprintf(th_or_proc_plural, "es");
 +        }
 +        sprintf(pernode, " per node");
 +    }
 +    else
 +    {
 +        /* neither MPI nor tMPI */
 +        sprintf(th_or_proc, "process");
 +    }
 +
 +    if (bGPUBin)
 +    {
 +        print_gpu_detection_stats(fplog, &hwinfo->gpu_info, cr);
 +    }
 +
 +    if (bUseGPU && hwinfo->bCanUseGPU && !bEmulateGPU)
 +    {
 +        ngpu = hwinfo->gpu_info.ncuda_dev_use;
 +        sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
 +
 +        /* number of tMPI threads atuo-adjusted */
 +        if (btMPI && bNthreadsAuto && SIMMASTER(cr))
 +        {
 +            if (npppn < ngpu)
 +            {
 +                if (hwinfo->gpu_info.bUserSet)
 +                {
 +                    /* The user manually provided more GPUs than threads we could
 +                     * automatically start. */
 +                    gmx_fatal(FARGS,
 +                              "%d GPU%s provided, but only %d PP thread-MPI thread%s coud be started.\n"
 +                              "%s requires one PP tread-MPI thread per GPU; use fewer GPUs%s.",
 +                              ngpu, gpu_plural, npppn, th_or_proc_plural,
 +                              ShortProgram(), bMaxMpiThreadsSet ? "\nor allow more threads to be used" : "");
 +                }
 +                else
 +                {
 +                    /* There are more GPUs than tMPI threads; we have to limit the number GPUs used. */
 +                    md_print_warn(cr,fplog,
 +                                  "NOTE: %d GPU%s were detected, but only %d PP thread-MPI thread%s can be started.\n"
 +                                  "      %s can use one GPU per PP tread-MPI thread, so only %d GPU%s will be used.%s\n",
 +                                  ngpu, gpu_plural, npppn, th_or_proc_plural,
 +                                  ShortProgram(), npppn, npppn > 1 ? "s" : "",
 +                                  bMaxMpiThreadsSet ? "\n      Also, you can allow more threads to be used by increasing GMX_MAX_MPI_THREADS" : "");
 +
-                     if (bMPI || (btMPI && cr->nodeid_intra == 0))
++                    if (cr->rank_pp_intranode == 0)
 +                    {
 +                        limit_num_gpus_used(hwinfo, npppn);
 +                        ngpu = hwinfo->gpu_info.ncuda_dev_use;
 +                        sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
 +                    }
 +                }
 +            }
 +        }
 +
 +        if (ngpu != npppn)
 +        {
 +            if (hwinfo->gpu_info.bUserSet)
 +            {
 +                gmx_fatal(FARGS,
 +                          "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
 +                          "%s was started with %d PP %s%s%s, but you provided %d GPU%s.",
 +                          th_or_proc, btMPI ? "s" : "es" , pernode,
 +                          ShortProgram(), npppn, th_or_proc, th_or_proc_plural, pernode, ngpu, gpu_plural);
 +            }
 +            else
 +            {
 +                if (ngpu > npppn)
 +                {
 +                    md_print_warn(cr,fplog,
 +                                  "NOTE: potentially sub-optimal launch configuration, %s started with less\n"
 +                                  "      PP %s%s%s than GPU%s available.\n"
 +                                  "      Each PP %s can only use one GPU, so only %d GPU%s%s will be used.",
 +                                  ShortProgram(),
 +                                  th_or_proc, th_or_proc_plural, pernode, gpu_plural,
 +                                  th_or_proc, npppn, gpu_plural, pernode);
 +
-                     if (cr->nodeid_intra == 0)
++                    if (bMPI || (btMPI && cr->rank_pp_intranode == 0))
 +                    {
 +                        limit_num_gpus_used(hwinfo, npppn);
 +                        ngpu = hwinfo->gpu_info.ncuda_dev_use;
 +                        sprintf(gpu_plural, "%s", (ngpu > 1) ? "s" : "");
 +                    }
 +                }
 +                else
 +                {
 +                    /* Avoid duplicate error messages.
 +                     * Unfortunately we can only do this at the physical node
 +                     * level, since the hardware setup and MPI process count
 +                     * might be differ over physical nodes.
 +                     */
-         if (hwinfo->gpu_info.bUserSet && (cr->nodeid_intra == 0))
++                    if (cr->rank_pp_intranode == 0)
 +                    {
 +                        gmx_fatal(FARGS,
 +                                  "Incorrect launch configuration: mismatching number of PP %s%s and GPUs%s.\n"
 +                                  "%s was started with %d PP %s%s%s, but only %d GPU%s were detected.",
 +                                  th_or_proc, btMPI ? "s" : "es" , pernode,
 +                                  ShortProgram(), npppn, th_or_proc, th_or_proc_plural, pernode, ngpu, gpu_plural);
 +                    }
 +#ifdef GMX_MPI
 +                    else
 +                    {
 +                        /* Avoid other ranks to continue after inconsistency */
 +                        MPI_Barrier(cr->mpi_comm_mygroup);
 +                    }
 +#endif
 +                }
 +            }
 +        }
 +
++        if (hwinfo->gpu_info.bUserSet && (cr->rank_pp_intranode == 0))
 +        {
 +            int i, j, same_count;
 +            gmx_bool bSomeSame, bAllDifferent;
 +
 +            same_count = 0;
 +            bSomeSame = FALSE;
 +            bAllDifferent = TRUE;
 +
 +            for (i = 0; i < ngpu - 1; i++)
 +            {
 +                for (j = i + 1; j < ngpu; j++)
 +                {
 +                    bSomeSame       |= hwinfo->gpu_info.cuda_dev_use[i] == hwinfo->gpu_info.cuda_dev_use[j];
 +                    bAllDifferent   &= hwinfo->gpu_info.cuda_dev_use[i] != hwinfo->gpu_info.cuda_dev_use[j];
 +                    same_count      += hwinfo->gpu_info.cuda_dev_use[i] == hwinfo->gpu_info.cuda_dev_use[j];
 +                }
 +            }
 +
 +            if (btMPI && !bAllDifferent)
 +            {
 +                gmx_fatal(FARGS,
 +                          "Invalid GPU assignment: can't share a GPU among multiple thread-MPI threads.\n"
 +                          "Use MPI if you are sure that you want to assign GPU to multiple threads.");
 +            }
 +
 +            if (bSomeSame)
 +            {
 +                md_print_warn(cr,fplog,
 +                              "NOTE: Potentially sub-optimal launch configuration: you assigned %s to\n"
 +                              "      multiple %s%s; this should be avoided as it generally\n"
 +                              "      causes performance loss.",
 +                              same_count > 1 ? "GPUs" : "a GPU", th_or_proc, btMPI ? "s" : "es");
 +            }
 +        }
 +        print_gpu_use_stats(fplog, &hwinfo->gpu_info, cr);
 +    }
 +}
 +
 +/* Return the number of hardware threads supported by the current CPU.
 + * We assume that this is equal with the number of CPUs reported to be
 + * online by the OS at the time of the call.
 + */
 +static int get_nthreads_hw_avail(FILE *fplog, const t_commrec *cr)
 +{
 +     int ret = 0;
 +
 +#if ((defined(WIN32) || defined( _WIN32 ) || defined(WIN64) || defined( _WIN64 )) && !(defined (__CYGWIN__) || defined (__CYGWIN32__)))
 +    /* Windows */
 +    SYSTEM_INFO sysinfo;
 +    GetSystemInfo( &sysinfo );
 +    ret = sysinfo.dwNumberOfProcessors;
 +#elif defined HAVE_SYSCONF
 +    /* We are probably on Unix.
 +     * Now check if we have the argument to use before executing the call
 +     */
 +#if defined(_SC_NPROCESSORS_ONLN)
 +    ret = sysconf(_SC_NPROCESSORS_ONLN);
 +#elif defined(_SC_NPROC_ONLN)
 +    ret = sysconf(_SC_NPROC_ONLN);
 +#elif defined(_SC_NPROCESSORS_CONF)
 +    ret = sysconf(_SC_NPROCESSORS_CONF);
 +#elif defined(_SC_NPROC_CONF)
 +    ret = sysconf(_SC_NPROC_CONF);
 +#endif /* End of check for sysconf argument values */
 +
 +#else
 +    /* Neither windows nor Unix. No fscking idea how many CPUs we have! */
 +    ret = -1;
 +#endif
 +
 +    if (debug)
 +    {
 +        fprintf(debug, "Detected %d processors, will use this as the number "
 +                "of supported hardware threads.\n", ret);
 +    }
 +
 +#ifdef GMX_OMPENMP
 +    if (ret != gmx_omp_get_num_procs())
 +    {
 +        md_print_warn(cr, fplog,
 +                      "Number of CPUs detected (%d) does not match the number reported by OpenMP (%d).\n"
 +                      "Consider setting the launch configuration manually!",
 +                      ret, gmx_omp_get_num_procs());
 +    }
 +#endif
 +
 +    return ret;
 +}
 +
 +void gmx_detect_hardware(FILE *fplog, gmx_hw_info_t *hwinfo,
 +                         const t_commrec *cr,
 +                         gmx_bool bForceUseGPU, gmx_bool bTryUseGPU,
 +                         const char *gpu_id)
 +{
 +    int             i;
 +    const char      *env;
 +    char            sbuf[STRLEN], stmp[STRLEN];
 +    gmx_hw_info_t   *hw;
 +    gmx_gpu_info_t  gpuinfo_auto, gpuinfo_user;
 +    gmx_bool        bGPUBin;
 +
 +    assert(hwinfo);
 +
 +    /* detect CPUID info; no fuss, we don't detect system-wide
 +     * -- sloppy, but that's it for now */
 +    if (gmx_cpuid_init(&hwinfo->cpuid_info) != 0)
 +    {
 +        gmx_fatal_collective(FARGS, cr, NULL, "CPUID detection failed!");
 +    }
 +
 +    /* detect number of hardware threads */
 +    hwinfo->nthreads_hw_avail = get_nthreads_hw_avail(fplog, cr);
 +
 +    /* detect GPUs */
 +    hwinfo->gpu_info.ncuda_dev_use  = 0;
 +    hwinfo->gpu_info.cuda_dev_use   = NULL;
 +    hwinfo->gpu_info.ncuda_dev      = 0;
 +    hwinfo->gpu_info.cuda_dev       = NULL;
 +
 +#ifdef GMX_GPU
 +    bGPUBin      = TRUE;
 +#else
 +    bGPUBin      = FALSE;
 +#endif
 +
 +    /* Bail if binary is not compiled with GPU on */
 +    if (bForceUseGPU && !bGPUBin)
 +    {
 +        gmx_fatal_collective(FARGS, cr, NULL, "GPU acceleration requested, but %s was compiled without GPU support!", ShortProgram());
 +    }
 +
 +    /* run the detection if the binary was compiled with GPU support */
 +    if (bGPUBin && getenv("GMX_DISABLE_GPU_DETECTION")==NULL)
 +    {
 +        char detection_error[STRLEN];
 +
 +        if (detect_cuda_gpus(&hwinfo->gpu_info, detection_error) != 0)
 +        {
 +            if (detection_error != NULL && detection_error[0] != '\0')
 +            {
 +                sprintf(sbuf, ":\n      %s\n", detection_error);
 +            }
 +            else
 +            {
 +                sprintf(sbuf, ".");
 +            }
 +            md_print_warn(cr, fplog,
 +                          "NOTE: Error occurred during GPU detection%s"
 +                          "      Can not use GPU acceleration, will fall back to CPU kernels.\n",
 +                          sbuf);
 +        }
 +    }
 +
 +    if (bForceUseGPU || bTryUseGPU)
 +    {
 +        env = getenv("GMX_GPU_ID");
 +        if (env != NULL && gpu_id != NULL)
 +        {
 +            gmx_fatal(FARGS,"GMX_GPU_ID and -gpu_id can not be used at the same time");
 +        }
 +        if (env == NULL)
 +        {
 +            env = gpu_id;
 +        }
 +
 +        /* parse GPU IDs if the user passed any */
 +        if (env != NULL)
 +        {
 +            int *gpuid, *checkres;
 +            int nid, res;
 +
 +            snew(gpuid, max_gpu_ids_user);
 +            snew(checkres, max_gpu_ids_user);
 +
 +            parse_gpu_id_plain_string(env, &nid, gpuid);
 +
 +            if (nid == 0)
 +            {
 +                gmx_fatal(FARGS, "Empty GPU ID string passed\n");
 +            }
 +
 +            res = check_select_cuda_gpus(checkres, &hwinfo->gpu_info, gpuid, nid);
 +
 +            if (!res)
 +            {
 +                print_gpu_detection_stats(fplog, &hwinfo->gpu_info, cr);
 +
 +                sprintf(sbuf, "Some of the requested GPUs do not exist, behave strangely, or are not compatible:\n");
 +                for (i = 0; i < nid; i++)
 +                {
 +                    if (checkres[i] != egpuCompatible)
 +                    {
 +                        sprintf(stmp, "    GPU #%d: %s\n",
 +                                gpuid[i], gpu_detect_res_str[checkres[i]]);
 +                        strcat(sbuf, stmp);
 +                    }
 +                }
 +                gmx_fatal(FARGS, "%s", sbuf);
 +            }
 +
 +            hwinfo->gpu_info.bUserSet = TRUE;
 +
 +            sfree(gpuid);
 +            sfree(checkres);
 +        }
 +        else
 +        {
 +            pick_compatible_gpus(&hwinfo->gpu_info);
 +            hwinfo->gpu_info.bUserSet = FALSE;
 +        }
 +
 +        /* decide whether we can use GPU */
 +        hwinfo->bCanUseGPU = (hwinfo->gpu_info.ncuda_dev_use > 0);
 +        if (!hwinfo->bCanUseGPU && bForceUseGPU)
 +        {
 +            gmx_fatal(FARGS, "GPU acceleration requested, but no compatible GPUs were detected.");
 +        }
 +    }
 +}
 +
 +void limit_num_gpus_used(gmx_hw_info_t *hwinfo, int count)
 +{
 +    int ndev_use;
 +
 +    assert(hwinfo);
 +
 +    ndev_use = hwinfo->gpu_info.ncuda_dev_use;
 +
 +    if (count > ndev_use)
 +    {
 +        /* won't increase the # of GPUs */
 +        return;
 +    }
 +
 +    if (count < 1)
 +    {
 +        char sbuf[STRLEN];
 +        sprintf(sbuf, "Limiting the number of GPUs to <1 doesn't make sense (detected %d, %d requested)!",
 +                ndev_use, count);
 +        gmx_incons(sbuf);
 +    }
 +
 +    /* TODO: improve this implementation: either sort GPUs or remove the weakest here */
 +    hwinfo->gpu_info.ncuda_dev_use = count;
 +}
 +
 +void gmx_hardware_info_free(gmx_hw_info_t *hwinfo)
 +{
 +    if (hwinfo)
 +    {
 +        gmx_cpuid_done(hwinfo->cpuid_info);
 +        free_gpu_info(&hwinfo->gpu_info);
 +        sfree(hwinfo);
 +    }
 +}
index 8be120d79df9068f3a1114eb6f2115cc472c1729,0000000000000000000000000000000000000000..67a6e207c731536e4e83d22d8e4e195f8be965dc
mode 100644,000000..100644
--- /dev/null
@@@ -1,447 -1,0 +1,447 @@@
-     /* number of processes per node */
-     nppn = cr->nnodes_intra;
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2010, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +
 +#include <stdio.h>
 +#include <stdlib.h>
 +#include <string.h>
 +#include <assert.h>
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include "gmx_fatal.h"
 +#include "typedefs.h"
 +#include "macros.h"
 +#include "network.h"
 +#include "statutil.h"
 +#include "gmx_omp.h"
 +#include "gmx_omp_nthreads.h"
 +#include "md_logging.h"
 +
 +/** Structure with the number of threads for each OpenMP multi-threaded
 + *  algorithmic module in mdrun. */
 +typedef struct
 +{
 +    int gnth;               /**< Global num. of threads per PP or PP+PME process/tMPI thread. */
 +    int gnth_pme;           /**< Global num. of threads per PME only process/tMPI thread. */
 +
 +    int nth[emntNR];        /**< Number of threads for each module, indexed with module_nth_t */
 +    gmx_bool initialized;   /**< TRUE once gmx_omp_nthreads_init has filled in this structure. */
 +} omp_module_nthreads_t;
 +
 +/** Names of environment variables to set the per module number of threads.
 + *
 + *  Indexed with the values of module_nth_t.
 + *  The "GMX_DEFAULT_NUM_THREADS should never be set" entry is a placeholder
 + *  for the default module, not a real variable name: pick_module_nthreads
 + *  returns for emntDefault before it calls getenv.
 + */
 +static const char *modth_env_var[emntNR] =
 +{
 +    "GMX_DEFAULT_NUM_THREADS should never be set",
 +    "GMX_DOMDEC_NUM_THREADS", "GMX_PAIRSEARCH_NUM_THREADS",
 +    "GMX_NONBONDED_NUM_THREADS", "GMX_BONDED_NUM_THREADS",
 +    "GMX_PME_NUM_THREADS", "GMX_UPDATE_NUM_THREADS",
 +    "GMX_VSITE_NUM_THREADS",
 +    "GMX_LINCS_NUM_THREADS", "GMX_SETTLE_NUM_THREADS"
 +};
 +
 +/** Names of the modules. */
 +static const char *mod_name[emntNR] =
 +{
 +    "default", "domain decomposition", "pair search", "non-bonded",
 +    "bonded", "PME", "update", "LINCS", "SETTLE"
 +};
 +
 +/** Number of threads for each algorithmic module.
 + *
 + *  File-scope global variable that gets set once in gmx_omp_nthreads_init
 + *  (which calls pick_module_nthreads for every module) and queried via
 + *  gmx_omp_nthreads_get.
 + *
 + *  All fields are initialized to 0 which should result in errors if
 + *  the init call is omitted.
 + */
 +static omp_module_nthreads_t modth = { 0, 0, {0, 0, 0, 0, 0, 0, 0, 0, 0}, FALSE};
 +
 +
 +/** Determine the number of threads for module \mod.
 + *
 + *  \m takes values form the module_nth_t enum and maps these to the
 + *  corresponding value in modth_env_var.
 + *
 + *  Each number of threads per module takes the default value unless
 + *  GMX_*_NUM_THERADS env var is set, case in which its value overrides
 + *  the deafult.
 + *
 + *  The "group" scheme supports OpenMP only in PME and in thise case all but
 + *  the PME nthread values default to 1.
 + */
 +static int pick_module_nthreads(FILE *fplog, int m,
 +                                gmx_bool bSimMaster,
 +                                gmx_bool bFullOmpSupport,
 +                                gmx_bool bSepPME)
 +{
 +    char *env;
 +    int  nth;
 +    char sbuf[STRLEN];
 +    gmx_bool bOMP;
 +
 +#ifdef GMX_OPENMP
 +    bOMP = TRUE;
 +#else
 +    bOMP = FALSE;
 +#endif /* GMX_OPENMP */
 +
 +    /* The default should never be set through a GMX_*_NUM_THREADS env var
 +     * as it's always equal with gnth. */
 +    if (m == emntDefault)
 +    {
 +        return modth.nth[emntDefault];
 +    }
 +
 +    /* check the environment variable */
 +    if ((env = getenv(modth_env_var[m])) != NULL)
 +    {
 +        sscanf(env, "%d", &nth);
 +
 +        if (!bOMP)
 +        {
 +            gmx_warning("%s=%d is set, but %s is compiled without OpenMP!",
 +                        modth_env_var[m], nth, ShortProgram());
 +        }
 +
 +        /* with the verlet codepath, when any GMX_*_NUM_THREADS env var is set,
 +         * OMP_NUM_THREADS also has to be set */
 +        if (bFullOmpSupport && getenv("OMP_NUM_THREADS") == NULL)
 +        {
 +            gmx_fatal(FARGS, "%s=%d is set, the default number of threads also "
 +                      "needs to be set with OMP_NUM_THREADS!",
 +                      modth_env_var[m], nth);
 +        }
 +
 +        /* with the group scheme warn if any env var except PME is set */
 +        if (!bFullOmpSupport)
 +        {
 +            if (m != emntPME)
 +            {
 +                gmx_warning("%s=%d is set, but OpenMP multithreading is not "
 +                            "supported in %s!",
 +                            modth_env_var[m], nth, mod_name[m]);
 +                nth = 1;
 +            }
 +        }
 +
 +        /* only babble if we are really overriding with a different value */
 +        if ((bSepPME && m == emntPME && nth != modth.gnth_pme) || (nth != modth.gnth))
 +        {
 +            sprintf(sbuf, "%s=%d set, overriding the default number of %s threads",
 +                    modth_env_var[m], nth, mod_name[m]);
 +            if (bSimMaster)
 +            {
 +                fprintf(stderr, "\n%s\n", sbuf);
 +            }
 +            if (fplog)
 +            {
 +                fprintf(fplog, "%s\n", sbuf);
 +            }
 +        }
 +    }
 +    else
 +    {
 +        /* pick the global PME node nthreads if we are setting the number
 +         * of threads in separate PME nodes  */
 +        nth = (bSepPME && m == emntPME) ? modth.gnth_pme : modth.gnth;
 +    }
 +
 +    return modth.nth[m] = nth;
 +}
 +
 +void gmx_omp_nthreads_read_env(int *nthreads_omp)
 +{
 +    char *env;
 +
 +    assert(nthreads_omp);
 +
 +    if ((env = getenv("OMP_NUM_THREADS")) != NULL)
 +    {
 +        int nt_omp;
 +
 +        sscanf(env,"%d",&nt_omp);
 +        if (nt_omp <= 0)
 +        {
 +            gmx_fatal(FARGS,"OMP_NUM_THREADS is invalid: '%s'",env);
 +        }
 +
 +        if (*nthreads_omp > 0 && nt_omp != *nthreads_omp)
 +        {
 +            gmx_fatal(FARGS,"OMP_NUM_THREADS (%d) and the number of threads requested on the command line (%d) have different values",nt_omp,*nthreads_omp);
 +        }
 +
 +        /* Setting the number of OpenMP threads.
 +         * NOTE: with tMPI this function is only called on the master node,
 +         * but with MPI on all nodes which means lots of messages on stderr.
 +         */
 +        fprintf(stderr,"Getting the number of OpenMP threads from OMP_NUM_THREADS: %d\n",nt_omp);
 +        *nthreads_omp = nt_omp;
 +    }
 +}
 +
 +void gmx_omp_nthreads_init(FILE *fplog, t_commrec *cr,
 +                           int nthreads_hw_avail,
 +                           int omp_nthreads_req,
 +                           int omp_nthreads_pme_req,
 +                           gmx_bool bThisNodePMEOnly,
 +                           gmx_bool bFullOmpSupport)
 +{   /* Determines the global and per-module OpenMP thread counts, stores
 +     * them in the file-scope modth, applies the count for this process with
 +     * gmx_omp_set_num_threads, prints the settings to stderr and warns when
 +     * the available logical cores are oversubscribed.
 +     * The detection is skipped when modth.initialized is already set. */
 +    int  nth, nth_pmeonly, gmx_maxth, nppn;
 +    char *env;
 +    gmx_bool bSepPME, bOMP;
 +
 +#ifdef GMX_OPENMP
 +    bOMP = TRUE;
 +#else
 +    bOMP = FALSE;
 +#endif /* GMX_OPENMP */
 +
-     if (!bSepPME && cr->nodeid_intra == 0)
++    /* number of MPI processes/threads per physical node */
++    nppn = cr->nrank_intranode;
 +
 +    bSepPME = ( (cr->duty & DUTY_PP) && !(cr->duty & DUTY_PME)) ||
 +              (!(cr->duty & DUTY_PP) &&  (cr->duty & DUTY_PME)); /* exactly one of the PP and PME duties is set */
 +
 +#ifdef GMX_THREAD_MPI
 +    /* modth is shared among tMPI threads, so for thread safety the
 +     * detection is done on the master only. It is not thread-safe with
 +     * multiple simulations, but that is not supported by tMPI anyway. */
 +    if (SIMMASTER(cr))
 +#endif
 +    {
 +        /* just return if the initialization has already been done
 +         * NOTE(review): with GMX_THREAD_MPI this return skips the
 +         * MPI_Barrier further down - confirm that a repeated call cannot
 +         * leave the other threads waiting there. */
 +        if (modth.initialized)
 +        {
 +            return;
 +        }
 +
 +        /* With full OpenMP support (verlet scheme) set the number of threads
 +         * per process / default:
 +         * - 1 if not compiled with OpenMP or
 +         * - OMP_NUM_THREADS if the env. var is set, or
 +         * - omp_nthreads_req = #of threads requested by the user on the mdrun
 +         *   command line, otherwise
 +         * - take the max number of available threads and distribute them
 +         *   on the processes/tMPI threads.
 +         * ~ The GMX_*_NUM_THREADS env var overrides the number of threads of
 +         *   the respective module and it has to be used in conjunction with
 +         *   OMP_NUM_THREADS.
 +         *
 +         * With the group scheme OpenMP multithreading is only supported in PME,
 +         * for all other modules nthreads is set to 1.
 +         * The number of PME threads is equal to:
 +         * - 1 if not compiled with OpenMP or
 +         * - GMX_PME_NUM_THREADS if defined, otherwise
 +         * - OMP_NUM_THREADS if defined, otherwise
 +         * - 1
 +         */
 +        nth = 1;
 +        if ((env = getenv("OMP_NUM_THREADS")) != NULL)
 +        {
 +            if (!bOMP && (strncmp(env, "1", 1) != 0)) /* NOTE(review): only the first
 +                                                        * character is compared, so a
 +                                                        * value such as "16" is treated
 +                                                        * like "1" - confirm intended */
 +            {
 +                gmx_warning("OMP_NUM_THREADS is set, but %s was compiled without OpenMP support!",
 +                            ShortProgram());
 +            }
 +            else
 +            {
 +                nth = gmx_omp_get_max_threads();
 +            }
 +        }
 +        else if (omp_nthreads_req > 0)
 +        {
 +            nth = omp_nthreads_req;
 +        }
 +        else if (bFullOmpSupport && bOMP)
 +        {
 +            /* max available threads per node */
 +            nth = nthreads_hw_avail;
 +
 +            /* divide the threads among the MPI processes/tMPI threads */
 +            if (nth >= nppn)
 +            {
 +                nth /= nppn;
 +            }
 +            else
 +            {
 +                nth = 1;
 +            }
 +        }
 +
 +        /* now we have the global values, set them:
 +         * - 1 if not compiled with OpenMP and for the group scheme
 +         * - nth for the verlet scheme when compiled with OpenMP
 +         */
 +        if (bFullOmpSupport && bOMP)
 +        {
 +            modth.gnth = nth;
 +        }
 +        else
 +        {
 +            modth.gnth = 1;
 +        }
 +
 +        if (bSepPME)
 +        {
 +            if (omp_nthreads_pme_req > 0)
 +            {
 +                modth.gnth_pme = omp_nthreads_pme_req;
 +            }
 +            else
 +            {
 +                modth.gnth_pme = nth;
 +            }
 +        }
 +        else
 +        {
 +            modth.gnth_pme = 0;
 +        }
 +
 +        /* now set the per-module values; each call stores its result in
 +         * modth.nth[], so the return values are not needed here */
 +        modth.nth[emntDefault] = modth.gnth;
 +        pick_module_nthreads(fplog, emntDomdec, SIMMASTER(cr), bFullOmpSupport, bSepPME);
 +        pick_module_nthreads(fplog, emntPairsearch, SIMMASTER(cr), bFullOmpSupport, bSepPME);
 +        pick_module_nthreads(fplog, emntNonbonded, SIMMASTER(cr), bFullOmpSupport, bSepPME);
 +        pick_module_nthreads(fplog, emntBonded, SIMMASTER(cr), bFullOmpSupport, bSepPME);
 +        pick_module_nthreads(fplog, emntPME, SIMMASTER(cr), bFullOmpSupport, bSepPME);
 +        pick_module_nthreads(fplog, emntUpdate, SIMMASTER(cr), bFullOmpSupport, bSepPME);
 +        pick_module_nthreads(fplog, emntVSITE, SIMMASTER(cr), bFullOmpSupport, bSepPME);
 +        pick_module_nthreads(fplog, emntLINCS, SIMMASTER(cr), bFullOmpSupport, bSepPME);
 +        pick_module_nthreads(fplog, emntSETTLE, SIMMASTER(cr), bFullOmpSupport, bSepPME);
 +
 +        /* set the number of threads globally */
 +        if (bOMP)
 +        {
 +#ifndef GMX_THREAD_MPI
 +            if (bThisNodePMEOnly)
 +            {
 +                gmx_omp_set_num_threads(modth.gnth_pme);
 +            }
 +            else
 +#endif /* GMX_THREAD_MPI */
 +            {
 +                if (bFullOmpSupport)
 +                {
 +                    gmx_omp_set_num_threads(nth);
 +                }
 +                else
 +                {
 +                    gmx_omp_set_num_threads(1);
 +                }
 +            }
 +        }
 +
 +        modth.initialized = TRUE;
 +    }
 +#ifdef GMX_THREAD_MPI
 +    /* Non-master threads have to wait for the detection to be done. */
 +    if (PAR(cr))
 +    {
 +        MPI_Barrier(cr->mpi_comm_mysim);
 +    }
 +#endif
 +
 +    /* inform the user about the settings */
 +    if (SIMMASTER(cr) && bOMP)
 +    {
 +#ifdef GMX_THREAD_MPI
 +        const char *mpi_str="per tMPI thread";
 +#else
 +        const char *mpi_str="per MPI process";
 +#endif
 +
 +        /* for group scheme we print PME threads info only */
 +        if (bFullOmpSupport)
 +        {
 +            fprintf(stderr, "Using %d OpenMP thread%s %s\n",
 +                    modth.gnth,modth.gnth > 1 ? "s" : "",
 +                    cr->nnodes > 1 ? mpi_str : "");
 +        }
 +        if (bSepPME && modth.gnth_pme != modth.gnth)
 +        {
 +            fprintf(stderr, "Using %d OpenMP thread%s %s for PME\n",
 +                    modth.gnth_pme,modth.gnth_pme > 1 ? "s" : "",
 +                    cr->nnodes > 1 ? mpi_str : "");
 +        }
 +    }
 +
 +    /* detect and warn about oversubscription, i.e. when modth.gnth*nppn
 +     * exceeds nthreads_hw_avail
 +     * TODO: enable this for separate PME nodes as well! */
++    if (!bSepPME && cr->rank_pp_intranode == 0)
 +    {
 +        char sbuf[STRLEN], sbuf1[STRLEN], sbuf2[STRLEN];
 +
 +        if (modth.gnth*nppn > nthreads_hw_avail)
 +        {
 +            sprintf(sbuf, "threads");
 +            sbuf1[0] = '\0';
 +            sprintf(sbuf2, "O");
 +#ifdef GMX_MPI
 +            if (modth.gnth == 1)
 +            {
 +#ifdef GMX_THREAD_MPI
 +                sprintf(sbuf, "thread-MPI threads");
 +#else
 +                sprintf(sbuf, "MPI processes");
 +                sprintf(sbuf1, " per node");
 +                sprintf(sbuf2, "On node %d: o", cr->sim_nodeid);
 +#endif
 +            }
 +#endif
 +            md_print_warn(cr, fplog,
 +                          "WARNING: %sversubscribing the available %d logical CPU cores%s with %d %s.\n"
 +                          "         This will cause considerable performance loss!",
 +                          sbuf2, nthreads_hw_avail, sbuf1, nppn*modth.gnth, sbuf);
 +        }
 +    }
 +}
 +
 +int gmx_omp_nthreads_get(int mod)
 +{
 +    if (mod < 0 || mod >= emntNR)
 +    {
 +        /* invalid module queried */
 +        return -1;
 +    }
 +    else
 +    {
 +        return modth.nth[mod];
 +    }
 +}
index d8c50f80bda87d6c1a6cbfb08cb9f0728908ff8d,0000000000000000000000000000000000000000..d99803d8287c5538ed9c6eb500aedd75d87ce5ac
mode 100644,000000..100644
--- /dev/null
@@@ -1,180 -1,0 +1,180 @@@
-   def_nofc    ("VTEMP",    "Vir. Temp."       ),
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROningen Mixture of Alchemy and Childrens' Stories
 + */
 +/* This file is completely threadsafe - keep it that way! */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +
 +#include "typedefs.h"
 +#include "bondf.h"
 +#include "disre.h"
 +#include "orires.h"
 +#include "genborn.h"
 +
 +
 +#define  def_bonded(str,lstr,nra,nrpa,nrpb,ind,func)\
 +   {str,lstr,(nra),(nrpa),(nrpb),IF_BOND,                        (ind),(func)}
 +
 +#define  def_bondedz(str,lstr,nra,nrpa,nrpb,ind,func)\
 +   {str,lstr,(nra),(nrpa),(nrpb),IF_BOND | IF_LIMZERO,           (ind),(func)}
 +
 +#define  def_bondedt(str,lstr,nra,nrpa,nrpb,ind,func)\
 +   {str,lstr,(nra),(nrpa),(nrpb),IF_BOND | IF_TABULATED,         (ind),(func)}
 +
 +#define  def_bondedtz(str,lstr,nra,nrpa,nrpb,ind,func)\
 +   {str,lstr,(nra),(nrpa),(nrpb),IF_BOND | IF_TABULATED | IF_LIMZERO,(ind),(func)}
 +
 +#define   def_angle(str,lstr,nra,nrpa,nrpb,ind,func)\
 +   {str,lstr,(nra),(nrpa),(nrpb),IF_BOND | IF_ATYPE,(ind),(func)}
 +   
 +#define    def_bond(str,lstr,nra,nrpa,nrpb,ind,func)\
 +   {str,lstr,(nra),(nrpa),(nrpb),IF_BOND | IF_CHEMBOND | IF_BTYPE,(ind),(func)}
 +
 +#define    def_bondt(str,lstr,nra,nrpa,nrpb,ind,func)\
 +   {str,lstr,(nra),(nrpa),(nrpb),IF_BOND | IF_CHEMBOND | IF_TABULATED,(ind),(func)}
 +
 +#define  def_bondnb(str,lstr,nra,nrpa,nrpb,ind,func)\
 +   {str,lstr,(nra),(nrpa),(nrpb),IF_BOND | IF_CHEMBOND,(ind),(func)}
 +
 +#define   def_vsite(str,lstr,nra,nrpa)\
 +   {str,lstr,(nra),(nrpa),     0,IF_VSITE,                  -1, unimplemented}
 +
 +#define     def_shk(str,lstr,nra,nrpa,nrpb)\
 +   {str,lstr,(nra),(nrpa),(nrpb),IF_CONSTRAINT,             -1, unimplemented}
 +
 +#define   def_shkcb(str,lstr,nra,nrpa,nrpb)\
 +   {str,lstr,(nra),(nrpa),(nrpb),IF_CONSTRAINT | IF_CHEMBOND,-1, unimplemented}
 +   
 +#define      def_nb(str,lstr,nra, nrp)\
 +   {str,lstr,(nra), (nrp),     0,IF_NULL,                    -1,unimplemented}
 +   
 +#define    def_nofc(str,lstr)\
 +   {str,lstr,    0,     0,     0,IF_NULL,                    -1,unimplemented}
 +
 +/* this MUST correspond to the enum in src/gromacs/legacyheaders/types/idef.h */
 +const t_interaction_function interaction_function[F_NRE]=
 +{
 +  def_bond    ("BONDS",    "Bond",            2, 2, 2,  eNR_BONDS,  bonds         ),
 +  def_bond    ("G96BONDS", "G96Bond",         2, 2, 2,  eNR_BONDS,  g96bonds      ),
 +  def_bond    ("MORSE",    "Morse",           2, 3, 3,  eNR_MORSE,  morse_bonds   ),
 +  def_bond    ("CUBICBONDS","Cubic Bonds",    2, 3, 0,  eNR_CUBICBONDS, cubic_bonds),
 +  def_bondnb  ("CONNBONDS","Connect Bonds",   2, 0, 0,  0,      unimplemented     ),
 +  def_bonded  ("HARMONIC", "Harmonic Pot.",   2, 2, 2,  eNR_BONDS,  bonds         ),
 +  def_bondnb  ("FENEBONDS", "FENE Bonds",     2, 2, 0,  eNR_FENEBONDS, FENE_bonds ),
 +  def_bondt   ("TABBONDS", "Tab. Bonds",      2, 2, 2,  eNR_TABBONDS, tab_bonds   ),
 +  def_bondedtz("TABBONDSNC", "Tab. Bonds NC", 2, 2, 2,  eNR_TABBONDS, tab_bonds   ),
 +  def_bonded  ("RESTRAINTPOT", "Restraint Pot.", 2, 4, 4,  eNR_RESTRBONDS,  restraint_bonds ),
 +  def_angle   ("ANGLES",   "Angle",           3, 2, 2,  eNR_ANGLES, angles        ),
 +  def_angle   ("G96ANGLES","G96Angle",        3, 2, 2,  eNR_ANGLES, g96angles     ),
 +  def_angle   ("LINEAR_ANGLES", "Lin. Angle", 3, 2, 2,  eNR_LINEAR_ANGLES, linear_angles ),
 +  def_bonded  ("CROSS_BOND_BOND", "Bond-Cross", 3, 3, 0,0,          cross_bond_bond ),
 +  def_bonded  ("CROSS_BOND_ANGLE","BA-Cross",   3, 4, 0,0,          cross_bond_angle ),
 +  def_angle   ("UREY_BRADLEY","U-B",          3, 4, 4,  0,          urey_bradley ),
 +  def_angle   ("QANGLES","Quartic Angles",    3, 6, 0,  eNR_QANGLES, quartic_angles ),
 +  def_bondedt ("TABANGLES", "Tab. Angles",    3, 2, 2,  eNR_TABANGLES, tab_angles ),
 +  def_bonded  ("PDIHS",    "Proper Dih.",     4, 3, 3,  eNR_PROPER, pdihs         ),
 +  def_bonded  ("RBDIHS",   "Ryckaert-Bell.",  4, 6, 6,  eNR_RB, rbdihs            ),
 +  def_bonded  ("FOURDIHS", "Fourier Dih.",    4, 4, 4,  eNR_FOURDIH, rbdihs       ),
 +  def_bonded  ("IDIHS",    "Improper Dih.",   4, 2, 2,  eNR_IMPROPER,idihs        ),
 +  def_bonded  ("PIDIHS",   "Improper Dih.",   4, 3, 3,  eNR_PROPER, pdihs         ),
 +  def_bondedt ("TABDIHS", "Tab. Dih.",        4, 2, 2,  eNR_TABDIHS, tab_dihs     ),
 +  def_bonded  ("CMAP",  "CMAP Dih.",          5, -1, -1,  eNR_CMAP,   unimplemented ),
 +  def_bonded  ("GB12",     "GB 1-2 Pol.",     2, 4, 0,  eNR_GB,     unimplemented ),
 +  def_bonded  ("GB13",     "GB 1-3 Pol.",     2, 4, 0,  eNR_GB,     unimplemented ),
 +  def_bonded  ("GB14",     "GB 1-4 Pol.",     2, 4, 0,  eNR_GB,     unimplemented ),
 +  def_nofc    ("GBPOL",    "GB Polarization" ),
 +  def_nofc    ("NPSOLVATION", "Nonpolar Sol." ),
 +  def_bondedz ("LJ14",     "LJ-14",           2, 2, 2,  eNR_NB14,   unimplemented ),
 +  def_nofc    ("COUL14",   "Coulomb-14"                                           ),
 +  def_bondedz ("LJC14_Q",  "LJC-14 q",        2, 5, 0,  eNR_NB14,   unimplemented ),
 +  def_bondedz ("LJC_NB",   "LJC Pairs NB",    2, 4, 0,  eNR_NB14,   unimplemented ),
 +  def_nb      ("LJ_SR",    "LJ (SR)",         2, 2                                ),
 +  def_nb      ("BHAM",     "Buck.ham (SR)",   2, 3                                ),
 +  def_nofc    ("LJ_LR",    "LJ (LR)"                                              ),
 +  def_nofc    ("BHAM_LR",  "Buck.ham (LR)"                                        ),
 +  def_nofc    ("DISPCORR", "Disper. corr."                                        ),
 +  def_nofc    ("COUL_SR",  "Coulomb (SR)"                                         ),
 +  def_nofc    ("COUL_LR",  "Coulomb (LR)"                                         ),
 +  def_nofc    ("RF_EXCL",  "RF excl."                                             ),
 +  def_nofc    ("COUL_RECIP", "Coul. recip."                                       ),
 +  def_nofc    ("DPD",      "DPD"                                                  ),
 +  def_bondnb  ("POLARIZATION", "Polarization",2, 1, 0,  0,          polarize      ),
 +  def_bonded  ("WATERPOL", "Water Pol.",      5, 6, 0,  eNR_WPOL,   water_pol     ),
 +  def_bonded  ("THOLE",    "Thole Pol.",      4, 3, 0,  eNR_THOLE,  thole_pol     ),
 +  def_bondnb  ("ANHARM_POL", "Anharm. Pol.",2, 3, 0, 0,          anharm_polarize      ),
 +  def_bonded  ("POSRES",   "Position Rest.",  1, 3, 3,  eNR_POSRES, unimplemented ),
 +  def_bonded  ("FBPOSRES","Flat-bottom posres", 1, 3, 0, eNR_FBPOSRES, unimplemented ),
 +  def_bonded  ("DISRES",   "Dis. Rest.",      2, 6, 0,  eNR_DISRES, ta_disres     ),
 +  def_nofc    ("DISRESVIOL",   "D.R.Viol. (nm)"                                       ),
 +  def_bonded  ("ORIRES",   "Orient. Rest.",   2, 6, 0,  eNR_ORIRES, orires        ),
 +  def_nofc    ("ORDEV",    "Ori. R. RMSD"                                         ),  
 +  def_bonded  ("ANGRES",   "Angle Rest.",     4, 3, 3,  eNR_ANGRES, angres        ),
 +  def_bonded  ("ANGRESZ",  "Angle Rest. Z",   2, 3, 3,  eNR_ANGRESZ,angresz       ),
 +  def_bonded  ("DIHRES",   "Dih. Rest.",      4, 3, 3,  eNR_DIHRES, dihres        ),
 +  def_nofc    ("DIHRESVIOL",  "Dih. Rest. Viol."                                     ), /* obsolete */
 +  def_shkcb   ("CONSTR",   "Constraint",      2, 1, 1                             ),
 +  def_shk     ("CONSTRNC", "Constr. No Conn.",2, 1, 1                             ),
 +  def_shkcb   ("SETTLE",   "Settle",          3, 2, 0                             ),
 +  def_vsite   ("VSITE2",   "Virtual site 2",  3, 1                                ),
 +  def_vsite   ("VSITE3",   "Virtual site 3",  4, 2                                ),
 +  def_vsite   ("VSITE3FD", "Virtual site 3fd",4, 2                                ),
 +  def_vsite   ("VSITE3FAD","Virtual site 3fad",4, 2                               ),
 +  def_vsite   ("VSITE3OUT","Virtual site 3out",4, 3                               ),
 +  def_vsite   ("VSITE4FD", "Virtual site 4fd", 5, 3                               ),
 +  def_vsite   ("VSITE4FDN","Virtual site 4fdn",5, 3                               ),
 +  def_vsite   ("VSITEN",   "Virtual site N",   2, 2                               ),
 +  def_nofc    ("COM_PULL", "COM Pull En."     ),
 +  def_nofc    ("EQM",      "Quantum En."      ),
 +  def_nofc    ("EPOT",     "Potential"        ),
 +  def_nofc    ("EKIN",     "Kinetic En."      ),
 +  def_nofc    ("ETOT",     "Total Energy"     ),
 +  def_nofc    ("ECONS",    "Conserved En."    ),
 +  def_nofc    ("TEMP",     "Temperature"      ),
++  def_nofc    ("VTEMP",    "Vir. Temp. (not used)"      ),
 +  /* Note that pressure names can not be more than 8 char's,
 +   * because " (bar)" is appended to them.
 +   */
 +  def_nofc    ("PDISPCORR","Pres. DC"         ),  
 +  def_nofc    ("PRES",     "Pressure"         ),
 +  def_nofc    ("DH/DL_CON","dH/dl constr."    ), /* obsolete */
 +  def_nofc    ("DV/DL",    "dVremain/dl"      ),
 +  def_nofc    ("DK/DL",    "dEkin/dl"         ),
 +  def_nofc    ("DVC/DL",   "dVcoul/dl"        ),
 +  def_nofc    ("DVV/DL",   "dVvdw/dl"         ),
 +  def_nofc    ("DVB/DL",   "dVbonded/dl"      ),
 +  def_nofc    ("DVR/DL",   "dVrestraint/dl"   ),
 +  def_nofc    ("DVT/DL",   "dVtemperature/dl" )
 +};
index 5cdd13419865cfdce40eeded7840db896c4357fc,0000000000000000000000000000000000000000..9921ef31f79ad2e61dca101cc2c854e4f121d059
mode 100644,000000..100644
--- /dev/null
@@@ -1,777 -1,0 +1,774 @@@
- void gmx_init_intra_counters(t_commrec *cr)
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROningen Mixture of Alchemy and Childrens' Stories
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <string.h>
 +#include "gmx_fatal.h"
 +#include "main.h"
 +#include "smalloc.h"
 +#include "network.h"
 +#include "copyrite.h"
 +#include "statutil.h"
 +#include <ctype.h>
 +#include "macros.h"
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#endif
 +
 +#ifdef GMX_THREAD_MPI
 +#include "tmpi.h"
 +#endif
 +
 +
 +/* The source code in this file should be thread-safe. 
 +      Please keep it that way. */
 +
 +gmx_bool gmx_mpi_initialized(void)
 +{
 +  int n;
 +#ifndef GMX_MPI
 +  return 0;
 +#else
 +  MPI_Initialized(&n);
 +  
 +  return n;
 +#endif
 +}
 +
 +int gmx_setup(int *argc,char **argv,int *nnodes)
 +{
 +#ifndef GMX_MPI
 +  gmx_call("gmx_setup");
 +  return 0;
 +#else
 +  char   buf[256];
 +  int    resultlen;               /* actual length of node name      */
 +  int    i,flag;
 +  int  mpi_num_nodes;
 +  int  mpi_my_rank;
 +  char mpi_hostname[MPI_MAX_PROCESSOR_NAME];
 +
 +  /* Call the MPI routines */
 +#ifdef GMX_LIB_MPI
 +#ifdef GMX_FAHCORE
 +  (void) fah_MPI_Init(argc,&argv);
 +#else
 +  (void) MPI_Init(argc,&argv);
 +#endif
 +#endif
 +  (void) MPI_Comm_size( MPI_COMM_WORLD, &mpi_num_nodes );
 +  (void) MPI_Comm_rank( MPI_COMM_WORLD, &mpi_my_rank );
 +  (void) MPI_Get_processor_name( mpi_hostname, &resultlen );
 + 
 +#ifdef GMX_LIB_MPI 
 +  fprintf(stderr,"NNODES=%d, MYRANK=%d, HOSTNAME=%s\n",
 +        mpi_num_nodes,mpi_my_rank,mpi_hostname);
 +#endif
 +  
 +  *nnodes=mpi_num_nodes;
 +  
 +  return mpi_my_rank;
 +#endif
 +}
 +
 +int  gmx_node_num(void)
 +{
 +#ifndef GMX_MPI
 +  return 1;
 +#else
 +  int i;
 +  (void) MPI_Comm_size(MPI_COMM_WORLD, &i);
 +  return i;
 +#endif
 +}
 +
 +int gmx_node_rank(void)
 +{
 +#ifndef GMX_MPI
 +  return 0;
 +#else
 +  int i;
 +  (void) MPI_Comm_rank(MPI_COMM_WORLD, &i);
 +  return i;
 +#endif
 +}
 +
 +
 +int gmx_hostname_num()
 +{
 +#ifndef GMX_MPI
 +  return 0;
 +#else
 +#ifdef GMX_THREAD_MPI
 +  /* thread-MPI currently puts the thread number in the process name,
 +   * we might want to change this, as this is inconsistent with what
 +   * most MPI implementations would do when running on a single node.
 +   */
 +  return 0;
 +#else
 +  int  resultlen,hostnum,i,j;
 +  char mpi_hostname[MPI_MAX_PROCESSOR_NAME],hostnum_str[MPI_MAX_PROCESSOR_NAME];
 +
 +  MPI_Get_processor_name(mpi_hostname,&resultlen);
 +  /* This procedure can only differentiate nodes with host names
 +   * that end on unique numbers.
 +   */
 +  i = 0;
 +  j = 0;
 +  /* Only parse the host name up to the first dot */
 +  while(i < resultlen && mpi_hostname[i] != '.') {
 +    if (isdigit(mpi_hostname[i])) {
 +      hostnum_str[j++] = mpi_hostname[i];
 +    }
 +    i++;
 +  }
 +  hostnum_str[j] = '\0';
 +  if (j == 0) {
 +    hostnum = 0;
 +  } else {
 +    /* Use only the last 9 decimals, so we don't overflow an int */
 +    hostnum = strtol(hostnum_str + max(0,j-9), NULL, 10);
 +  }
 +
 +  if (debug) {
 +    fprintf(debug,"In gmx_setup_nodecomm: hostname '%s', hostnum %d\n",
 +        mpi_hostname,hostnum);
 +  }
 +  return hostnum;
 +#endif
 +#endif
 +}
 +/* Decide whether global sums should be done in two steps (within each
 + * physical node, then between nodes) and set up cr->nc accordingly.
 + *
 + * fplog: log file, may be NULL; receives a note when two-step summing
 + *        is switched on.
 + * cr:    communication record; cr->nc is filled in.
 + *
 + * nc->bUse is always reset to FALSE first. With a real MPI library the
 + * ranks of cr->mpi_comm_mygroup are split by gmx_hostname_num() into
 + * nc->comm_intra and by their intra-node rank into nc->comm_inter.
 + * With thread-MPI only nc->rank_intra is set, to cr->nodeid.
 + */
 +void gmx_setup_nodecomm(FILE *fplog,t_commrec *cr)
 +{
 +    gmx_nodecomm_t *nc;
 +    int  n,rank,hostnum,ng,ni;
 +
 +    /* Many MPI implementations do not optimize MPI_Allreduce
 +     * (and probably also other global communication calls)
 +     * for multi-core nodes connected by a network.
 +     * We can optimize such communication by using one MPI call
 +     * within each node and one between the nodes.
 +     * For MVAPICH2 and Intel MPI this reduces the time for
 +     * the global_stat communication by 25%
 +     * for 2x2-core 3 GHz Woodcrest connected by mixed DDR/SDR Infiniband.
 +     * B. Hess, November 2007
 +     */
 +
 +    nc = &cr->nc;
 +
 +    nc->bUse = FALSE;
 +#ifndef GMX_THREAD_MPI
 +#ifdef GMX_MPI
 +    MPI_Comm_size(cr->mpi_comm_mygroup,&n);
 +    MPI_Comm_rank(cr->mpi_comm_mygroup,&rank);
 +
 +    hostnum = gmx_hostname_num();
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"In gmx_setup_nodecomm: splitting communicator of size %d\n",n);
 +    }
 +
 +
 +    /* The intra-node communicator, split on node number */
 +    MPI_Comm_split(cr->mpi_comm_mygroup,hostnum,rank,&nc->comm_intra);
 +    MPI_Comm_rank(nc->comm_intra,&nc->rank_intra);
 +    if (debug)
 +    {
 +        fprintf(debug,"In gmx_setup_nodecomm: node rank %d rank_intra %d\n",
 +                rank,nc->rank_intra);
 +    }
 +    /* The inter-node communicator, split on rank_intra.
 +     * We actually only need the one for rank=0,
 +     * but it is easier to create them all.
 +     */
 +    MPI_Comm_split(cr->mpi_comm_mygroup,nc->rank_intra,rank,&nc->comm_inter);
 +    /* Check if this really created two step communication:
 +     * ng is the size of my inter-node communicator,
 +     * ni is the size of my intra-node communicator.
 +     */
 +    MPI_Comm_size(nc->comm_inter,&ng);
 +    MPI_Comm_size(nc->comm_intra,&ni);
 +    if (debug)
 +    {
 +        fprintf(debug,"In gmx_setup_nodecomm: groups %d, my group size %d\n",
 +                ng,ni);
 +    }
 +
 +    if (getenv("GMX_NO_NODECOMM") == NULL &&
 +        ((ng > 1 && ng < n) || (ni > 1 && ni < n)))
 +    {
 +        nc->bUse = TRUE;
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Using two step summing over %d groups of on average %.1f processes\n\n",
 +                    ng,(real)n/(real)ng);
 +        }
 +        if (nc->rank_intra > 0) /* only intra-node rank 0 keeps comm_inter */
 +        {
 +            MPI_Comm_free(&nc->comm_inter);
 +        }
 +    }
 +    else
 +    {
 +        /* One group or all processes in a separate group, use normal summing */
 +        MPI_Comm_free(&nc->comm_inter);
 +        MPI_Comm_free(&nc->comm_intra);
 +        if (debug)
 +        {
 +            fprintf(debug,"In gmx_setup_nodecomm: not unsing separate inter- and intra-node communicators.\n");
 +        }
 +    }
 +#endif
 +#else
 +    /* tMPI runs only on a single node so just use the nodeid */
 +    nc->rank_intra = cr->nodeid;
 +#endif
 +}
 +
-     /* counters for PP+PME and PP-only processes on my node */
-     int nnodes, nnodes_pp, id_mynode=-1, id_mynode_group=-1, nproc_mynode, nproc_mynode_pp;
++void gmx_init_intranode_counters(t_commrec *cr)
 +{
- #endif
++    /* Counts the ranks that share this rank's physical node and stores
++     * the results in cr: nrank_intranode/rank_intranode over all ranks,
++     * nrank_pp_intranode/rank_pp_intranode over the ranks with DUTY_PP.
++     *
++     * counters for PP+PME and PP-only processes on my physical node */
++    int nrank_intranode, rank_intranode;
++    int nrank_pp_intranode, rank_pp_intranode;
++    /* thread-MPI is not initialized when not running in parallel */
 +#if defined GMX_MPI && !defined GMX_THREAD_MPI
++    int nrank_world, rank_world;
 +    int i, mynum, *num, *num_s, *num_pp, *num_pp_s;
-     nnodes    = cr->nnodes;
-     nnodes_pp = nnodes - cr->npmenodes;
 +
- #if defined GMX_MPI && !defined GMX_THREAD_MPI
-     /* We have MPI and can expect to have different compute nodes */
++    MPI_Comm_size(MPI_COMM_WORLD,&nrank_world);
++    MPI_Comm_rank(MPI_COMM_WORLD,&rank_world);
 +
-     snew(num,   nnodes);
-     snew(num_s, nnodes);
-     snew(num_pp,   nnodes_pp);
-     snew(num_pp_s, nnodes_pp);
-     num_s[cr->sim_nodeid] = mynum;
-     if (cr->duty & DUTY_PP)
-     {
-         num_pp_s[cr->nodeid] = mynum;
-     }
-     MPI_Allreduce(num_s, num, nnodes, MPI_INT, MPI_SUM, cr->mpi_comm_mysim);
-     MPI_Allreduce(num_pp_s, num_pp, nnodes_pp, MPI_INT, MPI_SUM, cr->mpi_comm_mygroup);
-     id_mynode       = 0;
-     id_mynode_group = 0;
-     nproc_mynode    = 0;
-     nproc_mynode_pp = 0;
-     for(i=0; i<nnodes; i++)
++    /* Get the node number from the hostname to identify the nodes */
 +    mynum = gmx_hostname_num();
 +
 +    /* We can't rely on MPI_IN_PLACE, so we need send and receive buffers.
 +     * Each rank fills in only its own slot of the send buffers, so the
 +     * MPI_SUM reductions below act as a gather of the per-rank values.
 +     * NOTE(review): this relies on snew() zero-filling the arrays - confirm.
 +     */
-             nproc_mynode++;
-             if (i < cr->sim_nodeid)
++    snew(num,   nrank_world);
++    snew(num_s, nrank_world);
++    snew(num_pp,   nrank_world);
++    snew(num_pp_s, nrank_world);
++
++    num_s[rank_world]    = mynum;
++    num_pp_s[rank_world] = (cr->duty & DUTY_PP) ? mynum : -1;
++
++    MPI_Allreduce(num_s,    num,    nrank_world, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
++    MPI_Allreduce(num_pp_s, num_pp, nrank_world, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
++
++    nrank_intranode    = 0;
++    rank_intranode     = 0;
++    nrank_pp_intranode = 0;
++    rank_pp_intranode  = 0;
++    for(i=0; i<nrank_world; i++)
 +    {
 +        if (num[i] == mynum)
 +        {
-                 id_mynode++;
-             }
-             if (i < cr->nodeid)
-             {
-                 id_mynode_group++;
++            nrank_intranode++;
++            if (i < rank_world)
 +            {
-     }
-     for(i=0; i<nnodes_pp; i++)
-     {
-         if (num_pp[i] == mynum)
++                rank_intranode++;
 +            }
 +        }
-             nproc_mynode_pp++;
++        if ((cr->duty & DUTY_PP) && num_pp[i] == mynum) /* ranks without DUTY_PP keep both PP counters at 0 */
 +        {
-     /* Serial or thread-MPI code, we are running within a node */
-     id_mynode       = cr->sim_nodeid;
-     id_mynode_group = cr->nodeid;
-     nproc_mynode    = cr->nnodes;
-     nproc_mynode_pp = cr->nnodes - cr->npmenodes;
++            nrank_pp_intranode++;
++            if (i < rank_world)
++            {
++                rank_pp_intranode++;
++            }
 +        }
 +    }
 +    sfree(num);
 +    sfree(num_s);
 +    sfree(num_pp);
 +    sfree(num_pp_s);
 +#else
-         fprintf(debug, "On %3s node %d: nodeid_intra=%d, nodeid_group_intra=%d, "
-                 "nnodes_intra=%d, nnodes_pp_intra=%d\n", sbuf, cr->sim_nodeid,
-                 id_mynode, id_mynode_group, nproc_mynode, nproc_mynode_pp);
++    /* Serial or thread-MPI code: we run within a single physical node */
++    nrank_intranode    = cr->nnodes;
++    rank_intranode     = cr->sim_nodeid;
++    nrank_pp_intranode = cr->nnodes - cr->npmenodes;
++    rank_pp_intranode  = cr->nodeid;
 +#endif
 +
 +    if (debug)
 +    {
 +        char sbuf[STRLEN];
 +        if (cr->duty & DUTY_PP && cr->duty & DUTY_PME)
 +        {
 +            sprintf(sbuf, "PP+PME");
 +        }
 +        else
 +        {
 +            sprintf(sbuf, "%s", cr->duty & DUTY_PP ? "PP" : "PME");
 +        }
-     cr->nodeid_intra        = id_mynode;
-     cr->nodeid_group_intra  = id_mynode_group;
-     cr->nnodes_intra        = nproc_mynode;
-     cr->nnodes_pp_intra     = nproc_mynode_pp;
++        fprintf(debug, "On %3s node %d: nrank_intranode=%d, rank_intranode=%d, "
++                "nrank_pp_intranode=%d, rank_pp_intranode=%d\n",
++                sbuf, cr->sim_nodeid,
++                nrank_intranode, rank_intranode,
++                nrank_pp_intranode, rank_pp_intranode);
 +    }
 +
++    cr->nrank_intranode    = nrank_intranode;
++    cr->rank_intranode     = rank_intranode;
++    cr->nrank_pp_intranode = nrank_pp_intranode;
++    cr->rank_pp_intranode  = rank_pp_intranode;
 +}
 +
 +
 +void gmx_barrier(const t_commrec *cr)
 +{
 +#ifndef GMX_MPI
 +  gmx_call("gmx_barrier");
 +#else
 +  MPI_Barrier(cr->mpi_comm_mygroup);
 +#endif
 +}
 +
 +void gmx_abort(int noderank,int nnodes,int errorno)
 +{
 +#ifndef GMX_MPI
 +  gmx_call("gmx_abort");
 +#else
 +#ifdef GMX_THREAD_MPI
 +  fprintf(stderr,"Halting program %s\n",ShortProgram());
 +  thanx(stderr);
 +  exit(1);
 +#else
 +  if (nnodes > 1)
 +  {
 +      fprintf(stderr,"Halting parallel program %s on CPU %d out of %d\n",
 +              ShortProgram(),noderank,nnodes);
 +  }
 +  else
 +  {
 +      fprintf(stderr,"Halting program %s\n",ShortProgram());
 +  }
 +
 +  thanx(stderr);
 +  MPI_Abort(MPI_COMM_WORLD,errorno);
 +  exit(1);
 +#endif
 +#endif
 +}
 +
 +void gmx_bcast(int nbytes,void *b,const t_commrec *cr)
 +{
 +#ifndef GMX_MPI
 +  gmx_call("gmx_bast");
 +#else
 +  MPI_Bcast(b,nbytes,MPI_BYTE,MASTERRANK(cr),cr->mpi_comm_mygroup);
 +#endif
 +}
 +
 +void gmx_bcast_sim(int nbytes,void *b,const t_commrec *cr)
 +{
 +#ifndef GMX_MPI
 +  gmx_call("gmx_bast");
 +#else
 +  MPI_Bcast(b,nbytes,MPI_BYTE,MASTERRANK(cr),cr->mpi_comm_mysim);
 +#endif
 +}
 +/* Sum the nr doubles in r element-wise over all ranks of
 + * cr->mpi_comm_mygroup; r holds the totals on return.
 + *
 + * When cr->nc.bUse is set the sum is done in two steps: a sum inside
 + * cr->nc.comm_intra, an all-reduce over cr->nc.comm_inter by the ranks
 + * with rank_intra == 0, and a broadcast back over comm_intra.
 + * Without MPI_IN_PLACE the scratch buffer cr->mpb->dbuf is used and
 + * grown on demand. Without GMX_MPI only gmx_call() is invoked.
 + */
 +void gmx_sumd(int nr,double r[],const t_commrec *cr)
 +{
 +#ifndef GMX_MPI
 +    gmx_call("gmx_sumd");
 +#else
 +#if defined(MPI_IN_PLACE_EXISTS) || defined(GMX_THREAD_MPI)
 +    if (cr->nc.bUse) {
 +        if (cr->nc.rank_intra == 0)
 +        {
 +            /* Use two step summing. */
 +            MPI_Reduce(MPI_IN_PLACE,r,nr,MPI_DOUBLE,MPI_SUM,0,
 +                       cr->nc.comm_intra);
 +            /* Sum the roots of the internal (intra) buffers. */
 +            MPI_Allreduce(MPI_IN_PLACE,r,nr,MPI_DOUBLE,MPI_SUM,
 +                          cr->nc.comm_inter);
 +        }
 +        else
 +        {
 +            /* This is here because of the silly MPI specification
 +                that MPI_IN_PLACE should be put in sendbuf instead of recvbuf */
 +            MPI_Reduce(r,NULL,nr,MPI_DOUBLE,MPI_SUM,0,cr->nc.comm_intra);
 +        }
 +        MPI_Bcast(r,nr,MPI_DOUBLE,0,cr->nc.comm_intra);
 +    } 
 +    else 
 +    {
 +        MPI_Allreduce(MPI_IN_PLACE,r,nr,MPI_DOUBLE,MPI_SUM, 
 +                      cr->mpi_comm_mygroup);
 +    }
 +#else
 +    int i;
 +
 +    if (nr > cr->mpb->dbuf_alloc) {
 +        cr->mpb->dbuf_alloc = nr;
 +        srenew(cr->mpb->dbuf,cr->mpb->dbuf_alloc);
 +    }
 +    if (cr->nc.bUse) {
 +        /* Use two step summing */
 +        MPI_Allreduce(r,cr->mpb->dbuf,nr,MPI_DOUBLE,MPI_SUM,cr->nc.comm_intra);
 +        if (cr->nc.rank_intra == 0) {
 +            /* Sum with the buffers reversed */
 +            MPI_Allreduce(cr->mpb->dbuf,r,nr,MPI_DOUBLE,MPI_SUM, 
 +                          cr->nc.comm_inter);
 +        }
 +        MPI_Bcast(r,nr,MPI_DOUBLE,0,cr->nc.comm_intra);
 +    } else {
 +        MPI_Allreduce(r,cr->mpb->dbuf,nr,MPI_DOUBLE,MPI_SUM,
 +                      cr->mpi_comm_mygroup);
 +        for(i=0; i<nr; i++)
 +            r[i] = cr->mpb->dbuf[i];
 +    }
 +#endif
 +#endif
 +}
 +/* Sum the nr floats in r element-wise over all ranks of
 + * cr->mpi_comm_mygroup; r holds the totals on return.
 + *
 + * When cr->nc.bUse is set the sum is done in two steps: a sum inside
 + * cr->nc.comm_intra, an all-reduce over cr->nc.comm_inter by the ranks
 + * with rank_intra == 0, and a broadcast back over comm_intra.
 + * Without MPI_IN_PLACE the scratch buffer cr->mpb->fbuf is used and
 + * grown on demand. Without GMX_MPI only gmx_call() is invoked.
 + */
 +void gmx_sumf(int nr,float r[],const t_commrec *cr)
 +{
 +#ifndef GMX_MPI
 +    gmx_call("gmx_sumf");
 +#else
 +#if defined(MPI_IN_PLACE_EXISTS) || defined(GMX_THREAD_MPI)
 +    if (cr->nc.bUse) {
 +        /* Use two step summing.  */
 +        if (cr->nc.rank_intra == 0)
 +        {
 +            MPI_Reduce(MPI_IN_PLACE,r,nr,MPI_FLOAT,MPI_SUM,0,
 +                       cr->nc.comm_intra);
 +            /* Sum the roots of the internal (intra) buffers */
 +            MPI_Allreduce(MPI_IN_PLACE,r,nr,MPI_FLOAT,MPI_SUM,
 +                          cr->nc.comm_inter);
 +        }
 +        else
 +        {
 +            /* This is here because of the silly MPI specification
 +                that MPI_IN_PLACE should be put in sendbuf instead of recvbuf */
 +            MPI_Reduce(r,NULL,nr,MPI_FLOAT,MPI_SUM,0,cr->nc.comm_intra);
 +        }
 +        MPI_Bcast(r,nr,MPI_FLOAT,0,cr->nc.comm_intra);
 +    } 
 +    else 
 +    {
 +        MPI_Allreduce(MPI_IN_PLACE,r,nr,MPI_FLOAT,MPI_SUM,cr->mpi_comm_mygroup);
 +    }
 +#else
 +    int i;
 +
 +    if (nr > cr->mpb->fbuf_alloc) {
 +        cr->mpb->fbuf_alloc = nr;
 +        srenew(cr->mpb->fbuf,cr->mpb->fbuf_alloc);
 +    }
 +    if (cr->nc.bUse) {
 +        /* Use two step summing */
 +        MPI_Allreduce(r,cr->mpb->fbuf,nr,MPI_FLOAT,MPI_SUM,cr->nc.comm_intra);
 +        if (cr->nc.rank_intra == 0) {
 +            /* Sum with the buffers reversed */
 +            MPI_Allreduce(cr->mpb->fbuf,r,nr,MPI_FLOAT,MPI_SUM, 
 +                          cr->nc.comm_inter);
 +        }
 +        MPI_Bcast(r,nr,MPI_FLOAT,0,cr->nc.comm_intra);
 +    } else {
 +        MPI_Allreduce(r,cr->mpb->fbuf,nr,MPI_FLOAT,MPI_SUM,
 +                      cr->mpi_comm_mygroup);
 +        for(i=0; i<nr; i++)
 +            r[i] = cr->mpb->fbuf[i];
 +    }
 +#endif
 +#endif
 +}
 +/* Sum the nr ints in r element-wise over all ranks of
 + * cr->mpi_comm_mygroup; r holds the totals on return.
 + *
 + * When cr->nc.bUse is set the sum is done in two steps: a sum inside
 + * cr->nc.comm_intra, an all-reduce over cr->nc.comm_inter by the ranks
 + * with rank_intra == 0, and a broadcast back over comm_intra.
 + * Without MPI_IN_PLACE the scratch buffer cr->mpb->ibuf is used and
 + * grown on demand. Without GMX_MPI only gmx_call() is invoked.
 + */
 +void gmx_sumi(int nr,int r[],const t_commrec *cr)
 +{
 +#ifndef GMX_MPI
 +    gmx_call("gmx_sumi");
 +#else
 +#if defined(MPI_IN_PLACE_EXISTS) || defined(GMX_THREAD_MPI)
 +    if (cr->nc.bUse) {
 +        /* Use two step summing */
 +        if (cr->nc.rank_intra == 0) 
 +        {
 +            MPI_Reduce(MPI_IN_PLACE,r,nr,MPI_INT,MPI_SUM,0,cr->nc.comm_intra);
 +            /* Sum the roots of the internal (intra) buffers */
 +            MPI_Allreduce(MPI_IN_PLACE,r,nr,MPI_INT,MPI_SUM,cr->nc.comm_inter);
 +        }
 +        else
 +        {
 +            /* This is here because of the silly MPI specification
 +                that MPI_IN_PLACE should be put in sendbuf instead of recvbuf */
 +            MPI_Reduce(r,NULL,nr,MPI_INT,MPI_SUM,0,cr->nc.comm_intra);
 +        }
 +        MPI_Bcast(r,nr,MPI_INT,0,cr->nc.comm_intra);
 +    } 
 +    else 
 +    {
 +        MPI_Allreduce(MPI_IN_PLACE,r,nr,MPI_INT,MPI_SUM,cr->mpi_comm_mygroup);
 +    }
 +#else
 +    int i;
 +
 +    if (nr > cr->mpb->ibuf_alloc) {
 +        cr->mpb->ibuf_alloc = nr;
 +        srenew(cr->mpb->ibuf,cr->mpb->ibuf_alloc);
 +    }
 +    if (cr->nc.bUse) {
 +        /* Use two step summing */
 +        MPI_Allreduce(r,cr->mpb->ibuf,nr,MPI_INT,MPI_SUM,cr->nc.comm_intra);
 +        if (cr->nc.rank_intra == 0) {
 +            /* Sum with the buffers reversed */
 +            MPI_Allreduce(cr->mpb->ibuf,r,nr,MPI_INT,MPI_SUM,cr->nc.comm_inter);
 +        }
 +        MPI_Bcast(r,nr,MPI_INT,0,cr->nc.comm_intra);
 +    } else {
 +        MPI_Allreduce(r,cr->mpb->ibuf,nr,MPI_INT,MPI_SUM,cr->mpi_comm_mygroup);
 +        for(i=0; i<nr; i++)
 +            r[i] = cr->mpb->ibuf[i];
 +    }
 +#endif
 +#endif
 +}
 +/* Sum the nr gmx_large_int_t values in r element-wise over all ranks
 + * of cr->mpi_comm_mygroup; r holds the totals on return.
 + *
 + * When cr->nc.bUse is set the sum is done in two steps: a sum inside
 + * cr->nc.comm_intra, an all-reduce over cr->nc.comm_inter by the ranks
 + * with rank_intra == 0, and a broadcast back over comm_intra.
 + * Without MPI_IN_PLACE the scratch buffer cr->mpb->libuf is used and
 + * grown on demand. Without GMX_MPI only gmx_call() is invoked.
 + */
 +void gmx_sumli(int nr,gmx_large_int_t r[],const t_commrec *cr)
 +{
 +#ifndef GMX_MPI
 +    gmx_call("gmx_sumli");
 +#else
 +#if defined(MPI_IN_PLACE_EXISTS) || defined(GMX_THREAD_MPI)
 +    if (cr->nc.bUse) {
 +        /* Use two step summing */
 +        if (cr->nc.rank_intra == 0) 
 +        {
 +            MPI_Reduce(MPI_IN_PLACE,r,nr,GMX_MPI_LARGE_INT,MPI_SUM,0,
 +                       cr->nc.comm_intra);
 +            /* Sum the roots of the internal (intra) buffers */
 +            MPI_Allreduce(MPI_IN_PLACE,r,nr,GMX_MPI_LARGE_INT,MPI_SUM,
 +                          cr->nc.comm_inter);
 +        }
 +        else
 +        {
 +            /* This is here because of the silly MPI specification
 +                that MPI_IN_PLACE should be put in sendbuf instead of recvbuf */
 +            MPI_Reduce(r,NULL,nr,GMX_MPI_LARGE_INT,MPI_SUM,0,cr->nc.comm_intra);
 +        }
 +        MPI_Bcast(r,nr,GMX_MPI_LARGE_INT,0,cr->nc.comm_intra);
 +    } 
 +    else 
 +    {
 +        MPI_Allreduce(MPI_IN_PLACE,r,nr,GMX_MPI_LARGE_INT,MPI_SUM,cr->mpi_comm_mygroup);
 +    }
 +#else
 +    int i;
 +
 +    if (nr > cr->mpb->libuf_alloc) {
 +        cr->mpb->libuf_alloc = nr;
 +        srenew(cr->mpb->libuf,cr->mpb->libuf_alloc);
 +    }
 +    if (cr->nc.bUse) {
 +        /* Use two step summing */
 +        MPI_Allreduce(r,cr->mpb->libuf,nr,GMX_MPI_LARGE_INT,MPI_SUM,
 +                      cr->nc.comm_intra);
 +        if (cr->nc.rank_intra == 0) {
 +            /* Sum with the buffers reversed */
 +            MPI_Allreduce(cr->mpb->libuf,r,nr,GMX_MPI_LARGE_INT,MPI_SUM,
 +                          cr->nc.comm_inter);
 +        }
 +        MPI_Bcast(r,nr,GMX_MPI_LARGE_INT,0,cr->nc.comm_intra);
 +    } else {
 +        MPI_Allreduce(r,cr->mpb->libuf,nr,GMX_MPI_LARGE_INT,MPI_SUM,
 +                      cr->mpi_comm_mygroup);
 +        for(i=0; i<nr; i++)
 +            r[i] = cr->mpb->libuf[i];
 +    }
 +#endif
 +#endif
 +}
 +
 +
 +
 +#ifdef GMX_MPI
 +void gmx_sumd_comm(int nr,double r[],MPI_Comm mpi_comm)
 +{
 +#if defined(MPI_IN_PLACE_EXISTS) || defined(GMX_THREAD_MPI)
 +    MPI_Allreduce(MPI_IN_PLACE,r,nr,MPI_DOUBLE,MPI_SUM,mpi_comm);
 +#else
 +    /* this function is only used in code that is not performance critical,
 +       (during setup, when comm_rec is not the appropriate communication  
 +       structure), so this isn't as bad as it looks. */
 +    double *buf;
 +    int i;
 +
 +    snew(buf, nr);
 +    MPI_Allreduce(r,buf,nr,MPI_DOUBLE,MPI_SUM,mpi_comm);
 +    for(i=0; i<nr; i++)
 +        r[i] = buf[i];
 +    sfree(buf);
 +#endif
 +}
 +#endif
 +
 +#ifdef GMX_MPI
 +void gmx_sumf_comm(int nr,float r[],MPI_Comm mpi_comm)
 +{
 +#if defined(MPI_IN_PLACE_EXISTS) || defined(GMX_THREAD_MPI)
 +    MPI_Allreduce(MPI_IN_PLACE,r,nr,MPI_FLOAT,MPI_SUM,mpi_comm);
 +#else
 +    /* this function is only used in code that is not performance critical,
 +       (during setup, when comm_rec is not the appropriate communication  
 +       structure), so this isn't as bad as it looks. */
 +    float *buf;
 +    int i;
 +
 +    snew(buf, nr);
 +    MPI_Allreduce(r,buf,nr,MPI_FLOAT,MPI_SUM,mpi_comm);
 +    for(i=0; i<nr; i++)
 +        r[i] = buf[i];
 +    sfree(buf);
 +#endif
 +}
 +#endif
 +
 +void gmx_sumd_sim(int nr,double r[],const gmx_multisim_t *ms)
 +{
 +#ifndef GMX_MPI
 +  gmx_call("gmx_sumd_sim");
 +#else
 +  gmx_sumd_comm(nr,r,ms->mpi_comm_masters);
 +#endif
 +}
 +
 +void gmx_sumf_sim(int nr,float r[],const gmx_multisim_t *ms)
 +{
 +#ifndef GMX_MPI
 +  gmx_call("gmx_sumf_sim");
 +#else
 +  gmx_sumf_comm(nr,r,ms->mpi_comm_masters);
 +#endif
 +}
 +
 +void gmx_sumi_sim(int nr,int r[], const gmx_multisim_t *ms)
 +{
 +#ifndef GMX_MPI
 +    gmx_call("gmx_sumi_sim");
 +#else
 +#if defined(MPI_IN_PLACE_EXISTS) || defined(GMX_THREAD_MPI)
 +    MPI_Allreduce(MPI_IN_PLACE,r,nr,MPI_INT,MPI_SUM,ms->mpi_comm_masters);
 +#else
 +    /* this is thread-unsafe, but it will do for now: */
 +    int i;
 +
 +    if (nr > ms->mpb->ibuf_alloc) {
 +        ms->mpb->ibuf_alloc = nr;
 +        srenew(ms->mpb->ibuf,ms->mpb->ibuf_alloc);
 +    }
 +    MPI_Allreduce(r,ms->mpb->ibuf,nr,MPI_INT,MPI_SUM,ms->mpi_comm_masters);
 +    for(i=0; i<nr; i++)
 +        r[i] = ms->mpb->ibuf[i];
 +#endif
 +#endif
 +}
 +
 +void gmx_sumli_sim(int nr,gmx_large_int_t r[], const gmx_multisim_t *ms)
 +{
 +#ifndef GMX_MPI
 +    gmx_call("gmx_sumli_sim");
 +#else
 +#if defined(MPI_IN_PLACE_EXISTS) || defined(GMX_THREAD_MPI)
 +    MPI_Allreduce(MPI_IN_PLACE,r,nr,GMX_MPI_LARGE_INT,MPI_SUM,
 +                  ms->mpi_comm_masters);
 +#else
 +    /* this is thread-unsafe, but it will do for now: */
 +    int i;
 +
 +    if (nr > ms->mpb->libuf_alloc) {
 +        ms->mpb->libuf_alloc = nr;
 +        srenew(ms->mpb->libuf,ms->mpb->libuf_alloc);
 +    }
 +    MPI_Allreduce(r,ms->mpb->libuf,nr,GMX_MPI_LARGE_INT,MPI_SUM,
 +                  ms->mpi_comm_masters);
 +    for(i=0; i<nr; i++)
 +        r[i] = ms->mpb->libuf[i];
 +#endif
 +#endif
 +}
 +
 +
 +void gmx_finalize_par(void)
 +{
 +#ifndef GMX_MPI
 +    /* Compiled without MPI, no MPI finalizing needed */
 +    return;
 +#else
 +    int initialized,finalized;
 +    int ret;
 +
 +    MPI_Initialized(&initialized);
 +    if (!initialized)
 +    {
 +        return;
 +    }
 +    /* just as a check; we don't want to finalize twice */
 +    MPI_Finalized(&finalized);
 +    if (finalized)
 +    {
 +      return;
 +    }
 +
 +  /* We sync the processes here to try to avoid problems
 +   * with buggy MPI implementations that could cause
 +   * unfinished processes to terminate.
 +   */
 +  MPI_Barrier(MPI_COMM_WORLD);
 +
 +  /*
 +  if (DOMAINDECOMP(cr)) {
 +    if (cr->npmenodes > 0 || cr->dd->bCartesian) 
 +      MPI_Comm_free(&cr->mpi_comm_mygroup);
 +    if (cr->dd->bCartesian)
 +      MPI_Comm_free(&cr->mpi_comm_mysim);
 +  }
 +  */
 +
 +  /* Apparently certain mpich implementations cause problems
 +   * with MPI_Finalize. In that case comment out MPI_Finalize.
 +   */
 +  if (debug)
 +    fprintf(debug,"Will call MPI_Finalize now\n");
 +
 +  ret = MPI_Finalize();
 +  if (debug)
 +    fprintf(debug,"Return code from MPI_Finalize = %d\n",ret);
 +#endif
 +}
 +
index 205f56043f94676ec066f7927ba7d6d0f21fd0e3,0000000000000000000000000000000000000000..ad43839a795fe4aa2af3697890300c77fce80479
mode 100644,000000..100644
--- /dev/null
@@@ -1,58 -1,0 +1,54 @@@
- gmx_nb_generic_adress_kernel(t_nblist *           nlist,
-                        t_forcerec *         fr,
-                        t_mdatoms *          mdatoms,
-                        real *               x,
-                        real *               f,
-                        real *               fshift,
-                        real *               Vc,
-                        real *               Vvdw,
-                        real                 tabscale,  
-                        real *               VFtab,
-                        int *                outeriter,
-                        int *                inneriter,
-                          gmx_bool                 bCG);
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
++ * Copyright (c) 2011 Christoph Junghans, Sebastian Fritsch
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 4.0.5
 + * Written by Christoph Junghans, Brad Lambeth, and possibly others.
 + * Copyright (c) 2009 Christoph Junghans, Brad Lambeth.
 + * All rights reserved.
 + 
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +
 +#ifndef _nb_generic_adress_h_
 +#define _nb_generic_adress_h_
 +
++#include "nb_kernel.h"
 +#include "types/simple.h"
 +#include "typedefs.h"
 +/* Prototype of gmx_nb_generic_adress_kernel().
 + * NOTE(review): the parameter roles are not visible from this header.
 + * Presumably nlist is the neighbour list to process, xx and ff are the
 + * coordinate and force arrays, and kernel_data/nrnb carry the remaining
 + * kernel input/output and the flop accounting - confirm against the
 + * definition before relying on this.
 + */
 +void
++gmx_nb_generic_adress_kernel(t_nblist *                nlist,
++                      rvec *                    xx,
++                      rvec *                    ff,
++                      t_forcerec *              fr,
++                      t_mdatoms *               mdatoms,
++                      nb_kernel_data_t *        kernel_data,
++                      t_nrnb *                  nrnb);
 +
 +#endif
 +
index 0000000000000000000000000000000000000000,8232ba0ea21fbd392d383060dc3e768db38321be..8232ba0ea21fbd392d383060dc3e768db38321be
mode 000000,100755..100755
--- /dev/null
index 0000000000000000000000000000000000000000,6a2a501de398e10b5911f7acee721d6d916b9920..6a2a501de398e10b5911f7acee721d6d916b9920
mode 000000,100644..100644
--- /dev/null
index 939cab3c67fad16359c0da077ed2296121104970,0000000000000000000000000000000000000000..b7c1ef046f68dd5ca7d8e883eca7c3540431d785
mode 100644,000000..100644
--- /dev/null
@@@ -1,667 -1,0 +1,674 @@@
-     if(nl->free_energy)
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROningen Mixture of Alchemy and Childrens' Stories
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#ifdef GMX_THREAD_MPI
 +#include <thread_mpi.h>
 +#endif
 +
 +#include <stdio.h>
 +#include <stdlib.h>
 +#include "typedefs.h"
 +#include "txtdump.h"
 +#include "smalloc.h"
 +#include "ns.h"
 +#include "vec.h"
 +#include "maths.h"
 +#include "macros.h"
 +#include "string2.h"
 +#include "force.h"
 +#include "names.h"
 +#include "main.h"
 +#include "xvgr.h"
 +#include "gmx_fatal.h"
 +#include "physics.h"
 +#include "force.h"
 +#include "bondf.h"
 +#include "nrnb.h"
 +#include "smalloc.h"
 +#include "nonbonded.h"
 +
 +#include "nb_kernel.h"
 +#include "nb_free_energy.h"
 +#include "nb_generic.h"
 +#include "nb_generic_cg.h"
 +#include "nb_generic_adress.h"
 +
 +/* Different default (c) and accelerated interaction-specific kernels */
 +#include "nb_kernel_c/nb_kernel_c.h"
 +
 +#if (defined GMX_CPU_ACCELERATION_X86_SSE2) && !(defined GMX_DOUBLE)
 +#    include "nb_kernel_sse2_single/nb_kernel_sse2_single.h"
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_SSE4_1) && !(defined GMX_DOUBLE)
 +#    include "nb_kernel_sse4_1_single/nb_kernel_sse4_1_single.h"
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_AVX_128_FMA) && !(defined GMX_DOUBLE)
 +#    include "nb_kernel_avx_128_fma_single/nb_kernel_avx_128_fma_single.h"
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_AVX_256) && !(defined GMX_DOUBLE)
 +#    include "nb_kernel_avx_256_single/nb_kernel_avx_256_single.h"
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_SSE2 && defined GMX_DOUBLE)
 +#    include "nb_kernel_sse2_double/nb_kernel_sse2_double.h"
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_SSE4_1 && defined GMX_DOUBLE)
 +#    include "nb_kernel_sse4_1_double/nb_kernel_sse4_1_double.h"
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_AVX_128_FMA && defined GMX_DOUBLE)
 +#    include "nb_kernel_avx_128_fma_double/nb_kernel_avx_128_fma_double.h"
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_AVX_256 && defined GMX_DOUBLE)
 +#    include "nb_kernel_avx_256_double/nb_kernel_avx_256_double.h"
 +#endif
 +
 +/* File-scope state for the one-time kernel registration performed by
 + * gmx_nonbonded_setup(). In thread-MPI builds the mutex is held around the
 + * check-and-set of nonbonded_setup_done; the flag is set to TRUE once the
 + * kernel lists have been added and the lookup hash has been initialised.
 + */
 +#ifdef GMX_THREAD_MPI
 +static tMPI_Thread_mutex_t nonbonded_setup_mutex = TMPI_THREAD_MUTEX_INITIALIZER;
 +#endif
 +static gmx_bool            nonbonded_setup_done  = FALSE;
 +
 +/* Register the available nonbonded kernels and build the lookup hash.
 + *
 + * The work is done at most once: it is skipped when nonbonded_setup_done is
 + * already TRUE, and in thread-MPI builds the whole check-and-set runs with
 + * nonbonded_setup_mutex held.
 + *
 + * fplog               Not referenced in this function.
 + * fr                  May be NULL. The accelerated kernel lists are skipped
 + *                     only when fr is non-NULL and fr->use_cpu_acceleration
 + *                     is FALSE.
 + * bGenericKernelOnly  When not FALSE, no kernel lists are added at all;
 + *                     nb_kernel_list_hash_init() is still called.
 + *
 + * Which accelerated lists are added is decided at compile time by the
 + * GMX_CPU_ACCELERATION_X86_* and GMX_DOUBLE macros.
 + */
 +void
 +gmx_nonbonded_setup(FILE *         fplog,
 +                    t_forcerec *   fr,
 +                    gmx_bool       bGenericKernelOnly)
 +{
 +#ifdef GMX_THREAD_MPI
 +    tMPI_Thread_mutex_lock(&nonbonded_setup_mutex);
 +#endif
 +    /* Here we are guaranteed only one thread made it. */
 +    if(nonbonded_setup_done==FALSE)
 +    {
 +        if(bGenericKernelOnly==FALSE)
 +        {
 +            /* Add the plain C kernels to the structure stored statically in nb_kernel.c */
 +            nb_kernel_list_add_kernels(kernellist_c,kernellist_c_size);
 +            
 +            if(!(fr!=NULL && fr->use_cpu_acceleration==FALSE))
 +            {
 +                /* Add interaction-specific kernels for different architectures */
 +                /* Single precision */
 +#if (defined GMX_CPU_ACCELERATION_X86_SSE2) && !(defined GMX_DOUBLE)
 +                nb_kernel_list_add_kernels(kernellist_sse2_single,kernellist_sse2_single_size);
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_SSE4_1) && !(defined GMX_DOUBLE)
 +                nb_kernel_list_add_kernels(kernellist_sse4_1_single,kernellist_sse4_1_single_size);
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_AVX_128_FMA) && !(defined GMX_DOUBLE)
 +                nb_kernel_list_add_kernels(kernellist_avx_128_fma_single,kernellist_avx_128_fma_single_size);
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_AVX_256) && !(defined GMX_DOUBLE)
 +                nb_kernel_list_add_kernels(kernellist_avx_256_single,kernellist_avx_256_single_size);
 +#endif
 +                /* Double precision */
 +#if (defined GMX_CPU_ACCELERATION_X86_SSE2 && defined GMX_DOUBLE)
 +                nb_kernel_list_add_kernels(kernellist_sse2_double,kernellist_sse2_double_size);
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_SSE4_1 && defined GMX_DOUBLE)
 +                nb_kernel_list_add_kernels(kernellist_sse4_1_double,kernellist_sse4_1_double_size);
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_AVX_128_FMA && defined GMX_DOUBLE)
 +                nb_kernel_list_add_kernels(kernellist_avx_128_fma_double,kernellist_avx_128_fma_double_size);
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_AVX_256 && defined GMX_DOUBLE)
 +                nb_kernel_list_add_kernels(kernellist_avx_256_double,kernellist_avx_256_double_size);
 +#endif
 +                ; /* empty statement to avoid a completely empty block */
 +            }
 +        }
 +        /* Create a hash for faster lookups */
 +        nb_kernel_list_hash_init();
 +
 +        nonbonded_setup_done=TRUE;
 +    }
 +#ifdef GMX_THREAD_MPI
 +    tMPI_Thread_mutex_unlock(&nonbonded_setup_mutex);
 +#endif
 +}
 +
 +
 +/* Choose the kernels for one neighbor list.
 + *
 + * Sets nl->kernelptr_vf (potential and force), nl->kernelptr_f (force only)
 + * and nl->simd_padding_width. Selection order:
 + *   1. nl->type == GMX_NBLIST_INTERACTION_ADRESS      -> AdResS generic kernel
 + *   2. nl->type == GMX_NBLIST_INTERACTION_FREE_ENERGY -> free-energy kernel
 + *   3. geometry name matches "CG-CG"                  -> generic CG kernel
 + *   4. otherwise a lookup by name over arch_and_padding[], in array order,
 + *      with the generic kernel as fallback when no "PotentialAndForce"
 + *      kernel is found.
 + * Cases 1-3 and the fallback use a padding width of 1.
 + *
 + * log  Passed to nb_kernel_list_findkernel() for the primary lookups.
 + * nl   Neighbor list to update; its ielec, ielecmod, ivdw, ivdwmod, igeometry
 + *      and type fields are read.
 + *
 + * Calls gmx_nonbonded_setup(NULL,NULL,FALSE) first if setup has not run yet.
 + */
 +void
 +gmx_nonbonded_set_kernel_pointers(FILE *log, t_nblist *nl)
 +{
 +    const char *     elec;
 +    const char *     elec_mod;
 +    const char *     vdw;
 +    const char *     vdw_mod;
 +    const char *     geom;
 +    const char *     other;
 +    const char *     vf;
 +
 +    struct
 +    {
 +        const char *  arch;
 +        int           simd_padding_width;
 +    }
 +    arch_and_padding[] =
 +    {
 +        /* Single precision */
 +#if (defined GMX_CPU_ACCELERATION_X86_AVX_256) && !(defined GMX_DOUBLE)
 +        { "avx_256_single", 8 },
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_AVX_128_FMA) && !(defined GMX_DOUBLE)
 +        { "avx_128_fma_single", 4 },
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_SSE4_1) && !(defined GMX_DOUBLE)
 +        { "sse4_1_single", 4 },
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_SSE2) && !(defined GMX_DOUBLE)
 +        { "sse2_single", 4 },
 +#endif
 +        /* Double precision */
 +#if (defined GMX_CPU_ACCELERATION_X86_AVX_256 && defined GMX_DOUBLE)
 +        { "avx_256_double", 4 },
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_AVX_128_FMA && defined GMX_DOUBLE)
 +        /* Sic. Double precision 2-way SIMD does not require neighbor list padding,
 +         * since the kernels execute a loop unrolled a factor 2, followed by
 +         * a possible single odd-element epilogue.
 +         */
 +        { "avx_128_fma_double", 1 },
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_SSE2 && defined GMX_DOUBLE)
 +        /* No padding - see comment above */
 +        { "sse2_double", 1 },
 +#endif
 +#if (defined GMX_CPU_ACCELERATION_X86_SSE4_1 && defined GMX_DOUBLE)
 +        /* No padding - see comment above */
 +        { "sse4_1_double", 1 },
 +#endif
 +        { "c", 1 },
 +    };
 +    int              narch = asize(arch_and_padding);
 +    int              i;
 +
 +    if(nonbonded_setup_done==FALSE)
 +    {
 +        /* We typically call this setup routine before starting timers,
 +         * but if that has not been done for whatever reason we do it now.
 +         */
 +        gmx_nonbonded_setup(NULL,NULL,FALSE);
 +    }
 +
 +    /* The "other" kernel attribute is not used yet; it is always passed
 +     * to the lookup as an empty string.
 +     */
 +    other="";
 +
 +    nl->kernelptr_vf = NULL;
 +    nl->kernelptr_v  = NULL;
 +    nl->kernelptr_f  = NULL;
 +
 +    elec     = gmx_nbkernel_elec_names[nl->ielec];
 +    elec_mod = eintmod_names[nl->ielecmod];
 +    vdw      = gmx_nbkernel_vdw_names[nl->ivdw];
 +    vdw_mod  = eintmod_names[nl->ivdwmod];
 +    geom     = gmx_nblist_geometry_names[nl->igeometry];
 +
-                     if(nlist[i].free_energy==0 && (flags & GMX_NONBONDED_DO_FOREIGNLAMBDA))
++    if(nl->type==GMX_NBLIST_INTERACTION_ADRESS){
++        nl->kernelptr_vf = gmx_nb_generic_adress_kernel;
++        nl->kernelptr_f = gmx_nb_generic_adress_kernel;
++        nl->simd_padding_width = 1;
++        return;
++    }
++
++    if(nl->type==GMX_NBLIST_INTERACTION_FREE_ENERGY)
 +    {
 +        nl->kernelptr_vf = gmx_nb_free_energy_kernel;
 +        nl->kernelptr_f  = gmx_nb_free_energy_kernel;
 +        nl->simd_padding_width = 1;
 +    }
 +    else if(!gmx_strcasecmp_min(geom,"CG-CG"))
 +    {
 +        nl->kernelptr_vf = gmx_nb_generic_cg_kernel;
 +        nl->kernelptr_f  = gmx_nb_generic_cg_kernel;
 +        nl->simd_padding_width = 1;
 +    }
 +    else
 +    {
 +        /* Try to find a specific kernel first.
 +         *
 +         * Both loops stop at the first architecture that yields a kernel.
 +         * Note that simd_padding_width is assigned on every iteration, also
 +         * when the lookup returned NULL, and that the second loop overwrites
 +         * the value left by the first one.
 +         */
 +
 +        for(i=0;i<narch && nl->kernelptr_vf==NULL ;i++)
 +        {
 +            nl->kernelptr_vf = nb_kernel_list_findkernel(log,arch_and_padding[i].arch,elec,elec_mod,vdw,vdw_mod,geom,other,"PotentialAndForce");
 +            nl->simd_padding_width = arch_and_padding[i].simd_padding_width;
 +        }
 +        for(i=0;i<narch && nl->kernelptr_f==NULL ;i++)
 +        {
 +            nl->kernelptr_f = nb_kernel_list_findkernel(log,arch_and_padding[i].arch,elec,elec_mod,vdw,vdw_mod,geom,other,"Force");
 +            nl->simd_padding_width = arch_and_padding[i].simd_padding_width;
 +
 +            /* If there is no force-only optimized kernel, is there a potential & force one?
 +             * This lookup passes NULL instead of log.
 +             */
 +            if(nl->kernelptr_f == NULL)
 +            {
 +                nl->kernelptr_f  = nb_kernel_list_findkernel(NULL,arch_and_padding[i].arch,elec,elec_mod,vdw,vdw_mod,geom,other,"PotentialAndForce");
 +                nl->simd_padding_width = arch_and_padding[i].simd_padding_width;
 +            }
 +        }
 +        
 +        /* Give up, pick a generic one instead. Only kernelptr_vf is tested;
 +         * both pointers and the padding width are then replaced.
 +         */
 +        if(nl->kernelptr_vf==NULL)
 +        {
 +            nl->kernelptr_vf = gmx_nb_generic_kernel;
 +            nl->kernelptr_f  = gmx_nb_generic_kernel;
 +            nl->simd_padding_width = 1;
 +            if(debug)
 +            {
 +                fprintf(debug,
 +                        "WARNING - Slow generic NB kernel used for neighborlist with\n"
 +                        "    Elec: '%s', Modifier: '%s'\n"
 +                        "    Vdw:  '%s', Modifier: '%s'\n"
 +                        "    Geom: '%s', Other: '%s'\n\n",
 +                        elec,elec_mod,vdw,vdw_mod,geom,other);
 +            }
 +        }
 +    }
 +
 +    return;
 +}
 +/* Run the selected nonbonded kernels over the neighbor lists in fr->nblists.
 + *
 + * Returns immediately, doing nothing, when fr->bAllvsAll is set.
 + *
 + * x             Passed unchanged to every kernel.
 + * f_shortrange  Force array handed to kernels for the short-range lists.
 + * f_longrange   Force array handed to kernels for the long-range lists.
 + * grppener      Source of the energy-group arrays stored in kernel_data
 + *               (egCOULSR/egCOULLR, egLJSR/egLJLR or the egBHAM* variants
 + *               when fr->bBHAM is set, and egGB).
 + * excl, lambda, dvdl, flags
 + *               Copied into the kernel_data structure given to each kernel.
 + * nls           >= 0 selects the single fr->nblists[nls]; otherwise all
 + *               fr->nnblists entries are processed.
 + * eNL           >= 0 selects the single list index eNL; otherwise indices
 + *               0 .. eNL_NR-1 are processed.
 + * flags         GMX_NONBONDED_DO_SR / GMX_NONBONDED_DO_LR enable the short-
 + *               and long-range pass; GMX_NONBONDED_DO_POTENTIAL selects
 + *               kernelptr_vf instead of kernelptr_f;
 + *               GMX_NONBONDED_DO_FOREIGNLAMBDA skips every list whose type
 + *               is not GMX_NBLIST_INTERACTION_FREE_ENERGY.
 + *
 + * cr and box_size are not referenced in this function. Lists with nri == 0
 + * are skipped.
 + */
 +void do_nonbonded(t_commrec *cr,t_forcerec *fr,
 +                  rvec x[],rvec f_shortrange[],rvec f_longrange[],t_mdatoms *mdatoms,t_blocka *excl,
 +                  gmx_grppairener_t *grppener,rvec box_size,
 +                  t_nrnb *nrnb,real *lambda, real *dvdl,
 +                  int nls,int eNL,int flags)
 +{
 +      t_nblist *        nlist;
 +      int               n,n0,n1,i,i0,i1,sz,range;
 +      t_nblists *       nblists;
 +    nb_kernel_data_t  kernel_data;
 +    nb_kernel_t *     kernelptr=NULL;
 +    rvec *            f;
 +    
 +    kernel_data.flags                   = flags;
 +    kernel_data.exclusions              = excl;
 +    kernel_data.lambda                  = lambda;
 +    kernel_data.dvdl                    = dvdl;
 +        
 +    if(fr->bAllvsAll)
 +    {
 +        return;
 +    }
 +      
 +    if (eNL >= 0)
 +    {
 +              i0 = eNL;
 +              i1 = i0+1;
 +    }
 +    else
 +    {
 +              i0 = 0;
 +              i1 = eNL_NR;
 +      }
 +      
 +      if (nls >= 0)
 +      {
 +              n0 = nls;
 +              n1 = nls+1;
 +      }
 +      else
 +      {
 +              n0 = 0;
 +              n1 = fr->nnblists;
 +      }
 +
 +      for(n=n0; (n<n1); n++)
 +      {
 +              nblists = &fr->nblists[n];
 +
 +        kernel_data.table_elec              = &nblists->table_elec;
 +        kernel_data.table_vdw               = &nblists->table_vdw;
 +        kernel_data.table_elec_vdw          = &nblists->table_elec_vdw;
 +
 +        for(range=0;range<2;range++)
 +        {
 +            /* Are we doing short/long-range? */
 +            if(range==0)
 +            {
 +                /* Short-range */
 +                if(!(flags & GMX_NONBONDED_DO_SR))
 +                {
 +                    continue;
 +                }
 +                kernel_data.energygrp_elec          = grppener->ener[egCOULSR];
 +                kernel_data.energygrp_vdw           = grppener->ener[fr->bBHAM ? egBHAMSR : egLJSR];
 +                kernel_data.energygrp_polarization  = grppener->ener[egGB];
 +                nlist = nblists->nlist_sr;
 +                f                                   = f_shortrange;
 +            }
 +            else if(range==1)
 +            {
 +                /* Long-range */
 +                if(!(flags & GMX_NONBONDED_DO_LR))
 +                {
 +                    continue;
 +                }
 +                kernel_data.energygrp_elec          = grppener->ener[egCOULLR];
 +                kernel_data.energygrp_vdw           = grppener->ener[fr->bBHAM ? egBHAMLR : egLJLR];
 +                kernel_data.energygrp_polarization  = grppener->ener[egGB];
 +                nlist = nblists->nlist_lr;
 +                f                                   = f_longrange;
 +            }
 +
 +            for(i=i0; (i<i1); i++)
 +            {
 +                if (nlist[i].nri > 0)
 +                {
 +                    if(flags & GMX_NONBONDED_DO_POTENTIAL)
 +                    {
 +                        /* Potential and force */
 +                        kernelptr = (nb_kernel_t *)nlist[i].kernelptr_vf;
 +                    }
 +                    else
 +                    {
 +                        /* Force only, no potential */
 +                        kernelptr = (nb_kernel_t *)nlist[i].kernelptr_f;
 +                    }
 +
++                    if(nlist[i].type!=GMX_NBLIST_INTERACTION_FREE_ENERGY && (flags & GMX_NONBONDED_DO_FOREIGNLAMBDA))
 +                    {
 +                        /* We don't need the non-perturbed interactions */
 +                        continue;
 +                    }
 +                    (*kernelptr)(&(nlist[i]),x,f,fr,mdatoms,&kernel_data,nrnb);
 +                 }
 +            }
 +        }
 +    }
 +}
 +/* Report a listed pair interaction whose distance exceeds the table limit.
 + *
 + * Issues a gmx_warning() naming the two particles (converted with glatnr()
 + * using global_atom_index), the distance r and the limit rlimit. When the
 + * debug stream is open, the coordinates of both atoms and r are written
 + * there as well.
 + *
 + * x                  Coordinates; only x[ai] and x[aj] are read.
 + * ai, aj             Indices of the two atoms.
 + * global_atom_index  Passed to glatnr() to obtain the reported atom numbers.
 + * r                  Pair distance, printed in the message.
 + * rlimit             Table limit, printed in the message (labelled nm).
 + */
 +static void
 +nb_listed_warning_rlimit(const rvec *x,int ai, int aj,int * global_atom_index,real r, real rlimit)
 +{
 +    gmx_warning("Listed nonbonded interaction between particles %d and %d\n"
 +                "at distance %.3f which is larger than the table limit %.3f nm.\n\n"
 +                "This is likely either a 1,4 interaction, or a listed interaction inside\n"
 +                "a smaller molecule you are decoupling during a free energy calculation.\n"
 +                "Since interactions at distances beyond the table cannot be computed,\n"
 +                "they are skipped until they are inside the table limit again. You will\n"
 +                "only see this message once, even if it occurs for several interactions.\n\n"
 +                "IMPORTANT: This should not happen in a stable simulation, so there is\n"
 +                "probably something wrong with your system. Only change the table-extension\n"
 +                "distance in the mdp file if you are really sure that is the reason.\n",
 +                glatnr(global_atom_index,ai),glatnr(global_atom_index,aj),r,rlimit);
 +
 +    if (debug)
 +    {
 +        fprintf(debug,
 +                "%8f %8f %8f\n%8f %8f %8f\n1-4 (%d,%d) interaction not within cut-off! r=%g. Ignored\n",
 +                x[ai][XX],x[ai][YY],x[ai][ZZ],x[aj][XX],x[aj][YY],x[aj][ZZ],
 +                glatnr(global_atom_index,ai),glatnr(global_atom_index,aj),r);
 +    }
 +}
 +
 +
 +
 +/* This might logically belong better in the nb_generic.c module, but it is only
 + * used in do_nonbonded_listed(), and we want it to be inlined there to avoid an
 + * extra function call for every single pair listed in the topology.
 + *
 + * Evaluates one tabulated pair interaction at squared distance r2.
 + *
 + * The table vftab holds 12 reals per table point: entries 0-3 are used for
 + * electrostatics, 4-7 for dispersion and 8-11 for repulsion. With
 + * eps = r*tabscale - floor index, each potential is the cubic polynomial
 + * Y + eps*F + eps^2*G + eps^3*H and the matching FF value is its derivative
 + * with respect to eps.
 + *
 + * r2        Squared pair distance. No range check is done here; the index
 + *           into vftab is derived directly from it.
 + * tabscale  Table points per unit distance.
 + * qq        Prefactor applied to the electrostatic table values.
 + * c6, c12   Prefactors applied to the dispersion and repulsion table values.
 + * velec     Output: qq*VVe.
 + * vvdw      Output: c6*VVd + c12*VVr.
 + *
 + * Returns the scalar -(qq*FFe+c6*FFd+c12*FFr)*tabscale/r, which the caller
 + * multiplies with the distance vector to obtain the force.
 + */
 +static real
 +nb_evaluate_single(real r2, real tabscale,real *vftab,
 +                   real qq, real c6, real c12, real *velec, real *vvdw)
 +{
 +    real       rinv,r,rtab,eps,eps2,Y,F,Geps,Heps2,Fp,VVe,FFe,VVd,FFd,VVr,FFr,fscal;
 +    int        ntab;
 +
 +    /* Do the tabulated interactions - first table lookup */
 +    rinv             = gmx_invsqrt(r2);
 +    r                = r2*rinv;
 +    rtab             = r*tabscale;
 +    ntab             = rtab;
 +    eps              = rtab-ntab;
 +    eps2             = eps*eps;
 +    ntab             = 12*ntab;
 +    /* Electrostatics */
 +    Y                = vftab[ntab];
 +    F                = vftab[ntab+1];
 +    Geps             = eps*vftab[ntab+2];
 +    Heps2            = eps2*vftab[ntab+3];
 +    Fp               = F+Geps+Heps2;
 +    VVe              = Y+eps*Fp;
 +    FFe              = Fp+Geps+2.0*Heps2;
 +    /* Dispersion */
 +    Y                = vftab[ntab+4];
 +    F                = vftab[ntab+5];
 +    Geps             = eps*vftab[ntab+6];
 +    Heps2            = eps2*vftab[ntab+7];
 +    Fp               = F+Geps+Heps2;
 +    VVd              = Y+eps*Fp;
 +    FFd              = Fp+Geps+2.0*Heps2;
 +    /* Repulsion */
 +    Y                = vftab[ntab+8];
 +    F                = vftab[ntab+9];
 +    Geps             = eps*vftab[ntab+10];
 +    Heps2            = eps2*vftab[ntab+11];
 +    Fp               = F+Geps+Heps2;
 +    VVr              = Y+eps*Fp;
 +    FFr              = Fp+Geps+2.0*Heps2;
 +
 +    *velec           = qq*VVe;
 +    *vvdw            = c6*VVd+c12*VVr;
 +
 +    fscal            = -(qq*FFe+c6*FFd+c12*FFr)*tabscale*rinv;
 +
 +    return fscal;
 +}
 +
 +/* Evaluate the pair interactions listed in the topology for one function type.
 + *
 + * ftype              F_LJ14, F_LJC14_Q or F_LJC_PAIRS_NB; any other value ends
 + *                    in gmx_fatal() (whose message still names the function
 + *                    "do_nonbonded14").
 + * nbonds             Number of entries in iatoms; entries are consumed three
 + *                    at a time as (parameter type, atom i, atom j).
 + * iparams            Parameter table indexed by the type from iatoms.
 + * x                  Coordinates.
 + * f                  Forces; the pair force is added to f[ai] and subtracted
 + *                    from f[aj].
 + * fshift             Shift forces; updated when the shift index is not CENTRAL.
 + * pbc                Used for the distance vector only when fr->bMolPBC is TRUE.
 + * g                  When non-NULL, the shift index is recomputed from the graph.
 + * lambda, dvdl       lambda[efptCOUL] and lambda[efptVDW] are read when
 + *                    fr->efep != efepNO; dvdl is handed to
 + *                    nb_free_energy_evaluate_single() for perturbed pairs.
 + * md                 Supplies charges, perturbation flags and energy groups.
 + * grppener           Energies are accumulated into egCOUL14/egLJ14 for the
 + *                    1-4 types and egCOULSR/egLJSR for F_LJC_PAIRS_NB.
 + * global_atom_index  Only used for the out-of-range warning.
 + *
 + * Pairs at or beyond fr->tab14.r are skipped; the first such pair triggers a
 + * warning, controlled by a function-local static flag, so it is printed once
 + * per process. The free-energy path is taken only for F_LJ14.
 + *
 + * Always returns 0.0; the energies are reported through grppener.
 + */
 +real
 +do_nonbonded_listed(int ftype,int nbonds,
 +                const t_iatom iatoms[],const t_iparams iparams[],
 +                const rvec x[],rvec f[],rvec fshift[],
 +                const t_pbc *pbc,const t_graph *g,
 +                real *lambda, real *dvdl,
 +                const t_mdatoms *md,
 +                const t_forcerec *fr,gmx_grppairener_t *grppener,
 +                int *global_atom_index)
 +{
 +    int              ielec,ivdw;
 +    real             qq,c6,c12;
 +    rvec             dx;
 +    ivec             dt;
 +    int              i,j,itype,ai,aj,gid;
 +    int              fshift_index;
 +    real             r2,rinv;
 +    real             fscal,velec,vvdw;
 +    real *           energygrp_elec;
 +    real *           energygrp_vdw;
 +    static gmx_bool  warned_rlimit=FALSE;
 +    /* Free energy stuff */
 +    gmx_bool         bFreeEnergy;
 +    real             LFC[2],LFV[2],DLF[2],lfac_coul[2],lfac_vdw[2],dlfac_coul[2],dlfac_vdw[2];
 +    real             qqB,c6B,c12B,sigma2_def,sigma2_min;
 +    
 +    
 +    switch (ftype) {
 +        case F_LJ14:
 +        case F_LJC14_Q:
 +            energygrp_elec = grppener->ener[egCOUL14];
 +            energygrp_vdw  = grppener->ener[egLJ14];
 +            break;
 +        case F_LJC_PAIRS_NB:
 +            energygrp_elec = grppener->ener[egCOULSR];
 +            energygrp_vdw  = grppener->ener[egLJSR];
 +            break;
 +        default:
 +            energygrp_elec = NULL; /* Keep compiler happy */
 +            energygrp_vdw  = NULL; /* Keep compiler happy */
 +            gmx_fatal(FARGS,"Unknown function type %d in do_nonbonded14",ftype);
 +            break;
 +    }
 +    
 +    if(fr->efep != efepNO)
 +    {
 +        /* Lambda factor for state A=1-lambda and B=lambda */
 +        LFC[0] = 1.0 - lambda[efptCOUL];
 +        LFV[0] = 1.0 - lambda[efptVDW];
 +        LFC[1] = lambda[efptCOUL];
 +        LFV[1] = lambda[efptVDW];
 +
 +        /*derivative of the lambda factor for state A and B */
 +        DLF[0] = -1;
 +        DLF[1] = 1;
 +
 +        /* precalculate */
 +        sigma2_def = pow(fr->sc_sigma6_def,1.0/3.0);
 +        sigma2_min = pow(fr->sc_sigma6_min,1.0/3.0);
 +
 +        for (i=0;i<2;i++)
 +        {
 +            lfac_coul[i]  = (fr->sc_power==2 ? (1-LFC[i])*(1-LFC[i]) : (1-LFC[i]));
 +            dlfac_coul[i] = DLF[i]*fr->sc_power/fr->sc_r_power*(fr->sc_power==2 ? (1-LFC[i]) : 1);
 +            lfac_vdw[i]   = (fr->sc_power==2 ? (1-LFV[i])*(1-LFV[i]) : (1-LFV[i]));
 +            dlfac_vdw[i]  = DLF[i]*fr->sc_power/fr->sc_r_power*(fr->sc_power==2 ? (1-LFV[i]) : 1);
 +        }
 +    }
 +    else
 +    {
 +        sigma2_min = sigma2_def = 0;
 +    }
 +
 +    bFreeEnergy = FALSE;
 +    for(i=0; (i<nbonds); )
 +    {
 +        itype = iatoms[i++];
 +        ai    = iatoms[i++];
 +        aj    = iatoms[i++];
 +        gid   = GID(md->cENER[ai],md->cENER[aj],md->nenergrp);
 +        
 +        /* Get parameters */
 +        switch (ftype) {
 +            case F_LJ14:
 +                bFreeEnergy =
 +                (fr->efep != efepNO &&
 +                 ((md->nPerturbed && (md->bPerturbed[ai] || md->bPerturbed[aj])) ||
 +                  iparams[itype].lj14.c6A != iparams[itype].lj14.c6B ||
 +                  iparams[itype].lj14.c12A != iparams[itype].lj14.c12B));
 +                qq               = md->chargeA[ai]*md->chargeA[aj]*fr->epsfac*fr->fudgeQQ;
 +                c6               = iparams[itype].lj14.c6A;
 +                c12              = iparams[itype].lj14.c12A;
 +                break;
 +            case F_LJC14_Q:
 +                qq               = iparams[itype].ljc14.qi*iparams[itype].ljc14.qj*fr->epsfac*iparams[itype].ljc14.fqq;
 +                c6               = iparams[itype].ljc14.c6;
 +                c12              = iparams[itype].ljc14.c12;
 +                break;
 +            case F_LJC_PAIRS_NB:
 +                qq               = iparams[itype].ljcnb.qi*iparams[itype].ljcnb.qj*fr->epsfac;
 +                c6               = iparams[itype].ljcnb.c6;
 +                c12              = iparams[itype].ljcnb.c12;
 +                break;
 +            default:
 +                /* Cannot happen since we called gmx_fatal() above in this case */
 +                qq = c6 = c12 = 0; /* Keep compiler happy */
 +                break;
 +        }
 +        
 +        /* To save flops in the optimized kernels, c6/c12 have 6.0/12.0 derivative prefactors
 +         * included in the general nfbp array now. This means the tables are scaled down by the
 +         * same factor, so when we use the original c6/c12 parameters from iparams[] they must
 +         * be scaled up.
 +         */
 +        c6  *= 6.0;
 +        c12 *= 12.0;
 +        
 +        /* Do we need to apply full periodic boundary conditions? */
 +        if(fr->bMolPBC==TRUE)
 +        {
 +            fshift_index = pbc_dx_aiuc(pbc,x[ai],x[aj],dx);
 +        }
 +        else
 +        {
 +            fshift_index = CENTRAL;
 +            rvec_sub(x[ai],x[aj],dx);
 +        }
 +        r2           = norm2(dx);
 +
 +        if(r2>=fr->tab14.r*fr->tab14.r)
 +        {
 +            if(warned_rlimit==FALSE)
 +            {
 +                nb_listed_warning_rlimit(x,ai,aj,global_atom_index,sqrt(r2),fr->tab14.r);
 +                warned_rlimit=TRUE;
 +            }
 +            continue;
 +        }
 +
 +        if (bFreeEnergy)
 +        {
 +            /* Currently free energy is only supported for F_LJ14, so no need to check for that if we got here */
 +            qqB              = md->chargeB[ai]*md->chargeB[aj]*fr->epsfac*fr->fudgeQQ;
 +            c6B              = iparams[itype].lj14.c6B*6.0;
 +            c12B             = iparams[itype].lj14.c12B*12.0;
 +
 +            fscal            = nb_free_energy_evaluate_single(r2,fr->sc_r_power,fr->sc_alphacoul,fr->sc_alphavdw,
 +                                                              fr->tab14.scale,fr->tab14.data,qq,c6,c12,qqB,c6B,c12B,
 +                                                              LFC,LFV,DLF,lfac_coul,lfac_vdw,dlfac_coul,dlfac_vdw,
 +                                                              fr->sc_sigma6_def,fr->sc_sigma6_min,sigma2_def,sigma2_min,&velec,&vvdw,dvdl);
 +        }
 +        else
 +        {
 +            /* Evaluate tabulated interaction without free energy */
 +            fscal            = nb_evaluate_single(r2,fr->tab14.scale,fr->tab14.data,qq,c6,c12,&velec,&vvdw);
 +        }
 +
 +        energygrp_elec[gid]  += velec;
 +        energygrp_vdw[gid]   += vvdw;
 +        svmul(fscal,dx,dx);
 +
 +        /* Add the forces */
 +        rvec_inc(f[ai],dx);
 +        rvec_dec(f[aj],dx);
 +
 +        if (g)
 +        {
 +            /* Correct the shift forces using the graph */
 +            ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,aj),dt);
 +            fshift_index = IVEC2IS(dt);
 +        }
 +        if(fshift_index!=CENTRAL)
 +        {
 +            rvec_inc(fshift[fshift_index],dx);
 +            rvec_dec(fshift[CENTRAL],dx);
 +        }
 +    }
 +    return 0.0;
 +}
 +
 +
index 5375ae056a15899ef54d5b3ec86883f78d41ef22,0000000000000000000000000000000000000000..dc6eebf60822c24d9f34aa7953ce415190c589a4
mode 100644,000000..100644
--- /dev/null
@@@ -1,2944 -1,0 +1,2944 @@@
-   { 69, F_VTEMP             },
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROningen Mixture of Alchemy and Childrens' Stories
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +/* This file is completely threadsafe - keep it that way! */
 +#ifdef GMX_THREAD_MPI
 +#include <thread_mpi.h>
 +#endif
 +
 +
 +#include <ctype.h>
 +#include "sysstuff.h"
 +#include "smalloc.h"
 +#include "string2.h"
 +#include "gmx_fatal.h"
 +#include "macros.h"
 +#include "names.h"
 +#include "symtab.h"
 +#include "futil.h"
 +#include "filenm.h"
 +#include "gmxfio.h"
 +#include "topsort.h"
 +#include "tpxio.h"
 +#include "txtdump.h"
 +#include "confio.h"
 +#include "atomprop.h"
 +#include "copyrite.h"
 +#include "vec.h"
 +#include "mtop_util.h"
 +
 +/* Tag value used for the tpx_tag string below. */
 +#define TPX_TAG_RELEASE  "release"
 +
 +/* This is the tag string which is stored in the tpx file.
 + * Change this if you want to change the tpx format in a feature branch.
 + * This ensures that there will not be different tpx formats around which
 + * cannot be distinguished.
 + */
 +static const char *tpx_tag = TPX_TAG_RELEASE;
 +
 +/* This number should be increased whenever the file format changes! */
 +static const int tpx_version = 91;
 +
 +/* This number should only be increased when you edit the TOPOLOGY section
 + * or the HEADER of the tpx format.
 + * This way we can also maintain forward compatibility for all analysis tools
 + * and/or external programs that only need to know the atom/residue names,
 + * charges, and bond connectivity.
 + *
 + * It first appeared in tpx version 26, when I also moved the inputrecord
 + * to the end of the tpx file, so we can just skip it if we only
 + * want the topology.
 + */
 +static const int tpx_generation = 25;
 +
 +/* This number should be the most recent backwards incompatible version.
 + * I.e., if this number is 9, we cannot read tpx version 9 with this code.
 + */
 +static const int tpx_incompatible_version = 9;
 +
 +
 +
 +/* Struct used to maintain tpx compatibility when function types are added.
 + * Each instance pairs a function type with the file version that introduced it.
 + */
 +typedef struct {
 +  int fvnr; /* file version number in which the function type first appeared */
 +  int ftype; /* function type */
 +} t_ftupd;
 +
 +/* 
 + *The entries should be ordered in:
 + * 1. ascending file version number
 + * 2. ascending function type number
 + */
 +/*static const t_ftupd ftupd[] = {
 +  { 20, F_CUBICBONDS        },
 +  { 20, F_CONNBONDS         },
 +  { 20, F_HARMONIC          },
 +  { 20, F_EQM,              },
 +  { 22, F_DISRESVIOL        },
 +  { 22, F_ORIRES            },
 +  { 22, F_ORIRESDEV         },
 +  { 26, F_FOURDIHS          },
 +  { 26, F_PIDIHS            },
 +  { 26, F_DIHRES            },
 +  { 26, F_DIHRESVIOL        },
 +  { 30, F_CROSS_BOND_BONDS  },
 +  { 30, F_CROSS_BOND_ANGLES },
 +  { 30, F_UREY_BRADLEY      },
 +  { 30, F_POLARIZATION      },
 +  { 54, F_DHDL_CON          },
 +  };*/
 +/*
 + * Table of function types and the tpx file version in which each first
 + * appeared (see t_ftupd).
 + *
 + * The entries should be ordered in:
 + * 1. ascending function type number
 + * 2. ascending file version number
 + *
 + * NOTE(review): the purpose of the commented-out table above is unclear;
 + * it states the opposite ordering rule. Confirm whether it can be removed.
 + * NOTE(review): { 54, F_DHDL_CON } is listed twice in this table (once after
 + * F_PDISPCORR and once as the last entry) - confirm that this is intended.
 + */
 +static const t_ftupd ftupd[] = {
 +  { 20, F_CUBICBONDS        },
 +  { 20, F_CONNBONDS         },
 +  { 20, F_HARMONIC          },
 +  { 34, F_FENEBONDS         },
 +  { 43, F_TABBONDS          },
 +  { 43, F_TABBONDSNC        },
 +  { 70, F_RESTRBONDS        },
 +  { 76, F_LINEAR_ANGLES     },
 +  { 30, F_CROSS_BOND_BONDS  },
 +  { 30, F_CROSS_BOND_ANGLES },
 +  { 30, F_UREY_BRADLEY      },
 +  { 34, F_QUARTIC_ANGLES    },
 +  { 43, F_TABANGLES         },
 +  { 26, F_FOURDIHS          },
 +  { 26, F_PIDIHS            },
 +  { 43, F_TABDIHS           },
 +  { 65, F_CMAP              },
 +  { 60, F_GB12              },
 +  { 61, F_GB13              },
 +  { 61, F_GB14              },        
 +  { 72, F_GBPOL             },
 +  { 72, F_NPSOLVATION       },
 +  { 41, F_LJC14_Q           },
 +  { 41, F_LJC_PAIRS_NB      },
 +  { 32, F_BHAM_LR           },
 +  { 32, F_RF_EXCL           },
 +  { 32, F_COUL_RECIP        },
 +  { 46, F_DPD               },
 +  { 30, F_POLARIZATION      },
 +  { 36, F_THOLE_POL         },
 +  { 80, F_FBPOSRES          },
 +  { 22, F_DISRESVIOL        },
 +  { 22, F_ORIRES            },
 +  { 22, F_ORIRESDEV         },
 +  { 26, F_DIHRES            },
 +  { 26, F_DIHRESVIOL        },
 +  { 49, F_VSITE4FDN         },
 +  { 50, F_VSITEN            },
 +  { 46, F_COM_PULL          },
 +  { 20, F_EQM               },
 +  { 46, F_ECONSERVED        },
++  { 69, F_VTEMP_NOLONGERUSED},
 +  { 66, F_PDISPCORR         },
 +  { 54, F_DHDL_CON          },
 +  { 76, F_ANHARM_POL        },
 +  { 79, F_DVDL_COUL         },
 +  { 79, F_DVDL_VDW,         },
 +  { 79, F_DVDL_BONDED,      },
 +  { 79, F_DVDL_RESTRAINT    },
 +  { 79, F_DVDL_TEMPERATURE  },
 +  { 54, F_DHDL_CON          }
 +};
 +#define NFTUPD asize(ftupd)
 +
 +/* Needed for backward compatibility */
 +#define MAXNODES 256
 +
 +static void _do_section(t_fileio *fio,int key,gmx_bool bRead,const char *src,
 +                        int line)
 +{
 +  char buf[STRLEN];
 +  gmx_bool bDbg;
 +
 +  if (gmx_fio_getftp(fio) == efTPA) {
 +    if (!bRead) {
 +      gmx_fio_write_string(fio,itemstr[key]);
 +      bDbg       = gmx_fio_getdebug(fio);
 +      gmx_fio_setdebug(fio,FALSE);
 +      gmx_fio_write_string(fio,comment_str[key]);
 +      gmx_fio_setdebug(fio,bDbg);
 +    }
 +    else {
 +      if (gmx_fio_getdebug(fio))
 +      fprintf(stderr,"Looking for section %s (%s, %d)",
 +              itemstr[key],src,line);
 +      
 +      do {
 +      gmx_fio_do_string(fio,buf);
 +      } while ((gmx_strcasecmp(buf,itemstr[key]) != 0));
 +      
 +      if (gmx_strcasecmp(buf,itemstr[key]) != 0) 
 +      gmx_fatal(FARGS,"\nCould not find section heading %s",itemstr[key]);
 +      else if (gmx_fio_getdebug(fio))
 +      fprintf(stderr," and found it\n");
 +    }
 +  }
 +}
 +
 +/* Convenience wrapper that passes the caller's file name and line number
 + * to _do_section().
 + */
 +#define do_section(fio,key,bRead) _do_section(fio,key,bRead,__FILE__,__LINE__)
 +
 +/**************************************************************
 + *
 + * Now the higher level routines that do io of the structures and arrays
 + *
 + **************************************************************/
 +/* Transfer one pull group (t_pullgrp) through fio.
 + *
 + * fio           file handle handed to the gmx_fio_* calls
 + * pgrp          pull group to fill or to store
 + * bRead         when TRUE, the ind and weight arrays are allocated here
 + *               (nat and nweight elements) before their contents are handled
 + * file_version  tpx version of the file; kB is only present in the file from
 + *               version 56 on, for older versions kB is set equal to k
 + *
 + * The order of the calls below determines the order of the fields in the file.
 + */
 +static void do_pullgrp(t_fileio *fio, t_pullgrp *pgrp, gmx_bool bRead, 
 +                       int file_version)
 +{
 +  /* NOTE(review): bDum only receives return values that are never checked;
 +   * i is not referenced in this body - presumably kept for the I/O macros,
 +   * confirm before removing.
 +   */
 +  gmx_bool bDum=TRUE;
 +  int  i;
 +
 +  gmx_fio_do_int(fio,pgrp->nat);
 +  if (bRead)
 +    snew(pgrp->ind,pgrp->nat);
 +  bDum=gmx_fio_ndo_int(fio,pgrp->ind,pgrp->nat);
 +  gmx_fio_do_int(fio,pgrp->nweight);
 +  if (bRead)
 +    snew(pgrp->weight,pgrp->nweight);
 +  bDum=gmx_fio_ndo_real(fio,pgrp->weight,pgrp->nweight);
 +  gmx_fio_do_int(fio,pgrp->pbcatom);
 +  gmx_fio_do_rvec(fio,pgrp->vec);
 +  gmx_fio_do_rvec(fio,pgrp->init);
 +  gmx_fio_do_real(fio,pgrp->rate);
 +  gmx_fio_do_real(fio,pgrp->k);
 +  if (file_version >= 56) {
 +    gmx_fio_do_real(fio,pgrp->kB);
 +  } else {
 +    /* Files older than version 56 have no separate kB: reuse k */
 +    pgrp->kB = pgrp->k;
 +  }
 +}
 +
 +/* Transfer the expanded-ensemble settings (t_expanded) through fio.
 + *
 + * fio           file handle handed to the gmx_fio_* calls
 + * expand        settings to fill or to store
 + * n_lambda      number of elements of init_lambda_weights; when it is not
 + *               positive, neither the weights nor bInit_weights are handled
 + * bRead         when TRUE, init_lambda_weights is allocated here
 + * file_version  tpx version of the file; for versions before 79 this
 + *               function does nothing at all
 + *
 + * The order of the calls below determines the order of the fields in the file.
 + */
 +static void do_expandedvals(t_fileio *fio,t_expanded *expand,int n_lambda, gmx_bool bRead, int file_version)
 +{
 +  /* i is used in the ndo_double macro*/
 +  int i;
 +  /* NOTE(review): fv and rdum are not referenced in this body */
 +  real fv;
 +  gmx_bool bDum=TRUE;
 +  real rdum;
 +
 +  if (file_version >= 79)
 +  {
 +      if (n_lambda>0)
 +      {
 +          if (bRead)
 +          {
 +              snew(expand->init_lambda_weights,n_lambda);
 +          }
 +          bDum=gmx_fio_ndo_real(fio,expand->init_lambda_weights,n_lambda);
 +          gmx_fio_do_gmx_bool(fio,expand->bInit_weights);
 +      }
 +
 +      gmx_fio_do_int(fio,expand->nstexpanded);
 +      gmx_fio_do_int(fio,expand->elmcmove);
 +      gmx_fio_do_int(fio,expand->elamstats);
 +      gmx_fio_do_int(fio,expand->lmc_repeats);
 +      gmx_fio_do_int(fio,expand->gibbsdeltalam);
 +      gmx_fio_do_int(fio,expand->lmc_forced_nstart);
 +      gmx_fio_do_int(fio,expand->lmc_seed);
 +      gmx_fio_do_real(fio,expand->mc_temp);
 +      gmx_fio_do_int(fio,expand->bSymmetrizedTMatrix);
 +      gmx_fio_do_int(fio,expand->nstTij);
 +      gmx_fio_do_int(fio,expand->minvarmin);
 +      gmx_fio_do_int(fio,expand->c_range);
 +      gmx_fio_do_real(fio,expand->wl_scale);
 +      gmx_fio_do_real(fio,expand->wl_ratio);
 +      gmx_fio_do_real(fio,expand->init_wl_delta);
 +      gmx_fio_do_gmx_bool(fio,expand->bWLoneovert);
 +      gmx_fio_do_int(fio,expand->elmceq);
 +      gmx_fio_do_int(fio,expand->equil_steps);
 +      gmx_fio_do_int(fio,expand->equil_samples);
 +      gmx_fio_do_int(fio,expand->equil_n_at_lam);
 +      gmx_fio_do_real(fio,expand->equil_wl_delta);
 +      gmx_fio_do_real(fio,expand->equil_ratio);
 +  }
 +}
 +
 +/* Transfer the simulated-tempering settings (t_simtemp) through fio.
 + *
 + * fio           file handle handed to the gmx_fio_* calls
 + * simtemp       settings to fill or to store
 + * n_lambda      number of elements of the temperatures array; the array is
 + *               only handled when this is positive
 + * bRead         when TRUE, the temperatures array is allocated here
 + * file_version  tpx version of the file; for versions before 79 this
 + *               function does nothing at all
 + */
 +static void do_simtempvals(t_fileio *fio,t_simtemp *simtemp, int n_lambda, gmx_bool bRead, 
 +                           int file_version)
 +{
 +  gmx_bool bDum=TRUE;
 +
 +  if (file_version >= 79)
 +  {
 +      gmx_fio_do_int(fio,simtemp->eSimTempScale);
 +      gmx_fio_do_real(fio,simtemp->simtemp_high);
 +      gmx_fio_do_real(fio,simtemp->simtemp_low);
 +      if (n_lambda>0)
 +      {
 +          if (bRead)
 +          {
 +              snew(simtemp->temperatures,n_lambda);
 +          }
 +          bDum=gmx_fio_ndo_real(fio,simtemp->temperatures,n_lambda);
 +      }
 +  }
 +}
 +
 +static void do_fepvals(t_fileio *fio,t_lambda *fepvals,gmx_bool bRead, int file_version)
 +{
 +  /* i is defined in the ndo_double macro; use g to iterate. */
 +  int i,g;
 +  real fv;
 +  gmx_bool bDum=TRUE;
 +  real rdum;
 +
 +  /* free energy values */
 +  if (file_version >= 79)
 +  {
 +      gmx_fio_do_int(fio,fepvals->init_fep_state);
 +      gmx_fio_do_double(fio,fepvals->init_lambda);
 +      gmx_fio_do_double(fio,fepvals->delta_lambda);
 +  }
 +  else if (file_version >= 59) {
 +      gmx_fio_do_double(fio,fepvals->init_lambda);
 +      gmx_fio_do_double(fio,fepvals->delta_lambda);
 +  } else {
 +      gmx_fio_do_real(fio,rdum);
 +      fepvals->init_lambda = rdum;
 +      gmx_fio_do_real(fio,rdum);
 +      fepvals->delta_lambda = rdum;
 +  }
 +  if (file_version >= 79)
 +  {
 +      gmx_fio_do_int(fio,fepvals->n_lambda);
 +      if (bRead)
 +      {
 +          snew(fepvals->all_lambda,efptNR);
 +      }
 +      for (g=0;g<efptNR;g++)
 +      {
 +          if (fepvals->n_lambda > 0) {
 +              if (bRead)
 +              {
 +                  snew(fepvals->all_lambda[g],fepvals->n_lambda);
 +              }
 +              bDum=gmx_fio_ndo_double(fio,fepvals->all_lambda[g],fepvals->n_lambda);
 +              bDum=gmx_fio_ndo_int(fio,fepvals->separate_dvdl,efptNR);
 +          }
 +          else if (fepvals->init_lambda >= 0)
 +          {
 +              fepvals->separate_dvdl[efptFEP] = TRUE;
 +          }
 +      }
 +  }
 +  else if (file_version >= 64)
 +  {
 +      gmx_fio_do_int(fio,fepvals->n_lambda);
 +      snew(fepvals->all_lambda,efptNR);
 +      if (bRead)
 +      {
 +          snew(fepvals->all_lambda[efptFEP],fepvals->n_lambda);
 +      }
 +      bDum=gmx_fio_ndo_double(fio,fepvals->all_lambda[efptFEP],fepvals->n_lambda);
 +      if (fepvals->init_lambda >= 0)
 +      {
 +          fepvals->separate_dvdl[efptFEP] = TRUE;
 +      }
 +      /* still allocate the all_lambda array's contents. */
 +      for (g=0;g<efptNR;g++)
 +      {
 +          if (fepvals->n_lambda > 0) {
 +              if (bRead)
 +              {
 +                  snew(fepvals->all_lambda[g],fepvals->n_lambda);
 +              }
 +          }
 +      }
 +  }
 +  else
 +  {
 +      fepvals->n_lambda = 0;
 +      fepvals->all_lambda   = NULL;
 +      if (fepvals->init_lambda >= 0)
 +      {
 +          fepvals->separate_dvdl[efptFEP] = TRUE;
 +      }
 +  }
 +  if (file_version >= 13)
 +  {
 +      gmx_fio_do_real(fio,fepvals->sc_alpha);
 +  }
 +  else
 +  {
 +      fepvals->sc_alpha = 0;
 +  }
 +  if (file_version >= 38)
 +  {
 +      gmx_fio_do_int(fio,fepvals->sc_power);
 +  }
 +  else
 +  {
 +      fepvals->sc_power = 2;
 +  }
 +  if (file_version >= 79)
 +  {
 +      gmx_fio_do_real(fio,fepvals->sc_r_power);
 +  }
 +  else
 +  {
 +      fepvals->sc_r_power = 6.0;
 +  }
 +  if (file_version >= 15)
 +  {
 +      gmx_fio_do_real(fio,fepvals->sc_sigma);
 +  }
 +  else
 +  {
 +      fepvals->sc_sigma = 0.3;
 +  }
 +  if (bRead)
 +  {
 +      if (file_version >= 71)
 +      {
 +          fepvals->sc_sigma_min = fepvals->sc_sigma;
 +      }
 +      else
 +      {
 +          fepvals->sc_sigma_min = 0;
 +      }
 +  }
 +  if (file_version >= 79)
 +  {
 +      gmx_fio_do_int(fio,fepvals->bScCoul);
 +  }
 +  else
 +  {
 +      fepvals->bScCoul = TRUE;
 +  }
 +  if (file_version >= 64) {
 +      gmx_fio_do_int(fio,fepvals->nstdhdl);
 +  } else {
 +      fepvals->nstdhdl = 1;
 +  }
 +
 +  if (file_version >= 73)
 +  {
 +      gmx_fio_do_int(fio, fepvals->separate_dhdl_file);
 +      gmx_fio_do_int(fio, fepvals->dhdl_derivatives);
 +  }
 +  else
 +  {
 +      fepvals->separate_dhdl_file = esepdhdlfileYES;
 +      fepvals->dhdl_derivatives = edhdlderivativesYES;
 +  }
 +  if (file_version >= 71)
 +  {
 +      gmx_fio_do_int(fio,fepvals->dh_hist_size);
 +      gmx_fio_do_double(fio,fepvals->dh_hist_spacing);
 +  }
 +  else
 +  {
 +      fepvals->dh_hist_size    = 0;
 +      fepvals->dh_hist_spacing = 0.1;
 +  }
 +  if (file_version >= 79)
 +  {
 +      gmx_fio_do_int(fio,fepvals->bPrintEnergy);
 +  }
 +  else
 +  {
 +      fepvals->bPrintEnergy = FALSE;
 +  }
 +}
 +
 +static void do_pull(t_fileio *fio, t_pull *pull,gmx_bool bRead, int file_version)
 +{
 +  int g;
 +
 +  gmx_fio_do_int(fio,pull->ngrp);
 +  gmx_fio_do_int(fio,pull->eGeom);
 +  gmx_fio_do_ivec(fio,pull->dim);
 +  gmx_fio_do_real(fio,pull->cyl_r1);
 +  gmx_fio_do_real(fio,pull->cyl_r0);
 +  gmx_fio_do_real(fio,pull->constr_tol);
 +  gmx_fio_do_int(fio,pull->nstxout);
 +  gmx_fio_do_int(fio,pull->nstfout);
 +  if (bRead)
 +    snew(pull->grp,pull->ngrp+1);
 +  for(g=0; g<pull->ngrp+1; g++)
 +    do_pullgrp(fio,&pull->grp[g],bRead,file_version);
 +}
 +
 +
 +/* Transfer one rotation group (t_rotgrp) through fio.
 + *
 + * fio           file handle handed to the gmx_fio_* calls
 + * rotg          rotation group to fill or to store
 + * bRead         when TRUE, the ind and x_ref arrays (nat elements each) are
 + *               allocated here before their contents are handled
 + * file_version  not used in this function
 + *
 + * The order of the calls below determines the order of the fields in the file.
 + */
 +static void do_rotgrp(t_fileio *fio, t_rotgrp *rotg,gmx_bool bRead, int file_version)
 +{
 +  /* NOTE(review): bDum and i are not referenced in this body */
 +  gmx_bool bDum=TRUE;
 +  int  i;
 +
 +  gmx_fio_do_int(fio,rotg->eType);
 +  gmx_fio_do_int(fio,rotg->bMassW);
 +  gmx_fio_do_int(fio,rotg->nat);
 +  if (bRead)
 +    snew(rotg->ind,rotg->nat);
 +  gmx_fio_ndo_int(fio,rotg->ind,rotg->nat);
 +  if (bRead)
 +      snew(rotg->x_ref,rotg->nat);
 +  gmx_fio_ndo_rvec(fio,rotg->x_ref,rotg->nat);
 +  gmx_fio_do_rvec(fio,rotg->vec);
 +  gmx_fio_do_rvec(fio,rotg->pivot);
 +  gmx_fio_do_real(fio,rotg->rate);
 +  gmx_fio_do_real(fio,rotg->k);
 +  gmx_fio_do_real(fio,rotg->slab_dist);
 +  gmx_fio_do_real(fio,rotg->min_gaussian);
 +  gmx_fio_do_real(fio,rotg->eps);
 +  gmx_fio_do_int(fio,rotg->eFittype);
 +  gmx_fio_do_int(fio,rotg->PotAngle_nstep);
 +  gmx_fio_do_real(fio,rotg->PotAngle_step);
 +}
 +
 +static void do_rot(t_fileio *fio, t_rot *rot,gmx_bool bRead, int file_version)
 +{
 +  int g;
 +
 +  gmx_fio_do_int(fio,rot->ngrp);
 +  gmx_fio_do_int(fio,rot->nstrout);
 +  gmx_fio_do_int(fio,rot->nstsout);
 +  if (bRead)
 +    snew(rot->grp,rot->ngrp);
 +  for(g=0; g<rot->ngrp; g++)
 +    do_rotgrp(fio, &rot->grp[g],bRead,file_version);
 +}
 +
 +
 +static void do_inputrec(t_fileio *fio, t_inputrec *ir,gmx_bool bRead, 
 +                        int file_version, real *fudgeQQ)
 +{
 +    int  i,j,k,*tmp,idum=0; 
 +    gmx_bool bDum=TRUE;
 +    real rdum,bd_temp;
 +    rvec vdum;
 +    gmx_bool bSimAnn;
 +    real zerotemptime,finish_t,init_temp,finish_temp;
 +    
 +    if (file_version != tpx_version)
 +    {
 +        /* Give a warning about features that are not accessible */
 +        fprintf(stderr,"Note: file tpx version %d, software tpx version %d\n",
 +                file_version,tpx_version);
 +    }
 +
 +    if (bRead)
 +    {
 +        init_inputrec(ir);
 +    }
 +
 +    if (file_version == 0)
 +    {
 +        return;
 +    }
 +
 +    /* Basic inputrec stuff */  
 +    gmx_fio_do_int(fio,ir->eI); 
 +    if (file_version >= 62) {
 +      gmx_fio_do_gmx_large_int(fio, ir->nsteps);
 +    } else {
 +      gmx_fio_do_int(fio,idum);
 +      ir->nsteps = idum;
 +    }
 +    if(file_version > 25) {
 +      if (file_version >= 62) {
 +      gmx_fio_do_gmx_large_int(fio, ir->init_step);
 +      } else {
 +      gmx_fio_do_int(fio,idum);
 +      ir->init_step = idum;
 +      }
 +    }  else {
 +      ir->init_step=0;
 +    }
 +
 +      if(file_version >= 58)
 +        gmx_fio_do_int(fio,ir->simulation_part);
 +      else
 +        ir->simulation_part=1;
 +        
 +    if (file_version >= 67) {
 +      gmx_fio_do_int(fio,ir->nstcalcenergy);
 +    } else {
 +      ir->nstcalcenergy = 1;
 +    }
 +    if (file_version < 53) {
 +      /* The pbc info has been moved out of do_inputrec,
 +       * since we always want it, also without reading the inputrec.
 +       */
 +      gmx_fio_do_int(fio,ir->ePBC);
 +      if ((file_version <= 15) && (ir->ePBC == 2))
 +      ir->ePBC = epbcNONE;
 +      if (file_version >= 45) {
 +      gmx_fio_do_int(fio,ir->bPeriodicMols);
 +      } else {
 +      if (ir->ePBC == 2) {
 +        ir->ePBC = epbcXYZ;
 +        ir->bPeriodicMols = TRUE;
 +      } else {
 +      ir->bPeriodicMols = FALSE;
 +      }
 +      }
 +    }
 +    if (file_version >= 81)
 +    {
 +        gmx_fio_do_int(fio,ir->cutoff_scheme);
 +    }
 +    else
 +    {
 +        ir->cutoff_scheme = ecutsGROUP;
 +    }
 +    gmx_fio_do_int(fio,ir->ns_type);
 +    gmx_fio_do_int(fio,ir->nstlist);
 +    gmx_fio_do_int(fio,ir->ndelta);
 +    if (file_version < 41) {
 +      gmx_fio_do_int(fio,idum);
 +      gmx_fio_do_int(fio,idum);
 +    }
 +    if (file_version >= 45)
 +      gmx_fio_do_real(fio,ir->rtpi);
 +    else
 +      ir->rtpi = 0.05;
 +    gmx_fio_do_int(fio,ir->nstcomm); 
 +    if (file_version > 34)
 +      gmx_fio_do_int(fio,ir->comm_mode);
 +    else if (ir->nstcomm < 0) 
 +      ir->comm_mode = ecmANGULAR;
 +    else
 +      ir->comm_mode = ecmLINEAR;
 +    ir->nstcomm = abs(ir->nstcomm);
 +    
 +    if(file_version > 25)
 +      gmx_fio_do_int(fio,ir->nstcheckpoint);
 +    else
 +      ir->nstcheckpoint=0;
 +    
 +    gmx_fio_do_int(fio,ir->nstcgsteep); 
 +
 +    if(file_version>=30)
 +      gmx_fio_do_int(fio,ir->nbfgscorr); 
 +    else if (bRead)
 +      ir->nbfgscorr = 10;
 +
 +    gmx_fio_do_int(fio,ir->nstlog); 
 +    gmx_fio_do_int(fio,ir->nstxout); 
 +    gmx_fio_do_int(fio,ir->nstvout); 
 +    gmx_fio_do_int(fio,ir->nstfout); 
 +    gmx_fio_do_int(fio,ir->nstenergy); 
 +    gmx_fio_do_int(fio,ir->nstxtcout); 
 +    if (file_version >= 59) {
 +      gmx_fio_do_double(fio,ir->init_t);
 +      gmx_fio_do_double(fio,ir->delta_t);
 +    } else {
 +      gmx_fio_do_real(fio,rdum);
 +      ir->init_t = rdum;
 +      gmx_fio_do_real(fio,rdum);
 +      ir->delta_t = rdum;
 +    }
 +    gmx_fio_do_real(fio,ir->xtcprec); 
 +    if (file_version < 19) {
 +      gmx_fio_do_int(fio,idum); 
 +      gmx_fio_do_int(fio,idum);
 +    }
 +    if(file_version < 18)
 +      gmx_fio_do_int(fio,idum); 
 +    if (file_version >= 81) {
 +      gmx_fio_do_real(fio,ir->verletbuf_drift);
 +    } else {
 +      ir->verletbuf_drift = 0;
 +    }
 +    gmx_fio_do_real(fio,ir->rlist); 
 +    if (file_version >= 67) {
 +      gmx_fio_do_real(fio,ir->rlistlong);
 +    }
 +    if(file_version >= 82 && file_version != 90)
 +    {
 +        gmx_fio_do_int(fio,ir->nstcalclr);
 +    }
 +    else
 +    {
 +        /* Calculate at NS steps */
 +        ir->nstcalclr = ir->nstlist;
 +    }
 +    gmx_fio_do_int(fio,ir->coulombtype);
 +    if (file_version < 32 && ir->coulombtype == eelRF)
 +      ir->coulombtype = eelRF_NEC;      
 +    if (file_version >= 81)
 +    {
 +        gmx_fio_do_int(fio,ir->coulomb_modifier); 
 +    }
 +    else
 +    {
 +        ir->coulomb_modifier = (ir->cutoff_scheme == ecutsVERLET ? eintmodPOTSHIFT : eintmodNONE);
 +    }
 +    gmx_fio_do_real(fio,ir->rcoulomb_switch); 
 +    gmx_fio_do_real(fio,ir->rcoulomb); 
 +    gmx_fio_do_int(fio,ir->vdwtype);
 +    if (file_version >= 81)
 +    {
 +        gmx_fio_do_int(fio,ir->vdw_modifier); 
 +    }
 +    else
 +    {
 +        ir->vdw_modifier = (ir->cutoff_scheme == ecutsVERLET ? eintmodPOTSHIFT : eintmodNONE);
 +    }
 +    gmx_fio_do_real(fio,ir->rvdw_switch); 
 +    gmx_fio_do_real(fio,ir->rvdw); 
 +    if (file_version < 67) {
 +      ir->rlistlong = max_cutoff(ir->rlist,max_cutoff(ir->rvdw,ir->rcoulomb));
 +    }
 +    gmx_fio_do_int(fio,ir->eDispCorr); 
 +    gmx_fio_do_real(fio,ir->epsilon_r);
 +    if (file_version >= 37) {
 +      gmx_fio_do_real(fio,ir->epsilon_rf);
 +    } else {
 +      if (EEL_RF(ir->coulombtype)) {
 +      ir->epsilon_rf = ir->epsilon_r;
 +      ir->epsilon_r  = 1.0;
 +      } else {
 +      ir->epsilon_rf = 1.0;
 +      }
 +    }
 +    if (file_version >= 29)
 +      gmx_fio_do_real(fio,ir->tabext);
 +    else
 +      ir->tabext=1.0;
 + 
 +    if(file_version > 25) {
 +      gmx_fio_do_int(fio,ir->gb_algorithm);
 +      gmx_fio_do_int(fio,ir->nstgbradii);
 +      gmx_fio_do_real(fio,ir->rgbradii);
 +      gmx_fio_do_real(fio,ir->gb_saltconc);
 +      gmx_fio_do_int(fio,ir->implicit_solvent);
 +    } else {
 +      ir->gb_algorithm=egbSTILL;
 +      ir->nstgbradii=1;
 +      ir->rgbradii=1.0;
 +      ir->gb_saltconc=0;
 +      ir->implicit_solvent=eisNO;
 +    }
 +      if(file_version>=55)
 +      {
 +              gmx_fio_do_real(fio,ir->gb_epsilon_solvent);
 +              gmx_fio_do_real(fio,ir->gb_obc_alpha);
 +              gmx_fio_do_real(fio,ir->gb_obc_beta);
 +              gmx_fio_do_real(fio,ir->gb_obc_gamma);
 +              if(file_version>=60)
 +              {
 +                      gmx_fio_do_real(fio,ir->gb_dielectric_offset);
 +                      gmx_fio_do_int(fio,ir->sa_algorithm);
 +              }
 +              else
 +              {
 +                      ir->gb_dielectric_offset = 0.009;
 +                      ir->sa_algorithm = esaAPPROX;
 +              }
 +              gmx_fio_do_real(fio,ir->sa_surface_tension);
 +
 +    /* Override sa_surface_tension if it is not changed in the mdp-file */
 +    if(ir->sa_surface_tension<0)
 +    {
 +      if(ir->gb_algorithm==egbSTILL)
 +      {
 +        ir->sa_surface_tension = 0.0049 * 100 * CAL2JOULE;
 +      }
 +      else if(ir->gb_algorithm==egbHCT || ir->gb_algorithm==egbOBC)
 +      {
 +        ir->sa_surface_tension = 0.0054 * 100 * CAL2JOULE;
 +      }
 +    }
 +    
 +      }
 +      else
 +      {
 +              /* Better use sensible values than insane (0.0) ones... */
 +              ir->gb_epsilon_solvent = 80;
 +              ir->gb_obc_alpha       = 1.0;
 +              ir->gb_obc_beta        = 0.8;
 +              ir->gb_obc_gamma       = 4.85;
 +              ir->sa_surface_tension = 2.092;
 +      }
 +
 +       
 +    if (file_version >= 81)
 +    {
 +        gmx_fio_do_real(fio,ir->fourier_spacing); 
 +    }
 +    else
 +    {
 +        ir->fourier_spacing = 0.0;
 +    }
 +    gmx_fio_do_int(fio,ir->nkx); 
 +    gmx_fio_do_int(fio,ir->nky); 
 +    gmx_fio_do_int(fio,ir->nkz);
 +    gmx_fio_do_int(fio,ir->pme_order);
 +    gmx_fio_do_real(fio,ir->ewald_rtol);
 +
 +    if (file_version >=24) 
 +      gmx_fio_do_int(fio,ir->ewald_geometry);
 +    else
 +      ir->ewald_geometry=eewg3D;
 +
 +    if (file_version <=17) {
 +      ir->epsilon_surface=0;
 +      if (file_version==17)
 +      gmx_fio_do_int(fio,idum);
 +    } 
 +    else
 +      gmx_fio_do_real(fio,ir->epsilon_surface);
 +    
 +    gmx_fio_do_gmx_bool(fio,ir->bOptFFT);
 +
 +    gmx_fio_do_gmx_bool(fio,ir->bContinuation); 
 +    gmx_fio_do_int(fio,ir->etc);
 +    /* before version 18, ir->etc was a gmx_bool (ir->btc),
 +     * but the values 0 and 1 still mean no and
 +     * berendsen temperature coupling, respectively.
 +     */
 +    if (file_version >= 79) {
 +        gmx_fio_do_gmx_bool(fio,ir->bPrintNHChains);
 +    }
 +    if (file_version >= 71)
 +    {
 +        gmx_fio_do_int(fio,ir->nsttcouple);
 +    }
 +    else
 +    {
 +        ir->nsttcouple = ir->nstcalcenergy;
 +    }
 +    if (file_version <= 15)
 +    {
 +        gmx_fio_do_int(fio,idum);
 +    }
 +    if (file_version <=17)
 +    {
 +        gmx_fio_do_int(fio,ir->epct); 
 +        if (file_version <= 15)
 +        {
 +            if (ir->epct == 5)
 +            {
 +                ir->epct = epctSURFACETENSION;
 +            }
 +            gmx_fio_do_int(fio,idum);
 +        }
 +        ir->epct -= 1;
 +        /* we have removed the NO alternative at the beginning */
 +        if(ir->epct==-1)
 +        {
 +            ir->epc=epcNO;
 +            ir->epct=epctISOTROPIC;
 +        } 
 +        else
 +        {
 +            ir->epc=epcBERENDSEN;
 +        }
 +    } 
 +    else
 +    {
 +        gmx_fio_do_int(fio,ir->epc);
 +        gmx_fio_do_int(fio,ir->epct);
 +    }
 +    if (file_version >= 71)
 +    {
 +        gmx_fio_do_int(fio,ir->nstpcouple);
 +    }
 +    else
 +    {
 +        ir->nstpcouple = ir->nstcalcenergy;
 +    }
 +    gmx_fio_do_real(fio,ir->tau_p); 
 +    if (file_version <= 15) {
 +      gmx_fio_do_rvec(fio,vdum);
 +      clear_mat(ir->ref_p);
 +      for(i=0; i<DIM; i++)
 +      ir->ref_p[i][i] = vdum[i];
 +    } else {
 +      gmx_fio_do_rvec(fio,ir->ref_p[XX]);
 +      gmx_fio_do_rvec(fio,ir->ref_p[YY]);
 +      gmx_fio_do_rvec(fio,ir->ref_p[ZZ]);
 +    }
 +    if (file_version <= 15) {
 +      gmx_fio_do_rvec(fio,vdum);
 +      clear_mat(ir->compress);
 +      for(i=0; i<DIM; i++)
 +      ir->compress[i][i] = vdum[i];
 +    } 
 +    else {
 +      gmx_fio_do_rvec(fio,ir->compress[XX]);
 +      gmx_fio_do_rvec(fio,ir->compress[YY]);
 +      gmx_fio_do_rvec(fio,ir->compress[ZZ]);
 +    }
 +    if (file_version >= 47) {
 +      gmx_fio_do_int(fio,ir->refcoord_scaling);
 +      gmx_fio_do_rvec(fio,ir->posres_com);
 +      gmx_fio_do_rvec(fio,ir->posres_comB);
 +    } else {
 +      ir->refcoord_scaling = erscNO;
 +      clear_rvec(ir->posres_com);
 +      clear_rvec(ir->posres_comB);
 +    }
 +    if((file_version > 25) && (file_version < 79))
 +        gmx_fio_do_int(fio,ir->andersen_seed);
 +    else
 +        ir->andersen_seed=0;
 +    if(file_version < 26) {
 +      gmx_fio_do_gmx_bool(fio,bSimAnn); 
 +      gmx_fio_do_real(fio,zerotemptime);
 +    }
 +    
 +    if (file_version < 37)
 +      gmx_fio_do_real(fio,rdum); 
 +
 +    gmx_fio_do_real(fio,ir->shake_tol);
 +    if (file_version < 54)
 +      gmx_fio_do_real(fio,*fudgeQQ);
 +
 +    gmx_fio_do_int(fio,ir->efep);
 +    if (file_version <= 14 && ir->efep != efepNO)
 +    {
 +        ir->efep = efepYES;
 +    }
 +    do_fepvals(fio,ir->fepvals,bRead,file_version);
 +
 +    if (file_version >= 79)
 +    {
 +        gmx_fio_do_gmx_bool(fio,ir->bSimTemp);
 +        if (ir->bSimTemp) 
 +        {
 +            ir->bSimTemp = TRUE;
 +        }
 +    }
 +    else
 +    {
 +        ir->bSimTemp = FALSE;
 +    }
 +    if (ir->bSimTemp)
 +    {
 +        do_simtempvals(fio,ir->simtempvals,ir->fepvals->n_lambda,bRead,file_version);
 +    }
 +
 +    if (file_version >= 79)
 +    {
 +        gmx_fio_do_gmx_bool(fio,ir->bExpanded);
 +        if (ir->bExpanded)
 +        {
 +            ir->bExpanded = TRUE;
 +        }
 +        else
 +        {
 +            ir->bExpanded = FALSE;
 +        }
 +    }
 +    if (ir->bExpanded)
 +    {
 +        do_expandedvals(fio,ir->expandedvals,ir->fepvals->n_lambda,bRead,file_version);
 +    }
 +    if (file_version >= 57) {
 +      gmx_fio_do_int(fio,ir->eDisre); 
 +    }
 +    gmx_fio_do_int(fio,ir->eDisreWeighting); 
 +    if (file_version < 22) {
 +      if (ir->eDisreWeighting == 0)
 +      ir->eDisreWeighting = edrwEqual;
 +      else
 +      ir->eDisreWeighting = edrwConservative;
 +    }
 +    gmx_fio_do_gmx_bool(fio,ir->bDisreMixed); 
 +    gmx_fio_do_real(fio,ir->dr_fc); 
 +    gmx_fio_do_real(fio,ir->dr_tau); 
 +    gmx_fio_do_int(fio,ir->nstdisreout);
 +    if (file_version >= 22) {
 +      gmx_fio_do_real(fio,ir->orires_fc);
 +      gmx_fio_do_real(fio,ir->orires_tau);
 +      gmx_fio_do_int(fio,ir->nstorireout);
 +    } else {
 +      ir->orires_fc = 0;
 +      ir->orires_tau = 0;
 +      ir->nstorireout = 0;
 +    }
 +    if(file_version >= 26 && file_version < 79) {
 +      gmx_fio_do_real(fio,ir->dihre_fc);
 +      if (file_version < 56) 
 +      {
 +          gmx_fio_do_real(fio,rdum);
 +          gmx_fio_do_int(fio,idum);
 +      }
 +    } else {
 +        ir->dihre_fc=0;
 +    }
 +
 +    gmx_fio_do_real(fio,ir->em_stepsize); 
 +    gmx_fio_do_real(fio,ir->em_tol); 
 +    if (file_version >= 22) 
 +      gmx_fio_do_gmx_bool(fio,ir->bShakeSOR);
 +    else if (bRead)
 +      ir->bShakeSOR = TRUE;
 +    if (file_version >= 11)
 +      gmx_fio_do_int(fio,ir->niter);
 +    else if (bRead) {
 +      ir->niter = 25;
 +      fprintf(stderr,"Note: niter not in run input file, setting it to %d\n",
 +            ir->niter);
 +    }
 +    if (file_version >= 21)
 +      gmx_fio_do_real(fio,ir->fc_stepsize);
 +    else
 +      ir->fc_stepsize = 0;
 +    gmx_fio_do_int(fio,ir->eConstrAlg);
 +    gmx_fio_do_int(fio,ir->nProjOrder);
 +    gmx_fio_do_real(fio,ir->LincsWarnAngle);
 +    if (file_version <= 14)
 +      gmx_fio_do_int(fio,idum);
 +    if (file_version >=26)
 +      gmx_fio_do_int(fio,ir->nLincsIter);
 +    else if (bRead) {
 +      ir->nLincsIter = 1;
 +      fprintf(stderr,"Note: nLincsIter not in run input file, setting it to %d\n",
 +            ir->nLincsIter);
 +    }
 +    if (file_version < 33)
 +      gmx_fio_do_real(fio,bd_temp);
 +    gmx_fio_do_real(fio,ir->bd_fric);
 +    gmx_fio_do_int(fio,ir->ld_seed);
 +    if (file_version >= 33) {
 +      for(i=0; i<DIM; i++)
 +      gmx_fio_do_rvec(fio,ir->deform[i]);
 +    } else {
 +      for(i=0; i<DIM; i++)
 +      clear_rvec(ir->deform[i]);
 +    }
 +    if (file_version >= 14)
 +      gmx_fio_do_real(fio,ir->cos_accel);
 +    else if (bRead)
 +      ir->cos_accel = 0;
 +    gmx_fio_do_int(fio,ir->userint1); 
 +    gmx_fio_do_int(fio,ir->userint2); 
 +    gmx_fio_do_int(fio,ir->userint3); 
 +    gmx_fio_do_int(fio,ir->userint4); 
 +    gmx_fio_do_real(fio,ir->userreal1); 
 +    gmx_fio_do_real(fio,ir->userreal2); 
 +    gmx_fio_do_real(fio,ir->userreal3); 
 +    gmx_fio_do_real(fio,ir->userreal4); 
 +    
 +    /* AdResS stuff */
 +    if (file_version >= 77) {
 +      gmx_fio_do_gmx_bool(fio,ir->bAdress);
 +      if(ir->bAdress){
 +          if (bRead) snew(ir->adress, 1);
 +          gmx_fio_do_int(fio,ir->adress->type);
 +          gmx_fio_do_real(fio,ir->adress->const_wf);
 +          gmx_fio_do_real(fio,ir->adress->ex_width);
 +          gmx_fio_do_real(fio,ir->adress->hy_width);
 +          gmx_fio_do_int(fio,ir->adress->icor);
 +          gmx_fio_do_int(fio,ir->adress->site);
 +          gmx_fio_do_rvec(fio,ir->adress->refs);
 +          gmx_fio_do_int(fio,ir->adress->n_tf_grps);
 +          gmx_fio_do_real(fio, ir->adress->ex_forcecap);
 +          gmx_fio_do_int(fio, ir->adress->n_energy_grps);
 +          gmx_fio_do_int(fio,ir->adress->do_hybridpairs);
 +
 +          if (bRead)snew(ir->adress->tf_table_index,ir->adress->n_tf_grps);
 +          if (ir->adress->n_tf_grps > 0) {
 +            bDum=gmx_fio_ndo_int(fio,ir->adress->tf_table_index,ir->adress->n_tf_grps);
 +          }
 +          if (bRead)snew(ir->adress->group_explicit,ir->adress->n_energy_grps);
 +          if (ir->adress->n_energy_grps > 0) {
 +            bDum=gmx_fio_ndo_int(fio, ir->adress->group_explicit,ir->adress->n_energy_grps);
 +          }
 +      }
 +    } else {
 +      ir->bAdress = FALSE;
 +    }
 +
 +    /* pull stuff */
 +    if (file_version >= 48) {
 +      gmx_fio_do_int(fio,ir->ePull);
 +      if (ir->ePull != epullNO) {
 +      if (bRead)
 +        snew(ir->pull,1);
 +      do_pull(fio, ir->pull,bRead,file_version);
 +      }
 +    } else {
 +      ir->ePull = epullNO;
 +    }
 +    
 +    /* Enforced rotation */
 +    if (file_version >= 74) {
 +        gmx_fio_do_int(fio,ir->bRot);
 +        if (ir->bRot == TRUE) {
 +            if (bRead)
 +                snew(ir->rot,1);
 +            do_rot(fio, ir->rot,bRead,file_version);
 +        }
 +    } else {
 +        ir->bRot = FALSE;
 +    }
 +    
 +    /* grpopts stuff */
 +    gmx_fio_do_int(fio,ir->opts.ngtc); 
 +    if (file_version >= 69) {
 +      gmx_fio_do_int(fio,ir->opts.nhchainlength);
 +    } else {
 +      ir->opts.nhchainlength = 1;
 +    }
 +    gmx_fio_do_int(fio,ir->opts.ngacc); 
 +    gmx_fio_do_int(fio,ir->opts.ngfrz); 
 +    gmx_fio_do_int(fio,ir->opts.ngener);
 +    
 +    if (bRead) {
 +      snew(ir->opts.nrdf,   ir->opts.ngtc); 
 +      snew(ir->opts.ref_t,  ir->opts.ngtc); 
 +      snew(ir->opts.annealing, ir->opts.ngtc); 
 +      snew(ir->opts.anneal_npoints, ir->opts.ngtc); 
 +      snew(ir->opts.anneal_time, ir->opts.ngtc); 
 +      snew(ir->opts.anneal_temp, ir->opts.ngtc); 
 +      snew(ir->opts.tau_t,  ir->opts.ngtc); 
 +      snew(ir->opts.nFreeze,ir->opts.ngfrz); 
 +      snew(ir->opts.acc,    ir->opts.ngacc); 
 +      snew(ir->opts.egp_flags,ir->opts.ngener*ir->opts.ngener);
 +    } 
 +    if (ir->opts.ngtc > 0) {
 +      if (bRead && file_version<13) {
 +      snew(tmp,ir->opts.ngtc);
 +      bDum=gmx_fio_ndo_int(fio,tmp, ir->opts.ngtc);
 +      for(i=0; i<ir->opts.ngtc; i++)
 +        ir->opts.nrdf[i] = tmp[i];
 +      sfree(tmp);
 +      } else {
 +      bDum=gmx_fio_ndo_real(fio,ir->opts.nrdf, ir->opts.ngtc);
 +      }
 +      bDum=gmx_fio_ndo_real(fio,ir->opts.ref_t,ir->opts.ngtc); 
 +      bDum=gmx_fio_ndo_real(fio,ir->opts.tau_t,ir->opts.ngtc); 
 +      if (file_version<33 && ir->eI==eiBD) {
 +      for(i=0; i<ir->opts.ngtc; i++)
 +        ir->opts.tau_t[i] = bd_temp;
 +      }
 +    }
 +    if (ir->opts.ngfrz > 0) 
 +      bDum=gmx_fio_ndo_ivec(fio,ir->opts.nFreeze,ir->opts.ngfrz);
 +    if (ir->opts.ngacc > 0) 
 +      gmx_fio_ndo_rvec(fio,ir->opts.acc,ir->opts.ngacc); 
 +    if (file_version >= 12)
 +      bDum=gmx_fio_ndo_int(fio,ir->opts.egp_flags,
 +                           ir->opts.ngener*ir->opts.ngener);
 +
 +    if(bRead && file_version < 26) {
 +      for(i=0;i<ir->opts.ngtc;i++) {
 +      if(bSimAnn) {
 +        ir->opts.annealing[i] = eannSINGLE;
 +        ir->opts.anneal_npoints[i] = 2;
 +        snew(ir->opts.anneal_time[i],2);
 +        snew(ir->opts.anneal_temp[i],2);
 +        /* calculate the starting/ending temperatures from reft, zerotemptime, and nsteps */
 +        finish_t = ir->init_t + ir->nsteps * ir->delta_t;
 +        init_temp = ir->opts.ref_t[i]*(1-ir->init_t/zerotemptime);
 +        finish_temp = ir->opts.ref_t[i]*(1-finish_t/zerotemptime);
 +        ir->opts.anneal_time[i][0] = ir->init_t;
 +        ir->opts.anneal_time[i][1] = finish_t;
 +        ir->opts.anneal_temp[i][0] = init_temp;
 +        ir->opts.anneal_temp[i][1] = finish_temp;
 +      } else {
 +        ir->opts.annealing[i] = eannNO;
 +        ir->opts.anneal_npoints[i] = 0;
 +      }
 +      }
 +    } else {
 +      /* file version 26 or later */
 +      /* First read the lists with annealing and npoints for each group */
 +      bDum=gmx_fio_ndo_int(fio,ir->opts.annealing,ir->opts.ngtc);
 +      bDum=gmx_fio_ndo_int(fio,ir->opts.anneal_npoints,ir->opts.ngtc);
 +      for(j=0;j<(ir->opts.ngtc);j++) {
 +      k=ir->opts.anneal_npoints[j];
 +      if(bRead) {
 +        snew(ir->opts.anneal_time[j],k);
 +        snew(ir->opts.anneal_temp[j],k);
 +      }
 +      bDum=gmx_fio_ndo_real(fio,ir->opts.anneal_time[j],k);
 +      bDum=gmx_fio_ndo_real(fio,ir->opts.anneal_temp[j],k);
 +      }
 +    }
 +    /* Walls */
 +    if (file_version >= 45) {
 +      gmx_fio_do_int(fio,ir->nwall);
 +      gmx_fio_do_int(fio,ir->wall_type);
 +      if (file_version >= 50)
 +      gmx_fio_do_real(fio,ir->wall_r_linpot);
 +      else
 +      ir->wall_r_linpot = -1;
 +      gmx_fio_do_int(fio,ir->wall_atomtype[0]);
 +      gmx_fio_do_int(fio,ir->wall_atomtype[1]);
 +      gmx_fio_do_real(fio,ir->wall_density[0]);
 +      gmx_fio_do_real(fio,ir->wall_density[1]);
 +      gmx_fio_do_real(fio,ir->wall_ewald_zfac);
 +    } else {
 +      ir->nwall = 0;
 +      ir->wall_type = 0;
 +      ir->wall_atomtype[0] = -1;
 +      ir->wall_atomtype[1] = -1;
 +      ir->wall_density[0] = 0;
 +      ir->wall_density[1] = 0;
 +      ir->wall_ewald_zfac = 3;
 +    }
 +    /* Cosine stuff for electric fields */
 +    for(j=0; (j<DIM); j++) {
 +      gmx_fio_do_int(fio,ir->ex[j].n);
 +      gmx_fio_do_int(fio,ir->et[j].n);
 +      if (bRead) {
 +      snew(ir->ex[j].a,  ir->ex[j].n);
 +      snew(ir->ex[j].phi,ir->ex[j].n);
 +      snew(ir->et[j].a,  ir->et[j].n);
 +      snew(ir->et[j].phi,ir->et[j].n);
 +      }
 +      bDum=gmx_fio_ndo_real(fio,ir->ex[j].a,  ir->ex[j].n);
 +      bDum=gmx_fio_ndo_real(fio,ir->ex[j].phi,ir->ex[j].n);
 +      bDum=gmx_fio_ndo_real(fio,ir->et[j].a,  ir->et[j].n);
 +      bDum=gmx_fio_ndo_real(fio,ir->et[j].phi,ir->et[j].n);
 +    }
 +    
 +    /* QMMM stuff */
 +    if(file_version>=39){
 +      gmx_fio_do_gmx_bool(fio,ir->bQMMM);
 +      gmx_fio_do_int(fio,ir->QMMMscheme);
 +      gmx_fio_do_real(fio,ir->scalefactor);
 +      gmx_fio_do_int(fio,ir->opts.ngQM);
 +      if (bRead) {
 +        snew(ir->opts.QMmethod,    ir->opts.ngQM);
 +        snew(ir->opts.QMbasis,     ir->opts.ngQM);
 +        snew(ir->opts.QMcharge,    ir->opts.ngQM);
 +        snew(ir->opts.QMmult,      ir->opts.ngQM);
 +        snew(ir->opts.bSH,         ir->opts.ngQM);
 +        snew(ir->opts.CASorbitals, ir->opts.ngQM);
 +        snew(ir->opts.CASelectrons,ir->opts.ngQM);
 +        snew(ir->opts.SAon,        ir->opts.ngQM);
 +        snew(ir->opts.SAoff,       ir->opts.ngQM);
 +        snew(ir->opts.SAsteps,     ir->opts.ngQM);
 +        snew(ir->opts.bOPT,        ir->opts.ngQM);
 +        snew(ir->opts.bTS,         ir->opts.ngQM);
 +      }
 +      if (ir->opts.ngQM > 0) {
 +        bDum=gmx_fio_ndo_int(fio,ir->opts.QMmethod,ir->opts.ngQM);
 +        bDum=gmx_fio_ndo_int(fio,ir->opts.QMbasis,ir->opts.ngQM);
 +        bDum=gmx_fio_ndo_int(fio,ir->opts.QMcharge,ir->opts.ngQM);
 +        bDum=gmx_fio_ndo_int(fio,ir->opts.QMmult,ir->opts.ngQM);
 +        bDum=gmx_fio_ndo_gmx_bool(fio,ir->opts.bSH,ir->opts.ngQM);
 +        bDum=gmx_fio_ndo_int(fio,ir->opts.CASorbitals,ir->opts.ngQM);
 +        bDum=gmx_fio_ndo_int(fio,ir->opts.CASelectrons,ir->opts.ngQM);
 +        bDum=gmx_fio_ndo_real(fio,ir->opts.SAon,ir->opts.ngQM);
 +        bDum=gmx_fio_ndo_real(fio,ir->opts.SAoff,ir->opts.ngQM);
 +        bDum=gmx_fio_ndo_int(fio,ir->opts.SAsteps,ir->opts.ngQM);
 +        bDum=gmx_fio_ndo_gmx_bool(fio,ir->opts.bOPT,ir->opts.ngQM);
 +        bDum=gmx_fio_ndo_gmx_bool(fio,ir->opts.bTS,ir->opts.ngQM);
 +      }
 +      /* end of QMMM stuff */
 +    }    
 +}
 +
 +
 +static void do_harm(t_fileio *fio, t_iparams *iparams,gmx_bool bRead)
 +{ /* Passes the four fields of iparams->harmonic through fio in the fixed
 +   * order rA, krA, rB, krB. bRead is accepted but not used in this body;
 +   * the transfer direction is presumably decided by fio -- confirm.
 +   */
 +  gmx_fio_do_real(fio,iparams->harmonic.rA);
 +  gmx_fio_do_real(fio,iparams->harmonic.krA);
 +  gmx_fio_do_real(fio,iparams->harmonic.rB);
 +  gmx_fio_do_real(fio,iparams->harmonic.krB);
 +}
 +
 +void do_iparams(t_fileio *fio, t_functype ftype,t_iparams *iparams,
 +                gmx_bool bRead, int file_version)
 +{
 +  int idum;
 +  gmx_bool bDum;
 +  real rdum;
 +  
 +  if (!bRead)
 +    gmx_fio_set_comment(fio, interaction_function[ftype].name);
 +  switch (ftype) {
 +  case F_ANGLES:
 +  case F_G96ANGLES:
 +  case F_BONDS:
 +  case F_G96BONDS:
 +  case F_HARMONIC:
 +  case F_IDIHS:
 +    do_harm(fio, iparams,bRead);
 +    if ((ftype == F_ANGRES || ftype == F_ANGRESZ) && bRead) {
 +      /* Correct incorrect storage of parameters */
 +      iparams->pdihs.phiB = iparams->pdihs.phiA;
 +      iparams->pdihs.cpB  = iparams->pdihs.cpA;
 +    }
 +    break;
 +  case F_LINEAR_ANGLES:
 +    gmx_fio_do_real(fio,iparams->linangle.klinA);
 +    gmx_fio_do_real(fio,iparams->linangle.aA);
 +    gmx_fio_do_real(fio,iparams->linangle.klinB);
 +    gmx_fio_do_real(fio,iparams->linangle.aB);
 +    break;
 +  case F_FENEBONDS:
 +    gmx_fio_do_real(fio,iparams->fene.bm);
 +    gmx_fio_do_real(fio,iparams->fene.kb);
 +    break;
 +  case F_RESTRBONDS:
 +    gmx_fio_do_real(fio,iparams->restraint.lowA);
 +    gmx_fio_do_real(fio,iparams->restraint.up1A);
 +    gmx_fio_do_real(fio,iparams->restraint.up2A);
 +    gmx_fio_do_real(fio,iparams->restraint.kA);
 +    gmx_fio_do_real(fio,iparams->restraint.lowB);
 +    gmx_fio_do_real(fio,iparams->restraint.up1B);
 +    gmx_fio_do_real(fio,iparams->restraint.up2B);
 +    gmx_fio_do_real(fio,iparams->restraint.kB);
 +    break;
 +  case F_TABBONDS:
 +  case F_TABBONDSNC:
 +  case F_TABANGLES:
 +  case F_TABDIHS:
 +    gmx_fio_do_real(fio,iparams->tab.kA);
 +    gmx_fio_do_int(fio,iparams->tab.table);
 +    gmx_fio_do_real(fio,iparams->tab.kB);
 +    break;
 +  case F_CROSS_BOND_BONDS:
 +    gmx_fio_do_real(fio,iparams->cross_bb.r1e);
 +    gmx_fio_do_real(fio,iparams->cross_bb.r2e);
 +    gmx_fio_do_real(fio,iparams->cross_bb.krr);
 +    break;
 +  case F_CROSS_BOND_ANGLES:
 +    gmx_fio_do_real(fio,iparams->cross_ba.r1e);
 +    gmx_fio_do_real(fio,iparams->cross_ba.r2e);
 +    gmx_fio_do_real(fio,iparams->cross_ba.r3e);
 +    gmx_fio_do_real(fio,iparams->cross_ba.krt);
 +    break;
 +  case F_UREY_BRADLEY:
 +    gmx_fio_do_real(fio,iparams->u_b.thetaA);
 +    gmx_fio_do_real(fio,iparams->u_b.kthetaA);
 +    gmx_fio_do_real(fio,iparams->u_b.r13A);
 +    gmx_fio_do_real(fio,iparams->u_b.kUBA);
 +    if (file_version >= 79) {
 +        gmx_fio_do_real(fio,iparams->u_b.thetaB);
 +        gmx_fio_do_real(fio,iparams->u_b.kthetaB);
 +        gmx_fio_do_real(fio,iparams->u_b.r13B);
 +        gmx_fio_do_real(fio,iparams->u_b.kUBB);
 +    } else {
 +        iparams->u_b.thetaB=iparams->u_b.thetaA;
 +        iparams->u_b.kthetaB=iparams->u_b.kthetaA;
 +        iparams->u_b.r13B=iparams->u_b.r13A;
 +        iparams->u_b.kUBB=iparams->u_b.kUBA;
 +    }
 +    break;
 +  case F_QUARTIC_ANGLES:
 +    gmx_fio_do_real(fio,iparams->qangle.theta);
 +    bDum=gmx_fio_ndo_real(fio,iparams->qangle.c,5);
 +    break;
 +  case F_BHAM:
 +    gmx_fio_do_real(fio,iparams->bham.a);
 +    gmx_fio_do_real(fio,iparams->bham.b);
 +    gmx_fio_do_real(fio,iparams->bham.c);
 +    break;
 +  case F_MORSE:
 +    gmx_fio_do_real(fio,iparams->morse.b0A);
 +    gmx_fio_do_real(fio,iparams->morse.cbA);
 +    gmx_fio_do_real(fio,iparams->morse.betaA);
 +    if (file_version >= 79) {
 +        gmx_fio_do_real(fio,iparams->morse.b0B);
 +        gmx_fio_do_real(fio,iparams->morse.cbB);
 +        gmx_fio_do_real(fio,iparams->morse.betaB);
 +    } else {
 +        iparams->morse.b0B = iparams->morse.b0A;
 +        iparams->morse.cbB = iparams->morse.cbA;
 +        iparams->morse.betaB = iparams->morse.betaA;
 +    }
 +    break;
 +  case F_CUBICBONDS:
 +    gmx_fio_do_real(fio,iparams->cubic.b0);
 +    gmx_fio_do_real(fio,iparams->cubic.kb);
 +    gmx_fio_do_real(fio,iparams->cubic.kcub);
 +    break;
 +  case F_CONNBONDS:
 +    break;
 +  case F_POLARIZATION:
 +    gmx_fio_do_real(fio,iparams->polarize.alpha);
 +    break;
 +  case F_ANHARM_POL:
 +    gmx_fio_do_real(fio,iparams->anharm_polarize.alpha);
 +    gmx_fio_do_real(fio,iparams->anharm_polarize.drcut);
 +    gmx_fio_do_real(fio,iparams->anharm_polarize.khyp);
 +    break;
 +  case F_WATER_POL:
 +    if (file_version < 31) 
 +      gmx_fatal(FARGS,"Old tpr files with water_polarization not supported. Make a new.");
 +    gmx_fio_do_real(fio,iparams->wpol.al_x);
 +    gmx_fio_do_real(fio,iparams->wpol.al_y);
 +    gmx_fio_do_real(fio,iparams->wpol.al_z);
 +    gmx_fio_do_real(fio,iparams->wpol.rOH);
 +    gmx_fio_do_real(fio,iparams->wpol.rHH);
 +    gmx_fio_do_real(fio,iparams->wpol.rOD);
 +    break;
 +  case F_THOLE_POL:
 +    gmx_fio_do_real(fio,iparams->thole.a);
 +    gmx_fio_do_real(fio,iparams->thole.alpha1);
 +    gmx_fio_do_real(fio,iparams->thole.alpha2);
 +    gmx_fio_do_real(fio,iparams->thole.rfac);
 +    break;
 +  case F_LJ:
 +    gmx_fio_do_real(fio,iparams->lj.c6);
 +    gmx_fio_do_real(fio,iparams->lj.c12);
 +    break;
 +  case F_LJ14:
 +    gmx_fio_do_real(fio,iparams->lj14.c6A);
 +    gmx_fio_do_real(fio,iparams->lj14.c12A);
 +    gmx_fio_do_real(fio,iparams->lj14.c6B);
 +    gmx_fio_do_real(fio,iparams->lj14.c12B);
 +    break;
 +  case F_LJC14_Q:
 +    gmx_fio_do_real(fio,iparams->ljc14.fqq);
 +    gmx_fio_do_real(fio,iparams->ljc14.qi);
 +    gmx_fio_do_real(fio,iparams->ljc14.qj);
 +    gmx_fio_do_real(fio,iparams->ljc14.c6);
 +    gmx_fio_do_real(fio,iparams->ljc14.c12);
 +    break;
 +  case F_LJC_PAIRS_NB:
 +    gmx_fio_do_real(fio,iparams->ljcnb.qi);
 +    gmx_fio_do_real(fio,iparams->ljcnb.qj);
 +    gmx_fio_do_real(fio,iparams->ljcnb.c6);
 +    gmx_fio_do_real(fio,iparams->ljcnb.c12);
 +    break;
 +  case F_PDIHS:
 +  case F_PIDIHS:
 +  case F_ANGRES:
 +  case F_ANGRESZ:
 +    gmx_fio_do_real(fio,iparams->pdihs.phiA);
 +    gmx_fio_do_real(fio,iparams->pdihs.cpA);
 +    if ((ftype == F_ANGRES || ftype == F_ANGRESZ) && file_version < 42) {
 +      /* Read the incorrectly stored multiplicity */
 +      gmx_fio_do_real(fio,iparams->harmonic.rB);
 +      gmx_fio_do_real(fio,iparams->harmonic.krB);
 +      iparams->pdihs.phiB = iparams->pdihs.phiA;
 +      iparams->pdihs.cpB  = iparams->pdihs.cpA;
 +    } else {
 +      gmx_fio_do_real(fio,iparams->pdihs.phiB);
 +      gmx_fio_do_real(fio,iparams->pdihs.cpB);
 +      gmx_fio_do_int(fio,iparams->pdihs.mult);
 +    }
 +    break;
 +  case F_DISRES:
 +    gmx_fio_do_int(fio,iparams->disres.label);
 +    gmx_fio_do_int(fio,iparams->disres.type);
 +    gmx_fio_do_real(fio,iparams->disres.low);
 +    gmx_fio_do_real(fio,iparams->disres.up1);
 +    gmx_fio_do_real(fio,iparams->disres.up2);
 +    gmx_fio_do_real(fio,iparams->disres.kfac);
 +    break;
 +  case F_ORIRES:
 +    gmx_fio_do_int(fio,iparams->orires.ex);
 +    gmx_fio_do_int(fio,iparams->orires.label);
 +    gmx_fio_do_int(fio,iparams->orires.power);
 +    gmx_fio_do_real(fio,iparams->orires.c);
 +    gmx_fio_do_real(fio,iparams->orires.obs);
 +    gmx_fio_do_real(fio,iparams->orires.kfac);
 +    break;
 +  case F_DIHRES:
 +    if ( file_version < 72) {
 +        gmx_fio_do_int(fio,idum);
 +        gmx_fio_do_int(fio,idum);
 +    }
 +    gmx_fio_do_real(fio,iparams->dihres.phiA);
 +    gmx_fio_do_real(fio,iparams->dihres.dphiA);
 +    gmx_fio_do_real(fio,iparams->dihres.kfacA);
 +    if (file_version >= 72) {
 +        gmx_fio_do_real(fio,iparams->dihres.phiB);
 +        gmx_fio_do_real(fio,iparams->dihres.dphiB);
 +        gmx_fio_do_real(fio,iparams->dihres.kfacB);
 +    } else {
 +        iparams->dihres.phiB=iparams->dihres.phiA;
 +        iparams->dihres.dphiB=iparams->dihres.dphiA;
 +        iparams->dihres.kfacB=iparams->dihres.kfacA;
 +    }
 +    break;
 +  case F_POSRES:
 +    gmx_fio_do_rvec(fio,iparams->posres.pos0A);
 +    gmx_fio_do_rvec(fio,iparams->posres.fcA);
 +    if (bRead && file_version < 27) {
 +      copy_rvec(iparams->posres.pos0A,iparams->posres.pos0B);
 +      copy_rvec(iparams->posres.fcA,iparams->posres.fcB);
 +    } else {
 +      gmx_fio_do_rvec(fio,iparams->posres.pos0B);
 +      gmx_fio_do_rvec(fio,iparams->posres.fcB);
 +    }
 +    break;
 +  case F_FBPOSRES:
 +      gmx_fio_do_int(fio,iparams->fbposres.geom);
 +      gmx_fio_do_rvec(fio,iparams->fbposres.pos0);
 +      gmx_fio_do_real(fio,iparams->fbposres.r);
 +      gmx_fio_do_real(fio,iparams->fbposres.k);
 +      break;
 +  case F_RBDIHS:
 +    bDum=gmx_fio_ndo_real(fio,iparams->rbdihs.rbcA,NR_RBDIHS);
 +    if(file_version>=25) 
 +      bDum=gmx_fio_ndo_real(fio,iparams->rbdihs.rbcB,NR_RBDIHS);
 +    break;
 +  case F_FOURDIHS:
 +    /* Fourier dihedrals are internally represented
 +     * as Ryckaert-Bellemans since those are faster to compute.
 +     */
 +     bDum=gmx_fio_ndo_real(fio,iparams->rbdihs.rbcA, NR_RBDIHS);
 +     bDum=gmx_fio_ndo_real(fio,iparams->rbdihs.rbcB, NR_RBDIHS);
 +    break;
 +  case F_CONSTR:
 +  case F_CONSTRNC:
 +    gmx_fio_do_real(fio,iparams->constr.dA);
 +    gmx_fio_do_real(fio,iparams->constr.dB);
 +    break;
 +  case F_SETTLE:
 +    gmx_fio_do_real(fio,iparams->settle.doh);
 +    gmx_fio_do_real(fio,iparams->settle.dhh);
 +    break;
 +  case F_VSITE2:
 +    gmx_fio_do_real(fio,iparams->vsite.a);
 +    break;
 +  case F_VSITE3:
 +  case F_VSITE3FD:
 +  case F_VSITE3FAD:
 +    gmx_fio_do_real(fio,iparams->vsite.a);
 +    gmx_fio_do_real(fio,iparams->vsite.b);
 +    break;
 +  case F_VSITE3OUT:
 +  case F_VSITE4FD: 
 +  case F_VSITE4FDN: 
 +    gmx_fio_do_real(fio,iparams->vsite.a);
 +    gmx_fio_do_real(fio,iparams->vsite.b);
 +    gmx_fio_do_real(fio,iparams->vsite.c);
 +    break;
 +  case F_VSITEN:
 +    gmx_fio_do_int(fio,iparams->vsiten.n);
 +    gmx_fio_do_real(fio,iparams->vsiten.a);
 +    break;
 +  case F_GB12:
 +  case F_GB13:
 +  case F_GB14:
 +    /* We got rid of some parameters in version 68 */
 +    if(bRead && file_version<68)
 +    {
 +        gmx_fio_do_real(fio,rdum);    
 +        gmx_fio_do_real(fio,rdum);    
 +        gmx_fio_do_real(fio,rdum);    
 +        gmx_fio_do_real(fio,rdum);    
 +    }
 +      gmx_fio_do_real(fio,iparams->gb.sar);   
 +      gmx_fio_do_real(fio,iparams->gb.st);
 +      gmx_fio_do_real(fio,iparams->gb.pi);
 +      gmx_fio_do_real(fio,iparams->gb.gbr);
 +      gmx_fio_do_real(fio,iparams->gb.bmlt);
 +      break;
 +  case F_CMAP:
 +      gmx_fio_do_int(fio,iparams->cmap.cmapA);
 +      gmx_fio_do_int(fio,iparams->cmap.cmapB);
 +    break;
 +  default:
 +      gmx_fatal(FARGS,"unknown function type %d (%s) in %s line %d",
 +                ftype,interaction_function[ftype].name,__FILE__,__LINE__);
 +  }
 +  if (!bRead)
 +    gmx_fio_unset_comment(fio);
 +}
 +
 +static void do_ilist(t_fileio *fio, t_ilist *ilist,gmx_bool bRead,int file_version,
 +                   int ftype)
 +{ /* Transfers one interaction list: the count ilist->nr followed by
 +   * ilist->nr entries of ilist->iatoms. When reading, iatoms is allocated
 +   * here with ilist->nr elements. When writing, the name of interaction
 +   * ftype is set as the fio comment for the duration of the call.
 +   */
 +  int  i,k,idum; /* k is never used in this function */
 +  gmx_bool bDum=TRUE;
 +  
 +  if (!bRead) {
 +    gmx_fio_set_comment(fio, interaction_function[ftype].name);
 +  }
 +  if (file_version < 44) { /* older files: MAXNODES ints pass through idum and are not stored anywhere */
 +    for(i=0; i<MAXNODES; i++)
 +      gmx_fio_do_int(fio,idum);
 +  }
 +  gmx_fio_do_int(fio,ilist->nr);
 +  if (bRead)
 +    snew(ilist->iatoms,ilist->nr);
 +  bDum=gmx_fio_ndo_int(fio,ilist->iatoms,ilist->nr); /* result is stored in bDum but never inspected */
 +  if (!bRead)
 +    gmx_fio_unset_comment(fio);
 +}
 +
 +static void do_ffparams(t_fileio *fio, gmx_ffparams_t *ffparams,
 +                      gmx_bool bRead, int file_version)
 +{ /* Transfers the force-field parameter table in this order: atnr,
 +   * ntypes, the functype array, reppow (file_version >= 66, otherwise
 +   * set to 12.0), fudgeQQ (file_version >= 57), and then the parameters
 +   * of every type via do_iparams(). When reading, functype and iparams
 +   * are allocated here with ntypes elements each.
 +   */
 +  int  idum,i,j; /* j is never used in this function */
 +  gmx_bool bDum=TRUE;
 +  unsigned int k;
 +
 +  gmx_fio_do_int(fio,ffparams->atnr);
 +  if (file_version < 57) { /* one extra int in older files; its value is not kept */
 +    gmx_fio_do_int(fio,idum);
 +  }
 +  gmx_fio_do_int(fio,ffparams->ntypes);
 +  if (bRead && debug)
 +    fprintf(debug,"ffparams->atnr = %d, ntypes = %d\n",
 +          ffparams->atnr,ffparams->ntypes);
 +  if (bRead) {
 +    snew(ffparams->functype,ffparams->ntypes);
 +    snew(ffparams->iparams,ffparams->ntypes);
 +  }
 +  /* Read/write all the function types */
 +  bDum=gmx_fio_ndo_int(fio,ffparams->functype,ffparams->ntypes);
 +  if (bRead && debug)
 +    pr_ivec(debug,0,"functype",ffparams->functype,ffparams->ntypes,TRUE);
 +
 +  if (file_version >= 66) {
 +    gmx_fio_do_double(fio,ffparams->reppow);
 +  } else {
 +    ffparams->reppow = 12.0;
 +  }
 +
 +  if (file_version >= 57) {
 +    gmx_fio_do_real(fio,ffparams->fudgeQQ);
 +  }
 +
 +  /* Check whether all these function types are supported by the code.
 +   * In practice the code is backwards compatible, which means that the
 +   * numbering may have to be altered from old numbering to new numbering
 +   */
 +  for (i=0; (i<ffparams->ntypes); i++) {
 +    if (bRead) /* brace-less: this if governs only the for-k loop that follows */
 +      /* Loop over file versions */
 +      for (k=0; (k<NFTUPD); k++)
 +      /* Compare the read file_version to the update table */
 +      if ((file_version < ftupd[k].fvnr) && 
 +          (ffparams->functype[i] >= ftupd[k].ftype)) {
 +        ffparams->functype[i] += 1; /* the incremented value is what later ftupd entries are compared against */
 +        if (debug) {
 +          fprintf(debug,"Incrementing function type %d to %d (due to %s)\n",
 +                  i,ffparams->functype[i],
 +                  interaction_function[ftupd[k].ftype].longname);
 +          fflush(debug);
 +        }
 +      }
 +    
 +    do_iparams(fio, ffparams->functype[i],&ffparams->iparams[i],bRead,
 +               file_version); /* runs for every i, for both reading and writing */
 +    if (bRead && debug)
 +      pr_iparams(debug,ffparams->functype[i],&ffparams->iparams[i]);
 +  }
 +}
 +
 +static void add_settle_atoms(t_ilist *ilist)
 +{
 +    int i;
 +
 +    /* Settle used to only store the first atom: add the other two */
 +    srenew(ilist->iatoms,2*ilist->nr);
 +    for(i=ilist->nr/2-1; i>=0; i--)
 +    {
 +        ilist->iatoms[4*i+0] = ilist->iatoms[2*i+0];
 +        ilist->iatoms[4*i+1] = ilist->iatoms[2*i+1];
 +        ilist->iatoms[4*i+2] = ilist->iatoms[2*i+1] + 1;
 +        ilist->iatoms[4*i+3] = ilist->iatoms[2*i+1] + 2;
 +    }
 +    ilist->nr = 2*ilist->nr;
 +}
 +
 +static void do_ilists(t_fileio *fio, t_ilist *ilist,gmx_bool bRead, 
 +                      int file_version)
 +{
 +  int i,j,renum[F_NRE];
 +  gmx_bool bDum=TRUE,bClear;
 +  unsigned int k;
 +  
 +  for(j=0; (j<F_NRE); j++) {
 +    bClear = FALSE;
 +    if (bRead)
 +      for (k=0; k<NFTUPD; k++)
 +        if ((file_version < ftupd[k].fvnr) && (j == ftupd[k].ftype)) 
 +          bClear = TRUE;
 +    if (bClear) {
 +      ilist[j].nr = 0;
 +      ilist[j].iatoms = NULL;
 +    } else {
 +      do_ilist(fio, &ilist[j],bRead,file_version,j);
 +      if (file_version < 78 && j == F_SETTLE && ilist[j].nr > 0)
 +      {
 +          add_settle_atoms(&ilist[j]);
 +      }
 +    }
 +    /*
 +    if (bRead && gmx_debug_at)
 +      pr_ilist(debug,0,interaction_function[j].longname,
 +             functype,&ilist[j],TRUE);
 +    */
 +  }
 +}
 +
 +static void do_idef(t_fileio *fio, gmx_ffparams_t *ffparams,gmx_moltype_t *molt,
 +                  gmx_bool bRead, int file_version)
 +{ /* Transfers, in order: the force-field parameters (do_ffparams),
 +   * fudgeQQ when file_version >= 54, and the interaction lists of molt
 +   * (do_ilists).
 +   */
 +  do_ffparams(fio, ffparams,bRead,file_version);
 +    
 +  if (file_version >= 54) { /* NOTE(review): do_ffparams() also transfers fudgeQQ when file_version >= 57 -- confirm this function is only reached for older files */
 +    gmx_fio_do_real(fio,ffparams->fudgeQQ);
 +  }
 +
 +  do_ilists(fio, molt->ilist,bRead,file_version);
 +}
 +
 +static void do_block(t_fileio *fio, t_block *block,gmx_bool bRead,int file_version)
 +{ /* Transfers a t_block: block->nr followed by the nr+1 entries of
 +   * block->index. When reading, index is allocated here with nr+1
 +   * elements. Extra fields present in older file versions are passed
 +   * through temporaries and not stored in block.
 +   */
 +  int  i,idum,dum_nra,*dum_a;
 +  gmx_bool bDum=TRUE;
 +
 +  if (file_version < 44) /* older files: MAXNODES ints that are not kept */
 +    for(i=0; i<MAXNODES; i++)
 +      gmx_fio_do_int(fio,idum);
 +  gmx_fio_do_int(fio,block->nr);
 +  if (file_version < 51) /* dum_nra only receives a value on this path */
 +    gmx_fio_do_int(fio,dum_nra);
 +  if (bRead) {
 +    block->nalloc_index = block->nr+1;
 +    snew(block->index,block->nalloc_index);
 +  }
 +  bDum=gmx_fio_ndo_int(fio,block->index,block->nr+1);
 +
 +  if (file_version < 51 && dum_nra > 0) { /* same version test first, so dum_nra is only examined when it was set above */
 +    snew(dum_a,dum_nra); /* scratch array: filled by the transfer below and freed straight away */
 +    bDum=gmx_fio_ndo_int(fio,dum_a,dum_nra);
 +    sfree(dum_a);
 +  }
 +}
 +
 +static void do_blocka(t_fileio *fio, t_blocka *block,gmx_bool bRead,
 +                      int file_version)
 +{ /* Transfers a t_blocka: block->nr and block->nra, then the nr+1
 +   * entries of block->index and the nra entries of block->a. When
 +   * reading, both arrays are allocated here and nalloc_index / nalloc_a
 +   * are set to the allocated sizes.
 +   */
 +  int  i,idum;
 +  gmx_bool bDum=TRUE;
 +
 +  if (file_version < 44) /* older files: MAXNODES ints that are not kept */
 +    for(i=0; i<MAXNODES; i++)
 +      gmx_fio_do_int(fio,idum);
 +  gmx_fio_do_int(fio,block->nr);
 +  gmx_fio_do_int(fio,block->nra);
 +  if (bRead) {
 +    block->nalloc_index = block->nr+1;
 +    snew(block->index,block->nalloc_index);
 +    block->nalloc_a = block->nra;
 +    snew(block->a,block->nalloc_a);
 +  }
 +  bDum=gmx_fio_ndo_int(fio,block->index,block->nr+1);
 +  bDum=gmx_fio_ndo_int(fio,block->a,block->nra);
 +}
 +
 +static void do_atom(t_fileio *fio, t_atom *atom,int ngrp,gmx_bool bRead, 
 +                    int file_version, gmx_groups_t *groups,int atnr)
 +{ /* Transfers the fields of one atom (m, q, mB, qB, type, typeB, ptype,
 +   * resind and, for file_version >= 52, atomnumber). For
 +   * file_version < 57 a block of per-atom group numbers follows in the
 +   * file; it is copied into groups->grpnr[group][atnr], so atnr is used
 +   * here as the index of this atom.
 +   */
 +  int i,myngrp;
 +  
 +  gmx_fio_do_real(fio,atom->m);
 +  gmx_fio_do_real(fio,atom->q);
 +  gmx_fio_do_real(fio,atom->mB);
 +  gmx_fio_do_real(fio,atom->qB);
 +  gmx_fio_do_ushort(fio, atom->type);
 +  gmx_fio_do_ushort(fio, atom->typeB);
 +  gmx_fio_do_int(fio,atom->ptype);
 +  gmx_fio_do_int(fio,atom->resind);
 +  if (file_version >= 52)
 +    gmx_fio_do_int(fio,atom->atomnumber);
 +  else if (bRead)
 +    atom->atomnumber = NOTSET;
 +  if (file_version < 23) /* myngrp = number of group entries actually present in the file */
 +    myngrp = 8;
 +  else if (file_version < 39) 
 +    myngrp = 9;
 +  else
 +    myngrp = ngrp;
 +
 +  if (file_version < 57) {
 +    unsigned char uchar[egcNR]; /* NOTE(review): indexed up to ngrp-1 and filled with myngrp entries; assumes both fit in egcNR -- confirm */
 +    gmx_fio_ndo_uchar(fio,uchar,myngrp);
 +    for(i=myngrp; (i<ngrp); i++) { /* groups the file does not contain get number 0 */
 +      uchar[i] = 0;
 +    }
 +    /* Copy the old data format to the groups struct */
 +    for(i=0; i<ngrp; i++) {
 +      groups->grpnr[i][atnr] = uchar[i];
 +    }
 +  }
 +}
 +
 +static void do_grps(t_fileio *fio, int ngrp,t_grps grps[],gmx_bool bRead, 
 +                    int file_version)
 +{ /* Transfers ngrp group descriptors. For each group present in the file
 +   * the count grps[j].nr and the nr entries of grps[j].nm_ind are
 +   * transferred (nm_ind is allocated when reading). Groups beyond the
 +   * number stored in older files are set to one freshly allocated entry.
 +   */
 +  int i,j,myngrp;
 +  gmx_bool bDum=TRUE;
 +  
 +  if (file_version < 23) /* myngrp = number of groups actually present in the file */
 +    myngrp = 8;
 +  else if (file_version < 39) 
 +    myngrp = 9;
 +  else
 +    myngrp = ngrp;
 +
 +  for(j=0; (j<ngrp); j++) {
 +    if (j<myngrp) {
 +      gmx_fio_do_int(fio,grps[j].nr);
 +      if (bRead)
 +      snew(grps[j].nm_ind,grps[j].nr);
 +      bDum=gmx_fio_ndo_int(fio,grps[j].nm_ind,grps[j].nr);
 +    }
 +    else { /* only reachable when file_version < 39; note this branch does not test bRead */
 +      grps[j].nr = 1;
 +      snew(grps[j].nm_ind,grps[j].nr);
 +    }
 +  }
 +}
 +
 +static void do_symstr(t_fileio *fio, char ***nm,gmx_bool bRead,t_symtab *symtab)
 +{
 +  int ls;
 +  
 +  if (bRead) {
 +    gmx_fio_do_int(fio,ls);
 +    *nm = get_symtab_handle(symtab,ls);
 +  }
 +  else {
 +    ls = lookup_symtab(symtab,*nm);
 +    gmx_fio_do_int(fio,ls);
 +  }
 +}
 +
 +static void do_strstr(t_fileio *fio, int nstr,char ***nm,gmx_bool bRead,
 +                      t_symtab *symtab)
 +{
 +  int  j;
 +  
 +  for (j=0; (j<nstr); j++) 
 +    do_symstr(fio, &(nm[j]),bRead,symtab);
 +}
 +
 +static void do_resinfo(t_fileio *fio, int n,t_resinfo *ri,gmx_bool bRead,
 +                       t_symtab *symtab, int file_version)
 +{ /* Transfers n residue records: the name (through the symbol table)
 +   * and, for file_version >= 63, the fields nr and ic.
 +   */
 +  int  j;
 +  
 +  for (j=0; (j<n); j++) {
 +    do_symstr(fio, &(ri[j].name),bRead,symtab);
 +    if (file_version >= 63) {
 +      gmx_fio_do_int(fio,ri[j].nr);
 +      gmx_fio_do_uchar(fio, ri[j].ic);
 +    } else { /* fields absent from the file: nr becomes the 1-based position, ic a blank */
 +      ri[j].nr = j + 1;
 +      ri[j].ic = ' ';
 +    }
 +  }
 +}
 +
 +static void do_atoms(t_fileio *fio, t_atoms *atoms,gmx_bool bRead,t_symtab *symtab,
 +                   int file_version,
 +                   gmx_groups_t *groups)
 +{ /* Transfers a t_atoms structure: the counts nr and nres, every atom
 +   * (do_atom), the atom names, the atom type names for both states, and
 +   * the residue records. For file_version < 57 the group data stored
 +   * alongside the atoms is transferred into *groups as well. When
 +   * reading, all arrays are allocated here and pdbinfo is set to NULL.
 +   */
 +  int i;
 +  
 +  gmx_fio_do_int(fio,atoms->nr);
 +  gmx_fio_do_int(fio,atoms->nres);
 +  if (file_version < 57) { /* note: this allocation does not test bRead */
 +    gmx_fio_do_int(fio,groups->ngrpname);
 +    for(i=0; i<egcNR; i++) {
 +      groups->ngrpnr[i] = atoms->nr;
 +      snew(groups->grpnr[i],groups->ngrpnr[i]);
 +    }
 +  }
 +  if (bRead) {
 +    snew(atoms->atom,atoms->nr);
 +    snew(atoms->atomname,atoms->nr);
 +    snew(atoms->atomtype,atoms->nr);
 +    snew(atoms->atomtypeB,atoms->nr);
 +    snew(atoms->resinfo,atoms->nres);
 +    if (file_version < 57) {
 +      snew(groups->grpname,groups->ngrpname);
 +    }
 +    atoms->pdbinfo = NULL;
 +  }
 +  for(i=0; (i<atoms->nr); i++) {
 +    do_atom(fio, &atoms->atom[i],egcNR,bRead, file_version,groups,i);
 +  }
 +  do_strstr(fio, atoms->nr,atoms->atomname,bRead,symtab);
 +  if (bRead && (file_version <= 20)) { /* no type names are transferred here: every atom gets the placeholder "?" */
 +    for(i=0; i<atoms->nr; i++) {
 +      atoms->atomtype[i]  = put_symtab(symtab,"?");
 +      atoms->atomtypeB[i] = put_symtab(symtab,"?");
 +    }
 +  } else {
 +    do_strstr(fio, atoms->nr,atoms->atomtype,bRead,symtab);
 +    do_strstr(fio, atoms->nr,atoms->atomtypeB,bRead,symtab);
 +  }
 +  do_resinfo(fio, atoms->nres,atoms->resinfo,bRead,symtab,file_version);
 +
 +  if (file_version < 57) {
 +    do_strstr(fio, groups->ngrpname,groups->grpname,bRead,symtab);
 +  
 +    do_grps(fio, egcNR,groups->grps,bRead,file_version);
 +  }
 +}
 +
 +static void do_groups(t_fileio *fio, gmx_groups_t *groups,
 +                    gmx_bool bRead,t_symtab *symtab,
 +                    int file_version)
 +{ /* Transfers a gmx_groups_t: the egcNR group descriptors (do_grps),
 +   * the group names, and for each of the egcNR group types the count
 +   * ngrpnr[g] followed by that many bytes of grpnr[g]. When reading,
 +   * grpname and each non-empty grpnr[g] are allocated here.
 +   */
 +  int  g,n,i; /* n and i are never used in this function */
 +  gmx_bool bDum=TRUE;
 +
 +  do_grps(fio, egcNR,groups->grps,bRead,file_version);
 +  gmx_fio_do_int(fio,groups->ngrpname);
 +  if (bRead) {
 +    snew(groups->grpname,groups->ngrpname);
 +  }
 +  do_strstr(fio, groups->ngrpname,groups->grpname,bRead,symtab);
 +  for(g=0; g<egcNR; g++) {
 +    gmx_fio_do_int(fio,groups->ngrpnr[g]);
 +    if (groups->ngrpnr[g] == 0) { /* a zero count means no array follows in the file */
 +      if (bRead) {
 +      groups->grpnr[g] = NULL;
 +      }
 +    } else {
 +      if (bRead) {
 +      snew(groups->grpnr[g],groups->ngrpnr[g]);
 +      }
 +      bDum=gmx_fio_ndo_uchar(fio, groups->grpnr[g],groups->ngrpnr[g]);
 +    }
 +  }
 +}
 +
 +static void do_atomtypes(t_fileio *fio, t_atomtypes *atomtypes,gmx_bool bRead,
 +                       t_symtab *symtab,int file_version)
 +{ /* Transfers the per-atom-type arrays for file_version > 25: the count
 +   * nr, then radius, vol and surftens, then atomnumber
 +   * (file_version >= 40), then gb_radius and S_hct (file_version >= 60).
 +   * When reading, all six arrays are allocated with nr elements whether
 +   * or not the file contains them. For older files the structure is set
 +   * to empty. symtab is accepted but not used in this body.
 +   */
 +  int i,j; /* i is never used; j caches atomtypes->nr */
 +  gmx_bool bDum = TRUE;
 +  
 +  if (file_version > 25) {
 +    gmx_fio_do_int(fio,atomtypes->nr);
 +    j=atomtypes->nr;
 +    if (bRead) {
 +      snew(atomtypes->radius,j);
 +      snew(atomtypes->vol,j);
 +      snew(atomtypes->surftens,j);
 +      snew(atomtypes->atomnumber,j);
 +      snew(atomtypes->gb_radius,j);
 +      snew(atomtypes->S_hct,j);
 +    }
 +    bDum=gmx_fio_ndo_real(fio,atomtypes->radius,j);
 +    bDum=gmx_fio_ndo_real(fio,atomtypes->vol,j);
 +    bDum=gmx_fio_ndo_real(fio,atomtypes->surftens,j);
 +    if(file_version >= 40)
 +    {
 +        bDum=gmx_fio_ndo_int(fio,atomtypes->atomnumber,j);
 +    }
 +      if(file_version >= 60)
 +      {
 +              bDum=gmx_fio_ndo_real(fio,atomtypes->gb_radius,j);
 +              bDum=gmx_fio_ndo_real(fio,atomtypes->S_hct,j);
 +      }
 +  } else {
 +    /* File versions prior to 26 cannot do GBSA, 
 +     * so they dont use this structure 
 +     */
 +    atomtypes->nr = 0;
 +    atomtypes->radius = NULL;
 +    atomtypes->vol = NULL;
 +    atomtypes->surftens = NULL;
 +    atomtypes->atomnumber = NULL;
 +    atomtypes->gb_radius = NULL;
 +    atomtypes->S_hct = NULL;
 +  }  
 +}
 +
 +static void do_symtab(t_fileio *fio, t_symtab *symtab,gmx_bool bRead)
 +{
 +  int i,nr;
 +  t_symbuf *symbuf;
 +  char buf[STRLEN];
 +  
 +  gmx_fio_do_int(fio,symtab->nr);
 +  nr     = symtab->nr;
 +  if (bRead) {
 +    snew(symtab->symbuf,1);
 +    symbuf = symtab->symbuf;
 +    symbuf->bufsize = nr;
 +    snew(symbuf->buf,nr);
 +    for (i=0; (i<nr); i++) {
 +      gmx_fio_do_string(fio,buf);
 +      symbuf->buf[i]=strdup(buf);
 +    }
 +  }
 +  else {
 +    symbuf = symtab->symbuf;
 +    while (symbuf!=NULL) {
 +      for (i=0; (i<symbuf->bufsize) && (i<nr); i++) 
 +      gmx_fio_do_string(fio,symbuf->buf[i]);
 +      nr-=i;
 +      symbuf=symbuf->next;
 +    }
 +    if (nr != 0)
 +      gmx_fatal(FARGS,"nr of symtab strings left: %d",nr);
 +  }
 +}
 +
 +static void do_cmap(t_fileio *fio, gmx_cmap_t *cmap_grid, gmx_bool bRead)
 +{ /* Transfers the CMAP grids: ngrid and grid_spacing, then for every
 +   * grid grid_spacing*grid_spacing groups of four reals. When reading,
 +   * cmapdata and each cmapdata[i].cmap (4*nelem reals) are allocated
 +   * here.
 +   */
 +      int i,j,ngrid,gs,nelem;
 +      
 +      gmx_fio_do_int(fio,cmap_grid->ngrid);
 +      gmx_fio_do_int(fio,cmap_grid->grid_spacing);
 +      
 +      ngrid = cmap_grid->ngrid;
 +      gs    = cmap_grid->grid_spacing;
 +      nelem = gs * gs; /* number of four-real groups per grid */
 +      
 +      if(bRead)
 +      {
 +              snew(cmap_grid->cmapdata,ngrid);
 +              
 +              for(i=0;i<cmap_grid->ngrid;i++)
 +              {
 +                      snew(cmap_grid->cmapdata[i].cmap,4*nelem);
 +              }
 +      }
 +      
 +      for(i=0;i<cmap_grid->ngrid;i++)
 +      {
 +              for(j=0;j<nelem;j++)
 +              {
 +                      gmx_fio_do_real(fio,cmap_grid->cmapdata[i].cmap[j*4]);
 +                      gmx_fio_do_real(fio,cmap_grid->cmapdata[i].cmap[j*4+1]);
 +                      gmx_fio_do_real(fio,cmap_grid->cmapdata[i].cmap[j*4+2]);
 +                      gmx_fio_do_real(fio,cmap_grid->cmapdata[i].cmap[j*4+3]);
 +              }
 +      }       
 +}
 +
 +
 +void tpx_make_chain_identifiers(t_atoms *atoms,t_block *mols)
 +{
 +    int m,a,a0,a1,r;
 +    char c,chainid;
 +    int  chainnum;
 +    
 +    /* We always assign a new chain number, but save the chain id characters 
 +     * for larger molecules.
 +     */
 +#define CHAIN_MIN_ATOMS 15
 +    
 +    chainnum=0;
 +    chainid='A';
 +    for(m=0; m<mols->nr; m++) 
 +    {
 +        a0=mols->index[m];
 +        a1=mols->index[m+1];
 +        if ((a1-a0 >= CHAIN_MIN_ATOMS) && (chainid <= 'Z')) 
 +        {
 +            c=chainid;
 +            chainid++;
 +        } 
 +        else
 +        {
 +            c=' ';
 +        }
 +        for(a=a0; a<a1; a++) 
 +        {
 +            atoms->resinfo[atoms->atom[a].resind].chainnum = chainnum;
 +            atoms->resinfo[atoms->atom[a].resind].chainid  = c;
 +        }
 +        chainnum++;
 +    }
 +    
 +    /* Blank out the chain id if there was only one chain */
 +    if (chainid == 'B') 
 +    {
 +        for(r=0; r<atoms->nres; r++) 
 +        {
 +            atoms->resinfo[r].chainid = ' ';
 +        }
 +    }
 +}
 +  
 +static void do_moltype(t_fileio *fio, gmx_moltype_t *molt,gmx_bool bRead,
 +                       t_symtab *symtab, int file_version,
 +                     gmx_groups_t *groups)
 +{
 +  int i;
 +
 +  if (file_version >= 57) {
 +    do_symstr(fio, &(molt->name),bRead,symtab);
 +  }
 +
 +  do_atoms(fio, &molt->atoms, bRead, symtab, file_version, groups);
 +
 +  if (bRead && gmx_debug_at) {
 +    pr_atoms(debug,0,"atoms",&molt->atoms,TRUE);
 +  }
 +  
 +  if (file_version >= 57) {
 +    do_ilists(fio, molt->ilist,bRead,file_version);
 +
 +    do_block(fio, &molt->cgs,bRead,file_version);
 +    if (bRead && gmx_debug_at) {
 +      pr_block(debug,0,"cgs",&molt->cgs,TRUE);
 +    }
 +  }
 +
 +  /* This used to be in the atoms struct */
 +  do_blocka(fio, &molt->excls, bRead, file_version);
 +}
 +
 +static void do_molblock(t_fileio *fio, gmx_molblock_t *molb,gmx_bool bRead,
 +                        int file_version)
 +{
 +  int i;
 +
 +  gmx_fio_do_int(fio,molb->type);
 +  gmx_fio_do_int(fio,molb->nmol);
 +  gmx_fio_do_int(fio,molb->natoms_mol);
 +  /* Position restraint coordinates */
 +  gmx_fio_do_int(fio,molb->nposres_xA);
 +  if (molb->nposres_xA > 0) {
 +    if (bRead) {
 +      snew(molb->posres_xA,molb->nposres_xA);
 +    }
 +    gmx_fio_ndo_rvec(fio,molb->posres_xA,molb->nposres_xA);
 +  }
 +  gmx_fio_do_int(fio,molb->nposres_xB);
 +  if (molb->nposres_xB > 0) {
 +    if (bRead) {
 +      snew(molb->posres_xB,molb->nposres_xB);
 +    }
 +    gmx_fio_ndo_rvec(fio,molb->posres_xB,molb->nposres_xB);
 +  }
 +
 +}
 +
 +static t_block mtop_mols(gmx_mtop_t *mtop)
 +{
 +  int mb,m,a,mol;
 +  t_block mols;
 +
 +  mols.nr = 0;
 +  for(mb=0; mb<mtop->nmolblock; mb++) {
 +    mols.nr += mtop->molblock[mb].nmol;
 +  }
 +  mols.nalloc_index = mols.nr + 1;
 +  snew(mols.index,mols.nalloc_index);
 +
 +  a = 0;
 +  m = 0;
 +  mols.index[m] = a;
 +  for(mb=0; mb<mtop->nmolblock; mb++) {
 +    for(mol=0; mol<mtop->molblock[mb].nmol; mol++) {
 +      a += mtop->molblock[mb].natoms_mol;
 +      m++;
 +      mols.index[m] = a;
 +    }
 +  }
 +  
 +  return mols;
 +}
 +
 +static void add_posres_molblock(gmx_mtop_t *mtop)
 +{
 +    t_ilist *il,*ilfb;
 +  int am,i,mol,a;
 +  gmx_bool bFE;
 +  gmx_molblock_t *molb;
 +  t_iparams *ip;
 +
 +  /* posres reference positions are stored in ip->posres (if present) and
 +     in ip->fbposres (if present). If normal and flat-bottomed posres are present,
 +     posres.pos0A are identical to fbposres.pos0. */
 +  il = &mtop->moltype[0].ilist[F_POSRES];
 +  ilfb = &mtop->moltype[0].ilist[F_FBPOSRES];
 +  if (il->nr == 0 && ilfb->nr == 0) {
 +    return;
 +  }
 +  am = 0;
 +  bFE = FALSE;
 +  for(i=0; i<il->nr; i+=2) {
 +    ip = &mtop->ffparams.iparams[il->iatoms[i]];
 +    am = max(am,il->iatoms[i+1]);
 +    if (ip->posres.pos0B[XX] != ip->posres.pos0A[XX] ||
 +      ip->posres.pos0B[YY] != ip->posres.pos0A[YY] ||
 +      ip->posres.pos0B[ZZ] != ip->posres.pos0A[ZZ]) {
 +      bFE = TRUE;
 +    }
 +  }
 +  /* This loop is required if we have only flat-bottomed posres:
 +     - set am
 +     - bFE == FALSE (no B-state for flat-bottomed posres) */
 +  if (il->nr == 0)
 +  {
 +      for(i=0; i<ilfb->nr; i+=2) {
 +          ip = &mtop->ffparams.iparams[ilfb->iatoms[i]];
 +          am = max(am,ilfb->iatoms[i+1]);
 +      }
 +  }
 +  /* Make the posres coordinate block end at a molecule end */
 +  mol = 0;
 +  while(am >= mtop->mols.index[mol+1]) {
 +    mol++;
 +  }
 +  molb = &mtop->molblock[0];
 +  molb->nposres_xA = mtop->mols.index[mol+1];
 +  snew(molb->posres_xA,molb->nposres_xA);
 +  if (bFE) {
 +    molb->nposres_xB = molb->nposres_xA;
 +    snew(molb->posres_xB,molb->nposres_xB);
 +  } else {
 +    molb->nposres_xB = 0;
 +  }
 +  for(i=0; i<il->nr; i+=2) {
 +    ip = &mtop->ffparams.iparams[il->iatoms[i]];
 +    a  = il->iatoms[i+1];
 +    molb->posres_xA[a][XX] = ip->posres.pos0A[XX];
 +    molb->posres_xA[a][YY] = ip->posres.pos0A[YY];
 +    molb->posres_xA[a][ZZ] = ip->posres.pos0A[ZZ];
 +    if (bFE) {
 +      molb->posres_xB[a][XX] = ip->posres.pos0B[XX];
 +      molb->posres_xB[a][YY] = ip->posres.pos0B[YY];
 +      molb->posres_xB[a][ZZ] = ip->posres.pos0B[ZZ];
 +    }
 +  }
 +  if (il->nr == 0)
 +  {
 +      /* If only flat-bottomed posres are present, take reference pos from them.
 +         Here: bFE == FALSE      */
 +      for(i=0; i<ilfb->nr; i+=2)
 +      {
 +          ip = &mtop->ffparams.iparams[ilfb->iatoms[i]];
 +          a  = ilfb->iatoms[i+1];
 +          molb->posres_xA[a][XX] = ip->fbposres.pos0[XX];
 +          molb->posres_xA[a][YY] = ip->fbposres.pos0[YY];
 +          molb->posres_xA[a][ZZ] = ip->fbposres.pos0[ZZ];
 +      }
 +  }
 +}
 +
 +static void set_disres_npair(gmx_mtop_t *mtop)
 +{
 +  int mt,i,npair;
 +  t_iparams *ip;
 +  t_ilist *il;
 +  t_iatom *a;
 +
 +  ip = mtop->ffparams.iparams;
 +
 +  for(mt=0; mt<mtop->nmoltype; mt++) {
 +    il = &mtop->moltype[mt].ilist[F_DISRES];
 +    if (il->nr > 0) {
 +      a = il->iatoms;
 +      npair = 0;
 +      for(i=0; i<il->nr; i+=3) {
 +      npair++;
 +      if (i+3 == il->nr || ip[a[i]].disres.label != ip[a[i+3]].disres.label) {
 +        ip[a[i]].disres.npair = npair;
 +        npair = 0;
 +      }
 +      }
 +    }
 +  }
 +}
 +
 +static void do_mtop(t_fileio *fio, gmx_mtop_t *mtop,gmx_bool bRead, 
 +                    int file_version)
 +{
 +  int  mt,mb,i;
 +  t_blocka dumb;
 +
 +  if (bRead)
 +    init_mtop(mtop);
 +  do_symtab(fio, &(mtop->symtab),bRead);
 +  if (bRead && debug) 
 +    pr_symtab(debug,0,"symtab",&mtop->symtab);
 +  
 +  do_symstr(fio, &(mtop->name),bRead,&(mtop->symtab));
 +  
 +  if (file_version >= 57) {
 +    do_ffparams(fio, &mtop->ffparams,bRead,file_version);
 +
 +    gmx_fio_do_int(fio,mtop->nmoltype);
 +  } else {
 +    mtop->nmoltype = 1;
 +  }
 +  if (bRead) {
 +    snew(mtop->moltype,mtop->nmoltype);
 +    if (file_version < 57) {
 +      mtop->moltype[0].name = mtop->name;
 +    }
 +  }
 +  for(mt=0; mt<mtop->nmoltype; mt++) {
 +    do_moltype(fio, &mtop->moltype[mt],bRead,&mtop->symtab,file_version,
 +             &mtop->groups);
 +  }
 +
 +  if (file_version >= 57) {
 +    gmx_fio_do_int(fio,mtop->nmolblock);
 +  } else {
 +    mtop->nmolblock = 1;
 +  }
 +  if (bRead) {
 +    snew(mtop->molblock,mtop->nmolblock);
 +  }
 +  if (file_version >= 57) {
 +    for(mb=0; mb<mtop->nmolblock; mb++) {
 +      do_molblock(fio, &mtop->molblock[mb],bRead,file_version);
 +    }
 +    gmx_fio_do_int(fio,mtop->natoms);
 +  } else {
 +    mtop->molblock[0].type = 0;
 +    mtop->molblock[0].nmol = 1;
 +    mtop->molblock[0].natoms_mol = mtop->moltype[0].atoms.nr;
 +    mtop->molblock[0].nposres_xA = 0;
 +    mtop->molblock[0].nposres_xB = 0;
 +  }
 +
 +  do_atomtypes (fio, &(mtop->atomtypes),bRead,&(mtop->symtab), file_version);
 +  if (bRead && debug) 
 +    pr_atomtypes(debug,0,"atomtypes",&mtop->atomtypes,TRUE);
 +
 +  if (file_version < 57) {
 +    /* Debug statements are inside do_idef */    
 +    do_idef (fio, &mtop->ffparams,&mtop->moltype[0],bRead,file_version);
 +    mtop->natoms = mtop->moltype[0].atoms.nr;
 +  }
 +      
 +  if(file_version >= 65)
 +  {
 +      do_cmap(fio, &mtop->ffparams.cmap_grid,bRead);
 +  }
 +  else
 +  {
 +      mtop->ffparams.cmap_grid.ngrid        = 0;
 +      mtop->ffparams.cmap_grid.grid_spacing = 0;
 +      mtop->ffparams.cmap_grid.cmapdata     = NULL;
 +  }
 +        
 +  if (file_version >= 57) {
 +    do_groups(fio, &mtop->groups,bRead,&(mtop->symtab),file_version);
 +  }
 +
 +  if (file_version < 57) {
 +    do_block(fio, &mtop->moltype[0].cgs,bRead,file_version);
 +    if (bRead && gmx_debug_at) {
 +      pr_block(debug,0,"cgs",&mtop->moltype[0].cgs,TRUE);
 +    }
 +    do_block(fio, &mtop->mols,bRead,file_version);
 +    /* Add the posres coordinates to the molblock */
 +    add_posres_molblock(mtop);
 +  }
 +  if (bRead) {
 +    if (file_version >= 57) {
 +      mtop->mols = mtop_mols(mtop);
 +    }
 +    if (gmx_debug_at) { 
 +      pr_block(debug,0,"mols",&mtop->mols,TRUE);
 +    }
 +  }
 +
 +  if (file_version < 51) {
 +    /* Here used to be the shake blocks */
 +    do_blocka(fio, &dumb,bRead,file_version);
 +    if (dumb.nr > 0)
 +      sfree(dumb.index);
 +    if (dumb.nra > 0)
 +      sfree(dumb.a);
 +  }
 +
 +  if (bRead) {
 +    close_symtab(&(mtop->symtab));
 +  }
 +}
 +
 +/* If TopOnlyOK is TRUE then we can read even future versions
 + * of tpx files, provided the file_generation hasn't changed.
 + * If it is FALSE, we need the inputrecord too, and bail out
 + * if the file is newer than the program.
 + * 
 + * The version and generation if the topology (see top of this file)
 + * are returned in the two last arguments.
 + * 
 + * If possible, we will read the inputrec even when TopOnlyOK is TRUE.
 + */
 +static void do_tpxheader(t_fileio *fio,gmx_bool bRead,t_tpxheader *tpx, 
 +                         gmx_bool TopOnlyOK, int *file_version, 
 +                         int *file_generation)
 +{
 +    char  buf[STRLEN];
 +    char  file_tag[STRLEN];
 +  gmx_bool  bDouble;
 +  int   precision;
 +  int   fver,fgen;
 +  int   idum=0;
 +  real  rdum=0;
 +
 +  gmx_fio_checktype(fio);
 +  gmx_fio_setdebug(fio,bDebugMode());
 +  
 +  /* NEW! XDR tpb file */
 +  precision = sizeof(real);
 +  if (bRead) {
 +    gmx_fio_do_string(fio,buf);
 +    if (strncmp(buf,"VERSION",7))
 +      gmx_fatal(FARGS,"Can not read file %s,\n"
 +                "             this file is from a Gromacs version which is older than 2.0\n"
 +                "             Make a new one with grompp or use a gro or pdb file, if possible",
 +                gmx_fio_getname(fio));
 +    gmx_fio_do_int(fio,precision);
 +    bDouble = (precision == sizeof(double));
 +    if ((precision != sizeof(float)) && !bDouble)
 +      gmx_fatal(FARGS,"Unknown precision in file %s: real is %d bytes "
 +                "instead of %d or %d",
 +                gmx_fio_getname(fio),precision,sizeof(float),sizeof(double));
 +    gmx_fio_setprecision(fio,bDouble);
 +    fprintf(stderr,"Reading file %s, %s (%s precision)\n",
 +          gmx_fio_getname(fio),buf,bDouble ? "double" : "single");
 +  }
 +  else {
 +    gmx_fio_write_string(fio,GromacsVersion());
 +    bDouble = (precision == sizeof(double));
 +    gmx_fio_setprecision(fio,bDouble);
 +    gmx_fio_do_int(fio,precision);
 +    fver = tpx_version;
 +    sprintf(file_tag,"%s",tpx_tag);
 +    fgen = tpx_generation;
 +  }
 +  
 +    /* Check versions! */
 +    gmx_fio_do_int(fio,fver);
 +
 +    /* This is for backward compatibility with development versions 77-79
 +     * where the tag was, mistakenly, placed before the generation,
 +     * which would cause a segv instead of a proper error message
 +     * when reading the topology only from tpx with <77 code.
 +     */
 +    if (fver >= 77 && fver <= 79)
 +    {
 +        gmx_fio_do_string(fio,file_tag);
 +    }
 +  
 +    if (fver >= 26)
 +    {
 +        gmx_fio_do_int(fio,fgen);
 +    }
 +    else
 +    {
 +        fgen = 0;
 +    }
 + 
 +    if (fver >= 81)
 +    {
 +        gmx_fio_do_string(fio,file_tag);
 +    }
 +    if (bRead)
 +    {
 +        if (fver < 77)
 +        {
 +            /* Versions before 77 don't have the tag, set it to release */
 +            sprintf(file_tag,"%s",TPX_TAG_RELEASE);
 +        }
 +
 +        if (strcmp(file_tag,tpx_tag) != 0)
 +        {
 +            fprintf(stderr,"Note: file tpx tag '%s', software tpx tag '%s'\n",
 +                    file_tag,tpx_tag);
 +
 +            /* We only support reading tpx files with the same tag as the code
 +             * or tpx files with the release tag and with lower version number.
 +             */
 +            if (!strcmp(file_tag,TPX_TAG_RELEASE) == 0 && fver < tpx_version) 
 +            {
 +                gmx_fatal(FARGS,"tpx tag/version mismatch: reading tpx file (%s) version %d, tag '%s' with program for tpx version %d, tag '%s'",
 +                          gmx_fio_getname(fio),fver,file_tag,
 +                          tpx_version,tpx_tag);
 +            }
 +        }
 +    }
 +
 +    if (file_version != NULL)
 +    {
 +        *file_version = fver;
 +    }
 +    if (file_generation != NULL)
 +    {
 +        *file_generation = fgen;
 +    }
 +   
 +  
 +  if ((fver <= tpx_incompatible_version) ||
 +      ((fver > tpx_version) && !TopOnlyOK) ||
 +      (fgen > tpx_generation))
 +    gmx_fatal(FARGS,"reading tpx file (%s) version %d with version %d program",
 +              gmx_fio_getname(fio),fver,tpx_version);
 +  
 +  do_section(fio,eitemHEADER,bRead);
 +  gmx_fio_do_int(fio,tpx->natoms);
 +  if (fver >= 28)
 +    gmx_fio_do_int(fio,tpx->ngtc);
 +  else
 +    tpx->ngtc = 0;
 +  if (fver < 62) {
 +      gmx_fio_do_int(fio,idum);
 +      gmx_fio_do_real(fio,rdum);
 +  }
 +  /*a better decision will eventually (5.0 or later) need to be made
 +    on how to treat the alchemical state of the system, which can now
 +    vary through a simulation, and cannot be completely described
 +    though a single lambda variable, or even a single state
 +    index. Eventually, should probably be a vector. MRS*/
 +  if (fver >= 79) 
 +  {
 +      gmx_fio_do_int(fio,tpx->fep_state);
 +  }
 +  gmx_fio_do_real(fio,tpx->lambda);
 +  gmx_fio_do_int(fio,tpx->bIr);
 +  gmx_fio_do_int(fio,tpx->bTop);
 +  gmx_fio_do_int(fio,tpx->bX);
 +  gmx_fio_do_int(fio,tpx->bV);
 +  gmx_fio_do_int(fio,tpx->bF);
 +  gmx_fio_do_int(fio,tpx->bBox);
 +
 +  if((fgen > tpx_generation)) {
 +    /* This can only happen if TopOnlyOK=TRUE */
 +    tpx->bIr=FALSE;
 +  }
 +}
 +
 +static int do_tpx(t_fileio *fio, gmx_bool bRead,
 +                t_inputrec *ir,t_state *state,rvec *f,gmx_mtop_t *mtop,
 +                gmx_bool bXVallocated)
 +{
 +  t_tpxheader tpx;
 +  t_inputrec  dum_ir;
 +  gmx_mtop_t  dum_top;
 +  gmx_bool        TopOnlyOK,bDum=TRUE;
 +  int         file_version,file_generation;
 +  int         i;
 +  rvec        *xptr,*vptr;
 +  int         ePBC;
 +  gmx_bool        bPeriodicMols;
 +
 +  if (!bRead) {
 +    tpx.natoms = state->natoms;
 +    tpx.ngtc   = state->ngtc;  /* need to add nnhpres here? */
 +    tpx.fep_state = state->fep_state;
 +    tpx.lambda = state->lambda[efptFEP];
 +    tpx.bIr  = (ir       != NULL);
 +    tpx.bTop = (mtop     != NULL);
 +    tpx.bX   = (state->x != NULL);
 +    tpx.bV   = (state->v != NULL);
 +    tpx.bF   = (f        != NULL);
 +    tpx.bBox = TRUE;
 +  }
 +  
 +  TopOnlyOK = (ir==NULL);
 +  
 +  do_tpxheader(fio,bRead,&tpx,TopOnlyOK,&file_version,&file_generation);
 +
 +  if (bRead) {
 +    state->flags  = 0;
 +    /* state->lambda = tpx.lambda;*/ /*remove this eventually? */
 +    /* The init_state calls initialize the Nose-Hoover xi integrals to zero */
 +    if (bXVallocated) {
 +      xptr = state->x;
 +      vptr = state->v;
 +      init_state(state,0,tpx.ngtc,0,0,0);  /* nose-hoover chains */ /* eventually, need to add nnhpres here? */
 +      state->natoms = tpx.natoms;
 +      state->nalloc = tpx.natoms;
 +      state->x = xptr;
 +      state->v = vptr;
 +    } else {
 +        init_state(state,tpx.natoms,tpx.ngtc,0,0,0);  /* nose-hoover chains */
 +    }
 +  }
 +
 +#define do_test(fio,b,p) if (bRead && (p!=NULL) && !b) gmx_fatal(FARGS,"No %s in %s",#p,gmx_fio_getname(fio)) 
 +
 +  do_test(fio,tpx.bBox,state->box);
 +  do_section(fio,eitemBOX,bRead);
 +  if (tpx.bBox) {
 +    gmx_fio_ndo_rvec(fio,state->box,DIM);
 +    if (file_version >= 51) {
 +      gmx_fio_ndo_rvec(fio,state->box_rel,DIM);
 +    } else {
 +      /* We initialize box_rel after reading the inputrec */
 +      clear_mat(state->box_rel);
 +    }
 +    if (file_version >= 28) {
 +      gmx_fio_ndo_rvec(fio,state->boxv,DIM);
 +      if (file_version < 56) {
 +      matrix mdum;
 +      gmx_fio_ndo_rvec(fio,mdum,DIM);
 +      }
 +    }
 +  }
 +  
 +  if (state->ngtc > 0 && file_version >= 28) {
 +    real *dumv;
 +    /*ndo_double(state->nosehoover_xi,state->ngtc,bDum);*/
 +    /*ndo_double(state->nosehoover_vxi,state->ngtc,bDum);*/
 +    /*ndo_double(state->therm_integral,state->ngtc,bDum);*/
 +    snew(dumv,state->ngtc);
 +    if (file_version < 69) {
 +      bDum=gmx_fio_ndo_real(fio,dumv,state->ngtc);
 +    }
 +    /* These used to be the Berendsen tcoupl_lambda's */
 +    bDum=gmx_fio_ndo_real(fio,dumv,state->ngtc);
 +    sfree(dumv);
 +  }
 +
 +  /* Prior to tpx version 26, the inputrec was here.
 +   * I moved it to enable partial forward-compatibility
 +   * for analysis/viewer programs.
 +   */
 +  if(file_version<26) {
 +    do_test(fio,tpx.bIr,ir);
 +    do_section(fio,eitemIR,bRead);
 +    if (tpx.bIr) {
 +      if (ir) {
 +      do_inputrec(fio, ir,bRead,file_version,
 +                    mtop ? &mtop->ffparams.fudgeQQ : NULL);
 +      if (bRead && debug) 
 +        pr_inputrec(debug,0,"inputrec",ir,FALSE);
 +      }
 +      else {
 +      do_inputrec(fio, &dum_ir,bRead,file_version,
 +                    mtop ? &mtop->ffparams.fudgeQQ :NULL);
 +      if (bRead && debug) 
 +        pr_inputrec(debug,0,"inputrec",&dum_ir,FALSE);
 +      done_inputrec(&dum_ir);
 +      }
 +      
 +    }
 +  }
 +  
 +  do_test(fio,tpx.bTop,mtop);
 +  do_section(fio,eitemTOP,bRead);
 +  if (tpx.bTop) {
 +    int mtop_file_version = file_version;
 +    /*allow reading of Gromacs 4.6 files*/
 +    if (mtop_file_version>80 && mtop_file_version<90)
 +    {
 +        mtop_file_version = 79;
 +    }
 +    if (mtop) {
 +      do_mtop(fio,mtop,bRead, mtop_file_version);
 +    } else {
 +      do_mtop(fio,&dum_top,bRead,mtop_file_version);
 +      done_mtop(&dum_top,TRUE);
 +    }
 +  }
 +  do_test(fio,tpx.bX,state->x);  
 +  do_section(fio,eitemX,bRead);
 +  if (tpx.bX) {
 +    if (bRead) {
 +      state->flags |= (1<<estX);
 +    }
 +    gmx_fio_ndo_rvec(fio,state->x,state->natoms);
 +  }
 +  
 +  do_test(fio,tpx.bV,state->v);
 +  do_section(fio,eitemV,bRead);
 +  if (tpx.bV) {
 +    if (bRead) {
 +      state->flags |= (1<<estV);
 +    }
 +    gmx_fio_ndo_rvec(fio,state->v,state->natoms);
 +  }
 +
 +  do_test(fio,tpx.bF,f);
 +  do_section(fio,eitemF,bRead);
 +  if (tpx.bF) gmx_fio_ndo_rvec(fio,f,state->natoms);
 +
 +  /* Starting with tpx version 26, we have the inputrec
 +   * at the end of the file, so we can ignore it 
 +   * if the file is never than the software (but still the
 +   * same generation - see comments at the top of this file.
 +   *
 +   * 
 +   */
 +  ePBC = -1;
 +  bPeriodicMols = FALSE;
 +  if (file_version >= 26) {
 +    do_test(fio,tpx.bIr,ir);
 +    do_section(fio,eitemIR,bRead);
 +    if (tpx.bIr) {
 +      if (file_version >= 53) {
 +      /* Removed the pbc info from do_inputrec, since we always want it */
 +      if (!bRead) {
 +        ePBC          = ir->ePBC;
 +        bPeriodicMols = ir->bPeriodicMols;
 +      }
 +      gmx_fio_do_int(fio,ePBC);
 +      gmx_fio_do_gmx_bool(fio,bPeriodicMols);
 +      }
 +      if (file_generation <= tpx_generation && ir) {
 +      do_inputrec(fio, ir,bRead,file_version,mtop ? &mtop->ffparams.fudgeQQ : NULL);
 +      if (bRead && debug) 
 +        pr_inputrec(debug,0,"inputrec",ir,FALSE);
 +      if (file_version < 51)
 +        set_box_rel(ir,state);
 +      if (file_version < 53) {
 +        ePBC          = ir->ePBC;
 +        bPeriodicMols = ir->bPeriodicMols;
 +      }
 +      }
 +      if (bRead && ir && file_version >= 53) {
 +      /* We need to do this after do_inputrec, since that initializes ir */
 +      ir->ePBC          = ePBC;
 +      ir->bPeriodicMols = bPeriodicMols;
 +      }
 +    }
 +  }
 +
 +    if (bRead)
 +    {
 +        if (tpx.bIr && ir)
 +        {
 +            if (state->ngtc == 0)
 +            {
 +                /* Reading old version without tcoupl state data: set it */
 +                init_gtc_state(state,ir->opts.ngtc,0,ir->opts.nhchainlength);
 +            }
 +            if (tpx.bTop && mtop)
 +            {
 +                if (file_version < 57)
 +                {
 +                    if (mtop->moltype[0].ilist[F_DISRES].nr > 0)
 +                    {
 +                        ir->eDisre = edrSimple;
 +                    }
 +                    else
 +                    {
 +                        ir->eDisre = edrNone;
 +                    }
 +                }
 +                set_disres_npair(mtop);
 +            }
 +        }
 +
 +        if (tpx.bTop && mtop)
 +        {
 +            gmx_mtop_finalize(mtop);
 +        }
 +
 +        if (file_version >= 57)
 +        {
 +            char *env;
 +            int  ienv;
 +            env = getenv("GMX_NOCHARGEGROUPS");
 +            if (env != NULL)
 +            {
 +                sscanf(env,"%d",&ienv);
 +                fprintf(stderr,"\nFound env.var. GMX_NOCHARGEGROUPS = %d\n",
 +                        ienv);
 +                if (ienv > 0)
 +                {
 +                    fprintf(stderr,
 +                            "Will make single atomic charge groups in non-solvent%s\n",
 +                            ienv > 1 ? " and solvent" : "");
 +                    gmx_mtop_make_atomic_charge_groups(mtop,ienv==1);
 +                }
 +                fprintf(stderr,"\n");
 +            }
 +        }
 +    }
 +
 +    return ePBC;
 +}
 +
 +/************************************************************
 + *
 + *  The following routines are the exported ones
 + *
 + ************************************************************/
 +
 +t_fileio *open_tpx(const char *fn,const char *mode)
 +{
 +  return gmx_fio_open(fn,mode);
 +}    
 + 
 +void close_tpx(t_fileio *fio) /* Exported wrapper: hands the tpx file handle to gmx_fio_close(). */
 +{
 +  gmx_fio_close(fio);
 +}
 +
 +void read_tpxheader(const char *fn, t_tpxheader *tpx, gmx_bool TopOnlyOK,
 +                    int *file_version, int *file_generation)
 +{
 +  t_fileio *fio;
 +
 +  fio = open_tpx(fn,"r");
 +  do_tpxheader(fio,TRUE,tpx,TopOnlyOK,file_version,file_generation);
 +  close_tpx(fio);
 +}
 +
 +void write_tpx_state(const char *fn,
 +                   t_inputrec *ir,t_state *state,gmx_mtop_t *mtop)
 +{
 +  t_fileio *fio;
 +
 +  fio = open_tpx(fn,"w");
 +  do_tpx(fio,FALSE,ir,state,NULL,mtop,FALSE);
 +  close_tpx(fio);
 +}
 +
 +void read_tpx_state(const char *fn,
 +                  t_inputrec *ir,t_state *state,rvec *f,gmx_mtop_t *mtop)
 +{
 +  t_fileio *fio;
 +      
 +  fio = open_tpx(fn,"r");
 +  do_tpx(fio,TRUE,ir,state,f,mtop,FALSE);
 +  close_tpx(fio);
 +}
 +
 +int read_tpx(const char *fn,
 +           t_inputrec *ir, matrix box,int *natoms,
 +           rvec *x,rvec *v,rvec *f,gmx_mtop_t *mtop)
 +{
 +  t_fileio *fio;
 +  t_state state;
 +  int ePBC;
 +
 +  state.x = x;
 +  state.v = v;
 +  fio = open_tpx(fn,"r");
 +  ePBC = do_tpx(fio,TRUE,ir,&state,f,mtop,TRUE);
 +  close_tpx(fio);
 +  *natoms = state.natoms;
 +  if (box) 
 +    copy_mat(state.box,box);
 +  state.x = NULL;
 +  state.v = NULL;
 +  done_state(&state);
 +
 +  return ePBC;
 +}
 +
 +int read_tpx_top(const char *fn,
 +               t_inputrec *ir, matrix box,int *natoms,
 +               rvec *x,rvec *v,rvec *f,t_topology *top)
 +{
 +  gmx_mtop_t mtop;
 +  t_topology *ltop;
 +  int ePBC;
 +
 +  ePBC = read_tpx(fn,ir,box,natoms,x,v,f,&mtop);
 +  
 +  *top = gmx_mtop_t_to_t_topology(&mtop);
 +
 +  return ePBC;
 +}
 +
 +gmx_bool fn2bTPX(const char *file)
 +{
 +  switch (fn2ftp(file)) {
 +  case efTPR:
 +  case efTPB:
 +  case efTPA:
 +    return TRUE;
 +  default:
 +    return FALSE;
 +  }
 +}
 +
 +gmx_bool read_tps_conf(const char *infile,char *title,t_topology *top,int *ePBC,
 +                 rvec **x,rvec **v,matrix box,gmx_bool bMass)
 +{
 +  t_tpxheader  header;
 +  int          natoms,i,version,generation;
 +  gmx_bool         bTop,bXNULL=FALSE;
 +  gmx_mtop_t   *mtop;
 +  t_topology   *topconv;
 +  gmx_atomprop_t aps;
 +  
 +  bTop = fn2bTPX(infile);
 +  *ePBC = -1;
 +  if (bTop) {
 +    read_tpxheader(infile,&header,TRUE,&version,&generation);
 +    if (x)
 +      snew(*x,header.natoms);
 +    if (v)
 +      snew(*v,header.natoms);
 +    snew(mtop,1);
 +    *ePBC = read_tpx(infile,NULL,box,&natoms,
 +                   (x==NULL) ? NULL : *x,(v==NULL) ? NULL : *v,NULL,mtop);
 +    *top = gmx_mtop_t_to_t_topology(mtop);
 +    sfree(mtop);
 +    strcpy(title,*top->name);
 +    tpx_make_chain_identifiers(&top->atoms,&top->mols);
 +  }
 +  else {
 +    get_stx_coordnum(infile,&natoms);
 +    init_t_atoms(&top->atoms,natoms,(fn2ftp(infile) == efPDB));
 +    if (x == NULL)
 +    {
 +        snew(x,1);
 +        bXNULL = TRUE;
 +    }
 +    snew(*x,natoms);
 +    if (v)
 +      snew(*v,natoms);
 +    read_stx_conf(infile,title,&top->atoms,*x,(v==NULL) ? NULL : *v,ePBC,box);
 +    if (bXNULL)
 +    {
 +      sfree(*x);
 +      sfree(x);
 +    }
 +    if (bMass) {
 +      aps = gmx_atomprop_init();
 +      for(i=0; (i<natoms); i++)
 +      if (!gmx_atomprop_query(aps,epropMass,
 +                              *top->atoms.resinfo[top->atoms.atom[i].resind].name,
 +                              *top->atoms.atomname[i],
 +                              &(top->atoms.atom[i].m))) {
 +        if (debug) 
 +          fprintf(debug,"Can not find mass for atom %s %d %s, setting to 1\n",
 +                  *top->atoms.resinfo[top->atoms.atom[i].resind].name,
 +                  top->atoms.resinfo[top->atoms.atom[i].resind].nr,
 +                  *top->atoms.atomname[i]);
 +      }
 +      gmx_atomprop_destroy(aps);
 +    }
 +    top->idef.ntypes=-1;
 +  }
 +
 +  return bTop;
 +}
index 695695d3a49d7271f840aad323348496499c1cfd,0000000000000000000000000000000000000000..46d7c47f2fc508e7fd35ca1bf68f3ed94b242120
mode 100644,000000..100644
--- /dev/null
@@@ -1,3459 -1,0 +1,3458 @@@
-         warning_error(wi,"AdResS is currently disabled\n");
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <ctype.h>
 +#include <stdlib.h>
 +#include <limits.h>
 +#include "sysstuff.h"
 +#include "smalloc.h"
 +#include "typedefs.h"
 +#include "physics.h"
 +#include "names.h"
 +#include "gmx_fatal.h"
 +#include "macros.h"
 +#include "index.h"
 +#include "symtab.h"
 +#include "string2.h"
 +#include "readinp.h"
 +#include "warninp.h"
 +#include "readir.h" 
 +#include "toputil.h"
 +#include "index.h"
 +#include "network.h"
 +#include "vec.h"
 +#include "pbc.h"
 +#include "mtop_util.h"
 +#include "chargegroup.h"
 +#include "inputrec.h"
 +
 +#define MAXPTR 254
 +#define NOGID  255
 +#define MAXLAMBDAS 1024
 +
 +/* Resource parameters 
 + * Do not change any of these until you read the instruction
 + * in readinp.h. Some cpp's do not take spaces after the backslash
 + * (like the c-shell), which will give you a very weird compiler
 + * message.
 + */
 +
 +static char tcgrps[STRLEN],tau_t[STRLEN],ref_t[STRLEN],
 +  acc[STRLEN],accgrps[STRLEN],freeze[STRLEN],frdim[STRLEN],
 +  energy[STRLEN],user1[STRLEN],user2[STRLEN],vcm[STRLEN],xtc_grps[STRLEN],
 +  couple_moltype[STRLEN],orirefitgrp[STRLEN],egptable[STRLEN],egpexcl[STRLEN],
 +  wall_atomtype[STRLEN],wall_density[STRLEN],deform[STRLEN],QMMM[STRLEN];
 +static char fep_lambda[efptNR][STRLEN];
 +static char lambda_weights[STRLEN];
 +static char **pull_grp;
 +static char **rot_grp;
 +static char anneal[STRLEN],anneal_npoints[STRLEN],
 +  anneal_time[STRLEN],anneal_temp[STRLEN];
 +static char QMmethod[STRLEN],QMbasis[STRLEN],QMcharge[STRLEN],QMmult[STRLEN],
 +  bSH[STRLEN],CASorbitals[STRLEN], CASelectrons[STRLEN],SAon[STRLEN],
 +  SAoff[STRLEN],SAsteps[STRLEN],bTS[STRLEN],bOPT[STRLEN]; 
 +static char efield_x[STRLEN],efield_xt[STRLEN],efield_y[STRLEN],
 +  efield_yt[STRLEN],efield_z[STRLEN],efield_zt[STRLEN];
 +
 +enum {
 +    egrptpALL,         /* All particles have to be a member of a group.     */
 +    egrptpALL_GENREST, /* A rest group with name is generated for particles *
 +                        * that are not part of any group.                   */
 +    egrptpPART,        /* As egrptpALL_GENREST, but no name is generated    *
 +                        * for the rest group.                               */
 +    egrptpONE          /* Merge all selected groups into one group,         *
 +                        * make a rest group for the remaining particles.    */
 +};
 +
 +
 +void init_ir(t_inputrec *ir, t_gromppopts *opts)
 +{
 +  snew(opts->include,STRLEN); 
 +  snew(opts->define,STRLEN);
 +  snew(ir->fepvals,1);
 +  snew(ir->expandedvals,1);
 +  snew(ir->simtempvals,1);
 +}
 +
 +static void GetSimTemps(int ntemps, t_simtemp *simtemp, double *temperature_lambdas)
 +{
 +
 +    int i;
 +
 +    for (i=0;i<ntemps;i++)
 +    {
 +        /* simple linear scaling -- allows more control */
 +        if (simtemp->eSimTempScale == esimtempLINEAR)
 +        {
 +            simtemp->temperatures[i] = simtemp->simtemp_low + (simtemp->simtemp_high-simtemp->simtemp_low)*temperature_lambdas[i];
 +        }
 +        else if (simtemp->eSimTempScale == esimtempGEOMETRIC)  /* should give roughly equal acceptance for constant heat capacity . . . */
 +        {
 +            simtemp->temperatures[i] = simtemp->simtemp_low * pow(simtemp->simtemp_high/simtemp->simtemp_low,(1.0*i)/(ntemps-1));
 +        }
 +        else if (simtemp->eSimTempScale == esimtempEXPONENTIAL)
 +        {
 +            simtemp->temperatures[i] = simtemp->simtemp_low + (simtemp->simtemp_high-simtemp->simtemp_low)*((exp(temperature_lambdas[i])-1)/(exp(1.0)-1));
 +        }
 +        else
 +        {
 +            char errorstr[128];
 +            sprintf(errorstr,"eSimTempScale=%d not defined",simtemp->eSimTempScale);
 +            gmx_fatal(FARGS,errorstr);
 +        }
 +    }
 +}
 +
 +
 +
 +static void _low_check(gmx_bool b,char *s,warninp_t wi)
 +{
 +    if (b)
 +    {
 +        warning_error(wi,s);
 +    }
 +}
 +
 +static void check_nst(const char *desc_nst,int nst,
 +                      const char *desc_p,int *p,
 +                      warninp_t wi)
 +{
 +    char buf[STRLEN];
 +
 +    if (*p > 0 && *p % nst != 0)
 +    {
 +        /* Round up to the next multiple of nst */
 +        *p = ((*p)/nst + 1)*nst;
 +        sprintf(buf,"%s should be a multiple of %s, changing %s to %d\n",
 +              desc_p,desc_nst,desc_p,*p);
 +        warning(wi,buf);
 +    }
 +}
 +
 +static gmx_bool ir_NVE(const t_inputrec *ir)
 +{
 +    return ((ir->eI == eiMD || EI_VV(ir->eI)) && ir->etc == etcNO);
 +}
 +
 +static int lcd(int n1,int n2)
 +{
 +    int d,i;
 +    
 +    d = 1;
 +    for(i=2; (i<=n1 && i<=n2); i++)
 +    {
 +        if (n1 % i == 0 && n2 % i == 0)
 +        {
 +            d = i;
 +        }
 +    }
 +    
 +  return d;
 +}
 +
 +static void process_interaction_modifier(const t_inputrec *ir,int *eintmod)
 +{
 +    if (*eintmod == eintmodPOTSHIFT_VERLET)
 +    {
 +        if (ir->cutoff_scheme == ecutsVERLET)
 +        {
 +            *eintmod = eintmodPOTSHIFT;
 +        }
 +        else
 +        {
 +            *eintmod = eintmodNONE;
 +        }
 +    }
 +}
 +
 +void check_ir(const char *mdparin,t_inputrec *ir, t_gromppopts *opts,
 +              warninp_t wi)
 +/* Check internal consistency */
 +{
 +    /* Strange macro: first one fills the err_buf, and then one can check 
 +     * the condition, which will print the message and increase the error
 +     * counter.
 +     */
 +#define CHECK(b) _low_check(b,err_buf,wi)
 +    char err_buf[256],warn_buf[STRLEN];
 +    int i,j;
 +    int  ns_type=0;
 +    real dt_coupl=0;
 +    real dt_pcoupl;
 +    int  nstcmin;
 +    t_lambda *fep = ir->fepvals;
 +    t_expanded *expand = ir->expandedvals;
 +
 +  set_warning_line(wi,mdparin,-1);
 +
 +    /* BASIC CUT-OFF STUFF */
 +    if (ir->rcoulomb < 0)
 +    {
 +        warning_error(wi,"rcoulomb should be >= 0");
 +    }
 +    if (ir->rvdw < 0)
 +    {
 +        warning_error(wi,"rvdw should be >= 0");
 +    }
 +    if (ir->rlist < 0 &&
 +        !(ir->cutoff_scheme == ecutsVERLET && ir->verletbuf_drift > 0))
 +    {
 +        warning_error(wi,"rlist should be >= 0");
 +    }
 +
 +    process_interaction_modifier(ir,&ir->coulomb_modifier);
 +    process_interaction_modifier(ir,&ir->vdw_modifier);
 +
 +    if (ir->cutoff_scheme == ecutsGROUP)
 +    {
 +        /* BASIC CUT-OFF STUFF */
 +        if (ir->rlist == 0 ||
 +            !((EEL_MIGHT_BE_ZERO_AT_CUTOFF(ir->coulombtype) && ir->rcoulomb > ir->rlist) ||
 +              (EVDW_MIGHT_BE_ZERO_AT_CUTOFF(ir->vdwtype)    && ir->rvdw     > ir->rlist))) {
 +            /* No switched potential and/or no twin-range:
 +             * we can set the long-range cut-off to the maximum of the other cut-offs.
 +             */
 +            ir->rlistlong = max_cutoff(ir->rlist,max_cutoff(ir->rvdw,ir->rcoulomb));
 +        }
 +        else if (ir->rlistlong < 0)
 +        {
 +            ir->rlistlong = max_cutoff(ir->rlist,max_cutoff(ir->rvdw,ir->rcoulomb));
 +            sprintf(warn_buf,"rlistlong was not set, setting it to %g (no buffer)",
 +                    ir->rlistlong);
 +            warning(wi,warn_buf);
 +        }
 +        if (ir->rlistlong == 0 && ir->ePBC != epbcNONE)
 +        {
 +            warning_error(wi,"Can not have an infinite cut-off with PBC");
 +        }
 +        if (ir->rlistlong > 0 && (ir->rlist == 0 || ir->rlistlong < ir->rlist))
 +        {
 +            warning_error(wi,"rlistlong can not be shorter than rlist");
 +        }
 +        if (IR_TWINRANGE(*ir) && ir->nstlist <= 0)
 +        {
 +            warning_error(wi,"Can not have nstlist<=0 with twin-range interactions");
 +        }
 +    }
 +    
 +    if(ir->rlistlong == ir->rlist)
 +    {
 +        ir->nstcalclr = 0;
 +    }
 +    else if(ir->rlistlong>ir->rlist && ir->nstcalclr==0)
 +    {
 +        warning_error(wi,"With different cutoffs for electrostatics and VdW, nstcalclr must be -1 or a positive number");
 +    }
 +    
 +    if (ir->cutoff_scheme == ecutsVERLET)
 +    {
 +        real rc_max;
 +
 +        /* Normal Verlet type neighbor-list, currently only limited feature support */
 +        if (inputrec2nboundeddim(ir) < 3)
 +        {
 +            warning_error(wi,"With Verlet lists only full pbc or pbc=xy with walls is supported");
 +        }
 +        if (ir->rcoulomb != ir->rvdw)
 +        {
 +            warning_error(wi,"With Verlet lists rcoulomb!=rvdw is not supported");
 +        }
 +        if (ir->vdwtype != evdwCUT)
 +        {
 +            warning_error(wi,"With Verlet lists only cut-off LJ interactions are supported");
 +        }
 +        if (!(ir->coulombtype == eelCUT ||
 +              (EEL_RF(ir->coulombtype) && ir->coulombtype != eelRF_NEC) ||
 +              EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD))
 +        {
 +            warning_error(wi,"With Verlet lists only cut-off, reaction-field, PME and Ewald electrostatics are supported");
 +        }
 +
 +        if (ir->nstlist <= 0)
 +        {
 +             warning_error(wi,"With Verlet lists nstlist should be larger than 0");
 +        }
 +
 +        if (ir->nstlist < 10)
 +        {
 +            warning_note(wi,"With Verlet lists the optimal nstlist is >= 10, with GPUs >= 20. Note that with the Verlet scheme, nstlist has no effect on the accuracy of your simulation.");
 +        }
 +
 +        rc_max = max(ir->rvdw,ir->rcoulomb);
 +
 +        if (ir->verletbuf_drift <= 0)
 +        {
 +            if (ir->verletbuf_drift == 0)
 +            {
 +                warning_error(wi,"Can not have an energy drift of exactly 0");
 +            }
 +
 +            if (ir->rlist < rc_max)
 +            {
 +                warning_error(wi,"With verlet lists rlist can not be smaller than rvdw or rcoulomb");
 +            }
 +            
 +            if (ir->rlist == rc_max && ir->nstlist > 1)
 +            {
 +                warning_note(wi,"rlist is equal to rvdw and/or rcoulomb: there is no explicit Verlet buffer. The cluster pair list does have a buffering effect, but choosing a larger rlist might be necessary for good energy conservation.");
 +            }
 +        }
 +        else
 +        {
 +            if (ir->rlist > rc_max)
 +            {
 +                warning_note(wi,"You have set rlist larger than the interaction cut-off, but you also have verlet-buffer-drift > 0. Will set rlist using verlet-buffer-drift.");
 +            }
 +
 +            if (ir->nstlist == 1)
 +            {
 +                /* No buffer required */
 +                ir->rlist = rc_max;
 +            }
 +            else
 +            {
 +                if (EI_DYNAMICS(ir->eI))
 +                {
 +                    if (EI_MD(ir->eI) && ir->etc == etcNO)
 +                    {
 +                        warning_error(wi,"Temperature coupling is required for calculating rlist using the energy drift with verlet-buffer-drift > 0. Either use temperature coupling or set rlist yourself together with verlet-buffer-drift = -1."); 
 +                    }
 +
 +                    if (inputrec2nboundeddim(ir) < 3)
 +                    {
 +                        warning_error(wi,"The box volume is required for calculating rlist from the energy drift with verlet-buffer-drift > 0. You are using at least one unbounded dimension, so no volume can be computed. Either use a finite box, or set rlist yourself together with verlet-buffer-drift = -1.");
 +                    }
 +                    /* Set rlist temporarily so we can continue processing */
 +                    ir->rlist = rc_max;
 +                }
 +                else
 +                {
 +                    /* Set the buffer to 5% of the cut-off */
 +                    ir->rlist = 1.05*rc_max;
 +                }
 +            }
 +        }
 +
 +        /* No twin-range calculations with Verlet lists */
 +        ir->rlistlong = ir->rlist;
 +    }
 +
 +    if(ir->nstcalclr==-1)
 +    {
 +        /* if rlist=rlistlong, this will later be changed to nstcalclr=0 */
 +        ir->nstcalclr = ir->nstlist;
 +    }
 +    else if(ir->nstcalclr>0)
 +    {
 +        if(ir->nstlist>0 && (ir->nstlist % ir->nstcalclr != 0))
 +        {
 +            warning_error(wi,"nstlist must be evenly divisible by nstcalclr. Use nstcalclr = -1 to automatically follow nstlist");
 +        }
 +    }
 +    else if(ir->nstcalclr<-1)
 +    {
 +        warning_error(wi,"nstcalclr must be a positive number (divisor of nstcalclr), or -1 to follow nstlist.");
 +    }
 +    
 +    if(EEL_PME(ir->coulombtype) && ir->rcoulomb > ir->rvdw && ir->nstcalclr>1)
 +    {
 +        warning_error(wi,"When used with PME, the long-range component of twin-range interactions must be updated every step (nstcalclr)");
 +    }
 +       
 +    /* GENERAL INTEGRATOR STUFF */
 +    if (!(ir->eI == eiMD || EI_VV(ir->eI)))
 +    {
 +        ir->etc = etcNO;
 +    }
 +    if (ir->eI == eiVVAK) {
 +        sprintf(warn_buf,"Integrator method %s is implemented primarily for validation purposes; for molecular dynamics, you should probably be using %s or %s",ei_names[eiVVAK],ei_names[eiMD],ei_names[eiVV]);
 +        warning_note(wi,warn_buf);
 +    }
 +    if (!EI_DYNAMICS(ir->eI))
 +    {
 +        ir->epc = epcNO;
 +    }
 +    if (EI_DYNAMICS(ir->eI))
 +    {
 +        if (ir->nstcalcenergy < 0)
 +        {
 +            ir->nstcalcenergy = ir_optimal_nstcalcenergy(ir);
 +            if (ir->nstenergy != 0 && ir->nstenergy < ir->nstcalcenergy)
 +            {
 +                /* nstcalcenergy larger than nstener does not make sense.
 +                 * We ideally want nstcalcenergy=nstener.
 +                 */
 +                if (ir->nstlist > 0)
 +                {
 +                    ir->nstcalcenergy = lcd(ir->nstenergy,ir->nstlist);
 +                }
 +                else
 +                {
 +                    ir->nstcalcenergy = ir->nstenergy;
 +                }
 +            }
 +        }
 +        else if (ir->nstenergy > 0 && ir->nstcalcenergy > ir->nstenergy)
 +        {
 +            /* If the user sets nstenergy small, we should respect that */
 +            sprintf(warn_buf,"Setting nstcalcenergy (%d) equal to nstenergy (%d)",ir->nstcalcenergy,ir->nstenergy);
 +            ir->nstcalcenergy = ir->nstenergy;
 +        }
 +
 +        if (ir->epc != epcNO)
 +        {
 +            if (ir->nstpcouple < 0)
 +            {
 +                ir->nstpcouple = ir_optimal_nstpcouple(ir);
 +            }
 +        }
 +        if (IR_TWINRANGE(*ir))
 +        {
 +            check_nst("nstlist",ir->nstlist,
 +                      "nstcalcenergy",&ir->nstcalcenergy,wi);
 +            if (ir->epc != epcNO)
 +            {
 +                check_nst("nstlist",ir->nstlist,
 +                          "nstpcouple",&ir->nstpcouple,wi); 
 +            }
 +        }
 +
 +        if (ir->nstcalcenergy > 1)
 +        {
 +            /* for storing exact averages nstenergy should be
 +             * a multiple of nstcalcenergy
 +             */
 +            check_nst("nstcalcenergy",ir->nstcalcenergy,
 +                      "nstenergy",&ir->nstenergy,wi);
 +            if (ir->efep != efepNO)
 +            {
 +                /* nstdhdl should be a multiple of nstcalcenergy */
 +                check_nst("nstcalcenergy",ir->nstcalcenergy,
 +                          "nstdhdl",&ir->fepvals->nstdhdl,wi);
 +                /* nstexpanded should be a multiple of nstcalcenergy */
 +                check_nst("nstcalcenergy",ir->nstcalcenergy,
 +                          "nstdhdl",&ir->expandedvals->nstexpanded,wi);
 +            }
 +        }
 +    }
 +
 +  /* LD STUFF */
 +  if ((EI_SD(ir->eI) || ir->eI == eiBD) &&
 +      ir->bContinuation && ir->ld_seed != -1) {
 +      warning_note(wi,"You are doing a continuation with SD or BD, make sure that ld_seed is different from the previous run (using ld_seed=-1 will ensure this)");
 +  }
 +
 +  /* TPI STUFF */
 +  if (EI_TPI(ir->eI)) {
 +    sprintf(err_buf,"TPI only works with pbc = %s",epbc_names[epbcXYZ]);
 +    CHECK(ir->ePBC != epbcXYZ);
 +    sprintf(err_buf,"TPI only works with ns = %s",ens_names[ensGRID]);
 +    CHECK(ir->ns_type != ensGRID);
 +    sprintf(err_buf,"with TPI nstlist should be larger than zero");
 +    CHECK(ir->nstlist <= 0);
 +    sprintf(err_buf,"TPI does not work with full electrostatics other than PME");
 +    CHECK(EEL_FULL(ir->coulombtype) && !EEL_PME(ir->coulombtype));
 +  }
 +
 +  /* SHAKE / LINCS */
 +  if ( (opts->nshake > 0) && (opts->bMorse) ) {
 +      sprintf(warn_buf,
 +              "Using morse bond-potentials while constraining bonds is useless");
 +      warning(wi,warn_buf);
 +  }
 +
 +  if ((EI_SD(ir->eI) || ir->eI == eiBD) &&
 +      ir->bContinuation && ir->ld_seed != -1) {
 +      warning_note(wi,"You are doing a continuation with SD or BD, make sure that ld_seed is different from the previous run (using ld_seed=-1 will ensure this)");
 +  }
 +  /* verify simulated tempering options */
 +
 +  if (ir->bSimTemp) {
 +      gmx_bool bAllTempZero = TRUE;
 +      for (i=0;i<fep->n_lambda;i++)
 +      {
 +          sprintf(err_buf,"Entry %d for %s must be between 0 and 1, instead is %g",i,efpt_names[efptTEMPERATURE],fep->all_lambda[efptTEMPERATURE][i]);
 +          CHECK((fep->all_lambda[efptTEMPERATURE][i] < 0) || (fep->all_lambda[efptTEMPERATURE][i] > 1));
 +          if (fep->all_lambda[efptTEMPERATURE][i] > 0)
 +          {
 +              bAllTempZero = FALSE;
 +          }
 +      }
 +      sprintf(err_buf,"if simulated tempering is on, temperature-lambdas may not be all zero");
 +      CHECK(bAllTempZero==TRUE);
 +
 +      sprintf(err_buf,"Simulated tempering is currently only compatible with md-vv");
 +      CHECK(ir->eI != eiVV);
 +
 +      /* check compatability of the temperature coupling with simulated tempering */
 +
 +      if (ir->etc == etcNOSEHOOVER) {
 +          sprintf(warn_buf,"Nose-Hoover based temperature control such as [%s] my not be entirelyconsistent with simulated tempering",etcoupl_names[ir->etc]);
 +          warning_note(wi,warn_buf);
 +      }
 +
 +      /* check that the temperatures make sense */
 +
 +      sprintf(err_buf,"Higher simulated tempering temperature (%g) must be >= than the simulated tempering lower temperature (%g)",ir->simtempvals->simtemp_high,ir->simtempvals->simtemp_low);
 +      CHECK(ir->simtempvals->simtemp_high <= ir->simtempvals->simtemp_low);
 +
 +      sprintf(err_buf,"Higher simulated tempering temperature (%g) must be >= zero",ir->simtempvals->simtemp_high);
 +      CHECK(ir->simtempvals->simtemp_high <= 0);
 +
 +      sprintf(err_buf,"Lower simulated tempering temperature (%g) must be >= zero",ir->simtempvals->simtemp_low);
 +      CHECK(ir->simtempvals->simtemp_low <= 0);
 +  }
 +
 +  /* verify free energy options */
 +
 +  if (ir->efep != efepNO) {
 +      fep = ir->fepvals;
 +      sprintf(err_buf,"The soft-core power is %d and can only be 1 or 2",
 +              fep->sc_power);
 +      CHECK(fep->sc_alpha!=0 && fep->sc_power!=1 && fep->sc_power!=2);
 +
 +      sprintf(err_buf,"The soft-core sc-r-power is %d and can only be 6 or 48",
 +              (int)fep->sc_r_power);
 +      CHECK(fep->sc_alpha!=0 && fep->sc_r_power!=6.0 && fep->sc_r_power!=48.0);
 +
 +      /* check validity of options */
 +      if (fep->n_lambda > 0 && ir->rlist < max(ir->rvdw,ir->rcoulomb))
 +      {
 +          sprintf(warn_buf,
 +                  "For foreign lambda free energy differences it is assumed that the soft-core interactions have no effect beyond the neighborlist cut-off");
 +          warning(wi,warn_buf);
 +      }
 +
 +      sprintf(err_buf,"Can't use postive delta-lambda (%g) if initial state/lambda does not start at zero",fep->delta_lambda);
 +      CHECK(fep->delta_lambda > 0 && ((fep->init_fep_state !=0) ||  (fep->init_lambda !=0)));
 +
 +      sprintf(err_buf,"Can't use postive delta-lambda (%g) with expanded ensemble simulations",fep->delta_lambda);
 +      CHECK(fep->delta_lambda > 0 && (ir->efep == efepEXPANDED));
 +
 +      sprintf(err_buf,"Free-energy not implemented for Ewald");
 +      CHECK(ir->coulombtype==eelEWALD);
 +
 +      /* check validty of lambda inputs */
 +      sprintf(err_buf,"initial thermodynamic state %d does not exist, only goes to %d",fep->init_fep_state,fep->n_lambda);
 +      CHECK((fep->init_fep_state > fep->n_lambda));
 +
 +      for (j=0;j<efptNR;j++)
 +      {
 +          for (i=0;i<fep->n_lambda;i++)
 +          {
 +              sprintf(err_buf,"Entry %d for %s must be between 0 and 1, instead is %g",i,efpt_names[j],fep->all_lambda[j][i]);
 +              CHECK((fep->all_lambda[j][i] < 0) || (fep->all_lambda[j][i] > 1));
 +          }
 +      }
 +
 +      if ((fep->sc_alpha>0) && (!fep->bScCoul))
 +      {
 +          for (i=0;i<fep->n_lambda;i++)
 +          {
 +              sprintf(err_buf,"For state %d, vdw-lambdas (%f) is changing with vdw softcore, while coul-lambdas (%f) is nonzero without coulomb softcore: this will lead to crashes, and is not supported.",i,fep->all_lambda[efptVDW][i],
 +                      fep->all_lambda[efptCOUL][i]);
 +              CHECK((fep->sc_alpha>0) &&
 +                    (((fep->all_lambda[efptCOUL][i] > 0.0) &&
 +                      (fep->all_lambda[efptCOUL][i] < 1.0)) &&
 +                     ((fep->all_lambda[efptVDW][i] > 0.0) &&
 +                      (fep->all_lambda[efptVDW][i] < 1.0))));
 +          }
 +      }
 +
 +      if ((fep->bScCoul) && (EEL_PME(ir->coulombtype)))
 +      {
 +          sprintf(warn_buf,"With coulomb soft core, the reciprocal space calculation will not necessarily cancel.  It may be necessary to decrease the reciprocal space energy, and increase the cutoff radius to get sufficiently close matches to energies with free energy turned off.");
 +          warning(wi, warn_buf);
 +      }
 +
 +      /*  Free Energy Checks -- In an ideal world, slow growth and FEP would
 +          be treated differently, but that's the next step */
 +
 +      for (i=0;i<efptNR;i++) {
 +          for (j=0;j<fep->n_lambda;j++) {
 +              sprintf(err_buf,"%s[%d] must be between 0 and 1",efpt_names[i],j);
 +              CHECK((fep->all_lambda[i][j] < 0) || (fep->all_lambda[i][j] > 1));
 +          }
 +      }
 +  }
 +
 +  if ((ir->bSimTemp) || (ir->efep == efepEXPANDED)) {
 +      fep = ir->fepvals;
 +      expand = ir->expandedvals;
 +
 +      /* checking equilibration of weights inputs for validity */
 +
 +      sprintf(err_buf,"weight-equil-number-all-lambda (%d) is ignored if lmc-weights-equil is not equal to %s",
 +              expand->equil_n_at_lam,elmceq_names[elmceqNUMATLAM]);
 +      CHECK((expand->equil_n_at_lam>0) && (expand->elmceq!=elmceqNUMATLAM));
 +
 +      sprintf(err_buf,"weight-equil-number-samples (%d) is ignored if lmc-weights-equil is not equal to %s",
 +              expand->equil_samples,elmceq_names[elmceqSAMPLES]);
 +      CHECK((expand->equil_samples>0) && (expand->elmceq!=elmceqSAMPLES));
 +
 +      sprintf(err_buf,"weight-equil-number-steps (%d) is ignored if lmc-weights-equil is not equal to %s",
 +              expand->equil_steps,elmceq_names[elmceqSTEPS]);
 +      CHECK((expand->equil_steps>0) && (expand->elmceq!=elmceqSTEPS));
 +
 +      sprintf(err_buf,"weight-equil-wl-delta (%d) is ignored if lmc-weights-equil is not equal to %s",
 +              expand->equil_samples,elmceq_names[elmceqWLDELTA]);
 +      CHECK((expand->equil_wl_delta>0) && (expand->elmceq!=elmceqWLDELTA));
 +
 +      sprintf(err_buf,"weight-equil-count-ratio (%f) is ignored if lmc-weights-equil is not equal to %s",
 +              expand->equil_ratio,elmceq_names[elmceqRATIO]);
 +      CHECK((expand->equil_ratio>0) && (expand->elmceq!=elmceqRATIO));
 +
 +      sprintf(err_buf,"weight-equil-number-all-lambda (%d) must be a positive integer if lmc-weights-equil=%s",
 +              expand->equil_n_at_lam,elmceq_names[elmceqNUMATLAM]);
 +      CHECK((expand->equil_n_at_lam<=0) && (expand->elmceq==elmceqNUMATLAM));
 +
 +      sprintf(err_buf,"weight-equil-number-samples (%d) must be a positive integer if lmc-weights-equil=%s",
 +              expand->equil_samples,elmceq_names[elmceqSAMPLES]);
 +      CHECK((expand->equil_samples<=0) && (expand->elmceq==elmceqSAMPLES));
 +
 +      sprintf(err_buf,"weight-equil-number-steps (%d) must be a positive integer if lmc-weights-equil=%s",
 +              expand->equil_steps,elmceq_names[elmceqSTEPS]);
 +      CHECK((expand->equil_steps<=0) && (expand->elmceq==elmceqSTEPS));
 +
 +      sprintf(err_buf,"weight-equil-wl-delta (%f) must be > 0 if lmc-weights-equil=%s",
 +              expand->equil_wl_delta,elmceq_names[elmceqWLDELTA]);
 +      CHECK((expand->equil_wl_delta<=0) && (expand->elmceq==elmceqWLDELTA));
 +
 +      sprintf(err_buf,"weight-equil-count-ratio (%f) must be > 0 if lmc-weights-equil=%s",
 +              expand->equil_ratio,elmceq_names[elmceqRATIO]);
 +      CHECK((expand->equil_ratio<=0) && (expand->elmceq==elmceqRATIO));
 +
 +      sprintf(err_buf,"lmc-weights-equil=%s only possible when lmc-stats = %s or lmc-stats %s",
 +              elmceq_names[elmceqWLDELTA],elamstats_names[elamstatsWL],elamstats_names[elamstatsWWL]);
 +      CHECK((expand->elmceq==elmceqWLDELTA) && (!EWL(expand->elamstats)));
 +
 +      sprintf(err_buf,"lmc-repeats (%d) must be greater than 0",expand->lmc_repeats);
 +      CHECK((expand->lmc_repeats <= 0));
 +      sprintf(err_buf,"minimum-var-min (%d) must be greater than 0",expand->minvarmin);
 +      CHECK((expand->minvarmin <= 0));
 +      sprintf(err_buf,"weight-c-range (%d) must be greater or equal to 0",expand->c_range);
 +      CHECK((expand->c_range < 0));
 +      sprintf(err_buf,"init-lambda-state (%d) must be zero if lmc-forced-nstart (%d)> 0 and lmc-move != 'no'",
 +              fep->init_fep_state, expand->lmc_forced_nstart);
 +      CHECK((fep->init_fep_state!=0) && (expand->lmc_forced_nstart>0) && (expand->elmcmove!=elmcmoveNO));
 +      sprintf(err_buf,"lmc-forced-nstart (%d) must not be negative",expand->lmc_forced_nstart);
 +      CHECK((expand->lmc_forced_nstart < 0));
 +      sprintf(err_buf,"init-lambda-state (%d) must be in the interval [0,number of lambdas)",fep->init_fep_state);
 +      CHECK((fep->init_fep_state < 0) || (fep->init_fep_state >= fep->n_lambda));
 +
 +      sprintf(err_buf,"init-wl-delta (%f) must be greater than or equal to 0",expand->init_wl_delta);
 +      CHECK((expand->init_wl_delta < 0));
 +      sprintf(err_buf,"wl-ratio (%f) must be between 0 and 1",expand->wl_ratio);
 +      CHECK((expand->wl_ratio <= 0) || (expand->wl_ratio >= 1));
 +      sprintf(err_buf,"wl-scale (%f) must be between 0 and 1",expand->wl_scale);
 +      CHECK((expand->wl_scale <= 0) || (expand->wl_scale >= 1));
 +
 +      /* if there is no temperature control, we need to specify an MC temperature */
 +      sprintf(err_buf,"If there is no temperature control, and lmc-mcmove!= 'no',mc_temperature must be set to a positive number");
 +      if (expand->nstTij > 0)
 +      {
 +          sprintf(err_buf,"nst-transition-matrix (%d) must be an integer multiple of nstlog (%d)",
 +                  expand->nstTij,ir->nstlog);
 +          CHECK((mod(expand->nstTij,ir->nstlog)!=0));
 +      }
 +  }
 +
 +  /* PBC/WALLS */
 +  sprintf(err_buf,"walls only work with pbc=%s",epbc_names[epbcXY]);
 +  CHECK(ir->nwall && ir->ePBC!=epbcXY);
 +
 +  /* VACUUM STUFF */
 +  if (ir->ePBC != epbcXYZ && ir->nwall != 2) {
 +    if (ir->ePBC == epbcNONE) {
 +      if (ir->epc != epcNO) {
 +          warning(wi,"Turning off pressure coupling for vacuum system");
 +          ir->epc = epcNO;
 +      }
 +    } else {
 +      sprintf(err_buf,"Can not have pressure coupling with pbc=%s",
 +            epbc_names[ir->ePBC]);
 +      CHECK(ir->epc != epcNO);
 +    }
 +    sprintf(err_buf,"Can not have Ewald with pbc=%s",epbc_names[ir->ePBC]);
 +    CHECK(EEL_FULL(ir->coulombtype));
 +
 +    sprintf(err_buf,"Can not have dispersion correction with pbc=%s",
 +          epbc_names[ir->ePBC]);
 +    CHECK(ir->eDispCorr != edispcNO);
 +  }
 +
 +  if (ir->rlist == 0.0) {
 +    sprintf(err_buf,"can only have neighborlist cut-off zero (=infinite)\n"
 +          "with coulombtype = %s or coulombtype = %s\n"
 +          "without periodic boundary conditions (pbc = %s) and\n"
 +          "rcoulomb and rvdw set to zero",
 +          eel_names[eelCUT],eel_names[eelUSER],epbc_names[epbcNONE]);
 +    CHECK(((ir->coulombtype != eelCUT) && (ir->coulombtype != eelUSER)) ||
 +        (ir->ePBC     != epbcNONE) ||
 +        (ir->rcoulomb != 0.0)      || (ir->rvdw != 0.0));
 +
 +    if (ir->nstlist < 0) {
 +        warning_error(wi,"Can not have heuristic neighborlist updates without cut-off");
 +    }
 +    if (ir->nstlist > 0) {
 +        warning_note(wi,"Simulating without cut-offs is usually (slightly) faster with nstlist=0, nstype=simple and particle decomposition");
 +    }
 +  }
 +
 +  /* COMM STUFF */
 +  if (ir->nstcomm == 0) {
 +    ir->comm_mode = ecmNO;
 +  }
 +  if (ir->comm_mode != ecmNO) {
 +    if (ir->nstcomm < 0) {
 +        warning(wi,"If you want to remove the rotation around the center of mass, you should set comm_mode = Angular instead of setting nstcomm < 0. nstcomm is modified to its absolute value");
 +      ir->nstcomm = abs(ir->nstcomm);
 +    }
 +
 +    if (ir->nstcalcenergy > 0 && ir->nstcomm < ir->nstcalcenergy) {
 +        warning_note(wi,"nstcomm < nstcalcenergy defeats the purpose of nstcalcenergy, setting nstcomm to nstcalcenergy");
 +        ir->nstcomm = ir->nstcalcenergy;
 +    }
 +
 +    if (ir->comm_mode == ecmANGULAR) {
 +      sprintf(err_buf,"Can not remove the rotation around the center of mass with periodic molecules");
 +      CHECK(ir->bPeriodicMols);
 +      if (ir->ePBC != epbcNONE)
 +          warning(wi,"Removing the rotation around the center of mass in a periodic system (this is not a problem when you have only one molecule).");
 +    }
 +  }
 +
 +  if (EI_STATE_VELOCITY(ir->eI) && ir->ePBC == epbcNONE && ir->comm_mode != ecmANGULAR) {
 +      warning_note(wi,"Tumbling and or flying ice-cubes: We are not removing rotation around center of mass in a non-periodic system. You should probably set comm_mode = ANGULAR.");
 +  }
 +  
 +  sprintf(err_buf,"Twin-range neighbour searching (NS) with simple NS"
 +        " algorithm not implemented");
 +  CHECK(((ir->rcoulomb > ir->rlist) || (ir->rvdw > ir->rlist))
 +      && (ir->ns_type == ensSIMPLE));
 +
 +  /* TEMPERATURE COUPLING */
 +  if (ir->etc == etcYES)
 +    {
 +        ir->etc = etcBERENDSEN;
 +        warning_note(wi,"Old option for temperature coupling given: "
 +                     "changing \"yes\" to \"Berendsen\"\n");
 +    }
 +
 +    if ((ir->etc == etcNOSEHOOVER) || (ir->epc == epcMTTK))
 +    {
 +        if (ir->opts.nhchainlength < 1)
 +        {
 +            sprintf(warn_buf,"number of Nose-Hoover chains (currently %d) cannot be less than 1,reset to 1\n",ir->opts.nhchainlength);
 +            ir->opts.nhchainlength =1;
 +            warning(wi,warn_buf);
 +        }
 +        
 +        if (ir->etc==etcNOSEHOOVER && !EI_VV(ir->eI) && ir->opts.nhchainlength > 1)
 +        {
 +            warning_note(wi,"leapfrog does not yet support Nose-Hoover chains, nhchainlength reset to 1");
 +            ir->opts.nhchainlength = 1;
 +        }
 +    }
 +    else
 +    {
 +        ir->opts.nhchainlength = 0;
 +    }
 +
 +    if (ir->eI == eiVVAK) {
 +        sprintf(err_buf,"%s implemented primarily for validation, and requires nsttcouple = 1 and nstpcouple = 1.",
 +                ei_names[eiVVAK]);
 +        CHECK((ir->nsttcouple != 1) || (ir->nstpcouple != 1));
 +    }
 +
 +    if (ETC_ANDERSEN(ir->etc))
 +    {
 +        sprintf(err_buf,"%s temperature control not supported for integrator %s.",etcoupl_names[ir->etc],ei_names[ir->eI]);
 +        CHECK(!(EI_VV(ir->eI)));
 +
 +        for (i=0;i<ir->opts.ngtc;i++)
 +        {
 +            sprintf(err_buf,"all tau_t must currently be equal using Andersen temperature control, violated for group %d",i);
 +            CHECK(ir->opts.tau_t[0] != ir->opts.tau_t[i]);
 +            sprintf(err_buf,"all tau_t must be postive using Andersen temperature control, tau_t[%d]=%10.6f",
 +                    i,ir->opts.tau_t[i]);
 +            CHECK(ir->opts.tau_t[i]<0);
 +        }
 +        if (ir->nstcomm > 0 && (ir->etc == etcANDERSEN)) {
 +            sprintf(warn_buf,"Center of mass removal not necessary for %s.  All velocities of coupled groups are rerandomized periodically, so flying ice cube errors will not occur.",etcoupl_names[ir->etc]);
 +            warning_note(wi,warn_buf);
 +        }
 +
 +        sprintf(err_buf,"nstcomm must be 1, not %d for %s, as velocities of atoms in coupled groups are randomized every time step",ir->nstcomm,etcoupl_names[ir->etc]);
 +        CHECK(ir->nstcomm > 1 && (ir->etc == etcANDERSEN));
 +
 +        for (i=0;i<ir->opts.ngtc;i++)
 +        {
 +            int nsteps = (int)(ir->opts.tau_t[i]/ir->delta_t);
 +            sprintf(err_buf,"tau_t/delta_t for group %d for temperature control method %s must be a multiple of nstcomm (%d), as velocities of atoms in coupled groups are randomized every time step. The input tau_t (%8.3f) leads to %d steps per randomization",i,etcoupl_names[ir->etc],ir->nstcomm,ir->opts.tau_t[i],nsteps);
 +            CHECK((nsteps % ir->nstcomm) && (ir->etc == etcANDERSENMASSIVE));
 +        }
 +    }
 +    if (ir->etc == etcBERENDSEN)
 +    {
 +        sprintf(warn_buf,"The %s thermostat does not generate the correct kinetic energy distribution. You might want to consider using the %s thermostat.",
 +                ETCOUPLTYPE(ir->etc),ETCOUPLTYPE(etcVRESCALE));
 +        warning_note(wi,warn_buf);
 +    }
 +
 +    if ((ir->etc==etcNOSEHOOVER || ETC_ANDERSEN(ir->etc))
 +        && ir->epc==epcBERENDSEN)
 +    {
 +        sprintf(warn_buf,"Using Berendsen pressure coupling invalidates the "
 +                "true ensemble for the thermostat");
 +        warning(wi,warn_buf);
 +    }
 +
 +    /* PRESSURE COUPLING */
 +    if (ir->epc == epcISOTROPIC)
 +    {
 +        ir->epc = epcBERENDSEN;
 +        warning_note(wi,"Old option for pressure coupling given: "
 +                     "changing \"Isotropic\" to \"Berendsen\"\n"); 
 +    }
 +
 +    if (ir->epc != epcNO)
 +    {
 +        dt_pcoupl = ir->nstpcouple*ir->delta_t;
 +
 +        sprintf(err_buf,"tau-p must be > 0 instead of %g\n",ir->tau_p);
 +        CHECK(ir->tau_p <= 0);
 +
 +        if (ir->tau_p/dt_pcoupl < pcouple_min_integration_steps(ir->epc))
 +        {
 +            sprintf(warn_buf,"For proper integration of the %s barostat, tau-p (%g) should be at least %d times larger than nstpcouple*dt (%g)",
 +                    EPCOUPLTYPE(ir->epc),ir->tau_p,pcouple_min_integration_steps(ir->epc),dt_pcoupl);
 +            warning(wi,warn_buf);
 +        }
 +
 +        sprintf(err_buf,"compressibility must be > 0 when using pressure"
 +                " coupling %s\n",EPCOUPLTYPE(ir->epc));
 +        CHECK(ir->compress[XX][XX] < 0 || ir->compress[YY][YY] < 0 ||
 +              ir->compress[ZZ][ZZ] < 0 ||
 +              (trace(ir->compress) == 0 && ir->compress[YY][XX] <= 0 &&
 +               ir->compress[ZZ][XX] <= 0 && ir->compress[ZZ][YY] <= 0));
 +        
 +        if (epcPARRINELLORAHMAN == ir->epc && opts->bGenVel)
 +        {
 +            sprintf(warn_buf,
 +                    "You are generating velocities so I am assuming you "
 +                    "are equilibrating a system. You are using "
 +                    "%s pressure coupling, but this can be "
 +                    "unstable for equilibration. If your system crashes, try "
 +                    "equilibrating first with Berendsen pressure coupling. If "
 +                    "you are not equilibrating the system, you can probably "
 +                    "ignore this warning.",
 +                    epcoupl_names[ir->epc]);
 +            warning(wi,warn_buf);
 +        }
 +    }
 +
 +    if (EI_VV(ir->eI))
 +    {
 +        if (ir->epc > epcNO)
 +        {
 +            if ((ir->epc!=epcBERENDSEN) && (ir->epc!=epcMTTK))
 +            {
 +                warning_error(wi,"for md-vv and md-vv-avek, can only use Berendsen and Martyna-Tuckerman-Tobias-Klein (MTTK) equations for pressure control; MTTK is equivalent to Parrinello-Rahman.");
 +            }
 +        }
 +    }
 +
 +  /* ELECTROSTATICS */
 +  /* More checks are in triple check (grompp.c) */
 +
 +  if (ir->coulombtype == eelSWITCH) {
 +    sprintf(warn_buf,"coulombtype = %s is only for testing purposes and can lead to serious "
 +            "artifacts, advice: use coulombtype = %s",
 +          eel_names[ir->coulombtype],
 +          eel_names[eelRF_ZERO]);
 +    warning(wi,warn_buf);
 +  }
 +
 +  if (ir->epsilon_r!=1 && ir->implicit_solvent==eisGBSA) {
 +    sprintf(warn_buf,"epsilon-r = %g with GB implicit solvent, will use this value for inner dielectric",ir->epsilon_r);
 +    warning_note(wi,warn_buf);
 +  }
 +
 +  if (EEL_RF(ir->coulombtype) && ir->epsilon_rf==1 && ir->epsilon_r!=1) {
 +    sprintf(warn_buf,"epsilon-r = %g and epsilon-rf = 1 with reaction field, proceeding assuming old format and exchanging epsilon-r and epsilon-rf",ir->epsilon_r);
 +    warning(wi,warn_buf);
 +    ir->epsilon_rf = ir->epsilon_r;
 +    ir->epsilon_r  = 1.0;
 +  }
 +
 +  if (getenv("GALACTIC_DYNAMICS") == NULL) {  
 +    sprintf(err_buf,"epsilon-r must be >= 0 instead of %g\n",ir->epsilon_r);
 +    CHECK(ir->epsilon_r < 0);
 +  }
 +  
 +  if (EEL_RF(ir->coulombtype)) {
 +    /* reaction field (at the cut-off) */
 +    
 +    if (ir->coulombtype == eelRF_ZERO) {
 +       sprintf(warn_buf,"With coulombtype = %s, epsilon-rf must be 0, assuming you meant epsilon_rf=0",
 +             eel_names[ir->coulombtype]);
 +        CHECK(ir->epsilon_rf != 0);
 +        ir->epsilon_rf = 0.0;
 +    }
 +
 +    sprintf(err_buf,"epsilon-rf must be >= epsilon-r");
 +    CHECK((ir->epsilon_rf < ir->epsilon_r && ir->epsilon_rf != 0) ||
 +        (ir->epsilon_r == 0));
 +    if (ir->epsilon_rf == ir->epsilon_r) {
 +      sprintf(warn_buf,"Using epsilon-rf = epsilon-r with %s does not make sense",
 +            eel_names[ir->coulombtype]);
 +      warning(wi,warn_buf);
 +    }
 +  }
 +  /* Allow rlist>rcoulomb for tabulated long range stuff. This just
 +   * means the interaction is zero outside rcoulomb, but it helps to
 +   * provide accurate energy conservation.
 +   */
 +  if (EEL_MIGHT_BE_ZERO_AT_CUTOFF(ir->coulombtype)) {
 +    if (EEL_SWITCHED(ir->coulombtype)) {
 +      sprintf(err_buf,
 +            "With coulombtype = %s rcoulomb_switch must be < rcoulomb. Or, better: Use the potential modifier options!",
 +            eel_names[ir->coulombtype]);
 +      CHECK(ir->rcoulomb_switch >= ir->rcoulomb);
 +    }
 +  } else if (ir->coulombtype == eelCUT || EEL_RF(ir->coulombtype)) {
 +      if (ir->cutoff_scheme == ecutsGROUP && ir->coulomb_modifier == eintmodNONE) {
 +          sprintf(err_buf,"With coulombtype = %s, rcoulomb should be >= rlist unless you use a potential modifier",
 +                  eel_names[ir->coulombtype]);
 +          CHECK(ir->rlist > ir->rcoulomb);
 +      }
 +  }
 +
 +  if(ir->coulombtype==eelSWITCH || ir->coulombtype==eelSHIFT ||
 +     ir->vdwtype==evdwSWITCH || ir->vdwtype==evdwSHIFT)
 +  {
 +      sprintf(warn_buf,
 +              "The switch/shift interaction settings are just for compatibility; you will get better"
 +              "performance from applying potential modifiers to your interactions!\n");
 +      warning_note(wi,warn_buf);
 +  }
 +
 +  if (EEL_FULL(ir->coulombtype))
 +  {
 +      if (ir->coulombtype==eelPMESWITCH || ir->coulombtype==eelPMEUSER ||
 +          ir->coulombtype==eelPMEUSERSWITCH)
 +      {
 +          sprintf(err_buf,"With coulombtype = %s, rcoulomb must be <= rlist",
 +                  eel_names[ir->coulombtype]);
 +          CHECK(ir->rcoulomb > ir->rlist);
 +      }
 +      else if (ir->cutoff_scheme == ecutsGROUP && ir->coulomb_modifier == eintmodNONE)
 +      {
 +          if (ir->coulombtype == eelPME || ir->coulombtype == eelP3M_AD)
 +          {
 +              sprintf(err_buf,
 +                      "With coulombtype = %s (without modifier), rcoulomb must be equal to rlist,\n"
 +                      "or rlistlong if nstcalclr=1. For optimal energy conservation,consider using\n"
 +                      "a potential modifier.",eel_names[ir->coulombtype]);
 +              if(ir->nstcalclr==1)
 +              {
 +                  CHECK(ir->rcoulomb != ir->rlist && ir->rcoulomb != ir->rlistlong);
 +              }
 +              else
 +              {
 +                  CHECK(ir->rcoulomb != ir->rlist);
 +              }
 +          }
 +      }
 +  }
 +
 +  if (EEL_PME(ir->coulombtype)) {
 +    if (ir->pme_order < 3) {
 +        warning_error(wi,"pme-order can not be smaller than 3");
 +    }
 +  }
 +
 +  if (ir->nwall==2 && EEL_FULL(ir->coulombtype)) {
 +    if (ir->ewald_geometry == eewg3D) {
 +      sprintf(warn_buf,"With pbc=%s you should use ewald-geometry=%s",
 +            epbc_names[ir->ePBC],eewg_names[eewg3DC]);
 +      warning(wi,warn_buf);
 +    }
 +    /* This check avoids extra pbc coding for exclusion corrections */
 +    sprintf(err_buf,"wall-ewald-zfac should be >= 2");
 +    CHECK(ir->wall_ewald_zfac < 2);
 +  }
 +
 +  if (EVDW_SWITCHED(ir->vdwtype)) {
 +    sprintf(err_buf,"With vdwtype = %s rvdw-switch must be < rvdw. Or, better - use a potential modifier.",
 +          evdw_names[ir->vdwtype]);
 +    CHECK(ir->rvdw_switch >= ir->rvdw);
 +  } else if (ir->vdwtype == evdwCUT) {
 +      if (ir->cutoff_scheme == ecutsGROUP && ir->vdw_modifier == eintmodNONE) {
 +          sprintf(err_buf,"With vdwtype = %s, rvdw must be >= rlist unless you use a potential modifier",evdw_names[ir->vdwtype]);
 +          CHECK(ir->rlist > ir->rvdw);
 +      }
 +  }
 +    if (ir->cutoff_scheme == ecutsGROUP)
 +    {
 +        if (EEL_IS_ZERO_AT_CUTOFF(ir->coulombtype)
 +            && (ir->rlistlong <= ir->rcoulomb))
 +        {
 +            sprintf(warn_buf,"For energy conservation with switch/shift potentials, %s should be 0.1 to 0.3 nm larger than rcoulomb.",
 +                    IR_TWINRANGE(*ir) ? "rlistlong" : "rlist");
 +            warning_note(wi,warn_buf);
 +        }
 +        if (EVDW_SWITCHED(ir->vdwtype) && (ir->rlistlong <= ir->rvdw))
 +        {
 +            sprintf(warn_buf,"For energy conservation with switch/shift potentials, %s should be 0.1 to 0.3 nm larger than rvdw.",
 +                    IR_TWINRANGE(*ir) ? "rlistlong" : "rlist");
 +            warning_note(wi,warn_buf);
 +        }
 +    }
 +
 +  if (ir->vdwtype == evdwUSER && ir->eDispCorr != edispcNO) {
 +      warning_note(wi,"You have selected user tables with dispersion correction, the dispersion will be corrected to -C6/r^6 beyond rvdw_switch (the tabulated interaction between rvdw_switch and rvdw will not be double counted). Make sure that you really want dispersion correction to -C6/r^6.");
 +  }
 +
 +  if (ir->nstlist == -1) {
 +    sprintf(err_buf,"With nstlist=-1 rvdw and rcoulomb should be smaller than rlist to account for diffusion and possibly charge-group radii");
 +    CHECK(ir->rvdw >= ir->rlist || ir->rcoulomb >= ir->rlist);
 +  }
 +  sprintf(err_buf,"nstlist can not be smaller than -1");
 +  CHECK(ir->nstlist < -1);
 +
 +  if (ir->eI == eiLBFGS && (ir->coulombtype==eelCUT || ir->vdwtype==evdwCUT)
 +     && ir->rvdw != 0) {
 +    warning(wi,"For efficient BFGS minimization, use switch/shift/pme instead of cut-off.");
 +  }
 +
 +  if (ir->eI == eiLBFGS && ir->nbfgscorr <= 0) {
 +    warning(wi,"Using L-BFGS with nbfgscorr<=0 just gets you steepest descent.");
 +  }
 +
 +    /* ENERGY CONSERVATION */
 +    if (ir_NVE(ir) && ir->cutoff_scheme == ecutsGROUP)
 +    {
 +        if (!EVDW_MIGHT_BE_ZERO_AT_CUTOFF(ir->vdwtype) && ir->rvdw > 0 && ir->vdw_modifier == eintmodNONE)
 +        {
 +            sprintf(warn_buf,"You are using a cut-off for VdW interactions with NVE, for good energy conservation use vdwtype = %s (possibly with DispCorr)",
 +                    evdw_names[evdwSHIFT]);
 +            warning_note(wi,warn_buf);
 +        }
 +        if (!EEL_MIGHT_BE_ZERO_AT_CUTOFF(ir->coulombtype) && ir->rcoulomb > 0 && ir->coulomb_modifier == eintmodNONE)
 +        {
 +            sprintf(warn_buf,"You are using a cut-off for electrostatics with NVE, for good energy conservation use coulombtype = %s or %s",
 +                    eel_names[eelPMESWITCH],eel_names[eelRF_ZERO]);
 +            warning_note(wi,warn_buf);
 +        }
 +    }
 +
 +  /* IMPLICIT SOLVENT */
 +  if(ir->coulombtype==eelGB_NOTUSED)
 +  {
 +    ir->coulombtype=eelCUT;
 +    ir->implicit_solvent=eisGBSA;
 +    fprintf(stderr,"Note: Old option for generalized born electrostatics given:\n"
 +          "Changing coulombtype from \"generalized-born\" to \"cut-off\" and instead\n"
 +            "setting implicit-solvent value to \"GBSA\" in input section.\n");
 +  }
 +
 +  if(ir->sa_algorithm==esaSTILL)
 +  {
 +    sprintf(err_buf,"Still SA algorithm not available yet, use %s or %s instead\n",esa_names[esaAPPROX],esa_names[esaNO]);
 +    CHECK(ir->sa_algorithm == esaSTILL);
 +  }
 +  
 +  if(ir->implicit_solvent==eisGBSA)
 +  {
 +    sprintf(err_buf,"With GBSA implicit solvent, rgbradii must be equal to rlist.");
 +    CHECK(ir->rgbradii != ir->rlist);
 +        
 +    if(ir->coulombtype!=eelCUT)
 +        {
 +                sprintf(err_buf,"With GBSA, coulombtype must be equal to %s\n",eel_names[eelCUT]);
 +                CHECK(ir->coulombtype!=eelCUT);
 +        }
 +        if(ir->vdwtype!=evdwCUT)
 +        {
 +                sprintf(err_buf,"With GBSA, vdw-type must be equal to %s\n",evdw_names[evdwCUT]);
 +                CHECK(ir->vdwtype!=evdwCUT);
 +        }
 +    if(ir->nstgbradii<1)
 +    {
 +      sprintf(warn_buf,"Using GBSA with nstgbradii<1, setting nstgbradii=1");
 +      warning_note(wi,warn_buf);
 +      ir->nstgbradii=1;
 +    }
 +    if(ir->sa_algorithm==esaNO)
 +    {
 +      sprintf(warn_buf,"No SA (non-polar) calculation requested together with GB. Are you sure this is what you want?\n");
 +      warning_note(wi,warn_buf);
 +    }
 +    if(ir->sa_surface_tension<0 && ir->sa_algorithm!=esaNO)
 +    {
 +      sprintf(warn_buf,"Value of sa_surface_tension is < 0. Changing it to 2.05016 or 2.25936 kJ/nm^2/mol for Still and HCT/OBC respectively\n");
 +      warning_note(wi,warn_buf);
 +      
 +      if(ir->gb_algorithm==egbSTILL)
 +      {
 +        ir->sa_surface_tension = 0.0049 * CAL2JOULE * 100;
 +      }
 +      else
 +      {
 +        ir->sa_surface_tension = 0.0054 * CAL2JOULE * 100;
 +      }
 +    }
 +    if(ir->sa_surface_tension==0 && ir->sa_algorithm!=esaNO)
 +    {
 +      sprintf(err_buf, "Surface tension set to 0 while SA-calculation requested\n");
 +      CHECK(ir->sa_surface_tension==0 && ir->sa_algorithm!=esaNO);
 +    }
 +    
 +  }
 +
 +    if (ir->bAdress)
 +    {
 +        if (ir->cutoff_scheme != ecutsGROUP)
 +        {
 +            warning_error(wi,"AdresS simulation supports only cutoff-scheme=group");
 +        }
 +        if (!EI_SD(ir->eI))
 +        {
 +            warning_error(wi,"AdresS simulation supports only stochastic dynamics");
 +        }
 +        if (ir->epc != epcNO)
 +        {
 +            warning_error(wi,"AdresS simulation does not support pressure coupling");
 +        }
 +        if (EEL_FULL(ir->coulombtype))
 +        {
 +            warning_error(wi,"AdresS simulation does not support long-range electrostatics");
 +        }
 +    }
 +}
 +
 +/* count the number of text elemets separated by whitespace in a string.
 +    str = the input string
 +    maxptr = the maximum number of allowed elements
 +    ptr = the output array of pointers to the first character of each element
 +    returns: the number of elements. */
 +int str_nelem(const char *str,int maxptr,char *ptr[])
 +{
 +  int  np=0;
 +  char *copy0,*copy;
 +  
 +  copy0=strdup(str); 
 +  copy=copy0;
 +  ltrim(copy);
 +  while (*copy != '\0') {
 +    if (np >= maxptr)
 +      gmx_fatal(FARGS,"Too many groups on line: '%s' (max is %d)",
 +                str,maxptr);
 +    if (ptr) 
 +      ptr[np]=copy;
 +    np++;
 +    while ((*copy != '\0') && !isspace(*copy))
 +      copy++;
 +    if (*copy != '\0') {
 +      *copy='\0';
 +      copy++;
 +    }
 +    ltrim(copy);
 +  }
 +  if (ptr == NULL)
 +    sfree(copy0);
 +
 +  return np;
 +}
 +
 +/* interpret a number of doubles from a string and put them in an array,
 +   after allocating space for them.
 +   str = the input string
 +   n = the (pre-allocated) number of doubles read
 +   r = the output array of doubles. */
 +static void parse_n_real(char *str,int *n,real **r)
 +{
 +  char *ptr[MAXPTR];
 +  int  i;
 +
 +  *n = str_nelem(str,MAXPTR,ptr);
 +
 +  snew(*r,*n);
 +  for(i=0; i<*n; i++) {
 +    (*r)[i] = strtod(ptr[i],NULL);
 +  }
 +}
 +
 +static void do_fep_params(t_inputrec *ir, char fep_lambda[][STRLEN],char weights[STRLEN]) {
 +
 +    int i,j,max_n_lambda,nweights,nfep[efptNR];
 +    t_lambda *fep = ir->fepvals;
 +    t_expanded *expand = ir->expandedvals;
 +    real **count_fep_lambdas;
 +    gmx_bool bOneLambda = TRUE;
 +
 +    snew(count_fep_lambdas,efptNR);
 +
 +    /* FEP input processing */
 +    /* first, identify the number of lambda values for each type.
 +       All that are nonzero must have the same number */
 +
 +    for (i=0;i<efptNR;i++)
 +    {
 +        parse_n_real(fep_lambda[i],&(nfep[i]),&(count_fep_lambdas[i]));
 +    }
 +
 +    /* now, determine the number of components.  All must be either zero, or equal. */
 +
 +    max_n_lambda = 0;
 +    for (i=0;i<efptNR;i++)
 +    {
 +        if (nfep[i] > max_n_lambda) {
 +            max_n_lambda = nfep[i];  /* here's a nonzero one.  All of them
 +                                        must have the same number if its not zero.*/
 +            break;
 +        }
 +    }
 +
 +    for (i=0;i<efptNR;i++)
 +    {
 +        if (nfep[i] == 0)
 +        {
 +            ir->fepvals->separate_dvdl[i] = FALSE;
 +        }
 +        else if (nfep[i] == max_n_lambda)
 +        {
 +            if (i!=efptTEMPERATURE)  /* we treat this differently -- not really a reason to compute the derivative with
 +                                        respect to the temperature currently */
 +            {
 +                ir->fepvals->separate_dvdl[i] = TRUE;
 +            }
 +        }
 +        else
 +        {
 +            gmx_fatal(FARGS,"Number of lambdas (%d) for FEP type %s not equal to number of other types (%d)",
 +                      nfep[i],efpt_names[i],max_n_lambda);
 +        }
 +    }
 +    /* we don't print out dhdl if the temperature is changing, since we can't correctly define dhdl in this case */
 +    ir->fepvals->separate_dvdl[efptTEMPERATURE] = FALSE;
 +
 +    /* the number of lambdas is the number we've read in, which is either zero
 +       or the same for all */
 +    fep->n_lambda = max_n_lambda;
 +
 +    /* allocate space for the array of lambda values */
 +    snew(fep->all_lambda,efptNR);
 +    /* if init_lambda is defined, we need to set lambda */
 +    if ((fep->init_lambda > 0) && (fep->n_lambda == 0))
 +    {
 +        ir->fepvals->separate_dvdl[efptFEP] = TRUE;
 +    }
 +    /* otherwise allocate the space for all of the lambdas, and transfer the data */
 +    for (i=0;i<efptNR;i++)
 +    {
 +        snew(fep->all_lambda[i],fep->n_lambda);
 +        if (nfep[i] > 0)  /* if it's zero, then the count_fep_lambda arrays
 +                             are zero */
 +        {
 +            for (j=0;j<fep->n_lambda;j++)
 +            {
 +                fep->all_lambda[i][j] = (double)count_fep_lambdas[i][j];
 +            }
 +            sfree(count_fep_lambdas[i]);
 +        }
 +    }
 +    sfree(count_fep_lambdas);
 +
 +    /* "fep-vals" is either zero or the full number. If zero, we'll need to define fep-lambdas for internal
 +       bookkeeping -- for now, init_lambda */
 +
 +    if ((nfep[efptFEP] == 0) && (fep->init_lambda >= 0) && (fep->init_lambda <= 1))
 +    {
 +        for (i=0;i<fep->n_lambda;i++)
 +        {
 +            fep->all_lambda[efptFEP][i] = fep->init_lambda;
 +        }
 +    }
 +
 +    /* check to see if only a single component lambda is defined, and soft core is defined.
 +       In this case, turn on coulomb soft core */
 +
 +    if (max_n_lambda == 0)
 +    {
 +        bOneLambda = TRUE;
 +    }
 +    else
 +    {
 +        for (i=0;i<efptNR;i++)
 +        {
 +            if ((nfep[i] != 0) && (i!=efptFEP))
 +            {
 +                bOneLambda = FALSE;
 +            }
 +        }
 +    }
 +    if ((bOneLambda) && (fep->sc_alpha > 0))
 +    {
 +        fep->bScCoul = TRUE;
 +    }
 +
 +    /* Fill in the others with the efptFEP if they are not explicitly
 +       specified (i.e. nfep[i] == 0).  This means if fep is not defined,
 +       they are all zero. */
 +
 +    for (i=0;i<efptNR;i++)
 +    {
 +        if ((nfep[i] == 0) && (i!=efptFEP))
 +        {
 +            for (j=0;j<fep->n_lambda;j++)
 +            {
 +                fep->all_lambda[i][j] = fep->all_lambda[efptFEP][j];
 +            }
 +        }
 +    }
 +
 +
 +    /* make it easier if sc_r_power = 48 by increasing it to the 4th power, to be in the right scale. */
 +    if (fep->sc_r_power == 48)
 +    {
 +        if (fep->sc_alpha > 0.1)
 +        {
 +            gmx_fatal(FARGS,"sc_alpha (%f) for sc_r_power = 48 should usually be between 0.001 and 0.004", fep->sc_alpha);
 +        }
 +    }
 +
 +    expand = ir->expandedvals;
 +    /* now read in the weights */
 +    parse_n_real(weights,&nweights,&(expand->init_lambda_weights));
 +    if (nweights == 0)
 +    {
 +        expand->bInit_weights = FALSE;
 +        snew(expand->init_lambda_weights,fep->n_lambda); /* initialize to zero */
 +    }
 +    else if (nweights != fep->n_lambda)
 +    {
 +        gmx_fatal(FARGS,"Number of weights (%d) is not equal to number of lambda values (%d)",
 +                  nweights,fep->n_lambda);
 +    }
 +    else
 +    {
 +        expand->bInit_weights = TRUE;
 +    }
 +    if ((expand->nstexpanded < 0) && (ir->efep != efepNO)) {
 +        expand->nstexpanded = fep->nstdhdl;
 +        /* if you don't specify nstexpanded when doing expanded ensemble free energy calcs, it is set to nstdhdl */
 +    }
 +    if ((expand->nstexpanded < 0) && ir->bSimTemp) {
 +        expand->nstexpanded = 2*(int)(ir->opts.tau_t[0]/ir->delta_t);
 +        /* if you don't specify nstexpanded when doing expanded ensemble simulated tempering, it is set to
 +           2*tau_t just to be careful so it's not to frequent  */
 +    }
 +}
 +
 +
 +static void do_simtemp_params(t_inputrec *ir) {
 +
 +    snew(ir->simtempvals->temperatures,ir->fepvals->n_lambda);
 +    GetSimTemps(ir->fepvals->n_lambda,ir->simtempvals,ir->fepvals->all_lambda[efptTEMPERATURE]);
 +
 +    return;
 +}
 +
 +static void do_wall_params(t_inputrec *ir,
 +                           char *wall_atomtype, char *wall_density,
 +                           t_gromppopts *opts)
 +{
 +    int  nstr,i;
 +    char *names[MAXPTR];
 +    double dbl;
 +
 +    opts->wall_atomtype[0] = NULL;
 +    opts->wall_atomtype[1] = NULL;
 +
 +    ir->wall_atomtype[0] = -1;
 +    ir->wall_atomtype[1] = -1;
 +    ir->wall_density[0] = 0;
 +    ir->wall_density[1] = 0;
 +  
 +    if (ir->nwall > 0)
 +    {
 +        nstr = str_nelem(wall_atomtype,MAXPTR,names);
 +        if (nstr != ir->nwall)
 +        {
 +            gmx_fatal(FARGS,"Expected %d elements for wall_atomtype, found %d",
 +                      ir->nwall,nstr);
 +        }
 +        for(i=0; i<ir->nwall; i++)
 +        {
 +            opts->wall_atomtype[i] = strdup(names[i]);
 +        }
 +    
 +        if (ir->wall_type == ewt93 || ir->wall_type == ewt104) {
 +            nstr = str_nelem(wall_density,MAXPTR,names);
 +            if (nstr != ir->nwall)
 +            {
 +                gmx_fatal(FARGS,"Expected %d elements for wall-density, found %d",ir->nwall,nstr);
 +            }
 +            for(i=0; i<ir->nwall; i++)
 +            {
 +                sscanf(names[i],"%lf",&dbl);
 +                if (dbl <= 0)
 +                {
 +                    gmx_fatal(FARGS,"wall-density[%d] = %f\n",i,dbl);
 +                }
 +                ir->wall_density[i] = dbl;
 +            }
 +        }
 +    }
 +}
 +
 +static void add_wall_energrps(gmx_groups_t *groups,int nwall,t_symtab *symtab)
 +{
 +  int  i;
 +  t_grps *grps;
 +  char str[STRLEN];
 +  
 +  if (nwall > 0) {
 +    srenew(groups->grpname,groups->ngrpname+nwall);
 +    grps = &(groups->grps[egcENER]);
 +    srenew(grps->nm_ind,grps->nr+nwall);
 +    for(i=0; i<nwall; i++) {
 +      sprintf(str,"wall%d",i);
 +      groups->grpname[groups->ngrpname] = put_symtab(symtab,str);
 +      grps->nm_ind[grps->nr++] = groups->ngrpname++;
 +    }
 +  }
 +}
 +
 +/* Read the expanded-ensemble settings into *expand.
 +   ninp_p = in/out: number of entries in the input list
 +   inp_p  = in/out: the input list itself
 +   expand = receives the values that are read
 +   wi     = warning handle
 +   NOTE(review): the CCTYPE/ITYPE/EETYPE/RTYPE macros are defined outside
 +   this function; they presumably operate on the locals ninp, inp and wi
 +   by name, and the last ITYPE/RTYPE argument is presumably the default
 +   value -- confirm against the macro definitions. */
 +void read_expandedparams(int *ninp_p,t_inpfile **inp_p,
 +                         t_expanded *expand,warninp_t wi)
 +{
 +  int  ninp,nerror=0;
 +  t_inpfile *inp;
 +
 +  /* work on local copies; they are written back at the end */
 +  ninp   = *ninp_p;
 +  inp    = *inp_p;
 +
 +  /* read expanded ensemble parameters */
 +  CCTYPE ("expanded ensemble variables");
 +  ITYPE ("nstexpanded",expand->nstexpanded,-1);
 +  EETYPE("lmc-stats", expand->elamstats, elamstats_names);
 +  EETYPE("lmc-move", expand->elmcmove, elmcmove_names);
 +  EETYPE("lmc-weights-equil",expand->elmceq,elmceq_names);
 +  ITYPE ("weight-equil-number-all-lambda",expand->equil_n_at_lam,-1);
 +  ITYPE ("weight-equil-number-samples",expand->equil_samples,-1);
 +  ITYPE ("weight-equil-number-steps",expand->equil_steps,-1);
 +  RTYPE ("weight-equil-wl-delta",expand->equil_wl_delta,-1);
 +  RTYPE ("weight-equil-count-ratio",expand->equil_ratio,-1);
 +  CCTYPE("Seed for Monte Carlo in lambda space");
 +  ITYPE ("lmc-seed",expand->lmc_seed,-1);
 +  RTYPE ("mc-temperature",expand->mc_temp,-1);
 +  ITYPE ("lmc-repeats",expand->lmc_repeats,1);
 +  ITYPE ("lmc-gibbsdelta",expand->gibbsdeltalam,-1);
 +  ITYPE ("lmc-forced-nstart",expand->lmc_forced_nstart,0);
 +  EETYPE("symmetrized-transition-matrix", expand->bSymmetrizedTMatrix, yesno_names);
 +  ITYPE("nst-transition-matrix", expand->nstTij, -1);
 +  ITYPE ("mininum-var-min",expand->minvarmin, 100); /*default is reasonable */
 +  ITYPE ("weight-c-range",expand->c_range, 0); /* default is just C=0 */
 +  RTYPE ("wl-scale",expand->wl_scale,0.8);
 +  RTYPE ("wl-ratio",expand->wl_ratio,0.8);
 +  RTYPE ("init-wl-delta",expand->init_wl_delta,1.0);
 +  EETYPE("wl-oneovert",expand->bWLoneovert,yesno_names);
 +
 +  /* hand the (possibly updated) list and its length back to the caller */
 +  *ninp_p   = ninp;
 +  *inp_p    = inp;
 +
 +  return;
 +}
 +
 +void get_ir(const char *mdparin,const char *mdparout,
 +            t_inputrec *ir,t_gromppopts *opts,
 +            warninp_t wi)
 +{
 +  char      *dumstr[2];
 +  double    dumdub[2][6];
 +  t_inpfile *inp;
 +  const char *tmp;
 +  int       i,j,m,ninp;
 +  char      warn_buf[STRLEN];
 +  t_lambda  *fep = ir->fepvals;
 +  t_expanded *expand = ir->expandedvals;
 +
 +  inp = read_inpfile(mdparin, &ninp, NULL, wi);
 +
 +  snew(dumstr[0],STRLEN);
 +  snew(dumstr[1],STRLEN);
 +
 +  /* remove the following deprecated commands */
 +  REM_TYPE("title");
 +  REM_TYPE("cpp");
 +  REM_TYPE("domain-decomposition");
 +  REM_TYPE("andersen-seed");
 +  REM_TYPE("dihre");
 +  REM_TYPE("dihre-fc");
 +  REM_TYPE("dihre-tau");
 +  REM_TYPE("nstdihreout");
 +  REM_TYPE("nstcheckpoint");
 +
 +  /* replace the following commands with the clearer new versions*/
 +  REPL_TYPE("unconstrained-start","continuation");
 +  REPL_TYPE("foreign-lambda","fep-lambdas");
 +
 +  CCTYPE ("VARIOUS PREPROCESSING OPTIONS");
 +  CTYPE ("Preprocessor information: use cpp syntax.");
 +  CTYPE ("e.g.: -I/home/joe/doe -I/home/mary/roe");
 +  STYPE ("include",   opts->include,  NULL);
 +  CTYPE ("e.g.: -DPOSRES -DFLEXIBLE (note these variable names are case sensitive)");
 +  STYPE ("define",    opts->define,   NULL);
 +    
 +  CCTYPE ("RUN CONTROL PARAMETERS");
 +  EETYPE("integrator",  ir->eI,         ei_names);
 +  CTYPE ("Start time and timestep in ps");
 +  RTYPE ("tinit",     ir->init_t,     0.0);
 +  RTYPE ("dt",                ir->delta_t,    0.001);
 +  STEPTYPE ("nsteps",   ir->nsteps,     0);
 +  CTYPE ("For exact run continuation or redoing part of a run");
 +  STEPTYPE ("init-step",ir->init_step,  0);
 +  CTYPE ("Part index is updated automatically on checkpointing (keeps files separate)");
 +  ITYPE ("simulation-part", ir->simulation_part, 1);
 +  CTYPE ("mode for center of mass motion removal");
 +  EETYPE("comm-mode",   ir->comm_mode,  ecm_names);
 +  CTYPE ("number of steps for center of mass motion removal");
 +  ITYPE ("nstcomm",   ir->nstcomm,    100);
 +  CTYPE ("group(s) for center of mass motion removal");
 +  STYPE ("comm-grps",   vcm,            NULL);
 +  
 +  CCTYPE ("LANGEVIN DYNAMICS OPTIONS");
 +  CTYPE ("Friction coefficient (amu/ps) and random seed");
 +  RTYPE ("bd-fric",     ir->bd_fric,    0.0);
 +  ITYPE ("ld-seed",     ir->ld_seed,    1993);
 +  
 +  /* Em stuff */
 +  CCTYPE ("ENERGY MINIMIZATION OPTIONS");
 +  CTYPE ("Force tolerance and initial step-size");
 +  RTYPE ("emtol",       ir->em_tol,     10.0);
 +  RTYPE ("emstep",      ir->em_stepsize,0.01);
 +  CTYPE ("Max number of iterations in relax-shells");
 +  ITYPE ("niter",       ir->niter,      20);
 +  CTYPE ("Step size (ps^2) for minimization of flexible constraints");
 +  RTYPE ("fcstep",      ir->fc_stepsize, 0);
 +  CTYPE ("Frequency of steepest descents steps when doing CG");
 +  ITYPE ("nstcgsteep",        ir->nstcgsteep, 1000);
 +  ITYPE ("nbfgscorr",   ir->nbfgscorr,  10); 
 +
 +  CCTYPE ("TEST PARTICLE INSERTION OPTIONS");
 +  RTYPE ("rtpi",      ir->rtpi,       0.05);
 +
 +  /* Output options */
 +  CCTYPE ("OUTPUT CONTROL OPTIONS");
 +  CTYPE ("Output frequency for coords (x), velocities (v) and forces (f)");
 +  ITYPE ("nstxout",   ir->nstxout,    0);
 +  ITYPE ("nstvout",   ir->nstvout,    0);
 +  ITYPE ("nstfout",   ir->nstfout,    0);
 +  ir->nstcheckpoint = 1000;
 +  CTYPE ("Output frequency for energies to log file and energy file");
 +  ITYPE ("nstlog",    ir->nstlog,     1000);
 +  ITYPE ("nstcalcenergy",ir->nstcalcenergy,   100);
 +  ITYPE ("nstenergy",   ir->nstenergy,  1000);
 +  CTYPE ("Output frequency and precision for .xtc file");
 +  ITYPE ("nstxtcout",   ir->nstxtcout,  0);
 +  RTYPE ("xtc-precision",ir->xtcprec,   1000.0);
 +  CTYPE ("This selects the subset of atoms for the .xtc file. You can");
 +  CTYPE ("select multiple groups. By default all atoms will be written.");
 +  STYPE ("xtc-grps",    xtc_grps,       NULL);
 +  CTYPE ("Selection of energy groups");
 +  STYPE ("energygrps",  energy,         NULL);
 +
 +  /* Neighbor searching */  
 +  CCTYPE ("NEIGHBORSEARCHING PARAMETERS");
 +  CTYPE ("cut-off scheme (group: using charge groups, Verlet: particle based cut-offs)");
 +  EETYPE("cutoff-scheme",     ir->cutoff_scheme,    ecutscheme_names);
 +  CTYPE ("nblist update frequency");
 +  ITYPE ("nstlist",   ir->nstlist,    10);
 +  CTYPE ("ns algorithm (simple or grid)");
 +  EETYPE("ns-type",     ir->ns_type,    ens_names);
 +  /* set ndelta to the optimal value of 2 */
 +  ir->ndelta = 2;
 +  CTYPE ("Periodic boundary conditions: xyz, no, xy");
 +  EETYPE("pbc",         ir->ePBC,       epbc_names);
 +  EETYPE("periodic-molecules", ir->bPeriodicMols, yesno_names);
 +  CTYPE ("Allowed energy drift due to the Verlet buffer in kJ/mol/ps per atom,");
 +  CTYPE ("a value of -1 means: use rlist");
 +  RTYPE("verlet-buffer-drift", ir->verletbuf_drift,    0.005);
 +  CTYPE ("nblist cut-off");
 +  RTYPE ("rlist",     ir->rlist,      -1);
 +  CTYPE ("long-range cut-off for switched potentials");
 +  RTYPE ("rlistlong", ir->rlistlong,  -1);
 +  ITYPE ("nstcalclr", ir->nstcalclr,  -1);
 +
 +  /* Electrostatics */
 +  CCTYPE ("OPTIONS FOR ELECTROSTATICS AND VDW");
 +  CTYPE ("Method for doing electrostatics");
 +  EETYPE("coulombtype",       ir->coulombtype,    eel_names);
 +  EETYPE("coulomb-modifier",  ir->coulomb_modifier,    eintmod_names);
 +  CTYPE ("cut-off lengths");
 +  RTYPE ("rcoulomb-switch",   ir->rcoulomb_switch,    0.0);
 +  RTYPE ("rcoulomb",  ir->rcoulomb,   -1);
 +  CTYPE ("Relative dielectric constant for the medium and the reaction field");
 +  RTYPE ("epsilon-r",   ir->epsilon_r,  1.0);
 +  RTYPE ("epsilon-rf",  ir->epsilon_rf, 0.0);
 +  CTYPE ("Method for doing Van der Waals");
 +  EETYPE("vdw-type",  ir->vdwtype,    evdw_names);
 +  EETYPE("vdw-modifier",      ir->vdw_modifier,    eintmod_names);
 +  CTYPE ("cut-off lengths");
 +  RTYPE ("rvdw-switch",       ir->rvdw_switch,        0.0);
 +  RTYPE ("rvdw",      ir->rvdw,       -1);
 +  CTYPE ("Apply long range dispersion corrections for Energy and Pressure");
 +  EETYPE("DispCorr",    ir->eDispCorr,  edispc_names);
 +  CTYPE ("Extension of the potential lookup tables beyond the cut-off");
 +  RTYPE ("table-extension", ir->tabext, 1.0);
 +  CTYPE ("Seperate tables between energy group pairs");
 +  STYPE ("energygrp-table", egptable,   NULL);
 +  CTYPE ("Spacing for the PME/PPPM FFT grid");
 +  RTYPE ("fourierspacing", ir->fourier_spacing,0.12);
 +  CTYPE ("FFT grid size, when a value is 0 fourierspacing will be used");
 +  ITYPE ("fourier-nx",  ir->nkx,         0);
 +  ITYPE ("fourier-ny",  ir->nky,         0);
 +  ITYPE ("fourier-nz",  ir->nkz,         0);
 +  CTYPE ("EWALD/PME/PPPM parameters");
 +  ITYPE ("pme-order",   ir->pme_order,   4);
 +  RTYPE ("ewald-rtol",  ir->ewald_rtol, 0.00001);
 +  EETYPE("ewald-geometry", ir->ewald_geometry, eewg_names);
 +  RTYPE ("epsilon-surface", ir->epsilon_surface, 0.0);
 +  EETYPE("optimize-fft",ir->bOptFFT,  yesno_names);
 +
 +  CCTYPE("IMPLICIT SOLVENT ALGORITHM");
 +  EETYPE("implicit-solvent", ir->implicit_solvent, eis_names);
 +      
 +  CCTYPE ("GENERALIZED BORN ELECTROSTATICS"); 
 +  CTYPE ("Algorithm for calculating Born radii");
 +  EETYPE("gb-algorithm", ir->gb_algorithm, egb_names);
 +  CTYPE ("Frequency of calculating the Born radii inside rlist");
 +  ITYPE ("nstgbradii", ir->nstgbradii, 1);
 +  CTYPE ("Cutoff for Born radii calculation; the contribution from atoms");
 +  CTYPE ("between rlist and rgbradii is updated every nstlist steps");
 +  RTYPE ("rgbradii",  ir->rgbradii, 1.0);
 +  CTYPE ("Dielectric coefficient of the implicit solvent");
 +  RTYPE ("gb-epsilon-solvent",ir->gb_epsilon_solvent, 80.0);
 +  CTYPE ("Salt concentration in M for Generalized Born models");
 +  RTYPE ("gb-saltconc",  ir->gb_saltconc, 0.0);
 +  CTYPE ("Scaling factors used in the OBC GB model. Default values are OBC(II)");
 +  RTYPE ("gb-obc-alpha", ir->gb_obc_alpha, 1.0);
 +  RTYPE ("gb-obc-beta", ir->gb_obc_beta, 0.8);
 +  RTYPE ("gb-obc-gamma", ir->gb_obc_gamma, 4.85);
 +  RTYPE ("gb-dielectric-offset", ir->gb_dielectric_offset, 0.009);
 +  EETYPE("sa-algorithm", ir->sa_algorithm, esa_names);
 +  CTYPE ("Surface tension (kJ/mol/nm^2) for the SA (nonpolar surface) part of GBSA");
 +  CTYPE ("The value -1 will set default value for Still/HCT/OBC GB-models.");
 +  RTYPE ("sa-surface-tension", ir->sa_surface_tension, -1);
 +               
 +  /* Coupling stuff */
 +  CCTYPE ("OPTIONS FOR WEAK COUPLING ALGORITHMS");
 +  CTYPE ("Temperature coupling");
 +  EETYPE("tcoupl",    ir->etc,        etcoupl_names);
 +  ITYPE ("nsttcouple", ir->nsttcouple,  -1);
 +  ITYPE("nh-chain-length",     ir->opts.nhchainlength, NHCHAINLENGTH);
 +  EETYPE("print-nose-hoover-chain-variables", ir->bPrintNHChains, yesno_names);
 +  CTYPE ("Groups to couple separately");
 +  STYPE ("tc-grps",     tcgrps,         NULL);
 +  CTYPE ("Time constant (ps) and reference temperature (K)");
 +  STYPE ("tau-t",     tau_t,          NULL);
 +  STYPE ("ref-t",     ref_t,          NULL);
 +  CTYPE ("pressure coupling");
 +  EETYPE("pcoupl",    ir->epc,        epcoupl_names);
 +  EETYPE("pcoupltype",        ir->epct,       epcoupltype_names);
 +  ITYPE ("nstpcouple", ir->nstpcouple,  -1);
 +  CTYPE ("Time constant (ps), compressibility (1/bar) and reference P (bar)");
 +  RTYPE ("tau-p",     ir->tau_p,      1.0);
 +  STYPE ("compressibility",   dumstr[0],      NULL);
 +  STYPE ("ref-p",       dumstr[1],      NULL);
 +  CTYPE ("Scaling of reference coordinates, No, All or COM");
 +  EETYPE ("refcoord-scaling",ir->refcoord_scaling,erefscaling_names);
 +
 +  /* QMMM */
 +  CCTYPE ("OPTIONS FOR QMMM calculations");
 +  EETYPE("QMMM", ir->bQMMM, yesno_names);
 +  CTYPE ("Groups treated Quantum Mechanically");
 +  STYPE ("QMMM-grps",  QMMM,          NULL);
 +  CTYPE ("QM method");
 +  STYPE("QMmethod",     QMmethod, NULL);
 +  CTYPE ("QMMM scheme");
 +  EETYPE("QMMMscheme",  ir->QMMMscheme,    eQMMMscheme_names);
 +  CTYPE ("QM basisset");
 +  STYPE("QMbasis",      QMbasis, NULL);
 +  CTYPE ("QM charge");
 +  STYPE ("QMcharge",    QMcharge,NULL);
 +  CTYPE ("QM multiplicity");
 +  STYPE ("QMmult",      QMmult,NULL);
 +  CTYPE ("Surface Hopping");
 +  STYPE ("SH",          bSH, NULL);
 +  CTYPE ("CAS space options");
 +  STYPE ("CASorbitals",      CASorbitals,   NULL);
 +  STYPE ("CASelectrons",     CASelectrons,  NULL);
 +  STYPE ("SAon", SAon, NULL);
 +  STYPE ("SAoff",SAoff,NULL);
 +  STYPE ("SAsteps",  SAsteps, NULL);
 +  CTYPE ("Scale factor for MM charges");
 +  RTYPE ("MMChargeScaleFactor", ir->scalefactor, 1.0);
 +  CTYPE ("Optimization of QM subsystem");
 +  STYPE ("bOPT",          bOPT, NULL);
 +  STYPE ("bTS",          bTS, NULL);
 +
 +  /* Simulated annealing */
 +  CCTYPE("SIMULATED ANNEALING");
 +  CTYPE ("Type of annealing for each temperature group (no/single/periodic)");
 +  STYPE ("annealing",   anneal,      NULL);
 +  CTYPE ("Number of time points to use for specifying annealing in each group");
 +  STYPE ("annealing-npoints", anneal_npoints, NULL);
 +  CTYPE ("List of times at the annealing points for each group");
 +  STYPE ("annealing-time",       anneal_time,       NULL);
 +  CTYPE ("Temp. at each annealing point, for each group.");
 +  STYPE ("annealing-temp",  anneal_temp,  NULL);
 +  
 +  /* Startup run */
 +  CCTYPE ("GENERATE VELOCITIES FOR STARTUP RUN");
 +  EETYPE("gen-vel",     opts->bGenVel,  yesno_names);
 +  RTYPE ("gen-temp",    opts->tempi,    300.0);
 +  ITYPE ("gen-seed",    opts->seed,     173529);
 +  
 +  /* Shake stuff */
 +  CCTYPE ("OPTIONS FOR BONDS");
 +  EETYPE("constraints",       opts->nshake,   constraints);
 +  CTYPE ("Type of constraint algorithm");
 +  EETYPE("constraint-algorithm",  ir->eConstrAlg, econstr_names);
 +  CTYPE ("Do not constrain the start configuration");
 +  EETYPE("continuation", ir->bContinuation, yesno_names);
 +  CTYPE ("Use successive overrelaxation to reduce the number of shake iterations");
 +  EETYPE("Shake-SOR", ir->bShakeSOR, yesno_names);
 +  CTYPE ("Relative tolerance of shake");
 +  RTYPE ("shake-tol", ir->shake_tol, 0.0001);
 +  CTYPE ("Highest order in the expansion of the constraint coupling matrix");
 +  ITYPE ("lincs-order", ir->nProjOrder, 4);
 +  CTYPE ("Number of iterations in the final step of LINCS. 1 is fine for");
 +  CTYPE ("normal simulations, but use 2 to conserve energy in NVE runs.");
 +  CTYPE ("For energy minimization with constraints it should be 4 to 8.");
 +  ITYPE ("lincs-iter", ir->nLincsIter, 1);
 +  CTYPE ("Lincs will write a warning to the stderr if in one step a bond"); 
 +  CTYPE ("rotates over more degrees than");
 +  RTYPE ("lincs-warnangle", ir->LincsWarnAngle, 30.0);
 +  CTYPE ("Convert harmonic bonds to morse potentials");
 +  EETYPE("morse",       opts->bMorse,yesno_names);
 +
 +  /* Energy group exclusions */
 +  CCTYPE ("ENERGY GROUP EXCLUSIONS");
 +  CTYPE ("Pairs of energy groups for which all non-bonded interactions are excluded");
 +  STYPE ("energygrp-excl", egpexcl,     NULL);
 +  
 +  /* Walls */
 +  CCTYPE ("WALLS");
 +  CTYPE ("Number of walls, type, atom types, densities and box-z scale factor for Ewald");
 +  ITYPE ("nwall", ir->nwall, 0);
 +  EETYPE("wall-type",     ir->wall_type,   ewt_names);
 +  RTYPE ("wall-r-linpot", ir->wall_r_linpot, -1);
 +  STYPE ("wall-atomtype", wall_atomtype, NULL);
 +  STYPE ("wall-density",  wall_density,  NULL);
 +  RTYPE ("wall-ewald-zfac", ir->wall_ewald_zfac, 3);
 +  
 +  /* COM pulling */
 +  CCTYPE("COM PULLING");
 +  CTYPE("Pull type: no, umbrella, constraint or constant-force");
 +  EETYPE("pull",          ir->ePull, epull_names);
 +  if (ir->ePull != epullNO) {
 +    snew(ir->pull,1);
 +    pull_grp = read_pullparams(&ninp,&inp,ir->pull,&opts->pull_start,wi);
 +  }
 +  
 +  /* Enforced rotation */
 +  CCTYPE("ENFORCED ROTATION");
 +  CTYPE("Enforced rotation: No or Yes");
 +  EETYPE("rotation",       ir->bRot, yesno_names);
 +  if (ir->bRot) {
 +    snew(ir->rot,1);
 +    rot_grp = read_rotparams(&ninp,&inp,ir->rot,wi);
 +  }
 +
 +  /* Refinement */
 +  CCTYPE("NMR refinement stuff");
 +  CTYPE ("Distance restraints type: No, Simple or Ensemble");
 +  EETYPE("disre",       ir->eDisre,     edisre_names);
 +  CTYPE ("Force weighting of pairs in one distance restraint: Conservative or Equal");
 +  EETYPE("disre-weighting", ir->eDisreWeighting, edisreweighting_names);
 +  CTYPE ("Use sqrt of the time averaged times the instantaneous violation");
 +  EETYPE("disre-mixed", ir->bDisreMixed, yesno_names);
 +  RTYPE ("disre-fc",  ir->dr_fc,      1000.0);
 +  RTYPE ("disre-tau", ir->dr_tau,     0.0);
 +  CTYPE ("Output frequency for pair distances to energy file");
 +  ITYPE ("nstdisreout", ir->nstdisreout, 100);
 +  CTYPE ("Orientation restraints: No or Yes");
 +  EETYPE("orire",       opts->bOrire,   yesno_names);
 +  CTYPE ("Orientation restraints force constant and tau for time averaging");
 +  RTYPE ("orire-fc",  ir->orires_fc,  0.0);
 +  RTYPE ("orire-tau", ir->orires_tau, 0.0);
 +  STYPE ("orire-fitgrp",orirefitgrp,    NULL);
 +  CTYPE ("Output frequency for trace(SD) and S to energy file");
 +  ITYPE ("nstorireout", ir->nstorireout, 100);
 +
 +  /* free energy variables */
 +  CCTYPE ("Free energy variables");
 +  EETYPE("free-energy", ir->efep, efep_names);
 +  STYPE ("couple-moltype",  couple_moltype,  NULL);
 +  EETYPE("couple-lambda0", opts->couple_lam0, couple_lam);
 +  EETYPE("couple-lambda1", opts->couple_lam1, couple_lam);
 +  EETYPE("couple-intramol", opts->bCoupleIntra, yesno_names);
 +
 +  RTYPE ("init-lambda", fep->init_lambda,-1); /* start with -1 so
 +                                                 we can recognize if
 +                                                 it was not entered */
 +  ITYPE ("init-lambda-state", fep->init_fep_state,0);
 +  RTYPE ("delta-lambda",fep->delta_lambda,0.0);
 +  ITYPE ("nstdhdl",fep->nstdhdl, 100);
 +  STYPE ("fep-lambdas", fep_lambda[efptFEP], NULL);
 +  STYPE ("mass-lambdas", fep_lambda[efptMASS], NULL);
 +  STYPE ("coul-lambdas", fep_lambda[efptCOUL], NULL);
 +  STYPE ("vdw-lambdas", fep_lambda[efptVDW], NULL);
 +  STYPE ("bonded-lambdas", fep_lambda[efptBONDED], NULL);
 +  STYPE ("restraint-lambdas", fep_lambda[efptRESTRAINT], NULL);
 +  STYPE ("temperature-lambdas", fep_lambda[efptTEMPERATURE], NULL);
 +  STYPE ("init-lambda-weights",lambda_weights,NULL);
 +  EETYPE("dhdl-print-energy", fep->bPrintEnergy, yesno_names);
 +  RTYPE ("sc-alpha",fep->sc_alpha,0.0);
 +  ITYPE ("sc-power",fep->sc_power,1);
 +  RTYPE ("sc-r-power",fep->sc_r_power,6.0);
 +  RTYPE ("sc-sigma",fep->sc_sigma,0.3);
 +  EETYPE("sc-coul",fep->bScCoul,yesno_names);
 +  ITYPE ("dh_hist_size", fep->dh_hist_size, 0);
 +  RTYPE ("dh_hist_spacing", fep->dh_hist_spacing, 0.1);
 +  EETYPE("separate-dhdl-file", fep->separate_dhdl_file,
 +                               separate_dhdl_file_names);
 +  EETYPE("dhdl-derivatives", fep->dhdl_derivatives, dhdl_derivatives_names);
 +  ITYPE ("dh_hist_size", fep->dh_hist_size, 0);
 +  RTYPE ("dh_hist_spacing", fep->dh_hist_spacing, 0.1);
 +
 +  /* Non-equilibrium MD stuff */  
 +  CCTYPE("Non-equilibrium MD stuff");
 +  STYPE ("acc-grps",    accgrps,        NULL);
 +  STYPE ("accelerate",  acc,            NULL);
 +  STYPE ("freezegrps",  freeze,         NULL);
 +  STYPE ("freezedim",   frdim,          NULL);
 +  RTYPE ("cos-acceleration", ir->cos_accel, 0);
 +  STYPE ("deform",      deform,         NULL);
 +
 +  /* simulated tempering variables */
 +  CCTYPE("simulated tempering variables");
 +  EETYPE("simulated-tempering",ir->bSimTemp,yesno_names);
 +  EETYPE("simulated-tempering-scaling",ir->simtempvals->eSimTempScale,esimtemp_names);
 +  RTYPE("sim-temp-low",ir->simtempvals->simtemp_low,300.0);
 +  RTYPE("sim-temp-high",ir->simtempvals->simtemp_high,300.0);
 +
 +  /* expanded ensemble variables */
 +  if (ir->efep==efepEXPANDED || ir->bSimTemp)
 +  {
 +      read_expandedparams(&ninp,&inp,expand,wi);
 +  }
 +
 +  /* Electric fields */
 +  CCTYPE("Electric fields");
 +  CTYPE ("Format is number of terms (int) and for all terms an amplitude (real)");
 +  CTYPE ("and a phase angle (real)");
 +  STYPE ("E-x",       efield_x,       NULL);
 +  STYPE ("E-xt",      efield_xt,      NULL);
 +  STYPE ("E-y",       efield_y,       NULL);
 +  STYPE ("E-yt",      efield_yt,      NULL);
 +  STYPE ("E-z",       efield_z,       NULL);
 +  STYPE ("E-zt",      efield_zt,      NULL);
 +  
 +  /* AdResS defined thingies */
 +  CCTYPE ("AdResS parameters");
 +  EETYPE("adress",       ir->bAdress, yesno_names);
 +  if (ir->bAdress) {
 +    snew(ir->adress,1);
 +    read_adressparams(&ninp,&inp,ir->adress,wi);
 +  }
 +
 +  /* User defined thingies */
 +  CCTYPE ("User defined thingies");
 +  STYPE ("user1-grps",  user1,          NULL);
 +  STYPE ("user2-grps",  user2,          NULL);
 +  ITYPE ("userint1",    ir->userint1,   0);
 +  ITYPE ("userint2",    ir->userint2,   0);
 +  ITYPE ("userint3",    ir->userint3,   0);
 +  ITYPE ("userint4",    ir->userint4,   0);
 +  RTYPE ("userreal1",   ir->userreal1,  0);
 +  RTYPE ("userreal2",   ir->userreal2,  0);
 +  RTYPE ("userreal3",   ir->userreal3,  0);
 +  RTYPE ("userreal4",   ir->userreal4,  0);
 +#undef CTYPE
 +
 +  write_inpfile(mdparout,ninp,inp,FALSE,wi);
 +  for (i=0; (i<ninp); i++) {
 +    sfree(inp[i].name);
 +    sfree(inp[i].value);
 +  }
 +  sfree(inp);
 +
 +  /* Process options if necessary */
 +  for(m=0; m<2; m++) {
 +    for(i=0; i<2*DIM; i++)
 +      dumdub[m][i]=0.0;
 +    if(ir->epc) {
 +      switch (ir->epct) {
 +      case epctISOTROPIC:
 +      if (sscanf(dumstr[m],"%lf",&(dumdub[m][XX]))!=1) {
 +        warning_error(wi,"Pressure coupling not enough values (I need 1)");
 +      }
 +      dumdub[m][YY]=dumdub[m][ZZ]=dumdub[m][XX];
 +      break;
 +      case epctSEMIISOTROPIC:
 +      case epctSURFACETENSION:
 +      if (sscanf(dumstr[m],"%lf%lf",
 +                 &(dumdub[m][XX]),&(dumdub[m][ZZ]))!=2) {
 +        warning_error(wi,"Pressure coupling not enough values (I need 2)");
 +      }
 +      dumdub[m][YY]=dumdub[m][XX];
 +      break;
 +      case epctANISOTROPIC:
 +      if (sscanf(dumstr[m],"%lf%lf%lf%lf%lf%lf",
 +                 &(dumdub[m][XX]),&(dumdub[m][YY]),&(dumdub[m][ZZ]),
 +                 &(dumdub[m][3]),&(dumdub[m][4]),&(dumdub[m][5]))!=6) {
 +        warning_error(wi,"Pressure coupling not enough values (I need 6)");
 +      }
 +      break;
 +      default:
 +      gmx_fatal(FARGS,"Pressure coupling type %s not implemented yet",
 +                  epcoupltype_names[ir->epct]);
 +      }
 +    }
 +  }
 +  clear_mat(ir->ref_p);
 +  clear_mat(ir->compress);
 +  for(i=0; i<DIM; i++) {
 +    ir->ref_p[i][i]    = dumdub[1][i];
 +    ir->compress[i][i] = dumdub[0][i];
 +  }
 +  if (ir->epct == epctANISOTROPIC) {
 +    ir->ref_p[XX][YY] = dumdub[1][3];
 +    ir->ref_p[XX][ZZ] = dumdub[1][4];
 +    ir->ref_p[YY][ZZ] = dumdub[1][5];
 +    if (ir->ref_p[XX][YY]!=0 && ir->ref_p[XX][ZZ]!=0 && ir->ref_p[YY][ZZ]!=0) {
 +      warning(wi,"All off-diagonal reference pressures are non-zero. Are you sure you want to apply a threefold shear stress?\n");
 +    }
 +    ir->compress[XX][YY] = dumdub[0][3];
 +    ir->compress[XX][ZZ] = dumdub[0][4];
 +    ir->compress[YY][ZZ] = dumdub[0][5];
 +    for(i=0; i<DIM; i++) {
 +      for(m=0; m<i; m++) {
 +      ir->ref_p[i][m] = ir->ref_p[m][i];
 +      ir->compress[i][m] = ir->compress[m][i];
 +      }
 +    }
 +  } 
 +  
 +  if (ir->comm_mode == ecmNO)
 +    ir->nstcomm = 0;
 +
 +  opts->couple_moltype = NULL;
 +  if (strlen(couple_moltype) > 0) 
 +  {
 +      if (ir->efep != efepNO) 
 +      {
 +          opts->couple_moltype = strdup(couple_moltype);
 +          if (opts->couple_lam0 == opts->couple_lam1)
 +          {
 +              warning(wi,"The lambda=0 and lambda=1 states for coupling are identical");
 +          }
 +          if (ir->eI == eiMD && (opts->couple_lam0 == ecouplamNONE ||
 +                                 opts->couple_lam1 == ecouplamNONE)) 
 +          {
 +              warning(wi,"For proper sampling of the (nearly) decoupled state, stochastic dynamics should be used");
 +          }
 +      }
 +      else
 +      {
 +          warning(wi,"Can not couple a molecule with free_energy = no");
 +      }
 +  }
 +  /* FREE ENERGY AND EXPANDED ENSEMBLE OPTIONS */
 +  if (ir->efep != efepNO) {
 +      if (fep->delta_lambda > 0) {
 +          ir->efep = efepSLOWGROWTH;
 +      }
 +  }
 +
 +  if (ir->bSimTemp) {
 +      fep->bPrintEnergy = TRUE;
 +      /* always print out the energy to dhdl if we are doing expanded ensemble, since we need the total energy
 +         if the temperature is changing. */
 +  }
 +
 +  if ((ir->efep != efepNO) || ir->bSimTemp)
 +  {
 +      ir->bExpanded = FALSE;
 +      if ((ir->efep == efepEXPANDED) || ir->bSimTemp)
 +      {
 +          ir->bExpanded = TRUE;
 +      }
 +      do_fep_params(ir,fep_lambda,lambda_weights);
 +      if (ir->bSimTemp) { /* done after fep params */
 +          do_simtemp_params(ir);
 +      }
 +  }
 +  else
 +  {
 +      ir->fepvals->n_lambda = 0;
 +  }
 +
 +  /* WALL PARAMETERS */
 +
 +  do_wall_params(ir,wall_atomtype,wall_density,opts);
 +
 +  /* ORIENTATION RESTRAINT PARAMETERS */
 +  
 +  if (opts->bOrire && str_nelem(orirefitgrp,MAXPTR,NULL)!=1) {
 +      warning_error(wi,"ERROR: Need one orientation restraint fit group\n");
 +  }
 +
 +  /* DEFORMATION PARAMETERS */
 +
 +  clear_mat(ir->deform);
 +  for(i=0; i<6; i++)
 +  {
 +      dumdub[0][i] = 0;
 +  }
 +  m = sscanf(deform,"%lf %lf %lf %lf %lf %lf",
 +           &(dumdub[0][0]),&(dumdub[0][1]),&(dumdub[0][2]),
 +           &(dumdub[0][3]),&(dumdub[0][4]),&(dumdub[0][5]));
 +  for(i=0; i<3; i++)
 +  {
 +      ir->deform[i][i] = dumdub[0][i];
 +  }
 +  ir->deform[YY][XX] = dumdub[0][3];
 +  ir->deform[ZZ][XX] = dumdub[0][4];
 +  ir->deform[ZZ][YY] = dumdub[0][5];
 +  if (ir->epc != epcNO) {
 +    for(i=0; i<3; i++)
 +      for(j=0; j<=i; j++)
 +      if (ir->deform[i][j]!=0 && ir->compress[i][j]!=0) {
 +        warning_error(wi,"A box element has deform set and compressibility > 0");
 +      }
 +    for(i=0; i<3; i++)
 +      for(j=0; j<i; j++)
 +      if (ir->deform[i][j]!=0) {
 +        for(m=j; m<DIM; m++)
 +          if (ir->compress[m][j]!=0) {
 +            sprintf(warn_buf,"An off-diagonal box element has deform set while compressibility > 0 for the same component of another box vector, this might lead to spurious periodicity effects.");
 +            warning(wi,warn_buf);
 +          }
 +      }
 +  }
 +
 +  sfree(dumstr[0]);
 +  sfree(dumstr[1]);
 +}
 +
 +static int search_QMstring(char *s,int ng,const char *gn[])
 +{
 +  /* same as normal search_string, but this one searches QM strings */
 +  int i;
 +
 +  for(i=0; (i<ng); i++)
 +    if (gmx_strcasecmp(s,gn[i]) == 0)
 +      return i;
 +
 +  gmx_fatal(FARGS,"this QM method or basisset (%s) is not implemented\n!",s);
 +
 +  return -1;
 +
 +} /* search_QMstring */
 +
 +
 +int search_string(char *s,int ng,char *gn[])
 +{
 +  int i;
 +  
 +  for(i=0; (i<ng); i++)
 +  {
 +    if (gmx_strcasecmp(s,gn[i]) == 0)
 +    {
 +      return i;
 +    }
 +  }
 +    
 +  gmx_fatal(FARGS,
 +            "Group %s referenced in the .mdp file was not found in the index file.\n"
 +            "Group names must match either [moleculetype] names or custom index group\n"
 +            "names, in which case you must supply an index file to the '-n' option\n"
 +            "of grompp.",
 +            s);
 +  
 +  return -1;
 +}
 +
 +static gmx_bool do_numbering(int natoms,gmx_groups_t *groups,int ng,char *ptrs[],
 +                         t_blocka *block,char *gnames[],
 +                         int gtype,int restnm,
 +                         int grptp,gmx_bool bVerbose,
 +                         warninp_t wi)
 +{   /* Give each of the natoms atoms a group number for group type gtype.
 +     *
 +     * The ng names in ptrs are looked up in gnames with search_string; the
 +     * atoms of every matching entry of block are numbered in a scratch
 +     * buffer and the matching name indices are stored in
 +     * groups->grps[gtype].nm_ind.
 +     *
 +     * grptp selects how incomplete coverage is handled:
 +     *   egrptpONE  - only the first name is stored and every listed atom
 +     *                gets group number 0;
 +     *   egrptpALL  - gmx_fatal is called when some atom is in no group;
 +     *   egrptpPART - a note is issued through wi and no rest group name
 +     *                is added.
 +     * For every type other than egrptpPART a rest group with name index
 +     * restnm is appended when atoms are left over (announced on stderr
 +     * when bVerbose is set).
 +     *
 +     * An atom index outside 0..natoms-1, or an atom that is listed more
 +     * than once, leads to gmx_fatal.
 +     *
 +     * On return groups->grpnr[gtype] holds one group number per atom, or
 +     * is NULL (with ngrpnr[gtype] == 0) when there is exactly one group and
 +     * either none or all of the atoms were listed explicitly.
 +     *
 +     * Returns TRUE only when grptp is egrptpPART and at least one atom was
 +     * not part of any listed group.
 +     */
 +    unsigned short *cbuf; /* per-atom group number, NOGID while unassigned */
 +    t_grps *grps=&(groups->grps[gtype]);
 +    int    i,j,gid,aj,ognr,ntot=0; /* ntot counts atoms placed in a listed group */
 +    const char *title;
 +    gmx_bool   bRest;
 +    char   warn_buf[STRLEN];
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"Starting numbering %d groups of type %d\n",ng,gtype);
 +    }
 +  
 +    title = gtypes[gtype]; /* used in the error and note messages below */
 +    
 +    snew(cbuf,natoms);
 +    /* Mark all id's as not set */
 +    for(i=0; (i<natoms); i++)
 +    {
 +        cbuf[i] = NOGID;
 +    }
 +  
 +    snew(grps->nm_ind,ng+1); /* +1 for possible rest group */
 +    for(i=0; (i<ng); i++)
 +    {
 +        /* Lookup the group name in the block structure */
 +        gid = search_string(ptrs[i],block->nr,gnames);
 +        if ((grptp != egrptpONE) || (i == 0)) /* egrptpONE stores only the first name */
 +        {
 +            grps->nm_ind[grps->nr++]=gid; /* NOTE(review): appends at the incoming grps->nr; presumably 0 on entry - confirm */
 +        }
 +        if (debug) 
 +        {
 +            fprintf(debug,"Found gid %d for group %s\n",gid,ptrs[i]);
 +        }
 +    
 +        /* Now go over the atoms in the group */
 +        for(j=block->index[gid]; (j<block->index[gid+1]); j++)
 +        {
 +
 +            aj=block->a[j];
 +      
 +            /* Range checking */
 +            if ((aj < 0) || (aj >= natoms)) 
 +            {
 +                gmx_fatal(FARGS,"Invalid atom number %d in indexfile",aj);
 +            }
 +            /* Lookup up the old group number */
 +            ognr = cbuf[aj];
 +            if (ognr != NOGID)
 +            {
 +                gmx_fatal(FARGS,"Atom %d in multiple %s groups (%d and %d)",
 +                          aj+1,title,ognr+1,i+1); /* message uses 1-based numbers */
 +            }
 +            else
 +            {
 +                /* Store the group number in buffer */
 +                if (grptp == egrptpONE)
 +                {
 +                    cbuf[aj] = 0;
 +                }
 +                else
 +                {
 +                    cbuf[aj] = i;
 +                }
 +                ntot++;
 +            }
 +        }
 +    }
 +    
 +    /* Now check whether we have done all atoms */
 +    bRest = FALSE;
 +    if (ntot != natoms)
 +    {
 +        if (grptp == egrptpALL)
 +        {
 +            gmx_fatal(FARGS,"%d atoms are not part of any of the %s groups",
 +                      natoms-ntot,title);
 +        }
 +        else if (grptp == egrptpPART)
 +        {
 +            sprintf(warn_buf,"%d atoms are not part of any of the %s groups",
 +                    natoms-ntot,title);
 +            warning_note(wi,warn_buf);
 +        }
 +        /* Assign all atoms currently unassigned to a rest group */
 +        for(j=0; (j<natoms); j++)
 +        {
 +            if (cbuf[j] == NOGID)
 +            {
 +                cbuf[j] = grps->nr; /* number one past the groups stored so far */
 +                bRest = TRUE;
 +            }
 +        }
 +        if (grptp != egrptpPART)
 +        {
 +            if (bVerbose)
 +            {
 +                fprintf(stderr,
 +                        "Making dummy/rest group for %s containing %d elements\n",
 +                        title,natoms-ntot);
 +            }
 +            /* Add group name "rest" */ 
 +            grps->nm_ind[grps->nr] = restnm;
 +            
 +            /* Assign the rest name to all atoms not currently assigned to a group.
 +             * NOTE(review): the loop above has already replaced every NOGID
 +             * entry, so this loop appears to find nothing left to do.
 +             */
 +            for(j=0; (j<natoms); j++)
 +            {
 +                if (cbuf[j] == NOGID)
 +                {
 +                    cbuf[j] = grps->nr;
 +                }
 +            }
 +            grps->nr++; /* the rest group now counts as a real group */
 +        }
 +    }
 +    
 +    if (grps->nr == 1 && (ntot == 0 || ntot == natoms))
 +    {
 +        /* All atoms are part of one (or no) group, no index required */
 +        groups->ngrpnr[gtype] = 0;
 +        groups->grpnr[gtype]  = NULL;
 +    }
 +    else
 +    {
 +        groups->ngrpnr[gtype] = natoms;
 +        snew(groups->grpnr[gtype],natoms);
 +        for(j=0; (j<natoms); j++)
 +        {
 +            groups->grpnr[gtype][j] = cbuf[j]; /* copy scratch numbering into the result */
 +        }
 +    }
 +    
 +    sfree(cbuf);
 +
 +    return (bRest && grptp == egrptpPART);
 +}
 +
 +static void calc_nrdf(gmx_mtop_t *mtop,t_inputrec *ir,char **gnames)
 +{ /* Compute the number of degrees of freedom (nrdf) of every temperature
 +   * coupling (TC) group, store the result in ir->opts.nrdf and print it
 +   * to stderr. mtop supplies the atoms, groups and constraint lists;
 +   * gnames is only used to name TC groups in messages.
 +   *
 +   * Steps, in order: count unfrozen dimensions per atom, subtract for
 +   * constraints and SETTLEs, subtract for COM pull constraints, correct
 +   * for center of mass motion removal, then clamp negative results to 0.
 +   */
 +  t_grpopts *opts;
 +  gmx_groups_t *groups;
 +  t_pull  *pull;
 +  int     natoms,ai,aj,i,j,d,g,imin,jmin,nc;
 +  t_iatom *ia;
 +  int     *nrdf2,*na_vcm,na_tot;
 +  double  *nrdf_tc,*nrdf_vcm,nrdf_uc,n_sub=0;
 +  gmx_mtop_atomloop_all_t aloop;
 +  t_atom  *atom;
 +  int     mb,mol,ftype,as;
 +  gmx_molblock_t *molb;
 +  gmx_moltype_t *molt;
 +
 +  /* Calculate nrdf. 
 +   * First calc 3xnr-atoms for each group
 +   * then subtract half a degree of freedom for each constraint
 +   *
 +   * Only atoms and nuclei contribute to the degrees of freedom...
 +   */
 +
 +  opts = &ir->opts;
 +  
 +  groups = &mtop->groups;
 +  natoms = mtop->natoms;
 +
 +  /* Allocate one more for a possible rest group */
 +  /* We need to sum degrees of freedom into doubles,
 +   * since floats give too low nrdf's above 3 million atoms.
 +   */
 +  snew(nrdf_tc,groups->grps[egcTC].nr+1);
 +  snew(nrdf_vcm,groups->grps[egcVCM].nr+1);
 +  snew(na_vcm,groups->grps[egcVCM].nr+1);
 +  
 +  for(i=0; i<groups->grps[egcTC].nr; i++)
 +    nrdf_tc[i] = 0;
 +  for(i=0; i<groups->grps[egcVCM].nr+1; i++)
 +    nrdf_vcm[i] = 0;
 +
 +  snew(nrdf2,natoms); /* per atom: twice the number of dofs still left */
 +  aloop = gmx_mtop_atomloop_all_init(mtop);
 +  while (gmx_mtop_atomloop_all_next(aloop,&i,&atom)) {
 +    nrdf2[i] = 0;
 +    if (atom->ptype == eptAtom || atom->ptype == eptNucleus) {
 +      g = ggrpnr(groups,egcFREEZE,i);
 +      /* Double count nrdf for particle i */
 +      for(d=0; d<DIM; d++) {
 +      if (opts->nFreeze[g][d] == 0) {
 +        nrdf2[i] += 2;
 +      }
 +      }
 +      nrdf_tc [ggrpnr(groups,egcTC ,i)] += 0.5*nrdf2[i];
 +      nrdf_vcm[ggrpnr(groups,egcVCM,i)] += 0.5*nrdf2[i];
 +    }
 +  }
 +
 +  as = 0; /* global index of the first atom of the current molecule */
 +  for(mb=0; mb<mtop->nmolblock; mb++) {
 +    molb = &mtop->molblock[mb];
 +    molt = &mtop->moltype[molb->type];
 +    atom = molt->atoms.atom;
 +    for(mol=0; mol<molb->nmol; mol++) {
 +      for (ftype=F_CONSTR; ftype<=F_CONSTRNC; ftype++) {
 +      ia = molt->ilist[ftype].iatoms;
 +      for(i=0; i<molt->ilist[ftype].nr; ) {
 +        /* Subtract degrees of freedom for the constraints,
 +         * if the particles still have degrees of freedom left.
 +         * If one of the particles is a vsite or a shell, then all
 +         * constraint motion will go there, but since they do not
 +         * contribute to the constraints the degrees of freedom do not
 +         * change.
 +         */
 +        ai = as + ia[1];
 +        aj = as + ia[2];
 +        if (((atom[ia[1]].ptype == eptNucleus) ||
 +             (atom[ia[1]].ptype == eptAtom)) &&
 +            ((atom[ia[2]].ptype == eptNucleus) ||
 +             (atom[ia[2]].ptype == eptAtom))) {
 +          if (nrdf2[ai] > 0) /* jmin is charged to aj: 1 if ai can share, else 2 */
 +            jmin = 1;
 +          else
 +            jmin = 2;
 +          if (nrdf2[aj] > 0) /* imin is charged to ai: 1 if aj can share, else 2 */
 +            imin = 1;
 +          else
 +            imin = 2;
 +          imin = min(imin,nrdf2[ai]); /* never take more than the atom has left */
 +          jmin = min(jmin,nrdf2[aj]);
 +          nrdf2[ai] -= imin;
 +          nrdf2[aj] -= jmin;
 +          nrdf_tc [ggrpnr(groups,egcTC ,ai)] -= 0.5*imin;
 +          nrdf_tc [ggrpnr(groups,egcTC ,aj)] -= 0.5*jmin;
 +          nrdf_vcm[ggrpnr(groups,egcVCM,ai)] -= 0.5*imin;
 +          nrdf_vcm[ggrpnr(groups,egcVCM,aj)] -= 0.5*jmin;
 +        }
 +        ia += interaction_function[ftype].nratoms+1; /* step to the next entry of this list */
 +        i  += interaction_function[ftype].nratoms+1;
 +      }
 +      }
 +      ia = molt->ilist[F_SETTLE].iatoms;
 +      for(i=0; i<molt->ilist[F_SETTLE].nr; ) {
 +      /* Subtract 1 dof from every atom in the SETTLE */
 +      for(j=0; j<3; j++) {
 +      ai = as + ia[1+j];
 +        imin = min(2,nrdf2[ai]);
 +        nrdf2[ai] -= imin;
 +        nrdf_tc [ggrpnr(groups,egcTC ,ai)] -= 0.5*imin;
 +        nrdf_vcm[ggrpnr(groups,egcVCM,ai)] -= 0.5*imin;
 +      }
 +      ia += 4; /* a SETTLE entry occupies 4 ints here; ia[1..3] are the atoms used above */
 +      i  += 4;
 +      }
 +      as += molt->atoms.nr;
 +    }
 +  }
 +
 +  if (ir->ePull == epullCONSTRAINT) {
 +    /* Correct nrdf for the COM constraints.
 +     * We correct using the TC and VCM group of the first atom
 +     * in the reference and pull group. If atoms in one pull group
 +     * belong to different TC or VCM groups it is anyhow difficult
 +     * to determine the optimal nrdf assignment.
 +     */
 +    pull = ir->pull;
 +    if (pull->eGeom == epullgPOS) {
 +      nc = 0; /* nc = number of dimensions flagged in pull->dim */
 +      for(i=0; i<DIM; i++) {
 +      if (pull->dim[i])
 +        nc++;
 +      }
 +    } else {
 +      nc = 1;
 +    }
 +    for(i=0; i<pull->ngrp; i++) {
 +      imin = 2*nc; /* half-dofs to remove for this pull group */
 +      if (pull->grp[0].nat > 0) {
 +      /* Subtract 1/2 dof from the reference group */
 +      ai = pull->grp[0].ind[0];
 +      if (nrdf_tc[ggrpnr(groups,egcTC,ai)] > 1) {
 +        nrdf_tc [ggrpnr(groups,egcTC ,ai)] -= 0.5;
 +        nrdf_vcm[ggrpnr(groups,egcVCM,ai)] -= 0.5;
 +        imin--;
 +      }
 +      }
 +      /* Subtract the remaining imin half-dofs from the pulled group.
 +       * NOTE(review): unlike the reference group, nat of the pulled group
 +       * is not checked before ind[0] is read - confirm it is never empty.
 +       */
 +      ai = pull->grp[1+i].ind[0];
 +      nrdf_tc [ggrpnr(groups,egcTC ,ai)] -= 0.5*imin;
 +      nrdf_vcm[ggrpnr(groups,egcVCM,ai)] -= 0.5*imin;
 +      if (nrdf_tc[ggrpnr(groups,egcTC,ai)] < 0)
 +      gmx_fatal(FARGS,"Center of mass pulling constraints caused the number of degrees of freedom for temperature coupling group %s to be negative",gnames[groups->grps[egcTC].nm_ind[ggrpnr(groups,egcTC,ai)]]);
 +    }
 +  }
 +  
 +  if (ir->nstcomm != 0) {
 +    /* Subtract 3 from the number of degrees of freedom in each vcm group
 +     * when com translation is removed and 6 when rotation is removed
 +     * as well.
 +     */
 +    switch (ir->comm_mode) {
 +    case ecmLINEAR:
 +      n_sub = ndof_com(ir);
 +      break;
 +    case ecmANGULAR:
 +      n_sub = 6;
 +      break;
 +    default:
 +      n_sub = 0;
 +      gmx_incons("Checking comm_mode");
 +    }
 +    
 +    for(i=0; i<groups->grps[egcTC].nr; i++) {
 +      /* Count the number of atoms of TC group i for every VCM group */
 +      for(j=0; j<groups->grps[egcVCM].nr+1; j++)
 +      na_vcm[j] = 0;
 +      na_tot = 0; /* total number of atoms in TC group i */
 +      for(ai=0; ai<natoms; ai++)
 +      if (ggrpnr(groups,egcTC,ai) == i) {
 +        na_vcm[ggrpnr(groups,egcVCM,ai)]++;
 +        na_tot++;
 +      }
 +      /* Correct for VCM removal according to the fraction of each VCM
 +       * group present in this TC group.
 +       * NOTE(review): if TC group i contains no atoms, na_tot stays 0 and
 +       * the ratio below becomes 0/0 - confirm that cannot happen.
 +       */
 +      nrdf_uc = nrdf_tc[i]; /* uncorrected value, before COM removal */
 +      if (debug) {
 +      fprintf(debug,"T-group[%d] nrdf_uc = %g, n_sub = %g\n",
 +              i,nrdf_uc,n_sub);
 +      }
 +      nrdf_tc[i] = 0;
 +      for(j=0; j<groups->grps[egcVCM].nr+1; j++) {
 +      if (nrdf_vcm[j] > n_sub) { /* VCM groups with at most n_sub dofs contribute nothing */
 +        nrdf_tc[i] += nrdf_uc*((double)na_vcm[j]/(double)na_tot)*
 +          (nrdf_vcm[j] - n_sub)/nrdf_vcm[j];
 +      }
 +      if (debug) {
 +        fprintf(debug,"  nrdf_vcm[%d] = %g, nrdf = %g\n",
 +                j,nrdf_vcm[j],nrdf_tc[i]);
 +      }
 +      }
 +    }
 +  }
 +  for(i=0; (i<groups->grps[egcTC].nr); i++) {
 +    opts->nrdf[i] = nrdf_tc[i];
 +    if (opts->nrdf[i] < 0) /* never report a negative number of dofs */
 +      opts->nrdf[i] = 0;
 +    fprintf(stderr,
 +          "Number of degrees of freedom in T-Coupling group %s is %.2f\n",
 +          gnames[groups->grps[egcTC].nm_ind[i]],opts->nrdf[i]);
 +  }
 +  
 +  sfree(nrdf2);
 +  sfree(nrdf_tc);
 +  sfree(nrdf_vcm);
 +  sfree(na_vcm);
 +}
 +
 +static void decode_cos(char *s,t_cosines *cosine,gmx_bool bTime)
 +{
 +  char   *t;
 +  char   format[STRLEN],f1[STRLEN];
 +  double a,phi;
 +  int    i;
 +  
 +  t=strdup(s);
 +  trim(t);
 +  
 +  cosine->n=0;
 +  cosine->a=NULL;
 +  cosine->phi=NULL;
 +  if (strlen(t)) {
 +    sscanf(t,"%d",&(cosine->n));
 +    if (cosine->n <= 0) {
 +      cosine->n=0;
 +    } else {
 +      snew(cosine->a,cosine->n);
 +      snew(cosine->phi,cosine->n);
 +      
 +      sprintf(format,"%%*d");
 +      for(i=0; (i<cosine->n); i++) {
 +      strcpy(f1,format);
 +      strcat(f1,"%lf%lf");
 +      if (sscanf(t,f1,&a,&phi) < 2)
 +        gmx_fatal(FARGS,"Invalid input for electric field shift: '%s'",t);
 +      cosine->a[i]=a;
 +      cosine->phi[i]=phi;
 +      strcat(format,"%*lf%*lf");
 +      }
 +    }
 +  }
 +  sfree(t);
 +}
 +
 +static gmx_bool do_egp_flag(t_inputrec *ir,gmx_groups_t *groups,
 +                      const char *option,const char *val,int flag)
 +{
 +  /* The maximum number of energy group pairs would be MAXPTR*(MAXPTR+1)/2.
 +   * But since this is much larger than STRLEN, such a line can not be parsed.
 +   * The real maximum is the number of names that fit in a string: STRLEN/2.
 +   */
 +#define EGP_MAX (STRLEN/2)
 +  int  nelem,i,j,k,nr;
 +  char *names[EGP_MAX];
 +  char ***gnames;
 +  gmx_bool bSet;
 +
 +  gnames = groups->grpname;
 +
 +  nelem = str_nelem(val,EGP_MAX,names);
 +  if (nelem % 2 != 0)
 +    gmx_fatal(FARGS,"The number of groups for %s is odd",option);
 +  nr = groups->grps[egcENER].nr;
 +  bSet = FALSE;
 +  for(i=0; i<nelem/2; i++) {
 +    j = 0;
 +    while ((j < nr) &&
 +         gmx_strcasecmp(names[2*i],*(gnames[groups->grps[egcENER].nm_ind[j]])))
 +      j++;
 +    if (j == nr)
 +      gmx_fatal(FARGS,"%s in %s is not an energy group\n",
 +                names[2*i],option);
 +    k = 0;
 +    while ((k < nr) &&
 +         gmx_strcasecmp(names[2*i+1],*(gnames[groups->grps[egcENER].nm_ind[k]])))
 +      k++;
 +    if (k==nr)
 +      gmx_fatal(FARGS,"%s in %s is not an energy group\n",
 +            names[2*i+1],option);
 +    if ((j < nr) && (k < nr)) {
 +      ir->opts.egp_flags[nr*j+k] |= flag;
 +      ir->opts.egp_flags[nr*k+j] |= flag;
 +      bSet = TRUE;
 +    }
 +  }
 +
 +  return bSet;
 +}
 +
 +void do_index(const char* mdparin, const char *ndx,
 +              gmx_mtop_t *mtop,
 +              gmx_bool bVerbose,
 +              t_inputrec *ir,rvec *v,
 +              warninp_t wi)
 +{
 +  t_blocka *grps;
 +  gmx_groups_t *groups;
 +  int     natoms;
 +  t_symtab *symtab;
 +  t_atoms atoms_all;
 +  char    warnbuf[STRLEN],**gnames;
 +  int     nr,ntcg,ntau_t,nref_t,nacc,nofg,nSA,nSA_points,nSA_time,nSA_temp;
 +  real    tau_min;
 +  int     nstcmin;
 +  int     nacg,nfreeze,nfrdim,nenergy,nvcm,nuser;
 +  char    *ptr1[MAXPTR],*ptr2[MAXPTR],*ptr3[MAXPTR];
 +  int     i,j,k,restnm;
 +  real    SAtime;
 +  gmx_bool    bExcl,bTable,bSetTCpar,bAnneal,bRest;
 +  int     nQMmethod,nQMbasis,nQMcharge,nQMmult,nbSH,nCASorb,nCASelec,
 +    nSAon,nSAoff,nSAsteps,nQMg,nbOPT,nbTS;
 +  char    warn_buf[STRLEN];
 +
 +  if (bVerbose)
 +    fprintf(stderr,"processing index file...\n");
 +  debug_gmx();
 +  if (ndx == NULL) {
 +    snew(grps,1);
 +    snew(grps->index,1);
 +    snew(gnames,1);
 +    atoms_all = gmx_mtop_global_atoms(mtop);
 +    analyse(&atoms_all,grps,&gnames,FALSE,TRUE);
 +    free_t_atoms(&atoms_all,FALSE);
 +  } else {
 +    grps = init_index(ndx,&gnames);
 +  }
 +
 +  groups = &mtop->groups;
 +  natoms = mtop->natoms;
 +  symtab = &mtop->symtab;
 +
 +  snew(groups->grpname,grps->nr+1);
 +  
 +  for(i=0; (i<grps->nr); i++) {
 +    groups->grpname[i] = put_symtab(symtab,gnames[i]);
 +  }
 +  groups->grpname[i] = put_symtab(symtab,"rest");
 +  restnm=i;
 +  srenew(gnames,grps->nr+1);
 +  gnames[restnm] = *(groups->grpname[i]);
 +  groups->ngrpname = grps->nr+1;
 +
 +  set_warning_line(wi,mdparin,-1);
 +
 +  ntau_t = str_nelem(tau_t,MAXPTR,ptr1);
 +  nref_t = str_nelem(ref_t,MAXPTR,ptr2);
 +  ntcg   = str_nelem(tcgrps,MAXPTR,ptr3);
 +  if ((ntau_t != ntcg) || (nref_t != ntcg)) {
 +    gmx_fatal(FARGS,"Invalid T coupling input: %d groups, %d ref-t values and "
 +                "%d tau-t values",ntcg,nref_t,ntau_t);
 +  }
 +
 +  bSetTCpar = (ir->etc || EI_SD(ir->eI) || ir->eI==eiBD || EI_TPI(ir->eI));
 +  do_numbering(natoms,groups,ntcg,ptr3,grps,gnames,egcTC,
 +               restnm,bSetTCpar ? egrptpALL : egrptpALL_GENREST,bVerbose,wi);
 +  nr = groups->grps[egcTC].nr;
 +  ir->opts.ngtc = nr;
 +  snew(ir->opts.nrdf,nr);
 +  snew(ir->opts.tau_t,nr);
 +  snew(ir->opts.ref_t,nr);
 +  if (ir->eI==eiBD && ir->bd_fric==0) {
 +    fprintf(stderr,"bd-fric=0, so tau-t will be used as the inverse friction constant(s)\n");
 +  }
 +
 +  if (bSetTCpar)
 +  {
 +      if (nr != nref_t)
 +      {
 +          gmx_fatal(FARGS,"Not enough ref-t and tau-t values!");
 +      }
 +      
 +      tau_min = 1e20;
 +      for(i=0; (i<nr); i++)
 +      {
 +          ir->opts.tau_t[i] = strtod(ptr1[i],NULL);
 +          if ((ir->eI == eiBD || ir->eI == eiSD2) && ir->opts.tau_t[i] <= 0)
 +          {
 +              sprintf(warn_buf,"With integrator %s tau-t should be larger than 0",ei_names[ir->eI]);
 +              warning_error(wi,warn_buf);
 +          }
 +
 +          if (ir->etc != etcVRESCALE && ir->opts.tau_t[i] == 0)
 +          {
 +              warning_note(wi,"tau-t = -1 is the value to signal that a group should not have temperature coupling. Treating your use of tau-t = 0 as if you used -1.");
 +          }
 +
 +          if (ir->opts.tau_t[i] >= 0)
 +          {
 +              tau_min = min(tau_min,ir->opts.tau_t[i]);
 +          }
 +      }
 +      if (ir->etc != etcNO && ir->nsttcouple == -1)
 +      {
 +            ir->nsttcouple = ir_optimal_nsttcouple(ir);
 +      }
 +
 +      if (EI_VV(ir->eI)) 
 +      {
 +          if ((ir->etc==etcNOSEHOOVER) && (ir->epc==epcBERENDSEN)) {
 +              gmx_fatal(FARGS,"Cannot do Nose-Hoover temperature with Berendsen pressure control with md-vv; use either vrescale temperature with berendsen pressure or Nose-Hoover temperature with MTTK pressure");
 +          }
 +          if ((ir->epc==epcMTTK) && (ir->etc>etcNO))
 +          {
 +              int mincouple;
 +              mincouple = ir->nsttcouple;
 +              if (ir->nstpcouple < mincouple)
 +              {
 +                  mincouple = ir->nstpcouple;
 +              }
 +              ir->nstpcouple = mincouple;
 +              ir->nsttcouple = mincouple;
 +              sprintf(warn_buf,"for current Trotter decomposition methods with vv, nsttcouple and nstpcouple must be equal.  Both have been reset to min(nsttcouple,nstpcouple) = %d",mincouple);
 +              warning_note(wi,warn_buf);
 +          }
 +      }
 +      /* velocity verlet with averaged kinetic energy KE = 0.5*(v(t+1/2) - v(t-1/2)) is implemented
 +         primarily for testing purposes, and does not work with temperature coupling other than 1 */
 +
 +      if (ETC_ANDERSEN(ir->etc)) {
 +          if (ir->nsttcouple != 1) {
 +              ir->nsttcouple = 1;
 +              sprintf(warn_buf,"Andersen temperature control methods assume nsttcouple = 1; there is no need for larger nsttcouple > 1, since no global parameters are computed. nsttcouple has been reset to 1");
 +              warning_note(wi,warn_buf);
 +          }
 +      }
 +      nstcmin = tcouple_min_integration_steps(ir->etc);
 +      if (nstcmin > 1)
 +      {
 +          if (tau_min/(ir->delta_t*ir->nsttcouple) < nstcmin)
 +          {
 +              sprintf(warn_buf,"For proper integration of the %s thermostat, tau-t (%g) should be at least %d times larger than nsttcouple*dt (%g)",
 +                      ETCOUPLTYPE(ir->etc),
 +                      tau_min,nstcmin,
 +                      ir->nsttcouple*ir->delta_t);
 +              warning(wi,warn_buf);
 +          }
 +      }
 +      for(i=0; (i<nr); i++)
 +      {
 +          ir->opts.ref_t[i] = strtod(ptr2[i],NULL);
 +          if (ir->opts.ref_t[i] < 0)
 +          {
 +              gmx_fatal(FARGS,"ref-t for group %d negative",i);
 +          }
 +      }
 +      /* set the lambda mc temperature to the md integrator temperature (which should be defined
 +         if we are in this conditional) if mc_temp is negative */
 +      if (ir->expandedvals->mc_temp < 0)
 +      {
 +          ir->expandedvals->mc_temp = ir->opts.ref_t[0];  /*for now, set to the first reft */
 +      }
 +  }
 +
 +  /* Simulated annealing for each group. There are nr groups */
 +  nSA = str_nelem(anneal,MAXPTR,ptr1);
 +  if (nSA == 1 && (ptr1[0][0]=='n' || ptr1[0][0]=='N'))
 +     nSA = 0;
 +  if(nSA>0 && nSA != nr) 
 +    gmx_fatal(FARGS,"Not enough annealing values: %d (for %d groups)\n",nSA,nr);
 +  else {
 +    snew(ir->opts.annealing,nr);
 +    snew(ir->opts.anneal_npoints,nr);
 +    snew(ir->opts.anneal_time,nr);
 +    snew(ir->opts.anneal_temp,nr);
 +    for(i=0;i<nr;i++) {
 +      ir->opts.annealing[i]=eannNO;
 +      ir->opts.anneal_npoints[i]=0;
 +      ir->opts.anneal_time[i]=NULL;
 +      ir->opts.anneal_temp[i]=NULL;
 +    }
 +    if (nSA > 0) {
 +      bAnneal=FALSE;
 +      for(i=0;i<nr;i++) { 
 +      if(ptr1[i][0]=='n' || ptr1[i][0]=='N') {
 +        ir->opts.annealing[i]=eannNO;
 +      } else if(ptr1[i][0]=='s'|| ptr1[i][0]=='S') {
 +        ir->opts.annealing[i]=eannSINGLE;
 +        bAnneal=TRUE;
 +      } else if(ptr1[i][0]=='p'|| ptr1[i][0]=='P') {
 +        ir->opts.annealing[i]=eannPERIODIC;
 +        bAnneal=TRUE;
 +      } 
 +      } 
 +      if(bAnneal) {
 +      /* Read the other fields too */
 +      nSA_points = str_nelem(anneal_npoints,MAXPTR,ptr1);
 +      if(nSA_points!=nSA) 
 +          gmx_fatal(FARGS,"Found %d annealing-npoints values for %d groups\n",nSA_points,nSA);
 +      for(k=0,i=0;i<nr;i++) {
 +        ir->opts.anneal_npoints[i]=strtol(ptr1[i],NULL,10);
 +        if(ir->opts.anneal_npoints[i]==1)
 +          gmx_fatal(FARGS,"Please specify at least a start and an end point for annealing\n");
 +        snew(ir->opts.anneal_time[i],ir->opts.anneal_npoints[i]);
 +        snew(ir->opts.anneal_temp[i],ir->opts.anneal_npoints[i]);
 +        k += ir->opts.anneal_npoints[i];
 +      }
 +
 +      nSA_time = str_nelem(anneal_time,MAXPTR,ptr1);
 +      if(nSA_time!=k) 
 +          gmx_fatal(FARGS,"Found %d annealing-time values, wanter %d\n",nSA_time,k);
 +      nSA_temp = str_nelem(anneal_temp,MAXPTR,ptr2);
 +      if(nSA_temp!=k) 
 +          gmx_fatal(FARGS,"Found %d annealing-temp values, wanted %d\n",nSA_temp,k);
 +
 +      for(i=0,k=0;i<nr;i++) {
 +        
 +        for(j=0;j<ir->opts.anneal_npoints[i];j++) {
 +          ir->opts.anneal_time[i][j]=strtod(ptr1[k],NULL);
 +          ir->opts.anneal_temp[i][j]=strtod(ptr2[k],NULL);
 +          if(j==0) {
 +            if(ir->opts.anneal_time[i][0] > (ir->init_t+GMX_REAL_EPS))
 +              gmx_fatal(FARGS,"First time point for annealing > init_t.\n");      
 +          } else { 
 +            /* j>0 */
 +            if(ir->opts.anneal_time[i][j]<ir->opts.anneal_time[i][j-1])
 +              gmx_fatal(FARGS,"Annealing timepoints out of order: t=%f comes after t=%f\n",
 +                          ir->opts.anneal_time[i][j],ir->opts.anneal_time[i][j-1]);
 +          }
 +          if(ir->opts.anneal_temp[i][j]<0) 
 +            gmx_fatal(FARGS,"Found negative temperature in annealing: %f\n",ir->opts.anneal_temp[i][j]);    
 +          k++;
 +        }
 +      }
 +      /* Print out some summary information, to make sure we got it right */
 +      for(i=0,k=0;i<nr;i++) {
 +        if(ir->opts.annealing[i]!=eannNO) {
 +          j = groups->grps[egcTC].nm_ind[i];
 +          fprintf(stderr,"Simulated annealing for group %s: %s, %d timepoints\n",
 +                  *(groups->grpname[j]),eann_names[ir->opts.annealing[i]],
 +                  ir->opts.anneal_npoints[i]);
 +          fprintf(stderr,"Time (ps)   Temperature (K)\n");
 +          /* All terms except the last one */
 +          for(j=0;j<(ir->opts.anneal_npoints[i]-1);j++) 
 +              fprintf(stderr,"%9.1f      %5.1f\n",ir->opts.anneal_time[i][j],ir->opts.anneal_temp[i][j]);
 +          
 +          /* Finally the last one */
 +          j = ir->opts.anneal_npoints[i]-1;
 +          if(ir->opts.annealing[i]==eannSINGLE)
 +            fprintf(stderr,"%9.1f-     %5.1f\n",ir->opts.anneal_time[i][j],ir->opts.anneal_temp[i][j]);
 +          else {
 +            fprintf(stderr,"%9.1f      %5.1f\n",ir->opts.anneal_time[i][j],ir->opts.anneal_temp[i][j]);
 +            if(fabs(ir->opts.anneal_temp[i][j]-ir->opts.anneal_temp[i][0])>GMX_REAL_EPS)
 +              warning_note(wi,"There is a temperature jump when your annealing loops back.\n");
 +          }
 +        }
 +      } 
 +      }
 +    }
 +  }   
 +
 +  if (ir->ePull != epullNO) {
 +    make_pull_groups(ir->pull,pull_grp,grps,gnames);
 +  }
 +  
 +  if (ir->bRot) {
 +    make_rotation_groups(ir->rot,rot_grp,grps,gnames);
 +  }
 +
 +  nacc = str_nelem(acc,MAXPTR,ptr1);
 +  nacg = str_nelem(accgrps,MAXPTR,ptr2);
 +  if (nacg*DIM != nacc)
 +    gmx_fatal(FARGS,"Invalid Acceleration input: %d groups and %d acc. values",
 +              nacg,nacc);
 +  do_numbering(natoms,groups,nacg,ptr2,grps,gnames,egcACC,
 +               restnm,egrptpALL_GENREST,bVerbose,wi);
 +  nr = groups->grps[egcACC].nr;
 +  snew(ir->opts.acc,nr);
 +  ir->opts.ngacc=nr;
 +  
 +  for(i=k=0; (i<nacg); i++)
 +    for(j=0; (j<DIM); j++,k++)
 +      ir->opts.acc[i][j]=strtod(ptr1[k],NULL);
 +  for( ;(i<nr); i++)
 +    for(j=0; (j<DIM); j++)
 +      ir->opts.acc[i][j]=0;
 +  
 +  nfrdim  = str_nelem(frdim,MAXPTR,ptr1);
 +  nfreeze = str_nelem(freeze,MAXPTR,ptr2);
 +  if (nfrdim != DIM*nfreeze)
 +    gmx_fatal(FARGS,"Invalid Freezing input: %d groups and %d freeze values",
 +              nfreeze,nfrdim);
 +  do_numbering(natoms,groups,nfreeze,ptr2,grps,gnames,egcFREEZE,
 +               restnm,egrptpALL_GENREST,bVerbose,wi);
 +  nr = groups->grps[egcFREEZE].nr;
 +  ir->opts.ngfrz=nr;
 +  snew(ir->opts.nFreeze,nr);
 +  for(i=k=0; (i<nfreeze); i++)
 +    for(j=0; (j<DIM); j++,k++) {
 +      ir->opts.nFreeze[i][j]=(gmx_strncasecmp(ptr1[k],"Y",1)==0);
 +      if (!ir->opts.nFreeze[i][j]) {
 +      if (gmx_strncasecmp(ptr1[k],"N",1) != 0) {
 +        sprintf(warnbuf,"Please use Y(ES) or N(O) for freezedim only "
 +                "(not %s)", ptr1[k]);
 +        warning(wi,warn_buf);
 +      }
 +      }
 +    }
 +  for( ; (i<nr); i++)
 +    for(j=0; (j<DIM); j++)
 +      ir->opts.nFreeze[i][j]=0;
 +  
 +  nenergy=str_nelem(energy,MAXPTR,ptr1);
 +  do_numbering(natoms,groups,nenergy,ptr1,grps,gnames,egcENER,
 +               restnm,egrptpALL_GENREST,bVerbose,wi);
 +  add_wall_energrps(groups,ir->nwall,symtab);
 +  ir->opts.ngener = groups->grps[egcENER].nr;
 +  nvcm=str_nelem(vcm,MAXPTR,ptr1);
 +  bRest =
 +    do_numbering(natoms,groups,nvcm,ptr1,grps,gnames,egcVCM,
 +                 restnm,nvcm==0 ? egrptpALL_GENREST : egrptpPART,bVerbose,wi);
 +  if (bRest) {
 +    warning(wi,"Some atoms are not part of any center of mass motion removal group.\n"
 +          "This may lead to artifacts.\n"
 +          "In most cases one should use one group for the whole system.");
 +  }
 +
 +  /* Now we have filled the freeze struct, so we can calculate NRDF */ 
 +  calc_nrdf(mtop,ir,gnames);
 +
 +  if (v && NULL) {
 +    real fac,ntot=0;
 +    
 +    /* Must check per group! */
 +    for(i=0; (i<ir->opts.ngtc); i++) 
 +      ntot += ir->opts.nrdf[i];
 +    if (ntot != (DIM*natoms)) {
 +      fac = sqrt(ntot/(DIM*natoms));
 +      if (bVerbose)
 +      fprintf(stderr,"Scaling velocities by a factor of %.3f to account for constraints\n"
 +              "and removal of center of mass motion\n",fac);
 +      for(i=0; (i<natoms); i++)
 +      svmul(fac,v[i],v[i]);
 +    }
 +  }
 +  
 +  nuser=str_nelem(user1,MAXPTR,ptr1);
 +  do_numbering(natoms,groups,nuser,ptr1,grps,gnames,egcUser1,
 +               restnm,egrptpALL_GENREST,bVerbose,wi);
 +  nuser=str_nelem(user2,MAXPTR,ptr1);
 +  do_numbering(natoms,groups,nuser,ptr1,grps,gnames,egcUser2,
 +               restnm,egrptpALL_GENREST,bVerbose,wi);
 +  nuser=str_nelem(xtc_grps,MAXPTR,ptr1);
 +  do_numbering(natoms,groups,nuser,ptr1,grps,gnames,egcXTC,
 +               restnm,egrptpONE,bVerbose,wi);
 +  nofg = str_nelem(orirefitgrp,MAXPTR,ptr1);
 +  do_numbering(natoms,groups,nofg,ptr1,grps,gnames,egcORFIT,
 +               restnm,egrptpALL_GENREST,bVerbose,wi);
 +
 +  /* QMMM input processing */
 +  nQMg          = str_nelem(QMMM,MAXPTR,ptr1);
 +  nQMmethod     = str_nelem(QMmethod,MAXPTR,ptr2);
 +  nQMbasis      = str_nelem(QMbasis,MAXPTR,ptr3);
 +  if((nQMmethod != nQMg)||(nQMbasis != nQMg)){
 +    gmx_fatal(FARGS,"Invalid QMMM input: %d groups %d basissets"
 +            " and %d methods\n",nQMg,nQMbasis,nQMmethod);
 +  }
 +  /* group rest, if any, is always MM! */
 +  do_numbering(natoms,groups,nQMg,ptr1,grps,gnames,egcQMMM,
 +               restnm,egrptpALL_GENREST,bVerbose,wi);
 +  nr = nQMg; /*atoms->grps[egcQMMM].nr;*/
 +  ir->opts.ngQM = nQMg;
 +  snew(ir->opts.QMmethod,nr);
 +  snew(ir->opts.QMbasis,nr);
 +  for(i=0;i<nr;i++){
 +    /* input consists of strings: RHF CASSCF PM3 .. These need to be
 +     * converted to the corresponding enum in names.c
 +     */
 +    ir->opts.QMmethod[i] = search_QMstring(ptr2[i],eQMmethodNR,
 +                                           eQMmethod_names);
 +    ir->opts.QMbasis[i]  = search_QMstring(ptr3[i],eQMbasisNR,
 +                                           eQMbasis_names);
 +
 +  }
 +  nQMmult   = str_nelem(QMmult,MAXPTR,ptr1);
 +  nQMcharge = str_nelem(QMcharge,MAXPTR,ptr2);
 +  nbSH      = str_nelem(bSH,MAXPTR,ptr3);
 +  snew(ir->opts.QMmult,nr);
 +  snew(ir->opts.QMcharge,nr);
 +  snew(ir->opts.bSH,nr);
 +
 +  for(i=0;i<nr;i++){
 +    ir->opts.QMmult[i]   = strtol(ptr1[i],NULL,10);
 +    ir->opts.QMcharge[i] = strtol(ptr2[i],NULL,10);
 +    ir->opts.bSH[i]      = (gmx_strncasecmp(ptr3[i],"Y",1)==0);
 +  }
 +
 +  nCASelec  = str_nelem(CASelectrons,MAXPTR,ptr1);
 +  nCASorb   = str_nelem(CASorbitals,MAXPTR,ptr2);
 +  snew(ir->opts.CASelectrons,nr);
 +  snew(ir->opts.CASorbitals,nr);
 +  for(i=0;i<nr;i++){
 +    ir->opts.CASelectrons[i]= strtol(ptr1[i],NULL,10);
 +    ir->opts.CASorbitals[i] = strtol(ptr2[i],NULL,10);
 +  }
 +  /* special optimization options */
 +
 +  nbOPT = str_nelem(bOPT,MAXPTR,ptr1);
 +  nbTS = str_nelem(bTS,MAXPTR,ptr2);
 +  snew(ir->opts.bOPT,nr);
 +  snew(ir->opts.bTS,nr);
 +  for(i=0;i<nr;i++){
 +    ir->opts.bOPT[i] = (gmx_strncasecmp(ptr1[i],"Y",1)==0);
 +    ir->opts.bTS[i]  = (gmx_strncasecmp(ptr2[i],"Y",1)==0);
 +  }
 +  nSAon     = str_nelem(SAon,MAXPTR,ptr1);
 +  nSAoff    = str_nelem(SAoff,MAXPTR,ptr2);
 +  nSAsteps  = str_nelem(SAsteps,MAXPTR,ptr3);
 +  snew(ir->opts.SAon,nr);
 +  snew(ir->opts.SAoff,nr);
 +  snew(ir->opts.SAsteps,nr);
 +
 +  for(i=0;i<nr;i++){
 +    ir->opts.SAon[i]    = strtod(ptr1[i],NULL);
 +    ir->opts.SAoff[i]   = strtod(ptr2[i],NULL);
 +    ir->opts.SAsteps[i] = strtol(ptr3[i],NULL,10);
 +  }
 +  /* end of QMMM input */
 +
 +  if (bVerbose)
 +    for(i=0; (i<egcNR); i++) {
 +      fprintf(stderr,"%-16s has %d element(s):",gtypes[i],groups->grps[i].nr); 
 +      for(j=0; (j<groups->grps[i].nr); j++)
 +      fprintf(stderr," %s",*(groups->grpname[groups->grps[i].nm_ind[j]]));
 +      fprintf(stderr,"\n");
 +    }
 +
 +  nr = groups->grps[egcENER].nr;
 +  snew(ir->opts.egp_flags,nr*nr);
 +
 +  bExcl = do_egp_flag(ir,groups,"energygrp-excl",egpexcl,EGP_EXCL);
 +    if (bExcl && ir->cutoff_scheme == ecutsVERLET) 
 +    {
 +        warning_error(wi,"Energy group exclusions are not (yet) implemented for the Verlet scheme");
 +    } 
 +  if (bExcl && EEL_FULL(ir->coulombtype))
 +    warning(wi,"Can not exclude the lattice Coulomb energy between energy groups");
 +
 +  bTable = do_egp_flag(ir,groups,"energygrp-table",egptable,EGP_TABLE);
 +  if (bTable && !(ir->vdwtype == evdwUSER) && 
 +      !(ir->coulombtype == eelUSER) && !(ir->coulombtype == eelPMEUSER) &&
 +      !(ir->coulombtype == eelPMEUSERSWITCH))
 +    gmx_fatal(FARGS,"Can only have energy group pair tables in combination with user tables for VdW and/or Coulomb");
 +
 +  decode_cos(efield_x,&(ir->ex[XX]),FALSE);
 +  decode_cos(efield_xt,&(ir->et[XX]),TRUE);
 +  decode_cos(efield_y,&(ir->ex[YY]),FALSE);
 +  decode_cos(efield_yt,&(ir->et[YY]),TRUE);
 +  decode_cos(efield_z,&(ir->ex[ZZ]),FALSE);
 +  decode_cos(efield_zt,&(ir->et[ZZ]),TRUE);
 +
 +  if (ir->bAdress)
 +    do_adress_index(ir->adress,groups,gnames,&(ir->opts),wi);
 +
 +  for(i=0; (i<grps->nr); i++)
 +    sfree(gnames[i]);
 +  sfree(gnames);
 +  done_blocka(grps);
 +  sfree(grps);
 +
 +}
 +
 +
 +
 +static void check_disre(gmx_mtop_t *mtop)
 +{
 +  gmx_ffparams_t *ffparams;
 +  t_functype *functype;
 +  t_iparams  *ip;
 +  int i,ndouble,ftype;
 +  int label,old_label;
 +  
 +  if (gmx_mtop_ftype_count(mtop,F_DISRES) > 0) {
 +    ffparams  = &mtop->ffparams;
 +    functype  = ffparams->functype;
 +    ip        = ffparams->iparams;
 +    ndouble   = 0;
 +    old_label = -1;
 +    for(i=0; i<ffparams->ntypes; i++) {
 +      ftype = functype[i];
 +      if (ftype == F_DISRES) {
 +      label = ip[i].disres.label;
 +      if (label == old_label) {
 +        fprintf(stderr,"Distance restraint index %d occurs twice\n",label);
 +        ndouble++;
 +      }
 +      old_label = label;
 +      }
 +    }
 +    if (ndouble>0)
 +      gmx_fatal(FARGS,"Found %d double distance restraint indices,\n"
 +              "probably the parameters for multiple pairs in one restraint "
 +              "are not identical\n",ndouble);
 +  }
 +}
 +
 +static gmx_bool absolute_reference(t_inputrec *ir,gmx_mtop_t *sys,
 +                                   gmx_bool posres_only,
 +                                   ivec AbsRef)
 +{
 +    int d,g,i;
 +    gmx_mtop_ilistloop_t iloop;
 +    t_ilist *ilist;
 +    int nmol;
 +    t_iparams *pr;
 +
 +    clear_ivec(AbsRef);
 +
 +    if (!posres_only)
 +    {
 +        /* Check the COM */
 +        for(d=0; d<DIM; d++)
 +        {
 +            AbsRef[d] = (d < ndof_com(ir) ? 0 : 1);
 +        }
 +        /* Check for freeze groups */
 +        for(g=0; g<ir->opts.ngfrz; g++)
 +        {
 +            for(d=0; d<DIM; d++)
 +            {
 +                if (ir->opts.nFreeze[g][d] != 0)
 +                {
 +                    AbsRef[d] = 1;
 +                }
 +            }
 +        }
 +    }
 +
 +    /* Check for position restraints */
 +    iloop = gmx_mtop_ilistloop_init(sys);
 +    while (gmx_mtop_ilistloop_next(iloop,&ilist,&nmol))
 +    {
 +        if (nmol > 0 &&
 +            (AbsRef[XX] == 0 || AbsRef[YY] == 0 || AbsRef[ZZ] == 0))
 +        {
 +            for(i=0; i<ilist[F_POSRES].nr; i+=2)
 +            {
 +                pr = &sys->ffparams.iparams[ilist[F_POSRES].iatoms[i]];
 +                for(d=0; d<DIM; d++)
 +                {
 +                    if (pr->posres.fcA[d] != 0)
 +                    {
 +                        AbsRef[d] = 1;
 +                    }
 +                }
 +            }
 +            for(i=0; i<ilist[F_FBPOSRES].nr; i+=2)
 +            {
 +                /* Check for flat-bottom posres */
 +                pr = &sys->ffparams.iparams[ilist[F_FBPOSRES].iatoms[i]];
 +                if (pr->fbposres.k != 0)
 +                {
 +                    switch(pr->fbposres.geom)
 +                    {
 +                    case efbposresSPHERE:
 +                        AbsRef[XX] = AbsRef[YY] = AbsRef[ZZ] = 1;
 +                        break;
 +                    case efbposresCYLINDER:
 +                        AbsRef[XX] = AbsRef[YY] = 1;
 +                        break;
 +                    case efbposresX: /* d=XX */
 +                    case efbposresY: /* d=YY */
 +                    case efbposresZ: /* d=ZZ */
 +                        d = pr->fbposres.geom - efbposresX;
 +                        AbsRef[d] = 1;
 +                        break;
 +                    default:
 +                        gmx_fatal(FARGS," Invalid geometry for flat-bottom position restraint.\n"
 +                                  "Expected nr between 1 and %d. Found %d\n", efbposresNR-1,
 +                                  pr->fbposres.geom);
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    return (AbsRef[XX] != 0 && AbsRef[YY] != 0 && AbsRef[ZZ] != 0);
 +}
 +
 +void triple_check(const char *mdparin,t_inputrec *ir,gmx_mtop_t *sys,
 +                  warninp_t wi)
 +{
 +  char err_buf[256];
 +  int  i,m,g,nmol,npct;
 +  gmx_bool bCharge,bAcc;
 +  real gdt_max,*mgrp,mt;
 +  rvec acc;
 +  gmx_mtop_atomloop_block_t aloopb;
 +  gmx_mtop_atomloop_all_t aloop;
 +  t_atom *atom;
 +  ivec AbsRef;
 +  char warn_buf[STRLEN];
 +
 +  set_warning_line(wi,mdparin,-1);
 +
 +  if (EI_DYNAMICS(ir->eI) && !EI_SD(ir->eI) && ir->eI != eiBD &&
 +      ir->comm_mode == ecmNO &&
 +      !(absolute_reference(ir,sys,FALSE,AbsRef) || ir->nsteps <= 10)) {
 +    warning(wi,"You are not using center of mass motion removal (mdp option comm-mode), numerical rounding errors can lead to build up of kinetic energy of the center of mass");
 +  }
 +
 +    /* Check for pressure coupling with absolute position restraints */
 +    if (ir->epc != epcNO && ir->refcoord_scaling == erscNO)
 +    {
 +        absolute_reference(ir,sys,TRUE,AbsRef);
 +        {
 +            for(m=0; m<DIM; m++)
 +            {
 +                if (AbsRef[m] && norm2(ir->compress[m]) > 0)
 +                {
 +                    warning(wi,"You are using pressure coupling with absolute position restraints, this will give artifacts. Use the refcoord_scaling option.");
 +                    break;
 +                }
 +            }
 +        }
 +    }
 +
 +  bCharge = FALSE;
 +  aloopb = gmx_mtop_atomloop_block_init(sys);
 +  while (gmx_mtop_atomloop_block_next(aloopb,&atom,&nmol)) {
 +    if (atom->q != 0 || atom->qB != 0) {
 +      bCharge = TRUE;
 +    }
 +  }
 +  
 +  if (!bCharge) {
 +    if (EEL_FULL(ir->coulombtype)) {
 +      sprintf(err_buf,
 +            "You are using full electrostatics treatment %s for a system without charges.\n"
 +            "This costs a lot of performance for just processing zeros, consider using %s instead.\n",
 +            EELTYPE(ir->coulombtype),EELTYPE(eelCUT));
 +      warning(wi,err_buf);
 +    }
 +  } else {
 +    if (ir->coulombtype == eelCUT && ir->rcoulomb > 0 && !ir->implicit_solvent) {
 +      sprintf(err_buf,
 +            "You are using a plain Coulomb cut-off, which might produce artifacts.\n"
 +            "You might want to consider using %s electrostatics.\n",
 +            EELTYPE(eelPME));
 +      warning_note(wi,err_buf);
 +    }
 +  }
 +
 +  /* Generalized reaction field */  
 +  if (ir->opts.ngtc == 0) {
 +    sprintf(err_buf,"No temperature coupling while using coulombtype %s",
 +          eel_names[eelGRF]);
 +    CHECK(ir->coulombtype == eelGRF);
 +  }
 +  else {
 +    sprintf(err_buf,"When using coulombtype = %s"
 +          " ref-t for temperature coupling should be > 0",
 +          eel_names[eelGRF]);
 +    CHECK((ir->coulombtype == eelGRF) && (ir->opts.ref_t[0] <= 0));
 +  }
 +
 +    if (ir->eI == eiSD1 &&
 +        (gmx_mtop_ftype_count(sys,F_CONSTR) > 0 ||
 +         gmx_mtop_ftype_count(sys,F_SETTLE) > 0))
 +    {
 +        sprintf(warn_buf,"With constraints integrator %s is less accurate, consider using %s instead",ei_names[ir->eI],ei_names[eiSD2]);
 +        warning_note(wi,warn_buf);
 +    }
 +    
 +  bAcc = FALSE;
 +  for(i=0; (i<sys->groups.grps[egcACC].nr); i++) {
 +    for(m=0; (m<DIM); m++) {
 +      if (fabs(ir->opts.acc[i][m]) > 1e-6) {
 +      bAcc = TRUE;
 +      }
 +    }
 +  }
 +  if (bAcc) {
 +    clear_rvec(acc);
 +    snew(mgrp,sys->groups.grps[egcACC].nr);
 +    aloop = gmx_mtop_atomloop_all_init(sys);
 +    while (gmx_mtop_atomloop_all_next(aloop,&i,&atom)) {
 +      mgrp[ggrpnr(&sys->groups,egcACC,i)] += atom->m;
 +    }
 +    mt = 0.0;
 +    for(i=0; (i<sys->groups.grps[egcACC].nr); i++) {
 +      for(m=0; (m<DIM); m++)
 +      acc[m] += ir->opts.acc[i][m]*mgrp[i];
 +      mt += mgrp[i];
 +    }
 +    for(m=0; (m<DIM); m++) {
 +      if (fabs(acc[m]) > 1e-6) {
 +      const char *dim[DIM] = { "X", "Y", "Z" };
 +      fprintf(stderr,
 +              "Net Acceleration in %s direction, will %s be corrected\n",
 +              dim[m],ir->nstcomm != 0 ? "" : "not");
 +      if (ir->nstcomm != 0 && m < ndof_com(ir)) {
 +        acc[m] /= mt;
 +        for (i=0; (i<sys->groups.grps[egcACC].nr); i++)
 +          ir->opts.acc[i][m] -= acc[m];
 +      }
 +      }
 +    }
 +    sfree(mgrp);
 +  }
 +
 +  if (ir->efep != efepNO && ir->fepvals->sc_alpha != 0 &&
 +      !gmx_within_tol(sys->ffparams.reppow,12.0,10*GMX_DOUBLE_EPS)) {
 +    gmx_fatal(FARGS,"Soft-core interactions are only supported with VdW repulsion power 12");
 +  }
 +
 +  if (ir->ePull != epullNO) {
 +    if (ir->pull->grp[0].nat == 0) {
 +        absolute_reference(ir,sys,FALSE,AbsRef);
 +      for(m=0; m<DIM; m++) {
 +      if (ir->pull->dim[m] && !AbsRef[m]) {
 +        warning(wi,"You are using an absolute reference for pulling, but the rest of the system does not have an absolute reference. This will lead to artifacts.");
 +        break;
 +      }
 +      }
 +    }
 +
 +    if (ir->pull->eGeom == epullgDIRPBC) {
 +      for(i=0; i<3; i++) {
 +      for(m=0; m<=i; m++) {
 +        if ((ir->epc != epcNO && ir->compress[i][m] != 0) ||
 +            ir->deform[i][m] != 0) {
 +          for(g=1; g<ir->pull->ngrp; g++) {
 +            if (ir->pull->grp[g].vec[m] != 0) {
 +              gmx_fatal(FARGS,"Can not have dynamic box while using pull geometry '%s' (dim %c)",EPULLGEOM(ir->pull->eGeom),'x'+m);
 +            }
 +          }
 +        }
 +      }
 +      }
 +    }
 +  }
 +
 +  check_disre(sys);
 +}
 +
 +void double_check(t_inputrec *ir,matrix box,gmx_bool bConstr,warninp_t wi)
 +{ /* Checks box, constraint-algorithm and cut-off settings of ir; problems are reported through wi, and ir->LincsWarnAngle is capped at 90. */
 +  real min_size;
 +  gmx_bool bTWIN;
 +  char warn_buf[STRLEN];
 +  const char *ptr;
 +  
 +  ptr = check_box(ir->ePBC,box); /* a non-NULL result is passed on as an error message */
 +  if (ptr) {
 +      warning_error(wi,ptr);
 +  }  
 +
 +  if (bConstr && ir->eConstrAlg == econtSHAKE) {
 +    if (ir->shake_tol <= 0.0) {
 +      sprintf(warn_buf,"ERROR: shake-tol must be > 0 instead of %g\n",
 +              ir->shake_tol);
 +      warning_error(wi,warn_buf);
 +    }
 +
 +    if (IR_TWINRANGE(*ir) && ir->nstlist > 1) {
 +      sprintf(warn_buf,"With twin-range cut-off's and SHAKE the virial and the pressure are incorrect.");
 +      if (ir->epc == epcNO) { /* without pressure coupling this is only a warning, with it an error */
 +      warning(wi,warn_buf);
 +      } else {
 +          warning_error(wi,warn_buf);
 +      }
 +    }
 +  }
 +
 +  if( (ir->eConstrAlg == econtLINCS) && bConstr) {
 +    /* If we have Lincs constraints: */
 +    if(ir->eI==eiMD && ir->etc==etcNO &&
 +       ir->eConstrAlg==econtLINCS && ir->nLincsIter==1) { /* the eConstrAlg test repeats the enclosing condition */
 +      sprintf(warn_buf,"For energy conservation with LINCS, lincs_iter should be 2 or larger.\n");
 +      warning_note(wi,warn_buf);
 +    }
 +    
 +    if ((ir->eI == eiCG || ir->eI == eiLBFGS) && (ir->nProjOrder<8)) {
 +      sprintf(warn_buf,"For accurate %s with LINCS constraints, lincs-order should be 8 or more.",ei_names[ir->eI]);
 +      warning_note(wi,warn_buf);
 +    }
 +    if (ir->epc==epcMTTK) {
 +        warning_error(wi,"MTTK not compatible with lincs -- use shake instead.");
 +    }
 +  }
 +
 +  if (ir->LincsWarnAngle > 90.0) {
 +    sprintf(warn_buf,"lincs-warnangle can not be larger than 90 degrees, setting it to 90.\n");
 +    warning(wi,warn_buf);
 +    ir->LincsWarnAngle = 90.0; /* the only field of ir this function modifies */
 +  }
 +
 +  if (ir->ePBC != epbcNONE) {
 +    if (ir->nstlist == 0) {
 +      warning(wi,"With nstlist=0 atoms are only put into the box at step 0, therefore drifting atoms might cause the simulation to crash.");
 +    }
 +    bTWIN = (ir->rlistlong > ir->rlist); /* only selects which parameter name the grid-search error mentions */
 +    if (ir->ns_type == ensGRID) {
 +      if (sqr(ir->rlistlong) >= max_cutoff2(ir->ePBC,box)) {
 +          sprintf(warn_buf,"ERROR: The cut-off length is longer than half the shortest box vector or longer than the smallest box diagonal element. Increase the box size or decrease %s.\n",
 +              bTWIN ? (ir->rcoulomb==ir->rlistlong ? "rcoulomb" : "rvdw"):"rlist");
 +          warning_error(wi,warn_buf);
 +      }
 +    } else {
 +      min_size = min(box[XX][XX],min(box[YY][YY],box[ZZ][ZZ])); /* smallest diagonal box element */
 +      if (2*ir->rlistlong >= min_size) {
 +          sprintf(warn_buf,"ERROR: One of the box lengths is smaller than twice the cut-off length. Increase the box size or decrease rlist.");
 +          warning_error(wi,warn_buf);
 +      if (TRICLINIC(box)) /* unbraced: guards only the fprintf on the next line */
 +        fprintf(stderr,"Grid search might allow larger cut-off's than simple search with triclinic boxes.");
 +      }
 +    }
 +  }
 +}
 +
 +void check_chargegroup_radii(const gmx_mtop_t *mtop,const t_inputrec *ir,
 +                             rvec *x,
 +                             warninp_t wi)
 +{ /* Prints the charge group radii obtained from calc_chargegroup_radii() and reports through wi when their sums exceed rlist or the list buffer. */
 +    real rvdw1,rvdw2,rcoul1,rcoul2;
 +    char warn_buf[STRLEN];
 +
 +    calc_chargegroup_radii(mtop,x,&rvdw1,&rvdw2,&rcoul1,&rcoul2);
 +
 +    if (rvdw1 > 0)
 +    {
 +        printf("Largest charge group radii for Van der Waals: %5.3f, %5.3f nm\n",
 +               rvdw1,rvdw2);
 +    }
 +    if (rcoul1 > 0)
 +    {
 +        printf("Largest charge group radii for Coulomb:       %5.3f, %5.3f nm\n",
 +               rcoul1,rcoul2);
 +    }
 +
 +    if (ir->rlist > 0) /* no checks are done when rlist is not positive */
 +    {
 +        if (rvdw1  + rvdw2  > ir->rlist ||
 +            rcoul1 + rcoul2 > ir->rlist)
 +        {
 +            sprintf(warn_buf,"The sum of the two largest charge group radii (%f) is larger than rlist (%f)\n",max(rvdw1+rvdw2,rcoul1+rcoul2),ir->rlist);
 +            warning(wi,warn_buf);
 +        }
 +        else
 +        {
 +            /* Here we use the ..._IS_ZERO_AT_CUTOFF macros rather than the
 +             * ..._MIGHT_BE_ZERO_AT_CUTOFF ones, since user defined
 +             * interactions might purposely not be zero at the cut-off.
 +             */
 +            if (EVDW_IS_ZERO_AT_CUTOFF(ir->vdwtype) &&
 +                rvdw1 + rvdw2 > ir->rlist - ir->rvdw)
 +            {
 +                sprintf(warn_buf,"The sum of the two largest charge group radii (%f) is larger than rlist (%f) - rvdw (%f)\n",
 +                        rvdw1+rvdw2,
 +                        ir->rlist,ir->rvdw);
 +                if (ir_NVE(ir)) /* a warning when ir_NVE(ir) holds, otherwise only a note */
 +                {
 +                    warning(wi,warn_buf);
 +                }
 +                else
 +                {
 +                    warning_note(wi,warn_buf);
 +                }
 +            }
 +            if (EEL_IS_ZERO_AT_CUTOFF(ir->coulombtype) &&
 +                rcoul1 + rcoul2 > ir->rlistlong - ir->rcoulomb)
 +            {
 +                sprintf(warn_buf,"The sum of the two largest charge group radii (%f) is larger than %s (%f) - rcoulomb (%f)\n",
 +                        rcoul1+rcoul2,
 +                        ir->rlistlong > ir->rlist ? "rlistlong" : "rlist", /* label only; the value printed is always ir->rlistlong */
 +                        ir->rlistlong,ir->rcoulomb);
 +                if (ir_NVE(ir)) /* same severity rule as the VdW check above */
 +                {
 +                    warning(wi,warn_buf);
 +                }
 +                else
 +                {
 +                    warning_note(wi,warn_buf);
 +                }
 +            }
 +        }
 +    }
 +}
index f7d6e4aff5670ef0bc4acf3163cacd9a2e73007b,0000000000000000000000000000000000000000..ecdba0f815f45593a923169d25990342793154db
mode 100644,000000..100644
--- /dev/null
@@@ -1,149 -1,0 +1,149 @@@
- void gmx_init_intra_counters(t_commrec *cr);
- /* Initializes intra-node process counts and ID. */
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gromacs Runs On Most of All Computer Systems
 + */
 +
 +#ifndef _network_h
 +#define _network_h
 +
 +
 +/*
 + * This module defines the interface of the actual communication routines.
 + */
 +
 +#include <stdio.h>
 +
 +#include "types/simple.h"
 +#include "types/commrec.h"
 +#include "typedefs.h"
 +#include "main.h"
 +#include "gmx_fatal.h"
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +
 +int gmx_setup(int *argc,char **argv,int *nnodes);
 +/* Initializes the parallel communication, return the ID of the node */
 +
 +int gmx_node_num(void);
 +/* return the number of nodes in the ring */
 +
 +int gmx_node_rank(void);
 +/* return the rank of the node */
 +
 +int gmx_hostname_num(void);
 +/* If the first part of the hostname (up to the first dot) ends with a number, returns this number.
 +   If the first part of the hostname does not end in a number (0-9 characters), returns 0.
 +*/
 +
 +void gmx_setup_nodecomm(FILE *fplog,t_commrec *cr);
 +/* Sets up fast global communication for clusters with multi-core nodes */
 +
++void gmx_init_intranode_counters(t_commrec *cr);
++/* Initializes intra-physical-node MPI process/thread counts and ID. */
 +
 +gmx_bool gmx_mpi_initialized(void);
 +/* return TRUE when MPI_Init has been called.
 + * return FALSE when MPI_Init has not been called OR
 + * when GROMACS was compiled without MPI support.
 + */
 +
 +void gmx_barrier(const t_commrec *cr);
 +/* Wait till all processes in cr->mpi_comm_mygroup have reached the barrier */
 +
 +void gmx_bcast(int nbytes,void *b,const t_commrec *cr);
 +/* Broadcast nbytes bytes from the master to cr->mpi_comm_mygroup */
 +
 +void gmx_bcast_sim(int nbytes,void *b,const t_commrec *cr);
 +/* Broadcast nbytes bytes from the sim master to cr->mpi_comm_mysim */
 +
 +void gmx_sumi(int nr,int r[],const t_commrec *cr);
 +/* Calculate the global sum of an array of ints */
 +
 +void gmx_sumli(int nr,gmx_large_int_t r[],const t_commrec *cr);
 +/* Calculate the global sum of an array of large ints */
 +
 +void gmx_sumf(int nr,float r[],const t_commrec *cr);
 +/* Calculate the global sum of an array of floats */
 +
 +void gmx_sumd(int nr,double r[],const t_commrec *cr);
 +/* Calculate the global sum of an array of doubles */
 +
 +void gmx_sumf_comm(int nr,float r[],MPI_Comm mpi_comm);
 +/* Calculate the global sum of an array of floats */
 +
 +void gmx_sumd_comm(int nr,double r[],MPI_Comm mpi_comm);
 +/* Calculate the global sum of an array of doubles */
 +
 +void gmx_sumi_sim(int nr,int r[],const gmx_multisim_t *ms);
 +/* Calculate the sum over the simulations of an array of ints */
 +
 +void gmx_sumli_sim(int nr,gmx_large_int_t r[],const gmx_multisim_t *ms);
 +/* Calculate the sum over the simulations of an array of large ints */
 +
 +void gmx_sumf_sim(int nr,float r[],const gmx_multisim_t *ms);
 +/* Calculate the sum over the simulations of an array of floats */
 +
 +void gmx_sumd_sim(int nr,double r[],const gmx_multisim_t *ms);
 +/* Calculate the sum over the simulations of an array of doubles */
 +
 +void gmx_abort(int nodeid,int nnodes,int errorno);
 +/* Abort the parallel run */
 +
 +void gmx_finalize_par(void);
 +/* Finish the parallel run in an ordered manner */
 +
 +#ifdef GMX_DOUBLE
 +#define gmx_sum_comm  gmx_sumd_comm
 +#define gmx_sum       gmx_sumd
 +#define gmx_sum_sim   gmx_sumd_sim
 +#else
 +#define gmx_sum_comm  gmx_sumf_comm
 +#define gmx_sum       gmx_sumf
 +#define gmx_sum_sim   gmx_sumf_sim
 +#endif
 +
 +#ifdef DEBUG_GMX
 +#define debug_gmx() do { FILE *fp=debug ? debug : stderr;\
 +if (bDebugMode()) fprintf(fp,"NODEID=%d, %s  %d\n",gmx_mpi_initialized() ? gmx_node_rank() : -1,__FILE__,__LINE__); fflush(fp); } while (0)
 +#else
 +#define debug_gmx()
 +#endif
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +
 +#endif        /* _network_h */
index dddbc4c998f01bff84d3a4401d0abae5f00dbfe6,0000000000000000000000000000000000000000..3fea4ae4080275a7f1c30a8334c157fac9f67b67
mode 100644,000000..100644
--- /dev/null
@@@ -1,326 -1,0 +1,326 @@@
-   /* intra-node stuff */
-   int nodeid_intra;         /* ID over all intra nodes */ 
-   int nodeid_group_intra;   /* ID within my group (separate 0-n IDs for PP/PME-only nodes) */
-   int nnodes_intra;         /* total number of intra nodes */
-   int nnodes_pp_intra;      /* total number of PP intra nodes */
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GRoups of Organic Molecules in ACtion for Science
 + */
 +#ifndef _commrec_h
 +#define _commrec_h
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#else
 +#ifdef GMX_THREAD_MPI
 +#include "../thread_mpi/tmpi.h"
 +#include "../thread_mpi/mpi_bindings.h"
 +#else
 +typedef void* MPI_Comm;
 +typedef void* MPI_Request;
 +typedef void* MPI_Group;
 +#define MPI_COMM_NULL NULL
 +#endif
 +#endif
 +
 +#include "idef.h"
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +
 +
 +#define DD_MAXZONE  8
 +#define DD_MAXIZONE 4
 +
 +typedef struct gmx_domdec_master *gmx_domdec_master_p_t;
 +
 +typedef struct {
 +  int  j0;       /* j-zone start               */
 +  int  j1;       /* j-zone end                 */
 +  int  cg1;      /* i-charge-group end         */
 +  int  jcg0;     /* j-charge-group start       */
 +  int  jcg1;     /* j-charge-group end         */
 +  ivec shift0;   /* Minimum shifts to consider */
 +  ivec shift1;   /* Maximum shifts to consider */
 +} gmx_domdec_ns_ranges_t;
 +
 +typedef struct {
 +  rvec x0;       /* Zone lower corner in triclinic coordinates         */
 +  rvec x1;       /* Zone upper corner in triclinic coordinates         */
 +  rvec bb_x0;    /* Zone bounding box lower corner in Cartesian coords */
 +  rvec bb_x1;    /* Zone bounding box upper corner in Cartesian coords */
 +} gmx_domdec_zone_size_t;
 +
 +typedef struct {
 +  /* The number of zones including the home zone */
 +  int  n;
 +  /* The shift of the zones with respect to the home zone */
 +  ivec shift[DD_MAXZONE];
 +  /* The charge group boundaries for the zones */
 +  int  cg_range[DD_MAXZONE+1];
 +  /* The number of neighbor search zones with i-particles */
 +  int  nizone;
 +  /* The neighbor search charge group ranges for each i-zone */
 +  gmx_domdec_ns_ranges_t izone[DD_MAXIZONE];
 +  /* Boundaries of the zones */
 +  gmx_domdec_zone_size_t size[DD_MAXZONE];
 +  /* The cg density of the home zone */
 +  real dens_zone0;
 +} gmx_domdec_zones_t;
 +
 +typedef struct gmx_ga2la *gmx_ga2la_t;
 +
 +typedef struct gmx_hash *gmx_hash_t;
 +
 +typedef struct gmx_reverse_top *gmx_reverse_top_p_t;
 +
 +typedef struct gmx_domdec_constraints *gmx_domdec_constraints_p_t;
 +
 +typedef struct gmx_domdec_specat_comm *gmx_domdec_specat_comm_p_t;
 +
 +typedef struct gmx_domdec_comm *gmx_domdec_comm_p_t;
 +
 +typedef struct gmx_pme_comm_n_box *gmx_pme_comm_n_box_p_t;
 +
 +typedef struct {
 +  int  npbcdim;
 +  int  nboundeddim;
 +  rvec box0;
 +  rvec box_size;
 +  /* Tells if the box is skewed for each of the three cartesian directions */
 +  ivec tric_dir;
 +  rvec skew_fac;
 +  /* Orthogonal vectors for triclinic cells, Cartesian index */
 +  rvec v[DIM][DIM];
 +  /* Normal vectors for the cells walls */
 +  rvec normal[DIM];
 +} gmx_ddbox_t;
 +
 +
 +typedef struct {
 +  /* these buffers are used as destination buffers if MPI_IN_PLACE isn't
 +     supported.*/
 +  int *ibuf; /* for ints */
 +  int ibuf_alloc;
 +
 +  gmx_large_int_t *libuf;
 +  int libuf_alloc;
 +
 +  float *fbuf; /* for floats */
 +  int fbuf_alloc;
 +
 +  double *dbuf; /* for doubles */
 +  int dbuf_alloc;
 +} mpi_in_place_buf_t;
 +
 +
 +typedef struct {
 +  /* The DD particle-particle nodes only */
 +  /* The communication setup within the communicator all
 +   * defined in dd->comm in domdec.c
 +   */
 +  int  nnodes;
 +  MPI_Comm mpi_comm_all;
 +  /* Use MPI_Sendrecv communication instead of non-blocking calls */
 +  gmx_bool bSendRecv2;
 +  /* The local DD cell index and rank */
 +  ivec ci;
 +  int  rank;
 +  ivec master_ci;
 +  int  masterrank;
 +  /* Communication with the PME only nodes */
 +  int  pme_nodeid;
 +  gmx_bool pme_receive_vir_ener;
 +  gmx_pme_comm_n_box_p_t cnb;
 +  int  nreq_pme;
 +  MPI_Request req_pme[4];
 +  
 +
 +  /* The communication setup, identical for each cell, cartesian index */
 +  ivec nc;
 +  int  ndim;
 +  ivec dim;  /* indexed by 0 to ndim */
 +  gmx_bool bGridJump;
 +
 +  /* PBC from dim 0 to npbcdim */
 +  int npbcdim;
 +
 +  /* Screw PBC? */
 +  gmx_bool bScrewPBC;
 +
 +  /* Forward and backward neighboring cells, indexed by 0 to ndim */
 +  int  neighbor[DIM][2];
 +
 +  /* Only available on the master node */
 +  gmx_domdec_master_p_t ma;
 +
 +  /* Are there inter charge group constraints */
 +  gmx_bool bInterCGcons;
 +  gmx_bool bInterCGsettles;
 +
 +  /* Global atom number to interaction list */
 +  gmx_reverse_top_p_t reverse_top;
 +  int  nbonded_global;
 +  int  nbonded_local;
 +
 +  /* The number of inter charge-group exclusions */
 +  int  n_intercg_excl;
 +
 +  /* Vsite stuff */
 +  gmx_hash_t  ga2la_vsite;
 +  gmx_domdec_specat_comm_p_t vsite_comm;
 +
 +  /* Constraint stuff */
 +  gmx_domdec_constraints_p_t constraints;
 +  gmx_domdec_specat_comm_p_t constraint_comm;
 +
 +  /* The local to global charge group index and local cg to local atom index */
 +  int  ncg_home;
 +  int  ncg_tot;
 +  int  *index_gl;
 +  int  *cgindex;
 +  int  cg_nalloc;
 +  /* Local atom to local cg index, only for special cases */
 +  int  *la2lc;
 +  int  la2lc_nalloc;
 +
 +  /* The number of home atoms */
 +  int  nat_home;
 +  /* The total number of atoms: home and received zones */
 +  int  nat_tot;
 +  /* Index from the local atoms to the global atoms */
 +  int  *gatindex;
 +  int  gatindex_nalloc;
 +
 +  /* Global atom number to local atom number list */
 +  gmx_ga2la_t ga2la;
 +
 +  /* Communication stuff */
 +  gmx_domdec_comm_p_t comm;
 +
 +  /* The partitioning count, to keep track of the state */
 +  gmx_large_int_t ddp_count;
 +
 +
 +  /* gmx_pme_recv_f buffer */
 +  int pme_recv_f_alloc;
 +  rvec *pme_recv_f_buf;
 +
 +} gmx_domdec_t;
 +
 +typedef struct gmx_partdec *gmx_partdec_p_t;
 +
 +typedef struct {
 +  int nsim;
 +  int sim;
 +  MPI_Group mpi_group_masters;
 +  MPI_Comm mpi_comm_masters;
 +  /* these buffers are used as destination buffers if MPI_IN_PLACE isn't
 +     supported.*/
 +  mpi_in_place_buf_t *mpb;
 +} gmx_multisim_t;
 +
 +#define DUTY_PP  (1<<0)
 +#define DUTY_PME (1<<1)
 +
 +typedef struct {
 +  int      bUse;
 +  MPI_Comm comm_intra;
 +  int      rank_intra;
 +  MPI_Comm comm_inter;
 +  
 +} gmx_nodecomm_t;
 +
 +typedef struct {
 +  /* The nodeids in one sim are numbered sequentially from 0.
 +   * All communication within some simulation should happen
 +   * in mpi_comm_mysim, or its subset mpi_comm_mygroup.
 +   */
 +  int sim_nodeid,nnodes,npmenodes;
 +
 +  /* thread numbers: */
 +  /* Not used yet: int threadid, nthreads; */
 +  /* The nodeid in the PP/PME, PP or PME group */
 +  int nodeid;
 +  MPI_Comm mpi_comm_mysim;
 +  MPI_Comm mpi_comm_mygroup;
 +
++  /* MPI ranks within a physical node for hardware access */
++  int nrank_intranode;    /* nr of ranks on this physical node */
++  int rank_intranode;     /* our rank on this physical node */
++  int nrank_pp_intranode; /* as nrank_intranode, for particle-particle only */
++  int rank_pp_intranode;  /* as rank_intranode, for particle-particle only */
 +
 +  gmx_nodecomm_t nc;
 +  
 +  /* For domain decomposition */
 +  gmx_domdec_t *dd;
 +
 +  /* For particle decomposition */
 +  gmx_partdec_p_t pd;
 +
 +  /* The duties of this node, see the defines above */
 +  int duty;
 +
 +  gmx_multisim_t *ms;
 +
 +  /* these buffers are used as destination buffers if MPI_IN_PLACE isn't
 +     supported.*/
 +  mpi_in_place_buf_t *mpb;
 +} t_commrec;
 +
 +#define MASTERNODE(cr)     (((cr)->nodeid == 0) || !PAR(cr))
 +  /* #define MASTERTHREAD(cr)   ((cr)->threadid == 0) */
 +  /* #define MASTER(cr)         (MASTERNODE(cr) && MASTERTHREAD(cr)) */
 +#define MASTER(cr)         MASTERNODE(cr)
 +#define SIMMASTER(cr)      ((MASTER(cr) && ((cr)->duty & DUTY_PP)) || !PAR(cr))
 +#define NODEPAR(cr)        ((cr)->nnodes > 1)
 +  /* #define THREADPAR(cr)      ((cr)->nthreads > 1) */
 +  /* #define PAR(cr)            (NODEPAR(cr) || THREADPAR(cr)) */
 +#define PAR(cr)            NODEPAR(cr)
 +#define RANK(cr,nodeid)    (nodeid)
 +#define MASTERRANK(cr)     (0)
 +
 +#define DOMAINDECOMP(cr)   (((cr)->dd != NULL) && PAR(cr))
 +#define DDMASTER(dd)       ((dd)->rank == (dd)->masterrank)
 +
 +#define PARTDECOMP(cr)     ((cr)->pd != NULL)
 +
 +#define MULTISIM(cr)       ((cr)->ms)
 +#define MSRANK(ms,nodeid)  (nodeid)
 +#define MASTERSIM(ms)      ((ms)->sim == 0)
 +
 +/* The master of all (the node that prints the remaining run time etc.) */
 +#define MULTIMASTER(cr)    (SIMMASTER(cr) && (!MULTISIM(cr) || MASTERSIM((cr)->ms)))
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +#endif
index 986dce8872bb89c4c38ca2a1165528556448d6f2,0000000000000000000000000000000000000000..a7dbc89fab86ad7e8ed42f014dd132d814a83c13
mode 100644,000000..100644
--- /dev/null
@@@ -1,442 -1,0 +1,449 @@@
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GRoups of Organic Molecules in ACtion for Science
 + */
 +
 +#ifndef ENUMS_H_
 +#define ENUMS_H_
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +#if 0
 +} /* fixes auto-indentation problems */
 +#endif
 +
 +/* note: these enums should correspond to the names in gmxlib/names.c */
 +
 +enum {
 +  epbcXYZ, epbcNONE, epbcXY, epbcSCREW, epbcNR
 +};
 +
 +enum {
 +  etcNO, etcBERENDSEN, etcNOSEHOOVER, etcYES, etcANDERSEN, etcANDERSENMASSIVE, etcVRESCALE, etcNR
 +}; /* yes is an alias for berendsen */
 +
 +#define ETC_ANDERSEN(e) (((e) == etcANDERSENMASSIVE) || ((e) == etcANDERSEN))
 +
 +enum {
 +  epcNO, epcBERENDSEN, epcPARRINELLORAHMAN, epcISOTROPIC, epcMTTK, epcNR
 +}; /* isotropic is an alias for berendsen */
 +
 +/* trotter decomposition extended variable parts */
 +enum {
 +  etrtNONE, etrtNHC, etrtBAROV, etrtBARONHC, etrtNHC2, etrtBAROV2, etrtBARONHC2, 
 +  etrtVELOCITY1, etrtVELOCITY2, etrtPOSITION, etrtSKIPALL, etrtNR
 +};
 +
 +/* sequenced parts of the trotter decomposition */
 +enum {
 +  ettTSEQ0,  ettTSEQ1,  ettTSEQ2,  ettTSEQ3,  ettTSEQ4, ettTSEQMAX
 +};
 +
 +enum {
 +  epctISOTROPIC, epctSEMIISOTROPIC, epctANISOTROPIC,
 +  epctSURFACETENSION, epctNR
 +};
 +
 +enum {
 +  erscNO, erscALL, erscCOM, erscNR
 +};
 +
 +enum {
 +  ecutsGROUP, ecutsVERLET, ecutsNR
 +};
 +
 +/* Coulomb / VdW interaction modifiers.
 + * grompp replaces eintmodPOTSHIFT_VERLET by eintmodPOTSHIFT or eintmodNONE.
 + * Exactcutoff is only used by Reaction-field-zero, and is not user-selectable.
 + */
 +enum eintmod {
 +    eintmodPOTSHIFT_VERLET, eintmodPOTSHIFT, eintmodNONE, eintmodPOTSWITCH, eintmodEXACTCUTOFF, eintmodNR
 +};
 +
 +/*
 + * eelGB_NOTUSED used to be GB, but to enable generalized born with different
 + * forms of electrostatics (RF, switch, etc.) in the future it is now selected
 + * separately (through the implicit_solvent option).
 + */
 +enum {
 +  eelCUT,     eelRF,     eelGRF,   eelPME,  eelEWALD,  eelP3M_AD, 
 +  eelPOISSON, eelSWITCH, eelSHIFT, eelUSER, eelGB_NOTUSED, eelRF_NEC, eelENCADSHIFT, 
 +  eelPMEUSER, eelPMESWITCH, eelPMEUSERSWITCH, eelRF_ZERO, eelNR
 +};
 +
 +/* Ewald geometry */
 +enum { 
 +  eewg3D, eewg3DC, eewgNR
 +};
 +
 +#define EEL_RF(e) ((e) == eelRF || (e) == eelGRF || (e) == eelRF_NEC || (e) == eelRF_ZERO )
 +
 +#define EEL_PME(e)  ((e) == eelPME || (e) == eelPMESWITCH || (e) == eelPMEUSER || (e) == eelPMEUSERSWITCH || (e) == eelP3M_AD)
 +#define EEL_FULL(e) (EEL_PME(e) || (e) == eelPOISSON || (e) == eelEWALD)
 +
 +#define EEL_SWITCHED(e) ((e) == eelSWITCH || (e) == eelSHIFT || (e) == eelENCADSHIFT || (e) == eelPMESWITCH || (e) == eelPMEUSERSWITCH)
 +
 +#define EEL_USER(e) ((e) == eelUSER || (e) == eelPMEUSER || (e) == (eelPMESWITCH))
 +
 +#define EEL_IS_ZERO_AT_CUTOFF(e) (EEL_SWITCHED(e) || (e) == eelRF_ZERO)
 +
 +#define EEL_MIGHT_BE_ZERO_AT_CUTOFF(e) (EEL_IS_ZERO_AT_CUTOFF(e) || (e) == eelUSER || (e) == eelPMEUSER)
 +
 +enum {
 +  evdwCUT, evdwSWITCH, evdwSHIFT, evdwUSER, evdwENCADSHIFT, evdwNR
 +};
 +
 +#define EVDW_SWITCHED(e) ((e) == evdwSWITCH || (e) == evdwSHIFT || (e) == evdwENCADSHIFT)
 +
 +#define EVDW_IS_ZERO_AT_CUTOFF(e) EVDW_SWITCHED(e)
 +
 +#define EVDW_MIGHT_BE_ZERO_AT_CUTOFF(e) (EVDW_IS_ZERO_AT_CUTOFF(e) || (e) == evdwUSER)
 +
 +enum { 
 +  ensGRID, ensSIMPLE, ensNR
 +};
 +
 +/* eiVV is normal velocity verlet -- eiVVAK uses 1/2*(KE(t-dt/2)+KE(t+dt/2)) as the kinetic energy, and the half step kinetic
 +   energy for temperature control */
 +
 +enum {
 +  eiMD, eiSteep, eiCG, eiBD, eiSD2, eiNM, eiLBFGS, eiTPI, eiTPIC, eiSD1, eiVV, eiVVAK, eiNR
 +};
 +#define EI_VV(e) ((e) == eiVV || (e) == eiVVAK)
 +#define EI_MD(e) ((e) == eiMD || EI_VV(e))
 +#define EI_SD(e) ((e) == eiSD1 || (e) == eiSD2)
 +#define EI_RANDOM(e) (EI_SD(e) || (e) == eiBD)
 +/*above integrators may not conserve momenta*/
 +#define EI_DYNAMICS(e) (EI_MD(e) || EI_SD(e) || (e) == eiBD)
 +#define EI_ENERGY_MINIMIZATION(e) ((e) == eiSteep || (e) == eiCG || (e) == eiLBFGS)
 +#define EI_TPI(e) ((e) == eiTPI || (e) == eiTPIC)
 +
 +#define EI_STATE_VELOCITY(e) (EI_MD(e) || EI_SD(e))
 +
 +enum {
 +  econtLINCS, econtSHAKE, econtNR
 +};
 +
 +enum {
 +  edrNone, edrSimple, edrEnsemble, edrNR
 +};
 +
 +enum {
 +  edrwConservative, edrwEqual, edrwNR
 +};
 +
 +/* Combination rule things */
 +enum { 
 +  eCOMB_NONE, eCOMB_GEOMETRIC, eCOMB_ARITHMETIC, eCOMB_GEOM_SIG_EPS, eCOMB_NR 
 +};
 +
 +/* NBF selection */
 +enum { 
 +  eNBF_NONE, eNBF_LJ, eNBF_BHAM, eNBF_NR 
 +};
 +
 +/* simulated tempering methods */
 +enum {
 +  esimtempGEOMETRIC, esimtempEXPONENTIAL, esimtempLINEAR, esimtempNR
 +};
 +/* FEP selection */
 +enum {
 +  efepNO, efepYES, efepSTATIC, efepSLOWGROWTH, efepEXPANDED, efepNR
 +};
 +  /* if efepNO, there are no evaluations at other states.
 +     if efepYES, treated equivalently to efepSTATIC.
 +     if efepSTATIC, then lambdas do not change during the simulation.
 +     if efepSLOWGROWTH, then the states change monotonically throughout the simulation.
 +     if efepEXPANDED, then expanded ensemble simulations are occurring.
 +  */
 +
 +/* FEP coupling types */
 +enum {
 +  efptFEP,efptMASS,efptCOUL,efptVDW,efptBONDED,efptRESTRAINT,efptTEMPERATURE,efptNR
 +};
 +
 +/* How the lambda weights are calculated:
 +   elamstatsMETROPOLIS = using the metropolis criteria
 +   elamstatsBARKER = using the Barker criteria for transition weights - also called unoptimized Bennett
 +   elamstatsMINVAR = using Barker + minimum variance for weights
 +   elamstatsWL = Wang-Landau (using visitation counts)
 +   elamstatsWWL = Weighted Wang-Landau (using optimized gibbs weighted visitation counts)
 +*/
 +enum {
 +  elamstatsNO, elamstatsMETROPOLIS, elamstatsBARKER, elamstatsMINVAR, elamstatsWL, elamstatsWWL, elamstatsNR
 +};
 +
 +#define ELAMSTATS_EXPANDED(e) ((e) > elamstatsNO)
 +
 +#define EWL(e) ((e) == elamstatsWL || (e) == elamstatsWWL)
 +
 +/* How moves in lambda are calculated:
 +   elmcmoveMETROPOLIS - using the Metropolis criteria, and 50% up and down
 +   elmcmoveBARKER - using the Barker criteria, and 50% up and down
 +   elmcmoveGIBBS - computing the transition using the marginalized probabilities of the lambdas
 +   elmcmoveMETGIBBS - computing the transition using the metropolized version of Gibbs (Monte Carlo Strategies in Scientific computing, Liu, p. 134)
 +*/
 +enum {
 +  elmcmoveNO,elmcmoveMETROPOLIS, elmcmoveBARKER, elmcmoveGIBBS, elmcmoveMETGIBBS, elmcmoveNR
 +};
 +
 +/* how we decide whether weights have reached equilibrium
 +   elmceqNO - never stop, weights keep going
 +   elmceqYES - fix the weights from the beginning; no movement
 +   elmceqWLDELTA - stop when the WL-delta falls below a certain level
 +   elmceqNUMATLAM - stop when we have a certain number of samples at every step
 +   elmceqSTEPS - stop when we've run a certain total number of steps
 +   elmceqSAMPLES - stop when we've run a certain total number of samples
 +   elmceqRATIO - stop when the ratio of samples (lowest to highest) is sufficiently large
 +*/
 +enum {
 +  elmceqNO,elmceqYES,elmceqWLDELTA,elmceqNUMATLAM,elmceqSTEPS,elmceqSAMPLES,elmceqRATIO,elmceqNR
 +};
 +
 +/* separate_dhdl_file selection */
 +enum
 +{
 +  /* NOTE: YES is the first one. Do NOT interpret this one as a gmx_bool */
 +  esepdhdlfileYES, esepdhdlfileNO, esepdhdlfileNR
 +};
 +
 +/* dhdl_derivatives selection */
 +enum
 +{
 +  /* NOTE: YES is the first one. Do NOT interpret this one as a gmx_bool */
 +  edhdlderivativesYES, edhdlderivativesNO, edhdlderivativesNR
 +};
 +
 +/* Solvent model */
 +enum {
 +  esolNO, esolSPC, esolTIP4P, esolNR
 +};
 +
 +/* Dispersion correction */
 +enum {
 +  edispcNO, edispcEnerPres, edispcEner, edispcAllEnerPres, edispcAllEner, edispcNR
 +}; 
 +
 +/* Shell types, for completion stuff */
 +enum {
 +  eshellCSH, eshellBASH, eshellZSH, eshellNR
 +}; 
 +
 +/* Center of mass motion selection */
 +enum { 
 +  ecmLINEAR, ecmANGULAR, ecmNO, ecmNR 
 +};
 +
 +/* New version of simulated annealing */
 +enum { 
 +  eannNO, eannSINGLE, eannPERIODIC, eannNR 
 +};
 +
 +/* Implicit solvent algorithms */
 +enum { 
 +  eisNO, eisGBSA, eisNR
 +};
 +
 +/* Algorithms for calculating GB radii */
 +enum { 
 +  egbSTILL, egbHCT, egbOBC, egbNR 
 +};
 +
 +enum {
 +  esaAPPROX, esaNO, esaSTILL, esaNR
 +};
 +
 +/* Wall types */
 +enum {
 +  ewt93, ewt104, ewtTABLE, ewt126, ewtNR
 +};
 +
 +/* Pull stuff */
 +enum {
 +  epullNO, epullUMBRELLA, epullCONSTRAINT, epullCONST_F, epullNR
 +};
 +
 +enum {
 +  epullgDIST, epullgDIR, epullgCYL, epullgPOS, epullgDIRPBC, epullgNR
 +};
 +
 +#define PULL_CYL(pull) ((pull)->eGeom == epullgCYL)
 +
 +/* Enforced rotation groups */
 +enum {
 +  erotgISO  , erotgISOPF ,
 +  erotgPM   , erotgPMPF  ,
 +  erotgRM   , erotgRMPF  ,
 +  erotgRM2  , erotgRM2PF ,
 +  erotgFLEX , erotgFLEXT ,
 +  erotgFLEX2, erotgFLEX2T,
 +  erotgNR
 +};
 +
 +enum {
 +    erotgFitRMSD, erotgFitNORM, erotgFitPOT, erotgFitNR
 +};
 +
 +/* QMMM */
 +enum {
 +  eQMmethodAM1, eQMmethodPM3, eQMmethodRHF, 
 +  eQMmethodUHF, eQMmethodDFT, eQMmethodB3LYP, eQMmethodMP2, eQMmethodCASSCF, eQMmethodB3LYPLAN,
 +  eQMmethodDIRECT, eQMmethodNR
 +};
 +
 +enum {
 +  eQMbasisSTO3G, eQMbasisSTO3G2, eQMbasis321G, 
 +  eQMbasis321Gp, eQMbasis321dGp, eQMbasis621G,
 +  eQMbasis631G, eQMbasis631Gp, eQMbasis631dGp, 
 +  eQMbasis6311G, eQMbasisNR
 +};
 +
 +enum {
 +  eQMMMschemenormal,eQMMMschemeoniom,eQMMMschemeNR
 +};
 +
 +enum {
 +  eMultentOptName, eMultentOptNo, eMultentOptLast, eMultentOptNR
 +};
 +
 +/* flat-bottom posres geometries */
 +enum {
 +  efbposresZERO, efbposresSPHERE, efbposresCYLINDER, efbposresX, efbposresY, efbposresZ,
 +  efbposresNR
 +};
 +
 +enum {
 +  eAdressOff,eAdressConst, eAdressXSplit, eAdressSphere, eAdressNR
 +};
 +
 +enum {
 +  eAdressICOff, eAdressICThermoForce, eAdressICNR
 +};
 +
 +enum {
 +  eAdressSITEcom,eAdressSITEcog, eAdressSITEatom, eAdressSITEatomatom, eAdressSITENR
 +};
 +
 +
 +/* The interactions contained in a (possibly merged) table
 + * for computing electrostatic, VDW repulsion and/or VDW dispersion 
 + * contributions.
 + */
 +enum gmx_table_interaction
 +{
 +    GMX_TABLE_INTERACTION_ELEC,
 +    GMX_TABLE_INTERACTION_VDWREP_VDWDISP,
 +    GMX_TABLE_INTERACTION_VDWEXPREP_VDWDISP,
 +    GMX_TABLE_INTERACTION_VDWDISP,
 +    GMX_TABLE_INTERACTION_ELEC_VDWREP_VDWDISP,
 +    GMX_TABLE_INTERACTION_ELEC_VDWEXPREP_VDWDISP,
 +    GMX_TABLE_INTERACTION_ELEC_VDWDISP,
 +    GMX_TABLE_INTERACTION_NR
 +};
 +
 +/* Different formats for table data. Cubic spline tables are typically stored
 + * with the four Y,F,G,H intermediate values (check tables.c for format), which
 + * makes it easy to load with a single 4-way SIMD instruction too.
 + * Linear tables only need one value per table point, or two if both V and F
 + * are calculated. However, with SIMD instructions this makes the loads unaligned,
 + * and in that case we store the data as F, D=F(i+1)-F(i), V, and then a blank value,
 + * which again makes it possible to load as a single instruction.
 + */
 +enum gmx_table_format
 +{
 +    GMX_TABLE_FORMAT_CUBICSPLINE_YFGH,
 +    GMX_TABLE_FORMAT_LINEAR_VF,
 +    GMX_TABLE_FORMAT_LINEAR_V,
 +    GMX_TABLE_FORMAT_LINEAR_F,
 +    GMX_TABLE_FORMAT_LINEAR_FDV0,
 +    GMX_TABLE_FORMAT_NR
 +};
 +
 +/* Neighborlist geometry type.
 + * Kernels will compute interactions between two particles, 
 + * 3-center water, 4-center water or coarse-grained beads.
 + */
 +enum gmx_nblist_kernel_geometry
 +{
 +    GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE,
 +    GMX_NBLIST_GEOMETRY_WATER3_PARTICLE,
 +    GMX_NBLIST_GEOMETRY_WATER3_WATER3,
 +    GMX_NBLIST_GEOMETRY_WATER4_PARTICLE,
 +    GMX_NBLIST_GEOMETRY_WATER4_WATER4,
 +    GMX_NBLIST_GEOMETRY_CG_CG,
 +    GMX_NBLIST_GEOMETRY_NR
 +};
 +
 +/* Types of electrostatics calculations available inside nonbonded kernels.
 + * Note that these do NOT necessarily correspond to the user selections in the MDP file;
 + * many interactions for instance map to tabulated kernels.
 + */
 +enum gmx_nbkernel_elec
 +{
 +    GMX_NBKERNEL_ELEC_NONE,
 +    GMX_NBKERNEL_ELEC_COULOMB,
 +    GMX_NBKERNEL_ELEC_REACTIONFIELD,
 +    GMX_NBKERNEL_ELEC_CUBICSPLINETABLE,
 +    GMX_NBKERNEL_ELEC_GENERALIZEDBORN,
 +    GMX_NBKERNEL_ELEC_EWALD,
 +    GMX_NBKERNEL_ELEC_NR
 +};
 +
 +/* Types of vdw calculations available inside nonbonded kernels.
 + * Note that these do NOT necessarily correspond to the user selections in the MDP file;
 + * many interactions for instance map to tabulated kernels.
 + */
 +enum gmx_nbkernel_vdw
 +{
 +    GMX_NBKERNEL_VDW_NONE,
 +    GMX_NBKERNEL_VDW_LENNARDJONES,
 +    GMX_NBKERNEL_VDW_BUCKINGHAM,
 +    GMX_NBKERNEL_VDW_CUBICSPLINETABLE,
 +    GMX_NBKERNEL_VDW_NR
 +};
++/* Types of interactions inside the neighborlist
++ */
++enum gmx_nblist_interaction_type
++{
++  GMX_NBLIST_INTERACTION_STANDARD,
++  GMX_NBLIST_INTERACTION_FREE_ENERGY,
++  GMX_NBLIST_INTERACTION_ADRESS,
++  GMX_NBLIST_INTERACTION_NR
++};
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +#endif /* ENUMS_H_ */
index 3433adca4173daeae75fe9f0f9cfe08f68d6b40c,0000000000000000000000000000000000000000..e791df979fd4fe0288ee80a6fb8f6105d5eccb68
mode 100644,000000..100644
--- /dev/null
@@@ -1,331 -1,0 +1,331 @@@
-   F_VTEMP,
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GRoups of Organic Molecules in ACtion for Science
 + */
 +
 +
 +#ifndef _idef_h
 +#define _idef_h
 +
 +#include "simple.h"
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +
 +
 +/* check kernel/toppush.c when you change these numbers */
 +#define MAXATOMLIST   6
 +#define MAXFORCEPARAM 12
 +#define NR_RBDIHS     6
 +#define NR_FOURDIHS     4
 +
 +typedef atom_id t_iatom;
 +
 +/* this MUST correspond to the 
 +   t_interaction_function[F_NRE] in gmxlib/ifunc.c */
 +enum {
 +  F_BONDS,
 +  F_G96BONDS,
 +  F_MORSE,
 +  F_CUBICBONDS,
 +  F_CONNBONDS,
 +  F_HARMONIC,
 +  F_FENEBONDS,
 +  F_TABBONDS,
 +  F_TABBONDSNC,
 +  F_RESTRBONDS,
 +  F_ANGLES, 
 +  F_G96ANGLES,
 +  F_LINEAR_ANGLES,
 +  F_CROSS_BOND_BONDS,
 +  F_CROSS_BOND_ANGLES,
 +  F_UREY_BRADLEY,
 +  F_QUARTIC_ANGLES,
 +  F_TABANGLES,
 +  F_PDIHS,
 +  F_RBDIHS, 
 +  F_FOURDIHS,
 +  F_IDIHS, 
 +  F_PIDIHS, 
 +  F_TABDIHS,
 +  F_CMAP,
 +  F_GB12,
 +  F_GB13,
 +  F_GB14,
 +  F_GBPOL,
 +  F_NPSOLVATION,
 +  F_LJ14,
 +  F_COUL14,
 +  F_LJC14_Q,
 +  F_LJC_PAIRS_NB,
 +  F_LJ,
 +  F_BHAM,
 +  F_LJ_LR,
 +  F_BHAM_LR,
 +  F_DISPCORR,
 +  F_COUL_SR,
 +  F_COUL_LR,
 +  F_RF_EXCL,
 +  F_COUL_RECIP,
 +  F_DPD,
 +  F_POLARIZATION,
 +  F_WATER_POL,
 +  F_THOLE_POL,
 +  F_ANHARM_POL,
 +  F_POSRES,
 +  F_FBPOSRES,
 +  F_DISRES,
 +  F_DISRESVIOL,
 +  F_ORIRES,
 +  F_ORIRESDEV,
 +  F_ANGRES,
 +  F_ANGRESZ,
 +  F_DIHRES,
 +  F_DIHRESVIOL,
 +  F_CONSTR,
 +  F_CONSTRNC,
 +  F_SETTLE,
 +  F_VSITE2,
 +  F_VSITE3,
 +  F_VSITE3FD,
 +  F_VSITE3FAD,
 +  F_VSITE3OUT,
 +  F_VSITE4FD,
 +  F_VSITE4FDN,
 +  F_VSITEN,
 +  F_COM_PULL,
 +  F_EQM,
 +  F_EPOT,
 +  F_EKIN,
 +  F_ETOT,
 +  F_ECONSERVED,
 +  F_TEMP,
++  F_VTEMP_NOLONGERUSED,
 +  F_PDISPCORR,
 +  F_PRES,
 +  F_DHDL_CON,
 +  F_DVDL,
 +  F_DKDL,
 +  F_DVDL_COUL,
 +  F_DVDL_VDW,
 +  F_DVDL_BONDED,
 +  F_DVDL_RESTRAINT,
 +  F_DVDL_TEMPERATURE, /* not calculated for now, but should just be the energy (NVT) or enthalpy (NPT), or 0 (NVE) */
 +  F_NRE               /* This number is for the total number of energies      */
 +};
 +
 +#define IS_RESTRAINT_TYPE(ifunc) (((ifunc==F_POSRES) || (ifunc==F_DISRES) || (ifunc==F_RESTRBONDS) || (ifunc==F_DISRESVIOL) || (ifunc==F_ORIRES) || (ifunc==F_ORIRESDEV) || (ifunc==F_ANGRES) || (ifunc == F_ANGRESZ) || (ifunc==F_DIHRES)))
 +
 +/* A macro for checking if ftype is an explicit pair-listed LJ or COULOMB
 + * interaction type:
 + * bonded LJ (usually 1-4), or special listed non-bonded for FEP.
 + */
 +#define IS_LISTED_LJ_C(ftype) ((ftype) >= F_LJ14 && (ftype) <= F_LJC_PAIRS_NB)
 +
 +typedef union
 +{
 +  /* Some parameters have A and B values for free energy calculations.
 +   * The B values are not used for regular simulations of course.
 +   * Free Energy for nonbondeds can be computed by changing the atom type.
 +   * The harmonic type is used for all harmonic potentials:
 +   * bonds, angles and improper dihedrals
 +   */
 +  struct {real a,b,c;                                    } bham;
 +  struct {real rA,krA,rB,krB;                            } harmonic;
 +  struct {real klinA,aA,klinB,aB;                          } linangle;
 +  struct {real lowA,up1A,up2A,kA,lowB,up1B,up2B,kB;        } restraint;
 +  /* No free energy supported for cubic bonds, FENE, WPOL or cross terms */ 
 +  struct {real b0,kb,kcub;                                 } cubic;
 +  struct {real bm,kb;                                      } fene;
 +  struct {real r1e,r2e,krr;                                } cross_bb;
 +  struct {real r1e,r2e,r3e,krt;                            } cross_ba;
 +  struct {real thetaA,kthetaA,r13A,kUBA,thetaB,kthetaB,r13B,kUBB;} u_b;
 +  struct {real theta,c[5];                                 } qangle; 
 +  struct {real alpha;                                      } polarize;
 +  struct {real alpha,drcut,khyp;                           } anharm_polarize;
 +  struct {real al_x,al_y,al_z,rOH,rHH,rOD;                 } wpol;
 +  struct {real a,alpha1,alpha2,rfac;                       } thole;
 +  struct {real c6,c12;                                           } lj;
 +  struct {real c6A,c12A,c6B,c12B;                        } lj14;
 +  struct {real fqq,qi,qj,c6,c12;                         } ljc14;
 +  struct {real qi,qj,c6,c12;                             } ljcnb;
 +  /* Proper dihedrals can not have different multiplicity when
 +   * doing free energy calculations, because the potential would not
 +   * be periodic anymore.
 +   */ 
 +  struct {real phiA,cpA;int mult;real phiB,cpB;            } pdihs;
 +  struct {real dA,dB;                                    } constr;
 +  /* Settle can not be used for Free energy calculations of water bond geometry.
 +   * Use shake (or lincs) instead if you have to change the water bonds.
 +   */
 +  struct {real doh,dhh;                                   } settle;
 +  struct {real b0A,cbA,betaA,b0B,cbB,betaB;               } morse;
 +  struct {real pos0A[DIM],fcA[DIM],pos0B[DIM],fcB[DIM];   } posres;
 +  struct {real pos0[DIM],r,k; int geom;                   } fbposres;
 +  struct {real rbcA[NR_RBDIHS], rbcB[NR_RBDIHS];          } rbdihs;
 +  struct {real a,b,c,d,e,f;                               } vsite;   
 +  struct {int  n; real a;                                 } vsiten;   
 +  /* NOTE: npair is only set after reading the tpx file */
 +  struct {real low,up1,up2,kfac;int type,label,npair;     } disres; 
 +  struct {real phiA,dphiA,kfacA,phiB,dphiB,kfacB;         } dihres;
 +  struct {int  ex,power,label; real c,obs,kfac;           } orires;
 +  struct {int  table;real kA;real kB;                     } tab;
 +  struct {real sar,st,pi,gbr,bmlt;                        } gb;
 +  struct {int cmapA,cmapB;                                } cmap;
 +  struct {real buf[MAXFORCEPARAM];                      } generic; /* Conversion */
 +} t_iparams;
 +
 +typedef int t_functype;
 +
 +/*
 + * The nonperturbed/perturbed interactions are now separated (sorted) in the
 + * ilist, such that the first 0..(nr_nonperturbed-1) ones are exactly that, and 
 + * the remaining ones from nr_nonperturbed..(nr-1) are perturbed bonded 
 + * interactions.
 + */
 +typedef struct
 +{
 +  int nr;
 +  int nr_nonperturbed;
 +  t_iatom *iatoms;
 +  int nalloc;
 +} t_ilist;
 +
 +/*
 + * The struct t_ilist defines a list of atoms with their interactions. 
 + * General field description:
 + *   int nr
 + *    the size (nr elements) of the interactions array (iatoms[]).
 + *   t_iatom *iatoms
 + *    specifies which atoms are involved in an interaction of a certain 
 + *       type. The layout of this array is as follows:
 + *
 + *      +-----+---+---+---+-----+---+---+-----+---+---+---+-----+---+---+...
 + *      |type1|at1|at2|at3|type2|at1|at2|type1|at1|at2|at3|type3|at1|at2|
 + *      +-----+---+---+---+-----+---+---+-----+---+---+---+-----+---+---+...
 + *
 + *    So for interaction type type1 3 atoms are needed, and for type2 and 
 + *      type3 only 2. The type identifier is used to select the function to 
 + *    calculate the interaction and its actual parameters. This type 
 + *    identifier is an index in a params[] and functype[] array.
 + */
 +
 +typedef struct
 +{
 +      real *cmap; /* Has length 4*grid_spacing*grid_spacing, */
 +      /* there are 4 entries for each cmap type (V,dVdx,dVdy,d2dVdxdy) */
 +} cmapdata_t;
 +
 +typedef struct
 +{
 +      int ngrid;            /* Number of allocated cmap (cmapdata_t ) grids */
 +      int grid_spacing;     /* Grid spacing */
 +      cmapdata_t *cmapdata; /* Pointer to grid with actual, pre-interpolated data */
 +} gmx_cmap_t;
 +
 +
 +typedef struct
 +{
 +  int        ntypes;
 +  int        atnr;
 +  t_functype *functype;
 +  t_iparams  *iparams;
 +  double     reppow;     /* The repulsion power for VdW: C12*r^-reppow   */
 +  real       fudgeQQ;    /* The scaling factor for Coulomb 1-4: f*q1*q2  */
 +  gmx_cmap_t cmap_grid;  /* The dihedral correction maps                 */
 +} gmx_ffparams_t;
 +
 +enum {
 +  ilsortUNKNOWN, ilsortNO_FE, ilsortFE_UNSORTED, ilsortFE_SORTED
 +};
 +
 +typedef struct
 +{
 +  int ntypes;
 +  int atnr;
 +  t_functype *functype;
 +  t_iparams  *iparams;
 +  real fudgeQQ;
 +  gmx_cmap_t cmap_grid;
 +  t_iparams  *iparams_posres,*iparams_fbposres;
 +  int iparams_posres_nalloc,iparams_fbposres_nalloc;
 +
 +  t_ilist il[F_NRE];
 +  int ilsort;
 +} t_idef;
 +
 +/*
 + * The struct t_idef defines all the interactions for the complete
 + * simulation. The structure is set up in such a way that the multinode
 + * version of the program can use it as easily as the single node version.
 + * General field description:
 + *   int ntypes
 + *    defines the number of elements in functype[] and iparams[].
 + *   int nodeid
 + *      the node id (if parallel machines)
 + *   int atnr
 + *      the number of atomtypes
 + *   t_functype *functype
 + *    array of length ntypes, defines for every force type what type of 
 + *      function to use. Every "bond" with the same function but different 
 + *    force parameters is a different force type. The type identifier in the 
 + *    forceatoms[] array is an index in this array.
 + *   t_iparams *iparams
 + *    array of length ntypes, defines the parameters for every interaction
 + *      type. The type identifier in the actual interaction list
 + *      (ilist[ftype].iatoms[]) is an index in this array.
 + *   gmx_cmap_t cmap_grid
 + *      the grid for the dihedral pair correction maps.
 + *   t_iparams *iparams_posres, *iparams_fbposres
 + *    defines the parameters for position restraints only.
 + *      Position restraints are the only interactions that have different
 + *      parameters (reference positions) for different molecules
 + *      of the same type. ilist[F_POSRES].iatoms[] is an index in this array.
 + *   t_ilist il[F_NRE]
 + *      The list of interactions for each type. Note that some,
 + *      such as LJ and COUL will have 0 entries.
 + */
 +
 +typedef struct {
 +  int  n;         /* n+1 is the number of points */
 +  real scale;     /* distance between two points */
 +  real *data;     /* the actual table data, per point there are 4 numbers */
 +} bondedtable_t;
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +
 +#endif
index c39d0ecb7c4b534037494c94ec221cd105cb0287,0000000000000000000000000000000000000000..1fd776590b486d325d3b07bdf2a117fb7a1feb15
mode 100644,000000..100644
--- /dev/null
@@@ -1,85 -1,0 +1,83 @@@
-   gmx_bool          pureex;
-   gmx_bool          purecg;
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GRoups of Organic Molecules in ACtion for Science
 + */
 +
 +#ifndef _mdatom_h
 +#define _mdatom_h
 +
 +#include "simple.h"
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +
 +
 +#define  NO_TF_TABLE 255
 +#define  DEFAULT_TF_TABLE 0
 +
 +typedef struct {
 +  real          tmassA,tmassB,tmass;
 +  int           nr;
 +  int           nalloc;
 +  int           nenergrp;
 +  gmx_bool          bVCMgrps;
 +  int           nPerturbed;
 +  int           nMassPerturbed;
 +  int           nChargePerturbed;
 +  gmx_bool          bOrires;
 +  real          *massA,*massB,*massT,*invmass;
 +  real          *chargeA,*chargeB;
 +  gmx_bool          *bPerturbed;
 +  int           *typeA,*typeB;
 +  unsigned short        *ptype;
 +  unsigned short        *cTC,*cENER,*cACC,*cFREEZE,*cVCM;
 +  unsigned short        *cU1,*cU2,*cORF;
 +  /* for QMMM, atomnumber contains atomic number of the atoms */
 +  gmx_bool          *bQM;
 +  /* The range of home atoms */
 +  int           start;
 +  int           homenr;
 +  /* The lambda value used to create the contents of the struct */
 +  real          lambda;
 +  /* The AdResS weighting function */
 +  real          *wf;
 +  unsigned short  *tf_table_index; /* The tf table that will be applied, if the thermodynamic force is enabled */
 +} t_mdatoms;
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +
 +#endif
index c4786511fee8ca70f767e4c11e5b0f208999cece,0000000000000000000000000000000000000000..b83cfef838376b021fd64249aa9f7f96271edc81
mode 100644,000000..100644
--- /dev/null
@@@ -1,113 -1,0 +1,114 @@@
-     int             free_energy;  /* Free energy setting for this list     */
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GRoups of Organic Molecules in ACtion for Science
 + */
 +#ifndef _nblist_h
 +#define _nblist_h
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +
 +
 +typedef unsigned long t_excl;
 +
 +/* The maximum charge group size; it is limited because the minimum
 + * size of t_excl could be 32 bits.
 + */
 +#define MAX_CHARGEGROUP_SIZE 32
 +
 +/* The maximum charge group size for CG-CG nblists.
 + * The excl entry in t_nblist uses blocks of this size.
 + */
 +#define MAX_CGCGSIZE 32
 +
 +typedef struct 
 +{
 +    int             igeometry;    /* The type of list (atom, water, etc.)  */
 +    int             ielec;        /* Coulomb loop type index for kernels   */
 +    int             ielecmod;     /* Coulomb modifier (e.g. switch/shift)  */
 +    int             ivdw;         /* VdW loop type index for kernels       */
 +    int             ivdwmod;      /* VdW modifier (e.g. switch/shift)      */
++    int             type;         /* Type of interaction, listed in
++                                     gmx_nblist_interaction_type           */
 +
 +    int             nri,maxnri;   /* Current/max number of i particles           */
 +    int             nrj,maxnrj;   /* Current/max number of j particles           */
 +    int             maxlen;       /* maxnr of j atoms for a single i atom  */
 +    int *           iinr;         /* The i-elements                        */
 +    int *           iinr_end;     /* The end atom, only with enlistCG      */
 +    int *           gid;          /* Index in energy arrays                */
 +    int *           shift;        /* Shift vector index                    */
 +    int *           jindex;       /* Index in jjnr                         */
 +    int *           jjnr;         /* The j-atom list                       */
 +    int *           jjnr_end;     /* The end atom, only with enltypeCG     */
 +    t_excl *        excl;         /* Exclusions, only with enltypeCG       */
 +
 +    /* We use separate pointers for kernels that compute both potential
 +     * and force (vf suffix), only potential (v) or only force (f)
 +     */
 +    void *          kernelptr_vf;
 +    void *          kernelptr_v;
 +    void *          kernelptr_f;
 +
 +    /* Pad the list of neighbors for each i atom with "-1" entries up to the
 +     * simd_padding_width, if it is larger than 0. This is necessary for many
 +     * accelerated kernels using single-instruction multiple-data operations
 +     * internally.
 +     */
 +    int             simd_padding_width;
 +
 +} t_nblist;
 +
 +
 +/* For atom I =  nblist->iinr[N] (0 <= N < nblist->nri) there can be
 + * several neighborlists (N's), for different energy groups (gid) and
 + * different shifts (shift).
 + * The corresponding J atoms for each list start at:
 + * nblist->jjnr[JI]
 + * with nblist->jindex[N] <= JI < nblist->jindex[N+1]
 + *
 + * enlist is of the form enlistUNIT1_UNIT2:
 + * UNIT ATOM:  there is one atom: iinr[N] or jjnr[JI]
 + * UNIT SPC:   there are 3 atoms: iinr[N],iinr[N]+1,iinr[N]+2, jjnr analog.
 + * UNIT TIP4P: there are 4 atoms: iinr[N],...,iinr[N]+3, jjnr analog.
 + * UNIT CG:    there are N atoms: iinr[N],...,iinr_end[N]-1, jjnr analog.
 + *
 + * Clear?
 + */
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +#endif
index 03cc48c62b14fd36777c2b24272ce3ecb08d7269,0000000000000000000000000000000000000000..ed9df8553a2e9e333245663cd5d21b50bb1c504f
mode 100644,000000..100644
--- /dev/null
@@@ -1,204 -1,0 +1,235 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +
 +#ifndef _nbnxn_pairlist_h
 +#define _nbnxn_pairlist_h
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +
 +/* A buffer data structure of 64 bytes
 + * to be placed at the beginning and end of structs
 + * to avoid cache invalidation of the real contents
 + * of the struct by writes to neighboring memory.
 + */
 +typedef struct {
 +    int dummy[16];
 +} gmx_cache_protect_t;
 +
 +/* Abstract type for pair searching data */
 +typedef struct nbnxn_search * nbnxn_search_t;
 +
 +/* Function that should return a pointer *ptr to memory
 + * of size nbytes.
 + * Error handling should be done within this function.
 + */
 +typedef void nbnxn_alloc_t(void **ptr,size_t nbytes);
 +
 +/* Function that should free the memory pointed to by *ptr.
 + * NULL should not be passed to this function.
 + */
 +typedef void nbnxn_free_t(void *ptr);
 +
 +typedef struct {
 +    int      cj;    /* The j-cluster                    */
 +    unsigned excl;  /* The exclusion (interaction) bits */
 +} nbnxn_cj_t;
 +
 +#define NBNXN_CI_SHIFT          127
 +#define NBNXN_CI_DO_LJ(subc)    (1<<(7+3*(subc)))
 +#define NBNXN_CI_HALF_LJ(subc)  (1<<(8+3*(subc)))
 +#define NBNXN_CI_DO_COUL(subc)  (1<<(9+3*(subc)))
 +
 +/* Simple pair-list i-unit */
 +typedef struct {
 +    int ci;             /* i-cluster             */
 +    int shift;          /* Shift vector index plus possible flags */
 +    int cj_ind_start;   /* Start index into cj   */
 +    int cj_ind_end;     /* End index into cj     */
 +} nbnxn_ci_t;
 +
 +/* Grouped pair-list i-unit */
 +typedef struct {
 +    int sci;            /* i-super-cluster       */
 +    int shift;          /* Shift vector index plus possible flags */
 +    int cj4_ind_start;  /* Start index into cj4  */
 +    int cj4_ind_end;    /* End index into cj4    */
 +} nbnxn_sci_t;
 +
 +typedef struct {
 +    unsigned imask;        /* The i-cluster interactions mask for 1 warp  */
 +    int excl_ind;          /* Index into the exclusion array for 1 warp   */
 +} nbnxn_im_ei_t;
 +
 +typedef struct {
 +    int cj[4];             /* The 4 j-clusters                            */
 +    nbnxn_im_ei_t imei[2]; /* The i-cluster mask data       for 2 warps   */
 +} nbnxn_cj4_t;
 +
 +typedef struct {
 +    unsigned pair[32];     /* Exclusion bits for one warp,                *
 +                            * each unsigned has bit for 4*8 i clusters    */
 +} nbnxn_excl_t;
 +
 +typedef struct {
 +    gmx_cache_protect_t cp0;
 +
 +    nbnxn_alloc_t *alloc;
 +    nbnxn_free_t  *free;
 +
 +    gmx_bool bSimple;      /* Simple list has na_sc=na_s and uses cj   *
 +                            * Complex list uses cj4                    */
 +
 +    int      na_ci;        /* The number of atoms per i-cluster        */
 +    int      na_cj;        /* The number of atoms per j-cluster        */
 +    int      na_sc;        /* The number of atoms per super cluster    */
 +    real     rlist;        /* The radius for constructing the list     */
 +    int      nci;          /* The number of i-clusters in the list     */
 +    nbnxn_ci_t *ci;        /* The i-cluster list, size nci             */
 +    int      ci_nalloc;    /* The allocation size of ci                */
 +    int      nsci;         /* The number of i-super-clusters in the list */
 +    nbnxn_sci_t *sci;      /* The i-super-cluster list                 */
 +    int      sci_nalloc;   /* The allocation size of sci               */
 +
 +    int      ncj;          /* The number of j-clusters in the list     */
 +    nbnxn_cj_t *cj;        /* The j-cluster list, size ncj             */
 +    int      cj_nalloc;    /* The allocation size of cj                */
 +
 +    int      ncj4;         /* The total number of 4*j clusters         */
 +    nbnxn_cj4_t *cj4;      /* The 4*j cluster list, size ncj4          */
 +    int      cj4_nalloc;   /* The allocation size of cj4               */
 +    int      nexcl;        /* The count for excl                       */
 +    nbnxn_excl_t *excl;    /* Atom interaction bits (non-exclusions)   */
 +    int      excl_nalloc;  /* The allocation size for excl             */
 +    int      nci_tot;      /* The total number of i clusters           */
 +
 +    struct nbnxn_list_work *work;
 +
 +    gmx_cache_protect_t cp1;
 +} nbnxn_pairlist_t;
 +
 +typedef struct {
 +    int          nnbl;      /* number of lists */
 +    nbnxn_pairlist_t **nbl; /* lists */
 +    gmx_bool     bCombined; /* TRUE if lists get combined into one (the 1st) */
 +    gmx_bool     bSimple;   /* TRUE if the list is of type "simple"
 +                               (na_sc=na_s, no super-clusters used) */
 +    int          natpair_ljq; /* Total number of atom pairs for LJ+Q kernel */
 +    int          natpair_lj;  /* Total number of atom pairs for LJ kernel   */
 +    int          natpair_q;   /* Total number of atom pairs for Q kernel    */
 +} nbnxn_pairlist_set_t;
 +
 +enum { nbatXYZ, nbatXYZQ, nbatX4, nbatX8 };
 +
 +typedef struct {
 +    real *f;      /* f, size natoms*fstride                             */
 +    real *fshift; /* Shift force array, size SHIFTS*DIM                 */
 +    int  nV;      /* The size of *Vvdw and *Vc                          */
 +    real *Vvdw;   /* Temporary Van der Waals group energy storage       */
 +    real *Vc;     /* Temporary Coulomb group energy storage             */
 +    int  nVS;     /* The size of *VSvdw and *VSc                        */
 +    real *VSvdw;  /* Temporary SIMD Van der Waals group energy storage  */
 +    real *VSc;    /* Temporary SIMD Coulomb group energy storage        */
 +} nbnxn_atomdata_output_t;
 +
++/* Block size in atoms for the non-bonded thread force-buffer reduction,
++ * should be a multiple of all cell and x86 SIMD sizes (i.e. 2, 4 and 8).
++ * Should be small to reduce the reduction and zeroing cost,
++ * but too small will result in overhead.
++ * Currently the block size is NBNXN_BUFFERFLAG_SIZE*3*sizeof(real)=192 bytes.
++ */
++#ifdef GMX_DOUBLE
++#define NBNXN_BUFFERFLAG_SIZE   8
++#else
++#define NBNXN_BUFFERFLAG_SIZE  16
++#endif
++
++/* We currently store the reduction flags as bits in an unsigned int.
++ * In most cases this limits the number of flags to 32.
++ * The reduction will automatically disable the flagging and do a full
++ * reduction when the flags won't fit, but this will lead to very slow
++ * reduction. As we anyhow don't expect reasonable performance with
++ * more than 32 threads, we put in this hard limit.
++ * You can increase this number, but the reduction will be very slow.
++ */
++#define NBNXN_BUFFERFLAG_MAX_THREADS  32
++
++/* Flags for telling if threads write to force output buffers */
++typedef struct {
++    int nflag;       /* The number of flag blocks                         */
++    unsigned *flag;  /* Bit i is set when thread i writes to a cell-block */
++    int flag_nalloc; /* Allocation size of flag                           */
++} nbnxn_buffer_flags_t;
++
 +/* LJ combination rules: geometric, Lorentz-Berthelot, none */
 +enum { ljcrGEOM, ljcrLB, ljcrNONE, ljcrNR };
 +
 +typedef struct {
 +    nbnxn_alloc_t *alloc;
 +    nbnxn_free_t  *free;
 +    int  ntype;      /* The number of different atom types                 */
 +    real *nbfp;      /* Lennard-Jones 6*C6 and 12*C12 params, size ntype^2*2 */
 +    int  comb_rule;  /* Combination rule, see enum above                   */
 +    real *nbfp_comb; /* LJ parameter per atom type, size ntype*2           */
 +    real *nbfp_s4;   /* As nbfp, but with stride 4, size ntype^2*4         */
 +    int  natoms;     /* Number of atoms                                    */
 +    int  natoms_local;  /* Number of local atoms                           */
 +    int  *type;      /* Atom types                                         */
 +    real *lj_comb;   /* LJ parameters per atom for combining for pairs     */
 +    int  XFormat;    /* The format of x (and q), enum                      */
 +    int  FFormat;    /* The format of f, enum                              */
 +    real *q;         /* Charges, can be NULL if incorporated in x          */
 +    int  na_c;       /* The number of atoms per cluster                    */
 +    int  nenergrp;   /* The number of energy groups                        */
 +    int  neg_2log;   /* Log2 of nenergrp                                   */
 +    int  *energrp;   /* The energy groups per cluster, can be NULL         */
 +    gmx_bool bDynamicBox; /* Do we need to update shift_vec every step?    */
 +    rvec *shift_vec; /* Shift vectors, copied from t_forcerec              */
 +    int  xstride;    /* stride for a coordinate in x (usually 3 or 4)      */
 +    int  fstride;    /* stride for a coordinate in f (usually 3 or 4)      */
 +    real *x;         /* x and possibly q, size natoms*xstride              */
 +    int  nout;       /* The number of force arrays                         */
 +    nbnxn_atomdata_output_t *out;  /* Output data structures               */
 +    int  nalloc;     /* Allocation size of all arrays (for x/f *x/fstride) */
++    gmx_bool bUseBufferFlags; /* Use the flags or operate on all atoms     */
++    nbnxn_buffer_flags_t buffer_flags; /* Flags for buffer zeroing+reduc.  */
 +} nbnxn_atomdata_t;
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +#endif
index d00aabe4b10e822c662c7f4b342008274ed1210a,0000000000000000000000000000000000000000..41e9c71fd5b9e8ce9c5b4f4819335ec37a31f38f
mode 100644,000000..100644
--- /dev/null
@@@ -1,624 -1,0 +1,581 @@@
-     adress_set_kernel_flags(n_ex, n_hyb, n_cg, mdatoms);
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 4.0.5
 + * Written by Christoph Junghans, Brad Lambeth, and possibly others.
 + * Copyright (c) 2009 Christoph Junghans, Brad Lambeth.
 + * All rights reserved.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * GROningen Mixture of Alchemy and Childrens' Stories
 + */
 +
 +#include "adress.h"
 +#include "maths.h"
 +#include "pbc.h"
 +#include "types/simple.h"
 +#include "typedefs.h"
 +#include "vec.h"
 +
 +real
 +adress_weight(rvec            x,
 +              int             adresstype,
 +              real            adressr,
 +              real            adressw,
 +              rvec *          ref,
 +              t_pbc *         pbc,
 +              t_forcerec *         fr )
 +{   /* Returns the AdResS weight of position x relative to *ref: 1 = explicit, 0 = coarse grained */
 +    int  i;
 +    real l2 = adressr+adressw; /* outer edge of the hybrid region */
 +    real sqr_dl,dl;
 +    real tmp;
 +    rvec dx;
 +
 +    sqr_dl = 0.0;
 +
 +    if (pbc)
 +    {
 +        pbc_dx(pbc,(*ref),x,dx);
 +    }
 +    else
 +    {
 +        rvec_sub((*ref),x,dx);
 +    }
 +
 +    switch(adresstype)
 +    {
 +    case eAdressOff:
 +        /* default to explicit simulation */
 +        return 1;
 +    case eAdressConst:
 +        /* constant value for weighting function = fr->adress_const_wf */
 +        return fr->adress_const_wf;
 +    case eAdressXSplit:
 +        /* plane through center of ref, varies in x direction */
 +        sqr_dl         = dx[0]*dx[0];
 +        break;
 +    case eAdressSphere:
 +        /* point at center of ref, assuming cubic geometry */
 +        for(i=0;i<3;i++){
 +            sqr_dl    += dx[i]*dx[i];
 +        }
 +        break;
 +    default:
 +        /* default to explicit simulation */
 +        return 1;
 +    }
 +
 +    dl=sqrt(sqr_dl);
 +
 +    /* beyond explicit + hybrid width (l2): molecule is coarse grained */
 +    if (dl > l2)
 +    {
 +        return 0;
 +    }
 +    /* inside the explicit radius adressr: molecule is explicit */
 +    else if (dl < adressr)
 +    {
 +        return 1;
 +    }
 +    /* hybrid region: cos^2 switch over the hybrid width adressw */
 +    else
 +    {
 +        tmp=cos((dl-adressr)*M_PI/2/adressw);
 +        return tmp*tmp;
 +    }
 +}
 +
 +void
 +update_adress_weights_com(FILE *               fplog,
 +                          int                  cg0,
 +                          int                  cg1,
 +                          t_block *            cgs,
 +                          rvec                 x[],
 +                          t_forcerec *         fr,
 +                          t_mdatoms *          mdatoms,
 +                          t_pbc *              pbc)
 +{   /* Sets mdatoms->wf for charge groups cg0..cg1-1 from the weight at each group's center */
 +    int            icg,k,k0,k1,d;
 +    real           nrcg,inv_ncg,mtot,inv_mtot;
 +    atom_id *      cgindex;
 +    rvec           ix;
 +    int            adresstype;
 +    real           adressr,adressw;
 +    rvec *         ref;
 +    real *         massT;
 +    real *         wf;
 +
 +
 +    int n_hyb, n_ex, n_cg;
 +
 +    n_hyb=0;
 +    n_cg=0;
 +    n_ex=0;
 +
 +    adresstype         = fr->adress_type;
 +    adressr            = fr->adress_ex_width;
 +    adressw            = fr->adress_hy_width;
 +    massT              = mdatoms->massT;
 +    wf                 = mdatoms->wf;
 +    ref                = &(fr->adress_refs);
 +
 +
 +    /* Since this is center of mass AdResS, the vsite is not guaranteed
 +     * to be on the same node as the constructing atoms.  Therefore we
 +     * loop over the charge groups, calculate their center of mass,
 +     * then use this to calculate wf for each atom.  This wastes vsite
 +     * construction, but it's the only way to assure that the explicit
 +     * atoms have the same wf as their vsite. */
 +
 +#ifdef DEBUG
 +    fprintf(fplog,"Calculating center of mass for charge groups %d to %d\n",
 +            cg0,cg1);
 +#endif
 +    cgindex = cgs->index;
 +
 +    /* Compute the center of mass for all charge groups */
 +    for(icg=cg0; (icg<cg1); icg++)
 +    {
 +        k0      = cgindex[icg];
 +        k1      = cgindex[icg+1];
 +        nrcg    = k1-k0;
 +        if (nrcg == 1)
 +        {
 +            wf[k0] = adress_weight(x[k0],adresstype,adressr,adressw,ref,pbc,fr);
 +            if (wf[k0]==0){ n_cg++;}
 +            else if (wf[k0]==1){ n_ex++;}
 +            else {n_hyb++;}
 +        }
 +        else
 +        {
 +            mtot = 0.0;
 +            for(k=k0; (k<k1); k++)
 +            {
 +                mtot += massT[k];
 +            }
 +            if (mtot > 0.0)
 +            {
 +                inv_mtot = 1.0/mtot;
 +
 +                clear_rvec(ix);
 +                for(k=k0; (k<k1); k++)
 +                {
 +                    for(d=0; (d<DIM); d++)
 +                    {
 +                        ix[d] += x[k][d]*massT[k];
 +                    }
 +                }
 +                for(d=0; (d<DIM); d++)
 +                {
 +                    ix[d] *= inv_mtot;
 +                }
 +            }
 +            /* Use the center of geometry (unweighted mean) if the charge group has mtot=0 (only vsites) */
 +            else
 +            {
 +                inv_ncg = 1.0/nrcg;
 +
 +                clear_rvec(ix);
 +                for(k=k0; (k<k1); k++)
 +                {
 +                    for(d=0; (d<DIM); d++)
 +                    {
 +                        ix[d] += x[k][d];
 +                    }
 +                }
 +                for(d=0; (d<DIM); d++)
 +                {
 +                    ix[d] *= inv_ncg;
 +                }
 +            }
 +
 +            /* Set wf of all atoms in charge group equal to wf of the center ix computed above */
 +            wf[k0] = adress_weight(ix,adresstype,adressr,adressw,ref,pbc, fr);
 +
 +            if (wf[k0]==0){ n_cg++;}
 +            else if (wf[k0]==1){ n_ex++;}
 +            else {n_hyb++;}
 +
 +            for(k=(k0+1); (k<k1); k++)
 +            {
 +                wf[k] = wf[k0];
 +            }
 +        }
 +    }
-     adress_set_kernel_flags(n_ex, n_hyb, n_cg, mdatoms);
 +}
++
 +void update_adress_weights_atom_per_atom(
 +                            int                  cg0,
 +                          int                  cg1,
 +                          t_block *            cgs,
 +                          rvec                 x[],
 +                          t_forcerec *         fr,
 +                          t_mdatoms *          mdatoms,
 +                          t_pbc *              pbc)
 +{   /* Sets mdatoms->wf of every atom in charge groups cg0..cg1-1 from that atom's own position */
 +    int            icg,k,k0,k1,d;
 +    real           nrcg,inv_ncg,mtot,inv_mtot;
 +    atom_id *      cgindex;
 +    rvec           ix;
 +    int            adresstype;
 +    real           adressr,adressw;
 +    rvec *         ref;
 +    real *         massT;
 +    real *         wf;
 +
 +
 +    int n_hyb, n_ex, n_cg;
 +
 +    n_hyb=0;
 +    n_cg=0;
 +    n_ex=0;
 +
 +    adresstype         = fr->adress_type;
 +    adressr            = fr->adress_ex_width;
 +    adressw            = fr->adress_hy_width;
 +    massT              = mdatoms->massT;
 +    wf                 = mdatoms->wf;
 +    ref                = &(fr->adress_refs);
 +
 +    cgindex = cgs->index;
 +
 +    /* Weighting function is determined for each atom individually.
 +     * This is an approximation,
 +     * as the theory requires an interpolation based on the centers of mass.
 +     * Should be used with caution */
 +
 +    for (icg = cg0; (icg < cg1); icg++) {
 +        k0 = cgindex[icg];
 +        k1 = cgindex[icg + 1];
 +        nrcg = k1 - k0;
 +
 +        for (k = (k0); (k < k1); k++) {
 +            wf[k] = adress_weight(x[k], adresstype, adressr, adressw, ref, pbc, fr);
 +            if (wf[k] == 0) {
 +                n_cg++;
 +            } else if (wf[k] == 1) {
 +                n_ex++;
 +            } else {
 +                n_hyb++;
 +            }
 +        }
 +
 +    }
-     adress_set_kernel_flags(n_ex, n_hyb, n_cg, mdatoms);
 +}
 +
 +void
 +update_adress_weights_cog(t_iparams            ip[],
 +                          t_ilist              ilist[],
 +                          rvec                 x[],
 +                          t_forcerec *         fr,
 +                          t_mdatoms *          mdatoms,
 +                          t_pbc *              pbc)
 +{   /* Sets wf of each vsite from the vsite position and copies it to its constructing atoms */
 +    int            i,j,k,nr,nra,inc;
 +    int            ftype,adresstype;
 +    t_iatom        avsite,ai,aj,ak,al;
 +    t_iatom *      ia;
 +    real           adressr,adressw;
 +    rvec *         ref;
 +    real *         wf;
 +    int            n_hyb, n_ex, n_cg;
 +
 +    adresstype         = fr->adress_type;
 +    adressr            = fr->adress_ex_width;
 +    adressw            = fr->adress_hy_width;
 +    wf                 = mdatoms->wf;
 +    ref                = &(fr->adress_refs);
 +
 +
 +    n_hyb=0;
 +    n_cg=0;
 +    n_ex=0;
 +
 +
 +    /* Since this is center of geometry AdResS, we know the vsite
 +     * is in the same charge group node as the constructing atoms.
 +     * Loop over vsite types, calculate the weight of the vsite,
 +     * then assign that weight to the constructing atoms. */
 +
 +    for(ftype=0; (ftype<F_NRE); ftype++)
 +    {
 +        if (interaction_function[ftype].flags & IF_VSITE)
 +        {
 +            nra    = interaction_function[ftype].nratoms;
 +            nr     = ilist[ftype].nr;
 +            ia     = ilist[ftype].iatoms;
 +
 +            for(i=0; (i<nr); )
 +            {
 +                /* The vsite and first constructing atom */
 +                avsite     = ia[1];
 +                ai         = ia[2];
 +                wf[avsite] = adress_weight(x[avsite],adresstype,adressr,adressw,ref,pbc,fr);
 +                wf[ai]     = wf[avsite];
 +
 +                if (wf[ai]  == 0) {
 +                    n_cg++;
 +                } else if (wf[ai]  == 1) {
 +                    n_ex++;
 +                } else {
 +                    n_hyb++;
 +                }
 +
 +                /* Assign the vsite wf to rest of constructing atoms depending on type */
 +                inc = nra+1;
 +                switch (ftype) {
 +                case F_VSITE2:
 +                    aj     = ia[3];
 +                    wf[aj] = wf[avsite];
 +                    break;
 +                case F_VSITE3:
 +                    aj     = ia[3];
 +                    wf[aj] = wf[avsite];
 +                    ak     = ia[4];
 +                    wf[ak] = wf[avsite];
 +                    break;
 +                case F_VSITE3FD:
 +                    aj     = ia[3];
 +                    wf[aj] = wf[avsite];
 +                    ak     = ia[4];
 +                    wf[ak] = wf[avsite];
 +                    break;
 +                case F_VSITE3FAD:
 +                    aj     = ia[3];
 +                    wf[aj] = wf[avsite];
 +                    ak     = ia[4];
 +                    wf[ak] = wf[avsite];
 +                    break;
 +                case F_VSITE3OUT:
 +                    aj     = ia[3];
 +                    wf[aj] = wf[avsite];
 +                    ak     = ia[4];
 +                    wf[ak] = wf[avsite];
 +                    break;
 +                case F_VSITE4FD:
 +                    aj     = ia[3];
 +                    wf[aj] = wf[avsite];
 +                    ak     = ia[4];
 +                    wf[ak] = wf[avsite];
 +                    al     = ia[5];
 +                    wf[al] = wf[avsite];
 +                    break;
 +                case F_VSITE4FDN:
 +                    aj     = ia[3];
 +                    wf[aj] = wf[avsite];
 +                    ak     = ia[4];
 +                    wf[ak] = wf[avsite];
 +                    al     = ia[5];
 +                    wf[al] = wf[avsite];
 +                    break;
 +                case F_VSITEN:
 +                    inc    = 3*ip[ia[0]].vsiten.n; /* entries are walked in triplets; ia[j+2] is a constructing atom */
 +                    for(j=3; j<inc; j+=3)
 +                    {
 +                        ai = ia[j+2];
 +                        wf[ai] = wf[avsite];
 +                    }
 +                    break;
 +                default:
 +                    gmx_fatal(FARGS,"No such vsite type %d in %s, line %d",
 +                              ftype,__FILE__,__LINE__);
 +                }
 +
 +                /* Advance index and pointer past the inc entries of this vsite */
 +                i  += inc;
 +                ia += inc;
 +            }
 +        }
 +    }
- void adress_set_kernel_flags(int n_ex, int n_hyb, int n_cg, t_mdatoms * mdatoms){
-     /* With domain decomposition we can check weather a cpu calculates only
-      * coarse-grained or explicit interactions. If so we use standard gromacs kernels
-      * on this proc. See also nonbonded.c */
-     if (n_hyb ==0 && n_ex == 0){
-      /* all particles on this proc are coarse-grained, use standard gromacs kernels */
-         if (!mdatoms->purecg){
-             mdatoms->purecg = TRUE;
-            if (debug) fprintf (debug, "adress.c: pure cg kernels on this proc\n");
-         }
-     }
-     else
-     {
-         if (mdatoms->purecg){
-          /* now this processor has hybrid particles again, call the hybrid kernels */
-             mdatoms->purecg = FALSE;
-         }
-     }
-     if (n_hyb ==0 && n_cg == 0){
-     /* all particles on this proc are atomistic, use standard gromacs kernels */
-         if (!mdatoms->pureex){
-              mdatoms->pureex = TRUE;
-              if (debug) fprintf (debug, "adress.c: pure ex kernels on this proc\n");
-         }
-     }
-     else
-     {
-         if (mdatoms->pureex){
-             mdatoms->pureex = FALSE;
-         }
-     }
- }
 +}
 +
 +void
 +update_adress_weights_atom(int                  cg0,
 +                           int                  cg1,
 +                           t_block *            cgs,
 +                           rvec                 x[],
 +                           t_forcerec *         fr,
 +                           t_mdatoms *          mdatoms,
 +                           t_pbc *              pbc)
 +{   /* Sets wf of all atoms in charge groups cg0..cg1-1 to the weight of the group's first atom */
 +    int            icg,k,k0,k1;
 +    atom_id *      cgindex;
 +    int            adresstype;
 +    real           adressr,adressw;
 +    rvec *         ref;
 +    real *         massT;
 +    real *         wf;
 +
 +    adresstype         = fr->adress_type;
 +    adressr            = fr->adress_ex_width;
 +    adressw            = fr->adress_hy_width;
 +    massT              = mdatoms->massT;
 +    wf                 = mdatoms->wf;
 +    ref                = &(fr->adress_refs);
 +    cgindex            = cgs->index;
 +
 +    /* Only use first atom in charge group.
 +     * We still can't be sure that the vsite and constructing
 +     * atoms are on the same processor, so we must calculate
 +     * in the same way as com adress. */
 +
 +    for(icg=cg0; (icg<cg1); icg++)
 +    {
 +        k0      = cgindex[icg];
 +        k1      = cgindex[icg+1];
 +        wf[k0] = adress_weight(x[k0],adresstype,adressr,adressw,ref,pbc,fr);
 +
 +        /* Set wf of all atoms in charge group equal to wf of first atom in charge group */
 +        for(k=(k0+1); (k<k1); k++)
 +        {
 +            wf[k] = wf[k0];
 +        }
 +    }
 +}
 +
 +void
 +adress_thermo_force(int                  start,
 +                    int                  homenr,
 +                    t_block *            cgs,
 +                    rvec                 x[],
 +                    rvec                 f[],
 +                    t_forcerec *         fr,
 +                    t_mdatoms *          mdatoms,
 +                    t_pbc *              pbc)
 +{   /* Adds the tabulated thermodynamic force to f[] for coarse-grained vsites in start..start+homenr-1 */
 +    int              iatom,n0,nnn,nrcg, i;
 +    int              adresstype;
 +    real             adressw, adressr;
 +    atom_id *        cgindex;
 +    unsigned short * ptype;
 +    rvec *           ref;
 +    real *           wf;
 +    real             tabscale;
 +    real *           ATFtab;
 +    rvec             dr;
 +    real             w,wsq,wmin1,wmin1sq,wp,wt,rinv, sqr_dl, dl;
 +    real             eps,eps2,F,Geps,Heps2,Fp,dmu_dwp,dwp_dr,fscal;
 +
 +    adresstype       = fr->adress_type;
 +    adressw          = fr->adress_hy_width;
 +    adressr           = fr->adress_ex_width;
 +    cgindex          = cgs->index;
 +    ptype            = mdatoms->ptype;
 +    ref              = &(fr->adress_refs);
 +    wf               = mdatoms->wf;
 +
 +    for(iatom=start; (iatom<start+homenr); iatom++)
 +    {
 +        if (egp_coarsegrained(fr, mdatoms->cENER[iatom]))
 +        {
 +            if (ptype[iatom] == eptVSite)
 +            {
 +                w    = wf[iatom];
 +                /* only atoms with a thermodynamic force table assigned (not NO_TF_TABLE) get a force */
 +                if ( mdatoms->tf_table_index[iatom] != NO_TF_TABLE)
 +                {
 +                    if (fr->n_adress_tf_grps > 0 ){
 +                        /* multi component tf is on, select the right table */
 +                        ATFtab = fr->atf_tabs[mdatoms->tf_table_index[iatom]].data;
 +                        tabscale = fr->atf_tabs[mdatoms->tf_table_index[iatom]].scale;
 +                    }
 +                    else {
 +                    /* just one component */
 +                        ATFtab = fr->atf_tabs[DEFAULT_TF_TABLE].data;
 +                        tabscale = fr->atf_tabs[DEFAULT_TF_TABLE].scale;
 +                    }
 +
 +                    fscal            = 0;
 +                    if (pbc)
 +                    {
 +                        pbc_dx(pbc,(*ref),x[iatom],dr);
 +                    }
 +                    else
 +                    {
 +                        rvec_sub((*ref),x[iatom],dr);
 +                    }
 +
 +
 +
 +
 +                    /* calculate distance to adress center again */
 +                    sqr_dl =0.0;
 +                    switch(adresstype)
 +                    {
 +                    case eAdressXSplit:
 +                        /* plane through center of ref, varies in x direction */
 +                        sqr_dl         = dr[0]*dr[0];
 +                        rinv             = gmx_invsqrt(dr[0]*dr[0]);
 +                        break;
 +                    case eAdressSphere:
 +                        /* point at center of ref, assuming cubic geometry */
 +                        for(i=0;i<3;i++){
 +                            sqr_dl    += dr[i]*dr[i];
 +                        }
 +                        rinv             = gmx_invsqrt(sqr_dl);
 +                        break;
 +                    default:
 +                        /* This case should not happen */
 +                        rinv = 0.0;
 +                    }
 +
 +                    dl=sqrt(sqr_dl);
 +                    /* table origin is adress center */
 +                    wt               = dl*tabscale;
 +                    n0               = wt; /* truncated to an integer table bin */
 +                    eps              = wt-n0; /* fractional position inside the bin */
 +                    eps2             = eps*eps;
 +                    nnn              = 4*n0; /* four table entries per bin; entries 1..3 are read here */
 +                    F                = ATFtab[nnn+1];
 +                    Geps             = eps*ATFtab[nnn+2];
 +                    Heps2            = eps2*ATFtab[nnn+3];
 +                    Fp               = F+Geps+Heps2;
 +                    F                = (Fp+Geps+2.0*Heps2)*tabscale;
 +
 +                    fscal            = F*rinv;
 +
 +                    f[iatom][0]        += fscal*dr[0];
 +                    if (adresstype != eAdressXSplit)
 +                    {
 +                        f[iatom][1]    += fscal*dr[1];
 +                        f[iatom][2]    += fscal*dr[2];
 +                    }
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +gmx_bool egp_explicit(t_forcerec *   fr, int egp_nr)
 +{   /* Returns the fr->adress_group_explicit entry for energy group index egp_nr */
 +    return fr->adress_group_explicit[egp_nr];
 +}
 +
 +gmx_bool egp_coarsegrained(t_forcerec *   fr, int egp_nr)
 +{   /* Returns the logical negation of fr->adress_group_explicit[egp_nr] */
 +   return !fr->adress_group_explicit[egp_nr];
 +}
index 438d8dfb2062ca0fea526fc0f2ddb786a07f3bfc,0000000000000000000000000000000000000000..e6095ebf549d90cdc485eb92f2870dc9926e43ed
mode 100644,000000..100644
--- /dev/null
@@@ -1,9554 -1,0 +1,9589 @@@
-         for(i=0; i<DIM; i++)
-         {
-             zones->size[z].bb_x0[i] = zones->size[z].x0[i];
-             zones->size[z].bb_x1[i] = zones->size[z].x1[i];
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + * This file is part of Gromacs        Copyright (c) 1991-2008
 + * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gnomes, ROck Monsters And Chili Sauce
 + */
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <stdio.h>
 +#include <time.h>
 +#include <math.h>
 +#include <string.h>
 +#include <stdlib.h>
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "gmx_fatal.h"
 +#include "gmx_fatal_collective.h"
 +#include "vec.h"
 +#include "domdec.h"
 +#include "domdec_network.h"
 +#include "nrnb.h"
 +#include "pbc.h"
 +#include "chargegroup.h"
 +#include "constr.h"
 +#include "mdatoms.h"
 +#include "names.h"
 +#include "pdbio.h"
 +#include "futil.h"
 +#include "force.h"
 +#include "pme.h"
 +#include "pull.h"
 +#include "pull_rotation.h"
 +#include "gmx_wallcycle.h"
 +#include "mdrun.h"
 +#include "nsgrid.h"
 +#include "shellfc.h"
 +#include "mtop_util.h"
 +#include "gmxfio.h"
 +#include "gmx_ga2la.h"
 +#include "gmx_sort.h"
 +#include "macros.h"
 +#include "nbnxn_search.h"
 +#include "bondf.h"
 +#include "gmx_omp_nthreads.h"
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#endif
 +#ifdef GMX_THREAD_MPI
 +#include "tmpi.h"
 +#endif
 +
 +#define DDRANK(dd,rank)    (rank)
 +#define DDMASTERRANK(dd)   (dd->masterrank)
 +
 +typedef struct gmx_domdec_master
 +{
 +    /* The cell boundaries */
 +    real **cell_x;
 +    /* The global charge group division */
 +    int  *ncg;     /* Number of home charge groups for each node */
 +    int  *index;   /* Index of nnodes+1 into cg */
 +    int  *cg;      /* Global charge group index */
 +    int  *nat;     /* Number of home atoms for each node. */
 +    int  *ibuf;    /* Buffer for communication */
 +    rvec *vbuf;    /* Buffer for state scattering and gathering */
 +} gmx_domdec_master_t;
 +
 +typedef struct
 +{
 +    /* The numbers of charge groups to send and receive for each cell
 +     * that requires communication, the last entry contains the total
 +     * number of atoms that needs to be communicated.
 +     */
 +    int nsend[DD_MAXIZONE+2];
 +    int nrecv[DD_MAXIZONE+2];
 +    /* The charge groups to send */
 +    int *index;
 +    int nalloc;
 +    /* The atom range for non-in-place communication */
 +    int cell2at0[DD_MAXIZONE];
 +    int cell2at1[DD_MAXIZONE];
 +} gmx_domdec_ind_t;
 +
 +typedef struct
 +{
 +    int  np;                   /* Number of grid pulses in this dimension */
 +    int  np_dlb;               /* For dlb, for use with edlbAUTO          */
 +    gmx_domdec_ind_t *ind;     /* The indices to communicate, size np     */
 +    int  np_nalloc;
 +    gmx_bool bInPlace;             /* Can we communicate in place?            */
 +} gmx_domdec_comm_dim_t;
 +
 +typedef struct
 +{
 +    gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
 +    real *cell_f;      /* State var.: cell boundaries, box relative      */
 +    real *old_cell_f;  /* Temp. var.: old cell size                      */
 +    real *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
 +    real *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
 +    real *bound_min;   /* Temp. var.: lower limit for cell boundary      */
 +    real *bound_max;   /* Temp. var.: upper limit for cell boundary      */
 +    gmx_bool bLimited;     /* State var.: is DLB limited in this dim and row */
 +    real *buf_ncd;     /* Temp. var.                                     */
 +} gmx_domdec_root_t;
 +
 +#define DD_NLOAD_MAX 9
 +
 +/* Here floats are accurate enough, since these variables
 + * only influence the load balancing, not the actual MD results.
 + */
 +typedef struct
 +{
 +    int  nload;
 +    float *load;
 +    float sum;
 +    float max;
 +    float sum_m;
 +    float cvol_min;
 +    float mdf;
 +    float pme;
 +    int   flags;
 +} gmx_domdec_load_t;
 +
 +typedef struct
 +{
 +    int  nsc;
 +    int  ind_gl;
 +    int  ind;
 +} gmx_cgsort_t;
 +
 +typedef struct
 +{
 +    gmx_cgsort_t *sort;
 +    gmx_cgsort_t *sort2;
 +    int  sort_nalloc;
 +    gmx_cgsort_t *sort_new;
 +    int  sort_new_nalloc;
 +    int  *ibuf;
 +    int  ibuf_nalloc;
 +} gmx_domdec_sort_t;
 +
 +typedef struct
 +{
 +    rvec *v;
 +    int  nalloc;
 +} vec_rvec_t;
 +
 +/* This enum determines the order of the coordinates.
 + * ddnatHOME and ddnatZONE should be first and second,
 + * the others can be ordered as wanted.
 + */
 +enum { ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR };
 +
 +enum { edlbAUTO, edlbNO, edlbYES, edlbNR };
 +const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
 +
 +typedef struct
 +{
 +    int  dim;      /* The dimension                                          */
 +    gmx_bool dim_match;/* Tells if DD and PME dims match                         */
 +    int  nslab;    /* The number of PME slabs in this dimension              */
 +    real *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
 +    int  *pp_min;  /* The minimum pp node location, size nslab               */
 +    int  *pp_max;  /* The maximum pp node location, size nslab               */
 +    int  maxshift; /* The maximum shift for coordinate redistribution in PME */
 +} gmx_ddpme_t;
 +
 +typedef struct
 +{
 +    real min0;    /* The minimum bottom of this zone                        */
 +    real max1;    /* The maximum top of this zone                           */
 +    real min1;    /* The minimum top of this zone                           */
 +    real mch0;    /* The maximum bottom communication height for this zone  */
 +    real mch1;    /* The maximum top communication height for this zone     */
 +    real p1_0;    /* The bottom value of the first cell in this zone        */
 +    real p1_1;    /* The top value of the first cell in this zone           */
 +} gmx_ddzone_t;
 +
 +typedef struct
 +{
 +    gmx_domdec_ind_t ind;
 +    int *ibuf;
 +    int ibuf_nalloc;
 +    vec_rvec_t vbuf;
 +    int nsend;
 +    int nat;
 +    int nsend_zone;
 +} dd_comm_setup_work_t;
 +
 +typedef struct gmx_domdec_comm
 +{
 +    /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
 +     * unless stated otherwise.
 +     */
 +
 +    /* The number of decomposition dimensions for PME, 0: no PME */
 +    int  npmedecompdim;
 +    /* The number of nodes doing PME (PP/PME or only PME) */
 +    int  npmenodes;
 +    int  npmenodes_x;
 +    int  npmenodes_y;
 +    /* The communication setup including the PME only nodes */
 +    gmx_bool bCartesianPP_PME;
 +    ivec ntot;
 +    int  cartpmedim;
 +    int  *pmenodes;          /* size npmenodes                         */
 +    int  *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
 +                              * but with bCartesianPP_PME              */
 +    gmx_ddpme_t ddpme[2];
 +    
 +    /* The DD particle-particle nodes only */
 +    gmx_bool bCartesianPP;
 +    int  *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
 +    
 +    /* The global charge groups */
 +    t_block cgs_gl;
 +
 +    /* Should we sort the cgs */
 +    int  nstSortCG;
 +    gmx_domdec_sort_t *sort;
 +    
 +    /* Are there charge groups? */
 +    gmx_bool bCGs;
 +
 +    /* Are there bonded and multi-body interactions between charge groups? */
 +    gmx_bool bInterCGBondeds;
 +    gmx_bool bInterCGMultiBody;
 +
 +    /* Data for the optional bonded interaction atom communication range */
 +    gmx_bool bBondComm;
 +    t_blocka *cglink;
 +    char *bLocalCG;
 +
 +    /* The DLB option */
 +    int  eDLB;
 +    /* Are we actually using DLB? */
 +    gmx_bool bDynLoadBal;
 +
 +    /* Cell sizes for static load balancing, first index cartesian */
 +    real **slb_frac;
 +
 +    /* The width of the communicated boundaries */
 +    real cutoff_mbody;
 +    real cutoff;
 +    /* The minimum cell size (including triclinic correction) */
 +    rvec cellsize_min;
 +    /* For dlb, for use with edlbAUTO */
 +    rvec cellsize_min_dlb;
 +    /* The lower limit for the DD cell size with DLB */
 +    real cellsize_limit;
 +    /* Effectively no NB cut-off limit with DLB for systems without PBC? */
 +    gmx_bool bVacDLBNoLimit;
 +
 +    /* tric_dir is only stored here because dd_get_ns_ranges needs it */
 +    ivec tric_dir;
 +    /* box0 and box_size are required with dim's without pbc and -gcom */
 +    rvec box0;
 +    rvec box_size;
 +    
 +    /* The cell boundaries */
 +    rvec cell_x0;
 +    rvec cell_x1;
 +
 +    /* The old location of the cell boundaries, to check cg displacements */
 +    rvec old_cell_x0;
 +    rvec old_cell_x1;
 +
 +    /* The communication setup and charge group boundaries for the zones */
 +    gmx_domdec_zones_t zones;
 +    
 +    /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
 +     * cell boundaries of neighboring cells for dynamic load balancing.
 +     */
 +    gmx_ddzone_t zone_d1[2];
 +    gmx_ddzone_t zone_d2[2][2];
 +    
 +    /* The coordinate/force communication setup and indices */
 +    gmx_domdec_comm_dim_t cd[DIM];
 +    /* The maximum number of cells to communicate with in one dimension */
 +    int  maxpulse;
 +    
 +    /* Which cg distribution is stored on the master node */
 +    int master_cg_ddp_count;
 +    
 +    /* The number of cg's received from the direct neighbors */
 +    int  zone_ncg1[DD_MAXZONE];
 +    
 +    /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
 +    int  nat[ddnatNR];
 +
 +    /* Array for signalling if atoms have moved to another domain */
 +    int  *moved;
 +    int  moved_nalloc;
 +    
 +    /* Communication buffer for general use */
 +    int  *buf_int;
 +    int  nalloc_int;
 +
 +    /* Communication buffer for general use */
 +    vec_rvec_t vbuf;
 +
 +    /* Temporary storage for thread parallel communication setup */
 +    int nth;
 +    dd_comm_setup_work_t *dth;
 +
 +    /* Communication buffers only used with multiple grid pulses */
 +    int  *buf_int2;
 +    int  nalloc_int2;
 +    vec_rvec_t vbuf2;
 +    
 +    /* Communication buffers for local redistribution */
 +    int  **cggl_flag;
 +    int  cggl_flag_nalloc[DIM*2];
 +    rvec **cgcm_state;
 +    int  cgcm_state_nalloc[DIM*2];
 +    
 +    /* Cell sizes for dynamic load balancing */
 +    gmx_domdec_root_t **root;
 +    real *cell_f_row;
 +    real cell_f0[DIM];
 +    real cell_f1[DIM];
 +    real cell_f_max0[DIM];
 +    real cell_f_min1[DIM];
 +    
 +    /* Stuff for load communication */
 +    gmx_bool bRecordLoad;
 +    gmx_domdec_load_t *load;
 +#ifdef GMX_MPI
 +    MPI_Comm *mpi_comm_load;
 +#endif
 +
 +    /* Maximum DLB scaling per load balancing step in percent */
 +    int dlb_scale_lim;
 +
 +    /* Cycle counters */
 +    float cycl[ddCyclNr];
 +    int   cycl_n[ddCyclNr];
 +    float cycl_max[ddCyclNr];
 +    /* Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise) */
 +    int eFlop;
 +    double flop;
 +    int    flop_n;
 +    /* How often did we have load measurements */
 +    int    n_load_have;
 +    /* How often have we collected the load measurements */
 +    int    n_load_collect;
 +    
 +    /* Statistics */
 +    double sum_nat[ddnatNR-ddnatZONE];
 +    int    ndecomp;
 +    int    nload;
 +    double load_step;
 +    double load_sum;
 +    double load_max;
 +    ivec   load_lim;
 +    double load_mdf;
 +    double load_pme;
 +
 +    /* The last partition step */
 +    gmx_large_int_t partition_step;
 +
 +    /* Debugging */
 +    int  nstDDDump;
 +    int  nstDDDumpGrid;
 +    int  DD_debug;
 +} gmx_domdec_comm_t;
 +
 +/* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
 +#define DD_CGIBS 2
 +
 +/* The flags for the cggl_flag buffer in gmx_domdec_comm_t.
 + * DD_FLAG_NRCG (65535) is a mask for the lowest 16 bits of a flag word.
 + * DD_FLAG_FW(d) selects bit 16+2*d and DD_FLAG_BW(d) selects bit 16+2*d+1,
 + * so every value of d has its own pair of bits above the low 16 bits.
 + * NOTE(review): presumably FW/BW mean forward/backward along dimension d
 + * and the low 16 bits hold a charge group size - confirm at the use sites.
 + */
 +#define DD_FLAG_NRCG  65535
 +#define DD_FLAG_FW(d) (1<<(16+(d)*2))
 +#define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 +
 +/* Zone permutation required to obtain consecutive charge groups
 + * for neighbor searching.
 + */
 +static const int zone_perm[3][4] = { {0,0,0,0},{1,0,0,0},{3,0,1,2} };
 +
 +/* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
 + * components see only j zones with that component 0.
 + */
 +
 +/* The DD zone order: one 0/1 triplet per zone, DD_MAXZONE entries */
 +static const ivec dd_zo[DD_MAXZONE] =
 +  {{0,0,0},{1,0,0},{1,1,0},{0,1,0},{0,1,1},{0,0,1},{1,0,1},{1,1,1}};
 +
 +/* The 3D setup, dd_zp3n is the number of entries in dd_zp3 */
 +#define dd_z3n  8
 +#define dd_zp3n 4
 +static const ivec dd_zp3[dd_zp3n] = {{0,0,8},{1,3,6},{2,5,6},{3,5,7}};
 +
 +/* The 2D setup, dd_zp2n is the number of entries in dd_zp2 */
 +#define dd_z2n  4
 +#define dd_zp2n 2
 +static const ivec dd_zp2[dd_zp2n] = {{0,0,4},{1,3,4}};
 +
 +/* The 1D setup, dd_zp1n is the number of entries in dd_zp1 */
 +#define dd_z1n  2
 +#define dd_zp1n 1
 +static const ivec dd_zp1[dd_zp1n] = {{0,0,2}};
 +
 +/* Factors used to avoid problems due to rounding issues */
 +#define DD_CELL_MARGIN       1.0001
 +#define DD_CELL_MARGIN2      1.00005
 +/* Factor to account for pressure scaling during nstlist steps */
 +#define DD_PRES_SCALE_MARGIN 1.02
 +
 +/* Allowed performance loss before we DLB or warn */
 +#define DD_PERF_LOSS 0.05
 +
 +#define DD_CELL_F_SIZE(dd,di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
 +
 +/* Use separate MPI send and receive commands
 + * when nnodes <= GMX_DD_NNODES_SENDRECV.
 + * This saves memory (and some copying for small nnodes).
 + * For high parallelization scatter and gather calls are used.
 + */
 +#define GMX_DD_NNODES_SENDRECV 4
 +
 +
 +/*
 +#define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 +
 +static void index2xyz(ivec nc,int ind,ivec xyz)
 +{
 +  xyz[XX] = ind % nc[XX];
 +  xyz[YY] = (ind / nc[XX]) % nc[YY];
 +  xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 +}
 +*/
 +
 +/* This order is required to minimize the coordinate communication in PME
 + * which uses decomposition in the x direction.
 + */
 +#define dd_index(n,i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
 +
 +static void ddindex2xyz(ivec nc,int ind,ivec xyz)
 +{
 +    xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 +    xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 +    xyz[ZZ] = ind % nc[ZZ];
 +}
 +
 +static int ddcoord2ddnodeid(gmx_domdec_t *dd,ivec c)
 +{
 +    int ddindex;
 +    int ddnodeid=-1;
 +    
 +    ddindex = dd_index(dd->nc,c);
 +    if (dd->comm->bCartesianPP_PME)
 +    {
 +        ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 +    }
 +    else if (dd->comm->bCartesianPP)
 +    {
 +#ifdef GMX_MPI
 +        MPI_Cart_rank(dd->mpi_comm_all,c,&ddnodeid);
 +#endif
 +    }
 +    else
 +    {
 +        ddnodeid = ddindex;
 +    }
 +    
 +    return ddnodeid;
 +}
 +
 +static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox,t_inputrec *ir)
 +{
 +    return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
 +}
 +
 +int ddglatnr(gmx_domdec_t *dd,int i)
 +{
 +    int atnr;
 +    
 +    if (dd == NULL)
 +    {
 +        atnr = i + 1;
 +    }
 +    else
 +    {
 +        if (i >= dd->comm->nat[ddnatNR-1])
 +        {
 +            gmx_fatal(FARGS,"glatnr called with %d, which is larger than the local number of atoms (%d)",i,dd->comm->nat[ddnatNR-1]);
 +        }
 +        atnr = dd->gatindex[i] + 1;
 +    }
 +    
 +    return atnr;
 +}
 +
 +t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 +{
 +    return &dd->comm->cgs_gl;
 +}
 +
 +static void vec_rvec_init(vec_rvec_t *v)
 +{
 +    v->nalloc = 0;
 +    v->v      = NULL;
 +}
 +
 +static void vec_rvec_check_alloc(vec_rvec_t *v,int n)
 +{
 +    if (n > v->nalloc)
 +    {
 +        v->nalloc = over_alloc_dd(n);
 +        srenew(v->v,v->nalloc);
 +    }
 +}
 +
 +void dd_store_state(gmx_domdec_t *dd,t_state *state)
 +{
 +    int i;
 +    
 +    if (state->ddp_count != dd->ddp_count)
 +    {
 +        gmx_incons("The state does not the domain decomposition state");
 +    }
 +    
 +    state->ncg_gl = dd->ncg_home;
 +    if (state->ncg_gl > state->cg_gl_nalloc)
 +    {
 +        state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
 +        srenew(state->cg_gl,state->cg_gl_nalloc);
 +    }
 +    for(i=0; i<state->ncg_gl; i++)
 +    {
 +        state->cg_gl[i] = dd->index_gl[i];
 +    }
 +    
 +    state->ddp_count_cg_gl = dd->ddp_count;
 +}
 +
 +gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 +{
 +    return &dd->comm->zones;
 +}
 +
 +void dd_get_ns_ranges(gmx_domdec_t *dd,int icg,
 +                      int *jcg0,int *jcg1,ivec shift0,ivec shift1)
 +{
 +    gmx_domdec_zones_t *zones;
 +    int izone,d,dim;
 +
 +    zones = &dd->comm->zones;
 +
 +    izone = 0;
 +    while (icg >= zones->izone[izone].cg1)
 +    {
 +        izone++;
 +    }
 +    
 +    if (izone == 0)
 +    {
 +        *jcg0 = icg;
 +    }
 +    else if (izone < zones->nizone)
 +    {
 +        *jcg0 = zones->izone[izone].jcg0;
 +    }
 +    else
 +    {
 +        gmx_fatal(FARGS,"DD icg %d out of range: izone (%d) >= nizone (%d)",
 +                  icg,izone,zones->nizone);
 +    }
 +        
 +    *jcg1 = zones->izone[izone].jcg1;
 +    
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        shift0[dim] = zones->izone[izone].shift0[dim];
 +        shift1[dim] = zones->izone[izone].shift1[dim];
 +        if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
 +        {
 +            /* A conservative approach, this can be optimized */
 +            shift0[dim] -= 1;
 +            shift1[dim] += 1;
 +        }
 +    }
 +}
 +
 +int dd_natoms_vsite(gmx_domdec_t *dd)
 +{
 +    return dd->comm->nat[ddnatVSITE];
 +}
 +
 +void dd_get_constraint_range(gmx_domdec_t *dd,int *at_start,int *at_end)
 +{
 +    *at_start = dd->comm->nat[ddnatCON-1];
 +    *at_end   = dd->comm->nat[ddnatCON];
 +}
 +
 +void dd_move_x(gmx_domdec_t *dd,matrix box,rvec x[])
 +{
 +    int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 +    int  *index,*cgindex;
 +    gmx_domdec_comm_t *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t *ind;
 +    rvec shift={0,0,0},*buf,*rbuf;
 +    gmx_bool bPBC,bScrew;
 +    
 +    comm = dd->comm;
 +    
 +    cgindex = dd->cgindex;
 +    
 +    buf = comm->vbuf.v;
 +
 +    nzone = 1;
 +    nat_tot = dd->nat_home;
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        bPBC   = (dd->ci[dd->dim[d]] == 0);
 +        bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 +        if (bPBC)
 +        {
 +            copy_rvec(box[dd->dim[d]],shift);
 +        }
 +        cd = &comm->cd[d];
 +        for(p=0; p<cd->np; p++)
 +        {
 +            ind = &cd->ind[p];
 +            index = ind->index;
 +            n = 0;
 +            if (!bPBC)
 +            {
 +                for(i=0; i<ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for(j=at0; j<at1; j++)
 +                    {
 +                        copy_rvec(x[j],buf[n]);
 +                        n++;
 +                    }
 +                }
 +            }
 +            else if (!bScrew)
 +            {
 +                for(i=0; i<ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for(j=at0; j<at1; j++)
 +                    {
 +                        /* We need to shift the coordinates */
 +                        rvec_add(x[j],shift,buf[n]);
 +                        n++;
 +                    }
 +                }
 +            }
 +            else
 +            {
 +                for(i=0; i<ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for(j=at0; j<at1; j++)
 +                    {
 +                        /* Shift x */
 +                        buf[n][XX] = x[j][XX] + shift[XX];
 +                        /* Rotate y and z.
 +                         * This operation requires a special shift force
 +                         * treatment, which is performed in calc_vir.
 +                         */
 +                        buf[n][YY] = box[YY][YY] - x[j][YY];
 +                        buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 +                        n++;
 +                    }
 +                }
 +            }
 +            
 +            if (cd->bInPlace)
 +            {
 +                rbuf = x + nat_tot;
 +            }
 +            else
 +            {
 +                rbuf = comm->vbuf2.v;
 +            }
 +            /* Send and receive the coordinates */
 +            dd_sendrecv_rvec(dd, d, dddirBackward,
 +                             buf,  ind->nsend[nzone+1],
 +                             rbuf, ind->nrecv[nzone+1]);
 +            if (!cd->bInPlace)
 +            {
 +                j = 0;
 +                for(zone=0; zone<nzone; zone++)
 +                {
 +                    for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 +                    {
 +                        copy_rvec(rbuf[j],x[i]);
 +                        j++;
 +                    }
 +                }
 +            }
 +            nat_tot += ind->nrecv[nzone+1];
 +        }
 +        nzone += nzone;
 +    }
 +}
 +
 +void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift)
 +{
 +    int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 +    int  *index,*cgindex;
 +    gmx_domdec_comm_t *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t *ind;
 +    rvec *buf,*sbuf;
 +    ivec vis;
 +    int  is;
 +    gmx_bool bPBC,bScrew;
 +    
 +    comm = dd->comm;
 +    
 +    cgindex = dd->cgindex;
 +
 +    buf = comm->vbuf.v;
 +
 +    n = 0;
 +    nzone = comm->zones.n/2;
 +    nat_tot = dd->nat_tot;
 +    for(d=dd->ndim-1; d>=0; d--)
 +    {
 +        bPBC   = (dd->ci[dd->dim[d]] == 0);
 +        bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 +        if (fshift == NULL && !bScrew)
 +        {
 +            bPBC = FALSE;
 +        }
 +        /* Determine which shift vector we need */
 +        clear_ivec(vis);
 +        vis[dd->dim[d]] = 1;
 +        is = IVEC2IS(vis);
 +        
 +        cd = &comm->cd[d];
 +        for(p=cd->np-1; p>=0; p--) {
 +            ind = &cd->ind[p];
 +            nat_tot -= ind->nrecv[nzone+1];
 +            if (cd->bInPlace)
 +            {
 +                sbuf = f + nat_tot;
 +            }
 +            else
 +            {
 +                sbuf = comm->vbuf2.v;
 +                j = 0;
 +                for(zone=0; zone<nzone; zone++)
 +                {
 +                    for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 +                    {
 +                        copy_rvec(f[i],sbuf[j]);
 +                        j++;
 +                    }
 +                }
 +            }
 +            /* Communicate the forces */
 +            dd_sendrecv_rvec(dd, d, dddirForward,
 +                             sbuf, ind->nrecv[nzone+1],
 +                             buf,  ind->nsend[nzone+1]);
 +            index = ind->index;
 +            /* Add the received forces */
 +            n = 0;
 +            if (!bPBC)
 +            {
 +                for(i=0; i<ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for(j=at0; j<at1; j++)
 +                    {
 +                        rvec_inc(f[j],buf[n]);
 +                        n++;
 +                    }
 +                } 
 +            }
 +            else if (!bScrew)
 +            {
 +                for(i=0; i<ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for(j=at0; j<at1; j++)
 +                    {
 +                        rvec_inc(f[j],buf[n]);
 +                        /* Add this force to the shift force */
 +                        rvec_inc(fshift[is],buf[n]);
 +                        n++;
 +                    }
 +                }
 +            }
 +            else
 +            {
 +                for(i=0; i<ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for(j=at0; j<at1; j++)
 +                    {
 +                        /* Rotate the force */
 +                        f[j][XX] += buf[n][XX];
 +                        f[j][YY] -= buf[n][YY];
 +                        f[j][ZZ] -= buf[n][ZZ];
 +                        if (fshift)
 +                        {
 +                            /* Add this force to the shift force */
 +                            rvec_inc(fshift[is],buf[n]);
 +                        }
 +                        n++;
 +                    }
 +                }
 +            }
 +        }
 +        nzone /= 2;
 +    }
 +}
 +
 +void dd_atom_spread_real(gmx_domdec_t *dd,real v[])
 +{
 +    int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 +    int  *index,*cgindex;
 +    gmx_domdec_comm_t *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t *ind;
 +    real *buf,*rbuf;
 +    
 +    comm = dd->comm;
 +    
 +    cgindex = dd->cgindex;
 +    
 +    buf = &comm->vbuf.v[0][0];
 +
 +    nzone = 1;
 +    nat_tot = dd->nat_home;
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        cd = &comm->cd[d];
 +        for(p=0; p<cd->np; p++)
 +        {
 +            ind = &cd->ind[p];
 +            index = ind->index;
 +            n = 0;
 +            for(i=0; i<ind->nsend[nzone]; i++)
 +            {
 +                at0 = cgindex[index[i]];
 +                at1 = cgindex[index[i]+1];
 +                for(j=at0; j<at1; j++)
 +                {
 +                    buf[n] = v[j];
 +                    n++;
 +                }
 +            }
 +            
 +            if (cd->bInPlace)
 +            {
 +                rbuf = v + nat_tot;
 +            }
 +            else
 +            {
 +                rbuf = &comm->vbuf2.v[0][0];
 +            }
 +            /* Send and receive the coordinates */
 +            dd_sendrecv_real(dd, d, dddirBackward,
 +                             buf,  ind->nsend[nzone+1],
 +                             rbuf, ind->nrecv[nzone+1]);
 +            if (!cd->bInPlace)
 +            {
 +                j = 0;
 +                for(zone=0; zone<nzone; zone++)
 +                {
 +                    for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 +                    {
 +                        v[i] = rbuf[j];
 +                        j++;
 +                    }
 +                }
 +            }
 +            nat_tot += ind->nrecv[nzone+1];
 +        }
 +        nzone += nzone;
 +    }
 +}
 +
 +void dd_atom_sum_real(gmx_domdec_t *dd,real v[])
 +{
 +    int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 +    int  *index,*cgindex;
 +    gmx_domdec_comm_t *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t *ind;
 +    real *buf,*sbuf;
 +    
 +    comm = dd->comm;
 +    
 +    cgindex = dd->cgindex;
 +
 +    buf = &comm->vbuf.v[0][0];
 +
 +    n = 0;
 +    nzone = comm->zones.n/2;
 +    nat_tot = dd->nat_tot;
 +    for(d=dd->ndim-1; d>=0; d--)
 +    {
 +        cd = &comm->cd[d];
 +        for(p=cd->np-1; p>=0; p--) {
 +            ind = &cd->ind[p];
 +            nat_tot -= ind->nrecv[nzone+1];
 +            if (cd->bInPlace)
 +            {
 +                sbuf = v + nat_tot;
 +            }
 +            else
 +            {
 +                sbuf = &comm->vbuf2.v[0][0];
 +                j = 0;
 +                for(zone=0; zone<nzone; zone++)
 +                {
 +                    for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 +                    {
 +                        sbuf[j] = v[i];
 +                        j++;
 +                    }
 +                }
 +            }
 +            /* Communicate the forces */
 +            dd_sendrecv_real(dd, d, dddirForward,
 +                             sbuf, ind->nrecv[nzone+1],
 +                             buf,  ind->nsend[nzone+1]);
 +            index = ind->index;
 +            /* Add the received forces */
 +            n = 0;
 +            for(i=0; i<ind->nsend[nzone]; i++)
 +            {
 +                at0 = cgindex[index[i]];
 +                at1 = cgindex[index[i]+1];
 +                for(j=at0; j<at1; j++)
 +                {
 +                    v[j] += buf[n];
 +                    n++;
 +                }
 +            } 
 +        }
 +        nzone /= 2;
 +    }
 +}
 +
 +static void print_ddzone(FILE *fp,int d,int i,int j,gmx_ddzone_t *zone)
 +{
 +    fprintf(fp,"zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
 +            d,i,j,
 +            zone->min0,zone->max1,
 +            zone->mch0,zone->mch0,
 +            zone->p1_0,zone->p1_1);
 +}
 +
 +
 +#define DDZONECOMM_MAXZONE  5
 +#define DDZONECOMM_BUFSIZE  3
 +
 +static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
 +                               int ddimind,int direction,
 +                               gmx_ddzone_t *buf_s,int n_s,
 +                               gmx_ddzone_t *buf_r,int n_r)
 +{
 +#define ZBS  DDZONECOMM_BUFSIZE
 +    rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
 +    rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
 +    int i;
 +
 +    for(i=0; i<n_s; i++)
 +    {
 +        vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
 +        vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
 +        vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
 +        vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
 +        vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
 +        vbuf_s[i*ZBS+1][2] = 0;
 +        vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
 +        vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
 +        vbuf_s[i*ZBS+2][2] = 0;
 +    }
 +
 +    dd_sendrecv_rvec(dd, ddimind, direction,
 +                     vbuf_s, n_s*ZBS,
 +                     vbuf_r, n_r*ZBS);
 +
 +    for(i=0; i<n_r; i++)
 +    {
 +        buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
 +        buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
 +        buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
 +        buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
 +        buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
 +        buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
 +        buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
 +    }
 +
 +#undef ZBS
 +}
 +/* Exchange the cell boundary information between neighboring cells
 + * (the zone limits stored in gmx_domdec_comm_t, used with
 + * dynamic load balancing).
 + * The limits of the own cell are first stored in comm->zone_d1[0] and
 + * comm->zone_d2[0][0]. Then, for every DD dimension index d except the
 + * last, from high to low, the cell fraction extremes are sent forward
 + * and the zone limits are sent backward over the pulses of comm->cd[d].
 + * On return comm->zone_d1, comm->zone_d2, comm->cell_f_max0 and
 + * comm->cell_f_min1 are updated and cell_ns_x0/cell_ns_x1 are extended
 + * along DD dimensions 1 and 2 such that they cover the received limits.
 + */
 +static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
 +                          rvec cell_ns_x0,rvec cell_ns_x1)
 +{
 +    int  d,d1,dim,dim1,pos,buf_size,i,j,k,p,npulse,npulse_min;
 +    gmx_ddzone_t *zp;
 +    gmx_ddzone_t buf_s[DDZONECOMM_MAXZONE];
 +    gmx_ddzone_t buf_r[DDZONECOMM_MAXZONE];
 +    gmx_ddzone_t buf_e[DDZONECOMM_MAXZONE];
 +    rvec extr_s[2],extr_r[2];
 +    rvec dh;
 +    real dist_d,c=0,det;
 +    gmx_domdec_comm_t *comm;
 +    gmx_bool bPBC,bUse;
 +
 +    comm = dd->comm;
 +
 +    for(d=1; d<dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
 +        zp->min0 = cell_ns_x0[dim];
 +        zp->max1 = cell_ns_x1[dim];
 +        zp->min1 = cell_ns_x1[dim];
 +        zp->mch0 = cell_ns_x0[dim];
 +        zp->mch1 = cell_ns_x1[dim];
 +        zp->p1_0 = cell_ns_x0[dim];
 +        zp->p1_1 = cell_ns_x1[dim];
 +    }
 +    
 +    for(d=dd->ndim-2; d>=0; d--)
 +    {
 +        dim  = dd->dim[d];
 +        bPBC = (dim < ddbox->npbcdim);
 +
 +        /* Use an rvec to store two reals: the lower and upper cell
 +         * fraction of DD dimension index d+1. The third component
 +         * starts as a second copy of the upper fraction and is
 +         * min-reduced separately below.
 +         */
 +        extr_s[d][0] = comm->cell_f0[d+1];
 +        extr_s[d][1] = comm->cell_f1[d+1];
 +        extr_s[d][2] = comm->cell_f1[d+1];
 +
 +        pos = 0;
 +        /* Store the extremes in the backward sending buffer,
 +         * so they get updated separately from the forward communication.
 +         */
 +        for(d1=d; d1<dd->ndim-1; d1++)
 +        {
 +            /* We invert the order to be able to use the same loop for buf_e */
 +            buf_s[pos].min0 = extr_s[d1][1];
 +            buf_s[pos].max1 = extr_s[d1][0];
 +            buf_s[pos].min1 = extr_s[d1][2];
 +            buf_s[pos].mch0 = 0;
 +            buf_s[pos].mch1 = 0;
 +            /* Store the cell corner of the dimension we communicate along */
 +            buf_s[pos].p1_0 = comm->cell_x0[dim];
 +            buf_s[pos].p1_1 = 0;
 +            pos++;
 +        }
 +
 +        buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
 +        pos++;
 +
 +        if (dd->ndim == 3 && d == 0)
 +        {
 +            buf_s[pos] = comm->zone_d2[0][1];
 +            pos++;
 +            buf_s[pos] = comm->zone_d1[0];
 +            pos++;
 +        }
 +
 +        /* We only need to communicate the extremes
 +         * in the forward direction
 +         */
 +        npulse = comm->cd[d].np;
 +        if (bPBC)
 +        {
 +            /* Take the minimum to avoid double communication */
 +            npulse_min = min(npulse,dd->nc[dim]-1-npulse);
 +        }
 +        else
 +        {
 +            /* Without PBC we should really not communicate over
 +             * the boundaries, but implementing that complicates
 +             * the communication setup and therefore we simply
 +             * do all communication, but ignore some data.
 +             */
 +            npulse_min = npulse;
 +        }
 +        for(p=0; p<npulse_min; p++)
 +        {
 +            /* Communicate the extremes forward */
 +            bUse = (bPBC || dd->ci[dim] > 0);
 +
 +            dd_sendrecv_rvec(dd, d, dddirForward,
 +                             extr_s+d, dd->ndim-d-1,
 +                             extr_r+d, dd->ndim-d-1);
 +
 +            if (bUse)
 +            {
 +                for(d1=d; d1<dd->ndim-1; d1++)
 +                {
 +                    extr_s[d1][0] = max(extr_s[d1][0],extr_r[d1][0]);
 +                    extr_s[d1][1] = min(extr_s[d1][1],extr_r[d1][1]);
 +                    extr_s[d1][2] = min(extr_s[d1][2],extr_r[d1][2]);
 +                }
 +            }
 +        }
 +
 +        buf_size = pos;
 +        for(p=0; p<npulse; p++)
 +        {
 +            /* Communicate all the zone information backward */
 +            bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
 +
 +            dd_sendrecv_ddzone(dd, d, dddirBackward,
 +                               buf_s, buf_size,
 +                               buf_r, buf_size);
 +
 +            clear_rvec(dh);
 +            if (p > 0)
 +            {
 +                for(d1=d+1; d1<dd->ndim; d1++)
 +                {
 +                    /* Determine the decrease of maximum required
 +                     * communication height along d1 due to the distance along d,
 +                     * this avoids a lot of useless atom communication.
 +                     */
 +                    dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
 +
 +                    if (ddbox->tric_dir[dim])
 +                    {
 +                        /* c is the off-diagonal coupling between the cell planes
 +                         * along directions d and d1.
 +                         */
 +                        c = ddbox->v[dim][dd->dim[d1]][dim];
 +                    }
 +                    else
 +                    {
 +                        c = 0;
 +                    }
 +                    det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
 +                    if (det > 0)
 +                    {
 +                        dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
 +                    }
 +                    else
 +                    {
 +                        /* A negative value signals out of range */
 +                        dh[d1] = -1;
 +                    }
 +                }
 +            }
 +
 +            /* Accumulate the extremes over all pulses */
 +            for(i=0; i<buf_size; i++)
 +            {
 +                if (p == 0)
 +                {
 +                    buf_e[i] = buf_r[i];
 +                }
 +                else
 +                {
 +                    if (bUse)
 +                    {
 +                        buf_e[i].min0 = min(buf_e[i].min0,buf_r[i].min0);
 +                        buf_e[i].max1 = max(buf_e[i].max1,buf_r[i].max1);
 +                        buf_e[i].min1 = min(buf_e[i].min1,buf_r[i].min1);
 +                    }
 +
 +                    if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
 +                    {
 +                        d1 = 1;
 +                    }
 +                    else
 +                    {
 +                        d1 = d + 1;
 +                    }
 +                    if (bUse && dh[d1] >= 0)
 +                    {
 +                        buf_e[i].mch0 = max(buf_e[i].mch0,buf_r[i].mch0-dh[d1]);
 +                        buf_e[i].mch1 = max(buf_e[i].mch1,buf_r[i].mch1-dh[d1]);
 +                    }
 +                }
 +                /* Copy the received buffer to the send buffer,
 +                 * to pass the data through with the next pulse.
 +                 */
 +                buf_s[i] = buf_r[i];
 +            }
 +            if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
 +                (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
 +            {
 +                /* Store the extremes */ 
 +                pos = 0;
 +
 +                for(d1=d; d1<dd->ndim-1; d1++)
 +                {
 +                    extr_s[d1][1] = min(extr_s[d1][1],buf_e[pos].min0);
 +                    extr_s[d1][0] = max(extr_s[d1][0],buf_e[pos].max1);
 +                    extr_s[d1][2] = min(extr_s[d1][2],buf_e[pos].min1);
 +                    pos++;
 +                }
 +
 +                if (d == 1 || (d == 0 && dd->ndim == 3))
 +                {
 +                    for(i=d; i<2; i++)
 +                    {
 +                        comm->zone_d2[1-d][i] = buf_e[pos];
 +                        pos++;
 +                    }
 +                }
 +                if (d == 0)
 +                {
 +                    comm->zone_d1[1] = buf_e[pos];
 +                    pos++;
 +                }
 +            }
 +        }
 +    }
 +    
 +    if (dd->ndim >= 2)
 +    {
 +        dim = dd->dim[1];
 +        for(i=0; i<2; i++)
 +        {
 +            if (debug)
 +            {
 +                print_ddzone(debug,1,i,0,&comm->zone_d1[i]);
 +            }
 +            cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d1[i].min0);
 +            cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d1[i].max1);
 +        }
 +    }
 +    if (dd->ndim >= 3)
 +    {
 +        dim = dd->dim[2];
 +        for(i=0; i<2; i++)
 +        {
 +            for(j=0; j<2; j++)
 +            {
 +                if (debug)
 +                {
 +                    print_ddzone(debug,2,i,j,&comm->zone_d2[i][j]);
 +                }
 +                cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d2[i][j].min0);
 +                cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d2[i][j].max1);
 +            }
 +        }
 +    }
 +    for(d=1; d<dd->ndim; d++)
 +    {
 +        comm->cell_f_max0[d] = extr_s[d-1][0];
 +        comm->cell_f_min1[d] = extr_s[d-1][1];
 +        if (debug)
 +        {
 +            fprintf(debug,"Cell fraction d %d, max0 %f, min1 %f\n",
 +                    d,comm->cell_f_max0[d],comm->cell_f_min1[d]);
 +        }
 +    }
 +}
 +
 +static void dd_collect_cg(gmx_domdec_t *dd,
 +                          t_state *state_local)
 +{
 +    gmx_domdec_master_t *ma=NULL;
 +    int buf2[2],*ibuf,i,ncg_home=0,*cg=NULL,nat_home=0;
 +    t_block *cgs_gl;
 +
 +    if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
 +    {
 +        /* The master has the correct distribution */
 +        return;
 +    }
 +    
 +    if (state_local->ddp_count == dd->ddp_count)
 +    {
 +        ncg_home = dd->ncg_home;
 +        cg       = dd->index_gl;
 +        nat_home = dd->nat_home;
 +    } 
 +    else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
 +    {
 +        cgs_gl = &dd->comm->cgs_gl;
 +
 +        ncg_home = state_local->ncg_gl;
 +        cg       = state_local->cg_gl;
 +        nat_home = 0;
 +        for(i=0; i<ncg_home; i++)
 +        {
 +            nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
 +        }
 +    }
 +    else
 +    {
 +        gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
 +    }
 +    
 +    buf2[0] = dd->ncg_home;
 +    buf2[1] = dd->nat_home;
 +    if (DDMASTER(dd))
 +    {
 +        ma = dd->ma;
 +        ibuf = ma->ibuf;
 +    }
 +    else
 +    {
 +        ibuf = NULL;
 +    }
 +    /* Collect the charge group and atom counts on the master */
 +    dd_gather(dd,2*sizeof(int),buf2,ibuf);
 +    
 +    if (DDMASTER(dd))
 +    {
 +        ma->index[0] = 0;
 +        for(i=0; i<dd->nnodes; i++)
 +        {
 +            ma->ncg[i] = ma->ibuf[2*i];
 +            ma->nat[i] = ma->ibuf[2*i+1];
 +            ma->index[i+1] = ma->index[i] + ma->ncg[i];
 +            
 +        }
 +        /* Make byte counts and indices */
 +        for(i=0; i<dd->nnodes; i++)
 +        {
 +            ma->ibuf[i] = ma->ncg[i]*sizeof(int);
 +            ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug,"Initial charge group distribution: ");
 +            for(i=0; i<dd->nnodes; i++)
 +                fprintf(debug," %d",ma->ncg[i]);
 +            fprintf(debug,"\n");
 +        }
 +    }
 +    
 +    /* Collect the charge group indices on the master */
 +    dd_gatherv(dd,
 +               dd->ncg_home*sizeof(int),dd->index_gl,
 +               DDMASTER(dd) ? ma->ibuf : NULL,
 +               DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
 +               DDMASTER(dd) ? ma->cg : NULL);
 +    
 +    dd->comm->master_cg_ddp_count = state_local->ddp_count;
 +}
 +
 +static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
 +                                    rvec *lv,rvec *v)
 +{
 +    gmx_domdec_master_t *ma;
 +    int  n,i,c,a,nalloc=0;
 +    rvec *buf=NULL;
 +    t_block *cgs_gl;
 +
 +    ma = dd->ma;
 +    
 +    if (!DDMASTER(dd))
 +    {
 +#ifdef GMX_MPI
 +        MPI_Send(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
 +                 dd->rank,dd->mpi_comm_all);
 +#endif
 +    } else {
 +        /* Copy the master coordinates to the global array */
 +        cgs_gl = &dd->comm->cgs_gl;
 +
 +        n = DDMASTERRANK(dd);
 +        a = 0;
 +        for(i=ma->index[n]; i<ma->index[n+1]; i++)
 +        {
 +            for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
 +            {
 +                copy_rvec(lv[a++],v[c]);
 +            }
 +        }
 +        
 +        for(n=0; n<dd->nnodes; n++)
 +        {
 +            if (n != dd->rank)
 +            {
 +                if (ma->nat[n] > nalloc)
 +                {
 +                    nalloc = over_alloc_dd(ma->nat[n]);
 +                    srenew(buf,nalloc);
 +                }
 +#ifdef GMX_MPI
 +                MPI_Recv(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,DDRANK(dd,n),
 +                         n,dd->mpi_comm_all,MPI_STATUS_IGNORE);
 +#endif
 +                a = 0;
 +                for(i=ma->index[n]; i<ma->index[n+1]; i++)
 +                {
 +                    for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
 +                    {
 +                        copy_rvec(buf[a++],v[c]);
 +                    }
 +                }
 +            }
 +        }
 +        sfree(buf);
 +    }
 +}
 +
 +static void get_commbuffer_counts(gmx_domdec_t *dd,
 +                                  int **counts,int **disps)
 +{
 +    gmx_domdec_master_t *ma;
 +    int n;
 +
 +    ma = dd->ma;
 +    
 +    /* Make the rvec count and displacment arrays */
 +    *counts  = ma->ibuf;
 +    *disps   = ma->ibuf + dd->nnodes;
 +    for(n=0; n<dd->nnodes; n++)
 +    {
 +        (*counts)[n] = ma->nat[n]*sizeof(rvec);
 +        (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
 +    }
 +}
 +
 +static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
 +                                   rvec *lv,rvec *v)
 +{
 +    gmx_domdec_master_t *ma;
 +    int  *rcounts=NULL,*disps=NULL;
 +    int  n,i,c,a;
 +    rvec *buf=NULL;
 +    t_block *cgs_gl;
 +    
 +    ma = dd->ma;
 +    
 +    if (DDMASTER(dd))
 +    {
 +        get_commbuffer_counts(dd,&rcounts,&disps);
 +
 +        buf = ma->vbuf;
 +    }
 +    
 +    dd_gatherv(dd,dd->nat_home*sizeof(rvec),lv,rcounts,disps,buf);
 +
 +    if (DDMASTER(dd))
 +    {
 +        cgs_gl = &dd->comm->cgs_gl;
 +
 +        a = 0;
 +        for(n=0; n<dd->nnodes; n++)
 +        {
 +            for(i=ma->index[n]; i<ma->index[n+1]; i++)
 +            {
 +                for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
 +                {
 +                    copy_rvec(buf[a++],v[c]);
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +void dd_collect_vec(gmx_domdec_t *dd,
 +                    t_state *state_local,rvec *lv,rvec *v)
 +{
 +    gmx_domdec_master_t *ma;
 +    int  n,i,c,a,nalloc=0;
 +    rvec *buf=NULL;
 +    
 +    dd_collect_cg(dd,state_local);
 +
 +    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
 +    {
 +        dd_collect_vec_sendrecv(dd,lv,v);
 +    }
 +    else
 +    {
 +        dd_collect_vec_gatherv(dd,lv,v);
 +    }
 +}
 +
 +
 +void dd_collect_state(gmx_domdec_t *dd,
 +                      t_state *state_local,t_state *state)
 +{
 +    int est,i,j,nh;
 +
 +    nh = state->nhchainlength;
 +
 +    if (DDMASTER(dd))
 +    {
 +        for (i=0;i<efptNR;i++) {
 +            state->lambda[i] = state_local->lambda[i];
 +        }
 +        state->fep_state = state_local->fep_state;
 +        state->veta = state_local->veta;
 +        state->vol0 = state_local->vol0;
 +        copy_mat(state_local->box,state->box);
 +        copy_mat(state_local->boxv,state->boxv);
 +        copy_mat(state_local->svir_prev,state->svir_prev);
 +        copy_mat(state_local->fvir_prev,state->fvir_prev);
 +        copy_mat(state_local->pres_prev,state->pres_prev);
 +
 +
 +        for(i=0; i<state_local->ngtc; i++)
 +        {
 +            for(j=0; j<nh; j++) {
 +                state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
 +                state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
 +            }
 +            state->therm_integral[i] = state_local->therm_integral[i];            
 +        }
 +        for(i=0; i<state_local->nnhpres; i++) 
 +        {
 +            for(j=0; j<nh; j++) {
 +                state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
 +                state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
 +            }
 +        }
 +    }
 +    for(est=0; est<estNR; est++)
 +    {
 +        if (EST_DISTR(est) && (state_local->flags & (1<<est)))
 +        {
 +            switch (est) {
 +            case estX:
 +                dd_collect_vec(dd,state_local,state_local->x,state->x);
 +                break;
 +            case estV:
 +                dd_collect_vec(dd,state_local,state_local->v,state->v);
 +                break;
 +            case estSDX:
 +                dd_collect_vec(dd,state_local,state_local->sd_X,state->sd_X);
 +                break;
 +            case estCGP:
 +                dd_collect_vec(dd,state_local,state_local->cg_p,state->cg_p);
 +                break;
 +            case estLD_RNG:
 +                if (state->nrngi == 1)
 +                {
 +                    if (DDMASTER(dd))
 +                    {
 +                        for(i=0; i<state_local->nrng; i++)
 +                        {
 +                            state->ld_rng[i] = state_local->ld_rng[i];
 +                        }
 +                    }
 +                }
 +                else
 +                {
 +                    dd_gather(dd,state_local->nrng*sizeof(state->ld_rng[0]),
 +                              state_local->ld_rng,state->ld_rng);
 +                }
 +                break;
 +            case estLD_RNGI:
 +                if (state->nrngi == 1)
 +                {
 +                   if (DDMASTER(dd))
 +                    {
 +                        state->ld_rngi[0] = state_local->ld_rngi[0];
 +                    } 
 +                }
 +                else
 +                {
 +                    dd_gather(dd,sizeof(state->ld_rngi[0]),
 +                              state_local->ld_rngi,state->ld_rngi);
 +                }
 +                break;
 +            case estDISRE_INITF:
 +            case estDISRE_RM3TAV:
 +            case estORIRE_INITF:
 +            case estORIRE_DTAV:
 +                break;
 +            default:
 +                gmx_incons("Unknown state entry encountered in dd_collect_state");
 +            }
 +        }
 +    }
 +}
 +
 +static void dd_realloc_state(t_state *state,rvec **f,int nalloc)
 +{
 +    int est;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"Reallocating state: currently %d, required %d, allocating %d\n",state->nalloc,nalloc,over_alloc_dd(nalloc));
 +    }
 +
 +    state->nalloc = over_alloc_dd(nalloc);
 +    
 +    for(est=0; est<estNR; est++)
 +    {
 +        if (EST_DISTR(est) && (state->flags & (1<<est)))
 +        {
 +            switch(est) {
 +            case estX:
 +                srenew(state->x,state->nalloc);
 +                break;
 +            case estV:
 +                srenew(state->v,state->nalloc);
 +                break;
 +            case estSDX:
 +                srenew(state->sd_X,state->nalloc);
 +                break;
 +            case estCGP:
 +                srenew(state->cg_p,state->nalloc);
 +                break;
 +            case estLD_RNG:
 +            case estLD_RNGI:
 +            case estDISRE_INITF:
 +            case estDISRE_RM3TAV:
 +            case estORIRE_INITF:
 +            case estORIRE_DTAV:
 +                /* No reallocation required */
 +                break;
 +            default:
 +                gmx_incons("Unknown state entry encountered in dd_realloc_state");            
 +            }
 +        }
 +    }
 +    
 +    if (f != NULL)
 +    {
 +        srenew(*f,state->nalloc);
 +    }
 +}
 +
 +static void dd_check_alloc_ncg(t_forcerec *fr,t_state *state,rvec **f,
 +                               int nalloc)
 +{
 +    if (nalloc > fr->cg_nalloc)
 +    {
 +        if (debug)
 +        {
 +            fprintf(debug,"Reallocating forcerec: currently %d, required %d, allocating %d\n",fr->cg_nalloc,nalloc,over_alloc_dd(nalloc));
 +        }
 +        fr->cg_nalloc = over_alloc_dd(nalloc);
 +        srenew(fr->cginfo,fr->cg_nalloc);
 +        if (fr->cutoff_scheme == ecutsGROUP)
 +        {
 +            srenew(fr->cg_cm,fr->cg_nalloc);
 +        }
 +    }
 +    if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
 +    {
 +        /* We don't use charge groups, we use x in state to set up
 +         * the atom communication.
 +         */
 +        dd_realloc_state(state,f,nalloc);
 +    }
 +}
 +
 +static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd,t_block *cgs,
 +                                       rvec *v,rvec *lv)
 +{
 +    gmx_domdec_master_t *ma;
 +    int  n,i,c,a,nalloc=0;
 +    rvec *buf=NULL;
 +    
 +    if (DDMASTER(dd))
 +    {
 +        ma  = dd->ma;
 +        
 +        for(n=0; n<dd->nnodes; n++)
 +        {
 +            if (n != dd->rank)
 +            {
 +                if (ma->nat[n] > nalloc)
 +                {
 +                    nalloc = over_alloc_dd(ma->nat[n]);
 +                    srenew(buf,nalloc);
 +                }
 +                /* Use lv as a temporary buffer */
 +                a = 0;
 +                for(i=ma->index[n]; i<ma->index[n+1]; i++)
 +                {
 +                    for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
 +                    {
 +                        copy_rvec(v[c],buf[a++]);
 +                    }
 +                }
 +                if (a != ma->nat[n])
 +                {
 +                    gmx_fatal(FARGS,"Internal error a (%d) != nat (%d)",
 +                              a,ma->nat[n]);
 +                }
 +                
 +#ifdef GMX_MPI
 +                MPI_Send(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,
 +                         DDRANK(dd,n),n,dd->mpi_comm_all);
 +#endif
 +            }
 +        }
 +        sfree(buf);
 +        n = DDMASTERRANK(dd);
 +        a = 0;
 +        for(i=ma->index[n]; i<ma->index[n+1]; i++)
 +        {
 +            for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
 +            {
 +                copy_rvec(v[c],lv[a++]);
 +            }
 +        }
 +    }
 +    else
 +    {
 +#ifdef GMX_MPI
 +        MPI_Recv(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
 +                 MPI_ANY_TAG,dd->mpi_comm_all,MPI_STATUS_IGNORE);
 +#endif
 +    }
 +}
 +
 +static void dd_distribute_vec_scatterv(gmx_domdec_t *dd,t_block *cgs,
 +                                       rvec *v,rvec *lv)
 +{
 +    gmx_domdec_master_t *ma;
 +    int  *scounts=NULL,*disps=NULL;
 +    int  n,i,c,a,nalloc=0;
 +    rvec *buf=NULL;
 +    
 +    if (DDMASTER(dd))
 +    {
 +        ma  = dd->ma;
 +     
 +        get_commbuffer_counts(dd,&scounts,&disps);
 +
 +        buf = ma->vbuf;
 +        a = 0;
 +        for(n=0; n<dd->nnodes; n++)
 +        {
 +            for(i=ma->index[n]; i<ma->index[n+1]; i++)
 +            {
 +                for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
 +                {
 +                    copy_rvec(v[c],buf[a++]);
 +                }
 +            }
 +        }
 +    }
 +
 +    dd_scatterv(dd,scounts,disps,buf,dd->nat_home*sizeof(rvec),lv);
 +}
 +
 +static void dd_distribute_vec(gmx_domdec_t *dd,t_block *cgs,rvec *v,rvec *lv)
 +{
 +    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
 +    {
 +        dd_distribute_vec_sendrecv(dd,cgs,v,lv);
 +    }
 +    else
 +    {
 +        dd_distribute_vec_scatterv(dd,cgs,v,lv);
 +    }
 +}
 +
 +static void dd_distribute_state(gmx_domdec_t *dd,t_block *cgs,
 +                                t_state *state,t_state *state_local,
 +                                rvec **f)
 +{
 +    int  i,j,nh;
 +
 +    nh = state->nhchainlength;
 +
 +    if (DDMASTER(dd))
 +    {
 +        for(i=0;i<efptNR;i++)
 +        {
 +            state_local->lambda[i] = state->lambda[i];
 +        }
 +        state_local->fep_state = state->fep_state;
 +        state_local->veta   = state->veta;
 +        state_local->vol0   = state->vol0;
 +        copy_mat(state->box,state_local->box);
 +        copy_mat(state->box_rel,state_local->box_rel);
 +        copy_mat(state->boxv,state_local->boxv);
 +        copy_mat(state->svir_prev,state_local->svir_prev);
 +        copy_mat(state->fvir_prev,state_local->fvir_prev);
 +        for(i=0; i<state_local->ngtc; i++)
 +        {
 +            for(j=0; j<nh; j++) {
 +                state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
 +                state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
 +            }
 +            state_local->therm_integral[i] = state->therm_integral[i];
 +        }
 +        for(i=0; i<state_local->nnhpres; i++)
 +        {
 +            for(j=0; j<nh; j++) {
 +                state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
 +                state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
 +            }
 +        }
 +    }
 +    dd_bcast(dd,((efptNR)*sizeof(real)),state_local->lambda);
 +    dd_bcast(dd,sizeof(int),&state_local->fep_state);
 +    dd_bcast(dd,sizeof(real),&state_local->veta);
 +    dd_bcast(dd,sizeof(real),&state_local->vol0);
 +    dd_bcast(dd,sizeof(state_local->box),state_local->box);
 +    dd_bcast(dd,sizeof(state_local->box_rel),state_local->box_rel);
 +    dd_bcast(dd,sizeof(state_local->boxv),state_local->boxv);
 +    dd_bcast(dd,sizeof(state_local->svir_prev),state_local->svir_prev);
 +    dd_bcast(dd,sizeof(state_local->fvir_prev),state_local->fvir_prev);
 +    dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_xi);
 +    dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_vxi);
 +    dd_bcast(dd,state_local->ngtc*sizeof(double),state_local->therm_integral);
 +    dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_xi);
 +    dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_vxi);
 +
 +    if (dd->nat_home > state_local->nalloc)
 +    {
 +        dd_realloc_state(state_local,f,dd->nat_home);
 +    }
 +    for(i=0; i<estNR; i++)
 +    {
 +        if (EST_DISTR(i) && (state_local->flags & (1<<i)))
 +        {
 +            switch (i) {
 +            case estX:
 +                dd_distribute_vec(dd,cgs,state->x,state_local->x);
 +                break;
 +            case estV:
 +                dd_distribute_vec(dd,cgs,state->v,state_local->v);
 +                break;
 +            case estSDX:
 +                dd_distribute_vec(dd,cgs,state->sd_X,state_local->sd_X);
 +                break;
 +            case estCGP:
 +                dd_distribute_vec(dd,cgs,state->cg_p,state_local->cg_p);
 +                break;
 +            case estLD_RNG:
 +                if (state->nrngi == 1)
 +                {
 +                    dd_bcastc(dd,
 +                              state_local->nrng*sizeof(state_local->ld_rng[0]),
 +                              state->ld_rng,state_local->ld_rng);
 +                }
 +                else
 +                {
 +                    dd_scatter(dd,
 +                               state_local->nrng*sizeof(state_local->ld_rng[0]),
 +                               state->ld_rng,state_local->ld_rng);
 +                }
 +                break;
 +            case estLD_RNGI:
 +                if (state->nrngi == 1)
 +                {
 +                    dd_bcastc(dd,sizeof(state_local->ld_rngi[0]),
 +                              state->ld_rngi,state_local->ld_rngi);
 +                }
 +                else
 +                {
 +                     dd_scatter(dd,sizeof(state_local->ld_rngi[0]),
 +                               state->ld_rngi,state_local->ld_rngi);
 +                }   
 +                break;
 +            case estDISRE_INITF:
 +            case estDISRE_RM3TAV:
 +            case estORIRE_INITF:
 +            case estORIRE_DTAV:
 +                /* Not implemented yet */
 +                break;
 +            default:
 +                gmx_incons("Unknown state entry encountered in dd_distribute_state");
 +            }
 +        }
 +    }
 +}
 +
 +static char dim2char(int dim)
 +{
 +    char c='?';
 +    
 +    switch (dim)
 +    {
 +    case XX: c = 'X'; break;
 +    case YY: c = 'Y'; break;
 +    case ZZ: c = 'Z'; break;
 +    default: gmx_fatal(FARGS,"Unknown dim %d",dim);
 +    }
 +    
 +    return c;
 +}
 +
 +static void write_dd_grid_pdb(const char *fn,gmx_large_int_t step,
 +                              gmx_domdec_t *dd,matrix box,gmx_ddbox_t *ddbox)
 +{
 +    rvec grid_s[2],*grid_r=NULL,cx,r;
 +    char fname[STRLEN],format[STRLEN],buf[22];
 +    FILE *out;
 +    int  a,i,d,z,y,x;
 +    matrix tric;
 +    real vol;
 +
 +    copy_rvec(dd->comm->cell_x0,grid_s[0]);
 +    copy_rvec(dd->comm->cell_x1,grid_s[1]);
 +    
 +    if (DDMASTER(dd))
 +    {
 +        snew(grid_r,2*dd->nnodes);
 +    }
 +    
 +    dd_gather(dd,2*sizeof(rvec),grid_s[0],DDMASTER(dd) ? grid_r[0] : NULL);
 +    
 +    if (DDMASTER(dd))
 +    {
 +        for(d=0; d<DIM; d++)
 +        {
 +            for(i=0; i<DIM; i++)
 +            {
 +                if (d == i)
 +                {
 +                    tric[d][i] = 1;
 +                }
 +                else
 +                {
 +                    if (d < ddbox->npbcdim && dd->nc[d] > 1)
 +                    {
 +                        tric[d][i] = box[i][d]/box[i][i];
 +                    }
 +                    else
 +                    {
 +                        tric[d][i] = 0;
 +                    }
 +                }
 +            }
 +        }
 +        sprintf(fname,"%s_%s.pdb",fn,gmx_step_str(step,buf));
 +        sprintf(format,"%s%s\n",get_pdbformat(),"%6.2f%6.2f");
 +        out = gmx_fio_fopen(fname,"w");
 +        gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
 +        a = 1;
 +        for(i=0; i<dd->nnodes; i++)
 +        {
 +            vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
 +            for(d=0; d<DIM; d++)
 +            {
 +                vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
 +            }
 +            for(z=0; z<2; z++)
 +            {
 +                for(y=0; y<2; y++)
 +                {
 +                    for(x=0; x<2; x++)
 +                    {
 +                        cx[XX] = grid_r[i*2+x][XX];
 +                        cx[YY] = grid_r[i*2+y][YY];
 +                        cx[ZZ] = grid_r[i*2+z][ZZ];
 +                        mvmul(tric,cx,r);
 +                        fprintf(out,format,"ATOM",a++,"CA","GLY",' ',1+i,
 +                                10*r[XX],10*r[YY],10*r[ZZ],1.0,vol);
 +                    }
 +                }
 +            }
 +            for(d=0; d<DIM; d++)
 +            {
 +                for(x=0; x<4; x++)
 +                {
 +                    switch(d)
 +                    {
 +                    case 0: y = 1 + i*8 + 2*x; break;
 +                    case 1: y = 1 + i*8 + 2*x - (x % 2); break;
 +                    case 2: y = 1 + i*8 + x; break;
 +                    }
 +                    fprintf(out,"%6s%5d%5d\n","CONECT",y,y+(1<<d));
 +                }
 +            }
 +        }
 +        gmx_fio_fclose(out);
 +        sfree(grid_r);
 +    }
 +}
 +
 +void write_dd_pdb(const char *fn,gmx_large_int_t step,const char *title,
 +                  gmx_mtop_t *mtop,t_commrec *cr,
 +                  int natoms,rvec x[],matrix box)
 +{
 +    char fname[STRLEN],format[STRLEN],format4[STRLEN],buf[22];
 +    FILE *out;
 +    int  i,ii,resnr,c;
 +    char *atomname,*resname;
 +    real b;
 +    gmx_domdec_t *dd;
 +    
 +    dd = cr->dd;
 +    if (natoms == -1)
 +    {
 +        natoms = dd->comm->nat[ddnatVSITE];
 +    }
 +    
 +    sprintf(fname,"%s_%s_n%d.pdb",fn,gmx_step_str(step,buf),cr->sim_nodeid);
 +    
 +    sprintf(format,"%s%s\n",get_pdbformat(),"%6.2f%6.2f");
 +    sprintf(format4,"%s%s\n",get_pdbformat4(),"%6.2f%6.2f");
 +    
 +    out = gmx_fio_fopen(fname,"w");
 +    
 +    fprintf(out,"TITLE     %s\n",title);
 +    gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
 +    for(i=0; i<natoms; i++)
 +    {
 +        ii = dd->gatindex[i];
 +        gmx_mtop_atominfo_global(mtop,ii,&atomname,&resnr,&resname);
 +        if (i < dd->comm->nat[ddnatZONE])
 +        {
 +            c = 0;
 +            while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
 +            {
 +                c++;
 +            }
 +            b = c;
 +        }
 +        else if (i < dd->comm->nat[ddnatVSITE])
 +        {
 +            b = dd->comm->zones.n;
 +        }
 +        else
 +        {
 +            b = dd->comm->zones.n + 1;
 +        }
 +        fprintf(out,strlen(atomname)<4 ? format : format4,
 +                "ATOM",(ii+1)%100000,
 +                atomname,resname,' ',resnr%10000,' ',
 +                10*x[i][XX],10*x[i][YY],10*x[i][ZZ],1.0,b);
 +    }
 +    fprintf(out,"TER\n");
 +    
 +    gmx_fio_fclose(out);
 +}
 +
 +real dd_cutoff_mbody(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  di;
 +    real r;
 +
 +    comm = dd->comm;
 +
 +    r = -1;
 +    if (comm->bInterCGBondeds)
 +    {
 +        if (comm->cutoff_mbody > 0)
 +        {
 +            r = comm->cutoff_mbody;
 +        }
 +        else
 +        {
 +            /* cutoff_mbody=0 means we do not have DLB */
 +            r = comm->cellsize_min[dd->dim[0]];
 +            for(di=1; di<dd->ndim; di++)
 +            {
 +                r = min(r,comm->cellsize_min[dd->dim[di]]);
 +            }
 +            if (comm->bBondComm)
 +            {
 +                r = max(r,comm->cutoff_mbody);
 +            }
 +            else
 +            {
 +                r = min(r,comm->cutoff);
 +            }
 +        }
 +    }
 +
 +    return r;
 +}
 +
 +real dd_cutoff_twobody(gmx_domdec_t *dd)
 +{
 +    real r_mb;
 +
 +    r_mb = dd_cutoff_mbody(dd);
 +
 +    return max(dd->comm->cutoff,r_mb);
 +}
 +
 +
 +static void dd_cart_coord2pmecoord(gmx_domdec_t *dd,ivec coord,ivec coord_pme)
 +{
 +    int nc,ntot;
 +    
 +    nc   = dd->nc[dd->comm->cartpmedim];
 +    ntot = dd->comm->ntot[dd->comm->cartpmedim];
 +    copy_ivec(coord,coord_pme);
 +    coord_pme[dd->comm->cartpmedim] =
 +        nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
 +}
 +
 +static int low_ddindex2pmeindex(int ndd,int npme,int ddindex)
 +{
 +    /* Here we assign a PME node to communicate with this DD node
 +     * by assuming that the major index of both is x.
 +     * We add cr->npmenodes/2 to obtain an even distribution.
 +     */
 +    return (ddindex*npme + npme/2)/ndd;
 +}
 +
 +static int ddindex2pmeindex(const gmx_domdec_t *dd,int ddindex)
 +{
 +    return low_ddindex2pmeindex(dd->nnodes,dd->comm->npmenodes,ddindex);
 +}
 +
 +static int cr_ddindex2pmeindex(const t_commrec *cr,int ddindex)
 +{
 +    return low_ddindex2pmeindex(cr->dd->nnodes,cr->npmenodes,ddindex);
 +}
 +
 +static int *dd_pmenodes(t_commrec *cr)
 +{
 +    int *pmenodes;
 +    int n,i,p0,p1;
 +    
 +    snew(pmenodes,cr->npmenodes);
 +    n = 0;
 +    for(i=0; i<cr->dd->nnodes; i++) {
 +        p0 = cr_ddindex2pmeindex(cr,i);
 +        p1 = cr_ddindex2pmeindex(cr,i+1);
 +        if (i+1 == cr->dd->nnodes || p1 > p0) {
 +            if (debug)
 +                fprintf(debug,"pmenode[%d] = %d\n",n,i+1+n);
 +            pmenodes[n] = i + 1 + n;
 +            n++;
 +        }
 +    }
 +
 +    return pmenodes;
 +}
 +
 +static int gmx_ddcoord2pmeindex(t_commrec *cr,int x,int y,int z)
 +{
 +    gmx_domdec_t *dd;
 +    ivec coords,coords_pme,nc;
 +    int  slab;
 +    
 +    dd = cr->dd;
 +    /*
 +      if (dd->comm->bCartesian) {
 +      gmx_ddindex2xyz(dd->nc,ddindex,coords);
 +      dd_coords2pmecoords(dd,coords,coords_pme);
 +      copy_ivec(dd->ntot,nc);
 +      nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
 +      coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
 +      
 +      slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
 +      } else {
 +      slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
 +      }
 +    */
 +    coords[XX] = x;
 +    coords[YY] = y;
 +    coords[ZZ] = z;
 +    slab = ddindex2pmeindex(dd,dd_index(dd->nc,coords));
 +    
 +    return slab;
 +}
 +
 +static int ddcoord2simnodeid(t_commrec *cr,int x,int y,int z)
 +{
 +    gmx_domdec_comm_t *comm;
 +    ivec coords;
 +    int  ddindex,nodeid=-1;
 +    
 +    comm = cr->dd->comm;
 +    
 +    coords[XX] = x;
 +    coords[YY] = y;
 +    coords[ZZ] = z;
 +    if (comm->bCartesianPP_PME)
 +    {
 +#ifdef GMX_MPI
 +        MPI_Cart_rank(cr->mpi_comm_mysim,coords,&nodeid);
 +#endif
 +    }
 +    else
 +    {
 +        ddindex = dd_index(cr->dd->nc,coords);
 +        if (comm->bCartesianPP)
 +        {
 +            nodeid = comm->ddindex2simnodeid[ddindex];
 +        }
 +        else
 +        {
 +            if (comm->pmenodes)
 +            {
 +                nodeid = ddindex + gmx_ddcoord2pmeindex(cr,x,y,z);
 +            }
 +            else
 +            {
 +                nodeid = ddindex;
 +            }
 +        }
 +    }
 +  
 +    return nodeid;
 +}
 +
 +static int dd_simnode2pmenode(t_commrec *cr,int sim_nodeid)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_domdec_comm_t *comm;
 +    ivec coord,coord_pme;
 +    int  i;
 +    int  pmenode=-1;
 +    
 +    dd = cr->dd;
 +    comm = dd->comm;
 +    
 +    /* This assumes a uniform x domain decomposition grid cell size */
 +    if (comm->bCartesianPP_PME)
 +    {
 +#ifdef GMX_MPI
 +        MPI_Cart_coords(cr->mpi_comm_mysim,sim_nodeid,DIM,coord);
 +        if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
 +        {
 +            /* This is a PP node */
 +            dd_cart_coord2pmecoord(dd,coord,coord_pme);
 +            MPI_Cart_rank(cr->mpi_comm_mysim,coord_pme,&pmenode);
 +        }
 +#endif
 +    }
 +    else if (comm->bCartesianPP)
 +    {
 +        if (sim_nodeid < dd->nnodes)
 +        {
 +            pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
 +        }
 +    }
 +    else
 +    {
 +        /* This assumes DD cells with identical x coordinates
 +         * are numbered sequentially.
 +         */
 +        if (dd->comm->pmenodes == NULL)
 +        {
 +            if (sim_nodeid < dd->nnodes)
 +            {
 +                /* The DD index equals the nodeid */
 +                pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
 +            }
 +        }
 +        else
 +        {
 +            i = 0;
 +            while (sim_nodeid > dd->comm->pmenodes[i])
 +            {
 +                i++;
 +            }
 +            if (sim_nodeid < dd->comm->pmenodes[i])
 +            {
 +                pmenode = dd->comm->pmenodes[i];
 +            }
 +        }
 +    }
 +    
 +    return pmenode;
 +}
 +
 +gmx_bool gmx_pmeonlynode(t_commrec *cr,int sim_nodeid)
 +{
 +    gmx_bool bPMEOnlyNode;
 +    
 +    if (DOMAINDECOMP(cr))
 +    {
 +        bPMEOnlyNode = (dd_simnode2pmenode(cr,sim_nodeid) == -1);
 +    }
 +    else
 +    {
 +        bPMEOnlyNode = FALSE;
 +    }
 +    
 +    return bPMEOnlyNode;
 +}
 +
 +void get_pme_ddnodes(t_commrec *cr,int pmenodeid,
 +                     int *nmy_ddnodes,int **my_ddnodes,int *node_peer)
 +{
 +    gmx_domdec_t *dd;
 +    int x,y,z;
 +    ivec coord,coord_pme;
 +    
 +    dd = cr->dd;
 +    
 +    snew(*my_ddnodes,(dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
 +    
 +    *nmy_ddnodes = 0;
 +    for(x=0; x<dd->nc[XX]; x++)
 +    {
 +        for(y=0; y<dd->nc[YY]; y++)
 +        {
 +            for(z=0; z<dd->nc[ZZ]; z++)
 +            {
 +                if (dd->comm->bCartesianPP_PME)
 +                {
 +                    coord[XX] = x;
 +                    coord[YY] = y;
 +                    coord[ZZ] = z;
 +                    dd_cart_coord2pmecoord(dd,coord,coord_pme);
 +                    if (dd->ci[XX] == coord_pme[XX] &&
 +                        dd->ci[YY] == coord_pme[YY] &&
 +                        dd->ci[ZZ] == coord_pme[ZZ])
 +                        (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
 +                }
 +                else
 +                {
 +                    /* The slab corresponds to the nodeid in the PME group */
 +                    if (gmx_ddcoord2pmeindex(cr,x,y,z) == pmenodeid)
 +                    {
 +                        (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
 +                    }
 +                }
 +            }
 +        }
 +    }
 +    
 +    /* The last PP-only node is the peer node */
 +    *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
 +    
 +    if (debug)
 +    {
 +        fprintf(debug,"Receive coordinates from PP nodes:");
 +        for(x=0; x<*nmy_ddnodes; x++)
 +        {
 +            fprintf(debug," %d",(*my_ddnodes)[x]);
 +        }
 +        fprintf(debug,"\n");
 +    }
 +}
 +
 +static gmx_bool receive_vir_ener(t_commrec *cr)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  pmenode,coords[DIM],rank;
 +    gmx_bool bReceive;
 +    
 +    bReceive = TRUE;
 +    if (cr->npmenodes < cr->dd->nnodes)
 +    {
 +        comm = cr->dd->comm;
 +        if (comm->bCartesianPP_PME)
 +        {
 +            pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
 +#ifdef GMX_MPI
 +            MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,coords);
 +            coords[comm->cartpmedim]++;
 +            if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
 +            {
 +                MPI_Cart_rank(cr->mpi_comm_mysim,coords,&rank);
 +                if (dd_simnode2pmenode(cr,rank) == pmenode)
 +                {
 +                    /* This is not the last PP node for pmenode */
 +                    bReceive = FALSE;
 +                }
 +            }
 +#endif  
 +        }
 +        else
 +        {
 +            pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
 +            if (cr->sim_nodeid+1 < cr->nnodes &&
 +                dd_simnode2pmenode(cr,cr->sim_nodeid+1) == pmenode)
 +            {
 +                /* This is not the last PP node for pmenode */
 +                bReceive = FALSE;
 +            }
 +        }
 +    }
 +    
 +    return bReceive;
 +}
 +
 +static void set_zones_ncg_home(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_zones_t *zones;
 +    int i;
 +
 +    zones = &dd->comm->zones;
 +
 +    zones->cg_range[0] = 0;
 +    for(i=1; i<zones->n+1; i++)
 +    {
 +        zones->cg_range[i] = dd->ncg_home;
 +    }
 +}
 +
 +static void rebuild_cgindex(gmx_domdec_t *dd,
 +                            const int *gcgs_index,t_state *state)
 +{
 +    int nat,i,*ind,*dd_cg_gl,*cgindex,cg_gl;
 +    
 +    ind = state->cg_gl;
 +    dd_cg_gl = dd->index_gl;
 +    cgindex  = dd->cgindex;
 +    nat = 0;
 +    cgindex[0] = nat;
 +    for(i=0; i<state->ncg_gl; i++)
 +    {
 +        cgindex[i] = nat;
 +        cg_gl = ind[i];
 +        dd_cg_gl[i] = cg_gl;
 +        nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
 +    }
 +    cgindex[i] = nat;
 +    
 +    dd->ncg_home = state->ncg_gl;
 +    dd->nat_home = nat;
 +
 +    set_zones_ncg_home(dd);
 +}
 +
 +static int ddcginfo(const cginfo_mb_t *cginfo_mb,int cg)
 +{
 +    while (cg >= cginfo_mb->cg_end)
 +    {
 +        cginfo_mb++;
 +    }
 +
 +    return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
 +}
 +
 +static void dd_set_cginfo(int *index_gl,int cg0,int cg1,
 +                          t_forcerec *fr,char *bLocalCG)
 +{
 +    cginfo_mb_t *cginfo_mb;
 +    int *cginfo;
 +    int cg;
 +
 +    if (fr != NULL)
 +    {
 +        cginfo_mb = fr->cginfo_mb;
 +        cginfo    = fr->cginfo;
 +
 +        for(cg=cg0; cg<cg1; cg++)
 +        {
 +            cginfo[cg] = ddcginfo(cginfo_mb,index_gl[cg]);
 +        }
 +    }
 +
 +    if (bLocalCG != NULL)
 +    {
 +        for(cg=cg0; cg<cg1; cg++)
 +        {
 +            bLocalCG[index_gl[cg]] = TRUE;
 +        }
 +    }
 +}
 +
 +static void make_dd_indices(gmx_domdec_t *dd,
 +                            const int *gcgs_index,int cg_start)
 +{
 +    int nzone,zone,zone1,cg0,cg1,cg1_p1,cg,cg_gl,a,a_gl;
 +    int *zone2cg,*zone_ncg1,*index_gl,*gatindex;
 +    gmx_ga2la_t *ga2la;
 +    char *bLocalCG;
 +    gmx_bool bCGs;
 +
 +    bLocalCG = dd->comm->bLocalCG;
 +
 +    if (dd->nat_tot > dd->gatindex_nalloc)
 +    {
 +        dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
 +        srenew(dd->gatindex,dd->gatindex_nalloc);
 +    }
 +
 +    nzone      = dd->comm->zones.n;
 +    zone2cg    = dd->comm->zones.cg_range;
 +    zone_ncg1  = dd->comm->zone_ncg1;
 +    index_gl   = dd->index_gl;
 +    gatindex   = dd->gatindex;
 +    bCGs       = dd->comm->bCGs;
 +
 +    if (zone2cg[1] != dd->ncg_home)
 +    {
 +        gmx_incons("dd->ncg_zone is not up to date");
 +    }
 +    
 +    /* Make the local to global and global to local atom index */
 +    a = dd->cgindex[cg_start];
 +    for(zone=0; zone<nzone; zone++)
 +    {
 +        if (zone == 0)
 +        {
 +            cg0 = cg_start;
 +        }
 +        else
 +        {
 +            cg0 = zone2cg[zone];
 +        }
 +        cg1    = zone2cg[zone+1];
 +        cg1_p1 = cg0 + zone_ncg1[zone];
 +
 +        for(cg=cg0; cg<cg1; cg++)
 +        {
 +            zone1 = zone;
 +            if (cg >= cg1_p1)
 +            {
 +                /* Signal that this cg is from more than one pulse away */
 +                zone1 += nzone;
 +            }
 +            cg_gl = index_gl[cg];
 +            if (bCGs)
 +            {
 +                for(a_gl=gcgs_index[cg_gl]; a_gl<gcgs_index[cg_gl+1]; a_gl++)
 +                {
 +                    gatindex[a] = a_gl;
 +                    ga2la_set(dd->ga2la,a_gl,a,zone1);
 +                    a++;
 +                }
 +            }
 +            else
 +            {
 +                gatindex[a] = cg_gl;
 +                ga2la_set(dd->ga2la,cg_gl,a,zone1);
 +                a++;
 +            }
 +        }
 +    }
 +}
 +
 +static int check_bLocalCG(gmx_domdec_t *dd,int ncg_sys,const char *bLocalCG,
 +                          const char *where)
 +{
 +    int ncg,i,ngl,nerr;
 +
 +    nerr = 0;
 +    if (bLocalCG == NULL)
 +    {
 +        return nerr;
 +    }
 +    for(i=0; i<dd->ncg_tot; i++)
 +    {
 +        if (!bLocalCG[dd->index_gl[i]])
 +        {
 +            fprintf(stderr,
 +                    "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n",dd->rank,where,i+1,dd->index_gl[i]+1,dd->ncg_home);
 +            nerr++;
 +        }
 +    }
 +    ngl = 0;
 +    for(i=0; i<ncg_sys; i++)
 +    {
 +        if (bLocalCG[i])
 +        {
 +            ngl++;
 +        }
 +    }
 +    if (ngl != dd->ncg_tot)
 +    {
 +        fprintf(stderr,"DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n",dd->rank,where,ngl,dd->ncg_tot);
 +        nerr++;
 +    }
 +
 +    return nerr;
 +}
 +
 +static void check_index_consistency(gmx_domdec_t *dd,
 +                                    int natoms_sys,int ncg_sys,
 +                                    const char *where)
 +{
 +    int  nerr,ngl,i,a,cell;
 +    int  *have;
 +
 +    nerr = 0;
 +
 +    if (dd->comm->DD_debug > 1)
 +    {
 +        snew(have,natoms_sys);
 +        for(a=0; a<dd->nat_tot; a++)
 +        {
 +            if (have[dd->gatindex[a]] > 0)
 +            {
 +                fprintf(stderr,"DD node %d: global atom %d occurs twice: index %d and %d\n",dd->rank,dd->gatindex[a]+1,have[dd->gatindex[a]],a+1);
 +            }
 +            else
 +            {
 +                have[dd->gatindex[a]] = a + 1;
 +            }
 +        }
 +        sfree(have);
 +    }
 +
 +    snew(have,dd->nat_tot);
 +
 +    ngl  = 0;
 +    for(i=0; i<natoms_sys; i++)
 +    {
 +        if (ga2la_get(dd->ga2la,i,&a,&cell))
 +        {
 +            if (a >= dd->nat_tot)
 +            {
 +                fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n",dd->rank,i+1,a+1,dd->nat_tot);
 +                nerr++;
 +            }
 +            else
 +            {
 +                have[a] = 1;
 +                if (dd->gatindex[a] != i)
 +                {
 +                    fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n",dd->rank,i+1,a+1,dd->gatindex[a]+1);
 +                    nerr++;
 +                }
 +            }
 +            ngl++;
 +        }
 +    }
 +    if (ngl != dd->nat_tot)
 +    {
 +        fprintf(stderr,
 +                "DD node %d, %s: %d global atom indices, %d local atoms\n",
 +                dd->rank,where,ngl,dd->nat_tot);
 +    }
 +    for(a=0; a<dd->nat_tot; a++)
 +    {
 +        if (have[a] == 0)
 +        {
 +            fprintf(stderr,
 +                    "DD node %d, %s: local atom %d, global %d has no global index\n",
 +                    dd->rank,where,a+1,dd->gatindex[a]+1);
 +        }
 +    }
 +    sfree(have);
 +
 +    nerr += check_bLocalCG(dd,ncg_sys,dd->comm->bLocalCG,where);
 +
 +    if (nerr > 0) {
 +        gmx_fatal(FARGS,"DD node %d, %s: %d atom/cg index inconsistencies",
 +                  dd->rank,where,nerr);
 +    }
 +}
 +
 +static void clear_dd_indices(gmx_domdec_t *dd,int cg_start,int a_start)
 +{
 +    int  i;
 +    char *bLocalCG;
 +
 +    if (a_start == 0)
 +    {
 +        /* Clear the whole list without searching */
 +        ga2la_clear(dd->ga2la);
 +    }
 +    else
 +    {
 +        for(i=a_start; i<dd->nat_tot; i++)
 +        {
 +            ga2la_del(dd->ga2la,dd->gatindex[i]);
 +        }
 +    }
 +
 +    bLocalCG = dd->comm->bLocalCG;
 +    if (bLocalCG)
 +    {
 +        for(i=cg_start; i<dd->ncg_tot; i++)
 +        {
 +            bLocalCG[dd->index_gl[i]] = FALSE;
 +        }
 +    }
 +
 +    dd_clear_local_vsite_indices(dd);
 +    
 +    if (dd->constraints)
 +    {
 +        dd_clear_local_constraint_indices(dd);
 +    }
 +}
 +
 +static real grid_jump_limit(gmx_domdec_comm_t *comm,real cutoff,
 +                            int dim_ind)
 +{
 +    real grid_jump_limit;
 +
 +    /* The distance between the boundaries of cells at distance
 +     * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
 +     * and by the fact that cells should not be shifted by more than
 +     * half their size, such that cg's only shift by one cell
 +     * at redecomposition.
 +     */
 +    grid_jump_limit = comm->cellsize_limit;
 +    if (!comm->bVacDLBNoLimit)
 +    {
 +        grid_jump_limit = max(grid_jump_limit,
 +                              cutoff/comm->cd[dim_ind].np);
 +    }
 +
 +    return grid_jump_limit;
 +}
 +
 +static gmx_bool check_grid_jump(gmx_large_int_t step,
 +                                gmx_domdec_t *dd,
 +                                real cutoff,
 +                                gmx_ddbox_t *ddbox,
 +                                gmx_bool bFatal)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  d,dim;
 +    real limit,bfac;
 +    gmx_bool bInvalid;
 +
 +    bInvalid = FALSE;
 +
 +    comm = dd->comm;
 +    
 +    for(d=1; d<dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        limit = grid_jump_limit(comm,cutoff,d);
 +        bfac = ddbox->box_size[dim];
 +        if (ddbox->tric_dir[dim])
 +        {
 +            bfac *= ddbox->skew_fac[dim];
 +        }
 +        if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
 +            (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
 +        {
 +            bInvalid = TRUE;
 +
 +            if (bFatal)
 +            {
 +                char buf[22];
 +
 +                /* This error should never be triggered under normal
 +                 * circumstances, but you never know ...
 +                 */
 +                gmx_fatal(FARGS,"Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with less nodes might avoid this issue.",
 +                          gmx_step_str(step,buf),
 +                          dim2char(dim),dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
 +            }
 +        }
 +    }
 +
 +    return bInvalid;
 +}
 +
 +static int dd_load_count(gmx_domdec_comm_t *comm)
 +{
 +    return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
 +}
 +
 +static float dd_force_load(gmx_domdec_comm_t *comm)
 +{
 +    float load;
 +    
 +    if (comm->eFlop)
 +    {
 +        load = comm->flop;
 +        if (comm->eFlop > 1)
 +        {
 +            load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
 +        }
 +    } 
 +    else
 +    {
 +        load = comm->cycl[ddCyclF];
 +        if (comm->cycl_n[ddCyclF] > 1)
 +        {
 +            /* Subtract the maximum of the last n cycle counts
 +             * to get rid of possible high counts due to other soures,
 +             * for instance system activity, that would otherwise
 +             * affect the dynamic load balancing.
 +             */
 +            load -= comm->cycl_max[ddCyclF];
 +        }
 +    }
 +    
 +    return load;
 +}
 +
 +static void set_slb_pme_dim_f(gmx_domdec_t *dd,int dim,real **dim_f)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int i;
 +    
 +    comm = dd->comm;
 +    
 +    snew(*dim_f,dd->nc[dim]+1);
 +    (*dim_f)[0] = 0;
 +    for(i=1; i<dd->nc[dim]; i++)
 +    {
 +        if (comm->slb_frac[dim])
 +        {
 +            (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
 +        }
 +        else
 +        {
 +            (*dim_f)[i] = (real)i/(real)dd->nc[dim];
 +        }
 +    }
 +    (*dim_f)[dd->nc[dim]] = 1;
 +}
 +
 +static void init_ddpme(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,int dimind)
 +{
 +    int        pmeindex,slab,nso,i;
 +    ivec xyz;
 +    
 +    if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
 +    {
 +        ddpme->dim = YY;
 +    }
 +    else
 +    {
 +        ddpme->dim = dimind;
 +    }
 +    ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
 +    
 +    ddpme->nslab = (ddpme->dim == 0 ?
 +                    dd->comm->npmenodes_x :
 +                    dd->comm->npmenodes_y);
 +
 +    if (ddpme->nslab <= 1)
 +    {
 +        return;
 +    }
 +
 +    nso = dd->comm->npmenodes/ddpme->nslab;
 +    /* Determine for each PME slab the PP location range for dimension dim */
 +    snew(ddpme->pp_min,ddpme->nslab);
 +    snew(ddpme->pp_max,ddpme->nslab);
 +    for(slab=0; slab<ddpme->nslab; slab++) {
 +        ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
 +        ddpme->pp_max[slab] = 0;
 +    }
 +    for(i=0; i<dd->nnodes; i++) {
 +        ddindex2xyz(dd->nc,i,xyz);
 +        /* For y only use our y/z slab.
 +         * This assumes that the PME x grid size matches the DD grid size.
 +         */
 +        if (dimind == 0 || xyz[XX] == dd->ci[XX]) {
 +            pmeindex = ddindex2pmeindex(dd,i);
 +            if (dimind == 0) {
 +                slab = pmeindex/nso;
 +            } else {
 +                slab = pmeindex % ddpme->nslab;
 +            }
 +            ddpme->pp_min[slab] = min(ddpme->pp_min[slab],xyz[dimind]);
 +            ddpme->pp_max[slab] = max(ddpme->pp_max[slab],xyz[dimind]);
 +        }
 +    }
 +
 +    set_slb_pme_dim_f(dd,ddpme->dim,&ddpme->slb_dim_f);
 +}
 +
 +int dd_pme_maxshift_x(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->ddpme[0].dim == XX)
 +    {
 +        return dd->comm->ddpme[0].maxshift;
 +    }
 +    else
 +    {
 +        return 0;
 +    }
 +}
 +
 +int dd_pme_maxshift_y(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->ddpme[0].dim == YY)
 +    {
 +        return dd->comm->ddpme[0].maxshift;
 +    }
 +    else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
 +    {
 +        return dd->comm->ddpme[1].maxshift;
 +    }
 +    else
 +    {
 +        return 0;
 +    }
 +}
 +
 +static void set_pme_maxshift(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,
 +                             gmx_bool bUniform,gmx_ddbox_t *ddbox,real *cell_f)
 +{   /* Determine ddpme->maxshift, the PME slab communication range
 +     * along ddpme->dim.
 +     *
 +     * bUniform  selects the shortcut sh = 1 when the number of slabs
 +     *           equals the number of DD cells along the dimension
 +     * cell_f    relative cell boundaries, indexed through
 +     *           ddpme->pp_min/pp_max; only read in the general case
 +     *
 +     * The result is ns/2 when the PP decomposition does not match the
 +     * PME dimension, 1 for at most 3 slabs or the uniform matching
 +     * case, and otherwise the smallest shift (at least 1, below ns) for
 +     * which no slab further away overlaps a slab boundary within range.
 +     */
 +    gmx_domdec_comm_t *comm;
 +    int  nc,ns,s;
 +    int  *xmin,*xmax;
 +    real range,pme_boundary;
 +    int  sh;
 +    
 +    comm = dd->comm;
 +    nc  = dd->nc[ddpme->dim];
 +    ns  = ddpme->nslab;
 +    
 +    if (!ddpme->dim_match)
 +    {
 +        /* PP decomposition is not along dim: the worst situation */
 +        sh = ns/2;
 +    }
 +    else if (ns <= 3 || (bUniform && ns == nc))
 +    {
 +        /* The optimal situation */
 +        sh = 1;
 +    }
 +    else
 +    {
 +        /* We need to check for all pme nodes which nodes they
 +         * could possibly need to communicate with.
 +         */
 +        xmin = ddpme->pp_min;
 +        xmax = ddpme->pp_max;
 +        /* Allow for atoms to be maximally 2/3 times the cut-off
 +         * out of their DD cell. This is a reasonable balance
 +         * between performance and support for most charge-group/cut-off
 +         * combinations.
 +         */
 +        range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
 +        /* Avoid extra communication when we are exactly at a boundary */
 +        range *= 0.999;
 +        
 +        sh = 1;
 +        for(s=0; s<ns; s++)
 +        {
 +            /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
 +            pme_boundary = (real)s/ns;
 +            /* Lower side: grow sh while slab s-(sh+1), wrapped around by
 +             * ns with its boundary shifted by -1 when the index is
 +             * negative, still reaches past the lower slab boundary.
 +             */
 +            while (sh+1 < ns &&
 +                   ((s-(sh+1) >= 0 &&
 +                     cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
 +                    (s-(sh+1) <  0 &&
 +                     cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
 +            {
 +                sh++;
 +            }
 +            pme_boundary = (real)(s+1)/ns;
 +            /* Upper side: the same test for slab s+(sh+1), wrapped around
 +             * by ns with its boundary shifted by +1 when the index
 +             * reaches ns.
 +             */
 +            while (sh+1 < ns &&
 +                   ((s+(sh+1) <  ns &&
 +                     cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
 +                    (s+(sh+1) >= ns &&
 +                     cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
 +            {
 +                sh++;
 +            }
 +        }
 +    }
 +    
 +    ddpme->maxshift = sh;
 +    
 +    if (debug)
 +    {
 +        fprintf(debug,"PME slab communication range for dim %d is %d\n",
 +                ddpme->dim,ddpme->maxshift);
 +    }
 +}
 +
 +static void check_box_size(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
 +{
 +    int d,dim;
 +    
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        if (dim < ddbox->nboundeddim &&
 +            ddbox->box_size[dim]*ddbox->skew_fac[dim] <
 +            dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
 +        {
 +            gmx_fatal(FARGS,"The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
 +                      dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
 +                      dd->nc[dim],dd->comm->cellsize_limit);
 +        }
 +    }
 +}
 +
 +static void set_dd_cell_sizes_slb(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
 +                                  gmx_bool bMaster,ivec npulse)
 +{   /* Set the DD cell boundaries for a uniform or statically load
 +     * balanced grid, for all DIM dimensions.
 +     *
 +     * bMaster  when set, all cell boundaries are stored in
 +     *          dd->ma->cell_x; otherwise only the boundaries of our own
 +     *          cell are stored in comm->cell_x0/cell_x1
 +     * npulse   output: per dimension the pulse count, raised from 1
 +     *          while cellsize*npulse is below comm->cutoff and npulse
 +     *          is below nc-1
 +     *
 +     * Without dynamic load balancing the minimum cell sizes are copied
 +     * to comm->cellsize_min. Finally the PME maxshift is set for each
 +     * PME decomposition dimension.
 +     */
 +    gmx_domdec_comm_t *comm;
 +    int  d,j;
 +    rvec cellsize_min;
 +    real *cell_x,cell_dx,cellsize;
 +    
 +    comm = dd->comm;
 +    
 +    for(d=0; d<DIM; d++)
 +    {
 +        cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
 +        npulse[d] = 1;
 +        if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
 +        {
 +            /* Uniform grid */
 +            cell_dx = ddbox->box_size[d]/dd->nc[d];
 +            if (bMaster)
 +            {
 +                for(j=0; j<dd->nc[d]+1; j++)
 +                {
 +                    dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
 +                }
 +            }
 +            else
 +            {
 +                comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
 +                comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
 +            }
 +            cellsize = cell_dx*ddbox->skew_fac[d];
 +            while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
 +            {
 +                npulse[d]++;
 +            }
 +            cellsize_min[d] = cellsize;
 +        }
 +        else
 +        {
 +            /* Statically load balanced grid */
 +            /* Also when we are not doing a master distribution we determine
 +             * all cell borders in a loop to obtain identical values
 +             * to the master distribution case and to determine npulse.
 +             */
 +            if (bMaster)
 +            {
 +                cell_x = dd->ma->cell_x[d];
 +            }
 +            else
 +            {
 +                /* Temporary buffer, freed below once our own cell is stored */
 +                snew(cell_x,dd->nc[d]+1);
 +            }
 +            cell_x[0] = ddbox->box0[d];
 +            for(j=0; j<dd->nc[d]; j++)
 +            {
 +                cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
 +                cell_x[j+1] = cell_x[j] + cell_dx;
 +                cellsize = cell_dx*ddbox->skew_fac[d];
 +                while (cellsize*npulse[d] < comm->cutoff &&
 +                       npulse[d] < dd->nc[d]-1)
 +                {
 +                    npulse[d]++;
 +                }
 +                cellsize_min[d] = min(cellsize_min[d],cellsize);
 +            }
 +            if (!bMaster)
 +            {
 +                comm->cell_x0[d] = cell_x[dd->ci[d]];
 +                comm->cell_x1[d] = cell_x[dd->ci[d]+1];
 +                sfree(cell_x);
 +            }
 +        }
 +        /* The following limitation is to avoid that a cell would receive
 +         * some of its own home charge groups back over the periodic boundary.
 +         * Double charge groups cause trouble with the global indices.
 +         *
 +         * NOTE(review): the loops above stop raising npulse[d] at
 +         * dd->nc[d]-1, so for dd->nc[d] > 1 the condition
 +         * npulse[d] >= dd->nc[d] looks unreachable - confirm whether
 +         * this check can still trigger.
 +         */
 +        if (d < ddbox->npbcdim &&
 +            dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
 +        {
 +            gmx_fatal_collective(FARGS,NULL,dd,
 +                                 "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
 +                                 dim2char(d),ddbox->box_size[d],ddbox->skew_fac[d],
 +                                 comm->cutoff,
 +                                 dd->nc[d],dd->nc[d],
 +                                 dd->nnodes > dd->nc[d] ? "cells" : "processors");
 +        }
 +    }
 +    
 +    if (!comm->bDynLoadBal)
 +    {
 +        copy_rvec(cellsize_min,comm->cellsize_min);
 +    }
 +   
 +    for(d=0; d<comm->npmedecompdim; d++)
 +    {
 +        set_pme_maxshift(dd,&comm->ddpme[d],
 +                         comm->slb_frac[dd->dim[d]]==NULL,ddbox,
 +                         comm->ddpme[d].slb_dim_f);
 +    }
 +}
 +
 +
 +static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
 +                                       int d,int dim,gmx_domdec_root_t *root,
 +                                       gmx_ddbox_t *ddbox,
 +                                       gmx_bool bUniform,gmx_large_int_t step, real cellsize_limit_f, int range[])
 +{   /* Turn the requested cell sizes in root->buf_ncd into relative cell
 +     * boundaries root->cell_f for the cells range[0] up to, but not
 +     * including, range[1], while enforcing the size limits.
 +     *
 +     * d                 decomposition dimension index; staggering limits
 +     *                   are only handled for d > 0
 +     * dim               Cartesian dimension belonging to d
 +     * cellsize_limit_f  minimum relative cell size
 +     * range             [lower, upper) cell range; the boundaries
 +     *                   root->cell_f[range[0]] and root->cell_f[range[1]]
 +     *                   are read as given
 +     *
 +     * The function calls itself on sub-ranges when a boundary violates
 +     * root->bound_min/bound_max. It also sets root->bCellMin,
 +     * root->bLimited and, for uniform grids with d > 0,
 +     * root->cell_f_max0/cell_f_min1.
 +     */
 +    gmx_domdec_comm_t *comm;
 +    int  ncd,i,j,nmin,nmin_old;
 +    gmx_bool bLimLo,bLimHi;
 +    real *cell_size;
 +    real fac,halfway,cellsize_limit_f_i,region_size;
 +    gmx_bool bPBC,bLastHi=FALSE;
 +    int nrange[]={range[0],range[1]};
 +
 +    region_size= root->cell_f[range[1]]-root->cell_f[range[0]];  
 +
 +    comm = dd->comm;
 +
 +    ncd = dd->nc[dim];
 +
 +    bPBC = (dim < ddbox->npbcdim);
 +
 +    cell_size = root->buf_ncd;
 +
 +    if (debug) 
 +    {
 +        fprintf(debug,"enforce_limits: %d %d\n",range[0],range[1]);
 +    }
 +
 +    /* First we need to check if the scaling does not make cells
 +     * smaller than the smallest allowed size.
 +     * We need to do this iteratively, since if a cell is too small,
 +     * it needs to be enlarged, which makes all the other cells smaller,
 +     * which could in turn make another cell smaller than allowed.
 +     */
 +    for(i=range[0]; i<range[1]; i++)
 +    {
 +        root->bCellMin[i] = FALSE;
 +    }
 +    nmin = 0;
 +    do
 +    {
 +        nmin_old = nmin;
 +        /* We need the total for normalization */
 +        fac = 0;
 +        for(i=range[0]; i<range[1]; i++)
 +        {
 +            if (root->bCellMin[i] == FALSE)
 +            {
 +                fac += cell_size[i];
 +            }
 +        }
 +        fac = ( region_size - nmin*cellsize_limit_f)/fac; /* subtracting cells already set to cellsize_limit_f */
 +        /* Determine the cell boundaries */
 +        for(i=range[0]; i<range[1]; i++)
 +        {
 +            if (root->bCellMin[i] == FALSE)
 +            {
 +                cell_size[i] *= fac;
 +                if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
 +                {
 +                    /* No minimum size for the outer cells without pbc */
 +                    cellsize_limit_f_i = 0;
 +                }
 +                else
 +                {
 +                    cellsize_limit_f_i = cellsize_limit_f;
 +                }
 +                if (cell_size[i] < cellsize_limit_f_i)
 +                {
 +                    root->bCellMin[i] = TRUE;
 +                    cell_size[i] = cellsize_limit_f_i;
 +                    nmin++;
 +                }
 +            }
 +            root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
 +        }
 +    }
 +    while (nmin > nmin_old);
 +    
 +    i=range[1]-1;
 +    cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
 +    /* For this check we should not use DD_CELL_MARGIN,
 +     * but a slightly smaller factor,
 +     * since rounding could get us below the limit.
 +     */
 +    if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
 +    {
 +        char buf[22];
 +        gmx_fatal(FARGS,"Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
 +                  gmx_step_str(step,buf),
 +                  dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
 +                  ncd,comm->cellsize_min[dim]);
 +    }
 +    
 +    root->bLimited = (nmin > 0) || (range[0]>0) || (range[1]<ncd);
 +    
 +    if (!bUniform)
 +    {
 +        /* Check if the boundary did not displace more than halfway
 +         * each of the cells it bounds, as this could cause problems,
 +         * especially when the differences between cell sizes are large.
 +         * If changes are applied, they will not make cells smaller
 +         * than the cut-off, as we check all the boundaries which
 +         * might be affected by a change and if the old state was ok,
 +         * the cells will at most be shrunk back to their old size.
 +         */
 +        for(i=range[0]+1; i<range[1]; i++)
 +        {
 +            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
 +            if (root->cell_f[i] < halfway)
 +            {
 +                root->cell_f[i] = halfway;
 +                /* Check if the change also causes shifts of the next boundaries */
 +                for(j=i+1; j<range[1]; j++)
 +                {
 +                    if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
 +                        root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
 +                }
 +            }
 +            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
 +            if (root->cell_f[i] > halfway)
 +            {
 +                root->cell_f[i] = halfway;
 +                /* Check if the change also causes shifts of the previous boundaries */
 +                for(j=i-1; j>=range[0]+1; j--)
 +                {
 +                    if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
 +                        root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
 +                }
 +            }
 +        }
 +    }
 +    
 +    /* nrange is defined as [lower, upper) range for new call to enforce_limits */
 +    /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
 +     * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
 +     * for a and b nrange is used */
 +    if (d > 0)
 +    {
 +        /* Take care of the staggering of the cell boundaries */
 +        if (bUniform)
 +        {
 +            for(i=range[0]; i<range[1]; i++)
 +            {
 +                root->cell_f_max0[i] = root->cell_f[i];
 +                root->cell_f_min1[i] = root->cell_f[i+1];
 +            }
 +        }
 +        else
 +        {
 +            for(i=range[0]+1; i<range[1]; i++)
 +            {
 +                bLimLo = (root->cell_f[i] < root->bound_min[i]);
 +                bLimHi = (root->cell_f[i] > root->bound_max[i]);
 +                if (bLimLo && bLimHi)
 +                {
 +                    /* Both limits violated, try the best we can */
 +                    /* For this case we split the original range (range) in two parts and care about the other limitations in the next iteration. */
 +                    root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
 +                    nrange[0]=range[0];
 +                    nrange[1]=i;
 +                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +
 +                    nrange[0]=i;
 +                    nrange[1]=range[1];
 +                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +
 +                    return;
 +                }
 +                else if (bLimLo)
 +                {
 +                    /* root->cell_f[i] = root->bound_min[i]; */
 +                    nrange[1]=i;  /* only store violation location. There could be a LimLo violation following with a higher index */
 +                    bLastHi=FALSE;
 +                }
 +                else if (bLimHi && !bLastHi)
 +                {
 +                    bLastHi=TRUE;
 +                    if (nrange[1] < range[1])   /* found a LimLo before */
 +                    {
 +                        root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
 +                        dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +                        nrange[0]=nrange[1];
 +                    }
 +                    root->cell_f[i] = root->bound_max[i];
 +                    nrange[1]=i; 
 +                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +                    nrange[0]=i;
 +                    nrange[1]=range[1];
 +                }
 +            }
 +            if (nrange[1] < range[1])   /* found last a LimLo */
 +            {
 +                root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
 +                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +                nrange[0]=nrange[1];
 +                nrange[1]=range[1];
 +                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +            } 
 +            else if (nrange[0] > range[0]) /* found at least one LimHi */
 +            {
 +                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +            }
 +        }
 +    }
 +}
 +
 +
 +static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
 +                                       int d,int dim,gmx_domdec_root_t *root,
 +                                       gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
 +                                       gmx_bool bUniform,gmx_large_int_t step)
 +{   /* Determine new relative cell boundaries root->cell_f for the row of
 +     * ncd = dd->nc[dim] cells along decomposition dimension index d.
 +     *
 +     * bDynamicBox  when set and d > 0, the staggering distance is
 +     *              multiplied by DD_PRES_SCALE_MARGIN
 +     * bUniform     when set, all cells get relative size 1/ncd;
 +     *              otherwise the sizes are scaled using the measured
 +     *              loads in comm->load[d]
 +     *
 +     * The previous boundaries are saved in root->old_cell_f and the
 +     * limits are enforced with dd_cell_sizes_dlb_root_enforce_limits().
 +     * After the ncd+1 boundaries, root->cell_f additionally receives
 +     * the cell_f0/cell_f1 pairs of the lower dimensions and the PME
 +     * maxshift value(s).
 +     */
 +    gmx_domdec_comm_t *comm;
 +    int  ncd,d1,i,j,pos;
 +    real *cell_size;
 +    real load_aver,load_i,imbalance,change,change_max,sc;
 +    real cellsize_limit_f,dist_min_f,dist_min_f_hard,space;
 +    real change_limit;
 +    real relax = 0.5;
 +    gmx_bool bPBC;
 +    int range[] = { 0, 0 };
 +
 +    comm = dd->comm;
 +
 +    /* Convert the maximum change from the input percentage to a fraction */
 +    change_limit = comm->dlb_scale_lim*0.01;
 +
 +    ncd = dd->nc[dim];
 +
 +    bPBC = (dim < ddbox->npbcdim);
 +
 +    cell_size = root->buf_ncd;
 +
 +    /* Store the original boundaries */
 +    for(i=0; i<ncd+1; i++)
 +    {
 +        root->old_cell_f[i] = root->cell_f[i];
 +    }
 +    if (bUniform) {
 +        for(i=0; i<ncd; i++)
 +        {
 +            cell_size[i] = 1.0/ncd;
 +        }
 +    }
 +    else if (dd_load_count(comm)) /* NOTE(review): when this is 0 and !bUniform, cell_size is not written here - confirm root->buf_ncd then still holds usable values */
 +    {
 +        load_aver = comm->load[d].sum_m/ncd;
 +        change_max = 0;
 +        for(i=0; i<ncd; i++)
 +        {
 +            /* Determine the relative imbalance of cell i */
 +            load_i = comm->load[d].load[i*comm->load[d].nload+2];
 +            imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
 +            /* Determine the change of the cell size using underrelaxation */
 +            change = -relax*imbalance;
 +            change_max = max(change_max,max(change,-change));
 +        }
 +        /* Limit the amount of scaling.
 +         * We need to use the same rescaling for all cells in one row,
 +         * otherwise the load balancing might not converge.
 +         */
 +        sc = relax;
 +        if (change_max > change_limit)
 +        {
 +            sc *= change_limit/change_max;
 +        }
 +        for(i=0; i<ncd; i++)
 +        {
 +            /* Determine the relative imbalance of cell i */
 +            load_i = comm->load[d].load[i*comm->load[d].nload+2];
 +            imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
 +            /* Determine the change of the cell size using underrelaxation */
 +            change = -sc*imbalance;
 +            cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
 +        }
 +    }
 +    
 +    /* Convert the size and staggering limits to fractions of the box */
 +    cellsize_limit_f  = comm->cellsize_min[dim]/ddbox->box_size[dim];
 +    cellsize_limit_f *= DD_CELL_MARGIN;
 +    dist_min_f_hard   = grid_jump_limit(comm,comm->cutoff,d)/ddbox->box_size[dim];
 +    dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
 +    if (ddbox->tric_dir[dim])
 +    {
 +        cellsize_limit_f /= ddbox->skew_fac[dim];
 +        dist_min_f       /= ddbox->skew_fac[dim];
 +    }
 +    if (bDynamicBox && d > 0)
 +    {
 +        dist_min_f *= DD_PRES_SCALE_MARGIN;
 +    }
 +    if (d > 0 && !bUniform)
 +    {
 +        /* Make sure that the grid is not shifted too much */
 +        for(i=1; i<ncd; i++) {
 +            if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard) 
 +            {
 +                gmx_incons("Inconsistent DD boundary staggering limits!");
 +            }
 +            /* The lower bound is moved halfway towards the current
 +             * boundary when there is room; likewise for the upper bound.
 +             */
 +            root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
 +            space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
 +            if (space > 0) {
 +                root->bound_min[i] += 0.5*space;
 +            }
 +            root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
 +            space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
 +            if (space < 0) {
 +                root->bound_max[i] += 0.5*space;
 +            }
 +            if (debug)
 +            {
 +                fprintf(debug,
 +                        "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
 +                        d,i,
 +                        root->cell_f_max0[i-1] + dist_min_f,
 +                        root->bound_min[i],root->cell_f[i],root->bound_max[i],
 +                        root->cell_f_min1[i] - dist_min_f);
 +            }
 +        }
 +    }
 +    /* Enforce the limits over the full row of cells: range [0,ncd) */
 +    range[1]=ncd;
 +    root->cell_f[0] = 0;
 +    root->cell_f[ncd] = 1;
 +    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
 +
 +
 +    /* After the checks above, the cells should obey the cut-off
 +     * restrictions, but it does not hurt to check.
 +     */
 +    for(i=0; i<ncd; i++)
 +    {
 +        if (debug)
 +        {
 +            fprintf(debug,"Relative bounds dim %d  cell %d: %f %f\n",
 +                    dim,i,root->cell_f[i],root->cell_f[i+1]);
 +        }
 +
 +        if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
 +            root->cell_f[i+1] - root->cell_f[i] <
 +            cellsize_limit_f/DD_CELL_MARGIN)
 +        {
 +            char buf[22];
 +            fprintf(stderr,
 +                    "\nWARNING step %s: direction %c, cell %d too small: %f\n",
 +                    gmx_step_str(step,buf),dim2char(dim),i,
 +                    (root->cell_f[i+1] - root->cell_f[i])
 +                    *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
 +        }
 +    }
 +    
 +    pos = ncd + 1;
 +    /* Store the cell boundaries of the lower dimensions at the end */
 +    for(d1=0; d1<d; d1++)
 +    {
 +        root->cell_f[pos++] = comm->cell_f0[d1];
 +        root->cell_f[pos++] = comm->cell_f1[d1];
 +    }
 +    
 +    if (d < comm->npmedecompdim)
 +    {
 +        /* The master determines the maximum shift for
 +         * the coordinate communication between separate PME nodes.
 +         */
 +        set_pme_maxshift(dd,&comm->ddpme[d],bUniform,ddbox,root->cell_f);
 +    }
 +    /* The maxshift values are stored after the boundaries as well */
 +    root->cell_f[pos++] = comm->ddpme[0].maxshift;
 +    if (d >= 1)
 +    {
 +        root->cell_f[pos++] = comm->ddpme[1].maxshift;
 +    }
 +}    
 +
 +static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
 +                                             gmx_ddbox_t *ddbox,int dimind)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int dim;
 +
 +    comm = dd->comm;
 +
 +    /* Set the cell dimensions */
 +    dim = dd->dim[dimind];
 +    comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
 +    comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
 +    if (dim >= ddbox->nboundeddim)
 +    {
 +        comm->cell_x0[dim] += ddbox->box0[dim];
 +        comm->cell_x1[dim] += ddbox->box0[dim];
 +    }
 +}
 +
 +static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
 +                                         int d,int dim,real *cell_f_row,
 +                                         gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int d1,dim1,pos;
 +
 +    comm = dd->comm;
 +
 +#ifdef GMX_MPI
 +    /* Each node would only need to know two fractions,
 +     * but it is probably cheaper to broadcast the whole array.
 +     */
 +    MPI_Bcast(cell_f_row,DD_CELL_F_SIZE(dd,d)*sizeof(real),MPI_BYTE,
 +              0,comm->mpi_comm_load[d]);
 +#endif
 +    /* Copy the fractions for this dimension from the buffer */
 +    comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
 +    comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
 +    /* The whole array was communicated, so set the buffer position */
 +    pos = dd->nc[dim] + 1;
 +    for(d1=0; d1<=d; d1++)
 +    {
 +        if (d1 < d)
 +        {
 +            /* Copy the cell fractions of the lower dimensions */
 +            comm->cell_f0[d1] = cell_f_row[pos++];
 +            comm->cell_f1[d1] = cell_f_row[pos++];
 +        }
 +        relative_to_absolute_cell_bounds(dd,ddbox,d1);
 +    }
 +    /* Convert the communicated shift from float to int */
 +    comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
 +    if (d >= 1)
 +    {
 +        comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
 +    }
 +}
 +
 +static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
 +                                         gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
 +                                         gmx_bool bUniform,gmx_large_int_t step)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int d,dim,d1;
 +    gmx_bool bRowMember,bRowRoot;
 +    real *cell_f_row;
 +    
 +    comm = dd->comm;
 +
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        bRowMember = TRUE;
 +        bRowRoot = TRUE;
 +        for(d1=d; d1<dd->ndim; d1++)
 +        {
 +            if (dd->ci[dd->dim[d1]] > 0)
 +            {
 +                if (d1 > d)
 +                {
 +                    bRowMember = FALSE;
 +                }
 +                bRowRoot = FALSE;
 +            }
 +        }
 +        if (bRowMember)
 +        {
 +            if (bRowRoot)
 +            {
 +                set_dd_cell_sizes_dlb_root(dd,d,dim,comm->root[d],
 +                                           ddbox,bDynamicBox,bUniform,step);
 +                cell_f_row = comm->root[d]->cell_f;
 +            }
 +            else
 +            {
 +                cell_f_row = comm->cell_f_row;
 +            }
 +            distribute_dd_cell_sizes_dlb(dd,d,dim,cell_f_row,ddbox);
 +        }
 +    }
 +}    
 +
 +static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
 +{
 +    int d;
 +
 +    /* This function assumes the box is static and should therefore
 +     * not be called when the box has changed since the last
 +     * call to dd_partition_system.
 +     */
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        relative_to_absolute_cell_bounds(dd,ddbox,d); 
 +    }
 +}
 +
 +
 +
 +static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
 +                                  gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
 +                                  gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
 +                                  gmx_wallcycle_t wcycle)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int dim;
 +
 +    comm = dd->comm;
 +    
 +    if (bDoDLB)
 +    {
 +        wallcycle_start(wcycle,ewcDDCOMMBOUND);
 +        set_dd_cell_sizes_dlb_change(dd,ddbox,bDynamicBox,bUniform,step);
 +        wallcycle_stop(wcycle,ewcDDCOMMBOUND);
 +    }
 +    else if (bDynamicBox)
 +    {
 +        set_dd_cell_sizes_dlb_nochange(dd,ddbox);
 +    }
 +    
 +    /* Set the dimensions for which no DD is used */
 +    for(dim=0; dim<DIM; dim++) {
 +        if (dd->nc[dim] == 1) {
 +            comm->cell_x0[dim] = 0;
 +            comm->cell_x1[dim] = ddbox->box_size[dim];
 +            if (dim >= ddbox->nboundeddim)
 +            {
 +                comm->cell_x0[dim] += ddbox->box0[dim];
 +                comm->cell_x1[dim] += ddbox->box0[dim];
 +            }
 +        }
 +    }
 +}
 +
 +static void realloc_comm_ind(gmx_domdec_t *dd,ivec npulse)
 +{
 +    int d,np,i;
 +    gmx_domdec_comm_dim_t *cd;
 +    
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        cd = &dd->comm->cd[d];
 +        np = npulse[dd->dim[d]];
 +        if (np > cd->np_nalloc)
 +        {
 +            if (debug)
 +            {
 +                fprintf(debug,"(Re)allocing cd for %c to %d pulses\n",
 +                        dim2char(dd->dim[d]),np);
 +            }
 +            if (DDMASTER(dd) && cd->np_nalloc > 0)
 +            {
 +                fprintf(stderr,"\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n",dim2char(dd->dim[d]),np);
 +            }
 +            srenew(cd->ind,np);
 +            for(i=cd->np_nalloc; i<np; i++)
 +            {
 +                cd->ind[i].index  = NULL;
 +                cd->ind[i].nalloc = 0;
 +            }
 +            cd->np_nalloc = np;
 +        }
 +        cd->np = np;
 +    }
 +}
 +
 +
 +static void set_dd_cell_sizes(gmx_domdec_t *dd,
 +                              gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
 +                              gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
 +                              gmx_wallcycle_t wcycle)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  d;
 +    ivec npulse;
 +    
 +    comm = dd->comm;
 +
 +    /* Copy the old cell boundaries for the cg displacement check */
 +    copy_rvec(comm->cell_x0,comm->old_cell_x0);
 +    copy_rvec(comm->cell_x1,comm->old_cell_x1);
 +    
 +    if (comm->bDynLoadBal)
 +    {
 +        if (DDMASTER(dd))
 +        {
 +            check_box_size(dd,ddbox);
 +        }
 +        set_dd_cell_sizes_dlb(dd,ddbox,bDynamicBox,bUniform,bDoDLB,step,wcycle);
 +    }
 +    else
 +    {
 +        set_dd_cell_sizes_slb(dd,ddbox,FALSE,npulse);
 +        realloc_comm_ind(dd,npulse);
 +    }
 +    
 +    if (debug)
 +    {
 +        for(d=0; d<DIM; d++)
 +        {
 +            fprintf(debug,"cell_x[%d] %f - %f skew_fac %f\n",
 +                    d,comm->cell_x0[d],comm->cell_x1[d],ddbox->skew_fac[d]);
 +        }
 +    }
 +}
 +
 +static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
 +                                  gmx_ddbox_t *ddbox,
 +                                  rvec cell_ns_x0,rvec cell_ns_x1,
 +                                  gmx_large_int_t step)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int dim_ind,dim;
 +    
 +    comm = dd->comm;
 +
 +    for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
 +    {
 +        dim = dd->dim[dim_ind];
 +        
 +        /* Without PBC we don't have restrictions on the outer cells */
 +        if (!(dim >= ddbox->npbcdim && 
 +              (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
 +            comm->bDynLoadBal &&
 +            (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
 +            comm->cellsize_min[dim])
 +        {
 +            char buf[22];
 +            gmx_fatal(FARGS,"Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
 +                      gmx_step_str(step,buf),dim2char(dim),
 +                      comm->cell_x1[dim] - comm->cell_x0[dim],
 +                      ddbox->skew_fac[dim],
 +                      dd->comm->cellsize_min[dim],
 +                      dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
 +        }
 +    }
 +    
 +    if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
 +    {
 +        /* Communicate the boundaries and update cell_ns_x0/1 */
 +        dd_move_cellx(dd,ddbox,cell_ns_x0,cell_ns_x1);
 +        if (dd->bGridJump && dd->ndim > 1)
 +        {
 +            check_grid_jump(step,dd,dd->comm->cutoff,ddbox,TRUE);
 +        }
 +    }
 +}
 +
 +static void make_tric_corr_matrix(int npbcdim,matrix box,matrix tcm)
 +{
 +    if (YY < npbcdim)
 +    {
 +        tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
 +    }
 +    else
 +    {
 +        tcm[YY][XX] = 0;
 +    }
 +    if (ZZ < npbcdim)
 +    {
 +        tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
 +        tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
 +    }
 +    else
 +    {
 +        tcm[ZZ][XX] = 0;
 +        tcm[ZZ][YY] = 0;
 +    }
 +}
 +
 +static void check_screw_box(matrix box)
 +{
 +    /* Mathematical limitation */
 +    if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
 +    {
 +        gmx_fatal(FARGS,"With screw pbc the unit cell can not have non-zero off-diagonal x-components");
 +    }
 +    
 +    /* Limitation due to the asymmetry of the eighth shell method */
 +    if (box[ZZ][YY] != 0)
 +    {
 +        gmx_fatal(FARGS,"pbc=screw with non-zero box_zy is not supported");
 +    }
 +}
 +
 +static void distribute_cg(FILE *fplog,gmx_large_int_t step,
 +                          matrix box,ivec tric_dir,t_block *cgs,rvec pos[],
 +                          gmx_domdec_t *dd)
 +{
 +    gmx_domdec_master_t *ma;
 +    int **tmp_ind=NULL,*tmp_nalloc=NULL;
 +    int  i,icg,j,k,k0,k1,d,npbcdim;
 +    matrix tcm;
 +    rvec box_size,cg_cm;
 +    ivec ind;
 +    real nrcg,inv_ncg,pos_d;
 +    atom_id *cgindex;
 +    gmx_bool bUnbounded,bScrew;
 +
 +    ma = dd->ma;
 +    
 +    if (tmp_ind == NULL)
 +    {
 +        snew(tmp_nalloc,dd->nnodes);
 +        snew(tmp_ind,dd->nnodes);
 +        for(i=0; i<dd->nnodes; i++)
 +        {
 +            tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
 +            snew(tmp_ind[i],tmp_nalloc[i]);
 +        }
 +    }
 +    
 +    /* Clear the count */
 +    for(i=0; i<dd->nnodes; i++)
 +    {
 +        ma->ncg[i] = 0;
 +        ma->nat[i] = 0;
 +    }
 +    
 +    make_tric_corr_matrix(dd->npbcdim,box,tcm);
 +    
 +    cgindex = cgs->index;
 +    
 +    /* Compute the center of geometry for all charge groups */
 +    for(icg=0; icg<cgs->nr; icg++)
 +    {
 +        k0      = cgindex[icg];
 +        k1      = cgindex[icg+1];
 +        nrcg    = k1 - k0;
 +        if (nrcg == 1)
 +        {
 +            copy_rvec(pos[k0],cg_cm);
 +        }
 +        else
 +        {
 +            inv_ncg = 1.0/nrcg;
 +            
 +            clear_rvec(cg_cm);
 +            for(k=k0; (k<k1); k++)
 +            {
 +                rvec_inc(cg_cm,pos[k]);
 +            }
 +            for(d=0; (d<DIM); d++)
 +            {
 +                cg_cm[d] *= inv_ncg;
 +            }
 +        }
 +        /* Put the charge group in the box and determine the cell index */
 +        for(d=DIM-1; d>=0; d--) {
 +            pos_d = cg_cm[d];
 +            if (d < dd->npbcdim)
 +            {
 +                bScrew = (dd->bScrewPBC && d == XX);
 +                if (tric_dir[d] && dd->nc[d] > 1)
 +                {
 +                    /* Use triclinic coordintates for this dimension */
 +                    for(j=d+1; j<DIM; j++)
 +                    {
 +                        pos_d += cg_cm[j]*tcm[j][d];
 +                    }
 +                }
 +                while(pos_d >= box[d][d])
 +                {
 +                    pos_d -= box[d][d];
 +                    rvec_dec(cg_cm,box[d]);
 +                    if (bScrew)
 +                    {
 +                        cg_cm[YY] = box[YY][YY] - cg_cm[YY];
 +                        cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
 +                    }
 +                    for(k=k0; (k<k1); k++)
 +                    {
 +                        rvec_dec(pos[k],box[d]);
 +                        if (bScrew)
 +                        {
 +                            pos[k][YY] = box[YY][YY] - pos[k][YY];
 +                            pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
 +                        }
 +                    }
 +                }
 +                while(pos_d < 0)
 +                {
 +                    pos_d += box[d][d];
 +                    rvec_inc(cg_cm,box[d]);
 +                    if (bScrew)
 +                    {
 +                        cg_cm[YY] = box[YY][YY] - cg_cm[YY];
 +                        cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
 +                    }
 +                    for(k=k0; (k<k1); k++)
 +                    {
 +                        rvec_inc(pos[k],box[d]);
 +                        if (bScrew) {
 +                            pos[k][YY] = box[YY][YY] - pos[k][YY];
 +                            pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
 +                        }
 +                    }
 +                }
 +            }
 +            /* This could be done more efficiently */
 +            ind[d] = 0;
 +            while(ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
 +            {
 +                ind[d]++;
 +            }
 +        }
 +        i = dd_index(dd->nc,ind);
 +        if (ma->ncg[i] == tmp_nalloc[i])
 +        {
 +            tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
 +            srenew(tmp_ind[i],tmp_nalloc[i]);
 +        }
 +        tmp_ind[i][ma->ncg[i]] = icg;
 +        ma->ncg[i]++;
 +        ma->nat[i] += cgindex[icg+1] - cgindex[icg];
 +    }
 +    
 +    k1 = 0;
 +    for(i=0; i<dd->nnodes; i++)
 +    {
 +        ma->index[i] = k1;
 +        for(k=0; k<ma->ncg[i]; k++)
 +        {
 +            ma->cg[k1++] = tmp_ind[i][k];
 +        }
 +    }
 +    ma->index[dd->nnodes] = k1;
 +    
 +    for(i=0; i<dd->nnodes; i++)
 +    {
 +        sfree(tmp_ind[i]);
 +    }
 +    sfree(tmp_ind);
 +    sfree(tmp_nalloc);
 +    
 +    if (fplog)
 +    {
 +        char buf[22];
 +        fprintf(fplog,"Charge group distribution at step %s:",
 +                gmx_step_str(step,buf));
 +        for(i=0; i<dd->nnodes; i++)
 +        {
 +            fprintf(fplog," %d",ma->ncg[i]);
 +        }
 +        fprintf(fplog,"\n");
 +    }
 +}
 +
 +static void get_cg_distribution(FILE *fplog,gmx_large_int_t step,gmx_domdec_t *dd,
 +                                t_block *cgs,matrix box,gmx_ddbox_t *ddbox,
 +                                rvec pos[])
 +{
 +    gmx_domdec_master_t *ma=NULL;
 +    ivec npulse;
 +    int  i,cg_gl;
 +    int  *ibuf,buf2[2] = { 0, 0 };
 +    gmx_bool bMaster = DDMASTER(dd);
 +    if (bMaster)
 +    {
 +        ma = dd->ma;
 +        
 +        if (dd->bScrewPBC)
 +        {
 +            check_screw_box(box);
 +        }
 +    
 +        set_dd_cell_sizes_slb(dd,ddbox,TRUE,npulse);
 +    
 +        distribute_cg(fplog,step,box,ddbox->tric_dir,cgs,pos,dd);
 +        for(i=0; i<dd->nnodes; i++)
 +        {
 +            ma->ibuf[2*i]   = ma->ncg[i];
 +            ma->ibuf[2*i+1] = ma->nat[i];
 +        }
 +        ibuf = ma->ibuf;
 +    }
 +    else
 +    {
 +        ibuf = NULL;
 +    }
 +    dd_scatter(dd,2*sizeof(int),ibuf,buf2);
 +    
 +    dd->ncg_home = buf2[0];
 +    dd->nat_home = buf2[1];
 +    dd->ncg_tot  = dd->ncg_home;
 +    dd->nat_tot  = dd->nat_home;
 +    if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
 +    {
 +        dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
 +        srenew(dd->index_gl,dd->cg_nalloc);
 +        srenew(dd->cgindex,dd->cg_nalloc+1);
 +    }
 +    if (bMaster)
 +    {
 +        for(i=0; i<dd->nnodes; i++)
 +        {
 +            ma->ibuf[i] = ma->ncg[i]*sizeof(int);
 +            ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
 +        }
 +    }
 +    
 +    dd_scatterv(dd,
 +                DDMASTER(dd) ? ma->ibuf : NULL,
 +                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
 +                DDMASTER(dd) ? ma->cg : NULL,
 +                dd->ncg_home*sizeof(int),dd->index_gl);
 +    
 +    /* Determine the home charge group sizes */
 +    dd->cgindex[0] = 0;
 +    for(i=0; i<dd->ncg_home; i++)
 +    {
 +        cg_gl = dd->index_gl[i];
 +        dd->cgindex[i+1] =
 +            dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
 +    }
 +    
 +    if (debug)
 +    {
 +        fprintf(debug,"Home charge groups:\n");
 +        for(i=0; i<dd->ncg_home; i++)
 +        {
 +            fprintf(debug," %d",dd->index_gl[i]);
 +            if (i % 10 == 9) 
 +                fprintf(debug,"\n");
 +        }
 +        fprintf(debug,"\n");
 +    }
 +}
 +
 +static int compact_and_copy_vec_at(int ncg,int *move,
 +                                   int *cgindex,
 +                                   int nvec,int vec,
 +                                   rvec *src,gmx_domdec_comm_t *comm,
 +                                   gmx_bool bCompact)
 +{
 +    int m,icg,i,i0,i1,nrcg;
 +    int home_pos;
 +    int pos_vec[DIM*2];
 +    
 +    home_pos = 0;
 +
 +    for(m=0; m<DIM*2; m++)
 +    {
 +        pos_vec[m] = 0;
 +    }
 +    
 +    i0 = 0;
 +    for(icg=0; icg<ncg; icg++)
 +    {
 +        i1 = cgindex[icg+1];
 +        m = move[icg];
 +        if (m == -1)
 +        {
 +            if (bCompact)
 +            {
 +                /* Compact the home array in place */
 +                for(i=i0; i<i1; i++)
 +                {
 +                    copy_rvec(src[i],src[home_pos++]);
 +                }
 +            }
 +        }
 +        else
 +        {
 +            /* Copy to the communication buffer */
 +            nrcg = i1 - i0;
 +            pos_vec[m] += 1 + vec*nrcg;
 +            for(i=i0; i<i1; i++)
 +            {
 +                copy_rvec(src[i],comm->cgcm_state[m][pos_vec[m]++]);
 +            }
 +            pos_vec[m] += (nvec - vec - 1)*nrcg;
 +        }
 +        if (!bCompact)
 +        {
 +            home_pos += i1 - i0;
 +        }
 +        i0 = i1;
 +    }
 +    
 +    return home_pos;
 +}
 +
 +static int compact_and_copy_vec_cg(int ncg,int *move,
 +                                   int *cgindex,
 +                                   int nvec,rvec *src,gmx_domdec_comm_t *comm,
 +                                   gmx_bool bCompact)
 +{
 +    int m,icg,i0,i1,nrcg;
 +    int home_pos;
 +    int pos_vec[DIM*2];
 +    
 +    home_pos = 0;
 +    
 +    for(m=0; m<DIM*2; m++)
 +    {
 +        pos_vec[m] = 0;
 +    }
 +    
 +    i0 = 0;
 +    for(icg=0; icg<ncg; icg++)
 +    {
 +        i1 = cgindex[icg+1];
 +        m = move[icg];
 +        if (m == -1)
 +        {
 +            if (bCompact)
 +            {
 +                /* Compact the home array in place */
 +                copy_rvec(src[icg],src[home_pos++]);
 +            }
 +        }
 +        else
 +        {
 +            nrcg = i1 - i0;
 +            /* Copy to the communication buffer */
 +            copy_rvec(src[icg],comm->cgcm_state[m][pos_vec[m]]);
 +            pos_vec[m] += 1 + nrcg*nvec;
 +        }
 +        i0 = i1;
 +    }
 +    if (!bCompact)
 +    {
 +        home_pos = ncg;
 +    }
 +    
 +    return home_pos;
 +}
 +
 +static int compact_ind(int ncg,int *move,
 +                       int *index_gl,int *cgindex,
 +                       int *gatindex,
 +                       gmx_ga2la_t ga2la,char *bLocalCG,
 +                       int *cginfo)
 +{
 +    int cg,nat,a0,a1,a,a_gl;
 +    int home_pos;
 +
 +    home_pos = 0;
 +    nat = 0;
 +    for(cg=0; cg<ncg; cg++)
 +    {
 +        a0 = cgindex[cg];
 +        a1 = cgindex[cg+1];
 +        if (move[cg] == -1)
 +        {
 +            /* Compact the home arrays in place.
 +             * Anything that can be done here avoids access to global arrays.
 +             */
 +            cgindex[home_pos] = nat;
 +            for(a=a0; a<a1; a++)
 +            {
 +                a_gl = gatindex[a];
 +                gatindex[nat] = a_gl;
 +                /* The cell number stays 0, so we don't need to set it */
 +                ga2la_change_la(ga2la,a_gl,nat);
 +                nat++;
 +            }
 +            index_gl[home_pos] = index_gl[cg];
 +            cginfo[home_pos]   = cginfo[cg];
 +            /* The charge group remains local, so bLocalCG does not change */
 +            home_pos++;
 +        }
 +        else
 +        {
 +            /* Clear the global indices */
 +            for(a=a0; a<a1; a++)
 +            {
 +                ga2la_del(ga2la,gatindex[a]);
 +            }
 +            if (bLocalCG)
 +            {
 +                bLocalCG[index_gl[cg]] = FALSE;
 +            }
 +        }
 +    }
 +    cgindex[home_pos] = nat;
 +    
 +    return home_pos;
 +}
 +
 +static void clear_and_mark_ind(int ncg,int *move,
 +                               int *index_gl,int *cgindex,int *gatindex,
 +                               gmx_ga2la_t ga2la,char *bLocalCG,
 +                               int *cell_index)
 +{
 +    int cg,a0,a1,a;
 +    
 +    for(cg=0; cg<ncg; cg++)
 +    {
 +        if (move[cg] >= 0)
 +        {
 +            a0 = cgindex[cg];
 +            a1 = cgindex[cg+1];
 +            /* Clear the global indices */
 +            for(a=a0; a<a1; a++)
 +            {
 +                ga2la_del(ga2la,gatindex[a]);
 +            }
 +            if (bLocalCG)
 +            {
 +                bLocalCG[index_gl[cg]] = FALSE;
 +            }
 +            /* Signal that this cg has moved using the ns cell index.
 +             * Here we set it to -1. fill_grid will change it
 +             * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
 +             */
 +            cell_index[cg] = -1;
 +        }
 +    }
 +}
 +
 +static void print_cg_move(FILE *fplog,
 +                          gmx_domdec_t *dd,
 +                          gmx_large_int_t step,int cg,int dim,int dir,
 +                          gmx_bool bHaveLimitdAndCMOld,real limitd,
 +                          rvec cm_old,rvec cm_new,real pos_d)
 +{
 +    gmx_domdec_comm_t *comm;
 +    char buf[22];
 +
 +    comm = dd->comm;
 +
 +    fprintf(fplog,"\nStep %s:\n",gmx_step_str(step,buf));
 +    if (bHaveLimitdAndCMOld)
 +    {
 +        fprintf(fplog,"The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
 +                ddglatnr(dd,dd->cgindex[cg]),limitd,dim2char(dim));
 +    }
 +    else
 +    {
 +        fprintf(fplog,"The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
 +                ddglatnr(dd,dd->cgindex[cg]),dim2char(dim));
 +    }
 +    fprintf(fplog,"distance out of cell %f\n",
 +            dir==1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
 +    if (bHaveLimitdAndCMOld)
 +    {
 +        fprintf(fplog,"Old coordinates: %8.3f %8.3f %8.3f\n",
 +                cm_old[XX],cm_old[YY],cm_old[ZZ]);
 +    }
 +    fprintf(fplog,"New coordinates: %8.3f %8.3f %8.3f\n",
 +            cm_new[XX],cm_new[YY],cm_new[ZZ]);
 +    fprintf(fplog,"Old cell boundaries in direction %c: %8.3f %8.3f\n",
 +            dim2char(dim),
 +            comm->old_cell_x0[dim],comm->old_cell_x1[dim]);
 +    fprintf(fplog,"New cell boundaries in direction %c: %8.3f %8.3f\n",
 +            dim2char(dim),
 +            comm->cell_x0[dim],comm->cell_x1[dim]);
 +}
 +
 +static void cg_move_error(FILE *fplog,
 +                          gmx_domdec_t *dd,
 +                          gmx_large_int_t step,int cg,int dim,int dir,
 +                          gmx_bool bHaveLimitdAndCMOld,real limitd,
 +                          rvec cm_old,rvec cm_new,real pos_d)
 +{
 +    if (fplog)
 +    {
 +        print_cg_move(fplog, dd,step,cg,dim,dir,
 +                      bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
 +    }
 +    print_cg_move(stderr,dd,step,cg,dim,dir,
 +                  bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
 +    gmx_fatal(FARGS,
 +              "A charge group moved too far between two domain decomposition steps\n"
 +              "This usually means that your system is not well equilibrated");
 +}
 +
 +static void rotate_state_atom(t_state *state,int a)
 +{
 +    int est;
 +
 +    for(est=0; est<estNR; est++)
 +    {
 +        if (EST_DISTR(est) && (state->flags & (1<<est))) {
 +            switch (est) {
 +            case estX:
 +                /* Rotate the complete state; for a rectangular box only */
 +                state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
 +                state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
 +                break;
 +            case estV:
 +                state->v[a][YY] = -state->v[a][YY];
 +                state->v[a][ZZ] = -state->v[a][ZZ];
 +                break;
 +            case estSDX:
 +                state->sd_X[a][YY] = -state->sd_X[a][YY];
 +                state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
 +                break;
 +            case estCGP:
 +                state->cg_p[a][YY] = -state->cg_p[a][YY];
 +                state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
 +                break;
 +            case estDISRE_INITF:
 +            case estDISRE_RM3TAV:
 +            case estORIRE_INITF:
 +            case estORIRE_DTAV:
 +                /* These are distances, so not affected by rotation */
 +                break;
 +            default:
 +                gmx_incons("Unknown state entry encountered in rotate_state_atom");            
 +            }
 +        }
 +    }
 +}
 +
 +static int *get_moved(gmx_domdec_comm_t *comm,int natoms)
 +{
 +    if (natoms > comm->moved_nalloc)
 +    {
 +        /* Contents should be preserved here */
 +        comm->moved_nalloc = over_alloc_dd(natoms);
 +        srenew(comm->moved,comm->moved_nalloc);
 +    }
 +
 +    return comm->moved;
 +}
 +
 +static void calc_cg_move(FILE *fplog,gmx_large_int_t step,
 +                         gmx_domdec_t *dd,
 +                         t_state *state,
 +                         ivec tric_dir,matrix tcm,
 +                         rvec cell_x0,rvec cell_x1,
 +                         rvec limitd,rvec limit0,rvec limit1,
 +                         const int *cgindex,
 +                         int cg_start,int cg_end,
 +                         rvec *cg_cm,
 +                         int *move)
 +{
 +    int  npbcdim;
 +    int  c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
 +    int  mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
 +    int  flag;
 +    gmx_bool bScrew;
 +    ivec dev;
 +    real inv_ncg,pos_d;
 +    rvec cm_new;
 +
 +    npbcdim = dd->npbcdim;
 +
 +    for(cg=cg_start; cg<cg_end; cg++)
 +    {
 +        k0   = cgindex[cg];
 +        k1   = cgindex[cg+1];
 +        nrcg = k1 - k0;
 +        if (nrcg == 1)
 +        {
 +            copy_rvec(state->x[k0],cm_new);
 +        }
 +        else
 +        {
 +            inv_ncg = 1.0/nrcg;
 +            
 +            clear_rvec(cm_new);
 +            for(k=k0; (k<k1); k++)
 +            {
 +                rvec_inc(cm_new,state->x[k]);
 +            }
 +            for(d=0; (d<DIM); d++)
 +            {
 +                cm_new[d] = inv_ncg*cm_new[d];
 +            }
 +        }
 +        
 +        clear_ivec(dev);
 +        /* Do pbc and check DD cell boundary crossings */
 +        for(d=DIM-1; d>=0; d--)
 +        {
 +            if (dd->nc[d] > 1)
 +            {
 +                bScrew = (dd->bScrewPBC && d == XX);
 +                /* Determine the location of this cg in lattice coordinates */
 +                pos_d = cm_new[d];
 +                if (tric_dir[d])
 +                {
 +                    for(d2=d+1; d2<DIM; d2++)
 +                    {
 +                        pos_d += cm_new[d2]*tcm[d2][d];
 +                    }
 +                }
 +                /* Put the charge group in the triclinic unit-cell */
 +                if (pos_d >= cell_x1[d])
 +                {
 +                    if (pos_d >= limit1[d])
 +                    {
 +                        cg_move_error(fplog,dd,step,cg,d,1,TRUE,limitd[d],
 +                                      cg_cm[cg],cm_new,pos_d);
 +                    }
 +                    dev[d] = 1;
 +                    if (dd->ci[d] == dd->nc[d] - 1)
 +                    {
 +                        rvec_dec(cm_new,state->box[d]);
 +                        if (bScrew)
 +                        {
 +                            cm_new[YY] = state->box[YY][YY] - cm_new[YY];
 +                            cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
 +                        }
 +                        for(k=k0; (k<k1); k++)
 +                        {
 +                            rvec_dec(state->x[k],state->box[d]);
 +                            if (bScrew)
 +                            {
 +                                rotate_state_atom(state,k);
 +                            }
 +                        }
 +                    }
 +                }
 +                else if (pos_d < cell_x0[d])
 +                {
 +                    if (pos_d < limit0[d])
 +                    {
 +                        cg_move_error(fplog,dd,step,cg,d,-1,TRUE,limitd[d],
 +                                      cg_cm[cg],cm_new,pos_d);
 +                    }
 +                    dev[d] = -1;
 +                    if (dd->ci[d] == 0)
 +                    {
 +                        rvec_inc(cm_new,state->box[d]);
 +                        if (bScrew)
 +                        {
 +                            cm_new[YY] = state->box[YY][YY] - cm_new[YY];
 +                            cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
 +                        }
 +                        for(k=k0; (k<k1); k++)
 +                        {
 +                            rvec_inc(state->x[k],state->box[d]);
 +                            if (bScrew)
 +                            {
 +                                rotate_state_atom(state,k);
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +            else if (d < npbcdim)
 +            {
 +                /* Put the charge group in the rectangular unit-cell */
 +                while (cm_new[d] >= state->box[d][d])
 +                {
 +                    rvec_dec(cm_new,state->box[d]);
 +                    for(k=k0; (k<k1); k++)
 +                    {
 +                        rvec_dec(state->x[k],state->box[d]);
 +                    }
 +                }
 +                while (cm_new[d] < 0)
 +                {
 +                    rvec_inc(cm_new,state->box[d]);
 +                    for(k=k0; (k<k1); k++)
 +                    {
 +                        rvec_inc(state->x[k],state->box[d]);
 +                    }
 +                }
 +            }
 +        }
 +    
 +        copy_rvec(cm_new,cg_cm[cg]);
 +        
 +        /* Determine where this cg should go */
 +        flag = 0;
 +        mc = -1;
 +        for(d=0; d<dd->ndim; d++)
 +        {
 +            dim = dd->dim[d];
 +            if (dev[dim] == 1)
 +            {
 +                flag |= DD_FLAG_FW(d);
 +                if (mc == -1)
 +                {
 +                    mc = d*2;
 +                }
 +            }
 +            else if (dev[dim] == -1)
 +            {
 +                flag |= DD_FLAG_BW(d);
 +                if (mc == -1) {
 +                    if (dd->nc[dim] > 2)
 +                    {
 +                        mc = d*2 + 1;
 +                    }
 +                    else
 +                    {
 +                        mc = d*2;
 +                    }
 +                }
 +            }
 +        }
 +        /* Temporarily store the flag in move */
 +        move[cg] = mc + flag;
 +    }
 +}
 +
 +static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
 +                               gmx_domdec_t *dd,ivec tric_dir,
 +                               t_state *state,rvec **f,
 +                               t_forcerec *fr,t_mdatoms *md,
 +                               gmx_bool bCompact,
 +                               t_nrnb *nrnb,
 +                               int *ncg_stay_home,
 +                               int *ncg_moved)
 +{   /* Redistribute the home charge groups (cgs) over the DD cells.
 +     *
 +     * Outline of what the code below does:
 +     *  1. calc_cg_move() fills move[] for every home cg; a negative
 +     *     value means the cg stays, otherwise the value holds a send
 +     *     buffer index combined with direction flags.
 +     *  2. The cgs that move are registered in comm->cggl_flag[]
 +     *     (global index and size|flags) and their data is copied to
 +     *     comm->cgcm_state[] by the compact_and_copy_vec_*() helpers.
 +     *  3. For each DD dimension the buffers are exchanged with the
 +     *     neighbors; every received cg is either appended to the home
 +     *     arrays or copied into the send buffer of a later dimension.
 +     *
 +     * On return *ncg_moved is the number of home cgs registered for
 +     * sending in step 2 and *ncg_stay_home is home_pos_cg as it was
 +     * before any cg was received. dd->ncg_home and dd->nat_home are
 +     * set to the new home counts.
 +     */
 +    int  *move;
 +    int  npbcdim;
 +    int  ncg[DIM*2],nat[DIM*2];
 +    int  c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
 +    int  mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
 +    int  sbuf[2],rbuf[2];
 +    int  home_pos_cg,home_pos_at,buf_pos;
 +    int  flag;
 +    gmx_bool bV=FALSE,bSDX=FALSE,bCGP=FALSE;
 +    gmx_bool bScrew;
 +    ivec dev;
 +    real inv_ncg,pos_d;
 +    matrix tcm;
 +    rvec *cg_cm=NULL,cell_x0,cell_x1,limitd,limit0,limit1,cm_new;
 +    atom_id *cgindex;
 +    cginfo_mb_t *cginfo_mb;
 +    gmx_domdec_comm_t *comm;
 +    int  *moved;
 +    int  nthread,thread;
 +    
 +    if (dd->bScrewPBC)
 +    {
 +        check_screw_box(state->box);
 +    }
 +    
 +    comm  = dd->comm;
 +    if (fr->cutoff_scheme == ecutsGROUP)
 +    {
 +        cg_cm = fr->cg_cm;
 +    }
 +    
 +    for(i=0; i<estNR; i++) /* find out which per-atom state vectors are in use */
 +    {
 +        if (EST_DISTR(i))
 +        {
 +            switch (i)
 +            {
 +            case estX:   /* Always present */            break;
 +            case estV:   bV   = (state->flags & (1<<i)); break;
 +            case estSDX: bSDX = (state->flags & (1<<i)); break;
 +            case estCGP: bCGP = (state->flags & (1<<i)); break;
 +            case estLD_RNG:
 +            case estLD_RNGI:
 +            case estDISRE_INITF:
 +            case estDISRE_RM3TAV:
 +            case estORIRE_INITF:
 +            case estORIRE_DTAV:
 +                /* No processing required */
 +                break;
 +            default:
 +            gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
 +            }
 +        }
 +    }
 +    
 +    if (dd->ncg_tot > comm->nalloc_int)
 +    {
 +        comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
 +        srenew(comm->buf_int,comm->nalloc_int);
 +    }
 +    move = comm->buf_int; /* move[cg] is filled by calc_cg_move() below */
 +    
 +    /* Clear the count */
 +    for(c=0; c<dd->ndim*2; c++)
 +    {
 +        ncg[c] = 0;
 +        nat[c] = 0;
 +    }
 +
 +    npbcdim = dd->npbcdim;
 +
 +    for(d=0; (d<DIM); d++) /* set up the cell boundaries and move limits per dimension */
 +    {
 +        limitd[d] = dd->comm->cellsize_min[d];
 +        if (d >= npbcdim && dd->ci[d] == 0)
 +        {
 +            cell_x0[d] = -GMX_FLOAT_MAX;
 +        }
 +        else
 +        {
 +            cell_x0[d] = comm->cell_x0[d];
 +        }
 +        if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
 +        {
 +            cell_x1[d] = GMX_FLOAT_MAX;
 +        }
 +        else
 +        {
 +            cell_x1[d] = comm->cell_x1[d];
 +        }
 +        if (d < npbcdim)
 +        {
 +            limit0[d] = comm->old_cell_x0[d] - limitd[d];
 +            limit1[d] = comm->old_cell_x1[d] + limitd[d];
 +        }
 +        else
 +        {
 +            /* We check after communication if a charge group moved
 +             * more than one cell. Set the pre-comm check limit to float_max.
 +             */
 +            limit0[d] = -GMX_FLOAT_MAX;
 +            limit1[d] =  GMX_FLOAT_MAX;
 +        }
 +    }
 +    
 +    make_tric_corr_matrix(npbcdim,state->box,tcm);
 +    
 +    cgindex = dd->cgindex;
 +
 +    nthread = gmx_omp_nthreads_get(emntDomdec);
 +
 +    /* Compute the center of geometry for all home charge groups
 +     * and put them in the box and determine where they should go.
 +     * Each thread handles its own contiguous range of home cgs.
 +     */
 +#pragma omp parallel for num_threads(nthread) schedule(static)
 +    for(thread=0; thread<nthread; thread++)
 +    {
 +        calc_cg_move(fplog,step,dd,state,tric_dir,tcm,
 +                     cell_x0,cell_x1,limitd,limit0,limit1,
 +                     cgindex,
 +                     ( thread   *dd->ncg_home)/nthread,
 +                     ((thread+1)*dd->ncg_home)/nthread,
 +                     fr->cutoff_scheme==ecutsGROUP ? cg_cm : state->x,
 +                     move);
 +    }
 +
 +    for(cg=0; cg<dd->ncg_home; cg++) /* register every cg that leaves this cell */
 +    {
 +        if (move[cg] >= 0)
 +        {
 +            mc = move[cg];
 +            flag     = mc & ~DD_FLAG_NRCG; /* the direction flags */
 +            mc       = mc & DD_FLAG_NRCG;  /* the send buffer index */
 +            move[cg] = mc;
 +
 +            if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
 +            {
 +                comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
 +                srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
 +            }
 +            comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
 +            /* We store the cg size in the lower 16 bits
 +             * and the place where the charge group should go
 +             * in the next 6 bits. This saves some communication volume.
 +             */
 +            nrcg = cgindex[cg+1] - cgindex[cg];
 +            comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
 +            ncg[mc] += 1;
 +            nat[mc] += nrcg;
 +        }
 +    }
 +    
 +    inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
 +    inc_nrnb(nrnb,eNR_RESETX,dd->ncg_home);
 +
 +    *ncg_moved = 0;
 +    for(i=0; i<dd->ndim*2; i++)
 +    {
 +        *ncg_moved += ncg[i];
 +    }
 +    
 +    nvec = 1; /* number of per-atom vectors sent: x plus the optional ones */
 +    if (bV)
 +    {
 +        nvec++;
 +    }
 +    if (bSDX)
 +    {
 +        nvec++;
 +    }
 +    if (bCGP)
 +    {
 +        nvec++;
 +    }
 +    
 +    /* Make sure the communication buffers are large enough */
 +    for(mc=0; mc<dd->ndim*2; mc++)
 +    {
 +        nvr = ncg[mc] + nat[mc]*nvec;
 +        if (nvr > comm->cgcm_state_nalloc[mc])
 +        {
 +            comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
 +            srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
 +        }
 +    }
 +    
 +    switch (fr->cutoff_scheme)
 +    {
 +    case ecutsGROUP:
 +        /* Recalculating cg_cm might be cheaper than communicating,
 +         * but that could give rise to rounding issues.
 +         */
 +        home_pos_cg =
 +            compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
 +                                    nvec,cg_cm,comm,bCompact);
 +    break;
 +    case ecutsVERLET:
 +        /* Without charge groups we send the moved atom coordinates
 +         * over twice. This is so the code below can be used without
 +         * many conditionals for both with and without charge groups.
 +         */
 +        home_pos_cg =
 +            compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
 +                                    nvec,state->x,comm,FALSE);
 +        if (bCompact)
 +        {
 +            home_pos_cg -= *ncg_moved;
 +        }
 +        break;
 +    default:
 +        gmx_incons("unimplemented");
 +        home_pos_cg = 0;
 +    }
 +    
 +    vec = 0;
 +    home_pos_at =
 +        compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
 +                                nvec,vec++,state->x,comm,bCompact);
 +    if (bV)
 +    {
 +        compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
 +                                nvec,vec++,state->v,comm,bCompact);
 +    }
 +    if (bSDX)
 +    {
 +        compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
 +                                nvec,vec++,state->sd_X,comm,bCompact);
 +    }
 +    if (bCGP)
 +    {
 +        compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
 +                                nvec,vec++,state->cg_p,comm,bCompact);
 +    }
 +    
 +    if (bCompact)
 +    {
 +        compact_ind(dd->ncg_home,move,
 +                    dd->index_gl,dd->cgindex,dd->gatindex,
 +                    dd->ga2la,comm->bLocalCG,
 +                    fr->cginfo);
 +    }
 +    else
 +    {
 +        if (fr->cutoff_scheme == ecutsVERLET)
 +        {
 +            moved = get_moved(comm,dd->ncg_home);
 +
 +            for(k=0; k<dd->ncg_home; k++)
 +            {
 +                moved[k] = 0;
 +            }
 +        }
 +        else
 +        {
 +            moved = fr->ns.grid->cell_index;
 +        }
 +
 +        clear_and_mark_ind(dd->ncg_home,move,
 +                           dd->index_gl,dd->cgindex,dd->gatindex,
 +                           dd->ga2la,comm->bLocalCG,
 +                           moved);
 +    }
 +    
 +    cginfo_mb = fr->cginfo_mb;
 +
 +    *ncg_stay_home = home_pos_cg; /* value before any cg is received below */
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        ncg_recv = 0;
 +        nat_recv = 0;
 +        nvr      = 0;
 +        for(dir=0; dir<(dd->nc[dim]==2 ? 1 : 2); dir++) /* a single exchange when there are only 2 cells */
 +        {
 +            cdd = d*2 + dir;
 +            /* Communicate the cg and atom counts */
 +            sbuf[0] = ncg[cdd];
 +            sbuf[1] = nat[cdd];
 +            if (debug)
 +            {
 +                fprintf(debug,"Sending ddim %d dir %d: ncg %d nat %d\n",
 +                        d,dir,sbuf[0],sbuf[1]);
 +            }
 +            dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
 +            
 +            if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
 +            {
 +                comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
 +                srenew(comm->buf_int,comm->nalloc_int);
 +            }
 +            
 +            /* Communicate the charge group indices, sizes and flags */
 +            dd_sendrecv_int(dd, d, dir,
 +                            comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
 +                            comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
 +            
 +            nvs = ncg[cdd] + nat[cdd]*nvec;
 +            i   = rbuf[0]  + rbuf[1] *nvec;
 +            vec_rvec_check_alloc(&comm->vbuf,nvr+i);
 +            
 +            /* Communicate cgcm and state */
 +            dd_sendrecv_rvec(dd, d, dir,
 +                             comm->cgcm_state[cdd], nvs,
 +                             comm->vbuf.v+nvr, i);
 +            ncg_recv += rbuf[0];
 +            nat_recv += rbuf[1];
 +            nvr      += i;
 +        }
 +        
 +        /* Process the received charge groups */
 +        buf_pos = 0;
 +        for(cg=0; cg<ncg_recv; cg++)
 +        {
 +            flag = comm->buf_int[cg*DD_CGIBS+1];
 +
 +            if (dim >= npbcdim && dd->nc[dim] > 2)
 +            {
 +                /* No pbc in this dim and more than one domain boundary.
 +                 * We do a separate check if a charge group didn't move too far.
 +                 */
 +                if (((flag & DD_FLAG_FW(d)) &&
 +                     comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
 +                    ((flag & DD_FLAG_BW(d)) &&
 +                     comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
 +                {
 +                    cg_move_error(fplog,dd,step,cg,dim,
 +                                  (flag & DD_FLAG_FW(d)) ? 1 : 0,
 +                                   FALSE,0,
 +                                   comm->vbuf.v[buf_pos],
 +                                   comm->vbuf.v[buf_pos],
 +                                   comm->vbuf.v[buf_pos][dim]);
 +                }
 +            }
 +
 +            mc = -1; /* stays -1 when the cg has reached its home cell */
 +            if (d < dd->ndim-1)
 +            {
 +                /* Check which direction this cg should go */
 +                for(d2=d+1; (d2<dd->ndim && mc==-1); d2++)
 +                {
 +                    if (dd->bGridJump)
 +                    {
 +                        /* The cell boundaries for dimension d2 are not equal
 +                         * for each cell row of the lower dimension(s),
 +                         * therefore we might need to redetermine where
 +                         * this cg should go.
 +                         */
 +                        dim2 = dd->dim[d2];
 +                        /* If this cg crosses the box boundary in dimension d2
 +                         * we can use the communicated flag, so we do not
 +                         * have to worry about pbc.
 +                         */
 +                        if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
 +                               (flag & DD_FLAG_FW(d2))) ||
 +                              (dd->ci[dim2] == 0 &&
 +                               (flag & DD_FLAG_BW(d2)))))
 +                        {
 +                            /* Clear the two flags for this dimension */
 +                            flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
 +                            /* Determine the location of this cg
 +                             * in lattice coordinates
 +                             */
 +                            pos_d = comm->vbuf.v[buf_pos][dim2];
 +                            if (tric_dir[dim2])
 +                            {
 +                                for(d3=dim2+1; d3<DIM; d3++)
 +                                {
 +                                    pos_d +=
 +                                        comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
 +                                }
 +                            }
 +                            /* Check if we are not at the box edge.
 +                             * pbc is only handled in the first step above,
 +                             * but this check could move over pbc while
 +                             * the first step did not due to different rounding.
 +                             */
 +                            if (pos_d >= cell_x1[dim2] &&
 +                                dd->ci[dim2] != dd->nc[dim2]-1)
 +                            {
 +                                flag |= DD_FLAG_FW(d2);
 +                            }
 +                            else if (pos_d < cell_x0[dim2] &&
 +                                     dd->ci[dim2] != 0)
 +                            {
 +                                flag |= DD_FLAG_BW(d2);
 +                            }
 +                            comm->buf_int[cg*DD_CGIBS+1] = flag;
 +                        }
 +                    }
 +                    /* Set to which neighboring cell this cg should go */
 +                    if (flag & DD_FLAG_FW(d2))
 +                    {
 +                        mc = d2*2;
 +                    }
 +                    else if (flag & DD_FLAG_BW(d2))
 +                    {
 +                        if (dd->nc[dd->dim[d2]] > 2)
 +                        {
 +                            mc = d2*2+1;
 +                        }
 +                        else
 +                        {
 +                            mc = d2*2;
 +                        }
 +                    }
 +                }
 +            }
 +            
 +            nrcg = flag & DD_FLAG_NRCG;
 +            if (mc == -1)
 +            {
 +                if (home_pos_cg+1 > dd->cg_nalloc)
 +                {
 +                    dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
 +                    srenew(dd->index_gl,dd->cg_nalloc);
 +                    srenew(dd->cgindex,dd->cg_nalloc+1);
 +                }
 +                /* Set the global charge group index and size */
 +                dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
 +                dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
 +                /* Copy the state from the buffer */
 +                dd_check_alloc_ncg(fr,state,f,home_pos_cg+1);
 +                if (fr->cutoff_scheme == ecutsGROUP)
 +                {
 +                    cg_cm = fr->cg_cm; /* re-read after dd_check_alloc_ncg(); NOTE(review): presumably it can reallocate fr->cg_cm - confirm */
 +                    copy_rvec(comm->vbuf.v[buf_pos],cg_cm[home_pos_cg]);
 +                }
 +                buf_pos++; /* the leading cg entry is skipped in either scheme */
 +
 +                /* Set the cginfo */
 +                fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
 +                                                   dd->index_gl[home_pos_cg]);
 +                if (comm->bLocalCG)
 +                {
 +                    comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
 +                }
 +
 +                if (home_pos_at+nrcg > state->nalloc)
 +                {
 +                    dd_realloc_state(state,f,home_pos_at+nrcg);
 +                }
 +                for(i=0; i<nrcg; i++)
 +                {
 +                    copy_rvec(comm->vbuf.v[buf_pos++],
 +                              state->x[home_pos_at+i]);
 +                }
 +                if (bV)
 +                {
 +                    for(i=0; i<nrcg; i++)
 +                    {
 +                        copy_rvec(comm->vbuf.v[buf_pos++],
 +                                  state->v[home_pos_at+i]);
 +                    }
 +                }
 +                if (bSDX)
 +                {
 +                    for(i=0; i<nrcg; i++)
 +                    {
 +                        copy_rvec(comm->vbuf.v[buf_pos++],
 +                                  state->sd_X[home_pos_at+i]);
 +                    }
 +                }
 +                if (bCGP)
 +                {
 +                    for(i=0; i<nrcg; i++)
 +                    {
 +                        copy_rvec(comm->vbuf.v[buf_pos++],
 +                                  state->cg_p[home_pos_at+i]);
 +                    }
 +                }
 +                home_pos_cg += 1;
 +                home_pos_at += nrcg;
 +            }
 +            else
 +            {
 +                /* Reallocate the buffers if necessary  */
 +                if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
 +                {
 +                    comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
 +                    srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
 +                }
 +                nvr = ncg[mc] + nat[mc]*nvec;
 +                if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
 +                {
 +                    comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
 +                    srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
 +                }
 +                /* Copy from the receive to the send buffers */
 +                memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
 +                       comm->buf_int + cg*DD_CGIBS,
 +                       DD_CGIBS*sizeof(int));
 +                memcpy(comm->cgcm_state[mc][nvr],
 +                       comm->vbuf.v[buf_pos],
 +                       (1+nrcg*nvec)*sizeof(rvec));
 +                buf_pos += 1 + nrcg*nvec;
 +                ncg[mc] += 1;
 +                nat[mc] += nrcg;
 +            }
 +        }
 +    }
 +    
 +    /* With sorting (!bCompact) the indices are now only partially up to date
 +     * and ncg_home and nat_home are not the real count, since there are
 +     * "holes" in the arrays for the charge groups that moved to neighbors.
 +     */
 +    if (fr->cutoff_scheme == ecutsVERLET)
 +    {
 +        moved = get_moved(comm,home_pos_cg);
 +
 +        for(i=dd->ncg_home; i<home_pos_cg; i++) /* only the newly added entries */
 +        {
 +            moved[i] = 0;
 +        }
 +    }
 +    dd->ncg_home = home_pos_cg;
 +    dd->nat_home = home_pos_at;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,
 +                "Finished repartitioning: cgs moved out %d, new home %d\n",
 +                *ncg_moved,dd->ncg_home-*ncg_moved);
 +                
 +    }
 +}
 +
 +void dd_cycles_add(gmx_domdec_t *dd,float cycles,int ddCycl)
 +{
 +    dd->comm->cycl[ddCycl] += cycles;
 +    dd->comm->cycl_n[ddCycl]++;
 +    if (cycles > dd->comm->cycl_max[ddCycl])
 +    {
 +        dd->comm->cycl_max[ddCycl] = cycles;
 +    }
 +}
 +
 +static double force_flop_count(t_nrnb *nrnb)
 +{
 +    int i;
 +    double sum;
 +    const char *name;
 +
 +    sum = 0;
 +    for(i=0; i<eNR_NBKERNEL_FREE_ENERGY; i++)
 +    {
 +        /* To get closer to the real timings, we half the count
 +         * for the normal loops and again half it for water loops.
 +         */
 +        name = nrnb_str(i);
 +        if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
 +        {
 +            sum += nrnb->n[i]*0.25*cost_nrnb(i);
 +        }
 +        else
 +        {
 +            sum += nrnb->n[i]*0.50*cost_nrnb(i);
 +        }
 +    }
 +    for(i=eNR_NBKERNEL_FREE_ENERGY; i<=eNR_NB14; i++)
 +    {
 +        name = nrnb_str(i);
 +        if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
 +        sum += nrnb->n[i]*cost_nrnb(i);
 +    }
 +    for(i=eNR_BONDS; i<=eNR_WALLS; i++)
 +    {
 +        sum += nrnb->n[i]*cost_nrnb(i);
 +    }
 +
 +    return sum;
 +}
 +
 +void dd_force_flop_start(gmx_domdec_t *dd,t_nrnb *nrnb)
 +{
 +    if (dd->comm->eFlop)
 +    {
 +        dd->comm->flop -= force_flop_count(nrnb);
 +    }
 +}
 +void dd_force_flop_stop(gmx_domdec_t *dd,t_nrnb *nrnb)
 +{
 +    if (dd->comm->eFlop)
 +    {
 +        dd->comm->flop += force_flop_count(nrnb);
 +        dd->comm->flop_n++;
 +    }
 +}  
 +
 +static void clear_dd_cycle_counts(gmx_domdec_t *dd)
 +{
 +    int i;
 +    
 +    for(i=0; i<ddCyclNr; i++)
 +    {
 +        dd->comm->cycl[i] = 0;
 +        dd->comm->cycl_n[i] = 0;
 +        dd->comm->cycl_max[i] = 0;
 +    }
 +    dd->comm->flop = 0;
 +    dd->comm->flop_n = 0;
 +}
 +
 +static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
 +{   /* Gather the load measurements along the DD dimensions and reduce
 +     * them into comm->load[].
 +     *
 +     * The loop runs from the last DD dimension down to the first.
 +     * A participating rank packs sbuf (its own force load for the last
 +     * dimension, otherwise the already reduced comm->load[d+1] values),
 +     * the row is gathered on rank 0 of comm->mpi_comm_load[d], and the
 +     * rank with ci[dim] == master_ci[dim] reduces the gathered entries
 +     * into comm->load[d]. The DD master finally adds comm->load[0] to the
 +     * running totals in comm. The routine is timed as ewcDDCOMMLOAD.
 +     */
 +    gmx_domdec_comm_t *comm;
 +    gmx_domdec_load_t *load;
 +    gmx_domdec_root_t *root=NULL;
 +    int  d,dim,cid,i,pos;
 +    float cell_frac=0,sbuf[DD_NLOAD_MAX];
 +    gmx_bool bSepPME;
 +    
 +    if (debug)
 +    {
 +        fprintf(debug,"get_load_distribution start\n");
 +    }
 +
 +    wallcycle_start(wcycle,ewcDDCOMMLOAD);
 +    
 +    comm = dd->comm;
 +    
 +    bSepPME = (dd->pme_nodeid >= 0);
 +    
 +    for(d=dd->ndim-1; d>=0; d--)
 +    {
 +        dim = dd->dim[d];
 +        /* Check if we participate in the communication in this dimension */
 +        if (d == dd->ndim-1 || 
 +            (dd->ci[dd->dim[d+1]]==0 && dd->ci[dd->dim[dd->ndim-1]]==0))
 +        {
 +            load = &comm->load[d];
 +            if (dd->bGridJump)
 +            {
 +                cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
 +            }
 +            pos = 0; /* the packing order below must match the unpacking loop further down */
 +            if (d == dd->ndim-1)
 +            {
 +                sbuf[pos++] = dd_force_load(comm);
 +                sbuf[pos++] = sbuf[0];
 +                if (dd->bGridJump)
 +                {
 +                    sbuf[pos++] = sbuf[0];
 +                    sbuf[pos++] = cell_frac;
 +                    if (d > 0)
 +                    {
 +                        sbuf[pos++] = comm->cell_f_max0[d];
 +                        sbuf[pos++] = comm->cell_f_min1[d];
 +                    }
 +                }
 +                if (bSepPME)
 +                {
 +                    sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
 +                    sbuf[pos++] = comm->cycl[ddCyclPME];
 +                }
 +            }
 +            else
 +            {
 +                sbuf[pos++] = comm->load[d+1].sum;
 +                sbuf[pos++] = comm->load[d+1].max;
 +                if (dd->bGridJump)
 +                {
 +                    sbuf[pos++] = comm->load[d+1].sum_m;
 +                    sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
 +                    sbuf[pos++] = comm->load[d+1].flags; /* sent as float, rounded back to int on the root */
 +                    if (d > 0)
 +                    {
 +                        sbuf[pos++] = comm->cell_f_max0[d];
 +                        sbuf[pos++] = comm->cell_f_min1[d];
 +                    }
 +                }
 +                if (bSepPME)
 +                {
 +                    sbuf[pos++] = comm->load[d+1].mdf;
 +                    sbuf[pos++] = comm->load[d+1].pme;
 +                }
 +            }
 +            load->nload = pos;
 +            /* Communicate a row in DD direction d.
 +             * The communicators are setup such that the root always has rank 0.
 +             */
 +#ifdef GMX_MPI
 +            MPI_Gather(sbuf      ,load->nload*sizeof(float),MPI_BYTE,
 +                       load->load,load->nload*sizeof(float),MPI_BYTE,
 +                       0,comm->mpi_comm_load[d]);
 +#endif
 +            if (dd->ci[dim] == dd->master_ci[dim])
 +            {
 +                /* We are the root, process this row */
 +                if (comm->bDynLoadBal)
 +                {
 +                    root = comm->root[d];
 +                }
 +                load->sum = 0;
 +                load->max = 0;
 +                load->sum_m = 0;
 +                load->cvol_min = 1;
 +                load->flags = 0;
 +                load->mdf = 0;
 +                load->pme = 0;
 +                pos = 0;
 +                for(i=0; i<dd->nc[dim]; i++) /* one gathered record per cell in this row */
 +                {
 +                    load->sum += load->load[pos++];
 +                    load->max = max(load->max,load->load[pos]);
 +                    pos++;
 +                    if (dd->bGridJump)
 +                    {   /* NOTE(review): root is only assigned above when
 +                         * comm->bDynLoadBal is set; presumably bGridJump
 +                         * implies that - confirm.
 +                         */
 +                        if (root->bLimited)
 +                        {
 +                            /* This direction could not be load balanced properly,
 +                             * therefore we need to use the maximum instead of
 +                             * the average load.
 +                             */
 +                            load->sum_m = max(load->sum_m,load->load[pos]);
 +                        }
 +                        else
 +                        {
 +                            load->sum_m += load->load[pos];
 +                        }
 +                        pos++;
 +                        load->cvol_min = min(load->cvol_min,load->load[pos]);
 +                        pos++;
 +                        if (d < dd->ndim-1)
 +                        {
 +                            load->flags = (int)(load->load[pos++] + 0.5);
 +                        }
 +                        if (d > 0)
 +                        {
 +                            root->cell_f_max0[i] = load->load[pos++];
 +                            root->cell_f_min1[i] = load->load[pos++];
 +                        }
 +                    }
 +                    if (bSepPME)
 +                    {
 +                        load->mdf = max(load->mdf,load->load[pos]);
 +                        pos++;
 +                        load->pme = max(load->pme,load->load[pos]);
 +                        pos++;
 +                    }
 +                }
 +                if (comm->bDynLoadBal && root->bLimited)
 +                {
 +                    load->sum_m *= dd->nc[dim];
 +                    load->flags |= (1<<d); /* mark dimension d as limited */
 +                }
 +            }
 +        }
 +    }
 +
 +    if (DDMASTER(dd))
 +    {
 +        comm->nload      += dd_load_count(comm);
 +        comm->load_step  += comm->cycl[ddCyclStep];
 +        comm->load_sum   += comm->load[0].sum;
 +        comm->load_max   += comm->load[0].max;
 +        if (comm->bDynLoadBal)
 +        {
 +            for(d=0; d<dd->ndim; d++)
 +            {
 +                if (comm->load[0].flags & (1<<d))
 +                {
 +                    comm->load_lim[d]++;
 +                }
 +            }
 +        }
 +        if (bSepPME)
 +        {
 +            comm->load_mdf += comm->load[0].mdf;
 +            comm->load_pme += comm->load[0].pme;
 +        }
 +    }
 +
 +    wallcycle_stop(wcycle,ewcDDCOMMLOAD);
 +    
 +    if (debug)
 +    {
 +        fprintf(debug,"get_load_distribution finished\n");
 +    }
 +}
 +
 +static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
 +{
 +    /* Return the relative performance loss on the total run time
 +     * due to the force calculation load imbalance.
 +     */
 +    if (dd->comm->nload > 0)
 +    {
 +        return
 +            (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
 +            (dd->comm->load_step*dd->nnodes);
 +    }
 +    else
 +    {
 +        return 0;
 +    }
 +}
 +
 +static void print_dd_load_av(FILE *fplog,gmx_domdec_t *dd)
 +{
 +    char  buf[STRLEN];
 +    int   npp,npme,nnodes,d,limp;
 +    float imbal,pme_f_ratio,lossf,lossp=0;
 +    gmx_bool  bLim;
 +    gmx_domdec_comm_t *comm;
 +
 +    comm = dd->comm;
 +    if (DDMASTER(dd) && comm->nload > 0)
 +    {
 +        npp    = dd->nnodes;
 +        npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
 +        nnodes = npp + npme;
 +        imbal = comm->load_max*npp/comm->load_sum - 1;
 +        lossf = dd_force_imb_perf_loss(dd);
 +        sprintf(buf," Average load imbalance: %.1f %%\n",imbal*100);
 +        fprintf(fplog,"%s",buf);
 +        fprintf(stderr,"\n");
 +        fprintf(stderr,"%s",buf);
 +        sprintf(buf," Part of the total run time spent waiting due to load imbalance: %.1f %%\n",lossf*100);
 +        fprintf(fplog,"%s",buf);
 +        fprintf(stderr,"%s",buf);
 +        bLim = FALSE;
 +        if (comm->bDynLoadBal)
 +        {
 +            sprintf(buf," Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
 +            for(d=0; d<dd->ndim; d++)
 +            {
 +                limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
 +                sprintf(buf+strlen(buf)," %c %d %%",dim2char(dd->dim[d]),limp);
 +                if (limp >= 50)
 +                {
 +                    bLim = TRUE;
 +                }
 +            }
 +            sprintf(buf+strlen(buf),"\n");
 +            fprintf(fplog,"%s",buf);
 +            fprintf(stderr,"%s",buf);
 +        }
 +        if (npme > 0)
 +        {
 +            pme_f_ratio = comm->load_pme/comm->load_mdf;
 +            lossp = (comm->load_pme -comm->load_mdf)/comm->load_step;
 +            if (lossp <= 0)
 +            {
 +                lossp *= (float)npme/(float)nnodes;
 +            }
 +            else
 +            {
 +                lossp *= (float)npp/(float)nnodes;
 +            }
 +            sprintf(buf," Average PME mesh/force load: %5.3f\n",pme_f_ratio);
 +            fprintf(fplog,"%s",buf);
 +            fprintf(stderr,"%s",buf);
 +            sprintf(buf," Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",fabs(lossp)*100);
 +            fprintf(fplog,"%s",buf);
 +            fprintf(stderr,"%s",buf);
 +        }
 +        fprintf(fplog,"\n");
 +        fprintf(stderr,"\n");
 +        
 +        if (lossf >= DD_PERF_LOSS)
 +        {
 +            sprintf(buf,
 +                    "NOTE: %.1f %% performance was lost due to load imbalance\n"
 +                    "      in the domain decomposition.\n",lossf*100);
 +            if (!comm->bDynLoadBal)
 +            {
 +                sprintf(buf+strlen(buf),"      You might want to use dynamic load balancing (option -dlb.)\n");
 +            }
 +            else if (bLim)
 +            {
 +                sprintf(buf+strlen(buf),"      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
 +            }
 +            fprintf(fplog,"%s\n",buf);
 +            fprintf(stderr,"%s\n",buf);
 +        }
 +        if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
 +        {
 +            sprintf(buf,
 +                    "NOTE: %.1f %% performance was lost because the PME nodes\n"
 +                    "      had %s work to do than the PP nodes.\n"
 +                    "      You might want to %s the number of PME nodes\n"
 +                    "      or %s the cut-off and the grid spacing.\n",
 +                    fabs(lossp*100),
 +                    (lossp < 0) ? "less"     : "more",
 +                    (lossp < 0) ? "decrease" : "increase",
 +                    (lossp < 0) ? "decrease" : "increase");
 +            fprintf(fplog,"%s\n",buf);
 +            fprintf(stderr,"%s\n",buf);
 +        }
 +    }
 +}
 +
 +static float dd_vol_min(gmx_domdec_t *dd)
 +{
 +    return dd->comm->load[0].cvol_min*dd->nnodes;
 +}
 +
 +static gmx_bool dd_load_flags(gmx_domdec_t *dd)
 +{
 +    return dd->comm->load[0].flags;
 +}
 +
 +static float dd_f_imbal(gmx_domdec_t *dd)
 +{
 +    return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
 +}
 +
 +float dd_pme_f_ratio(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->cycl_n[ddCyclPME] > 0)
 +    {
 +        return dd->comm->load[0].pme/dd->comm->load[0].mdf;
 +    }
 +    else
 +    {
 +        return -1.0;
 +    }
 +}
 +
 +static void dd_print_load(FILE *fplog,gmx_domdec_t *dd,gmx_large_int_t step)
 +{
 +    int flags,d;
 +    char buf[22];
 +    
 +    flags = dd_load_flags(dd);
 +    if (flags)
 +    {
 +        fprintf(fplog,
 +                "DD  load balancing is limited by minimum cell size in dimension");
 +        for(d=0; d<dd->ndim; d++)
 +        {
 +            if (flags & (1<<d))
 +            {
 +                fprintf(fplog," %c",dim2char(dd->dim[d]));
 +            }
 +        }
 +        fprintf(fplog,"\n");
 +    }
 +    fprintf(fplog,"DD  step %s",gmx_step_str(step,buf));
 +    if (dd->comm->bDynLoadBal)
 +    {
 +        fprintf(fplog,"  vol min/aver %5.3f%c",
 +                dd_vol_min(dd),flags ? '!' : ' ');
 +    }
 +    fprintf(fplog," load imb.: force %4.1f%%",dd_f_imbal(dd)*100);
 +    if (dd->comm->cycl_n[ddCyclPME])
 +    {
 +        fprintf(fplog,"  pme mesh/force %5.3f",dd_pme_f_ratio(dd));
 +    }
 +    fprintf(fplog,"\n\n");
 +}
 +
 +static void dd_print_load_verbose(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->bDynLoadBal)
 +    {
 +        fprintf(stderr,"vol %4.2f%c ",
 +                dd_vol_min(dd),dd_load_flags(dd) ? '!' : ' ');
 +    }
 +    fprintf(stderr,"imb F %2d%% ",(int)(dd_f_imbal(dd)*100+0.5));
 +    if (dd->comm->cycl_n[ddCyclPME])
 +    {
 +        fprintf(stderr,"pme/F %4.2f ",dd_pme_f_ratio(dd));
 +    }
 +}
 +
 +#ifdef GMX_MPI
 +static void make_load_communicator(gmx_domdec_t *dd, int dim_ind,ivec loc)
 +{
 +    MPI_Comm  c_row;
 +    int  dim, i, rank;
 +    ivec loc_c;
 +    gmx_domdec_root_t *root;
 +    gmx_bool bPartOfGroup = FALSE;
 +    
 +    dim = dd->dim[dim_ind];
 +    copy_ivec(loc,loc_c);
 +    for(i=0; i<dd->nc[dim]; i++)
 +    {
 +        loc_c[dim] = i;
 +        rank = dd_index(dd->nc,loc_c);
 +        if (rank == dd->rank)
 +        {
 +            /* This process is part of the group */
 +            bPartOfGroup = TRUE;
 +        }
 +    }
 +    MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup?0:MPI_UNDEFINED, dd->rank,
 +                   &c_row);
 +    if (bPartOfGroup)
 +    {
 +        dd->comm->mpi_comm_load[dim_ind] = c_row;
 +        if (dd->comm->eDLB != edlbNO)
 +        {
 +            if (dd->ci[dim] == dd->master_ci[dim])
 +            {
 +                /* This is the root process of this row */
 +                snew(dd->comm->root[dim_ind],1);
 +                root = dd->comm->root[dim_ind];
 +                snew(root->cell_f,DD_CELL_F_SIZE(dd,dim_ind));
 +                snew(root->old_cell_f,dd->nc[dim]+1);
 +                snew(root->bCellMin,dd->nc[dim]);
 +                if (dim_ind > 0)
 +                {
 +                    snew(root->cell_f_max0,dd->nc[dim]);
 +                    snew(root->cell_f_min1,dd->nc[dim]);
 +                    snew(root->bound_min,dd->nc[dim]);
 +                    snew(root->bound_max,dd->nc[dim]);
 +                }
 +                snew(root->buf_ncd,dd->nc[dim]);
 +            }
 +            else
 +            {
 +                /* This is not a root process, we only need to receive cell_f */
 +                snew(dd->comm->cell_f_row,DD_CELL_F_SIZE(dd,dim_ind));
 +            }
 +        }
 +        if (dd->ci[dim] == dd->master_ci[dim])
 +        {
 +            snew(dd->comm->load[dim_ind].load,dd->nc[dim]*DD_NLOAD_MAX);
 +        }
 +    }
 +}
 +#endif
 +
 +static void make_load_communicators(gmx_domdec_t *dd)
 +{
 +#ifdef GMX_MPI
 +  int  dim0,dim1,i,j;
 +  ivec loc;
 +
 +  if (debug)
 +    fprintf(debug,"Making load communicators\n");
 +
 +  snew(dd->comm->load,dd->ndim);
 +  snew(dd->comm->mpi_comm_load,dd->ndim);
 +  
 +  clear_ivec(loc);
 +  make_load_communicator(dd,0,loc);
 +  if (dd->ndim > 1) {
 +    dim0 = dd->dim[0];
 +    for(i=0; i<dd->nc[dim0]; i++) {
 +      loc[dim0] = i;
 +      make_load_communicator(dd,1,loc);
 +    }
 +  }
 +  if (dd->ndim > 2) {
 +    dim0 = dd->dim[0];
 +    for(i=0; i<dd->nc[dim0]; i++) {
 +      loc[dim0] = i;
 +      dim1 = dd->dim[1];
 +      for(j=0; j<dd->nc[dim1]; j++) {
 +        loc[dim1] = j;
 +        make_load_communicator(dd,2,loc);
 +      }
 +    }
 +  }
 +
 +  if (debug)
 +    fprintf(debug,"Finished making load communicators\n");
 +#endif
 +}
 +
 +void setup_dd_grid(FILE *fplog,gmx_domdec_t *dd)
 +{
 +    gmx_bool bZYX;
 +    int  d,dim,i,j,m;
 +    ivec tmp,s;
 +    int  nzone,nzonep;
 +    ivec dd_zp[DD_MAXIZONE];
 +    gmx_domdec_zones_t *zones;
 +    gmx_domdec_ns_ranges_t *izone;
 +    
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        copy_ivec(dd->ci,tmp);
 +        tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
 +        dd->neighbor[d][0] = ddcoord2ddnodeid(dd,tmp);
 +        copy_ivec(dd->ci,tmp);
 +        tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
 +        dd->neighbor[d][1] = ddcoord2ddnodeid(dd,tmp);
 +        if (debug)
 +        {
 +            fprintf(debug,"DD rank %d neighbor ranks in dir %d are + %d - %d\n",
 +                    dd->rank,dim,
 +                    dd->neighbor[d][0],
 +                    dd->neighbor[d][1]);
 +        }
 +    }
 +    
 +    if (DDMASTER(dd))
 +    {
 +        fprintf(stderr,"Making %dD domain decomposition %d x %d x %d\n",
 +          dd->ndim,dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
 +    }
 +    if (fplog)
 +    {
 +        fprintf(fplog,"\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
 +                dd->ndim,
 +                dd->nc[XX],dd->nc[YY],dd->nc[ZZ],
 +                dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
 +    }
 +    switch (dd->ndim)
 +    {
 +    case 3:
 +        nzone  = dd_z3n;
 +        nzonep = dd_zp3n;
 +        for(i=0; i<nzonep; i++)
 +        {
 +            copy_ivec(dd_zp3[i],dd_zp[i]);
 +        }
 +        break;
 +    case 2:
 +        nzone  = dd_z2n;
 +        nzonep = dd_zp2n;
 +        for(i=0; i<nzonep; i++)
 +        {
 +            copy_ivec(dd_zp2[i],dd_zp[i]);
 +        }
 +        break;
 +    case 1:
 +        nzone  = dd_z1n;
 +        nzonep = dd_zp1n;
 +        for(i=0; i<nzonep; i++)
 +        {
 +            copy_ivec(dd_zp1[i],dd_zp[i]);
 +        }
 +        break;
 +    default:
 +        gmx_fatal(FARGS,"Can only do 1, 2 or 3D domain decomposition");
 +        nzone = 0;
 +        nzonep = 0;
 +    }
 +
 +    zones = &dd->comm->zones;
 +
 +    for(i=0; i<nzone; i++)
 +    {
 +        m = 0;
 +        clear_ivec(zones->shift[i]);
 +        for(d=0; d<dd->ndim; d++)
 +        {
 +            zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
 +        }
 +    }
 +    
 +    zones->n = nzone;
 +    for(i=0; i<nzone; i++)
 +    {
 +        for(d=0; d<DIM; d++)
 +        {
 +            s[d] = dd->ci[d] - zones->shift[i][d];
 +            if (s[d] < 0)
 +            {
 +                s[d] += dd->nc[d];
 +            }
 +            else if (s[d] >= dd->nc[d])
 +            {
 +                s[d] -= dd->nc[d];
 +            }
 +        }
 +    }
 +    zones->nizone = nzonep;
 +    for(i=0; i<zones->nizone; i++)
 +    {
 +        if (dd_zp[i][0] != i)
 +        {
 +            gmx_fatal(FARGS,"Internal inconsistency in the dd grid setup");
 +        }
 +        izone = &zones->izone[i];
 +        izone->j0 = dd_zp[i][1];
 +        izone->j1 = dd_zp[i][2];
 +        for(dim=0; dim<DIM; dim++)
 +        {
 +            if (dd->nc[dim] == 1)
 +            {
 +                /* All shifts should be allowed */
 +                izone->shift0[dim] = -1;
 +                izone->shift1[dim] = 1;
 +            }
 +            else
 +            {
 +                /*
 +                  izone->shift0[d] = 0;
 +                  izone->shift1[d] = 0;
 +                  for(j=izone->j0; j<izone->j1; j++) {
 +                  if (dd->shift[j][d] > dd->shift[i][d])
 +                  izone->shift0[d] = -1;
 +                  if (dd->shift[j][d] < dd->shift[i][d])
 +                  izone->shift1[d] = 1;
 +                  }
 +                */
 +                
 +                int shift_diff;
 +                
 +                /* Assume the shift are not more than 1 cell */
 +                izone->shift0[dim] = 1;
 +                izone->shift1[dim] = -1;
 +                for(j=izone->j0; j<izone->j1; j++)
 +                {
 +                    shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
 +                    if (shift_diff < izone->shift0[dim])
 +                    {
 +                        izone->shift0[dim] = shift_diff;
 +                    }
 +                    if (shift_diff > izone->shift1[dim])
 +                    {
 +                        izone->shift1[dim] = shift_diff;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +    
 +    if (dd->comm->eDLB != edlbNO)
 +    {
 +        snew(dd->comm->root,dd->ndim);
 +    }
 +    
 +    if (dd->comm->bRecordLoad)
 +    {
 +        make_load_communicators(dd);
 +    }
 +}
 +
 +static void make_pp_communicator(FILE *fplog,t_commrec *cr,int reorder)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_domdec_comm_t *comm;
 +    int  i,rank,*buf;
 +    ivec periods;
 +#ifdef GMX_MPI
 +    MPI_Comm comm_cart;
 +#endif
 +    
 +    dd = cr->dd;
 +    comm = dd->comm;
 +    
 +#ifdef GMX_MPI
 +    if (comm->bCartesianPP)
 +    {
 +        /* Set up cartesian communication for the particle-particle part */
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Will use a Cartesian communicator: %d x %d x %d\n",
 +                    dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
 +        }
 +        
 +        for(i=0; i<DIM; i++)
 +        {
 +            periods[i] = TRUE;
 +        }
 +        MPI_Cart_create(cr->mpi_comm_mygroup,DIM,dd->nc,periods,reorder,
 +                        &comm_cart);
 +        /* We overwrite the old communicator with the new cartesian one */
 +        cr->mpi_comm_mygroup = comm_cart;
 +    }
 +    
 +    dd->mpi_comm_all = cr->mpi_comm_mygroup;
 +    MPI_Comm_rank(dd->mpi_comm_all,&dd->rank);
 +    
 +    if (comm->bCartesianPP_PME)
 +    {
 +        /* Since we want to use the original cartesian setup for sim,
 +         * and not the one after split, we need to make an index.
 +         */
 +        snew(comm->ddindex2ddnodeid,dd->nnodes);
 +        comm->ddindex2ddnodeid[dd_index(dd->nc,dd->ci)] = dd->rank;
 +        gmx_sumi(dd->nnodes,comm->ddindex2ddnodeid,cr);
 +        /* Get the rank of the DD master,
 +         * above we made sure that the master node is a PP node.
 +         */
 +        if (MASTER(cr))
 +        {
 +            rank = dd->rank;
 +        }
 +        else
 +        {
 +            rank = 0;
 +        }
 +        MPI_Allreduce(&rank,&dd->masterrank,1,MPI_INT,MPI_SUM,dd->mpi_comm_all);
 +    }
 +    else if (comm->bCartesianPP)
 +    {
 +        if (cr->npmenodes == 0)
 +        {
 +            /* The PP communicator is also
 +             * the communicator for this simulation
 +             */
 +            cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
 +        }
 +        cr->nodeid = dd->rank;
 +        
 +        MPI_Cart_coords(dd->mpi_comm_all,dd->rank,DIM,dd->ci);
 +        
 +        /* We need to make an index to go from the coordinates
 +         * to the nodeid of this simulation.
 +         */
 +        snew(comm->ddindex2simnodeid,dd->nnodes);
 +        snew(buf,dd->nnodes);
 +        if (cr->duty & DUTY_PP)
 +        {
 +            buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
 +        }
 +        /* Communicate the ddindex to simulation nodeid index */
 +        MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
 +                      cr->mpi_comm_mysim);
 +        sfree(buf);
 +        
 +        /* Determine the master coordinates and rank.
 +         * The DD master should be the same node as the master of this sim.
 +         */
 +        for(i=0; i<dd->nnodes; i++)
 +        {
 +            if (comm->ddindex2simnodeid[i] == 0)
 +            {
 +                ddindex2xyz(dd->nc,i,dd->master_ci);
 +                MPI_Cart_rank(dd->mpi_comm_all,dd->master_ci,&dd->masterrank);
 +            }
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug,"The master rank is %d\n",dd->masterrank);
 +        }
 +    }
 +    else
 +    {
 +        /* No Cartesian communicators */
 +        /* We use the rank in dd->comm->all as DD index */
 +        ddindex2xyz(dd->nc,dd->rank,dd->ci);
 +        /* The simulation master nodeid is 0, so the DD master rank is also 0 */
 +        dd->masterrank = 0;
 +        clear_ivec(dd->master_ci);
 +    }
 +#endif
 +  
 +    if (fplog)
 +    {
 +        fprintf(fplog,
 +                "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
 +                dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug,
 +                "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
 +                dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
 +    }
 +}
 +
 +static void receive_ddindex2simnodeid(t_commrec *cr)
 +{
 +    gmx_domdec_t *dd;
 +    
 +    gmx_domdec_comm_t *comm;
 +    int  *buf;
 +    
 +    dd = cr->dd;
 +    comm = dd->comm;
 +    
 +#ifdef GMX_MPI
 +    if (!comm->bCartesianPP_PME && comm->bCartesianPP)
 +    {
 +        snew(comm->ddindex2simnodeid,dd->nnodes);
 +        snew(buf,dd->nnodes);
 +        if (cr->duty & DUTY_PP)
 +        {
 +            buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
 +        }
 +#ifdef GMX_MPI
 +        /* Communicate the ddindex to simulation nodeid index */
 +        MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
 +                      cr->mpi_comm_mysim);
 +#endif
 +        sfree(buf);
 +    }
 +#endif
 +}
 +
 +static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
 +                                                     int ncg,int natoms)
 +{
 +    gmx_domdec_master_t *ma;
 +    int i;
 +
 +    snew(ma,1);
 +    
 +    snew(ma->ncg,dd->nnodes);
 +    snew(ma->index,dd->nnodes+1);
 +    snew(ma->cg,ncg);
 +    snew(ma->nat,dd->nnodes);
 +    snew(ma->ibuf,dd->nnodes*2);
 +    snew(ma->cell_x,DIM);
 +    for(i=0; i<DIM; i++)
 +    {
 +        snew(ma->cell_x[i],dd->nc[i]+1);
 +    }
 +
 +    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
 +    {
 +        ma->vbuf = NULL;
 +    }
 +    else
 +    {
 +        snew(ma->vbuf,natoms);
 +    }
 +
 +    return ma;
 +}
 +
 +static void split_communicator(FILE *fplog,t_commrec *cr,int dd_node_order,
 +                               int reorder)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_domdec_comm_t *comm;
 +    int  i,rank;
 +    gmx_bool bDiv[DIM];
 +    ivec periods;
 +#ifdef GMX_MPI
 +    MPI_Comm comm_cart;
 +#endif
 +    
 +    dd = cr->dd;
 +    comm = dd->comm;
 +    
 +    if (comm->bCartesianPP)
 +    {
 +        for(i=1; i<DIM; i++)
 +        {
 +            bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
 +        }
 +        if (bDiv[YY] || bDiv[ZZ])
 +        {
 +            comm->bCartesianPP_PME = TRUE;
 +            /* If we have 2D PME decomposition, which is always in x+y,
 +             * we stack the PME only nodes in z.
 +             * Otherwise we choose the direction that provides the thinnest slab
 +             * of PME only nodes as this will have the least effect
 +             * on the PP communication.
 +             * But for the PME communication the opposite might be better.
 +             */
 +            if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
 +                             !bDiv[YY] ||
 +                             dd->nc[YY] > dd->nc[ZZ]))
 +            {
 +                comm->cartpmedim = ZZ;
 +            }
 +            else
 +            {
 +                comm->cartpmedim = YY;
 +            }
 +            comm->ntot[comm->cartpmedim]
 +                += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
 +        }
 +        else if (fplog)
 +        {
 +            fprintf(fplog,"#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n",cr->npmenodes,dd->nc[XX],dd->nc[YY],dd->nc[XX],dd->nc[ZZ]);
 +            fprintf(fplog,
 +                    "Will not use a Cartesian communicator for PP <-> PME\n\n");
 +        }
 +    }
 +    
 +#ifdef GMX_MPI
 +    if (comm->bCartesianPP_PME)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n",comm->ntot[XX],comm->ntot[YY],comm->ntot[ZZ]);
 +        }
 +        
 +        for(i=0; i<DIM; i++)
 +        {
 +            periods[i] = TRUE;
 +        }
 +        MPI_Cart_create(cr->mpi_comm_mysim,DIM,comm->ntot,periods,reorder,
 +                        &comm_cart);
 +        
 +        MPI_Comm_rank(comm_cart,&rank);
 +        if (MASTERNODE(cr) && rank != 0)
 +        {
 +            gmx_fatal(FARGS,"MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
 +        }
 +        
 +        /* With this assigment we loose the link to the original communicator
 +         * which will usually be MPI_COMM_WORLD, unless have multisim.
 +         */
 +        cr->mpi_comm_mysim = comm_cart;
 +        cr->sim_nodeid = rank;
 +        
 +        MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,dd->ci);
 +        
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Cartesian nodeid %d, coordinates %d %d %d\n\n",
 +                    cr->sim_nodeid,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
 +        }
 +        
 +        if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
 +        {
 +            cr->duty = DUTY_PP;
 +        }
 +        if (cr->npmenodes == 0 ||
 +            dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
 +        {
 +            cr->duty = DUTY_PME;
 +        }
 +        
 +        /* Split the sim communicator into PP and PME only nodes */
 +        MPI_Comm_split(cr->mpi_comm_mysim,
 +                       cr->duty,
 +                       dd_index(comm->ntot,dd->ci),
 +                       &cr->mpi_comm_mygroup);
 +    }
 +    else
 +    {
 +        switch (dd_node_order)
 +        {
 +        case ddnoPP_PME:
 +            if (fplog)
 +            {
 +                fprintf(fplog,"Order of the nodes: PP first, PME last\n");
 +            }
 +            break;
 +        case ddnoINTERLEAVE:
 +            /* Interleave the PP-only and PME-only nodes,
 +             * as on clusters with dual-core machines this will double
 +             * the communication bandwidth of the PME processes
 +             * and thus speed up the PP <-> PME and inter PME communication.
 +             */
 +            if (fplog)
 +            {
 +                fprintf(fplog,"Interleaving PP and PME nodes\n");
 +            }
 +            comm->pmenodes = dd_pmenodes(cr);
 +            break;
 +        case ddnoCARTESIAN:
 +            break;
 +        default:
 +            gmx_fatal(FARGS,"Unknown dd_node_order=%d",dd_node_order);
 +        }
 +    
 +        if (dd_simnode2pmenode(cr,cr->sim_nodeid) == -1)
 +        {
 +            cr->duty = DUTY_PME;
 +        }
 +        else
 +        {
 +            cr->duty = DUTY_PP;
 +        }
 +        
 +        /* Split the sim communicator into PP and PME only nodes */
 +        MPI_Comm_split(cr->mpi_comm_mysim,
 +                       cr->duty,
 +                       cr->nodeid,
 +                       &cr->mpi_comm_mygroup);
 +        MPI_Comm_rank(cr->mpi_comm_mygroup,&cr->nodeid);
 +    }
 +#endif
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog,"This is a %s only node\n\n",
 +                (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
 +    }
 +}
 +
 +void make_dd_communicators(FILE *fplog,t_commrec *cr,int dd_node_order)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_domdec_comm_t *comm;
 +    int CartReorder;
 +    
 +    dd = cr->dd;
 +    comm = dd->comm;
 +    
 +    copy_ivec(dd->nc,comm->ntot);
 +    
 +    comm->bCartesianPP = (dd_node_order == ddnoCARTESIAN);
 +    comm->bCartesianPP_PME = FALSE;
 +    
 +    /* Reorder the nodes by default. This might change the MPI ranks.
 +     * Real reordering is only supported on very few architectures,
 +     * Blue Gene is one of them.
 +     */
 +    CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
 +    
 +    if (cr->npmenodes > 0)
 +    {
 +        /* Split the communicator into a PP and PME part */
 +        split_communicator(fplog,cr,dd_node_order,CartReorder);
 +        if (comm->bCartesianPP_PME)
 +        {
 +            /* We (possibly) reordered the nodes in split_communicator,
 +             * so it is no longer required in make_pp_communicator.
 +             */
 +            CartReorder = FALSE;
 +        }
 +    }
 +    else
 +    {
 +        /* All nodes do PP and PME */
 +#ifdef GMX_MPI    
 +        /* We do not require separate communicators */
 +        cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
 +#endif
 +    }
 +    
 +    if (cr->duty & DUTY_PP)
 +    {
 +        /* Copy or make a new PP communicator */
 +        make_pp_communicator(fplog,cr,CartReorder);
 +    }
 +    else
 +    {
 +        receive_ddindex2simnodeid(cr);
 +    }
 +    
 +    if (!(cr->duty & DUTY_PME))
 +    {
 +        /* Set up the commnuication to our PME node */
 +        dd->pme_nodeid = dd_simnode2pmenode(cr,cr->sim_nodeid);
 +        dd->pme_receive_vir_ener = receive_vir_ener(cr);
 +        if (debug)
 +        {
 +            fprintf(debug,"My pme_nodeid %d receive ener %d\n",
 +                    dd->pme_nodeid,dd->pme_receive_vir_ener);
 +        }
 +    }
 +    else
 +    {
 +        dd->pme_nodeid = -1;
 +    }
 +
 +    if (DDMASTER(dd))
 +    {
 +        dd->ma = init_gmx_domdec_master_t(dd,
 +                                          comm->cgs_gl.nr,
 +                                          comm->cgs_gl.index[comm->cgs_gl.nr]);
 +    }
 +}
 +
 +static real *get_slb_frac(FILE *fplog,const char *dir,int nc,const char *size_string)
 +{
 +    real *slb_frac,tot;
 +    int  i,n;
 +    double dbl;
 +    
 +    slb_frac = NULL;
 +    if (nc > 1 && size_string != NULL)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Using static load balancing for the %s direction\n",
 +                    dir);
 +        }
 +        snew(slb_frac,nc);
 +        tot = 0;
 +        for (i=0; i<nc; i++)
 +        {
 +            dbl = 0;
 +            sscanf(size_string,"%lf%n",&dbl,&n);
 +            if (dbl == 0)
 +            {
 +                gmx_fatal(FARGS,"Incorrect or not enough DD cell size entries for direction %s: '%s'",dir,size_string);
 +            }
 +            slb_frac[i] = dbl;
 +            size_string += n;
 +            tot += slb_frac[i];
 +        }
 +        /* Normalize */
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Relative cell sizes:");
 +        }
 +        for (i=0; i<nc; i++)
 +        {
 +            slb_frac[i] /= tot;
 +            if (fplog)
 +            {
 +                fprintf(fplog," %5.3f",slb_frac[i]);
 +            }
 +        }
 +        if (fplog)
 +        {
 +            fprintf(fplog,"\n");
 +        }
 +    }
 +    
 +    return slb_frac;
 +}
 +
 +static int multi_body_bondeds_count(gmx_mtop_t *mtop)
 +{
 +    int n,nmol,ftype;
 +    gmx_mtop_ilistloop_t iloop;
 +    t_ilist *il;
 +    
 +    n = 0;
 +    iloop = gmx_mtop_ilistloop_init(mtop);
 +    while (gmx_mtop_ilistloop_next(iloop,&il,&nmol))
 +    {
 +        for(ftype=0; ftype<F_NRE; ftype++)
 +        {
 +            if ((interaction_function[ftype].flags & IF_BOND) &&
 +                NRAL(ftype) >  2)
 +            {
 +                n += nmol*il[ftype].nr/(1 + NRAL(ftype));
 +            }
 +        }
 +  }
 +
 +  return n;
 +}
 +
 +static int dd_nst_env(FILE *fplog,const char *env_var,int def)
 +{
 +    char *val;
 +    int  nst;
 +    
 +    nst = def;
 +    val = getenv(env_var);
 +    if (val)
 +    {
 +        if (sscanf(val,"%d",&nst) <= 0)
 +        {
 +            nst = 1;
 +        }
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Found env.var. %s = %s, using value %d\n",
 +                    env_var,val,nst);
 +        }
 +    }
 +    
 +    return nst;
 +}
 +
 +static void dd_warning(t_commrec *cr,FILE *fplog,const char *warn_string)
 +{
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr,"\n%s\n",warn_string);
 +    }
 +    if (fplog)
 +    {
 +        fprintf(fplog,"\n%s\n",warn_string);
 +    }
 +}
 +
 +static void check_dd_restrictions(t_commrec *cr,gmx_domdec_t *dd,
 +                                  t_inputrec *ir,FILE *fplog)
 +{
 +    if (ir->ePBC == epbcSCREW &&
 +        (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
 +    {
 +        gmx_fatal(FARGS,"With pbc=%s can only do domain decomposition in the x-direction",epbc_names[ir->ePBC]);
 +    }
 +
 +    if (ir->ns_type == ensSIMPLE)
 +    {
 +        gmx_fatal(FARGS,"Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
 +    }
 +
 +    if (ir->nstlist == 0)
 +    {
 +        gmx_fatal(FARGS,"Domain decomposition does not work with nstlist=0");
 +    }
 +
 +    if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
 +    {
 +        dd_warning(cr,fplog,"comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
 +    }
 +}
 +
 +static real average_cellsize_min(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
 +{
 +    int  di,d;
 +    real r;
 +
 +    r = ddbox->box_size[XX];
 +    for(di=0; di<dd->ndim; di++)
 +    {
 +        d = dd->dim[di];
 +        /* Check using the initial average cell size */
 +        r = min(r,ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
 +    }
 +
 +    return r;
 +}
 +
 +static int check_dlb_support(FILE *fplog,t_commrec *cr,
 +                             const char *dlb_opt,gmx_bool bRecordLoad,
 +                             unsigned long Flags,t_inputrec *ir)
 +{
 +    gmx_domdec_t *dd;
 +    int  eDLB=-1;
 +    char buf[STRLEN];
 +
 +    switch (dlb_opt[0])
 +    {
 +    case 'a': eDLB = edlbAUTO; break;
 +    case 'n': eDLB = edlbNO;   break;
 +    case 'y': eDLB = edlbYES;  break;
 +    default: gmx_incons("Unknown dlb_opt");
 +    }
 +
 +    if (Flags & MD_RERUN)
 +    {
 +        return edlbNO;
 +    }
 +
 +    if (!EI_DYNAMICS(ir->eI))
 +    {
 +        if (eDLB == edlbYES)
 +        {
 +            sprintf(buf,"NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n",EI(ir->eI));
 +            dd_warning(cr,fplog,buf);
 +        }
 +            
 +        return edlbNO;
 +    }
 +
 +    if (!bRecordLoad)
 +    {
 +        dd_warning(cr,fplog,"NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
 +
 +        return edlbNO;
 +    }
 +
 +    if (Flags & MD_REPRODUCIBLE)
 +    {
 +        switch (eDLB)
 +        {
 +                      case edlbNO: 
 +                              break;
 +                      case edlbAUTO:
 +                              dd_warning(cr,fplog,"NOTE: reproducibility requested, will not use dynamic load balancing\n");
 +                              eDLB = edlbNO;
 +                              break;
 +                      case edlbYES:
 +                              dd_warning(cr,fplog,"WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
 +                              break;
 +                      default:
 +                              gmx_fatal(FARGS,"Death horror: undefined case (%d) for load balancing choice",eDLB);
 +                              break;
 +        }
 +    }
 +
 +    return eDLB;
 +}
 +
 +static void set_dd_dim(FILE *fplog,gmx_domdec_t *dd)
 +{
 +    int dim;
 +
 +    dd->ndim = 0;
 +    if (getenv("GMX_DD_ORDER_ZYX") != NULL)
 +    {
 +        /* Decomposition order z,y,x */
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Using domain decomposition order z, y, x\n");
 +        }
 +        for(dim=DIM-1; dim>=0; dim--)
 +        {
 +            if (dd->nc[dim] > 1)
 +            {
 +                dd->dim[dd->ndim++] = dim;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        /* Decomposition order x,y,z */
 +        for(dim=0; dim<DIM; dim++)
 +        {
 +            if (dd->nc[dim] > 1)
 +            {
 +                dd->dim[dd->ndim++] = dim;
 +            }
 +        }
 +    }
 +}
 +
 +static gmx_domdec_comm_t *init_dd_comm()
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  i;
 +
 +    snew(comm,1);
 +    snew(comm->cggl_flag,DIM*2);
 +    snew(comm->cgcm_state,DIM*2);
 +    for(i=0; i<DIM*2; i++)
 +    {
 +        comm->cggl_flag_nalloc[i]  = 0;
 +        comm->cgcm_state_nalloc[i] = 0;
 +    }
 +    
 +    comm->nalloc_int = 0;
 +    comm->buf_int    = NULL;
 +
 +    vec_rvec_init(&comm->vbuf);
 +
 +    comm->n_load_have    = 0;
 +    comm->n_load_collect = 0;
 +
 +    for(i=0; i<ddnatNR-ddnatZONE; i++)
 +    {
 +        comm->sum_nat[i] = 0;
 +    }
 +    comm->ndecomp = 0;
 +    comm->nload   = 0;
 +    comm->load_step = 0;
 +    comm->load_sum  = 0;
 +    comm->load_max  = 0;
 +    clear_ivec(comm->load_lim);
 +    comm->load_mdf  = 0;
 +    comm->load_pme  = 0;
 +
 +    return comm;
 +}
 +
 +/* Allocate and initialize the domain decomposition data structure.
 + *
 + * In order, this routine:
 + *  - reads the GMX_DD_* and GMX_DLB_* tuning values through dd_nst_env,
 + *  - decides on load recording and dynamic load balancing (DLB),
 + *  - derives the communication cut-offs and the minimum cell size from
 + *    the bonded interactions (comm_distance_min or a measured distance)
 + *    and from the constraints (rconstr or an estimate),
 + *  - uses the grid passed in nc when nc[XX] > 0, otherwise lets
 + *    dd_choose_grid pick the grid and the number of PME nodes,
 + *  - sets up the PME decomposition dimensions.
 + *
 + * The number of PME nodes along x and y is returned in *npme_x and
 + * *npme_y. gmx_fatal_collective is called when the grid is incompatible
 + * with the cell size limit or with the number of nodes.
 + * Returns the newly allocated gmx_domdec_t.
 + */
 +gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
 +                                        unsigned long Flags,
 +                                        ivec nc,
 +                                        real comm_distance_min,real rconstr,
 +                                        const char *dlb_opt,real dlb_scale,
 +                                        const char *sizex,const char *sizey,const char *sizez,
 +                                        gmx_mtop_t *mtop,t_inputrec *ir,
 +                                        matrix box,rvec *x,
 +                                        gmx_ddbox_t *ddbox,
 +                                        int *npme_x,int *npme_y)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_domdec_comm_t *comm;
 +    int  recload;
 +    int  d,i,j;
 +    real r_2b,r_mb,r_bonded=-1,r_bonded_limit=-1,limit,acs;
 +    gmx_bool bC;
 +    char buf[STRLEN];
 +    
 +    if (fplog)
 +    {
 +        fprintf(fplog,
 +                "\nInitializing Domain Decomposition on %d nodes\n",cr->nnodes);
 +    }
 +    
 +    snew(dd,1);
 +
 +    dd->comm = init_dd_comm();
 +    comm = dd->comm;
 +    snew(comm->cggl_flag,DIM*2);
 +    snew(comm->cgcm_state,DIM*2);
 +
 +    dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
 +    dd->bScrewPBC = (ir->ePBC == epbcSCREW);
 +    
 +    /* Tuning values; the last argument of dd_nst_env is presumably
 +     * the default used when the variable is not set.
 +     */
 +    dd->bSendRecv2      = dd_nst_env(fplog,"GMX_DD_SENDRECV2",0);
 +    comm->dlb_scale_lim = dd_nst_env(fplog,"GMX_DLB_MAX",10);
 +    comm->eFlop         = dd_nst_env(fplog,"GMX_DLB_FLOP",0);
 +    recload             = dd_nst_env(fplog,"GMX_DD_LOAD",1);
 +    comm->nstSortCG     = dd_nst_env(fplog,"GMX_DD_SORT",1);
 +    comm->nstDDDump     = dd_nst_env(fplog,"GMX_DD_DUMP",0);
 +    comm->nstDDDumpGrid = dd_nst_env(fplog,"GMX_DD_DUMP_GRID",0);
 +    comm->DD_debug      = dd_nst_env(fplog,"GMX_DD_DEBUG",0);
 +
 +    dd->pme_recv_f_alloc = 0;
 +    dd->pme_recv_f_buf = NULL;
 +
 +    if (dd->bSendRecv2 && fplog)
 +    {
 +        fprintf(fplog,"Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
 +    }
 +    if (comm->eFlop)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Will load balance based on FLOP count\n");
 +        }
 +        if (comm->eFlop > 1)
 +        {
 +            /* Seed the C random generator differently on each node */
 +            srand(1+cr->nodeid);
 +        }
 +        comm->bRecordLoad = TRUE;
 +    }
 +    else
 +    {
 +        comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
 +                             
 +    }
 +    
 +    comm->eDLB = check_dlb_support(fplog,cr,dlb_opt,comm->bRecordLoad,Flags,ir);
 +    
 +    comm->bDynLoadBal = (comm->eDLB == edlbYES);
 +    if (fplog)
 +    {
 +        fprintf(fplog,"Dynamic load balancing: %s\n",edlb_names[comm->eDLB]);
 +    }
 +    dd->bGridJump = comm->bDynLoadBal;
 +    
 +    if (comm->nstSortCG)
 +    {
 +        if (fplog)
 +        {
 +            if (comm->nstSortCG == 1)
 +            {
 +                fprintf(fplog,"Will sort the charge groups at every domain (re)decomposition\n");
 +            }
 +            else
 +            {
 +                fprintf(fplog,"Will sort the charge groups every %d steps\n",
 +                        comm->nstSortCG);
 +            }
 +        }
 +        snew(comm->sort,1);
 +    }
 +    else
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Will not sort the charge groups\n");
 +        }
 +    }
 +
 +    /* Fewer charge groups than atoms: some charge group has several atoms */
 +    comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
 +    
 +    /* More charge groups than molecules: some molecule spans charge groups */
 +    comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
 +    if (comm->bInterCGBondeds)
 +    {
 +        comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
 +    }
 +    else
 +    {
 +        comm->bInterCGMultiBody = FALSE;
 +    }
 +    
 +    dd->bInterCGcons    = inter_charge_group_constraints(mtop);
 +    dd->bInterCGsettles = inter_charge_group_settles(mtop);
 +
 +    if (ir->rlistlong == 0)
 +    {
 +        /* Set the cut-off to some very large value,
 +         * so we don't need if statements everywhere in the code.
 +         * We use sqrt, since the cut-off is squared in some places.
 +         */
 +        comm->cutoff   = GMX_CUTOFF_INF;
 +    }
 +    else
 +    {
 +        comm->cutoff   = ir->rlistlong;
 +    }
 +    comm->cutoff_mbody = 0;
 +    
 +    comm->cellsize_limit = 0;
 +    comm->bBondComm = FALSE;
 +
 +    if (comm->bInterCGBondeds)
 +    {
 +        if (comm_distance_min > 0)
 +        {
 +            /* A bonded communication distance was passed in by the caller */
 +            comm->cutoff_mbody = comm_distance_min;
 +            if (Flags & MD_DDBONDCOMM)
 +            {
 +                comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
 +            }
 +            else
 +            {
 +                comm->cutoff = max(comm->cutoff,comm->cutoff_mbody);
 +            }
 +            r_bonded_limit = comm->cutoff_mbody;
 +        }
 +        else if (ir->bPeriodicMols)
 +        {
 +            /* Can not easily determine the required cut-off */
 +            dd_warning(cr,fplog,"NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
 +            comm->cutoff_mbody = comm->cutoff/2;
 +            r_bonded_limit = comm->cutoff_mbody;
 +        }
 +        else
 +        {
 +            /* r_2b and r_mb are only computed on the master node;
 +             * gmx_bcast is then used to give all nodes the same values
 +             * (presumably broadcasting from the master).
 +             */
 +            if (MASTER(cr))
 +            {
 +                dd_bonded_cg_distance(fplog,dd,mtop,ir,x,box,
 +                                      Flags & MD_DDBONDCHECK,&r_2b,&r_mb);
 +            }
 +            gmx_bcast(sizeof(r_2b),&r_2b,cr);
 +            gmx_bcast(sizeof(r_mb),&r_mb,cr);
 +
 +            /* We use an initial margin of 10% for the minimum cell size,
 +             * except when we are just below the non-bonded cut-off.
 +             */
 +            if (Flags & MD_DDBONDCOMM)
 +            {
 +                if (max(r_2b,r_mb) > comm->cutoff)
 +                {
 +                    r_bonded       = max(r_2b,r_mb);
 +                    r_bonded_limit = 1.1*r_bonded;
 +                    comm->bBondComm = TRUE;
 +                }
 +                else
 +                {
 +                    r_bonded       = r_mb;
 +                    r_bonded_limit = min(1.1*r_bonded,comm->cutoff);
 +                }
 +                /* We determine cutoff_mbody later */
 +            }
 +            else
 +            {
 +                /* No special bonded communication,
 +                 * simply increase the DD cut-off.
 +                 */
 +                r_bonded_limit     = 1.1*max(r_2b,r_mb);
 +                comm->cutoff_mbody = r_bonded_limit;
 +                comm->cutoff       = max(comm->cutoff,comm->cutoff_mbody);
 +            }
 +        }
 +        comm->cellsize_limit = max(comm->cellsize_limit,r_bonded_limit);
 +        if (fplog)
 +        {
 +            fprintf(fplog,
 +                    "Minimum cell size due to bonded interactions: %.3f nm\n",
 +                    comm->cellsize_limit);
 +        }
 +    }
 +
 +    if (dd->bInterCGcons && rconstr <= 0)
 +    {
 +        /* There is a cell size limit due to the constraints (P-LINCS) */
 +        rconstr = constr_r_max(fplog,mtop,ir);
 +        if (fplog)
 +        {
 +            fprintf(fplog,
 +                    "Estimated maximum distance required for P-LINCS: %.3f nm\n",
 +                    rconstr);
 +            if (rconstr > comm->cellsize_limit)
 +            {
 +                fprintf(fplog,"This distance will limit the DD cell size, you can override this with -rcon\n");
 +            }
 +        }
 +    }
 +    else if (rconstr > 0 && fplog)
 +    {
 +        /* Here we do not check for dd->bInterCGcons,
 +         * because one can also set a cell size limit for virtual sites only
 +         * and at this point we don't know yet if there are intercg v-sites.
 +         */
 +        fprintf(fplog,
 +                "User supplied maximum distance required for P-LINCS: %.3f nm\n",
 +                rconstr);
 +    }
 +    comm->cellsize_limit = max(comm->cellsize_limit,rconstr);
 +
 +    comm->cgs_gl = gmx_mtop_global_cgs(mtop);
 +
 +    if (nc[XX] > 0)
 +    {
 +        /* A DD grid was passed in through nc: use it as is and
 +         * only check it against the cell size limit.
 +         */
 +        copy_ivec(nc,dd->nc);
 +        set_dd_dim(fplog,dd);
 +        set_ddbox_cr(cr,&dd->nc,ir,box,&comm->cgs_gl,x,ddbox);
 +
 +        /* -1 is replaced by 0 separate PME nodes when the grid is given */
 +        if (cr->npmenodes == -1)
 +        {
 +            cr->npmenodes = 0;
 +        }
 +        acs = average_cellsize_min(dd,ddbox);
 +        if (acs < comm->cellsize_limit)
 +        {
 +            if (fplog)
 +            {
 +                fprintf(fplog,"ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n",acs,comm->cellsize_limit);
 +            }
 +            gmx_fatal_collective(FARGS,cr,NULL,
 +                                 "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
 +                                 acs,comm->cellsize_limit);
 +        }
 +    }
 +    else
 +    {
 +        set_ddbox_cr(cr,NULL,ir,box,&comm->cgs_gl,x,ddbox);
 +
 +        /* We need to choose the optimal DD grid and possibly PME nodes */
 +        limit = dd_choose_grid(fplog,cr,dd,ir,mtop,box,ddbox,
 +                               comm->eDLB!=edlbNO,dlb_scale,
 +                               comm->cellsize_limit,comm->cutoff,
 +                               comm->bInterCGBondeds,comm->bInterCGMultiBody);
 +        
 +        /* dd->nc[XX] == 0 after dd_choose_grid is treated as
 +         * "no suitable grid found".
 +         */
 +        if (dd->nc[XX] == 0)
 +        {
 +            /* bC: the constraint distance, not the bonded distance,
 +             * is the larger of the two limits; used to pick the hint.
 +             */
 +            bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
 +            sprintf(buf,"Change the number of nodes or mdrun option %s%s%s",
 +                    !bC ? "-rdd" : "-rcon",
 +                    comm->eDLB!=edlbNO ? " or -dds" : "",
 +                    bC ? " or your LINCS settings" : "");
 +
 +            gmx_fatal_collective(FARGS,cr,NULL,
 +                                 "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
 +                                 "%s\n"
 +                                 "Look in the log file for details on the domain decomposition",
 +                                 cr->nnodes-cr->npmenodes,limit,buf);
 +        }
 +        set_dd_dim(fplog,dd);
 +    }
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog,
 +                "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
 +                dd->nc[XX],dd->nc[YY],dd->nc[ZZ],cr->npmenodes);
 +    }
 +    
 +    dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
 +    if (cr->nnodes - dd->nnodes != cr->npmenodes)
 +    {
 +        gmx_fatal_collective(FARGS,cr,NULL,
 +                             "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
 +                             dd->nnodes,cr->nnodes - cr->npmenodes,cr->nnodes);
 +    }
 +    if (cr->npmenodes > dd->nnodes)
 +    {
 +        gmx_fatal_collective(FARGS,cr,NULL,
 +                             "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.",cr->npmenodes,dd->nnodes);
 +    }
 +    /* Without separate PME nodes, all dd->nnodes nodes count as PME nodes */
 +    if (cr->npmenodes > 0)
 +    {
 +        comm->npmenodes = cr->npmenodes;
 +    }
 +    else
 +    {
 +        comm->npmenodes = dd->nnodes;
 +    }
 +
 +    if (EEL_PME(ir->coulombtype))
 +    {
 +        /* The following choices should match those
 +         * in comm_cost_est in domdec_setup.c.
 +         * Note that here the checks have to take into account
 +         * that the decomposition might occur in a different order than xyz
 +         * (for instance through the env.var. GMX_DD_ORDER_ZYX),
 +         * in which case they will not match those in comm_cost_est,
 +         * but since that is mainly for testing purposes that's fine.
 +         */
 +        if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
 +            comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
 +            getenv("GMX_PMEONEDD") == NULL)
 +        {
 +            comm->npmedecompdim = 2;
 +            comm->npmenodes_x   = dd->nc[XX];
 +            comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
 +        }
 +        else
 +        {
 +            /* In case nc is 1 in both x and y we could still choose to
 +             * decompose pme in y instead of x, but we use x for simplicity.
 +             */
 +            comm->npmedecompdim = 1;
 +            if (dd->dim[0] == YY)
 +            {
 +                comm->npmenodes_x = 1;
 +                comm->npmenodes_y = comm->npmenodes;
 +            }
 +            else
 +            {
 +                comm->npmenodes_x = comm->npmenodes;
 +                comm->npmenodes_y = 1;
 +            }
 +        }    
 +        if (fplog)
 +        {
 +            fprintf(fplog,"PME domain decomposition: %d x %d x %d\n",
 +                    comm->npmenodes_x,comm->npmenodes_y,1);
 +        }
 +    }
 +    else
 +    {
 +        comm->npmedecompdim = 0;
 +        comm->npmenodes_x   = 0;
 +        comm->npmenodes_y   = 0;
 +    }
 +    
 +    /* Technically we don't need both of these,
 +     * but it simplifies code not having to recalculate it.
 +     */
 +    *npme_x = comm->npmenodes_x;
 +    *npme_y = comm->npmenodes_y;
 +        
 +    /* The static load balancing fractions are only read without DLB */
 +    snew(comm->slb_frac,DIM);
 +    if (comm->eDLB == edlbNO)
 +    {
 +        comm->slb_frac[XX] = get_slb_frac(fplog,"x",dd->nc[XX],sizex);
 +        comm->slb_frac[YY] = get_slb_frac(fplog,"y",dd->nc[YY],sizey);
 +        comm->slb_frac[ZZ] = get_slb_frac(fplog,"z",dd->nc[ZZ],sizez);
 +    }
 +
 +    /* cutoff_mbody is still 0 only when it was left to be determined
 +     * from the measured bonded distance r_bonded above.
 +     */
 +    if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
 +    {
 +        if (comm->bBondComm || comm->eDLB != edlbNO)
 +        {
 +            /* Set the bonded communication distance to halfway
 +             * the minimum and the maximum,
 +             * since the extra communication cost is nearly zero.
 +             */
 +            acs = average_cellsize_min(dd,ddbox);
 +            comm->cutoff_mbody = 0.5*(r_bonded + acs);
 +            if (comm->eDLB != edlbNO)
 +            {
 +                /* Check if this does not limit the scaling */
 +                comm->cutoff_mbody = min(comm->cutoff_mbody,dlb_scale*acs);
 +            }
 +            if (!comm->bBondComm)
 +            {
 +                /* Without bBondComm do not go beyond the n.b. cut-off */
 +                comm->cutoff_mbody = min(comm->cutoff_mbody,comm->cutoff);
 +                if (comm->cellsize_limit >= comm->cutoff)
 +                {
 +                    /* We don't lose a lot of efficiency
 +                     * when increasing it to the n.b. cut-off.
 +                     * It can even be slightly faster, because we need
 +                     * less checks for the communication setup.
 +                     */
 +                    comm->cutoff_mbody = comm->cutoff;
 +                }
 +            }
 +            /* Check if we did not end up below our original limit */
 +            comm->cutoff_mbody = max(comm->cutoff_mbody,r_bonded_limit);
 +
 +            if (comm->cutoff_mbody > comm->cellsize_limit)
 +            {
 +                comm->cellsize_limit = comm->cutoff_mbody;
 +            }
 +        }
 +        /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"Bonded atom communication beyond the cut-off: %d\n"
 +                "cellsize limit %f\n",
 +                comm->bBondComm,comm->cellsize_limit);
 +    }
 +    
 +    if (MASTER(cr))
 +    {
 +        check_dd_restrictions(cr,dd,ir,fplog);
 +    }
 +
 +    /* INT_MIN marks that no partitioning step has been recorded yet
 +     * (NOTE(review): inferred from the sentinel value, confirm at use site).
 +     */
 +    comm->partition_step = INT_MIN;
 +    dd->ddp_count = 0;
 +
 +    clear_dd_cycle_counts(dd);
 +
 +    return dd;
 +}
 +
 +static void set_dlb_limits(gmx_domdec_t *dd)
 +
 +{
 +    int d;
 +
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
 +        dd->comm->cellsize_min[dd->dim[d]] =
 +            dd->comm->cellsize_min_dlb[dd->dim[d]];
 +    }
 +}
 +
 +
 +static void turn_on_dlb(FILE *fplog,t_commrec *cr,gmx_large_int_t step)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_domdec_comm_t *comm;
 +    real cellsize_min;
 +    int  d,nc,i;
 +    char buf[STRLEN];
 +    
 +    dd = cr->dd;
 +    comm = dd->comm;
 +    
 +    if (fplog)
 +    {
 +        fprintf(fplog,"At step %s the performance loss due to force load imbalance is %.1f %%\n",gmx_step_str(step,buf),dd_force_imb_perf_loss(dd)*100);
 +    }
 +
 +    cellsize_min = comm->cellsize_min[dd->dim[0]];
 +    for(d=1; d<dd->ndim; d++)
 +    {
 +        cellsize_min = min(cellsize_min,comm->cellsize_min[dd->dim[d]]);
 +    }
 +
 +    if (cellsize_min < comm->cellsize_limit*1.05)
 +    {
 +        dd_warning(cr,fplog,"NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
 +
 +        /* Change DLB from "auto" to "no". */
 +        comm->eDLB = edlbNO;
 +
 +        return;
 +    }
 +
 +    dd_warning(cr,fplog,"NOTE: Turning on dynamic load balancing\n");
 +    comm->bDynLoadBal = TRUE;
 +    dd->bGridJump = TRUE;
 +    
 +    set_dlb_limits(dd);
 +
 +    /* We can set the required cell size info here,
 +     * so we do not need to communicate this.
 +     * The grid is completely uniform.
 +     */
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        if (comm->root[d])
 +        {
 +            comm->load[d].sum_m = comm->load[d].sum;
 +
 +            nc = dd->nc[dd->dim[d]];
 +            for(i=0; i<nc; i++)
 +            {
 +                comm->root[d]->cell_f[i]    = i/(real)nc;
 +                if (d > 0)
 +                {
 +                    comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
 +                    comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
 +                }
 +            }
 +            comm->root[d]->cell_f[nc] = 1.0;
 +        }
 +    }
 +}
 +
 +static char *init_bLocalCG(gmx_mtop_t *mtop)
 +{
 +    int  ncg,cg;
 +    char *bLocalCG;
 +    
 +    ncg = ncg_mtop(mtop);
 +    snew(bLocalCG,ncg);
 +    for(cg=0; cg<ncg; cg++)
 +    {
 +        bLocalCG[cg] = FALSE;
 +    }
 +
 +    return bLocalCG;
 +}
 +
 +void dd_init_bondeds(FILE *fplog,
 +                     gmx_domdec_t *dd,gmx_mtop_t *mtop,
 +                     gmx_vsite_t *vsite,gmx_constr_t constr,
 +                     t_inputrec *ir,gmx_bool bBCheck,cginfo_mb_t *cginfo_mb)
 +{
 +    gmx_domdec_comm_t *comm;
 +    gmx_bool bBondComm;
 +    int  d;
 +
 +    dd_make_reverse_top(fplog,dd,mtop,vsite,constr,ir,bBCheck);
 +
 +    comm = dd->comm;
 +
 +    if (comm->bBondComm)
 +    {
 +        /* Communicate atoms beyond the cut-off for bonded interactions */
 +        comm = dd->comm;
 +
 +        comm->cglink = make_charge_group_links(mtop,dd,cginfo_mb);
 +
 +        comm->bLocalCG = init_bLocalCG(mtop);
 +    }
 +    else
 +    {
 +        /* Only communicate atoms based on cut-off */
 +        comm->cglink   = NULL;
 +        comm->bLocalCG = NULL;
 +    }
 +}
 +
 +/* Write a summary of the domain decomposition settings to fplog.
 + *
 + * Does nothing when fplog is NULL. With bDynLoadBal set, the DLB pulse
 + * counts, the cell size limit and the allowed cell shrink are printed;
 + * otherwise set_dd_cell_sizes_slb is called and the initial pulse counts
 + * and cell sizes are printed. When bonded interactions between charge
 + * groups, virtual site or constraint communication are present, the
 + * maximum allowed interaction distances are listed as well.
 + * The log file is flushed at the end.
 + */
 +static void print_dd_settings(FILE *fplog,gmx_domdec_t *dd,
 +                              t_inputrec *ir,
 +                              gmx_bool bDynLoadBal,real dlb_scale,
 +                              gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  d;
 +    ivec np;
 +    real limit,shrink;
 +    char buf[64];
 +
 +    if (fplog == NULL)
 +    {
 +        return;
 +    }
 +
 +    comm = dd->comm;
 +
 +    if (bDynLoadBal)
 +    {
 +        fprintf(fplog,"The maximum number of communication pulses is:");
 +        for(d=0; d<dd->ndim; d++)
 +        {
 +            fprintf(fplog," %c %d",dim2char(dd->dim[d]),comm->cd[d].np_dlb);
 +        }
 +        fprintf(fplog,"\n");
 +        fprintf(fplog,"The minimum size for domain decomposition cells is %.3f nm\n",comm->cellsize_limit);
 +        fprintf(fplog,"The requested allowed shrink of DD cells (option -dds) is: %.2f\n",dlb_scale);
 +        fprintf(fplog,"The allowed shrink of domain decomposition cells is:");
 +        for(d=0; d<DIM; d++)
 +        {
 +            if (dd->nc[d] > 1)
 +            {
 +                /* A shrink of 0 is reported for a non-periodic dimension
 +                 * (d >= npbcdim) that is split into exactly two cells.
 +                 */
 +                if (d >= ddbox->npbcdim && dd->nc[d] == 2)
 +                {
 +                    shrink = 0;
 +                }
 +                else
 +                {
 +                    /* DLB minimum cell size relative to the uniform
 +                     * cell size along this dimension.
 +                     */
 +                    shrink =
 +                        comm->cellsize_min_dlb[d]/
 +                        (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
 +                }
 +                fprintf(fplog," %c %.2f",dim2char(d),shrink);
 +            }
 +        }
 +        fprintf(fplog,"\n");
 +    }
 +    else
 +    {
 +        /* np is filled by set_dd_cell_sizes_slb and printed below */
 +        set_dd_cell_sizes_slb(dd,ddbox,FALSE,np);
 +        fprintf(fplog,"The initial number of communication pulses is:");
 +        for(d=0; d<dd->ndim; d++)
 +        {
 +            fprintf(fplog," %c %d",dim2char(dd->dim[d]),np[dd->dim[d]]);
 +        }
 +        fprintf(fplog,"\n");
 +        fprintf(fplog,"The initial domain decomposition cell size is:");
 +        for(d=0; d<DIM; d++) {
 +            if (dd->nc[d] > 1)
 +            {
 +                fprintf(fplog," %c %.2f nm",
 +                        dim2char(d),dd->comm->cellsize_min[d]);
 +            }
 +        }
 +        fprintf(fplog,"\n\n");
 +    }
 +    
 +    if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
 +    {
 +        fprintf(fplog,"The maximum allowed distance for charge groups involved in interactions is:\n");
 +        fprintf(fplog,"%40s  %-7s %6.3f nm\n",
 +                "non-bonded interactions","",comm->cutoff);
 +
 +        /* limit: the cell size limit with DLB, otherwise the smallest
 +         * current minimum cell size over all three dimensions.
 +         */
 +        if (bDynLoadBal)
 +        {
 +            limit = dd->comm->cellsize_limit;
 +        }
 +        else
 +        {
 +            if (dynamic_dd_box(ddbox,ir))
 +            {
 +                fprintf(fplog,"(the following are initial values, they could change due to box deformation)\n");
 +            }
 +            limit = dd->comm->cellsize_min[XX];
 +            for(d=1; d<DIM; d++)
 +            {
 +                limit = min(limit,dd->comm->cellsize_min[d]);
 +            }
 +        }
 +
 +        if (comm->bInterCGBondeds)
 +        {
 +            fprintf(fplog,"%40s  %-7s %6.3f nm\n",
 +                    "two-body bonded interactions","(-rdd)",
 +                    max(comm->cutoff,comm->cutoff_mbody));
 +            fprintf(fplog,"%40s  %-7s %6.3f nm\n",
 +                    "multi-body bonded interactions","(-rdd)",
 +                    (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff,limit));
 +        }
 +        if (dd->vsite_comm)
 +        {
 +            fprintf(fplog,"%40s  %-7s %6.3f nm\n",
 +                    "virtual site constructions","(-rcon)",limit);
 +        }
 +        if (dd->constraint_comm)
 +        {
 +            sprintf(buf,"atoms separated by up to %d constraints",
 +                    1+ir->nProjOrder);
 +            fprintf(fplog,"%40s  %-7s %6.3f nm\n",
 +                    buf,"(-rcon)",limit);
 +        }
 +        fprintf(fplog,"\n");
 +    }
 +    
 +    fflush(fplog);
 +}
 +
 +/* Determine the communication pulse counts and minimum cell sizes
 + * that apply when dynamic load balancing is used.
 + *
 + * Per decomposed dimension this sets cd[d].np_dlb, allocates cd[d].ind
 + * with that many elements and sets cellsize_min_dlb. It also updates
 + * comm->cellsize_limit, comm->maxpulse and comm->bVacDLBNoLimit, gives
 + * cutoff_mbody a value when it was not positive, and activates the
 + * limits through set_dlb_limits when DLB is already on.
 + */
 +static void set_cell_limits_dlb(gmx_domdec_t *dd,
 +                                real dlb_scale,
 +                                const t_inputrec *ir,
 +                                const gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  d,dim,npulse,npulse_d_max,npulse_d;
 +    gmx_bool bNoCutOff;
 +
 +    comm = dd->comm;
 +
 +    /* A zero rvdw or rcoulomb is treated as "no cut-off" */
 +    bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
 +
 +    /* Determine the maximum number of comm. pulses in one dimension */
 +        
 +    comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
 +        
 +    /* Determine the maximum required number of grid pulses */
 +    if (comm->cellsize_limit >= comm->cutoff)
 +    {
 +        /* Only a single pulse is required */
 +        npulse = 1;
 +    }
 +    else if (!bNoCutOff && comm->cellsize_limit > 0)
 +    {
 +        /* We round down slightly here to avoid overhead due to the latency
 +         * of extra communication calls when the cut-off
 +         * would be only slightly longer than the cell size.
 +         * Later cellsize_limit is redetermined,
 +         * so we can not miss interactions due to this rounding.
 +         */
 +        npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
 +    }
 +    else
 +    {
 +        /* There is no cell size limit */
 +        npulse = max(dd->nc[XX]-1,max(dd->nc[YY]-1,dd->nc[ZZ]-1));
 +    }
 +
 +    if (!bNoCutOff && npulse > 1)
 +    {
 +        /* See if we can do with less pulses, based on dlb_scale */
 +        npulse_d_max = 0;
 +        for(d=0; d<dd->ndim; d++)
 +        {
 +            dim = dd->dim[d];
 +            npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
 +                             /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
 +            npulse_d_max = max(npulse_d_max,npulse_d);
 +        }
 +        npulse = min(npulse,npulse_d_max);
 +    }
 +
 +    /* This env var can override npulse */
 +    d = dd_nst_env(debug,"GMX_DD_NPULSE",0);
 +    if (d > 0)
 +    {
 +        npulse = d;
 +    }
 +
 +    comm->maxpulse = 1;
 +    /* Starts as TRUE only without pbc; cleared below as soon as one
 +     * dimension uses fewer pulses than its number of cells minus one.
 +     */
 +    comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        /* Never use more pulses than there are other cells */
 +        comm->cd[d].np_dlb = min(npulse,dd->nc[dd->dim[d]]-1);
 +        comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
 +        snew(comm->cd[d].ind,comm->cd[d].np_nalloc);
 +        comm->maxpulse = max(comm->maxpulse,comm->cd[d].np_dlb);
 +        if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
 +        {
 +            comm->bVacDLBNoLimit = FALSE;
 +        }
 +    }
 +
 +    /* cellsize_limit is set for LINCS in init_domain_decomposition */
 +    if (!comm->bVacDLBNoLimit)
 +    {
 +        comm->cellsize_limit = max(comm->cellsize_limit,
 +                                   comm->cutoff/comm->maxpulse);
 +    }
 +    comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
 +    /* Set the minimum cell size for each DD dimension */
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        if (comm->bVacDLBNoLimit ||
 +            comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
 +        {
 +            comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
 +        }
 +        else
 +        {
 +            /* np_dlb cells of the limit size do not span the cut-off,
 +             * so the cells must be at least cutoff/np_dlb wide.
 +             */
 +            comm->cellsize_min_dlb[dd->dim[d]] =
 +                comm->cutoff/comm->cd[d].np_dlb;
 +        }
 +    }
 +    if (comm->cutoff_mbody <= 0)
 +    {
 +        comm->cutoff_mbody = min(comm->cutoff,comm->cellsize_limit);
 +    }
 +    if (comm->bDynLoadBal)
 +    {
 +        set_dlb_limits(dd);
 +    }
 +}
 +
 +gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd,int ePBC)
 +{
 +    /* If each molecule is a single charge group
 +     * or we use domain decomposition for each periodic dimension,
 +     * we do not need to take pbc into account for the bonded interactions.
 +     */
 +    return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
 +            !(dd->nc[XX]>1 &&
 +              dd->nc[YY]>1 &&
 +              (dd->nc[ZZ]>1 || ePBC==epbcXY)));
 +}
 +
 +/* Finish the domain decomposition setup that could not be done in
 + * init_domain_decomposition.
 + *
 + * Sets up the per-thread data, the PME decomposition data (with PME
 + * electrostatics) and, unless DLB is off, the DLB cell limits. The
 + * settings are printed to fplog, and the global to local atom lookup
 + * (dd->ga2la) is created with a size estimate based on the volume
 + * fraction covered by all DD zones.
 + * NOTE(review): the parameter fr is not used in this function.
 + */
 +void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale,
 +                       t_inputrec *ir,t_forcerec *fr,
 +                       gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  natoms_tot;
 +    real vol_frac;
 +
 +    comm = dd->comm;
 +
 +    /* Initialize the thread data.
 +     * This can not be done in init_domain_decomposition,
 +     * as the numbers of threads is determined later.
 +     */
 +    comm->nth = gmx_omp_nthreads_get(emntDomdec);
 +    if (comm->nth > 1)
 +    {
 +        snew(comm->dth,comm->nth);
 +    }
 +
 +    if (EEL_PME(ir->coulombtype))
 +    {
 +        init_ddpme(dd,&comm->ddpme[0],0);
 +        if (comm->npmedecompdim >= 2)
 +        {
 +            init_ddpme(dd,&comm->ddpme[1],1);
 +        }
 +    }
 +    else
 +    {
 +        comm->npmenodes = 0;
 +        if (dd->pme_nodeid >= 0)
 +        {
 +            gmx_fatal_collective(FARGS,NULL,dd,
 +                                 "Can not have separate PME nodes without PME electrostatics");
 +        }
 +    }
 +        
 +    if (debug)
 +    {
 +        fprintf(debug,"The DD cut-off is %f\n",comm->cutoff);
 +    }
 +    if (comm->eDLB != edlbNO)
 +    {
 +        set_cell_limits_dlb(dd,dlb_scale,ir,ddbox);
 +    }
 +    
 +    /* Print the current settings; with DLB "auto" also print
 +     * the settings that apply once DLB gets turned on.
 +     */
 +    print_dd_settings(fplog,dd,ir,comm->bDynLoadBal,dlb_scale,ddbox);
 +    if (comm->eDLB == edlbAUTO)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog,"When dynamic load balancing gets turned on, these settings will change to:\n");
 +        }
 +        print_dd_settings(fplog,dd,ir,TRUE,dlb_scale,ddbox);
 +    }
 +
 +    if (ir->ePBC == epbcNONE)
 +    {
 +        vol_frac = 1 - 1/(double)dd->nnodes;
 +    }
 +    else
 +    {
 +        vol_frac =
 +            (1 + comm_box_frac(dd->nc,comm->cutoff,ddbox))/(double)dd->nnodes;
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug,"Volume fraction for all DD zones: %f\n",vol_frac);
 +    }
 +    /* The last entry of the global charge group index is used as
 +     * the total number of atoms.
 +     */
 +    natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
 +   
 +    dd->ga2la = ga2la_init(natoms_tot,vol_frac*natoms_tot);
 +}
 +
 +/* Try to change the domain decomposition cut-off to cutoff_req.
 + *
 + * Returns FALSE, leaving the cut-off unchanged, when the requested
 + * cut-off needs more communication pulses than the DLB setup allows,
 + * or (unless DLB is off) when any node reports that its local cell
 + * size or grid jump limit is incompatible with it.
 + * Returns TRUE after storing cutoff_req in dd->comm->cutoff.
 + */
 +gmx_bool change_dd_cutoff(t_commrec *cr,t_state *state,t_inputrec *ir,
 +                          real cutoff_req)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_ddbox_t ddbox;
 +    int d,dim,np;
 +    real inv_cell_size;
 +    int LocallyLimited;
 +
 +    dd = cr->dd;
 +
 +    set_ddbox(dd,FALSE,cr,ir,state->box,
 +              TRUE,&dd->comm->cgs_gl,state->x,&ddbox);
 +
 +    LocallyLimited = 0;
 +
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +
 +        /* Inverse of the uniform cell size, including safety margins */
 +        inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
 +        if (dynamic_dd_box(&ddbox,ir))
 +        {
 +            inv_cell_size *= DD_PRES_SCALE_MARGIN;
 +        }
 +
 +        /* Number of pulses required for the requested cut-off */
 +        np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
 +
 +        if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
 +            dd->comm->cd[d].np_dlb > 0)
 +        {
 +            /* NOTE(review): this early return happens before the
 +             * gmx_sumi call below; np appears to depend on global data
 +             * only, so all nodes should take it together - confirm.
 +             */
 +            if (np > dd->comm->cd[d].np_dlb)
 +            {
 +                return FALSE;
 +            }
 +
 +            /* If a current local cell size is smaller than the requested
 +             * cut-off, we could still fix it, but this gets very complicated.
 +             * Without fixing here, we might actually need more checks.
 +             */
 +            if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
 +            {
 +                LocallyLimited = 1;
 +            }
 +        }
 +    }
 +
 +    if (dd->comm->eDLB != edlbNO)
 +    {
 +        /* If DLB is not active yet, we don't need to check the grid jumps.
 +         * Actually we shouldn't, because then the grid jump data is not set.
 +         */
 +        if (dd->comm->bDynLoadBal &&
 +            check_grid_jump(0,dd,cutoff_req,&ddbox,FALSE))
 +        {
 +            LocallyLimited = 1; 
 +        }
 +
 +        /* Combine the local flags, so the decision is based on
 +         * the limits found on all nodes (gmx_sumi presumably sums
 +         * over the nodes in cr).
 +         */
 +        gmx_sumi(1,&LocallyLimited,cr);
 +
 +        if (LocallyLimited > 0)
 +        {
 +            return FALSE;
 +        }
 +    }
 +
 +    dd->comm->cutoff = cutoff_req;
 +
 +    return TRUE;
 +}
 +
 +/* Merge the charge groups received in communication pulse "pulse"
 + * into the per-cell ordered local arrays.
 + *
 + * The function works in two passes. First the charge groups already
 + * stored for each cell are moved up in index_gl, cg_cm, cgindex and
 + * cginfo to make room, and the send indices stored for pulses 1 up to
 + * and including "pulse" are shifted along. Then the received global
 + * indices (recv_i) and centers (recv_vr) are copied into the gaps,
 + * with cginfo and cgindex filled in for each new charge group, and
 + * the cell boundaries in ncg_cell[ncell+1..2*ncell] are updated.
 + */
 +static void merge_cg_buffers(int ncell,
 +                             gmx_domdec_comm_dim_t *cd, int pulse,
 +                             int  *ncg_cell,
 +                             int  *index_gl, int  *recv_i,
 +                             rvec *cg_cm,    rvec *recv_vr,
 +                             int *cgindex,
 +                             cginfo_mb_t *cginfo_mb,int *cginfo)
 +{
 +    gmx_domdec_ind_t *ind,*ind_p;
 +    int p,cell,c,cg,cg0,cg1,cg_gl,nat;
 +    int shift,shift_at;
 +    
 +    ind = &cd->ind[pulse];
 +    
 +    /* First correct the already stored data */
 +    shift = ind->nrecv[ncell];
 +    /* Walk the cells from last to first, so moving entries up
 +     * never overwrites data that still has to be moved.
 +     */
 +    for(cell=ncell-1; cell>=0; cell--)
 +    {
 +        /* shift is now the number of charge groups received
 +         * for the cells before this one.
 +         */
 +        shift -= ind->nrecv[cell];
 +        if (shift > 0)
 +        {
 +            /* Move the cg's present from previous grid pulses */
 +            cg0 = ncg_cell[ncell+cell];
 +            cg1 = ncg_cell[ncell+cell+1];
 +            cgindex[cg1+shift] = cgindex[cg1];
 +            for(cg=cg1-1; cg>=cg0; cg--)
 +            {
 +                index_gl[cg+shift] = index_gl[cg];
 +                copy_rvec(cg_cm[cg],cg_cm[cg+shift]);
 +                cgindex[cg+shift] = cgindex[cg];
 +                cginfo[cg+shift] = cginfo[cg];
 +            }
 +            /* Correct the already stored send indices for the shift */
 +            for(p=1; p<=pulse; p++)
 +            {
 +                ind_p = &cd->ind[p];
 +                /* Find the range of send entries for this cell */
 +                cg0 = 0;
 +                for(c=0; c<cell; c++)
 +                {
 +                    cg0 += ind_p->nsend[c];
 +                }
 +                cg1 = cg0 + ind_p->nsend[cell];
 +                for(cg=cg0; cg<cg1; cg++)
 +                {
 +                    ind_p->index[cg] += shift;
 +                }
 +            }
 +        }
 +    }
 +
 +    /* Merge in the communicated buffers */
 +    /* shift counts the charge groups, shift_at the atoms inserted so far;
 +     * cg0 is the read position in the receive buffers.
 +     */
 +    shift = 0;
 +    shift_at = 0;
 +    cg0 = 0;
 +    for(cell=0; cell<ncell; cell++)
 +    {
 +        /* First free position after the old charge groups of this cell */
 +        cg1 = ncg_cell[ncell+cell+1] + shift;
 +        if (shift_at > 0)
 +        {
 +            /* Correct the old cg indices */
 +            for(cg=ncg_cell[ncell+cell]; cg<cg1; cg++)
 +            {
 +                cgindex[cg+1] += shift_at;
 +            }
 +        }
 +        for(cg=0; cg<ind->nrecv[cell]; cg++)
 +        {
 +            /* Copy this charge group from the buffer */
 +            index_gl[cg1] = recv_i[cg0];
 +            copy_rvec(recv_vr[cg0],cg_cm[cg1]);
 +            /* Add it to the cgindex */
 +            cg_gl = index_gl[cg1];
 +            cginfo[cg1] = ddcginfo(cginfo_mb,cg_gl);
 +            nat = GET_CGINFO_NATOMS(cginfo[cg1]);
 +            cgindex[cg1+1] = cgindex[cg1] + nat;
 +            cg0++;
 +            cg1++;
 +            shift_at += nat;
 +        }
 +        shift += ind->nrecv[cell];
 +        ncg_cell[ncell+cell+1] = cg1;
 +    }
 +}
 +
 +static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
 +                               int nzone,int cg0,const int *cgindex)
 +{
 +    int cg,zone,p;
 +    
 +    /* Store the atom block boundaries for easy copying of communication buffers
 +     */
 +    cg = cg0;
 +    for(zone=0; zone<nzone; zone++)
 +    {
 +        for(p=0; p<cd->np; p++) {
 +            cd->ind[p].cell2at0[zone] = cgindex[cg];
 +            cg += cd->ind[p].nrecv[zone];
 +            cd->ind[p].cell2at1[zone] = cgindex[cg];
 +        }
 +    }
 +}
 +
 +static gmx_bool missing_link(t_blocka *link,int cg_gl,char *bLocalCG)
 +{
 +    int  i;
 +    gmx_bool bMiss;
 +
 +    bMiss = FALSE;
 +    for(i=link->index[cg_gl]; i<link->index[cg_gl+1]; i++)
 +    {
 +        if (!bLocalCG[link->a[i]])
 +        {
 +            bMiss = TRUE;
 +        }
 +    }
 +
 +    return bMiss;
 +}
 +
 +/* Domain corners for communication, a maximum of 4 i-zones see a j domain.
 + * The values are filled in by set_dd_corners() and read by
 + * get_zone_pulse_cgs(), which measures charge group distances
 + * relative to these corners.
 + */
 +typedef struct {
 +    real c[DIM][4]; /* the corners for the non-bonded communication,
 +                     * indexed as [DD dimension index][zone]
 +                     */
 +    real cr0;       /* corner for rounding, along the first DD dimension */
 +    real cr1[4];    /* corners for rounding, along the second DD dimension,
 +                     * indexed by zone
 +                     */
 +    real bc[DIM];   /* corners for bonded communication,
 +                     * indexed by DD dimension index
 +                     */
 +    real bcr1;      /* corner for rounding for bonded communication */
 +} dd_corners_t;
 +
 +/* Determine the corners of the domain(s) we are communicating with */
 +static void
 +set_dd_corners(const gmx_domdec_t *dd,
 +               int dim0, int dim1, int dim2,
 +               gmx_bool bDistMB,
 +               dd_corners_t *c)
 +{
 +    const gmx_domdec_comm_t *comm;
 +    const gmx_domdec_zones_t *zones;
 +    int i,j;
 +
 +    comm = dd->comm;
 +
 +    zones = &comm->zones;
 +
 +    /* Keep the compiler happy */
 +    c->cr0  = 0;
 +    c->bcr1 = 0;
 +
 +    /* The first dimension is equal for all cells */
 +    c->c[0][0] = comm->cell_x0[dim0];
 +    if (bDistMB)
 +    {
 +        c->bc[0] = c->c[0][0];
 +    }
 +    if (dd->ndim >= 2)
 +    {
 +        dim1 = dd->dim[1];
 +        /* This cell row is only seen from the first row */
 +        c->c[1][0] = comm->cell_x0[dim1];
 +        /* All rows can see this row */
 +        c->c[1][1] = comm->cell_x0[dim1];
 +        if (dd->bGridJump)
 +        {
 +            c->c[1][1] = max(comm->cell_x0[dim1],comm->zone_d1[1].mch0);
 +            if (bDistMB)
 +            {
 +                /* For the multi-body distance we need the maximum */
 +                c->bc[1] = max(comm->cell_x0[dim1],comm->zone_d1[1].p1_0);
 +            }
 +        }
 +        /* Set the upper-right corner for rounding */
 +        c->cr0 = comm->cell_x1[dim0];
 +        
 +        if (dd->ndim >= 3)
 +        {
 +            dim2 = dd->dim[2];
 +            for(j=0; j<4; j++)
 +            {
 +                c->c[2][j] = comm->cell_x0[dim2];
 +            }
 +            if (dd->bGridJump)
 +            {
 +                /* Use the maximum of the i-cells that see a j-cell */
 +                for(i=0; i<zones->nizone; i++)
 +                {
 +                    for(j=zones->izone[i].j0; j<zones->izone[i].j1; j++)
 +                    {
 +                        if (j >= 4)
 +                        {
 +                            c->c[2][j-4] =
 +                                max(c->c[2][j-4],
 +                                    comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
 +                        }
 +                    }
 +                }
 +                if (bDistMB)
 +                {
 +                    /* For the multi-body distance we need the maximum */
 +                    c->bc[2] = comm->cell_x0[dim2];
 +                    for(i=0; i<2; i++)
 +                    {
 +                        for(j=0; j<2; j++)
 +                        {
 +                            c->bc[2] = max(c->bc[2],comm->zone_d2[i][j].p1_0);
 +                        }
 +                    }
 +                }
 +            }
 +            
 +            /* Set the upper-right corner for rounding */
 +            /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
 +             * Only cell (0,0,0) can see cell 7 (1,1,1)
 +             */
 +            c->cr1[0] = comm->cell_x1[dim1];
 +            c->cr1[3] = comm->cell_x1[dim1];
 +            if (dd->bGridJump)
 +            {
 +                c->cr1[0] = max(comm->cell_x1[dim1],comm->zone_d1[1].mch1);
 +                if (bDistMB)
 +                {
 +                    /* For the multi-body distance we need the maximum */
 +                    c->bcr1 = max(comm->cell_x1[dim1],comm->zone_d1[1].p1_1);
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +/* Determine which cg's we need to send in this pulse from this zone.
 + *
 + * For every charge group cg in the range [cg0,cg1) a squared distance r2
 + * to the corners in c is accumulated and, when both bDistMB and
 + * bDistBonded are set, also a squared distance rb2 to the bonded corners
 + * (c->bc and c->bcr1). A charge group is selected when r2 < r_comm2
 + * or when the bonded criterion in the final test of the loop holds.
 + *
 + * For each selected charge group the local index is appended to
 + * ind->index, the global index to *ibuf and the position to vbuf;
 + * these buffers are grown when needed. The stored position is shifted
 + * by box[dim] when dd->ci[dim] == 0.
 + *
 + * On entry *nsend_ptr and *nat_ptr hold the number of cg's and atoms
 + * stored in the buffers so far; both are advanced by what is selected
 + * here. *nsend_z_ptr is set to the number of cg's selected by this call.
 + */
 +static void
 +get_zone_pulse_cgs(gmx_domdec_t *dd,
 +                   int zonei, int zone,
 +                   int cg0, int cg1,
 +                   const int *index_gl,
 +                   const int *cgindex,
 +                   int dim, int dim_ind,
 +                   int dim0, int dim1, int dim2,
 +                   real r_comm2, real r_bcomm2,
 +                   matrix box,
 +                   ivec tric_dist,
 +                   rvec *normal,
 +                   real skew_fac2_d, real skew_fac_01,
 +                   rvec *v_d, rvec *v_0, rvec *v_1,
 +                   const dd_corners_t *c,
 +                   rvec sf2_round,
 +                   gmx_bool bDistBonded,
 +                   gmx_bool bBondComm,
 +                   gmx_bool bDist2B,
 +                   gmx_bool bDistMB,
 +                   rvec *cg_cm,
 +                   int *cginfo,
 +                   gmx_domdec_ind_t *ind,
 +                   int **ibuf, int *ibuf_nalloc,
 +                   vec_rvec_t *vbuf,
 +                   int *nsend_ptr,
 +                   int *nat_ptr,
 +                   int *nsend_z_ptr)
 +{
 +    gmx_domdec_comm_t *comm;
 +    gmx_bool bScrew;
 +    gmx_bool bDistMB_pulse;
 +    int  cg,i;
 +    real r2,rb2,r,tric_sh;
 +    rvec rn,rb;
 +    int  dimd;
 +    int  nsend_z,nsend,nat;
 +
 +    comm = dd->comm;
 +
 +    bScrew = (dd->bScrewPBC && dim == XX); /* used below to mirror y and z of shifted positions */
 +
 +    bDistMB_pulse = (bDistMB && bDistBonded); /* rb2 is only tested below when both are set */
 +
 +    nsend_z = 0;          /* counts the cg's selected by this call only */
 +    nsend   = *nsend_ptr; /* running position in the send buffers */
 +    nat     = *nat_ptr;   /* running count of atoms to send */
 +
 +    for(cg=cg0; cg<cg1; cg++)
 +    {
 +        r2  = 0;
 +        rb2 = 0;
 +        if (tric_dist[dim_ind] == 0)
 +        {
 +            /* Rectangular direction, easy */
 +            r = cg_cm[cg][dim] - c->c[dim_ind][zone];
 +            if (r > 0)
 +            {
 +                r2 += r*r;
 +            }
 +            if (bDistMB_pulse)
 +            {
 +                r = cg_cm[cg][dim] - c->bc[dim_ind];
 +                if (r > 0)
 +                {
 +                    rb2 += r*r;
 +                }
 +            }
 +            /* Rounding gives at most a 16% reduction
 +             * in communicated atoms
 +             */
 +            if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
 +            {
 +                r = cg_cm[cg][dim0] - c->cr0;
 +                /* This is the first dimension, so always r >= 0 */
 +                r2 += r*r;
 +                if (bDistMB_pulse)
 +                {
 +                    rb2 += r*r;
 +                }
 +            }
 +            if (dim_ind == 2 && (zonei == 2 || zonei == 3))
 +            {
 +                r = cg_cm[cg][dim1] - c->cr1[zone];
 +                if (r > 0)
 +                {
 +                    r2 += r*r;
 +                }
 +                if (bDistMB_pulse)
 +                {
 +                    r = cg_cm[cg][dim1] - c->bcr1;
 +                    if (r > 0)
 +                    {
 +                        rb2 += r*r;
 +                    }
 +                }
 +            }
 +        }
 +        else
 +        {
 +            /* Triclinic direction, more complicated */
 +            clear_rvec(rn);
 +            clear_rvec(rb);
 +            /* Rounding, conservative as the skew_fac multiplication
 +             * will slightly underestimate the distance.
 +             */
 +            if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
 +            {
 +                rn[dim0] = cg_cm[cg][dim0] - c->cr0;
 +                for(i=dim0+1; i<DIM; i++)
 +                {
 +                    rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
 +                }
 +                r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
 +                if (bDistMB_pulse)
 +                {
 +                    rb[dim0] = rn[dim0];
 +                    rb2 = r2;
 +                }
 +                /* Take care that the cell planes along dim0 might not
 +                 * be orthogonal to those along dim1 and dim2.
 +                 */
 +                for(i=1; i<=dim_ind; i++)
 +                {
 +                    dimd = dd->dim[i];
 +                    if (normal[dim0][dimd] > 0)
 +                    {
 +                        rn[dimd] -= rn[dim0]*normal[dim0][dimd];
 +                        if (bDistMB_pulse)
 +                        {
 +                            rb[dimd] -= rb[dim0]*normal[dim0][dimd];
 +                        }
 +                    }
 +                }
 +            }
 +            if (dim_ind == 2 && (zonei == 2 || zonei == 3))
 +            {
 +                rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
 +                tric_sh = 0;
 +                for(i=dim1+1; i<DIM; i++)
 +                {
 +                    tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
 +                }
 +                rn[dim1] += tric_sh;
 +                if (rn[dim1] > 0)
 +                {
 +                    r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
 +                    /* Take care of coupling of the distances
 +                     * to the planes along dim0 and dim1 through dim2.
 +                     */
 +                    r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
 +                    /* Take care that the cell planes along dim1
 +                     * might not be orthogonal to that along dim2.
 +                     */
 +                    if (normal[dim1][dim2] > 0)
 +                    {
 +                        rn[dim2] -= rn[dim1]*normal[dim1][dim2];
 +                    }
 +                }
 +                if (bDistMB_pulse)
 +                {
 +                    rb[dim1] +=
 +                        cg_cm[cg][dim1] - c->bcr1 + tric_sh;
 +                    if (rb[dim1] > 0)
 +                    {
 +                        rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
 +                        /* Take care of coupling of the distances
 +                         * to the planes along dim0 and dim1 through dim2.
 +                         */
 +                        rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
 +                        /* Take care that the cell planes along dim1
 +                         * might not be orthogonal to that along dim2.
 +                         */
 +                        if (normal[dim1][dim2] > 0)
 +                        {
 +                            rb[dim2] -= rb[dim1]*normal[dim1][dim2];
 +                        }
 +                    }
 +                }
 +            }
 +            /* The distance along the communication direction */
 +            rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
 +            tric_sh = 0;
 +            for(i=dim+1; i<DIM; i++)
 +            {
 +                tric_sh -= cg_cm[cg][i]*v_d[i][dim];
 +            }
 +            rn[dim] += tric_sh;
 +            if (rn[dim] > 0)
 +            {
 +                r2 += rn[dim]*rn[dim]*skew_fac2_d;
 +                /* Take care of coupling of the distances
 +                 * to the planes along dim0 and dim1 through dim2.
 +                 */
 +                if (dim_ind == 1 && zonei == 1)
 +                {
 +                    r2 -= rn[dim0]*rn[dim]*skew_fac_01;
 +                }
 +            }
 +            if (bDistMB_pulse)
 +            {
 +                clear_rvec(rb); /* NOTE(review): rb is reset here (presumably
 +                                 * zeroed), unlike rn above, so the components
 +                                 * accumulated in rb by the rounding code are
 +                                 * dropped (rb2 keeps its value) and rb[dim0]
 +                                 * in the coupling term below would be zero.
 +                                 * TODO confirm this is intended.
 +                                 */
 +                rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
 +                if (rb[dim] > 0)
 +                {
 +                    rb2 += rb[dim]*rb[dim]*skew_fac2_d;
 +                    /* Take care of coupling of the distances
 +                     * to the planes along dim0 and dim1 through dim2.
 +                     */
 +                    if (dim_ind == 1 && zonei == 1)
 +                    {
 +                        rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
 +                    }
 +                }
 +            }
 +        }
 +        
 +        if (r2 < r_comm2 ||
 +            (bDistBonded &&
 +             ((bDistMB && rb2 < r_bcomm2) ||
 +              (bDist2B && r2  < r_bcomm2)) &&
 +             (!bBondComm ||
 +              (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
 +               missing_link(comm->cglink,index_gl[cg],
 +                            comm->bLocalCG)))))
 +        {
 +            /* Make an index to the local charge groups.
 +             * Both the local index buffer and the global index buffer
 +             * are grown first when they can not hold one more entry.
 +             */
 +            if (nsend+1 > ind->nalloc)
 +            {
 +                ind->nalloc = over_alloc_large(nsend+1);
 +                srenew(ind->index,ind->nalloc);
 +            }
 +            if (nsend+1 > *ibuf_nalloc)
 +            {
 +                *ibuf_nalloc = over_alloc_large(nsend+1);
 +                srenew(*ibuf,*ibuf_nalloc);
 +            }
 +            ind->index[nsend] = cg;
 +            (*ibuf)[nsend] = index_gl[cg];
 +            nsend_z++;
 +            vec_rvec_check_alloc(vbuf,nsend+1);
 +            
 +            if (dd->ci[dim] == 0)
 +            {
 +                /* Correct cg_cm for pbc */
 +                rvec_add(cg_cm[cg],box[dim],vbuf->v[nsend]);
 +                if (bScrew)
 +                {
 +                    vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
 +                    vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
 +                }
 +            }
 +            else
 +            {
 +                copy_rvec(cg_cm[cg],vbuf->v[nsend]);
 +            }
 +            nsend++;
 +            nat += cgindex[cg+1] - cgindex[cg];
 +        }
 +    }
 +
 +    *nsend_ptr   = nsend;
 +    *nat_ptr     = nat;
 +    *nsend_z_ptr = nsend_z;
 +}
 +
 +static void setup_dd_communication(gmx_domdec_t *dd,
 +                                   matrix box,gmx_ddbox_t *ddbox,
 +                                   t_forcerec *fr,t_state *state,rvec **f)
 +{   /* Set up the communication of charge groups between domains.
 +     *
 +     * For every DD dimension and every pulse in that dimension this
 +     * selects the charge groups to send with get_zone_pulse_cgs(),
 +     * exchanges the counts, the global cg indices and the cg positions
 +     * using dd_sendrecv_int/dd_sendrecv_rvec in direction dddirBackward
 +     * and stores what is received after the cg's already present,
 +     * either in place or through merge_cg_buffers().
 +     * On return the zone cg ranges, dd->index_gl, dd->cgindex,
 +     * dd->ncg_tot, dd->nat_tot and comm->nat[] have been updated.
 +     */
 +    int dim_ind,dim,dim0,dim1,dim2,dimd,p,nat_tot;
 +    int nzone,nzone_send,zone,zonei,cg0,cg1;
 +    int c,i,j,cg,cg_gl,nrcg;
 +    int *zone_cg_range,pos_cg,*index_gl,*cgindex,*recv_i;
 +    gmx_domdec_comm_t *comm;
 +    gmx_domdec_zones_t *zones;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t *ind;
 +    cginfo_mb_t *cginfo_mb;
 +    gmx_bool bBondComm,bDist2B,bDistMB,bDistBonded;
 +    real r_mb,r_comm2,r_scomm2,r_bcomm2,r_0,r_1,r2inc,inv_ncg;
 +    dd_corners_t corners;
 +    ivec tric_dist;
 +    rvec *cg_cm,*normal,*v_d,*v_0=NULL,*v_1=NULL,*recv_vr;
 +    real skew_fac2_d,skew_fac_01;
 +    rvec sf2_round;
 +    int  nsend,nat;
 +    int  th;
 +    
 +    if (debug)
 +    {
 +        fprintf(debug,"Setting up DD communication\n");
 +    }
 +    
 +    comm  = dd->comm;
 +
 +    switch (fr->cutoff_scheme)
 +    {
 +    case ecutsGROUP:
 +        cg_cm = fr->cg_cm;
 +        break;
 +    case ecutsVERLET:
 +        cg_cm = state->x;
 +        break;
 +    default:
 +        gmx_incons("unimplemented");
 +        cg_cm = NULL;
 +    }
 +
 +    for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
 +    {
 +        dim = dd->dim[dim_ind];
 +
 +        /* Check if we need to use triclinic distances.
 +         * This is the case when this or any earlier DD dimension
 +         * is flagged in ddbox->tric_dir.
 +         */
 +        tric_dist[dim_ind] = 0;
 +        for(i=0; i<=dim_ind; i++)
 +        {
 +            if (ddbox->tric_dir[dd->dim[i]])
 +            {
 +                tric_dist[dim_ind] = 1;
 +            }
 +        }
 +    }
 +
 +    bBondComm = comm->bBondComm;
 +
 +    /* Do we need to determine extra distances for multi-body bondeds? */
 +    bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
 +    
 +    /* Do we need to determine extra distances for only two-body bondeds? */
 +    bDist2B = (bBondComm && !bDistMB);
 +
 +    r_comm2  = sqr(comm->cutoff);
 +    r_bcomm2 = sqr(comm->cutoff_mbody);
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"bBondComm %d, r_bc %f\n",bBondComm,sqrt(r_bcomm2));
 +    }
 +
 +    zones = &comm->zones;
 +    
 +    dim0 = dd->dim[0];
 +    dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
 +    dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
 +
 +    set_dd_corners(dd,dim0,dim1,dim2,bDistMB,&corners);
 +    
 +    /* Triclinic stuff */
 +    normal = ddbox->normal;
 +    skew_fac_01 = 0;
 +    if (dd->ndim >= 2)
 +    {
 +        v_0 = ddbox->v[dim0];
 +        if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
 +        {
 +            /* Determine the coupling coefficient for the distances
 +             * to the cell planes along dim0 and dim1 through dim2.
 +             * This is required for correct rounding.
 +             */
 +            skew_fac_01 =
 +                ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
 +            if (debug)
 +            {
 +                fprintf(debug,"\nskew_fac_01 %f\n",skew_fac_01);
 +            }
 +        }
 +    }
 +    if (dd->ndim >= 3)
 +    {
 +        v_1 = ddbox->v[dim1];
 +    }
 +    
 +    zone_cg_range = zones->cg_range;
 +    index_gl = dd->index_gl;
 +    cgindex  = dd->cgindex;
 +    cginfo_mb = fr->cginfo_mb;
 +    
 +    zone_cg_range[0]   = 0;
 +    zone_cg_range[1]   = dd->ncg_home;
 +    comm->zone_ncg1[0] = dd->ncg_home;
 +    pos_cg             = dd->ncg_home; /* received cg's are stored after the home cg's */
 +    
 +    nat_tot = dd->nat_home;
 +    nzone = 1;
 +    for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
 +    {
 +        dim = dd->dim[dim_ind];
 +        cd = &comm->cd[dim_ind];
 +        
 +        if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
 +        {
 +            /* No pbc in this dimension, the first node should not comm. */
 +            nzone_send = 0;
 +        }
 +        else
 +        {
 +            nzone_send = nzone;
 +        }
 +
 +        v_d = ddbox->v[dim];
 +        skew_fac2_d = sqr(ddbox->skew_fac[dim]);
 +
 +        cd->bInPlace = TRUE;
 +        for(p=0; p<cd->np; p++)
 +        {
 +            /* Only atoms communicated in the first pulse are used
 +             * for multi-body bonded interactions or for bBondComm.
 +             */
 +            bDistBonded = ((bDistMB || bDist2B) && p == 0);
 +
 +            ind = &cd->ind[p];
 +            nsend = 0;
 +            nat = 0;
 +            for(zone=0; zone<nzone_send; zone++)
 +            {
 +                if (tric_dist[dim_ind] && dim_ind > 0)
 +                {
 +                    /* Determine slightly more optimized skew_fac's
 +                     * for rounding.
 +                     * This reduces the number of communicated atoms
 +                     * by about 10% for 3D DD of rhombic dodecahedra.
 +                     */
 +                    for(dimd=0; dimd<dim; dimd++)
 +                    {
 +                        sf2_round[dimd] = 1;
 +                        if (ddbox->tric_dir[dimd])
 +                        {
 +                            for(i=dd->dim[dimd]+1; i<DIM; i++)
 +                            {
 +                                /* If we are shifted in dimension i
 +                                 * and the cell plane is tilted forward
 +                                 * in dimension i, skip this coupling.
 +                                 */
 +                                if (!(zones->shift[nzone+zone][i] &&
 +                                      ddbox->v[dimd][i][dimd] >= 0))
 +                                {
 +                                    sf2_round[dimd] +=
 +                                        sqr(ddbox->v[dimd][i][dimd]);
 +                                }
 +                            }
 +                            sf2_round[dimd] = 1/sf2_round[dimd];
 +                        }
 +                    }
 +                }
 +
 +                zonei = zone_perm[dim_ind][zone];
 +                if (p == 0)
 +                {
 +                    /* Here we permute the zones to obtain a convenient order
 +                     * for neighbor searching
 +                     */
 +                    cg0 = zone_cg_range[zonei];
 +                    cg1 = zone_cg_range[zonei+1];
 +                }
 +                else
 +                {
 +                    /* Look only at the cg's received in the previous grid pulse
 +                     */
 +                    cg1 = zone_cg_range[nzone+zone+1];
 +                    cg0 = cg1 - cd->ind[p-1].nrecv[zone];
 +                }
 +
 +#pragma omp parallel for num_threads(comm->nth) schedule(static)
 +                for(th=0; th<comm->nth; th++)
 +                {
 +                    gmx_domdec_ind_t *ind_p;
 +                    int **ibuf_p,*ibuf_nalloc_p;
 +                    vec_rvec_t *vbuf_p;
 +                    int *nsend_p,*nat_p;
 +                    int *nsend_zone_p;
 +                    int cg0_th,cg1_th;
 +
 +                    if (th == 0)
 +                    {
 +                        /* Thread 0 writes in the comm buffers */
 +                        ind_p         = ind;
 +                        ibuf_p        = &comm->buf_int;
 +                        ibuf_nalloc_p = &comm->nalloc_int;
 +                        vbuf_p        = &comm->vbuf;
 +                        nsend_p       = &nsend;
 +                        nat_p         = &nat;
 +                        nsend_zone_p  = &ind->nsend[zone];
 +                    }
 +                    else
 +                    {
 +                        /* Other threads write into temp buffers */
 +                        ind_p         = &comm->dth[th].ind;
 +                        ibuf_p        = &comm->dth[th].ibuf;
 +                        ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
 +                        vbuf_p        = &comm->dth[th].vbuf;
 +                        nsend_p       = &comm->dth[th].nsend;
 +                        nat_p         = &comm->dth[th].nat;
 +                        nsend_zone_p  = &comm->dth[th].nsend_zone;
 +
 +                        comm->dth[th].nsend      = 0;
 +                        comm->dth[th].nat        = 0;
 +                        comm->dth[th].nsend_zone = 0;
 +                    }
 +
 +                    if (comm->nth == 1)
 +                    {
 +                        cg0_th = cg0;
 +                        cg1_th = cg1;
 +                    }
 +                    else
 +                    {
 +                        cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
 +                        cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
 +                    }
 +                    
 +                    /* Get the cg's for this pulse in this zone */
 +                    get_zone_pulse_cgs(dd,zonei,zone,cg0_th,cg1_th,
 +                                       index_gl,cgindex,
 +                                       dim,dim_ind,dim0,dim1,dim2,
 +                                       r_comm2,r_bcomm2,
 +                                       box,tric_dist,
 +                                       normal,skew_fac2_d,skew_fac_01,
 +                                       v_d,v_0,v_1,&corners,sf2_round,
 +                                       bDistBonded,bBondComm,
 +                                       bDist2B,bDistMB,
 +                                       cg_cm,fr->cginfo,
 +                                       ind_p,
 +                                       ibuf_p,ibuf_nalloc_p,
 +                                       vbuf_p,
 +                                       nsend_p,nat_p,
 +                                       nsend_zone_p);
 +                }
 +
 +                /* Append data of threads>=1 to the communication buffers.
 +                 * The threads handled consecutive cg ranges, so appending
 +                 * in thread order keeps the send list in cg order.
 +                 */
 +                for(th=1; th<comm->nth; th++)
 +                {
 +                    dd_comm_setup_work_t *dth;
 +                    int i,ns1;
 +
 +                    dth = &comm->dth[th];
 +
 +                    ns1 = nsend + dth->nsend_zone;
 +                    if (ns1 > ind->nalloc)
 +                    {
 +                        ind->nalloc = over_alloc_dd(ns1);
 +                        srenew(ind->index,ind->nalloc);
 +                    }
 +                    if (ns1 > comm->nalloc_int)
 +                    {
 +                        comm->nalloc_int = over_alloc_dd(ns1);
 +                        srenew(comm->buf_int,comm->nalloc_int);
 +                    }
 +                    if (ns1 > comm->vbuf.nalloc)
 +                    {
 +                        comm->vbuf.nalloc = over_alloc_dd(ns1);
 +                        srenew(comm->vbuf.v,comm->vbuf.nalloc);
 +                    }
 +
 +                    for(i=0; i<dth->nsend_zone; i++)
 +                    {
 +                        ind->index[nsend] = dth->ind.index[i];
 +                        comm->buf_int[nsend] = dth->ibuf[i];
 +                        copy_rvec(dth->vbuf.v[i],
 +                                  comm->vbuf.v[nsend]);
 +                        nsend++;
 +                    }
 +                    nat              += dth->nat;
 +                    ind->nsend[zone] += dth->nsend_zone;
 +                }
 +            }
 +            /* Clear the counts in case we do not have pbc */
 +            for(zone=nzone_send; zone<nzone; zone++)
 +            {
 +                ind->nsend[zone] = 0;
 +            }
 +            ind->nsend[nzone]   = nsend; /* total number of cg's to send */
 +            ind->nsend[nzone+1] = nat;   /* total number of atoms to send */
 +            /* Communicate the number of cg's and atoms to receive */
 +            dd_sendrecv_int(dd, dim_ind, dddirBackward,
 +                            ind->nsend, nzone+2,
 +                            ind->nrecv, nzone+2);
 +            
 +            /* The rvec buffer is also required for atom buffers of size nsend
 +             * in dd_move_x and dd_move_f.
 +             */
 +            vec_rvec_check_alloc(&comm->vbuf,ind->nsend[nzone+1]);
 +
 +            if (p > 0)
 +            {
 +                /* We can receive in place if only the last zone is not empty */
 +                for(zone=0; zone<nzone-1; zone++)
 +                {
 +                    if (ind->nrecv[zone] > 0)
 +                    {
 +                        cd->bInPlace = FALSE;
 +                    }
 +                }
 +                if (!cd->bInPlace)
 +                {
 +                    /* The int buffer is only required here for the cg indices */
 +                    if (ind->nrecv[nzone] > comm->nalloc_int2)
 +                    {
 +                        comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
 +                        srenew(comm->buf_int2,comm->nalloc_int2);
 +                    }
 +                    /* The rvec buffer is also required for atom buffers
 +                     * of size nrecv in dd_move_x and dd_move_f.
 +                     */
 +                    i = max(cd->ind[0].nrecv[nzone+1],ind->nrecv[nzone+1]);
 +                    vec_rvec_check_alloc(&comm->vbuf2,i);
 +                }
 +            }
 +            
 +            /* Make space for the global cg indices.
 +             * cgindex holds one entry more than index_gl.
 +             */
 +            if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
 +                || dd->cg_nalloc == 0)
 +            {
 +                dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
 +                srenew(index_gl,dd->cg_nalloc);
 +                srenew(cgindex,dd->cg_nalloc+1);
 +            }
 +            /* Communicate the global cg indices */
 +            if (cd->bInPlace)
 +            {
 +                recv_i = index_gl + pos_cg;
 +            }
 +            else
 +            {
 +                recv_i = comm->buf_int2;
 +            }
 +            dd_sendrecv_int(dd, dim_ind, dddirBackward,
 +                            comm->buf_int, nsend,
 +                            recv_i,        ind->nrecv[nzone]);
 +
 +            /* Make space for cg_cm.
 +             * The pointer is read again after the call below.
 +             */
 +            dd_check_alloc_ncg(fr,state,f,pos_cg + ind->nrecv[nzone]);
 +            if (fr->cutoff_scheme == ecutsGROUP)
 +            {
 +                cg_cm = fr->cg_cm;
 +            }
 +            else
 +            {
 +                cg_cm = state->x;
 +            }
 +            /* Communicate cg_cm */
 +            if (cd->bInPlace)
 +            {
 +                recv_vr = cg_cm + pos_cg;
 +            }
 +            else
 +            {
 +                recv_vr = comm->vbuf2.v;
 +            }
 +            dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
 +                             comm->vbuf.v, nsend,
 +                             recv_vr,      ind->nrecv[nzone]);
 +            
 +            /* Make the charge group index */
 +            if (cd->bInPlace)
 +            {
 +                zone = (p == 0 ? 0 : nzone - 1);
 +                while (zone < nzone)
 +                {
 +                    for(cg=0; cg<ind->nrecv[zone]; cg++)
 +                    {
 +                        cg_gl = index_gl[pos_cg];
 +                        fr->cginfo[pos_cg] = ddcginfo(cginfo_mb,cg_gl);
 +                        nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
 +                        cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
 +                        if (bBondComm)
 +                        {
 +                            /* Update the charge group presence,
 +                             * so we can use it in the next pass of the loop.
 +                             */
 +                            comm->bLocalCG[cg_gl] = TRUE;
 +                        }
 +                        pos_cg++;
 +                    }
 +                    if (p == 0)
 +                    {
 +                        comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
 +                    }
 +                    zone++;
 +                    zone_cg_range[nzone+zone] = pos_cg;
 +                }
 +            }
 +            else
 +            {
 +                /* This part of the code is never executed with bBondComm. */
 +                merge_cg_buffers(nzone,cd,p,zone_cg_range,
 +                                 index_gl,recv_i,cg_cm,recv_vr,
 +                                 cgindex,fr->cginfo_mb,fr->cginfo);
 +                pos_cg += ind->nrecv[nzone];
 +            }
 +            nat_tot += ind->nrecv[nzone+1];
 +        }
 +        if (!cd->bInPlace)
 +        {
 +            /* Store the atom block for easy copying of communication buffers */
 +            make_cell2at_index(cd,nzone,zone_cg_range[nzone],cgindex);
 +        }
 +        nzone += nzone; /* each DD dimension doubles the number of zones */
 +    }
 +    dd->index_gl = index_gl; /* store back, the arrays might have been reallocated */
 +    dd->cgindex  = cgindex;
 +    
 +    dd->ncg_tot = zone_cg_range[zones->n];
 +    dd->nat_tot = nat_tot;
 +    comm->nat[ddnatHOME] = dd->nat_home;
 +    for(i=ddnatZONE; i<ddnatNR; i++)
 +    {
 +        comm->nat[i] = dd->nat_tot;
 +    }
 +
 +    if (!bBondComm)
 +    {
 +        /* We don't need to update cginfo, since that was already done above.
 +         * So we pass NULL for the forcerec.
 +         */
 +        dd_set_cginfo(dd->index_gl,dd->ncg_home,dd->ncg_tot,
 +                      NULL,comm->bLocalCG);
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"Finished setting up DD communication, zones:");
 +        for(c=0; c<zones->n; c++)
 +        {
 +            fprintf(debug," %d",zones->cg_range[c+1]-zones->cg_range[c]);
 +        }
 +        fprintf(debug,"\n");
 +    }
 +}
 +
 +static void set_cg_boundaries(gmx_domdec_zones_t *zones)
 +{
 +    int c;
 +    
 +    for(c=0; c<zones->nizone; c++)
 +    {
 +        zones->izone[c].cg1  = zones->cg_range[c+1];
 +        zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
 +        zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
 +    }
 +}
 +
 +/* Set the extents of the DD zones zone_start up to, but not including,
 + * zone_end: the limits x0/x1 along each dimension and the bounding box
 + * bb_x0/bb_x1, which takes the triclinic couplings in box into account.
 + * When zone_start is 0, zones->dens_zone0 is set from the charge group
 + * count and the volume of zone 0.
 + * With debug output enabled the resulting limits are printed.
 + *
 + * NOTE(review): zj0, zj1, size_j and add_tric look unused in the merged
 + * code (size_j and add_tric only occur on lines removed by the merge).
 + */
 +static void set_zones_size(gmx_domdec_t *dd,
 +                           matrix box,const gmx_ddbox_t *ddbox,
 +                           int zone_start,int zone_end)
 +{
 +    gmx_domdec_comm_t *comm;
 +    gmx_domdec_zones_t *zones;
 +    gmx_bool bDistMB;
 +    int  z,zi,zj0,zj1,d,dim;
 +    real rcs,rcmbs;
 +    int  i,j;
 +    real size_j,add_tric;
 +    real vol;
 +
 +    comm = dd->comm;
 +
 +    zones = &comm->zones;
 +
 +    /* Do we need to determine extra distances for multi-body bondeds? */
 +    bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
 +
 +    for(z=zone_start; z<zone_end; z++)
 +    {
 +        /* Copy cell limits to zone limits.
 +         * Valid for non-DD dims and non-shifted dims.
 +         */
 +        copy_rvec(comm->cell_x0,zones->size[z].x0);
 +        copy_rvec(comm->cell_x1,zones->size[z].x1);
 +    }
 +
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +
 +        for(z=0; z<zones->n; z++)
 +        {
 +            /* With a staggered grid we have different sizes
 +             * for non-shifted dimensions.
 +             */
 +            if (dd->bGridJump && zones->shift[z][dim] == 0)
 +            {
 +                if (d == 1)
 +                {
 +                    zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
 +                    zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
 +                }
 +                else if (d == 2)
 +                {
 +                    zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
 +                    zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
 +                }
 +            }
 +        }
 +
 +        rcs   = comm->cutoff;
 +        rcmbs = comm->cutoff_mbody;
 +        if (ddbox->tric_dir[dim])
 +        {
 +            rcs   /= ddbox->skew_fac[dim];
 +            rcmbs /= ddbox->skew_fac[dim];
 +        }
 +
 +        /* Set the lower limit for the shifted zone dimensions */
 +        for(z=zone_start; z<zone_end; z++)
 +        {
 +            if (zones->shift[z][dim] > 0)
 +            {
 +                /* NOTE(review): dim already holds dd->dim[d] here */
 +                dim = dd->dim[d];
 +                if (!dd->bGridJump || d == 0)
 +                {
 +                    zones->size[z].x0[dim] = comm->cell_x1[dim];
 +                    zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
 +                }
 +                else
 +                {
 +                    /* Here we take the lower limit of the zone from
 +                     * the lowest domain of the zone below.
 +                     */
 +                    if (z < 4)
 +                    {
 +                        zones->size[z].x0[dim] =
 +                             comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
 +                    }
 +                    else
 +                    {
 +                        if (d == 1)
 +                        {
 +                            zones->size[z].x0[dim] =
 +                                zones->size[zone_perm[2][z-4]].x0[dim];
 +                        }
 +                        else
 +                        {
 +                            zones->size[z].x0[dim] =
 +                                comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
 +                        }
 +                    }
 +                    /* A temporary limit, is updated below */
 +                    zones->size[z].x1[dim] = zones->size[z].x0[dim];
 +
 +                    if (bDistMB)
 +                    {
 +                        for(zi=0; zi<zones->nizone; zi++)
 +                        {
 +                            if (zones->shift[zi][dim] == 0)
 +                            {
 +                                /* This takes the whole zone into account.
 +                                 * With multiple pulses this will lead
 +                                 * to a larger zone than strictly necessary.
 +                                 */
 +                                zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
 +                                                             zones->size[zi].x1[dim]+rcmbs);
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +
 +        /* Loop over the i-zones to set the upper limit of each
 +         * j-zone they see.
 +         */
 +        for(zi=0; zi<zones->nizone; zi++)
 +        {
 +            if (zones->shift[zi][dim] == 0)
 +            {
 +                for(z=zones->izone[zi].j0; z<zones->izone[zi].j1; z++)
 +                {
 +                    if (zones->shift[z][dim] > 0)
 +                    {
 +                        zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
 +                                                     zones->size[zi].x1[dim]+rcs);
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    for(z=zone_start; z<zone_end; z++)
 +    {
-             for(j=i+1; j<ddbox->npbcdim; j++)
++        /* Initialization only required to keep the compiler happy */
++        rvec corner_min={0,0,0},corner_max={0,0,0},corner;
++        int  nc,c;
 +
-                  * the triclinic box, but trilinic x-y and rectangular y-z.
++        /* To determine the bounding box for a zone we need to find
++         * the extreme corners of 4, 2 or 1 corners.
++         */
++        nc = 1 << (ddbox->npbcdim - 1);
++
++        for(c=0; c<nc; c++)
++        {
++            /* Set up a zone corner at x=0, ignoring triclinic couplings */
++            corner[XX] = 0;
++            if ((c & 1) == 0)
++            {
++                corner[YY] = zones->size[z].x0[YY];
++            }
++            else
++            {
++                corner[YY] = zones->size[z].x1[YY];
++            }
++            if ((c & 2) == 0)
++            {
++                corner[ZZ] = zones->size[z].x0[ZZ];
++            }
++            else
++            {
++                corner[ZZ] = zones->size[z].x1[ZZ];
++            }
++            if (dd->ndim == 1 && box[ZZ][YY] != 0)
 +            {
 +                /* With 1D domain decomposition the cg's are not in
-                 if (box[j][i] != 0 &&
-                     !(dd->ndim == 1 && i == YY && j == ZZ))
++                 * the triclinic box, but triclinic x-y and rectangular y-z.
++                 * Shift y back, so it will later end up at 0.
 +                 */
-                     /* Correct for triclinic offset of the lower corner */
-                     add_tric = zones->size[z].x0[j]*box[j][i]/box[j][j];
-                     zones->size[z].bb_x0[i] += add_tric;
-                     zones->size[z].bb_x1[i] += add_tric;
-                     /* Correct for triclinic offset of the upper corner */
-                     size_j = zones->size[z].x1[j] - zones->size[z].x0[j];
-                     add_tric = size_j*box[j][i]/box[j][j];
-                     if (box[j][i] < 0)
-                     {
-                         zones->size[z].bb_x0[i] += add_tric;
-                     }
-                     else
-                     {
-                         zones->size[z].bb_x1[i] += add_tric;
-                     }
++                corner[YY] -= corner[ZZ]*box[ZZ][YY]/box[ZZ][ZZ];
++            }
++            /* Apply the triclinic couplings */
++            for(i=YY; i<ddbox->npbcdim; i++)
++            {
++                for(j=XX; j<i; j++)
 +                {
++                    corner[j] += corner[i]*box[i][j]/box[i][i];
 +                }
 +            }
++            if (c == 0)
++            {
++                copy_rvec(corner,corner_min);
++                copy_rvec(corner,corner_max);
++            }
++            else
++            {
++                for(i=0; i<DIM; i++)
++                {
++                    corner_min[i] = min(corner_min[i],corner[i]);
++                    corner_max[i] = max(corner_max[i],corner[i]);
++                }
++            }
++        }
++        /* Copy the extreme corners without offset along x */
++        for(i=0; i<DIM; i++)
++        {
++            zones->size[z].bb_x0[i] = corner_min[i];
++            zones->size[z].bb_x1[i] = corner_max[i];
 +        }
++        /* Add the offset along x */
++        zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
++        zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
 +    }
 +
 +    if (zone_start == 0)
 +    {
 +        vol = 1;
 +        for(dim=0; dim<DIM; dim++)
 +        {
 +            vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
 +        }
 +        zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
 +    }
 +
 +    if (debug)
 +    {
 +        for(z=zone_start; z<zone_end; z++)
 +        {
 +            fprintf(debug,"zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
 +                    z,
 +                    zones->size[z].x0[XX],zones->size[z].x1[XX],
 +                    zones->size[z].x0[YY],zones->size[z].x1[YY],
 +                    zones->size[z].x0[ZZ],zones->size[z].x1[ZZ]);
 +            fprintf(debug,"zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
 +                    z,
 +                    zones->size[z].bb_x0[XX],zones->size[z].bb_x1[XX],
 +                    zones->size[z].bb_x0[YY],zones->size[z].bb_x1[YY],
 +                    zones->size[z].bb_x0[ZZ],zones->size[z].bb_x1[ZZ]);
 +        }
 +    }
 +}
 +
 +static int comp_cgsort(const void *a,const void *b)
 +{
 +    int comp;
 +    
 +    gmx_cgsort_t *cga,*cgb;
 +    cga = (gmx_cgsort_t *)a;
 +    cgb = (gmx_cgsort_t *)b;
 +    
 +    comp = cga->nsc - cgb->nsc;
 +    if (comp == 0)
 +    {
 +        comp = cga->ind_gl - cgb->ind_gl;
 +    }
 +    
 +    return comp;
 +}
 +
 +static void order_int_cg(int n,const gmx_cgsort_t *sort,
 +                         int *a,int *buf)
 +{
 +    int i;
 +    
 +    /* Order the data */
 +    for(i=0; i<n; i++)
 +    {
 +        buf[i] = a[sort[i].ind];
 +    }
 +    
 +    /* Copy back to the original array */
 +    for(i=0; i<n; i++)
 +    {
 +        a[i] = buf[i];
 +    }
 +}
 +
 +static void order_vec_cg(int n,const gmx_cgsort_t *sort,
 +                         rvec *v,rvec *buf)
 +{
 +    int i;
 +    
 +    /* Order the data */
 +    for(i=0; i<n; i++)
 +    {
 +        copy_rvec(v[sort[i].ind],buf[i]);
 +    }
 +    
 +    /* Copy back to the original array */
 +    for(i=0; i<n; i++)
 +    {
 +        copy_rvec(buf[i],v[i]);
 +    }
 +}
 +
 +static void order_vec_atom(int ncg,const int *cgindex,const gmx_cgsort_t *sort,
 +                           rvec *v,rvec *buf)
 +{
 +    int a,atot,cg,cg0,cg1,i;
 +    
 +    if (cgindex == NULL)
 +    {
 +        /* Avoid the useless loop of the atoms within a cg */
 +        order_vec_cg(ncg,sort,v,buf);
 +
 +        return;
 +    }
 +
 +    /* Order the data */
 +    a = 0;
 +    for(cg=0; cg<ncg; cg++)
 +    {
 +        cg0 = cgindex[sort[cg].ind];
 +        cg1 = cgindex[sort[cg].ind+1];
 +        for(i=cg0; i<cg1; i++)
 +        {
 +            copy_rvec(v[i],buf[a]);
 +            a++;
 +        }
 +    }
 +    atot = a;
 +    
 +    /* Copy back to the original array */
 +    for(a=0; a<atot; a++)
 +    {
 +        copy_rvec(buf[a],v[a]);
 +    }
 +}
 +
 +static void ordered_sort(int nsort2,gmx_cgsort_t *sort2,
 +                         int nsort_new,gmx_cgsort_t *sort_new,
 +                         gmx_cgsort_t *sort1)
 +{
 +    int i1,i2,i_new;
 +    
 +    /* The new indices are not very ordered, so we qsort them */
 +    qsort_threadsafe(sort_new,nsort_new,sizeof(sort_new[0]),comp_cgsort);
 +    
 +    /* sort2 is already ordered, so now we can merge the two arrays */
 +    i1 = 0;
 +    i2 = 0;
 +    i_new = 0;
 +    while(i2 < nsort2 || i_new < nsort_new)
 +    {
 +        if (i2 == nsort2)
 +        {
 +            sort1[i1++] = sort_new[i_new++];
 +        }
 +        else if (i_new == nsort_new)
 +        {
 +            sort1[i1++] = sort2[i2++];
 +        }
 +        else if (sort2[i2].nsc < sort_new[i_new].nsc ||
 +                 (sort2[i2].nsc == sort_new[i_new].nsc &&
 +                  sort2[i2].ind_gl < sort_new[i_new].ind_gl))
 +        {
 +            sort1[i1++] = sort2[i2++];
 +        }
 +        else
 +        {
 +            sort1[i1++] = sort_new[i_new++];
 +        }
 +    }
 +}
 +
 +static int dd_sort_order(gmx_domdec_t *dd,t_forcerec *fr,int ncg_home_old)
 +{
 +    gmx_domdec_sort_t *sort;
 +    gmx_cgsort_t *cgsort,*sort_i;
 +    int  ncg_new,nsort2,nsort_new,i,*a,moved,*ibuf;
 +    int  sort_last,sort_skip;
 +
 +    sort = dd->comm->sort;
 +
 +    a = fr->ns.grid->cell_index;
 +
 +    moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;
 +
 +    if (ncg_home_old >= 0)
 +    {
 +        /* The charge groups that remained in the same ns grid cell
 +         * are completely ordered. So we can sort efficiently by sorting
 +         * the charge groups that did move into the stationary list.
 +         */
 +        ncg_new = 0;
 +        nsort2 = 0;
 +        nsort_new = 0;
 +        for(i=0; i<dd->ncg_home; i++)
 +        {
 +            /* Check if this cg did not move to another node */
 +            if (a[i] < moved)
 +            {
 +                if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
 +                {
 +                    /* This cg is new on this node or moved ns grid cell */
 +                    if (nsort_new >= sort->sort_new_nalloc)
 +                    {
 +                        sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
 +                        srenew(sort->sort_new,sort->sort_new_nalloc);
 +                    }
 +                    sort_i = &(sort->sort_new[nsort_new++]);
 +                }
 +                else
 +                {
 +                    /* This cg did not move */
 +                    sort_i = &(sort->sort2[nsort2++]);
 +                }
 +                /* Sort on the ns grid cell indices
 +                 * and the global topology index.
 +                 * index_gl is irrelevant with cell ns,
 +                 * but we set it here anyhow to avoid a conditional.
 +                 */
 +                sort_i->nsc    = a[i];
 +                sort_i->ind_gl = dd->index_gl[i];
 +                sort_i->ind    = i;
 +                ncg_new++;
 +            }
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug,"ordered sort cgs: stationary %d moved %d\n",
 +                    nsort2,nsort_new);
 +        }
 +        /* Sort efficiently */
 +        ordered_sort(nsort2,sort->sort2,nsort_new,sort->sort_new,
 +                     sort->sort);
 +    }
 +    else
 +    {
 +        cgsort = sort->sort;
 +        ncg_new = 0;
 +        for(i=0; i<dd->ncg_home; i++)
 +        {
 +            /* Sort on the ns grid cell indices
 +             * and the global topology index
 +             */
 +            cgsort[i].nsc    = a[i];
 +            cgsort[i].ind_gl = dd->index_gl[i];
 +            cgsort[i].ind    = i;
 +            if (cgsort[i].nsc < moved)
 +            {
 +                ncg_new++;
 +            }
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug,"qsort cgs: %d new home %d\n",dd->ncg_home,ncg_new);
 +        }
 +        /* Determine the order of the charge groups using qsort */
 +        qsort_threadsafe(cgsort,dd->ncg_home,sizeof(cgsort[0]),comp_cgsort);
 +    }
 +
 +    return ncg_new;
 +}
 +
 +static int dd_sort_order_nbnxn(gmx_domdec_t *dd,t_forcerec *fr)
 +{
 +    gmx_cgsort_t *sort;
 +    int  ncg_new,i,*a,na;
 +
 +    sort = dd->comm->sort->sort;
 +
 +    nbnxn_get_atomorder(fr->nbv->nbs,&a,&na);
 +
 +    ncg_new = 0;
 +    for(i=0; i<na; i++)
 +    {
 +        if (a[i] >= 0)
 +        {
 +            sort[ncg_new].ind = a[i];
 +            ncg_new++;
 +        }
 +    }
 +
 +    return ncg_new;
 +}
 +
 +static void dd_sort_state(gmx_domdec_t *dd,int ePBC,
 +                          rvec *cgcm,t_forcerec *fr,t_state *state,
 +                          int ncg_home_old)
 +{
 +    gmx_domdec_sort_t *sort;
 +    gmx_cgsort_t *cgsort,*sort_i;
 +    int  *cgindex;
 +    int  ncg_new,i,*ibuf,cgsize;
 +    rvec *vbuf;
 +    
 +    sort = dd->comm->sort;
 +    
 +    if (dd->ncg_home > sort->sort_nalloc)
 +    {
 +        sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
 +        srenew(sort->sort,sort->sort_nalloc);
 +        srenew(sort->sort2,sort->sort_nalloc);
 +    }
 +    cgsort = sort->sort;
 +
 +    switch (fr->cutoff_scheme)
 +    {
 +    case ecutsGROUP:
 +        ncg_new = dd_sort_order(dd,fr,ncg_home_old);
 +        break;
 +    case ecutsVERLET:
 +        ncg_new = dd_sort_order_nbnxn(dd,fr);
 +        break;
 +    default:
 +        gmx_incons("unimplemented");
 +        ncg_new = 0;
 +    }
 +
 +    /* We alloc with the old size, since cgindex is still old */
 +    vec_rvec_check_alloc(&dd->comm->vbuf,dd->cgindex[dd->ncg_home]);
 +    vbuf = dd->comm->vbuf.v;
 +    
 +    if (dd->comm->bCGs)
 +    {
 +        cgindex = dd->cgindex;
 +    }
 +    else
 +    {
 +        cgindex = NULL;
 +    }
 +
 +    /* Remove the charge groups which are no longer at home here */
 +    dd->ncg_home = ncg_new;
 +    if (debug)
 +    {
 +        fprintf(debug,"Set the new home charge group count to %d\n",
 +                dd->ncg_home);
 +    }
 +    
 +    /* Reorder the state */
 +    for(i=0; i<estNR; i++)
 +    {
 +        if (EST_DISTR(i) && (state->flags & (1<<i)))
 +        {
 +            switch (i)
 +            {
 +            case estX:
 +                order_vec_atom(dd->ncg_home,cgindex,cgsort,state->x,vbuf);
 +                break;
 +            case estV:
 +                order_vec_atom(dd->ncg_home,cgindex,cgsort,state->v,vbuf);
 +                break;
 +            case estSDX:
 +                order_vec_atom(dd->ncg_home,cgindex,cgsort,state->sd_X,vbuf);
 +                break;
 +            case estCGP:
 +                order_vec_atom(dd->ncg_home,cgindex,cgsort,state->cg_p,vbuf);
 +                break;
 +            case estLD_RNG:
 +            case estLD_RNGI:
 +            case estDISRE_INITF:
 +            case estDISRE_RM3TAV:
 +            case estORIRE_INITF:
 +            case estORIRE_DTAV:
 +                /* No ordering required */
 +                break;
 +            default:
 +                gmx_incons("Unknown state entry encountered in dd_sort_state");
 +                break;
 +            }
 +        }
 +    }
 +    if (fr->cutoff_scheme == ecutsGROUP)
 +    {
 +        /* Reorder cgcm */
 +        order_vec_cg(dd->ncg_home,cgsort,cgcm,vbuf);
 +    }
 +    
 +    if (dd->ncg_home+1 > sort->ibuf_nalloc)
 +    {
 +        sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
 +        srenew(sort->ibuf,sort->ibuf_nalloc);
 +    }
 +    ibuf = sort->ibuf;
 +    /* Reorder the global cg index */
 +    order_int_cg(dd->ncg_home,cgsort,dd->index_gl,ibuf);
 +    /* Reorder the cginfo */
 +    order_int_cg(dd->ncg_home,cgsort,fr->cginfo,ibuf);
 +    /* Rebuild the local cg index */
 +    if (dd->comm->bCGs)
 +    {
 +        ibuf[0] = 0;
 +        for(i=0; i<dd->ncg_home; i++)
 +        {
 +            cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
 +            ibuf[i+1] = ibuf[i] + cgsize;
 +        }
 +        for(i=0; i<dd->ncg_home+1; i++)
 +        {
 +            dd->cgindex[i] = ibuf[i];
 +        }
 +    }
 +    else
 +    {
 +        for(i=0; i<dd->ncg_home+1; i++)
 +        {
 +            dd->cgindex[i] = i;
 +        }
 +    }
 +    /* Set the home atom number */
 +    dd->nat_home = dd->cgindex[dd->ncg_home];
 +
 +    if (fr->cutoff_scheme == ecutsVERLET)
 +    {
 +        /* The atoms are now exactly in grid order, update the grid order */
 +        nbnxn_set_atomorder(fr->nbv->nbs);
 +    }
 +    else
 +    {
 +        /* Copy the sorted ns cell indices back to the ns grid struct */
 +        for(i=0; i<dd->ncg_home; i++)
 +        {
 +            fr->ns.grid->cell_index[i] = cgsort[i].nsc;
 +        }
 +        fr->ns.grid->nr = dd->ncg_home;
 +    }
 +}
 +
 +static void add_dd_statistics(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int ddnat;
 +    
 +    comm = dd->comm;
 +    
 +    for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
 +    {
 +        comm->sum_nat[ddnat-ddnatZONE] +=
 +            comm->nat[ddnat] - comm->nat[ddnat-1];
 +    }
 +    comm->ndecomp++;
 +}
 +
 +void reset_dd_statistics_counters(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int ddnat;
 +    
 +    comm = dd->comm;
 +
 +    /* Reset all the statistics and counters for total run counting */
 +    for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
 +    {
 +        comm->sum_nat[ddnat-ddnatZONE] = 0;
 +    }
 +    comm->ndecomp = 0;
 +    comm->nload = 0;
 +    comm->load_step = 0;
 +    comm->load_sum = 0;
 +    comm->load_max = 0;
 +    clear_ivec(comm->load_lim);
 +    comm->load_mdf = 0;
 +    comm->load_pme = 0;
 +}
 +
 +void print_dd_statistics(t_commrec *cr,t_inputrec *ir,FILE *fplog)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int ddnat;
 +    double av;
 +   
 +    comm = cr->dd->comm;
 +    
 +    gmx_sumd(ddnatNR-ddnatZONE,comm->sum_nat,cr);
 +    
 +    if (fplog == NULL)
 +    {
 +        return;
 +    }
 +    
 +    fprintf(fplog,"\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
 +            
 +    for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
 +    {
 +        av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
 +        switch(ddnat)
 +        {
 +        case ddnatZONE:
 +            fprintf(fplog,
 +                    " av. #atoms communicated per step for force:  %d x %.1f\n",
 +                    2,av);
 +            break;
 +        case ddnatVSITE:
 +            if (cr->dd->vsite_comm)
 +            {
 +                fprintf(fplog,
 +                        " av. #atoms communicated per step for vsites: %d x %.1f\n",
 +                        (EEL_PME(ir->coulombtype) || ir->coulombtype==eelEWALD) ? 3 : 2,
 +                        av);
 +            }
 +            break;
 +        case ddnatCON:
 +            if (cr->dd->constraint_comm)
 +            {
 +                fprintf(fplog,
 +                        " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
 +                        1 + ir->nLincsIter,av);
 +            }
 +            break;
 +        default:
 +            gmx_incons(" Unknown type for DD statistics");
 +        }
 +    }
 +    fprintf(fplog,"\n");
 +    
 +    if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
 +    {
 +        print_dd_load_av(fplog,cr->dd);
 +    }
 +}
 +
 +void dd_partition_system(FILE            *fplog,
 +                         gmx_large_int_t      step,
 +                         t_commrec       *cr,
 +                         gmx_bool            bMasterState,
 +                         int             nstglobalcomm,
 +                         t_state         *state_global,
 +                         gmx_mtop_t      *top_global,
 +                         t_inputrec      *ir,
 +                         t_state         *state_local,
 +                         rvec            **f,
 +                         t_mdatoms       *mdatoms,
 +                         gmx_localtop_t  *top_local,
 +                         t_forcerec      *fr,
 +                         gmx_vsite_t     *vsite,
 +                         gmx_shellfc_t   shellfc,
 +                         gmx_constr_t    constr,
 +                         t_nrnb          *nrnb,
 +                         gmx_wallcycle_t wcycle,
 +                         gmx_bool            bVerbose)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_domdec_comm_t *comm;
 +    gmx_ddbox_t ddbox={0};
 +    t_block *cgs_gl;
 +    gmx_large_int_t step_pcoupl;
 +    rvec cell_ns_x0,cell_ns_x1;
 +    int  i,j,n,cg0=0,ncg_home_old=-1,ncg_moved,nat_f_novirsum;
 +    gmx_bool bBoxChanged,bNStGlobalComm,bDoDLB,bCheckDLB,bTurnOnDLB,bLogLoad;
 +    gmx_bool bRedist,bSortCG,bResortAll;
 +    ivec ncells_old={0,0,0},ncells_new={0,0,0},np;
 +    real grid_density;
 +    char sbuf[22];
 +      
 +    dd = cr->dd;
 +    comm = dd->comm;
 +
 +    bBoxChanged = (bMasterState || DEFORM(*ir));
 +    if (ir->epc != epcNO)
 +    {
 +        /* With nstpcouple > 1 pressure coupling happens
 +         * one step after calculating the pressure.
 +         * Box scaling happens at the end of the MD step,
 +         * after the DD partitioning.
 +         * We therefore have to do DLB in the first partitioning
 +         * after an MD step where P-coupling occurred.
 +         * We need to determine the last step in which p-coupling occurred.
 +         * MRS -- need to validate this for vv?
 +         */
 +        n = ir->nstpcouple;
 +        if (n == 1)
 +        {
 +            step_pcoupl = step - 1;
 +        }
 +        else
 +        {
 +            step_pcoupl = ((step - 1)/n)*n + 1;
 +        }
 +        if (step_pcoupl >= comm->partition_step)
 +        {
 +            bBoxChanged = TRUE;
 +        }
 +    }
 +
 +    bNStGlobalComm = (step % nstglobalcomm == 0);
 +
 +    if (!comm->bDynLoadBal)
 +    {
 +        bDoDLB = FALSE;
 +    }
 +    else
 +    {
 +        /* Should we do dynamic load balancing this step?
 +         * Since it requires (possibly expensive) global communication,
 +         * we might want to do DLB less frequently.
 +         */
 +        if (bBoxChanged || ir->epc != epcNO)
 +        {
 +            bDoDLB = bBoxChanged;
 +        }
 +        else
 +        {
 +            bDoDLB = bNStGlobalComm;
 +        }
 +    }
 +
 +    /* Check if we have recorded loads on the nodes */
 +    if (comm->bRecordLoad && dd_load_count(comm))
 +    {
 +        if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
 +        {
 +            /* Check if we should use DLB at the second partitioning
 +             * and every 100 partitionings,
 +             * so the extra communication cost is negligible.
 +             */
 +            n = max(100,nstglobalcomm);
 +            bCheckDLB = (comm->n_load_collect == 0 ||
 +                         comm->n_load_have % n == n-1);
 +        }
 +        else
 +        {
 +            bCheckDLB = FALSE;
 +        }
 +        
 +        /* Print load every nstlog, first and last step to the log file */
 +        bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
 +                    comm->n_load_collect == 0 ||
 +                    (ir->nsteps >= 0 &&
 +                     (step + ir->nstlist > ir->init_step + ir->nsteps)));
 +
 +        /* Avoid extra communication due to verbose screen output
 +         * when nstglobalcomm is set.
 +         */
 +        if (bDoDLB || bLogLoad || bCheckDLB ||
 +            (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
 +        {
 +            get_load_distribution(dd,wcycle);
 +            if (DDMASTER(dd))
 +            {
 +                if (bLogLoad)
 +                {
 +                    dd_print_load(fplog,dd,step-1);
 +                }
 +                if (bVerbose)
 +                {
 +                    dd_print_load_verbose(dd);
 +                }
 +            }
 +            comm->n_load_collect++;
 +
 +            if (bCheckDLB) {
 +                /* Since the timings are node dependent, the master decides */
 +                if (DDMASTER(dd))
 +                {
 +                    bTurnOnDLB =
 +                        (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
 +                    if (debug)
 +                    {
 +                        fprintf(debug,"step %s, imb loss %f\n",
 +                                gmx_step_str(step,sbuf),
 +                                dd_force_imb_perf_loss(dd));
 +                    }
 +                }
 +                dd_bcast(dd,sizeof(bTurnOnDLB),&bTurnOnDLB);
 +                if (bTurnOnDLB)
 +                {
 +                    turn_on_dlb(fplog,cr,step);
 +                    bDoDLB = TRUE;
 +                }
 +            }
 +        }
 +        comm->n_load_have++;
 +    }
 +
 +    cgs_gl = &comm->cgs_gl;
 +
 +    bRedist = FALSE;
 +    if (bMasterState)
 +    {
 +        /* Clear the old state */
 +        clear_dd_indices(dd,0,0);
 +
 +        set_ddbox(dd,bMasterState,cr,ir,state_global->box,
 +                  TRUE,cgs_gl,state_global->x,&ddbox);
 +    
 +        get_cg_distribution(fplog,step,dd,cgs_gl,
 +                            state_global->box,&ddbox,state_global->x);
 +        
 +        dd_distribute_state(dd,cgs_gl,
 +                            state_global,state_local,f);
 +        
 +        dd_make_local_cgs(dd,&top_local->cgs);
 +        
 +        /* Ensure that we have space for the new distribution */
 +        dd_check_alloc_ncg(fr,state_local,f,dd->ncg_home);
 +
 +        if (fr->cutoff_scheme == ecutsGROUP)
 +        {
 +            calc_cgcm(fplog,0,dd->ncg_home,
 +                      &top_local->cgs,state_local->x,fr->cg_cm);
 +        }
 +        
 +        inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
 +        
 +        dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
 +
 +        cg0 = 0;
 +    }
 +    else if (state_local->ddp_count != dd->ddp_count)
 +    {
 +        if (state_local->ddp_count > dd->ddp_count)
 +        {
 +            gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)",state_local->ddp_count,dd->ddp_count);
 +        }
 +        
 +        if (state_local->ddp_count_cg_gl != state_local->ddp_count)
 +        {
 +            gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)",state_local->ddp_count_cg_gl,state_local->ddp_count);
 +        }
 +        
 +        /* Clear the old state */
 +        clear_dd_indices(dd,0,0);
 +        
 +        /* Build the new indices */
 +        rebuild_cgindex(dd,cgs_gl->index,state_local);
 +        make_dd_indices(dd,cgs_gl->index,0);
 +
 +        if (fr->cutoff_scheme == ecutsGROUP)
 +        {
 +            /* Redetermine the cg COMs */
 +            calc_cgcm(fplog,0,dd->ncg_home,
 +                      &top_local->cgs,state_local->x,fr->cg_cm);
 +        }
 +        
 +        inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
 +
 +        dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
 +
 +        set_ddbox(dd,bMasterState,cr,ir,state_local->box,
 +                  TRUE,&top_local->cgs,state_local->x,&ddbox);
 +
 +        bRedist = comm->bDynLoadBal;
 +    }
 +    else
 +    {
 +        /* We have the full state, only redistribute the cgs */
 +
 +        /* Clear the non-home indices */
 +        clear_dd_indices(dd,dd->ncg_home,dd->nat_home);
 +
 +        /* Avoid global communication for dim's without pbc and -gcom */
 +        if (!bNStGlobalComm)
 +        {
 +            copy_rvec(comm->box0    ,ddbox.box0    );
 +            copy_rvec(comm->box_size,ddbox.box_size);
 +        }
 +        set_ddbox(dd,bMasterState,cr,ir,state_local->box,
 +                  bNStGlobalComm,&top_local->cgs,state_local->x,&ddbox);
 +
 +        bBoxChanged = TRUE;
 +        bRedist = TRUE;
 +    }
 +    /* For dim's without pbc and -gcom */
 +    copy_rvec(ddbox.box0    ,comm->box0    );
 +    copy_rvec(ddbox.box_size,comm->box_size);
 +    
 +    set_dd_cell_sizes(dd,&ddbox,dynamic_dd_box(&ddbox,ir),bMasterState,bDoDLB,
 +                      step,wcycle);
 +    
 +    if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
 +    {
 +        write_dd_grid_pdb("dd_grid",step,dd,state_local->box,&ddbox);
 +    }
 +    
 +    /* Check if we should sort the charge groups */
 +    if (comm->nstSortCG > 0)
 +    {
 +        bSortCG = (bMasterState ||
 +                   (bRedist && (step % comm->nstSortCG == 0)));
 +    }
 +    else
 +    {
 +        bSortCG = FALSE;
 +    }
 +
 +    ncg_home_old = dd->ncg_home;
 +
 +    ncg_moved = 0;
 +    if (bRedist)
 +    {
 +        wallcycle_sub_start(wcycle,ewcsDD_REDIST);
 +
 +        dd_redistribute_cg(fplog,step,dd,ddbox.tric_dir,
 +                           state_local,f,fr,mdatoms,
 +                           !bSortCG,nrnb,&cg0,&ncg_moved);
 +
 +        wallcycle_sub_stop(wcycle,ewcsDD_REDIST);
 +    }
 +    
 +    get_nsgrid_boundaries(ddbox.nboundeddim,state_local->box,
 +                          dd,&ddbox,
 +                          &comm->cell_x0,&comm->cell_x1,
 +                          dd->ncg_home,fr->cg_cm,
 +                          cell_ns_x0,cell_ns_x1,&grid_density);
 +
 +    if (bBoxChanged)
 +    {
 +        comm_dd_ns_cell_sizes(dd,&ddbox,cell_ns_x0,cell_ns_x1,step);
 +    }
 +
 +    switch (fr->cutoff_scheme)
 +    {
 +    case ecutsGROUP:
 +        copy_ivec(fr->ns.grid->n,ncells_old);
 +        grid_first(fplog,fr->ns.grid,dd,&ddbox,fr->ePBC,
 +                   state_local->box,cell_ns_x0,cell_ns_x1,
 +                   fr->rlistlong,grid_density);
 +        break;
 +    case ecutsVERLET:
 +        nbnxn_get_ncells(fr->nbv->nbs,&ncells_old[XX],&ncells_old[YY]);
 +        break;
 +    default:
 +        gmx_incons("unimplemented");
 +    }
 +    /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
 +    copy_ivec(ddbox.tric_dir,comm->tric_dir);
 +
 +    if (bSortCG)
 +    {
 +        wallcycle_sub_start(wcycle,ewcsDD_GRID);
 +
 +        /* Sort the state on charge group position.
 +         * This enables exact restarts from this step.
 +         * It also improves performance by about 15% with larger numbers
 +         * of atoms per node.
 +         */
 +        
 +        /* Fill the ns grid with the home cell,
 +         * so we can sort with the indices.
 +         */
 +        set_zones_ncg_home(dd);
 +
 +        switch (fr->cutoff_scheme)
 +        {
 +        case ecutsVERLET:
 +            set_zones_size(dd,state_local->box,&ddbox,0,1);
 +
 +            nbnxn_put_on_grid(fr->nbv->nbs,fr->ePBC,state_local->box,
 +                              0,
 +                              comm->zones.size[0].bb_x0,
 +                              comm->zones.size[0].bb_x1,
 +                              0,dd->ncg_home,
 +                              comm->zones.dens_zone0,
 +                              fr->cginfo,
 +                              state_local->x,
 +                              ncg_moved,comm->moved,
 +                              fr->nbv->grp[eintLocal].kernel_type,
 +                              fr->nbv->grp[eintLocal].nbat);
 +
 +            nbnxn_get_ncells(fr->nbv->nbs,&ncells_new[XX],&ncells_new[YY]);
 +            break;
 +        case ecutsGROUP:
 +            fill_grid(fplog,&comm->zones,fr->ns.grid,dd->ncg_home,
 +                      0,dd->ncg_home,fr->cg_cm);
 +            
 +            copy_ivec(fr->ns.grid->n,ncells_new);
 +            break;
 +        default:
 +            gmx_incons("unimplemented");
 +        }
 +
 +        bResortAll = bMasterState;
 +   
 +        /* Check if we can user the old order and ns grid cell indices
 +         * of the charge groups to sort the charge groups efficiently.
 +         */
 +        if (ncells_new[XX] != ncells_old[XX] ||
 +            ncells_new[YY] != ncells_old[YY] ||
 +            ncells_new[ZZ] != ncells_old[ZZ])
 +        {
 +            bResortAll = TRUE;
 +        }
 +
 +        if (debug)
 +        {
 +            fprintf(debug,"Step %s, sorting the %d home charge groups\n",
 +                    gmx_step_str(step,sbuf),dd->ncg_home);
 +        }
 +        dd_sort_state(dd,ir->ePBC,fr->cg_cm,fr,state_local,
 +                      bResortAll ? -1 : ncg_home_old);
 +        /* Rebuild all the indices */
 +        cg0 = 0;
 +        ga2la_clear(dd->ga2la);
 +
 +        wallcycle_sub_stop(wcycle,ewcsDD_GRID);
 +    }
 +
 +    wallcycle_sub_start(wcycle,ewcsDD_SETUPCOMM);
 +    
 +    /* Setup up the communication and communicate the coordinates */
 +    setup_dd_communication(dd,state_local->box,&ddbox,fr,state_local,f);
 +    
 +    /* Set the indices */
 +    make_dd_indices(dd,cgs_gl->index,cg0);
 +
 +    /* Set the charge group boundaries for neighbor searching */
 +    set_cg_boundaries(&comm->zones);
 +
 +    if (fr->cutoff_scheme == ecutsVERLET)
 +    {
 +        set_zones_size(dd,state_local->box,&ddbox,
 +                       bSortCG ? 1 : 0,comm->zones.n);
 +    }
 +
 +    wallcycle_sub_stop(wcycle,ewcsDD_SETUPCOMM);
 +
 +    /*
 +    write_dd_pdb("dd_home",step,"dump",top_global,cr,
 +                 -1,state_local->x,state_local->box);
 +    */
 +
 +    wallcycle_sub_start(wcycle,ewcsDD_MAKETOP);
 +    
 +    /* Extract a local topology from the global topology */
 +    for(i=0; i<dd->ndim; i++)
 +    {
 +        np[dd->dim[i]] = comm->cd[i].np;
 +    }
 +    dd_make_local_top(fplog,dd,&comm->zones,dd->npbcdim,state_local->box,
 +                      comm->cellsize_min,np,
 +                      fr,
 +                      fr->cutoff_scheme==ecutsGROUP ? fr->cg_cm : state_local->x,
 +                      vsite,top_global,top_local);
 +
 +    wallcycle_sub_stop(wcycle,ewcsDD_MAKETOP);
 +
 +    wallcycle_sub_start(wcycle,ewcsDD_MAKECONSTR);
 +    
 +    /* Set up the special atom communication */
 +    n = comm->nat[ddnatZONE];
 +    for(i=ddnatZONE+1; i<ddnatNR; i++)
 +    {
 +        switch(i)
 +        {
 +        case ddnatVSITE:
 +            if (vsite && vsite->n_intercg_vsite)
 +            {
 +                n = dd_make_local_vsites(dd,n,top_local->idef.il);
 +            }
 +            break;
 +        case ddnatCON:
 +            if (dd->bInterCGcons || dd->bInterCGsettles)
 +            {
 +                /* Only for inter-cg constraints we need special code */
 +                n = dd_make_local_constraints(dd,n,top_global,fr->cginfo,
 +                                              constr,ir->nProjOrder,
 +                                              top_local->idef.il);
 +            }
 +            break;
 +        default:
 +            gmx_incons("Unknown special atom type setup");
 +        }
 +        comm->nat[i] = n;
 +    }
 +
 +    wallcycle_sub_stop(wcycle,ewcsDD_MAKECONSTR);
 +
 +    wallcycle_sub_start(wcycle,ewcsDD_TOPOTHER);
 +
 +    /* Make space for the extra coordinates for virtual site
 +     * or constraint communication.
 +     */
 +    state_local->natoms = comm->nat[ddnatNR-1];
 +    if (state_local->natoms > state_local->nalloc)
 +    {
 +        dd_realloc_state(state_local,f,state_local->natoms);
 +    }
 +
 +    if (fr->bF_NoVirSum)
 +    {
 +        if (vsite && vsite->n_intercg_vsite)
 +        {
 +            nat_f_novirsum = comm->nat[ddnatVSITE];
 +        }
 +        else
 +        {
 +            if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
 +            {
 +                nat_f_novirsum = dd->nat_tot;
 +            }
 +            else
 +            {
 +                nat_f_novirsum = dd->nat_home;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        nat_f_novirsum = 0;
 +    }
 +
 +    /* Set the number of atoms required for the force calculation.
 +     * Forces need to be constrained when using a twin-range setup
 +     * or with energy minimization. For simple simulations we could
 +     * avoid some allocation, zeroing and copying, but this is
 +     * probably not worth the complications ande checking.
 +     */
 +    forcerec_set_ranges(fr,dd->ncg_home,dd->ncg_tot,
 +                        dd->nat_tot,comm->nat[ddnatCON],nat_f_novirsum);
 +
 +    /* We make the all mdatoms up to nat_tot_con.
 +     * We could save some work by only setting invmass
 +     * between nat_tot and nat_tot_con.
 +     */
 +    /* This call also sets the new number of home particles to dd->nat_home */
 +    atoms2md(top_global,ir,
 +             comm->nat[ddnatCON],dd->gatindex,0,dd->nat_home,mdatoms);
 +
 +    /* Now we have the charges we can sort the FE interactions */
 +    dd_sort_local_top(dd,mdatoms,top_local);
 +
 +    if (vsite != NULL)
 +    {
 +        /* Now we have updated mdatoms, we can do the last vsite bookkeeping */
 +        split_vsites_over_threads(top_local->idef.il,mdatoms,FALSE,vsite);
 +    }
 +
 +    if (shellfc)
 +    {
 +        /* Make the local shell stuff, currently no communication is done */
 +        make_local_shells(cr,mdatoms,shellfc);
 +    }
 +    
 +      if (ir->implicit_solvent)
 +    {
 +        make_local_gb(cr,fr->born,ir->gb_algorithm);
 +    }
 +
 +    init_bonded_thread_force_reduction(fr,&top_local->idef);
 +
 +    if (!(cr->duty & DUTY_PME))
 +    {
 +        /* Send the charges to our PME only node */
 +        gmx_pme_send_q(cr,mdatoms->nChargePerturbed,
 +                       mdatoms->chargeA,mdatoms->chargeB,
 +                       dd_pme_maxshift_x(dd),dd_pme_maxshift_y(dd));
 +    }
 +    
 +    if (constr)
 +    {
 +        set_constraints(constr,top_local,ir,mdatoms,cr);
 +    }
 +    
 +    if (ir->ePull != epullNO)
 +    {
 +        /* Update the local pull groups */
 +        dd_make_local_pull_groups(dd,ir->pull,mdatoms);
 +    }
 +    
 +    if (ir->bRot)
 +    {
 +        /* Update the local rotation groups */
 +        dd_make_local_rotation_groups(dd,ir->rot);
 +    }
 +
 +
 +    add_dd_statistics(dd);
 +    
 +    /* Make sure we only count the cycles for this DD partitioning */
 +    clear_dd_cycle_counts(dd);
 +    
 +    /* Because the order of the atoms might have changed since
 +     * the last vsite construction, we need to communicate the constructing
 +     * atom coordinates again (for spreading the forces this MD step).
 +     */
 +    dd_move_x_vsites(dd,state_local->box,state_local->x);
 +
 +    wallcycle_sub_stop(wcycle,ewcsDD_TOPOTHER);
 +    
 +    if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
 +    {
 +        dd_move_x(dd,state_local->box,state_local->x);
 +        write_dd_pdb("dd_dump",step,"dump",top_global,cr,
 +                     -1,state_local->x,state_local->box);
 +    }
 +
 +    /* Store the partitioning step */
 +    comm->partition_step = step;
 +    
 +    /* Increase the DD partitioning counter */
 +    dd->ddp_count++;
 +    /* The state currently matches this DD partitioning count, store it */
 +    state_local->ddp_count = dd->ddp_count;
 +    if (bMasterState)
 +    {
 +        /* The DD master node knows the complete cg distribution,
 +         * store the count so we can possibly skip the cg info communication.
 +         */
 +        comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
 +    }
 +
 +    if (comm->DD_debug > 0)
 +    {
 +        /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
 +        check_index_consistency(dd,top_global->natoms,ncg_mtop(top_global),
 +                                "after partitioning");
 +    }
 +}
index a4322845b59465be955942820b5231f062f2ea6c,0000000000000000000000000000000000000000..f9d87bb2a1cf7ec2c8e2491fdc2f2d7beb60d839
mode 100644,000000..100644
--- /dev/null
@@@ -1,306 -1,0 +1,305 @@@
-                     case F_VTEMP:       u = unit_temp_K;   break;
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +/* This file is completely threadsafe - keep it that way! */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +#include <string.h>
 +#include "sysstuff.h"
 +#include "smalloc.h"
 +#include "typedefs.h"
 +#include "gmx_fatal.h"
 +#include "string2.h"
 +#include "ebin.h"
 +#include "main.h"
 +#include "maths.h"
 +#include "vec.h"
 +#include "physics.h"
 +
 +t_ebin *mk_ebin(void)
 +{
 +  t_ebin *eb;
 +  
 +  snew(eb,1);
 +  
 +  return eb;
 +}
 +
 +int get_ebin_space(t_ebin *eb,int nener,const char *enm[],const char *unit)
 +{
 +    int  index;
 +    int  i,f;
 +    const char *u;
 +
 +    index = eb->nener;
 +    eb->nener += nener;
 +    srenew(eb->e,eb->nener);
 +    srenew(eb->e_sim,eb->nener);
 +    srenew(eb->enm,eb->nener);
 +    for(i=index; (i<eb->nener); i++)
 +    {
 +        eb->e[i].e        = 0;
 +        eb->e[i].eav      = 0;
 +        eb->e[i].esum     = 0;
 +        eb->e_sim[i].e    = 0;
 +        eb->e_sim[i].eav  = 0;
 +        eb->e_sim[i].esum = 0;
 +        eb->enm[i].name = strdup(enm[i-index]);
 +        if (unit != NULL)
 +        {
 +            eb->enm[i].unit = strdup(unit);
 +        }
 +        else
 +        {
 +            /* Determine the unit from the longname.
 +             * These units should have been defined in ifunc.c
 +             * But even better would be if all interactions functions
 +             * return energies and all non-interaction function
 +             * entries would be removed from the ifunc array.
 +             */
 +            u = unit_energy;
 +            for(f=0; f<F_NRE; f++)
 +            {
 +                if (strcmp(eb->enm[i].name,
 +                           interaction_function[f].longname) == 0)
 +                {
 +                    /* Only the terms in this list are not energies */
 +                    switch (f) {
 +                    case F_DISRESVIOL: u = unit_length;   break;
 +                    case F_ORIRESDEV:  u = "obs";         break;
 +                    case F_TEMP:       u = unit_temp_K;   break;
 +                    case F_PDISPCORR:
 +                    case F_PRES:       u = unit_pres_bar; break;
 +                    }
 +                }
 +            }
 +            eb->enm[i].unit = strdup(u);
 +        }
 +    }
 +    
 +    return index;
 +}
 +
 +void add_ebin(t_ebin *eb,int index,int nener,real ener[],gmx_bool bSum)
 +{
 +    int      i,m;
 +    double   e,sum,sigma,invmm,diff;
 +    t_energy *eg,*egs;
 +    
 +    if ((index+nener > eb->nener) || (index < 0))
 +    {
 +        gmx_fatal(FARGS,"%s-%d: Energies out of range: index=%d nener=%d maxener=%d",
 +                  __FILE__,__LINE__,index,nener,eb->nener);
 +    }
 +    
 +    eg = &(eb->e[index]);
 +    
 +    for(i=0; (i<nener); i++)
 +    {
 +        eg[i].e      = ener[i];
 +    }
 +    
 +    if (bSum)
 +    {
 +        egs = &(eb->e_sim[index]);
 +        
 +        m = eb->nsum;
 +        
 +        if (m == 0)
 +        {
 +            for(i=0; (i<nener); i++)
 +            {
 +                eg[i].eav    = 0;
 +                eg[i].esum   = ener[i];
 +                egs[i].esum += ener[i];
 +            }
 +        }
 +        else
 +        {
 +            invmm = (1.0/(double)m)/((double)m+1.0);
 +            
 +            for(i=0; (i<nener); i++)
 +            {
 +                /* Value for this component */
 +                e = ener[i];
 +                
 +                /* first update sigma, then sum */
 +                diff         = eg[i].esum - m*e;
 +                eg[i].eav   += diff*diff*invmm;
 +                eg[i].esum  += e;
 +                egs[i].esum += e;
 +            }
 +        }
 +    }
 +}
 +
 +void ebin_increase_count(t_ebin *eb,gmx_bool bSum)
 +{
 +    eb->nsteps++;
 +    eb->nsteps_sim++;
 +
 +    if (bSum)
 +    {
 +        eb->nsum++;
 +        eb->nsum_sim++;
 +    }
 +}
 +
 +void reset_ebin_sums(t_ebin *eb)
 +{
 +    eb->nsteps = 0;
 +    eb->nsum   = 0;
 +    /* The actual sums are cleared when the next frame is stored */
 +}
 +
 +void pr_ebin(FILE *fp,t_ebin *eb,int index,int nener,int nperline,
 +             int prmode,gmx_bool bPrHead)
 +{
 +    int  i,j,i0;
 +    real ee=0;
 +    int  rc;
 +    char buf[30];
 +
 +    rc = 0;
 +
 +    if (index < 0)
 +    {
 +        gmx_fatal(FARGS,"Invalid index in pr_ebin: %d",index);
 +    }
 +    if (nener == -1)
 +    {
 +        nener = eb->nener;
 +    }
 +    else
 +    {
 +        nener = index + nener;
 +    }
 +    for(i=index; (i<nener) && rc>=0; ) 
 +    {
 +        if (bPrHead)
 +        {
 +            i0=i;
 +            for(j=0; (j<nperline) && (i<nener) && rc>=0; j++,i++)
 +            {
 +                if (strncmp(eb->enm[i].name,"Pres",4) == 0)
 +                {
 +                    /* Print the pressure unit to avoid confusion */
 +                    sprintf(buf,"%s (%s)",eb->enm[i].name,unit_pres_bar);
 +                    rc = fprintf(fp,"%15s",buf);
 +                }
 +                else
 +                {
 +                    rc = fprintf(fp,"%15s",eb->enm[i].name);
 +                }
 +            }
 +
 +            if (rc >= 0)
 +            {
 +                rc = fprintf(fp,"\n");
 +            }
 +
 +            i=i0;
 +        }
 +        for(j=0; (j<nperline) && (i<nener) && rc>=0; j++,i++)
 +        {
 +            switch (prmode) {
 +                case eprNORMAL: ee = eb->e[i].e; break;
 +                case eprAVER:   ee = eb->e_sim[i].esum/eb->nsum_sim; break;
 +                default: gmx_fatal(FARGS,"Invalid print mode %d in pr_ebin",
 +                                   prmode);
 +            }
 +
 +            rc = fprintf(fp,"   %12.5e",ee);
 +        }
 +        if (rc >= 0)
 +        {
 +            rc = fprintf(fp,"\n");
 +        }
 +    }
 +    if (rc < 0)
 +    { 
 +        gmx_fatal(FARGS,"Cannot write to logfile; maybe you are out of disk space?");
 +    }
 +}
 +
 +#ifdef DEBUGEBIN
 +int main(int argc,char *argv[])
 +{
 +#define NE 12
 +#define NT 7
 +#define NS 5
 +
 +  t_ebin *eb;
 +  int    i;
 +  char   buf[25];
 +  char   *ce[NE],*ct[NT],*cs[NS];
 +  real   e[NE],t[NT],s[NS];
 +  int    ie,it,is;
 +  
 +  eb=mk_ebin();
 +  for(i=0; (i<NE); i++) {
 +    e[i]=i;
 +    sprintf(buf,"e%d",i);
 +    ce[i]=strdup(buf);
 +  }
 +  ie=get_ebin_space(eb,NE,ce);
 +  add_ebin(eb,ie,NE,e,0);
 +  for(i=0; (i<NS); i++) {
 +    s[i]=i;
 +    sprintf(buf,"s%d",i);
 +    cs[i]=strdup(buf);
 +  }
 +  is=get_ebin_space(eb,NS,cs);
 +  add_ebin(eb,is,NS,s,0);
 +  for(i=0; (i<NT); i++) {
 +    t[i]=i;
 +    sprintf(buf,"t%d",i);
 +    ct[i]=strdup(buf);
 +  }
 +  it=get_ebin_space(eb,NT,ct);
 +  add_ebin(eb,it,NT,t,0);
 +  
 +  printf("Normal:\n");
 +  pr_ebin(stdout,eb,0,-1,5,eprNORMAL,1);
 +
 +  printf("Average:\n");
 +  pr_ebin(stdout,eb,ie,NE,5,eprAVER,1);
 +  pr_ebin(stdout,eb,is,NS,3,eprAVER,1);
 +  pr_ebin(stdout,eb,it,NT,4,eprAVER,1);
 +}
 +#endif
index 4d684843e3da506049764380b649d11b332ffe28,0000000000000000000000000000000000000000..304ef6081ce987d45733b204f62692b7f08474f8
mode 100644,000000..100644
--- /dev/null
@@@ -1,2734 -1,0 +1,2748 @@@
-             if (!init_gpu(cr->nodeid_group_intra, gpu_err_str, &hwinfo->gpu_info))
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +#include <string.h>
 +#include <assert.h>
 +#include "sysstuff.h"
 +#include "typedefs.h"
 +#include "vec.h"
 +#include "maths.h"
 +#include "macros.h"
 +#include "smalloc.h"
 +#include "macros.h"
 +#include "gmx_fatal.h"
 +#include "gmx_fatal_collective.h"
 +#include "physics.h"
 +#include "force.h"
 +#include "tables.h"
 +#include "nonbonded.h"
 +#include "invblock.h"
 +#include "names.h"
 +#include "network.h"
 +#include "pbc.h"
 +#include "ns.h"
 +#include "mshift.h"
 +#include "txtdump.h"
 +#include "coulomb.h"
 +#include "md_support.h"
 +#include "md_logging.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "qmmm.h"
 +#include "copyrite.h"
 +#include "mtop_util.h"
 +#include "nbnxn_search.h"
 +#include "nbnxn_atomdata.h"
 +#include "nbnxn_consts.h"
 +#include "statutil.h"
 +#include "gmx_omp_nthreads.h"
 +
 +#ifdef _MSC_VER
 +/* MSVC definition for __cpuid() */
 +#include <intrin.h>
 +#endif
 +
 +#include "types/nbnxn_cuda_types_ext.h"
 +#include "gpu_utils.h"
 +#include "nbnxn_cuda_data_mgmt.h"
 +#include "pmalloc_cuda.h"
 +
 +t_forcerec *mk_forcerec(void)
 +{
 +  t_forcerec *fr;
 +  
 +  snew(fr,1);
 +  
 +  return fr;
 +}
 +
 +#ifdef DEBUG
 +static void pr_nbfp(FILE *fp,real *nbfp,gmx_bool bBHAM,int atnr)
 +{
 +  int i,j;
 +  
 +  for(i=0; (i<atnr); i++) {
 +    for(j=0; (j<atnr); j++) {
 +      fprintf(fp,"%2d - %2d",i,j);
 +      if (bBHAM)
 +      fprintf(fp,"  a=%10g, b=%10g, c=%10g\n",BHAMA(nbfp,atnr,i,j),
 +              BHAMB(nbfp,atnr,i,j),BHAMC(nbfp,atnr,i,j)/6.0);
 +      else
 +      fprintf(fp,"  c6=%10g, c12=%10g\n",C6(nbfp,atnr,i,j)/6.0,
 +            C12(nbfp,atnr,i,j)/12.0);
 +    }
 +  }
 +}
 +#endif
 +
 +static real *mk_nbfp(const gmx_ffparams_t *idef,gmx_bool bBHAM)
 +{
 +  real *nbfp;
 +  int  i,j,k,atnr;
 +  
 +  atnr=idef->atnr;
 +  if (bBHAM) {
 +    snew(nbfp,3*atnr*atnr);
 +    for(i=k=0; (i<atnr); i++) {
 +      for(j=0; (j<atnr); j++,k++) {
 +          BHAMA(nbfp,atnr,i,j) = idef->iparams[k].bham.a;
 +          BHAMB(nbfp,atnr,i,j) = idef->iparams[k].bham.b;
 +          /* nbfp now includes the 6.0 derivative prefactor */
 +          BHAMC(nbfp,atnr,i,j) = idef->iparams[k].bham.c*6.0;
 +      }
 +    }
 +  }
 +  else {
 +    snew(nbfp,2*atnr*atnr);
 +    for(i=k=0; (i<atnr); i++) {
 +      for(j=0; (j<atnr); j++,k++) {
 +          /* nbfp now includes the 6.0/12.0 derivative prefactors */
 +          C6(nbfp,atnr,i,j)   = idef->iparams[k].lj.c6*6.0;
 +          C12(nbfp,atnr,i,j)  = idef->iparams[k].lj.c12*12.0;
 +      }
 +    }
 +  }
 +
 +  return nbfp;
 +}
 +
 +/* This routine sets fr->solvent_opt to the most common solvent in the 
 + * system, e.g. esolSPC or esolTIP4P. It will also mark each charge group in 
 + * the fr->solvent_type array with the correct type (or esolNO).
 + *
 + * Charge groups that fulfill the conditions but are not identical to the
 + * most common one will be marked as esolNO in the solvent_type array. 
 + *
 + * TIP3p is identical to SPC for these purposes, so we call it
 + * SPC in the arrays (Apologies to Bill Jorgensen ;-)
 + * 
 + * NOTE: QM particle should not
 + * become an optimized solvent. Not even if there is only one charge
 + * group in the Qm 
 + */
 +
 +/* One candidate solvent type as collected by check_solvent_cg */
 +typedef struct 
 +{
 +    int    model;          /* esolSPC or esolTIP4P                         */
 +    int    count;          /* number of molecules found with these values  */
 +    int    vdwtype[4];     /* atom type of each atom, 3 or 4 entries used  */
 +    real   charge[4];      /* charge of each atom, 3 or 4 entries used     */
 +} solvent_parameters_t;
 +
 +static void
 +check_solvent_cg(const gmx_moltype_t   *molt,
 +                 int                   cg0,
 +                 int                   nmol,
 +                 const unsigned char   *qm_grpnr,
 +                 const t_grps          *qm_grps,
 +                 t_forcerec *          fr,
 +                 int                   *n_solvent_parameters,
 +                 solvent_parameters_t  **solvent_parameters_p,
 +                 int                   cginfo,
 +                 int                   *cg_sp)
 +{
 +    const t_blocka *  excl;
 +    t_atom            *atom;
 +    int               j,k;
 +    int               j0,j1,nj;
 +    gmx_bool              perturbed;
 +    gmx_bool              has_vdw[4];
 +    gmx_bool              match;
 +    real              tmp_charge[4];
 +    int               tmp_vdwtype[4];
 +    int               tjA;
 +    gmx_bool              qm;
 +    solvent_parameters_t *solvent_parameters;
 +
 +    /* We use a list with parameters for each solvent type. 
 +     * Every time we discover a new molecule that fulfills the basic 
 +     * conditions for a solvent we compare with the previous entries
 +     * in these lists. If the parameters are the same we just increment
 +     * the counter for that type, and otherwise we create a new type
 +     * based on the current molecule.
 +     *
 +     * Once we've finished going through all molecules we check which
 +     * solvent is most common, and mark all those molecules while we
 +     * clear the flag on all others.
 +     */   
 +
 +    solvent_parameters = *solvent_parameters_p;
 +
 +    /* Mark the cg first as non optimized */
 +    *cg_sp = -1;
 +    
 +    /* Check if this cg has no exclusions with atoms in other charge groups
 +     * and all atoms inside the charge group excluded.
 +     * We only have 3 or 4 atom solvent loops.
 +     */
 +    if (GET_CGINFO_EXCL_INTER(cginfo) ||
 +        !GET_CGINFO_EXCL_INTRA(cginfo))
 +    {
 +        return;
 +    }
 +
 +    /* Get the indices of the first atom in this charge group */
 +    j0     = molt->cgs.index[cg0];
 +    j1     = molt->cgs.index[cg0+1];
 +    
 +    /* Number of atoms in our molecule */
 +    nj     = j1 - j0;
 +
 +    if (debug) {
 +        fprintf(debug,
 +                "Moltype '%s': there are %d atoms in this charge group\n",
 +                *molt->name,nj);
 +    }
 +    
 +    /* Check if it could be an SPC (3 atoms) or TIP4p (4) water,
 +     * otherwise skip it.
 +     */
 +    if (nj<3 || nj>4)
 +    {
 +        return;
 +    }
 +    
 +    /* Check if we are doing QM on this group */
 +    qm = FALSE; 
 +    if (qm_grpnr != NULL)
 +    {
 +        for(j=j0 ; j<j1 && !qm; j++)
 +        {
 +            qm = (qm_grpnr[j] < qm_grps->nr - 1);
 +        }
 +    }
 +    /* Cannot use solvent optimization with QM */
 +    if (qm)
 +    {
 +        return;
 +    }
 +    
 +    atom = molt->atoms.atom;
 +
 +    /* Still looks like a solvent, time to check parameters */
 +    
 +    /* If it is perturbed (free energy) we can't use the solvent loops,
 +     * so then we just skip to the next molecule.
 +     */   
 +    perturbed = FALSE; 
 +    
 +    for(j=j0; j<j1 && !perturbed; j++)
 +    {
 +        perturbed = PERTURBED(atom[j]);
 +    }
 +    
 +    if (perturbed)
 +    {
 +        return;
 +    }
 +    
 +    /* Now it's only a question if the VdW and charge parameters 
 +     * are OK. Before doing the check we compare and see if they are 
 +     * identical to a possible previous solvent type.
 +     * First we assign the current types and charges.    
 +     */
 +    for(j=0; j<nj; j++)
 +    {
 +        tmp_vdwtype[j] = atom[j0+j].type;
 +        tmp_charge[j]  = atom[j0+j].q;
 +    } 
 +    
 +    /* Does it match any previous solvent type? */
 +    for(k=0 ; k<*n_solvent_parameters; k++)
 +    {
 +        match = TRUE;
 +        
 +        
 +        /* We can only match SPC with 3 atoms and TIP4p with 4 atoms */
 +        if( (solvent_parameters[k].model==esolSPC   && nj!=3)  ||
 +            (solvent_parameters[k].model==esolTIP4P && nj!=4) )
 +            match = FALSE;
 +        
 +        /* Check that types & charges match for all atoms in molecule */
 +        for(j=0 ; j<nj && match==TRUE; j++)
 +        {                     
 +            if (tmp_vdwtype[j] != solvent_parameters[k].vdwtype[j])
 +            {
 +                match = FALSE;
 +            }
 +            if(tmp_charge[j] != solvent_parameters[k].charge[j])
 +            {
 +                match = FALSE;
 +            }
 +        }
 +        if (match == TRUE)
 +        {
 +            /* Congratulations! We have a matched solvent.
 +             * Flag it with this type for later processing.
 +             */
 +            *cg_sp = k;
 +            solvent_parameters[k].count += nmol;
 +
 +            /* We are done with this charge group */
 +            return;
 +        }
 +    }
 +    
 +    /* If we get here, we have a tentative new solvent type.
 +     * Before we add it we must check that it fulfills the requirements
 +     * of the solvent optimized loops. First determine which atoms have
 +     * VdW interactions.   
 +     */
 +    for(j=0; j<nj; j++) 
 +    {
 +        has_vdw[j] = FALSE;
 +        tjA        = tmp_vdwtype[j];
 +        
 +        /* Go through all other tpes and see if any have non-zero
 +         * VdW parameters when combined with this one.
 +         */   
 +        for(k=0; k<fr->ntype && (has_vdw[j]==FALSE); k++)
 +        {
 +            /* We already checked that the atoms weren't perturbed,
 +             * so we only need to check state A now.
 +             */ 
 +            if (fr->bBHAM) 
 +            {
 +                has_vdw[j] = (has_vdw[j] || 
 +                              (BHAMA(fr->nbfp,fr->ntype,tjA,k) != 0.0) ||
 +                              (BHAMB(fr->nbfp,fr->ntype,tjA,k) != 0.0) ||
 +                              (BHAMC(fr->nbfp,fr->ntype,tjA,k) != 0.0));
 +            }
 +            else
 +            {
 +                /* Standard LJ */
 +                has_vdw[j] = (has_vdw[j] || 
 +                              (C6(fr->nbfp,fr->ntype,tjA,k)  != 0.0) ||
 +                              (C12(fr->nbfp,fr->ntype,tjA,k) != 0.0));
 +            }
 +        }
 +    }
 +    
 +    /* Now we know all we need to make the final check and assignment. */
 +    if (nj == 3)
 +    {
 +        /* So, is it an SPC?
 +         * For this we require thatn all atoms have charge, 
 +         * the charges on atom 2 & 3 should be the same, and only
 +         * atom 1 might have VdW.
 +         */
 +        if (has_vdw[1] == FALSE &&
 +            has_vdw[2] == FALSE &&
 +            tmp_charge[0]  != 0 &&
 +            tmp_charge[1]  != 0 &&
 +            tmp_charge[2]  == tmp_charge[1])
 +        {
 +            srenew(solvent_parameters,*n_solvent_parameters+1);
 +            solvent_parameters[*n_solvent_parameters].model = esolSPC;
 +            solvent_parameters[*n_solvent_parameters].count = nmol;
 +            for(k=0;k<3;k++)
 +            {
 +                solvent_parameters[*n_solvent_parameters].vdwtype[k] = tmp_vdwtype[k];
 +                solvent_parameters[*n_solvent_parameters].charge[k]  = tmp_charge[k];
 +            }
 +
 +            *cg_sp = *n_solvent_parameters;
 +            (*n_solvent_parameters)++;
 +        }
 +    }
 +    else if (nj==4)
 +    {
 +        /* Or could it be a TIP4P?
 +         * For this we require that atoms 2,3,4 have charge, but not atom 1.
 +         * Only atom 1 might have VdW.
 +         */
 +        if(has_vdw[1] == FALSE &&
 +           has_vdw[2] == FALSE &&
 +           has_vdw[3] == FALSE &&
 +           tmp_charge[0]  == 0 &&
 +           tmp_charge[1]  != 0 &&
 +           tmp_charge[2]  == tmp_charge[1] &&
 +           tmp_charge[3]  != 0)
 +        {
 +            srenew(solvent_parameters,*n_solvent_parameters+1);
 +            solvent_parameters[*n_solvent_parameters].model = esolTIP4P;
 +            solvent_parameters[*n_solvent_parameters].count = nmol;
 +            for(k=0;k<4;k++)
 +            {
 +                solvent_parameters[*n_solvent_parameters].vdwtype[k] = tmp_vdwtype[k];
 +                solvent_parameters[*n_solvent_parameters].charge[k]  = tmp_charge[k];
 +            }
 +            
 +            *cg_sp = *n_solvent_parameters;
 +            (*n_solvent_parameters)++;
 +        }
 +    }
 +
 +    *solvent_parameters_p = solvent_parameters;
 +}
 +
 +static void
 +check_solvent(FILE *                fp,
 +              const gmx_mtop_t *    mtop,
 +              t_forcerec *          fr,
 +              cginfo_mb_t           *cginfo_mb)
 +{   /* Run check_solvent_cg() on every charge group stored in cginfo_mb,
 +     * select the solvent parameter set with the highest count and flag the
 +     * charge groups that matched it with that model; all other charge
 +     * groups are flagged esolNO.
 +     * Also sets fr->nWatMol and fr->solvent_opt. When a model was selected
 +     * and fp is non-NULL, a note is written to fp.
 +     */
 +    const t_block *   cgs;
 +    const t_block *   mols;
 +    const gmx_moltype_t *molt;
 +    int               mb,mol,cg_mol,at_offset,cg_offset,am,cgm,i,nmol_ch,nmol;
 +    int               n_solvent_parameters;
 +    solvent_parameters_t *solvent_parameters;
 +    int               **cg_sp;
 +    int               bestsp,bestsol;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"Going to determine what solvent types we have.\n");
 +    }
 +
 +    mols = &mtop->mols; /* NOTE(review): mols is not read again in this function */
 +
 +    n_solvent_parameters = 0;
 +    solvent_parameters = NULL;
 +    /* Allocate temporary array for solvent type */
 +    snew(cg_sp,mtop->nmolblock);
 +
 +    cg_offset = 0; /* NOTE(review): only accumulated below, never read here */
 +    at_offset = 0;
 +    for(mb=0; mb<mtop->nmolblock; mb++)
 +    {
 +        molt = &mtop->moltype[mtop->molblock[mb].type];
 +        cgs  = &molt->cgs;
 +        /* Here we have to loop over all individual molecules
 +         * because we need to check for QMMM particles.
 +         */
 +        snew(cg_sp[mb],cginfo_mb[mb].cg_mod);
 +        nmol_ch = cginfo_mb[mb].cg_mod/cgs->nr; /* molecules with their own cginfo entries */
 +        nmol    = mtop->molblock[mb].nmol/nmol_ch; /* molecules per stored entry */
 +        for(mol=0; mol<nmol_ch; mol++)
 +        {
 +            cgm = mol*cgs->nr;
 +            am  = mol*cgs->index[cgs->nr];
 +            for(cg_mol=0; cg_mol<cgs->nr; cg_mol++)
 +            {
 +                check_solvent_cg(molt,cg_mol,nmol,
 +                                 mtop->groups.grpnr[egcQMMM] ?
 +                                 mtop->groups.grpnr[egcQMMM]+at_offset+am : 0, /* 0 when there is no QMMM group array */
 +                                 &mtop->groups.grps[egcQMMM],
 +                                 fr,
 +                                 &n_solvent_parameters,&solvent_parameters,
 +                                 cginfo_mb[mb].cginfo[cgm+cg_mol],
 +                                 &cg_sp[mb][cgm+cg_mol]);
 +            }
 +        }
 +        cg_offset += cgs->nr;
 +        at_offset += cgs->index[cgs->nr];
 +    }
 +
 +    /* Puh! We finished going through all charge groups.
 +     * Now find the most common solvent model.
 +     */   
 +    
 +    /* Most common solvent this far */
 +    bestsp = -2; /* sentinel: no parameter set selected yet */
 +    for(i=0;i<n_solvent_parameters;i++)
 +    {
 +        if (bestsp == -2 ||
 +            solvent_parameters[i].count > solvent_parameters[bestsp].count)
 +        {
 +            bestsp = i;
 +        }
 +    }
 +    
 +    if (bestsp >= 0)
 +    {
 +        bestsol = solvent_parameters[bestsp].model;
 +    }
 +    else
 +    {
 +        bestsol = esolNO;
 +    }
 +    
 +#ifdef DISABLE_WATER_NLIST
 +      bestsol = esolNO;
 +#endif
 +
 +    fr->nWatMol = 0;
 +    for(mb=0; mb<mtop->nmolblock; mb++)
 +    {
 +        cgs = &mtop->moltype[mtop->molblock[mb].type].cgs;
 +        nmol = (mtop->molblock[mb].nmol*cgs->nr)/cginfo_mb[mb].cg_mod;
 +        for(i=0; i<cginfo_mb[mb].cg_mod; i++)
 +        {
 +            if (cg_sp[mb][i] == bestsp) /* charge group matched the selected parameter set */
 +            {
 +                SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[i],bestsol);
 +                fr->nWatMol += nmol;
 +            }
 +            else
 +            {
 +                SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[i],esolNO);
 +            }
 +        }
 +        sfree(cg_sp[mb]);
 +    }
 +    sfree(cg_sp);
 +    
 +    if (bestsol != esolNO && fp!=NULL)
 +    {
 +        fprintf(fp,"\nEnabling %s-like water optimization for %d molecules.\n\n",
 +                esol_names[bestsol],
 +                solvent_parameters[bestsp].count);
 +    }
 +
 +    sfree(solvent_parameters);
 +    fr->solvent_opt = bestsol;
 +}
 +
 +enum { acNONE=0, acCONSTRAINT, acSETTLE }; /* per-atom constraint class, stored in a_con[] in init_cginfo_mb() */
 +
 +static cginfo_mb_t *init_cginfo_mb(FILE *fplog,const gmx_mtop_t *mtop,
 +                                   t_forcerec *fr,gmx_bool bNoSolvOpt,
 +                                   gmx_bool *bExcl_IntraCGAll_InterCGNone)
 +{   /* Build the per-molblock charge-group info array: for every stored
 +     * charge group the energy group id, exclusion flags, constraint/settle
 +     * flags, VdW/charge flags and the atom count are set through the
 +     * SET_CGINFO_* macros, after which check_solvent() assigns the solvent
 +     * optimization flags.
 +     * *bExcl_IntraCGAll_InterCGNone is set to TRUE only when every charge
 +     * group excludes all of its intra-group pairs and has no exclusion
 +     * reaching outside the group.
 +     * Returns the newly allocated array with mtop->nmolblock entries.
 +     */
 +    const t_block *cgs;
 +    const t_blocka *excl;
 +    const gmx_moltype_t *molt;
 +    const gmx_molblock_t *molb;
 +    cginfo_mb_t *cginfo_mb;
 +    gmx_bool *type_VDW;
 +    int  *cginfo;
 +    int  cg_offset,a_offset,cgm,am;
 +    int  mb,m,ncg_tot,cg,a0,a1,gid,ai,j,aj,excl_nalloc;
 +    int  *a_con;
 +    int  ftype;
 +    int  ia;
 +    gmx_bool bId,*bExcl,bExclIntraAll,bExclInter,bHaveVDW,bHaveQ;
 +
 +    ncg_tot = ncg_mtop(mtop); /* NOTE(review): ncg_tot is not read again in this function */
 +    snew(cginfo_mb,mtop->nmolblock);
 +
 +    snew(type_VDW,fr->ntype); /* type_VDW[t]: Buckingham is in use, or type t has a non-zero C6/C12 with some type */
 +    for(ai=0; ai<fr->ntype; ai++)
 +    {
 +        type_VDW[ai] = FALSE;
 +        for(j=0; j<fr->ntype; j++)
 +        {
 +            type_VDW[ai] = type_VDW[ai] ||
 +                fr->bBHAM ||
 +                C6(fr->nbfp,fr->ntype,ai,j) != 0 ||
 +                C12(fr->nbfp,fr->ntype,ai,j) != 0;
 +        }
 +    }
 +
 +    *bExcl_IntraCGAll_InterCGNone = TRUE;
 +
 +    excl_nalloc = 10; /* initial capacity of bExcl; grown on demand below */
 +    snew(bExcl,excl_nalloc);
 +    cg_offset = 0;
 +    a_offset  = 0;
 +    for(mb=0; mb<mtop->nmolblock; mb++)
 +    {
 +        molb = &mtop->molblock[mb];
 +        molt = &mtop->moltype[molb->type];
 +        cgs  = &molt->cgs;
 +        excl = &molt->excls;
 +
 +        /* Check if the cginfo is identical for all molecules in this block.
 +         * If so, we only need an array of the size of one molecule.
 +         * Otherwise we make an array of #mol times #cgs per molecule.
 +         */
 +        bId = TRUE;
 +        am = 0;
 +        for(m=0; m<molb->nmol; m++)
 +        {
 +            am = m*cgs->index[cgs->nr];
 +            for(cg=0; cg<cgs->nr; cg++)
 +            {
 +                a0 = cgs->index[cg];
 +                a1 = cgs->index[cg+1];
 +                if (ggrpnr(&mtop->groups,egcENER,a_offset+am+a0) !=
 +                    ggrpnr(&mtop->groups,egcENER,a_offset   +a0))
 +                {
 +                    bId = FALSE;
 +                }
 +                if (mtop->groups.grpnr[egcQMMM] != NULL)
 +                {
 +                    for(ai=a0; ai<a1; ai++)
 +                    {
 +                        if (mtop->groups.grpnr[egcQMMM][a_offset+am+ai] !=
 +                            mtop->groups.grpnr[egcQMMM][a_offset   +ai])
 +                        {
 +                            bId = FALSE;
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +
 +        cginfo_mb[mb].cg_start = cg_offset;
 +        cginfo_mb[mb].cg_end   = cg_offset + molb->nmol*cgs->nr;
 +        cginfo_mb[mb].cg_mod   = (bId ? 1 : molb->nmol)*cgs->nr; /* number of cginfo entries actually stored for this block */
 +        snew(cginfo_mb[mb].cginfo,cginfo_mb[mb].cg_mod);
 +        cginfo = cginfo_mb[mb].cginfo;
 +
 +        /* Set constraints flags for constrained atoms */
 +        snew(a_con,molt->atoms.nr); /* NOTE(review): relies on snew() returning zeroed memory so untouched atoms read as acNONE - confirm */
 +        for(ftype=0; ftype<F_NRE; ftype++)
 +        {
 +            if (interaction_function[ftype].flags & IF_CONSTRAINT)
 +            {
 +                int nral;
 +
 +                nral = NRAL(ftype);
 +                for(ia=0; ia<molt->ilist[ftype].nr; ia+=1+nral)
 +                {
 +                    int a;
 +
 +                    for(a=0; a<nral; a++)
 +                    {
 +                        a_con[molt->ilist[ftype].iatoms[ia+1+a]] =
 +                            (ftype == F_SETTLE ? acSETTLE : acCONSTRAINT);
 +                    }
 +                }
 +            }
 +        }
 +
 +        for(m=0; m<(bId ? 1 : molb->nmol); m++)
 +        {
 +            cgm = m*cgs->nr;
 +            am  = m*cgs->index[cgs->nr];
 +            for(cg=0; cg<cgs->nr; cg++)
 +            {
 +                a0 = cgs->index[cg];
 +                a1 = cgs->index[cg+1];
 +
 +                /* Store the energy group in cginfo */
 +                gid = ggrpnr(&mtop->groups,egcENER,a_offset+am+a0);
 +                SET_CGINFO_GID(cginfo[cgm+cg],gid);
 +                
 +                /* Check the intra/inter charge group exclusions */
 +                if (a1-a0 > excl_nalloc) {
 +                    excl_nalloc = a1 - a0;
 +                    srenew(bExcl,excl_nalloc);
 +                }
 +                /* bExclIntraAll: all intra cg interactions excluded
 +                 * bExclInter:    any inter cg interactions excluded
 +                 */
 +                bExclIntraAll = TRUE;
 +                bExclInter    = FALSE;
 +                bHaveVDW      = FALSE;
 +                bHaveQ        = FALSE;
 +                for(ai=a0; ai<a1; ai++)
 +                {
 +                    /* Check VDW and electrostatic interactions */
 +                    bHaveVDW = bHaveVDW || (type_VDW[molt->atoms.atom[ai].type] ||
 +                                            type_VDW[molt->atoms.atom[ai].typeB]);
 +                    bHaveQ  = bHaveQ    || (molt->atoms.atom[ai].q != 0 ||
 +                                            molt->atoms.atom[ai].qB != 0);
 +
 +                    /* Clear the exclusion list for atom ai */
 +                    for(aj=a0; aj<a1; aj++)
 +                    {
 +                        bExcl[aj-a0] = FALSE;
 +                    }
 +                    /* Loop over all the exclusions of atom ai */
 +                    for(j=excl->index[ai]; j<excl->index[ai+1]; j++)
 +                    {
 +                        aj = excl->a[j];
 +                        if (aj < a0 || aj >= a1)
 +                        {
 +                            bExclInter = TRUE;
 +                        }
 +                        else
 +                        {
 +                            bExcl[aj-a0] = TRUE;
 +                        }
 +                    }
 +                    /* Check if ai excludes a0 to a1 */
 +                    for(aj=a0; aj<a1; aj++)
 +                    {
 +                        if (!bExcl[aj-a0])
 +                        {
 +                            bExclIntraAll = FALSE;
 +                        }
 +                    }
 +
 +                    switch (a_con[ai]) /* one constrained or settled atom is enough to flag the whole charge group */
 +                    {
 +                    case acCONSTRAINT:
 +                        SET_CGINFO_CONSTR(cginfo[cgm+cg]);
 +                        break;
 +                    case acSETTLE:
 +                        SET_CGINFO_SETTLE(cginfo[cgm+cg]);
 +                        break;
 +                    default:
 +                        break;
 +                    }
 +                }
 +                if (bExclIntraAll)
 +                {
 +                    SET_CGINFO_EXCL_INTRA(cginfo[cgm+cg]);
 +                }
 +                if (bExclInter)
 +                {
 +                    SET_CGINFO_EXCL_INTER(cginfo[cgm+cg]);
 +                }
 +                if (a1 - a0 > MAX_CHARGEGROUP_SIZE)
 +                {
 +                    /* The size in cginfo is currently only read with DD */
 +                    gmx_fatal(FARGS,"A charge group has size %d which is larger than the limit of %d atoms",a1-a0,MAX_CHARGEGROUP_SIZE);
 +                }
 +                if (bHaveVDW)
 +                {
 +                    SET_CGINFO_HAS_VDW(cginfo[cgm+cg]);
 +                }
 +                if (bHaveQ)
 +                {
 +                    SET_CGINFO_HAS_Q(cginfo[cgm+cg]);
 +                }
 +                /* Store the charge group size */
 +                SET_CGINFO_NATOMS(cginfo[cgm+cg],a1-a0);
 +
 +                if (!bExclIntraAll || bExclInter)
 +                {
 +                    *bExcl_IntraCGAll_InterCGNone = FALSE;
 +                }
 +            }
 +        }
 +
 +        sfree(a_con);
 +
 +        cg_offset += molb->nmol*cgs->nr;
 +        a_offset  += molb->nmol*cgs->index[cgs->nr];
 +    }
 +    sfree(bExcl);
 +    
 +    /* The solvent optimizer is called after the QM is initialized,
 +     * because we don't want the QM subsystem to become an
 +     * optimized solvent.
 +     */
 +
 +    check_solvent(fplog,mtop,fr,cginfo_mb);
 +    
 +    if (getenv("GMX_NO_SOLV_OPT"))
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Found environment variable GMX_NO_SOLV_OPT.\n"
 +                    "Disabling all solvent optimization\n");
 +        }
 +        fr->solvent_opt = esolNO;
 +    }
 +    if (bNoSolvOpt)
 +    {
 +        fr->solvent_opt = esolNO;
 +    }
 +    if (!fr->solvent_opt) /* NOTE(review): treats a zero solvent_opt as "no optimization"; presumably esolNO == 0 - confirm */
 +    {
 +        for(mb=0; mb<mtop->nmolblock; mb++)
 +        {
 +            for(cg=0; cg<cginfo_mb[mb].cg_mod; cg++)
 +            {
 +                SET_CGINFO_SOLOPT(cginfo_mb[mb].cginfo[cg],esolNO);
 +            }
 +        }
 +    }
 +    
 +    return cginfo_mb;
 +}
 +
 +static int *cginfo_expand(int nmb,cginfo_mb_t *cgi_mb)
 +{
 +    int ncg,mb,cg;
 +    int *cginfo;
 +
 +    ncg = cgi_mb[nmb-1].cg_end;
 +    snew(cginfo,ncg);
 +    mb = 0;
 +    for(cg=0; cg<ncg; cg++)
 +    {
 +        while (cg >= cgi_mb[mb].cg_end)
 +        {
 +            mb++;
 +        }
 +        cginfo[cg] =
 +            cgi_mb[mb].cginfo[(cg - cgi_mb[mb].cg_start) % cgi_mb[mb].cg_mod];
 +    }
 +
 +    return cginfo;
 +}
 +
 +static void set_chargesum(FILE *log,t_forcerec *fr,const gmx_mtop_t *mtop)
 +{
 +    double qsum,q2sum,q;
 +    int    mb,nmol,i;
 +    const t_atoms *atoms;
 +    
 +    qsum  = 0;
 +    q2sum = 0;
 +    for(mb=0; mb<mtop->nmolblock; mb++)
 +    {
 +        nmol  = mtop->molblock[mb].nmol;
 +        atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +        for(i=0; i<atoms->nr; i++)
 +        {
 +            q = atoms->atom[i].q;
 +            qsum  += nmol*q;
 +            q2sum += nmol*q*q;
 +        }
 +    }
 +    fr->qsum[0]  = qsum;
 +    fr->q2sum[0] = q2sum;
 +    if (fr->efep != efepNO)
 +    {
 +        qsum  = 0;
 +        q2sum = 0;
 +        for(mb=0; mb<mtop->nmolblock; mb++)
 +        {
 +            nmol  = mtop->molblock[mb].nmol;
 +            atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +            for(i=0; i<atoms->nr; i++)
 +            {
 +                q = atoms->atom[i].qB;
 +                qsum  += nmol*q;
 +                q2sum += nmol*q*q;
 +            }
 +            fr->qsum[1]  = qsum;
 +            fr->q2sum[1] = q2sum;
 +        }
 +    }
 +    else
 +    {
 +        fr->qsum[1]  = fr->qsum[0];
 +        fr->q2sum[1] = fr->q2sum[0];
 +    }
 +    if (log) {
 +        if (fr->efep == efepNO)
 +            fprintf(log,"System total charge: %.3f\n",fr->qsum[0]);
 +        else
 +            fprintf(log,"System total charge, top. A: %.3f top. B: %.3f\n",
 +                    fr->qsum[0],fr->qsum[1]);
 +    }
 +}
 +
 +void update_forcerec(FILE *log,t_forcerec *fr,matrix box)
 +{
 +    if (fr->eeltype == eelGRF)
 +    {
 +        calc_rffac(NULL,fr->eeltype,fr->epsilon_r,fr->epsilon_rf,
 +                   fr->rcoulomb,fr->temp,fr->zsquare,box,
 +                   &fr->kappa,&fr->k_rf,&fr->c_rf);
 +    }
 +}
 +
 +void set_avcsixtwelve(FILE *fplog,t_forcerec *fr,const gmx_mtop_t *mtop)
 +{
 +    const t_atoms *atoms,*atoms_tpi;
 +    const t_blocka *excl;
 +    int    mb,nmol,nmolc,i,j,tpi,tpj,j1,j2,k,n,nexcl,q;
 +#if (defined SIZEOF_LONG_LONG_INT) && (SIZEOF_LONG_LONG_INT >= 8)    
 +    long long int  npair,npair_ij,tmpi,tmpj;
 +#else
 +    double npair, npair_ij,tmpi,tmpj;
 +#endif
 +    double csix,ctwelve;
 +    int    ntp,*typecount;
 +    gmx_bool   bBHAM;
 +    real   *nbfp;
 +
 +    ntp = fr->ntype;
 +    bBHAM = fr->bBHAM;
 +    nbfp = fr->nbfp;
 +    
 +    for(q=0; q<(fr->efep==efepNO ? 1 : 2); q++) {
 +        csix = 0;
 +        ctwelve = 0;
 +        npair = 0;
 +        nexcl = 0;
 +        if (!fr->n_tpi) {
 +            /* Count the types so we avoid natoms^2 operations */
 +            snew(typecount,ntp);
 +            for(mb=0; mb<mtop->nmolblock; mb++) {
 +                nmol  = mtop->molblock[mb].nmol;
 +                atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +                for(i=0; i<atoms->nr; i++) {
 +                    if (q == 0)
 +                    {
 +                        tpi = atoms->atom[i].type;
 +                    }
 +                    else
 +                    {
 +                        tpi = atoms->atom[i].typeB;
 +                    }
 +                    typecount[tpi] += nmol;
 +                }
 +            }
 +            for(tpi=0; tpi<ntp; tpi++) {
 +                for(tpj=tpi; tpj<ntp; tpj++) {
 +                    tmpi = typecount[tpi];
 +                    tmpj = typecount[tpj];
 +                    if (tpi != tpj)
 +                    {
 +                        npair_ij = tmpi*tmpj;
 +                    }
 +                    else
 +                    {
 +                        npair_ij = tmpi*(tmpi - 1)/2;
 +                    }
 +                    if (bBHAM) {
 +                        /* nbfp now includes the 6.0 derivative prefactor */
 +                        csix    += npair_ij*BHAMC(nbfp,ntp,tpi,tpj)/6.0;
 +                    } else {
 +                        /* nbfp now includes the 6.0/12.0 derivative prefactors */
 +                        csix    += npair_ij*   C6(nbfp,ntp,tpi,tpj)/6.0;
 +                        ctwelve += npair_ij*  C12(nbfp,ntp,tpi,tpj)/12.0;
 +                    }
 +                    npair += npair_ij;
 +                }
 +            }
 +            sfree(typecount);
 +            /* Subtract the excluded pairs.
 +             * The main reason for substracting exclusions is that in some cases
 +             * some combinations might never occur and the parameters could have
 +             * any value. These unused values should not influence the dispersion
 +             * correction.
 +             */
 +            for(mb=0; mb<mtop->nmolblock; mb++) {
 +                nmol  = mtop->molblock[mb].nmol;
 +                atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +                excl  = &mtop->moltype[mtop->molblock[mb].type].excls;
 +                for(i=0; (i<atoms->nr); i++) {
 +                    if (q == 0)
 +                    {
 +                        tpi = atoms->atom[i].type;
 +                    }
 +                    else
 +                    {
 +                        tpi = atoms->atom[i].typeB;
 +                    }
 +                    j1  = excl->index[i];
 +                    j2  = excl->index[i+1];
 +                    for(j=j1; j<j2; j++) {
 +                        k = excl->a[j];
 +                        if (k > i)
 +                        {
 +                            if (q == 0)
 +                            {
 +                                tpj = atoms->atom[k].type;
 +                            }
 +                            else
 +                            {
 +                                tpj = atoms->atom[k].typeB;
 +                            }
 +                            if (bBHAM) {
 +                                /* nbfp now includes the 6.0 derivative prefactor */
 +                               csix -= nmol*BHAMC(nbfp,ntp,tpi,tpj)/6.0;
 +                            } else {
 +                                /* nbfp now includes the 6.0/12.0 derivative prefactors */
 +                                csix    -= nmol*C6 (nbfp,ntp,tpi,tpj)/6.0;
 +                                ctwelve -= nmol*C12(nbfp,ntp,tpi,tpj)/12.0;
 +                            }
 +                            nexcl += nmol;
 +                        }
 +                    }
 +                }
 +            }
 +        } else {
 +            /* Only correct for the interaction of the test particle
 +             * with the rest of the system.
 +             */
 +            atoms_tpi =
 +                &mtop->moltype[mtop->molblock[mtop->nmolblock-1].type].atoms;
 +
 +            npair = 0;
 +            for(mb=0; mb<mtop->nmolblock; mb++) {
 +                nmol  = mtop->molblock[mb].nmol;
 +                atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
 +                for(j=0; j<atoms->nr; j++) {
 +                    nmolc = nmol;
 +                    /* Remove the interaction of the test charge group
 +                     * with itself.
 +                     */
 +                    if (mb == mtop->nmolblock-1)
 +                    {
 +                        nmolc--;
 +                        
 +                        if (mb == 0 && nmol == 1)
 +                        {
 +                            gmx_fatal(FARGS,"Old format tpr with TPI, please generate a new tpr file");
 +                        }
 +                    }
 +                    if (q == 0)
 +                    {
 +                        tpj = atoms->atom[j].type;
 +                    }
 +                    else
 +                    {
 +                        tpj = atoms->atom[j].typeB;
 +                    }
 +                    for(i=0; i<fr->n_tpi; i++)
 +                    {
 +                        if (q == 0)
 +                        {
 +                            tpi = atoms_tpi->atom[i].type;
 +                        }
 +                        else
 +                        {
 +                            tpi = atoms_tpi->atom[i].typeB;
 +                        }
 +                        if (bBHAM)
 +                        {
 +                            /* nbfp now includes the 6.0 derivative prefactor */
 +                            csix    += nmolc*BHAMC(nbfp,ntp,tpi,tpj)/6.0;
 +                        }
 +                        else
 +                        {
 +                            /* nbfp now includes the 6.0/12.0 derivative prefactors */
 +                            csix    += nmolc*C6 (nbfp,ntp,tpi,tpj)/6.0;
 +                            ctwelve += nmolc*C12(nbfp,ntp,tpi,tpj)/12.0;
 +                        }
 +                        npair += nmolc;
 +                    }
 +                }
 +            }
 +        }
 +        if (npair - nexcl <= 0 && fplog) {
 +            fprintf(fplog,"\nWARNING: There are no atom pairs for dispersion correction\n\n");
 +            csix     = 0;
 +            ctwelve  = 0;
 +        } else {
 +            csix    /= npair - nexcl;
 +            ctwelve /= npair - nexcl;
 +        }
 +        if (debug) {
 +            fprintf(debug,"Counted %d exclusions\n",nexcl);
 +            fprintf(debug,"Average C6 parameter is: %10g\n",(double)csix);
 +            fprintf(debug,"Average C12 parameter is: %10g\n",(double)ctwelve);
 +        }
 +        fr->avcsix[q]    = csix;
 +        fr->avctwelve[q] = ctwelve;
 +    }
 +    if (fplog != NULL)
 +    {
 +        if (fr->eDispCorr == edispcAllEner ||
 +            fr->eDispCorr == edispcAllEnerPres)
 +        {
 +            fprintf(fplog,"Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
 +                    fr->avcsix[0],fr->avctwelve[0]);
 +        }
 +        else
 +        {
 +            fprintf(fplog,"Long Range LJ corr.: <C6> %10.4e\n",fr->avcsix[0]);
 +        }
 +    }
 +}
 +
 +
 +static void set_bham_b_max(FILE *fplog,t_forcerec *fr,
 +                           const gmx_mtop_t *mtop)
 +{
 +    const t_atoms *at1,*at2;
 +    int  mt1,mt2,i,j,tpi,tpj,ntypes;
 +    real b,bmin;
 +    real *nbfp;
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog,"Determining largest Buckingham b parameter for table\n");
 +    }
 +    nbfp   = fr->nbfp;
 +    ntypes = fr->ntype;
 +    
 +    bmin           = -1;
 +    fr->bham_b_max = 0;
 +    for(mt1=0; mt1<mtop->nmoltype; mt1++)
 +    {
 +        at1 = &mtop->moltype[mt1].atoms;
 +        for(i=0; (i<at1->nr); i++)
 +        {
 +            tpi = at1->atom[i].type;
 +            if (tpi >= ntypes)
 +                gmx_fatal(FARGS,"Atomtype[%d] = %d, maximum = %d",i,tpi,ntypes);
 +            
 +            for(mt2=mt1; mt2<mtop->nmoltype; mt2++)
 +            {
 +                at2 = &mtop->moltype[mt2].atoms;
 +                for(j=0; (j<at2->nr); j++) {
 +                    tpj = at2->atom[j].type;
 +                    if (tpj >= ntypes)
 +                    {
 +                        gmx_fatal(FARGS,"Atomtype[%d] = %d, maximum = %d",j,tpj,ntypes);
 +                    }
 +                    b = BHAMB(nbfp,ntypes,tpi,tpj);
 +                    if (b > fr->bham_b_max)
 +                    {
 +                        fr->bham_b_max = b;
 +                    }
 +                    if ((b < bmin) || (bmin==-1))
 +                    {
 +                        bmin = b;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +    if (fplog)
 +    {
 +        fprintf(fplog,"Buckingham b parameters, min: %g, max: %g\n",
 +                bmin,fr->bham_b_max);
 +    }
 +}
 +
 +static void make_nbf_tables(FILE *fp,const output_env_t oenv,
 +                            t_forcerec *fr,real rtab,
 +                            const t_commrec *cr,
 +                            const char *tabfn,char *eg1,char *eg2,
 +                            t_nblists *nbl)
 +{
 +    char buf[STRLEN];
 +    int i,j;
 +
 +    if (tabfn == NULL) {
 +        if (debug)
 +            fprintf(debug,"No table file name passed, can not read table, can not do non-bonded interactions\n");
 +        return;
 +    }
 +
 +    sprintf(buf,"%s",tabfn);
 +    if (eg1 && eg2)
 +    /* Append the two energy group names */
 +        sprintf(buf + strlen(tabfn) - strlen(ftp2ext(efXVG)) - 1,"_%s_%s.%s",
 +                eg1,eg2,ftp2ext(efXVG));
 +    nbl->table_elec_vdw = make_tables(fp,oenv,fr,MASTER(cr),buf,rtab,0);
 +    /* Copy the contents of the table to separate coulomb and LJ tables too,
 +     * to improve cache performance.
 +     */
 +    /* For performance reasons we want
 +     * the table data to be aligned to 16-byte. The pointers could be freed
 +     * but currently aren't.
 +     */
 +    nbl->table_elec.interaction = GMX_TABLE_INTERACTION_ELEC;
 +    nbl->table_elec.format = nbl->table_elec_vdw.format;
 +    nbl->table_elec.r = nbl->table_elec_vdw.r;
 +    nbl->table_elec.n = nbl->table_elec_vdw.n;
 +    nbl->table_elec.scale = nbl->table_elec_vdw.scale;
 +    nbl->table_elec.scale_exp = nbl->table_elec_vdw.scale_exp;
 +    nbl->table_elec.formatsize = nbl->table_elec_vdw.formatsize;
 +    nbl->table_elec.ninteractions = 1;
 +    nbl->table_elec.stride = nbl->table_elec.formatsize * nbl->table_elec.ninteractions;
 +    snew_aligned(nbl->table_elec.data,nbl->table_elec.stride*(nbl->table_elec.n+1),16);
 +
 +    nbl->table_vdw.interaction = GMX_TABLE_INTERACTION_VDWREP_VDWDISP;
 +    nbl->table_vdw.format = nbl->table_elec_vdw.format;
 +    nbl->table_vdw.r = nbl->table_elec_vdw.r;
 +    nbl->table_vdw.n = nbl->table_elec_vdw.n;
 +    nbl->table_vdw.scale = nbl->table_elec_vdw.scale;
 +    nbl->table_vdw.scale_exp = nbl->table_elec_vdw.scale_exp;
 +    nbl->table_vdw.formatsize = nbl->table_elec_vdw.formatsize;
 +    nbl->table_vdw.ninteractions = 2;
 +    nbl->table_vdw.stride = nbl->table_vdw.formatsize * nbl->table_vdw.ninteractions;
 +    snew_aligned(nbl->table_vdw.data,nbl->table_vdw.stride*(nbl->table_vdw.n+1),16);
 +
 +    for(i=0; i<=nbl->table_elec_vdw.n; i++)
 +    {
 +        for(j=0; j<4; j++)
 +            nbl->table_elec.data[4*i+j] = nbl->table_elec_vdw.data[12*i+j];
 +        for(j=0; j<8; j++)
 +            nbl->table_vdw.data[8*i+j] = nbl->table_elec_vdw.data[12*i+4+j];
 +    }
 +}
 +
 +static void count_tables(int ftype1,int ftype2,const gmx_mtop_t *mtop,
 +                         int *ncount,int **count)
 +{
 +    const gmx_moltype_t *molt;
 +    const t_ilist *il;
 +    int mt,ftype,stride,i,j,tabnr;
 +    
 +    for(mt=0; mt<mtop->nmoltype; mt++)
 +    {
 +        molt = &mtop->moltype[mt];
 +        for(ftype=0; ftype<F_NRE; ftype++)
 +        {
 +            if (ftype == ftype1 || ftype == ftype2) {
 +                il = &molt->ilist[ftype];
 +                stride = 1 + NRAL(ftype);
 +                for(i=0; i<il->nr; i+=stride) {
 +                    tabnr = mtop->ffparams.iparams[il->iatoms[i]].tab.table;
 +                    if (tabnr < 0)
 +                        gmx_fatal(FARGS,"A bonded table number is smaller than 0: %d\n",tabnr);
 +                    if (tabnr >= *ncount) {
 +                        srenew(*count,tabnr+1);
 +                        for(j=*ncount; j<tabnr+1; j++)
 +                            (*count)[j] = 0;
 +                        *ncount = tabnr+1;
 +                    }
 +                    (*count)[tabnr]++;
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +static bondedtable_t *make_bonded_tables(FILE *fplog,
 +                                         int ftype1,int ftype2,
 +                                         const gmx_mtop_t *mtop,
 +                                         const char *basefn,const char *tabext)
 +{
 +    int  i,ncount,*count;
 +    char tabfn[STRLEN];
 +    bondedtable_t *tab;
 +    
 +    tab = NULL;
 +    
 +    ncount = 0;
 +    count = NULL;
 +    count_tables(ftype1,ftype2,mtop,&ncount,&count);
 +    
 +    if (ncount > 0) {
 +        snew(tab,ncount);
 +        for(i=0; i<ncount; i++) {
 +            if (count[i] > 0) {
 +                sprintf(tabfn,"%s",basefn);
 +                sprintf(tabfn + strlen(basefn) - strlen(ftp2ext(efXVG)) - 1,"_%s%d.%s",
 +                        tabext,i,ftp2ext(efXVG));
 +                tab[i] = make_bonded_table(fplog,tabfn,NRAL(ftype1)-2);
 +            }
 +        }
 +        sfree(count);
 +    }
 +  
 +    return tab;
 +}
 +
 +void forcerec_set_ranges(t_forcerec *fr,
 +                         int ncg_home,int ncg_force,
 +                         int natoms_force,
 +                         int natoms_force_constr,int natoms_f_novirsum)
 +{
 +    fr->cg0 = 0;
 +    fr->hcg = ncg_home;
 +
 +    /* fr->ncg_force is unused in the standard code,
 +     * but it can be useful for modified code dealing with charge groups.
 +     */
 +    fr->ncg_force           = ncg_force;
 +    fr->natoms_force        = natoms_force;
 +    fr->natoms_force_constr = natoms_force_constr;
 +
 +    if (fr->natoms_force_constr > fr->nalloc_force)
 +    {
 +        fr->nalloc_force = over_alloc_dd(fr->natoms_force_constr);
 +
 +        if (fr->bTwinRange)
 +        {
 +            srenew(fr->f_twin,fr->nalloc_force);
 +        }
 +    }
 +
 +    if (fr->bF_NoVirSum)
 +    {
 +        fr->f_novirsum_n = natoms_f_novirsum;
 +        if (fr->f_novirsum_n > fr->f_novirsum_nalloc)
 +        {
 +            fr->f_novirsum_nalloc = over_alloc_dd(fr->f_novirsum_n);
 +            srenew(fr->f_novirsum_alloc,fr->f_novirsum_nalloc);
 +        }
 +    }
 +    else
 +    {
 +        fr->f_novirsum_n = 0;
 +    }
 +}
 +
 +static real cutoff_inf(real cutoff)
 +{
 +    if (cutoff == 0)
 +    {
 +        cutoff = GMX_CUTOFF_INF;
 +    }
 +
 +    return cutoff;
 +}
 +
 +static void make_adress_tf_tables(FILE *fp,const output_env_t oenv,
 +                            t_forcerec *fr,const t_inputrec *ir,
 +                          const char *tabfn, const gmx_mtop_t *mtop,
 +                            matrix     box)
 +{
 +  char buf[STRLEN];
 +  int i,j;
 +
 +  if (tabfn == NULL) {
 +        gmx_fatal(FARGS,"No thermoforce table file given. Use -tabletf to specify a file\n");
 +    return;
 +  }
 +
 +  snew(fr->atf_tabs, ir->adress->n_tf_grps);
 +
 +  for (i=0; i<ir->adress->n_tf_grps; i++){
 +    j = ir->adress->tf_table_index[i]; /* get energy group index */
 +    sprintf(buf + strlen(tabfn) - strlen(ftp2ext(efXVG)) - 1,"tf_%s.%s",
 +        *(mtop->groups.grpname[mtop->groups.grps[egcENER].nm_ind[j]]) ,ftp2ext(efXVG));
 +    printf("loading tf table for energygrp index %d from %s\n", ir->adress->tf_table_index[j], buf);
 +    fr->atf_tabs[i] = make_atf_table(fp,oenv,fr,buf, box);
 +  }
 +
 +}
 +
 +gmx_bool can_use_allvsall(const t_inputrec *ir, const gmx_mtop_t *mtop,
 +                      gmx_bool bPrintNote,t_commrec *cr,FILE *fp)
 +{
 +    gmx_bool bAllvsAll;
 +
 +    bAllvsAll =
 +        (
 +         ir->rlist==0            &&
 +         ir->rcoulomb==0         &&
 +         ir->rvdw==0             &&
 +         ir->ePBC==epbcNONE      &&
 +         ir->vdwtype==evdwCUT    &&
 +         ir->coulombtype==eelCUT &&
 +         ir->efep==efepNO        &&
 +         (ir->implicit_solvent == eisNO || 
 +          (ir->implicit_solvent==eisGBSA && (ir->gb_algorithm==egbSTILL || 
 +                                             ir->gb_algorithm==egbHCT   || 
 +                                             ir->gb_algorithm==egbOBC))) &&
 +         getenv("GMX_NO_ALLVSALL") == NULL
 +            );
 +    
 +    if (bAllvsAll && ir->opts.ngener > 1)
 +    {
 +        const char *note="NOTE: Can not use all-vs-all force loops, because there are multiple energy monitor groups; you might get significantly higher performance when using only a single energy monitor group.\n";
 +
 +        if (bPrintNote)
 +        {
 +            if (MASTER(cr))
 +            {
 +                fprintf(stderr,"\n%s\n",note);
 +            }
 +            if (fp != NULL)
 +            {
 +                fprintf(fp,"\n%s\n",note);
 +            }
 +        }
 +        bAllvsAll = FALSE;
 +    }
 +
 +    if(bAllvsAll && fp && MASTER(cr))
 +    {
 +        fprintf(fp,"\nUsing accelerated all-vs-all kernels.\n\n");
 +    }
 +    
 +    return bAllvsAll;
 +}
 +
 +
 +static void init_forcerec_f_threads(t_forcerec *fr,int nenergrp)
 +{
 +    int t,i;
 +
 +    /* These thread local data structures are used for bondeds only */
 +    fr->nthreads = gmx_omp_nthreads_get(emntBonded);
 +
 +    if (fr->nthreads > 1)
 +    {
 +        snew(fr->f_t,fr->nthreads);
 +        /* Thread 0 uses the global force and energy arrays */
 +        for(t=1; t<fr->nthreads; t++)
 +        {
 +            fr->f_t[t].f = NULL;
 +            fr->f_t[t].f_nalloc = 0;
 +            snew(fr->f_t[t].fshift,SHIFTS);
 +            fr->f_t[t].grpp.nener = nenergrp*nenergrp;
 +            for(i=0; i<egNR; i++)
 +            {
 +                snew(fr->f_t[t].grpp.ener[i],fr->f_t[t].grpp.nener);
 +            }
 +        }
 +    }
 +}
 +
 +
 +static void pick_nbnxn_kernel_cpu(FILE *fp,
 +                                  const t_commrec *cr,
 +                                  const gmx_cpuid_t cpuid_info,
 +                                  int *kernel_type,
 +                                  int *ewald_excl)
 +{
 +    *kernel_type = nbk4x4_PlainC;
 +    *ewald_excl  = ewaldexclTable;
 +
 +#ifdef GMX_X86_SSE2
 +    {
 +        /* On Intel Sandy-Bridge AVX-256 kernels are always faster.
 +         * On AMD Bulldozer AVX-256 is much slower than AVX-128.
 +         */
 +        if(gmx_cpuid_feature(cpuid_info, GMX_CPUID_FEATURE_X86_AVX) == 1 &&
 +           gmx_cpuid_vendor(cpuid_info) != GMX_CPUID_VENDOR_AMD)
 +        {
 +#ifdef GMX_X86_AVX_256
 +            *kernel_type = nbk4xN_X86_SIMD256;
 +#else
 +            *kernel_type = nbk4xN_X86_SIMD128;
 +#endif
 +        }
 +        else
 +        {
 +            *kernel_type = nbk4xN_X86_SIMD128;
 +        }
 +
 +        if (getenv("GMX_NBNXN_AVX128") != NULL)
 +        {
 +            *kernel_type = nbk4xN_X86_SIMD128;
 +        }
 +        if (getenv("GMX_NBNXN_AVX256") != NULL)
 +        {
 +#ifdef GMX_X86_AVX_256
 +            *kernel_type = nbk4xN_X86_SIMD256;
 +#else
 +            gmx_fatal(FARGS,"You requested AVX-256 nbnxn kernels, but GROMACS was built without AVX support");
 +#endif
 +        }
 +
 +        /* Analytical Ewald exclusion correction is only an option in the
 +         * x86 SIMD kernel. This is faster in single precision
 +         * on Bulldozer and slightly faster on Sandy Bridge.
 +         */
 +#if (defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256) && !defined GMX_DOUBLE
 +        *ewald_excl = ewaldexclAnalytical;
 +#endif
 +        if (getenv("GMX_NBNXN_EWALD_TABLE") != NULL)
 +        {
 +            *ewald_excl = ewaldexclTable;
 +        }
 +        if (getenv("GMX_NBNXN_EWALD_ANALYTICAL") != NULL)
 +        {
 +            *ewald_excl = ewaldexclAnalytical;
 +        }
 +
 +    }
 +#endif /* GMX_X86_SSE2 */
 +}
 +
 +
 +/* Note that _mm_... intrinsics can be converted to either SSE or AVX
 + * depending on compiler flags.
 + * For gcc we check for __AVX__
 + * At least a check for icc should be added (if there is a macro)
 + */
 +static const char *nbk_name[] =
 +  { "not set", "plain C 4x4",
 +#if !(defined GMX_X86_AVX_256 || defined GMX_X86_AVX128_FMA || defined __AVX__)
 +#ifndef GMX_X86_SSE4_1
 +#ifndef GMX_DOUBLE
 +    "SSE2 4x4",
 +#else
 +    "SSE2 4x2",
 +#endif
 +#else
 +#ifndef GMX_DOUBLE
 +    "SSE4.1 4x4",
 +#else
 +    "SSE4.1 4x2",
 +#endif
 +#endif
 +#else
 +#ifndef GMX_DOUBLE
 +    "AVX-128 4x4",
 +#else
 +    "AVX-128 4x2",
 +#endif
 +#endif
 +#ifndef GMX_DOUBLE
 +    "AVX-256 4x8",
 +#else
 +    "AVX-256 4x4",
 +#endif
 +    "CUDA 8x8x8", "plain C 8x8x8" };
 +
 +static void pick_nbnxn_kernel(FILE *fp,
 +                              const t_commrec *cr,
 +                              const gmx_hw_info_t *hwinfo,
 +                              gmx_bool use_cpu_acceleration,
 +                              gmx_bool *bUseGPU,
 +                              int *kernel_type,
 +                              int *ewald_excl,
 +                              gmx_bool bDoNonbonded)
 +{
 +    gmx_bool bEmulateGPU, bGPU, bEmulateGPUEnvVarSet;
 +    char gpu_err_str[STRLEN];
 +
 +    assert(kernel_type);
 +
 +    *kernel_type = nbkNotSet;
 +    *ewald_excl  = ewaldexclTable;
 +
 +    bEmulateGPUEnvVarSet = (getenv("GMX_EMULATE_GPU") != NULL);
 +
 +    /* if bUseGPU == NULL we don't want a GPU (e.g. hybrid mode kernel selection) */
 +    bGPU = ((bUseGPU != NULL) && hwinfo->bCanUseGPU);
 +
 +    /* Run GPU emulation mode if GMX_EMULATE_GPU is defined. We will
 +     * automatically switch to emulation if non-bonded calculations are
 +     * turned off via GMX_NO_NONBONDED - this is the simple and elegant
 +     * way to turn off GPU initialization, data movement, and cleanup. */
 +    bEmulateGPU = (bEmulateGPUEnvVarSet || (!bDoNonbonded && bGPU));
 +
 +    /* Enable GPU mode when GPUs are available or GPU emulation is requested.
 +     * The latter is useful to assess the performance one can expect by adding
 +     * GPU(s) to the machine. The conditional below allows this even if mdrun
 +     * is compiled without GPU acceleration support.
 +     * Note that such a GPU acceleration performance assessment should be
 +     * carried out by setting the GMX_EMULATE_GPU and GMX_NO_NONBONDED env. vars
 +     * (and freezing the system as otherwise it would explode). */
 +    if (bGPU || bEmulateGPUEnvVarSet)
 +    {
 +        if (bEmulateGPU)
 +        {
 +            bGPU = FALSE;
 +        }
 +        else
 +        {
 +            /* Each PP node will use the intra-node id-th device from the
 +             * list of detected/selected GPUs. */
-                           get_gpu_device_id(&hwinfo->gpu_info, cr->nodeid_group_intra),
++            if (!init_gpu(cr->rank_pp_intranode, gpu_err_str, &hwinfo->gpu_info))
 +            {
 +                /* At this point the init should never fail as we made sure that
 +                 * we have all the GPUs we need. If it still does, we'll bail. */
 +                gmx_fatal(FARGS, "On node %d failed to initialize GPU #%d: %s",
 +                          cr->nodeid,
-                         &fr->hwinfo->gpu_info, cr->nodeid_group_intra,
++                          get_gpu_device_id(&hwinfo->gpu_info, cr->rank_pp_intranode),
 +                          gpu_err_str);
 +            }
 +        }
 +        *bUseGPU = bGPU;
 +    }
 +
 +    if (bEmulateGPU)
 +    {
 +        *kernel_type = nbk8x8x8_PlainC;
 +
 +        if (bDoNonbonded)
 +        {
 +            md_print_warn(cr, fp, "Emulating a GPU run on the CPU (slow)");
 +        }
 +    }
 +    else if (bGPU)
 +    {
 +        *kernel_type = nbk8x8x8_CUDA;
 +    }
 +
 +    if (*kernel_type == nbkNotSet)
 +    {
 +        if (use_cpu_acceleration)
 +        {
 +            pick_nbnxn_kernel_cpu(fp,cr,hwinfo->cpuid_info,
 +                                  kernel_type,ewald_excl);
 +        }
 +        else
 +        {
 +            *kernel_type = nbk4x4_PlainC;
 +        }
 +    }
 +
 +    if (bDoNonbonded && fp != NULL)
 +    {
 +        if (MASTER(cr))
 +        {
 +            fprintf(stderr,"Using %s non-bonded kernels\n",
 +                    nbk_name[*kernel_type]);
 +        }
 +        fprintf(fp,"\nUsing %s non-bonded kernels\n\n",
 +                nbk_name[*kernel_type]);
 +    }
 +}
 +
 +gmx_bool uses_simple_tables(int cutoff_scheme,
 +                            nonbonded_verlet_t *nbv,
 +                            int group)
 +{
 +    gmx_bool bUsesSimpleTables = TRUE;
 +    int grp_index;
 +
 +    switch(cutoff_scheme)
 +    {
 +    case ecutsGROUP:
 +        bUsesSimpleTables = TRUE;
 +        break;
 +    case ecutsVERLET:
 +        assert(NULL != nbv && NULL != nbv->grp);
 +        grp_index = (group < 0) ? 0 : (nbv->ngrp - 1);
 +        bUsesSimpleTables = nbnxn_kernel_pairlist_simple(nbv->grp[grp_index].kernel_type);
 +        break;
 +    default:
 +        gmx_incons("unimplemented");
 +    }
 +    return bUsesSimpleTables;
 +}
 +
 +static void init_ewald_f_table(interaction_const_t *ic,
 +                               gmx_bool bUsesSimpleTables,
 +                               real rtab)
 +{
 +    real maxr;
 +
 +    if (bUsesSimpleTables)
 +    {
 +        /* With a spacing of 0.0005 we are at the force summation accuracy
 +         * for the SSE kernels for "normal" atomistic simulations.
 +         */
 +        ic->tabq_scale = ewald_spline3_table_scale(ic->ewaldcoeff,
 +                                                   ic->rcoulomb);
 +        
 +        maxr = (rtab>ic->rcoulomb) ? rtab : ic->rcoulomb;
 +        ic->tabq_size  = (int)(maxr*ic->tabq_scale) + 2;
 +    }
 +    else
 +    {
 +        ic->tabq_size = GPU_EWALD_COULOMB_FORCE_TABLE_SIZE;
 +        /* Subtract 2 iso 1 to avoid access out of range due to rounding */
 +        ic->tabq_scale = (ic->tabq_size - 2)/ic->rcoulomb;
 +    }
 +
 +    sfree_aligned(ic->tabq_coul_FDV0);
 +    sfree_aligned(ic->tabq_coul_F);
 +    sfree_aligned(ic->tabq_coul_V);
 +
 +    /* Create the original table data in FDV0 */
 +    snew_aligned(ic->tabq_coul_FDV0,ic->tabq_size*4,16);
 +    snew_aligned(ic->tabq_coul_F,ic->tabq_size,16);
 +    snew_aligned(ic->tabq_coul_V,ic->tabq_size,16);
 +    table_spline3_fill_ewald_lr(ic->tabq_coul_F,ic->tabq_coul_V,ic->tabq_coul_FDV0,
 +                                ic->tabq_size,1/ic->tabq_scale,ic->ewaldcoeff);
 +}
 +
 +void init_interaction_const_tables(FILE *fp, 
 +                                   interaction_const_t *ic,
 +                                   gmx_bool bUsesSimpleTables,
 +                                   real rtab)
 +{
 +    real spacing;
 +
 +    if (ic->eeltype == eelEWALD || EEL_PME(ic->eeltype))
 +    {
 +        init_ewald_f_table(ic,bUsesSimpleTables,rtab);
 +
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,"Initialized non-bonded Ewald correction tables, spacing: %.2e size: %d\n\n",
 +                    1/ic->tabq_scale,ic->tabq_size);
 +        }
 +    }
 +}
 +
 +void init_interaction_const(FILE *fp, 
 +                            interaction_const_t **interaction_const,
 +                            const t_forcerec *fr,
 +                            real  rtab)
 +{
 +    interaction_const_t *ic;
 +    gmx_bool bUsesSimpleTables = TRUE;
 +
 +    snew(ic, 1);
 +
 +    /* Just allocate something so we can free it */
 +    snew_aligned(ic->tabq_coul_FDV0,16,16);
 +    snew_aligned(ic->tabq_coul_F,16,16);
 +    snew_aligned(ic->tabq_coul_V,16,16);
 +
 +    ic->rlist       = fr->rlist;
 +    ic->rlistlong   = fr->rlistlong;
 +    
 +    /* Lennard-Jones */
 +    ic->rvdw        = fr->rvdw;
 +    if (fr->vdw_modifier==eintmodPOTSHIFT)
 +    {
 +        ic->sh_invrc6 = pow(ic->rvdw,-6.0);
 +    }
 +    else
 +    {
 +        ic->sh_invrc6 = 0;
 +    }
 +
 +    /* Electrostatics */
 +    ic->eeltype     = fr->eeltype;
 +    ic->rcoulomb    = fr->rcoulomb;
 +    ic->epsilon_r   = fr->epsilon_r;
 +    ic->epsfac      = fr->epsfac;
 +
 +    /* Ewald */
 +    ic->ewaldcoeff  = fr->ewaldcoeff;
 +    if (fr->coulomb_modifier==eintmodPOTSHIFT)
 +    {
 +        ic->sh_ewald = gmx_erfc(ic->ewaldcoeff*ic->rcoulomb);
 +    }
 +    else
 +    {
 +        ic->sh_ewald = 0;
 +    }
 +
 +    /* Reaction-field */
 +    if (EEL_RF(ic->eeltype))
 +    {
 +        ic->epsilon_rf = fr->epsilon_rf;
 +        ic->k_rf       = fr->k_rf;
 +        ic->c_rf       = fr->c_rf;
 +    }
 +    else
 +    {
 +        /* For plain cut-off we might use the reaction-field kernels */
 +        ic->epsilon_rf = ic->epsilon_r;
 +        ic->k_rf       = 0;
 +        if (fr->coulomb_modifier==eintmodPOTSHIFT)
 +        {
 +            ic->c_rf   = 1/ic->rcoulomb;
 +        }
 +        else
 +        {
 +            ic->c_rf   = 0;
 +        }
 +    }
 +
 +    if (fp != NULL)
 +    {
 +        fprintf(fp,"Potential shift: LJ r^-12: %.3f r^-6 %.3f",
 +                sqr(ic->sh_invrc6),ic->sh_invrc6);
 +        if (ic->eeltype == eelCUT)
 +        {
 +            fprintf(fp,", Coulomb %.3f",ic->c_rf);
 +        }
 +        else if (EEL_PME(ic->eeltype))
 +        {
 +            fprintf(fp,", Ewald %.3e",ic->sh_ewald);
 +        }
 +        fprintf(fp,"\n");
 +    }
 +
 +    *interaction_const = ic;
 +
 +    if (fr->nbv != NULL && fr->nbv->bUseGPU)
 +    {
 +        nbnxn_cuda_init_const(fr->nbv->cu_nbv, ic, fr->nbv);
 +    }
 +
 +    bUsesSimpleTables = uses_simple_tables(fr->cutoff_scheme, fr->nbv, -1);
 +    init_interaction_const_tables(fp,ic,bUsesSimpleTables,rtab);
 +}
 +
 +static void init_nb_verlet(FILE *fp,
 +                           nonbonded_verlet_t **nb_verlet,
 +                           const t_inputrec *ir,
 +                           const t_forcerec *fr,
 +                           const t_commrec *cr,
 +                           const char *nbpu_opt)
 +{ /* Allocate a nonbonded_verlet_t, choose a kernel for each interaction
 +   * group, initialize the GPU data when a GPU is used, and set up the
 +   * pair search, pair-list sets and atom data. The result is returned
 +   * through *nb_verlet. nbpu_opt equal to "gpu_cpu" requests a separately
 +   * picked CPU kernel for the non-local group.
 +   */
 +    nonbonded_verlet_t *nbv;
 +    int  i;
 +    char *env;
 +    gmx_bool bHybridGPURun = FALSE;
 +
 +    nbnxn_alloc_t *nb_alloc;
 +    nbnxn_free_t  *nb_free;
 +
 +    snew(nbv, 1);
 +
 +    nbv->nbs = NULL;
 +
 +    nbv->ngrp = (DOMAINDECOMP(cr) ? 2 : 1); /* local + non-local with domain decomposition, else local only */
 +    for(i=0; i<nbv->ngrp; i++)
 +    {
 +        nbv->grp[i].nbl_lists.nnbl = 0;
 +        nbv->grp[i].nbat           = NULL;
 +        nbv->grp[i].kernel_type    = nbkNotSet;
 +
 +        if (i == 0) /* local */
 +        {
 +            pick_nbnxn_kernel(fp, cr, fr->hwinfo, fr->use_cpu_acceleration,
 +                              &nbv->bUseGPU,
 +                              &nbv->grp[i].kernel_type,
 +                              &nbv->grp[i].ewald_excl,
 +                              fr->bNonbonded);
 +        }
 +        else /* non-local */
 +        {
 +            if (nbpu_opt != NULL && strcmp(nbpu_opt,"gpu_cpu") == 0)
 +            {
 +                /* Use GPU for local, select a CPU kernel for non-local */
 +                pick_nbnxn_kernel(fp, cr, fr->hwinfo, fr->use_cpu_acceleration,
 +                                  NULL, /* NULL: no GPU wanted for this group */
 +                                  &nbv->grp[i].kernel_type,
 +                                  &nbv->grp[i].ewald_excl,
 +                                  fr->bNonbonded);
 +
 +                bHybridGPURun = TRUE;
 +            }
 +            else
 +            {
 +                /* Use the same kernel for local and non-local interactions */
 +                nbv->grp[i].kernel_type = nbv->grp[0].kernel_type;
 +                nbv->grp[i].ewald_excl  = nbv->grp[0].ewald_excl;
 +            }
 +        }
 +    }
 +
 +    if (nbv->bUseGPU)
 +    {
 +        /* init the NxN GPU data; the last argument tells whether we'll have
 +         * both local and non-local NB calculation on GPU */
 +        nbnxn_cuda_init(fp, &nbv->cu_nbv,
++                        &fr->hwinfo->gpu_info, cr->rank_pp_intranode,
 +                        (nbv->ngrp > 1) && !bHybridGPURun);
 +
 +        if ((env = getenv("GMX_NB_MIN_CI")) != NULL) /* user override of the list balancing parameter */
 +        {
 +            char *end;
 +
 +            nbv->min_ci_balanced = strtol(env, &end, 10);
 +            if (!end || (*end != 0) || nbv->min_ci_balanced <= 0) /* reject trailing characters and non-positive values */
 +            {
 +                gmx_fatal(FARGS, "Invalid value passed in GMX_NB_MIN_CI=%s, positive integer required", env);
 +            }
 +
 +            if (debug)
 +            {
 +                fprintf(debug, "Neighbor-list balancing parameter: %d (passed as env. var.)\n", 
 +                        nbv->min_ci_balanced);
 +            }
 +        }
 +        else
 +        {
 +            nbv->min_ci_balanced = nbnxn_cuda_min_ci_balanced(nbv->cu_nbv);
 +            if (debug)
 +            {
 +                fprintf(debug, "Neighbor-list balancing parameter: %d (auto-adjusted to the number of GPU multi-processors)\n",
 +                        nbv->min_ci_balanced);
 +            }
 +        }
 +    }
 +    else
 +    {
 +        nbv->min_ci_balanced = 0;
 +    }
 +
 +    *nb_verlet = nbv;
 +
 +    nbnxn_init_search(&nbv->nbs,
 +                      DOMAINDECOMP(cr) ? & cr->dd->nc : NULL,
 +                      DOMAINDECOMP(cr) ? domdec_zones(cr->dd) : NULL,
 +                      gmx_omp_nthreads_get(emntNonbonded));
 +
 +    for(i=0; i<nbv->ngrp; i++)
 +    {
 +        if (nbv->grp[0].kernel_type == nbk8x8x8_CUDA) /* NOTE(review): tests group 0, not group i, so every group gets the same allocator pair - confirm this is intended */
 +        {
 +            nb_alloc = &pmalloc;
 +            nb_free  = &pfree;
 +        }
 +        else
 +        {
 +            nb_alloc = NULL;
 +            nb_free  = NULL;
 +        }
 +
 +        nbnxn_init_pairlist_set(&nbv->grp[i].nbl_lists,
 +                                nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type),
 +                                /* 8x8x8 "non-simple" lists are ATM always combined */
 +                                !nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type),
 +                                nb_alloc, nb_free);
 +
 +        if (i == 0 ||
 +            nbv->grp[0].kernel_type != nbv->grp[i].kernel_type)
 +        { /* the first group, or a group with a different kernel, gets its own atom data */
 +            snew(nbv->grp[i].nbat,1);
 +            nbnxn_atomdata_init(fp,
 +                                nbv->grp[i].nbat,
 +                                nbv->grp[i].kernel_type,
 +                                fr->ntype,fr->nbfp,
 +                                ir->opts.ngener,
 +                                nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type) ? gmx_omp_nthreads_get(emntNonbonded) : 1,
 +                                nb_alloc, nb_free);
 +        }
 +        else
 +        { /* same kernel as group 0: share its atom data */
 +            nbv->grp[i].nbat = nbv->grp[0].nbat;
 +        }
 +    }
 +}
 +
 +void init_forcerec(FILE *fp,
 +                   const output_env_t oenv,
 +                   t_forcerec *fr,
 +                   t_fcdata   *fcd,
 +                   const t_inputrec *ir,
 +                   const gmx_mtop_t *mtop,
 +                   const t_commrec  *cr,
 +                   matrix     box,
 +                   gmx_bool       bMolEpot,
 +                   const char *tabfn,
 +                   const char *tabafn,
 +                   const char *tabpfn,
 +                   const char *tabbfn,
 +                   const char *nbpu_opt,
 +                   gmx_bool   bNoSolvOpt,
 +                   real       print_force)
 +{
 +    int     i,j,m,natoms,ngrp,negp_pp,negptable,egi,egj;
 +    real    rtab;
 +    char    *env;
 +    double  dbl;
 +    rvec    box_size;
 +    const t_block *cgs;
 +    gmx_bool    bGenericKernelOnly;
 +    gmx_bool    bTab,bSep14tab,bNormalnblists;
 +    t_nblists *nbl;
 +    int     *nm_ind,egp_flags;
 +    
 +    /* By default we turn acceleration on, but it might be turned off further down... */
 +    fr->use_cpu_acceleration = TRUE;
 +
 +    fr->bDomDec = DOMAINDECOMP(cr);
 +
 +    natoms = mtop->natoms;
 +
 +    if (check_box(ir->ePBC,box))
 +    {
 +        gmx_fatal(FARGS,check_box(ir->ePBC,box));
 +    }
 +    
 +    /* Test particle insertion ? */
 +    if (EI_TPI(ir->eI)) {
 +        /* Set to the size of the molecule to be inserted (the last one) */
 +        /* Because of old style topologies, we have to use the last cg
 +         * instead of the last molecule type.
 +         */
 +        cgs = &mtop->moltype[mtop->molblock[mtop->nmolblock-1].type].cgs;
 +        fr->n_tpi = cgs->index[cgs->nr] - cgs->index[cgs->nr-1];
 +        if (fr->n_tpi != mtop->mols.index[mtop->mols.nr] - mtop->mols.index[mtop->mols.nr-1]) {
 +            gmx_fatal(FARGS,"The molecule to insert can not consist of multiple charge groups.\nMake it a single charge group.");
 +        }
 +    } else {
 +        fr->n_tpi = 0;
 +    }
 +    
 +    /* Copy AdResS parameters */
 +    if (ir->bAdress) {
 +      fr->adress_type     = ir->adress->type;
 +      fr->adress_const_wf = ir->adress->const_wf;
 +      fr->adress_ex_width = ir->adress->ex_width;
 +      fr->adress_hy_width = ir->adress->hy_width;
 +      fr->adress_icor     = ir->adress->icor;
 +      fr->adress_site     = ir->adress->site;
 +      fr->adress_ex_forcecap = ir->adress->ex_forcecap;
 +      fr->adress_do_hybridpairs = ir->adress->do_hybridpairs;
 +
 +
 +      snew(fr->adress_group_explicit , ir->adress->n_energy_grps);
 +      for (i=0; i< ir->adress->n_energy_grps; i++){
 +          fr->adress_group_explicit[i]= ir->adress->group_explicit[i];
 +      }
 +
 +      fr->n_adress_tf_grps = ir->adress->n_tf_grps;
 +      snew(fr->adress_tf_table_index, fr->n_adress_tf_grps);
 +      for (i=0; i< fr->n_adress_tf_grps; i++){
 +          fr->adress_tf_table_index[i]= ir->adress->tf_table_index[i];
 +      }
 +      copy_rvec(ir->adress->refs,fr->adress_refs);
 +    } else {
 +      fr->adress_type = eAdressOff;
 +      fr->adress_do_hybridpairs = FALSE;
 +    }
 +    
 +    /* Copy the user determined parameters */
 +    fr->userint1 = ir->userint1;
 +    fr->userint2 = ir->userint2;
 +    fr->userint3 = ir->userint3;
 +    fr->userint4 = ir->userint4;
 +    fr->userreal1 = ir->userreal1;
 +    fr->userreal2 = ir->userreal2;
 +    fr->userreal3 = ir->userreal3;
 +    fr->userreal4 = ir->userreal4;
 +    
 +    /* Shell stuff */
 +    fr->fc_stepsize = ir->fc_stepsize;
 +    
 +    /* Free energy */
 +    fr->efep       = ir->efep;
 +    fr->sc_alphavdw = ir->fepvals->sc_alpha;
 +    if (ir->fepvals->bScCoul)
 +    {
 +        fr->sc_alphacoul = ir->fepvals->sc_alpha;
 +        fr->sc_sigma6_min = pow(ir->fepvals->sc_sigma_min,6);
 +    }
 +    else
 +    {
 +        fr->sc_alphacoul = 0;
 +        fr->sc_sigma6_min = 0; /* only needed when bScCoul is on */
 +    }
 +    fr->sc_power   = ir->fepvals->sc_power;
 +    fr->sc_r_power   = ir->fepvals->sc_r_power;
 +    fr->sc_sigma6_def = pow(ir->fepvals->sc_sigma,6);
 +
 +    env = getenv("GMX_SCSIGMA_MIN");
 +    if (env != NULL)
 +    {
 +        dbl = 0;
 +        sscanf(env,"%lf",&dbl);
 +        fr->sc_sigma6_min = pow(dbl,6);
 +        if (fp)
 +        {
 +            fprintf(fp,"Setting the minimum soft core sigma to %g nm\n",dbl);
 +        }
 +    }
 +
 +    fr->bNonbonded = TRUE;
 +    if (getenv("GMX_NO_NONBONDED") != NULL)
 +    {
 +        /* turn off non-bonded calculations */
 +        fr->bNonbonded = FALSE;
 +        md_print_warn(cr,fp,
 +                      "Found environment variable GMX_NO_NONBONDED.\n"
 +                      "Disabling nonbonded calculations.\n");
 +    }
 +
 +    bGenericKernelOnly = FALSE;
 +
 +    /* We now check in the NS code whether a particular combination of interactions
 +     * can be used with water optimization, and disable it if that is not the case.
 +     */
 +
 +    if (getenv("GMX_NB_GENERIC") != NULL)
 +    {
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,
 +                    "Found environment variable GMX_NB_GENERIC.\n"
 +                    "Disabling all interaction-specific nonbonded kernels, will only\n"
 +                    "use the slow generic ones in src/gmxlib/nonbonded/nb_generic.c\n\n");
 +        }
 +        bGenericKernelOnly = TRUE;
 +    }
 +
 +    if (bGenericKernelOnly==TRUE)
 +    {
 +        bNoSolvOpt         = TRUE;
 +    }
 +
 +    if( (getenv("GMX_DISABLE_CPU_ACCELERATION") != NULL) || (getenv("GMX_NOOPTIMIZEDKERNELS") != NULL) )
 +    {
 +        fr->use_cpu_acceleration = FALSE;
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,
 +                    "\nFound environment variable GMX_DISABLE_CPU_ACCELERATION.\n"
 +                    "Disabling all CPU architecture-specific (e.g. SSE2/SSE4/AVX) routines.\n\n");
 +        }
 +    }
 +
 +    fr->bBHAM = (mtop->ffparams.functype[0] == F_BHAM);
 +
 +    /* Check if we can/should do all-vs-all kernels */
 +    fr->bAllvsAll       = can_use_allvsall(ir,mtop,FALSE,NULL,NULL);
 +    fr->AllvsAll_work   = NULL;
 +    fr->AllvsAll_workgb = NULL;
 +
 +
 +    /* Neighbour searching stuff */
 +    fr->cutoff_scheme = ir->cutoff_scheme;
 +    fr->bGrid         = (ir->ns_type == ensGRID);
 +    fr->ePBC          = ir->ePBC;
 +
 +    /* Determine if we will do PBC for distances in bonded interactions */
 +    if (fr->ePBC == epbcNONE)
 +    {
 +        fr->bMolPBC = FALSE;
 +    }
 +    else
 +    {
 +        if (!DOMAINDECOMP(cr))
 +        {
 +            /* The group cut-off scheme and SHAKE assume charge groups
 +             * are whole, but not using molpbc is faster in most cases.
 +             */
 +            if (fr->cutoff_scheme == ecutsGROUP ||
 +                (ir->eConstrAlg == econtSHAKE &&
 +                 (gmx_mtop_ftype_count(mtop,F_CONSTR) > 0 ||
 +                  gmx_mtop_ftype_count(mtop,F_CONSTRNC) > 0)))
 +            {
 +                fr->bMolPBC = ir->bPeriodicMols;
 +            }
 +            else
 +            {
 +                fr->bMolPBC = TRUE;
 +                if (getenv("GMX_USE_GRAPH") != NULL)
 +                {
 +                    fr->bMolPBC = FALSE;
 +                    if (fp)
 +                    {
 +                        fprintf(fp,"\nGMX_MOLPBC is set, using the graph for bonded interactions\n\n");
 +                    }
 +                }
 +            }
 +        }
 +        else
 +        {
 +            fr->bMolPBC = dd_bonded_molpbc(cr->dd,fr->ePBC);
 +        }
 +    }
 +
 +    fr->rc_scaling = ir->refcoord_scaling;
 +    copy_rvec(ir->posres_com,fr->posres_com);
 +    copy_rvec(ir->posres_comB,fr->posres_comB);
 +    fr->rlist      = cutoff_inf(ir->rlist);
 +    fr->rlistlong  = cutoff_inf(ir->rlistlong);
 +    fr->eeltype    = ir->coulombtype;
 +    fr->vdwtype    = ir->vdwtype;
 +
 +    fr->coulomb_modifier = ir->coulomb_modifier;
 +    fr->vdw_modifier     = ir->vdw_modifier;
 +
 +    /* Electrostatics: Translate from interaction-setting-in-mdp-file to kernel interaction format */
 +    switch(fr->eeltype)
 +    {
 +        case eelCUT:
 +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_COULOMB;
 +            break;
 +
 +        case eelRF:
 +        case eelGRF:
 +        case eelRF_NEC:
 +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_REACTIONFIELD;
 +            break;
 +
 +        case eelRF_ZERO:
 +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_REACTIONFIELD;
 +            fr->coulomb_modifier          = eintmodEXACTCUTOFF;
 +            break;
 +
 +        case eelSWITCH:
 +        case eelSHIFT:
 +        case eelUSER:
 +        case eelENCADSHIFT:
 +        case eelPMESWITCH:
 +        case eelPMEUSER:
 +        case eelPMEUSERSWITCH:
 +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_CUBICSPLINETABLE;
 +            break;
 +
 +        case eelPME:
 +        case eelEWALD:
 +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_EWALD;
 +            break;
 +
 +        default:
 +            gmx_fatal(FARGS,"Unsupported electrostatic interaction: %s",eel_names[fr->eeltype]);
 +            break;
 +    }
 +
 +    /* Vdw: Translate from mdp settings to kernel format */
 +    switch(fr->vdwtype)
 +    {
 +        case evdwCUT:
 +            if(fr->bBHAM)
 +            {
 +                fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_BUCKINGHAM;
 +            }
 +            else
 +            {
 +                fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_LENNARDJONES;
 +            }
 +            break;
 +
 +        case evdwSWITCH:
 +        case evdwSHIFT:
 +        case evdwUSER:
 +        case evdwENCADSHIFT:
 +            fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_CUBICSPLINETABLE;
 +            break;
 +
 +        default:
 +            gmx_fatal(FARGS,"Unsupported vdw interaction: %s",evdw_names[fr->vdwtype]);
 +            break;
 +    }
 +
 +    /* These start out identical to ir, but might be altered if we e.g. tabulate the interaction in the kernel */
 +    fr->nbkernel_elec_modifier    = fr->coulomb_modifier;
 +    fr->nbkernel_vdw_modifier     = fr->vdw_modifier;
 +
 +    fr->bTwinRange = fr->rlistlong > fr->rlist;
 +    fr->bEwald     = (EEL_PME(fr->eeltype) || fr->eeltype==eelEWALD);
 +    
 +    fr->reppow     = mtop->ffparams.reppow;
 +
 +    if (ir->cutoff_scheme == ecutsGROUP)
 +    {
 +        fr->bvdwtab    = (fr->vdwtype != evdwCUT ||
 +                          !gmx_within_tol(fr->reppow,12.0,10*GMX_DOUBLE_EPS));
 +        /* We have special kernels for standard Ewald and PME, but the pme-switch ones are tabulated above */
 +        fr->bcoultab   = !(fr->eeltype == eelCUT ||
 +                           fr->eeltype == eelEWALD ||
 +                           fr->eeltype == eelPME ||
 +                           fr->eeltype == eelRF ||
 +                           fr->eeltype == eelRF_ZERO);
 +
 +        /* If the user absolutely wants different switch/shift settings for coul/vdw, it is likely
 +         * going to be faster to tabulate the interaction than calling the generic kernel.
 +         */
 +        if(fr->nbkernel_elec_modifier==eintmodPOTSWITCH && fr->nbkernel_vdw_modifier==eintmodPOTSWITCH)
 +        {
 +            if((fr->rcoulomb_switch != fr->rvdw_switch) || (fr->rcoulomb != fr->rvdw))
 +            {
 +                fr->bcoultab = TRUE;
 +            }
 +        }
 +        else if((fr->nbkernel_elec_modifier==eintmodPOTSHIFT && fr->nbkernel_vdw_modifier==eintmodPOTSHIFT) ||
 +                ((fr->nbkernel_elec_interaction == GMX_NBKERNEL_ELEC_REACTIONFIELD &&
 +                  fr->nbkernel_elec_modifier==eintmodEXACTCUTOFF &&
 +                  (fr->nbkernel_vdw_modifier==eintmodPOTSWITCH || fr->nbkernel_vdw_modifier==eintmodPOTSHIFT))))
 +        {
 +            if(fr->rcoulomb != fr->rvdw)
 +            {
 +                fr->bcoultab = TRUE;
 +            }
 +        }
 +
 +        if (getenv("GMX_REQUIRE_TABLES"))
 +        {
 +            fr->bvdwtab  = TRUE;
 +            fr->bcoultab = TRUE;
 +        }
 +
 +        if (fp)
 +        {
 +            fprintf(fp,"Table routines are used for coulomb: %s\n",bool_names[fr->bcoultab]);
 +            fprintf(fp,"Table routines are used for vdw:     %s\n",bool_names[fr->bvdwtab ]);
 +        }
 +
 +        if(fr->bvdwtab==TRUE)
 +        {
 +            fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_CUBICSPLINETABLE;
 +            fr->nbkernel_vdw_modifier    = eintmodNONE;
 +        }
 +        if(fr->bcoultab==TRUE)
 +        {
 +            fr->nbkernel_elec_interaction = GMX_NBKERNEL_ELEC_CUBICSPLINETABLE;
 +            fr->nbkernel_elec_modifier    = eintmodNONE;
 +        }
 +    }
 +
 +    if (ir->cutoff_scheme == ecutsVERLET)
 +    {
 +        if (!gmx_within_tol(fr->reppow,12.0,10*GMX_DOUBLE_EPS))
 +        {
 +            gmx_fatal(FARGS,"Cut-off scheme %S only supports LJ repulsion power 12",ecutscheme_names[ir->cutoff_scheme]);
 +        }
 +        fr->bvdwtab  = FALSE;
 +        fr->bcoultab = FALSE;
 +    }
 +    
 +    /* Tables are used for direct ewald sum */
 +    if(fr->bEwald)
 +    {
 +        if (EEL_PME(ir->coulombtype))
 +        {
 +            if (fp)
 +                fprintf(fp,"Will do PME sum in reciprocal space.\n");
 +            if (ir->coulombtype == eelP3M_AD)
 +            {
 +                please_cite(fp,"Hockney1988");
 +                please_cite(fp,"Ballenegger2012");
 +            }
 +            else
 +            {
 +                please_cite(fp,"Essmann95a");
 +            }
 +            
 +            if (ir->ewald_geometry == eewg3DC)
 +            {
 +                if (fp)
 +                {
 +                    fprintf(fp,"Using the Ewald3DC correction for systems with a slab geometry.\n");
 +                }
 +                please_cite(fp,"In-Chul99a");
 +            }
 +        }
 +        fr->ewaldcoeff=calc_ewaldcoeff(ir->rcoulomb, ir->ewald_rtol);
 +        init_ewald_tab(&(fr->ewald_table), cr, ir, fp);
 +        if (fp)
 +        {
 +            fprintf(fp,"Using a Gaussian width (1/beta) of %g nm for Ewald\n",
 +                    1/fr->ewaldcoeff);
 +        }
 +    }
 +    
 +    /* Electrostatics */
 +    fr->epsilon_r  = ir->epsilon_r;
 +    fr->epsilon_rf = ir->epsilon_rf;
 +    fr->fudgeQQ    = mtop->ffparams.fudgeQQ;
 +    fr->rcoulomb_switch = ir->rcoulomb_switch;
 +    fr->rcoulomb        = cutoff_inf(ir->rcoulomb);
 +    
 +    /* Parameters for generalized RF */
 +    fr->zsquare = 0.0;
 +    fr->temp    = 0.0;
 +    
 +    if (fr->eeltype == eelGRF)
 +    {
 +        init_generalized_rf(fp,mtop,ir,fr);
 +    }
 +    else if (fr->eeltype == eelSHIFT)
 +    {
 +        for(m=0; (m<DIM); m++)
 +            box_size[m]=box[m][m];
 +        
 +        if ((fr->eeltype == eelSHIFT && fr->rcoulomb > fr->rcoulomb_switch))
 +            set_shift_consts(fp,fr->rcoulomb_switch,fr->rcoulomb,box_size,fr);
 +    }
 +    
 +    fr->bF_NoVirSum = (EEL_FULL(fr->eeltype) ||
 +                       gmx_mtop_ftype_count(mtop,F_POSRES) > 0 ||
 +                       gmx_mtop_ftype_count(mtop,F_FBPOSRES) > 0 ||
 +                       IR_ELEC_FIELD(*ir) ||
 +                       (fr->adress_icor != eAdressICOff)
 +                      );
 +    
 +    if (fr->cutoff_scheme == ecutsGROUP &&
 +        ncg_mtop(mtop) > fr->cg_nalloc && !DOMAINDECOMP(cr)) {
 +        /* Count the total number of charge groups */
 +        fr->cg_nalloc = ncg_mtop(mtop);
 +        srenew(fr->cg_cm,fr->cg_nalloc);
 +    }
 +    if (fr->shift_vec == NULL)
 +        snew(fr->shift_vec,SHIFTS);
 +    
 +    if (fr->fshift == NULL)
 +        snew(fr->fshift,SHIFTS);
 +    
 +    if (fr->nbfp == NULL) {
 +        fr->ntype = mtop->ffparams.atnr;
 +        fr->nbfp  = mk_nbfp(&mtop->ffparams,fr->bBHAM);
 +    }
 +    
 +    /* Copy the energy group exclusions */
 +    fr->egp_flags = ir->opts.egp_flags;
 +    
 +    /* Van der Waals stuff */
 +    fr->rvdw        = cutoff_inf(ir->rvdw);
 +    fr->rvdw_switch = ir->rvdw_switch;
 +    if ((fr->vdwtype != evdwCUT) && (fr->vdwtype != evdwUSER) && !fr->bBHAM) {
 +        if (fr->rvdw_switch >= fr->rvdw)
 +            gmx_fatal(FARGS,"rvdw_switch (%f) must be < rvdw (%f)",
 +                      fr->rvdw_switch,fr->rvdw);
 +        if (fp)
 +            fprintf(fp,"Using %s Lennard-Jones, switch between %g and %g nm\n",
 +                    (fr->eeltype==eelSWITCH) ? "switched":"shifted",
 +                    fr->rvdw_switch,fr->rvdw);
 +    } 
 +    
 +    if (fr->bBHAM && (fr->vdwtype == evdwSHIFT || fr->vdwtype == evdwSWITCH))
 +        gmx_fatal(FARGS,"Switch/shift interaction not supported with Buckingham");
 +    
 +    if (fp)
 +        fprintf(fp,"Cut-off's:   NS: %g   Coulomb: %g   %s: %g\n",
 +                fr->rlist,fr->rcoulomb,fr->bBHAM ? "BHAM":"LJ",fr->rvdw);
 +    
 +    fr->eDispCorr = ir->eDispCorr;
 +    if (ir->eDispCorr != edispcNO)
 +    {
 +        set_avcsixtwelve(fp,fr,mtop);
 +    }
 +    
 +    if (fr->bBHAM)
 +    {
 +        set_bham_b_max(fp,fr,mtop);
 +    }
 +
 +    fr->bGB = (ir->implicit_solvent == eisGBSA);
 +      fr->gb_epsilon_solvent = ir->gb_epsilon_solvent;
 +
 +    /* Copy the GBSA data (radius, volume and surftens for each
 +     * atomtype) from the topology atomtype section to forcerec.
 +     */
 +    snew(fr->atype_radius,fr->ntype);
 +    snew(fr->atype_vol,fr->ntype);
 +    snew(fr->atype_surftens,fr->ntype);
 +    snew(fr->atype_gb_radius,fr->ntype);
 +    snew(fr->atype_S_hct,fr->ntype);
 +
 +    if (mtop->atomtypes.nr > 0)
 +    {
 +        for(i=0;i<fr->ntype;i++)
 +            fr->atype_radius[i] =mtop->atomtypes.radius[i];
 +        for(i=0;i<fr->ntype;i++)
 +            fr->atype_vol[i] = mtop->atomtypes.vol[i];
 +        for(i=0;i<fr->ntype;i++)
 +            fr->atype_surftens[i] = mtop->atomtypes.surftens[i];
 +        for(i=0;i<fr->ntype;i++)
 +            fr->atype_gb_radius[i] = mtop->atomtypes.gb_radius[i];
 +        for(i=0;i<fr->ntype;i++)
 +            fr->atype_S_hct[i] = mtop->atomtypes.S_hct[i];
 +    }  
 +      
 +      /* Generate the GB table if needed */
 +      if(fr->bGB)
 +      {
 +#ifdef GMX_DOUBLE
 +              fr->gbtabscale=2000;
 +#else
 +              fr->gbtabscale=500;
 +#endif
 +              
 +              fr->gbtabr=100;
 +              fr->gbtab=make_gb_table(fp,oenv,fr,tabpfn,fr->gbtabscale);
 +
 +        init_gb(&fr->born,cr,fr,ir,mtop,ir->rgbradii,ir->gb_algorithm);
 +
 +        /* Copy local gb data (for dd, this is done in dd_partition_system) */
 +        if (!DOMAINDECOMP(cr))
 +        {
 +            make_local_gb(cr,fr->born,ir->gb_algorithm);
 +        }
 +    }
 +
 +    /* Set the charge scaling */
 +    if (fr->epsilon_r != 0)
 +        fr->epsfac = ONE_4PI_EPS0/fr->epsilon_r;
 +    else
 +        /* eps = 0 is infinite dieletric: no coulomb interactions */
 +        fr->epsfac = 0;
 +    
 +    /* Reaction field constants */
 +    if (EEL_RF(fr->eeltype))
 +        calc_rffac(fp,fr->eeltype,fr->epsilon_r,fr->epsilon_rf,
 +                   fr->rcoulomb,fr->temp,fr->zsquare,box,
 +                   &fr->kappa,&fr->k_rf,&fr->c_rf);
 +    
 +    set_chargesum(fp,fr,mtop);
 +    
 +    /* if we are using LR electrostatics, and they are tabulated,
 +     * the tables will contain modified coulomb interactions.
 +     * Since we want to use the non-shifted ones for 1-4
 +     * coulombic interactions, we must have an extra set of tables.
 +     */
 +    
 +    /* Construct tables.
 +     * A little unnecessary to make both vdw and coul tables sometimes,
 +     * but what the heck... */
 +    
 +    bTab = fr->bcoultab || fr->bvdwtab || fr->bEwald;
 +
 +    bSep14tab = ((!bTab || fr->eeltype!=eelCUT || fr->vdwtype!=evdwCUT ||
 +                  fr->bBHAM || fr->bEwald) &&
 +                 (gmx_mtop_ftype_count(mtop,F_LJ14) > 0 ||
 +                  gmx_mtop_ftype_count(mtop,F_LJC14_Q) > 0 ||
 +                  gmx_mtop_ftype_count(mtop,F_LJC_PAIRS_NB) > 0));
 +
 +    negp_pp = ir->opts.ngener - ir->nwall;
 +    negptable = 0;
 +    if (!bTab) {
 +        bNormalnblists = TRUE;
 +        fr->nnblists = 1;
 +    } else {
 +        bNormalnblists = (ir->eDispCorr != edispcNO);
 +        for(egi=0; egi<negp_pp; egi++) {
 +            for(egj=egi;  egj<negp_pp; egj++) {
 +                egp_flags = ir->opts.egp_flags[GID(egi,egj,ir->opts.ngener)];
 +                if (!(egp_flags & EGP_EXCL)) {
 +                    if (egp_flags & EGP_TABLE) {
 +                        negptable++;
 +                    } else {
 +                        bNormalnblists = TRUE;
 +                    }
 +                }
 +            }
 +        }
 +        if (bNormalnblists) {
 +            fr->nnblists = negptable + 1;
 +        } else {
 +            fr->nnblists = negptable;
 +        }
 +        if (fr->nnblists > 1)
 +            snew(fr->gid2nblists,ir->opts.ngener*ir->opts.ngener);
 +    }
++
++    if (ir->adress){
++        fr->nnblists*=2;
++    }
++
 +    snew(fr->nblists,fr->nnblists);
 +    
 +    /* This code automatically gives table length tabext without cut-off's,
 +     * in that case grompp should already have checked that we do not need
 +     * normal tables and we only generate tables for 1-4 interactions.
 +     */
 +    rtab = ir->rlistlong + ir->tabext;
 +
 +    if (bTab) {
 +        /* make tables for ordinary interactions */
 +        if (bNormalnblists) {
 +            make_nbf_tables(fp,oenv,fr,rtab,cr,tabfn,NULL,NULL,&fr->nblists[0]);
++            if (ir->adress){
++                make_nbf_tables(fp,oenv,fr,rtab,cr,tabfn,NULL,NULL,&fr->nblists[fr->nnblists/2]);
++            }
 +            if (!bSep14tab)
 +                fr->tab14 = fr->nblists[0].table_elec_vdw;
 +            m = 1;
 +        } else {
 +            m = 0;
 +        }
 +        if (negptable > 0) {
 +            /* Read the special tables for certain energy group pairs */
 +            nm_ind = mtop->groups.grps[egcENER].nm_ind;
 +            for(egi=0; egi<negp_pp; egi++) {
 +                for(egj=egi;  egj<negp_pp; egj++) {
 +                    egp_flags = ir->opts.egp_flags[GID(egi,egj,ir->opts.ngener)];
 +                    if ((egp_flags & EGP_TABLE) && !(egp_flags & EGP_EXCL)) {
 +                        nbl = &(fr->nblists[m]);
 +                        if (fr->nnblists > 1) {
 +                            fr->gid2nblists[GID(egi,egj,ir->opts.ngener)] = m;
 +                        }
 +                        /* Read the table file with the two energy groups names appended */
 +                        make_nbf_tables(fp,oenv,fr,rtab,cr,tabfn,
 +                                        *mtop->groups.grpname[nm_ind[egi]],
 +                                        *mtop->groups.grpname[nm_ind[egj]],
 +                                        &fr->nblists[m]);
++                        if (ir->adress){
++                             make_nbf_tables(fp,oenv,fr,rtab,cr,tabfn,
++                                        *mtop->groups.grpname[nm_ind[egi]],
++                                        *mtop->groups.grpname[nm_ind[egj]],
++                                        &fr->nblists[fr->nnblists/2+m]);
++                        }
 +                        m++;
 +                    } else if (fr->nnblists > 1) {
 +                        fr->gid2nblists[GID(egi,egj,ir->opts.ngener)] = 0;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +    if (bSep14tab)
 +    {
 +        /* generate extra tables with plain Coulomb for 1-4 interactions only */
 +        fr->tab14 = make_tables(fp,oenv,fr,MASTER(cr),tabpfn,rtab,
 +                                GMX_MAKETABLES_14ONLY);
 +    }
 +
 +    /* Read AdResS Thermo Force table if needed */
 +    if(fr->adress_icor == eAdressICThermoForce)
 +    {
 +        /* old todo replace */ 
 +        
 +        if (ir->adress->n_tf_grps > 0){
 +            make_adress_tf_tables(fp,oenv,fr,ir,tabfn, mtop, box);
 +
 +        }else{
 +            /* load the default table */
 +            snew(fr->atf_tabs, 1);
 +            fr->atf_tabs[DEFAULT_TF_TABLE] = make_atf_table(fp,oenv,fr,tabafn, box);
 +        }
 +    }
 +    
 +    /* Wall stuff */
 +    fr->nwall = ir->nwall;
 +    if (ir->nwall && ir->wall_type==ewtTABLE)
 +    {
 +        make_wall_tables(fp,oenv,ir,tabfn,&mtop->groups,fr);
 +    }
 +    
 +    if (fcd && tabbfn) {
 +        fcd->bondtab  = make_bonded_tables(fp,
 +                                           F_TABBONDS,F_TABBONDSNC,
 +                                           mtop,tabbfn,"b");
 +        fcd->angletab = make_bonded_tables(fp,
 +                                           F_TABANGLES,-1,
 +                                           mtop,tabbfn,"a");
 +        fcd->dihtab   = make_bonded_tables(fp,
 +                                           F_TABDIHS,-1,
 +                                           mtop,tabbfn,"d");
 +    } else {
 +        if (debug)
 +            fprintf(debug,"No fcdata or table file name passed, can not read table, can not do bonded interactions\n");
 +    }
 +    
 +    /* QM/MM initialization if requested
 +     */
 +    if (ir->bQMMM)
 +    {
 +        fprintf(stderr,"QM/MM calculation requested.\n");
 +    }
 +    
 +    fr->bQMMM      = ir->bQMMM;   
 +    fr->qr         = mk_QMMMrec();
 +    
 +    /* Set all the static charge group info */
 +    fr->cginfo_mb = init_cginfo_mb(fp,mtop,fr,bNoSolvOpt,
 +                                   &fr->bExcl_IntraCGAll_InterCGNone);
 +    if (DOMAINDECOMP(cr)) {
 +        fr->cginfo = NULL;
 +    } else {
 +        fr->cginfo = cginfo_expand(mtop->nmolblock,fr->cginfo_mb);
 +    }
 +    
 +    if (!DOMAINDECOMP(cr))
 +    {
 +        /* When using particle decomposition, the effect of the second argument,
 +         * which sets fr->hcg, is corrected later in do_md and init_em.
 +         */
 +        forcerec_set_ranges(fr,ncg_mtop(mtop),ncg_mtop(mtop),
 +                            mtop->natoms,mtop->natoms,mtop->natoms);
 +    }
 +    
 +    fr->print_force = print_force;
 +
 +
 +    /* coarse load balancing vars */
 +    fr->t_fnbf=0.;
 +    fr->t_wait=0.;
 +    fr->timesteps=0;
 +    
 +    /* Initialize neighbor search */
 +    init_ns(fp,cr,&fr->ns,fr,mtop,box);
 +
 +    if (cr->duty & DUTY_PP)
 +    {
 +        gmx_nonbonded_setup(fp,fr,bGenericKernelOnly);
 +    /*
 +     if (ir->bAdress)
 +        {
 +            gmx_setup_adress_kernels(fp,bGenericKernelOnly);
 +        }
 +     */
 +    }
 +
 +    /* Initialize the thread working data for bonded interactions */
 +    init_forcerec_f_threads(fr,mtop->groups.grps[egcENER].nr);
 +    
 +    snew(fr->excl_load,fr->nthreads+1);
 +
 +    if (fr->cutoff_scheme == ecutsVERLET)
 +    {
 +        if (ir->rcoulomb != ir->rvdw)
 +        {
 +            gmx_fatal(FARGS,"With Verlet lists rcoulomb and rvdw should be identical");
 +        }
 +
 +        init_nb_verlet(fp, &fr->nbv, ir, fr, cr, nbpu_opt);
 +    }
 +
 +    /* fr->ic is used both by verlet and group kernels (to some extent) now */
 +    init_interaction_const(fp, &fr->ic, fr, rtab);
 +    if (ir->eDispCorr != edispcNO)
 +    {
 +        calc_enervirdiff(fp,ir->eDispCorr,fr);
 +    }
 +}
 +
 +#define pr_real(fp,r) fprintf(fp,"%s: %e\n",#r,r)
 +#define pr_int(fp,i)  fprintf((fp),"%s: %d\n",#i,i)
 +#define pr_bool(fp,b) fprintf((fp),"%s: %s\n",#b,bool_names[b])
 +
 +void pr_forcerec(FILE *fp,t_forcerec *fr,t_commrec *cr)
 +{
 +  int i;
 +
 +  pr_real(fp,fr->rlist);
 +  pr_real(fp,fr->rcoulomb);
 +  pr_real(fp,fr->fudgeQQ);
 +  pr_bool(fp,fr->bGrid);
 +  pr_bool(fp,fr->bTwinRange);
 +  /*pr_int(fp,fr->cg0);
 +    pr_int(fp,fr->hcg);*/
 +  for(i=0; i<fr->nnblists; i++)
 +    pr_int(fp,fr->nblists[i].table_elec_vdw.n);
 +  pr_real(fp,fr->rcoulomb_switch);
 +  pr_real(fp,fr->rcoulomb);
 +  
 +  fflush(fp);
 +}
 +
 +void forcerec_set_excl_load(t_forcerec *fr,
 +                            const gmx_localtop_t *top,const t_commrec *cr)
 +{
 +    const int *ind,*a;
 +    int t,i,j,ntot,n,ntarget;
 +
 +    if (cr != NULL && PARTDECOMP(cr))
 +    {
 +        /* No OpenMP with particle decomposition */
 +        pd_at_range(cr,
 +                    &fr->excl_load[0],
 +                    &fr->excl_load[1]);
 +
 +        return;
 +    }
 +
 +    ind = top->excls.index;
 +    a   = top->excls.a;
 +
 +    ntot = 0;
 +    for(i=0; i<top->excls.nr; i++)
 +    {
 +        for(j=ind[i]; j<ind[i+1]; j++)
 +        {
 +            if (a[j] > i)
 +            {
 +                ntot++;
 +            }
 +        }
 +    }
 +
 +    fr->excl_load[0] = 0;
 +    n = 0;
 +    i = 0;
 +    for(t=1; t<=fr->nthreads; t++)
 +    {
 +        ntarget = (ntot*t)/fr->nthreads;
 +        while(i < top->excls.nr && n < ntarget)
 +        {
 +            for(j=ind[i]; j<ind[i+1]; j++)
 +            {
 +                if (a[j] > i)
 +                {
 +                    n++;
 +                }
 +            }
 +            i++;
 +        }
 +        fr->excl_load[t] = i;
 +    }
 +}
 +
index b3b7b6ab32b514eebdbff2e0767dfe041e22c566,0000000000000000000000000000000000000000..7b32baab22541a6d02743b87fe8bff26882935f2
mode 100644,000000..100644
--- /dev/null
@@@ -1,774 -1,0 +1,771 @@@
-         /* calculate temperature using virial */
-         enerd->term[F_VTEMP] = calc_temp(trace(total_vir),ir->opts.nrdf[0]);
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include "typedefs.h"
 +#include "string2.h"
 +#include "smalloc.h"
 +#include "mdrun.h"
 +#include "domdec.h"
 +#include "mtop_util.h"
 +#include "gmx_wallcycle.h"
 +#include "vcm.h"
 +#include "nrnb.h"
 +#include "macros.h"
 +#include "md_logging.h"
 +#include "md_support.h"
 +
 +/* Is the signal in one simulation independent of other simulations?
 + * One entry per global signal (eglsNR entries). In compute_globals() a
 + * signal whose entry is TRUE is applied even on steps without
 + * inter-simulation signalling (bInterSimGS == FALSE).
 + */
 +gmx_bool gs_simlocal[eglsNR] = { TRUE, FALSE, FALSE, TRUE };
 +
 +/* check which of the multisim simulations has the shortest number of
 +   steps and return that number of nsteps */
 +gmx_large_int_t get_multisim_nsteps(const t_commrec *cr,
 +                                    gmx_large_int_t nsteps)
 +{
 +    gmx_large_int_t steps_out;
 +
 +    if MASTER(cr)
 +    {
 +        gmx_large_int_t *buf;
 +        int s;
 +
 +        snew(buf,cr->ms->nsim);
 +
 +        buf[cr->ms->sim] = nsteps;
 +        gmx_sumli_sim(cr->ms->nsim, buf, cr->ms);
 +
 +        steps_out=-1;
 +        for(s=0; s<cr->ms->nsim; s++)
 +        {
 +            /* find the smallest positive number */
 +            if (buf[s]>= 0 && ((steps_out < 0) || (buf[s]<steps_out)) )
 +            {
 +                steps_out=buf[s];
 +            }
 +        }
 +        sfree(buf);
 +
 +        /* if we're the limiting simulation, don't do anything */
 +        if (steps_out>=0 && steps_out<nsteps) 
 +        {
 +            char strbuf[255];
 +            snprintf(strbuf, 255, "Will stop simulation %%d after %s steps (another simulation will end then).\n", gmx_large_int_pfmt);
 +            fprintf(stderr, strbuf, cr->ms->sim, steps_out);
 +        }
 +    }
 +    /* broadcast to non-masters */
 +    gmx_bcast(sizeof(gmx_large_int_t), &steps_out, cr);
 +    return steps_out;
 +}
 +
 +int multisim_min(const gmx_multisim_t *ms,int nmin,int n)
 +{
 +    int  *buf;
 +    gmx_bool bPos,bEqual;
 +    int  s,d;
 +
 +    snew(buf,ms->nsim);
 +    buf[ms->sim] = n;
 +    gmx_sumi_sim(ms->nsim,buf,ms);
 +    bPos   = TRUE;
 +    bEqual = TRUE;
 +    for(s=0; s<ms->nsim; s++)
 +    {
 +        bPos   = bPos   && (buf[s] > 0);
 +        bEqual = bEqual && (buf[s] == buf[0]);
 +    }
 +    if (bPos)
 +    {
 +        if (bEqual)
 +        {
 +            nmin = min(nmin,buf[0]);
 +        }
 +        else
 +        {
 +            /* Find the least common multiple */
 +            for(d=2; d<nmin; d++)
 +            {
 +                s = 0;
 +                while (s < ms->nsim && d % buf[s] == 0)
 +                {
 +                    s++;
 +                }
 +                if (s == ms->nsim)
 +                {
 +                    /* We found the LCM and it is less than nmin */
 +                    nmin = d;
 +                    break;
 +                }
 +            }
 +        }
 +    }
 +    sfree(buf);
 +
 +    return nmin;
 +}
 +
 +int multisim_nstsimsync(const t_commrec *cr,
 +                        const t_inputrec *ir,int repl_ex_nst)
 +{
 +    int nmin;
 +
 +    if (MASTER(cr))
 +    {
 +        nmin = INT_MAX;
 +        nmin = multisim_min(cr->ms,nmin,ir->nstlist);
 +        nmin = multisim_min(cr->ms,nmin,ir->nstcalcenergy);
 +        nmin = multisim_min(cr->ms,nmin,repl_ex_nst);
 +        if (nmin == INT_MAX)
 +        {
 +            gmx_fatal(FARGS,"Can not find an appropriate interval for inter-simulation communication, since nstlist, nstcalcenergy and -replex are all <= 0");
 +        }
 +        /* Avoid inter-simulation communication at every (second) step */
 +        if (nmin <= 2)
 +        {
 +            nmin = 10;
 +        }
 +    }
 +
 +    gmx_bcast(sizeof(int),&nmin,cr);
 +
 +    return nmin;
 +}
 +
 +void init_global_signals(globsig_t *gs,const t_commrec *cr,
 +                         const t_inputrec *ir,int repl_ex_nst)
 +{
 +    int i;
 +
 +    if (MULTISIM(cr))
 +    {
 +        gs->nstms = multisim_nstsimsync(cr,ir,repl_ex_nst);
 +        if (debug)
 +        {
 +            fprintf(debug,"Syncing simulations for checkpointing and termination every %d steps\n",gs->nstms);
 +        }
 +    }
 +    else
 +    {
 +        gs->nstms = 1;
 +    }
 +
 +    for(i=0; i<eglsNR; i++)
 +    {
 +        gs->sig[i] = 0;
 +        gs->set[i] = 0;
 +    }
 +}
 +
 +void copy_coupling_state(t_state *statea,t_state *stateb, 
 +                         gmx_ekindata_t *ekinda,gmx_ekindata_t *ekindb, t_grpopts* opts) 
 +{
 +    
 +    /* MRS note -- might be able to get rid of some of the arguments.  Look over it when it's all debugged */
 +    
 +    int i,j,nc;
 +
 +    /* Make sure we have enough space for x and v */
 +    if (statea->nalloc > stateb->nalloc)
 +    {
 +        stateb->nalloc = statea->nalloc;
 +        srenew(stateb->x,stateb->nalloc);
 +        srenew(stateb->v,stateb->nalloc);
 +    }
 +
 +    stateb->natoms     = statea->natoms;
 +    stateb->ngtc       = statea->ngtc;
 +    stateb->nnhpres    = statea->nnhpres;
 +    stateb->veta       = statea->veta;
 +    if (ekinda) 
 +    {
 +        copy_mat(ekinda->ekin,ekindb->ekin);
 +        for (i=0; i<stateb->ngtc; i++) 
 +        {
 +            ekindb->tcstat[i].T = ekinda->tcstat[i].T;
 +            ekindb->tcstat[i].Th = ekinda->tcstat[i].Th;
 +            copy_mat(ekinda->tcstat[i].ekinh,ekindb->tcstat[i].ekinh);
 +            copy_mat(ekinda->tcstat[i].ekinf,ekindb->tcstat[i].ekinf);
 +            ekindb->tcstat[i].ekinscalef_nhc =  ekinda->tcstat[i].ekinscalef_nhc;
 +            ekindb->tcstat[i].ekinscaleh_nhc =  ekinda->tcstat[i].ekinscaleh_nhc;
 +            ekindb->tcstat[i].vscale_nhc =  ekinda->tcstat[i].vscale_nhc;
 +        }
 +    }
 +    copy_rvecn(statea->x,stateb->x,0,stateb->natoms);
 +    copy_rvecn(statea->v,stateb->v,0,stateb->natoms);
 +    copy_mat(statea->box,stateb->box);
 +    copy_mat(statea->box_rel,stateb->box_rel);
 +    copy_mat(statea->boxv,stateb->boxv);
 +
 +    for (i = 0; i<stateb->ngtc; i++) 
 +    { 
 +        nc = i*opts->nhchainlength;
 +        for (j=0; j<opts->nhchainlength; j++) 
 +        {
 +            stateb->nosehoover_xi[nc+j]  = statea->nosehoover_xi[nc+j];
 +            stateb->nosehoover_vxi[nc+j] = statea->nosehoover_vxi[nc+j];
 +        }
 +    }
 +    if (stateb->nhpres_xi != NULL)
 +    {
 +        for (i = 0; i<stateb->nnhpres; i++) 
 +        {
 +            nc = i*opts->nhchainlength;
 +            for (j=0; j<opts->nhchainlength; j++) 
 +            {
 +                stateb->nhpres_xi[nc+j]  = statea->nhpres_xi[nc+j];
 +                stateb->nhpres_vxi[nc+j] = statea->nhpres_vxi[nc+j];
 +            }
 +        }
 +    }
 +}
 +
 +real compute_conserved_from_auxiliary(t_inputrec *ir, t_state *state, t_extmass *MassQ)
 +{
 +    real quantity = 0;
 +    switch (ir->etc) 
 +    {
 +    case etcNO:
 +        break;
 +    case etcBERENDSEN:
 +        break;
 +    case etcNOSEHOOVER:
 +        quantity = NPT_energy(ir,state,MassQ);                
 +        break;
 +    case etcVRESCALE:
 +        quantity = vrescale_energy(&(ir->opts),state->therm_integral);
 +        break;
 +    default:
 +        break;
 +    }
 +    return quantity;
 +}
 +
 +void compute_globals(FILE *fplog, gmx_global_stat_t gstat, t_commrec *cr, t_inputrec *ir, 
 +                     t_forcerec *fr, gmx_ekindata_t *ekind, 
 +                     t_state *state, t_state *state_global, t_mdatoms *mdatoms, 
 +                     t_nrnb *nrnb, t_vcm *vcm, gmx_wallcycle_t wcycle,
 +                     gmx_enerdata_t *enerd,tensor force_vir, tensor shake_vir, tensor total_vir, 
 +                     tensor pres, rvec mu_tot, gmx_constr_t constr, 
 +                     globsig_t *gs,gmx_bool bInterSimGS,
 +                     matrix box, gmx_mtop_t *top_global, real *pcurr, 
 +                     int natoms, gmx_bool *bSumEkinhOld, int flags)
 +{
 +    int  i,gsi;
 +    real gs_buf[eglsNR];
 +    tensor corr_vir,corr_pres,shakeall_vir;
 +    gmx_bool bEner,bPres,bTemp, bVV;
 +    gmx_bool bRerunMD, bStopCM, bGStat, bIterate, 
 +        bFirstIterate,bReadEkin,bEkinAveVel,bScaleEkin, bConstrain;
 +    real ekin,temp,prescorr,enercorr,dvdlcorr;
 +    
 +    /* translate CGLO flags to gmx_booleans */
 +    bRerunMD = flags & CGLO_RERUNMD;
 +    bStopCM = flags & CGLO_STOPCM;
 +    bGStat = flags & CGLO_GSTAT;
 +
 +    bReadEkin = (flags & CGLO_READEKIN);
 +    bScaleEkin = (flags & CGLO_SCALEEKIN);
 +    bEner = flags & CGLO_ENERGY;
 +    bTemp = flags & CGLO_TEMPERATURE;
 +    bPres  = (flags & CGLO_PRESSURE);
 +    bConstrain = (flags & CGLO_CONSTRAINT);
 +    bIterate = (flags & CGLO_ITERATE);
 +    bFirstIterate = (flags & CGLO_FIRSTITERATE);
 +
 +    /* we calculate a full state kinetic energy either with full-step velocity verlet
 +       or half step where we need the pressure */
 +    
 +    bEkinAveVel = (ir->eI==eiVV || (ir->eI==eiVVAK && bPres) || bReadEkin);
 +    
 +    /* in initalization, it sums the shake virial in vv, and to 
 +       sums ekinh_old in leapfrog (or if we are calculating ekinh_old) for other reasons */
 +
 +    /* ########## Kinetic energy  ############## */
 +    
 +    if (bTemp) 
 +    {
 +        /* Non-equilibrium MD: this is parallellized, but only does communication
 +         * when there really is NEMD.
 +         */
 +        
 +        if (PAR(cr) && (ekind->bNEMD)) 
 +        {
 +            accumulate_u(cr,&(ir->opts),ekind);
 +        }
 +        debug_gmx();
 +        if (bReadEkin)
 +        {
 +            restore_ekinstate_from_state(cr,ekind,&state_global->ekinstate);
 +        }
 +        else 
 +        {
 +
 +            calc_ke_part(state,&(ir->opts),mdatoms,ekind,nrnb,bEkinAveVel,bIterate);
 +        }
 +        
 +        debug_gmx();
 +    }
 +
 +    /* Calculate center of mass velocity if necessary, also parallellized */
 +    if (bStopCM)
 +    {
 +        calc_vcm_grp(fplog,mdatoms->start,mdatoms->homenr,mdatoms,
 +                     state->x,state->v,vcm);
 +    }
 +
 +    if (bTemp || bStopCM || bPres || bEner || bConstrain)
 +    {
 +        if (!bGStat)
 +        {
 +            /* We will not sum ekinh_old,                                                            
 +             * so signal that we still have to do it.                                                
 +             */
 +            *bSumEkinhOld = TRUE;
 +
 +        }
 +        else
 +        {
 +            if (gs != NULL)
 +            {
 +                for(i=0; i<eglsNR; i++)
 +                {
 +                    gs_buf[i] = gs->sig[i];
 +                }
 +            }
 +            if (PAR(cr)) 
 +            {
 +                wallcycle_start(wcycle,ewcMoveE);
 +                global_stat(fplog,gstat,cr,enerd,force_vir,shake_vir,mu_tot,
 +                            ir,ekind,constr,bStopCM ? vcm : NULL,
 +                            gs != NULL ? eglsNR : 0,gs_buf,
 +                            top_global,state,
 +                            *bSumEkinhOld,flags);
 +                wallcycle_stop(wcycle,ewcMoveE);
 +            }
 +            if (gs != NULL)
 +            {
 +                if (MULTISIM(cr) && bInterSimGS)
 +                {
 +                    if (MASTER(cr))
 +                    {
 +                        /* Communicate the signals between the simulations */
 +                        gmx_sum_sim(eglsNR,gs_buf,cr->ms);
 +                    }
 +                    /* Communicate the signals form the master to the others */
 +                    gmx_bcast(eglsNR*sizeof(gs_buf[0]),gs_buf,cr);
 +                }
 +                for(i=0; i<eglsNR; i++)
 +                {
 +                    if (bInterSimGS || gs_simlocal[i])
 +                    {
 +                        /* Set the communicated signal only when it is non-zero,
 +                         * since signals might not be processed at each MD step.
 +                         */
 +                        gsi = (gs_buf[i] >= 0 ?
 +                               (int)(gs_buf[i] + 0.5) :
 +                               (int)(gs_buf[i] - 0.5));
 +                        if (gsi != 0)
 +                        {
 +                            gs->set[i] = gsi;
 +                        }
 +                        /* Turn off the local signal */
 +                        gs->sig[i] = 0;
 +                    }
 +                }
 +            }
 +            *bSumEkinhOld = FALSE;
 +        }
 +    }
 +    
 +    if (!ekind->bNEMD && debug && bTemp && (vcm->nr > 0))
 +    {
 +        correct_ekin(debug,
 +                     mdatoms->start,mdatoms->start+mdatoms->homenr,
 +                     state->v,vcm->group_p[0],
 +                     mdatoms->massT,mdatoms->tmass,ekind->ekin);
 +    }
 +    
 +    /* Do center of mass motion removal */
 +    if (bStopCM)
 +    {
 +        check_cm_grp(fplog,vcm,ir,1);
 +        do_stopcm_grp(fplog,mdatoms->start,mdatoms->homenr,mdatoms->cVCM,
 +                      state->x,state->v,vcm);
 +        inc_nrnb(nrnb,eNR_STOPCM,mdatoms->homenr);
 +    }
 +
 +    if (bEner)
 +    {
 +        /* Calculate the amplitude of the cosine velocity profile */
 +        ekind->cosacc.vcos = ekind->cosacc.mvcos/mdatoms->tmass;
 +    }
 +
 +    if (bTemp) 
 +    {
 +        /* Sum the kinetic energies of the groups & calc temp */
 +        /* compute full step kinetic energies if vv, or if vv-avek and we are computing the pressure with IR_NPT_TROTTER */
 +        /* three maincase:  VV with AveVel (md-vv), vv with AveEkin (md-vv-avek), leap with AveEkin (md).  
 +           Leap with AveVel is not supported; it's not clear that it will actually work.  
 +           bEkinAveVel: If TRUE, we simply multiply ekin by ekinscale to get a full step kinetic energy. 
 +           If FALSE, we average ekinh_old and ekinh*ekinscale_nhc to get an averaged half step kinetic energy.
 +           bSaveEkinOld: If TRUE (in the case of iteration = bIterate is TRUE), we don't reset the ekinscale_nhc.  
 +           If FALSE, we go ahead and erase over it.
 +        */ 
 +        enerd->term[F_TEMP] = sum_ekin(&(ir->opts),ekind,&(enerd->term[F_DKDL]),
 +                                       bEkinAveVel,bIterate,bScaleEkin);
 + 
 +        enerd->term[F_EKIN] = trace(ekind->ekin);
 +    }
 +    
 +    /* ##########  Long range energy information ###### */
 +    
 +    if (bEner || bPres || bConstrain) 
 +    {
 +        calc_dispcorr(fplog,ir,fr,0,top_global->natoms,box,state->lambda[efptVDW],
 +                      corr_pres,corr_vir,&prescorr,&enercorr,&dvdlcorr);
 +    }
 +    
 +    if (bEner && bFirstIterate) 
 +    {
 +        enerd->term[F_DISPCORR] = enercorr;
 +        enerd->term[F_EPOT] += enercorr;
 +        enerd->term[F_DVDL_VDW] += dvdlcorr;
 +    }
 +    
 +    /* ########## Now pressure ############## */
 +    if (bPres || bConstrain) 
 +    {
 +        
 +        m_add(force_vir,shake_vir,total_vir);
 +        
 +        /* Calculate pressure and apply LR correction if PPPM is used.
 +         * Use the box from last timestep since we already called update().
 +         */
 +        
 +        enerd->term[F_PRES] = calc_pres(fr->ePBC,ir->nwall,box,ekind->ekin,total_vir,pres);
 +        
 +        /* Calculate long range corrections to pressure and energy */
 +        /* this adds to enerd->term[F_PRES] and enerd->term[F_ETOT], 
 +           and computes enerd->term[F_DISPCORR].  Also modifies the 
 +           total_vir and pres tensors */
 +        
 +        m_add(total_vir,corr_vir,total_vir);
 +        m_add(pres,corr_pres,pres);
 +        enerd->term[F_PDISPCORR] = prescorr;
 +        enerd->term[F_PRES] += prescorr;
 +        *pcurr = enerd->term[F_PRES];
 +    }    
 +}
 +
 +void check_nst_param(FILE *fplog,t_commrec *cr,
 +                     const char *desc_nst,int nst,
 +                     const char *desc_p,int *p)
 +{
 +    if (*p > 0 && *p % nst != 0)
 +    {
 +        /* Round up to the next multiple of nst */
 +        *p = ((*p)/nst + 1)*nst;
 +        md_print_warn(cr,fplog,
 +                      "NOTE: %s changes %s to %d\n",desc_nst,desc_p,*p);
 +    }
 +}
 +
 +void set_current_lambdas(gmx_large_int_t step, t_lambda *fepvals, gmx_bool bRerunMD,
 +                         t_trxframe *rerun_fr,t_state *state_global, t_state *state, double lam0[])
 +/* find the current lambdas.  If rerunning, we either read in a state, or a lambda value,
 +   requiring different logic. */
 +{
 +    real frac;
 +    int i,fep_state=0;
 +    if (bRerunMD)
 +    {
 +        if (rerun_fr->bLambda)
 +        {
 +            if (fepvals->delta_lambda!=0)
 +            {
 +                state_global->lambda[efptFEP] = rerun_fr->lambda;
 +                for (i=0;i<efptNR;i++)
 +                {
 +                    if (i!= efptFEP)
 +                    {
 +                        state->lambda[i] = state_global->lambda[i];
 +                    }
 +                }
 +            }
 +            else
 +            {
 +                /* find out between which two value of lambda we should be */
 +                frac = (step*fepvals->delta_lambda);
 +                fep_state = floor(frac*fepvals->n_lambda);
 +                /* interpolate between this state and the next */
 +                /* this assumes that the initial lambda corresponds to lambda==0, which is verified in grompp */
 +                frac = (frac*fepvals->n_lambda)-fep_state;
 +                for (i=0;i<efptNR;i++)
 +                {
 +                    state_global->lambda[i] = lam0[i] + (fepvals->all_lambda[i][fep_state]) +
 +                        frac*(fepvals->all_lambda[i][fep_state+1]-fepvals->all_lambda[i][fep_state]);
 +                }
 +            }
 +        }
 +        else if (rerun_fr->bFepState)
 +        {
 +            state_global->fep_state = rerun_fr->fep_state;
 +            for (i=0;i<efptNR;i++)
 +            {
 +                state_global->lambda[i] = fepvals->all_lambda[i][fep_state];
 +            }
 +        }
 +    }
 +    else
 +    {
 +        if (fepvals->delta_lambda!=0)
 +        {
 +            /* find out between which two value of lambda we should be */
 +            frac = (step*fepvals->delta_lambda);
 +            if (fepvals->n_lambda > 0)
 +            {
 +                fep_state = floor(frac*fepvals->n_lambda);
 +                /* interpolate between this state and the next */
 +                /* this assumes that the initial lambda corresponds to lambda==0, which is verified in grompp */
 +                frac = (frac*fepvals->n_lambda)-fep_state;
 +                for (i=0;i<efptNR;i++)
 +                {
 +                    state_global->lambda[i] = lam0[i] + (fepvals->all_lambda[i][fep_state]) +
 +                        frac*(fepvals->all_lambda[i][fep_state+1]-fepvals->all_lambda[i][fep_state]);
 +                }
 +            }
 +            else
 +            {
 +                for (i=0;i<efptNR;i++)
 +                {
 +                    state_global->lambda[i] = lam0[i] + frac;
 +                }
 +            }
 +        }
 +    }
 +    for (i=0;i<efptNR;i++)
 +    {
 +        state->lambda[i] = state_global->lambda[i];
 +    }
 +}
 +
 +static void min_zero(int *n,int i)
 +{
 +    if (i > 0 && (*n == 0 || i < *n))
 +    {
 +        *n = i;
 +    }
 +}
 +
 +static int lcd4(int i1,int i2,int i3,int i4)
 +{
 +    int nst;
 +
 +    nst = 0;
 +    min_zero(&nst,i1);
 +    min_zero(&nst,i2);
 +    min_zero(&nst,i3);
 +    min_zero(&nst,i4);
 +    if (nst == 0)
 +    {
 +        gmx_incons("All 4 inputs for determininig nstglobalcomm are <= 0");
 +    }
 +    
 +    while (nst > 1 && ((i1 > 0 && i1 % nst != 0)  ||
 +                       (i2 > 0 && i2 % nst != 0)  ||
 +                       (i3 > 0 && i3 % nst != 0)  ||
 +                       (i4 > 0 && i4 % nst != 0)))
 +    {
 +        nst--;
 +    }
 +
 +    return nst;
 +}
 +
 +int check_nstglobalcomm(FILE *fplog,t_commrec *cr,
 +                        int nstglobalcomm,t_inputrec *ir)
 +{
 +    if (!EI_DYNAMICS(ir->eI))
 +    {
 +        nstglobalcomm = 1;
 +    }
 +
 +    if (nstglobalcomm == -1)
 +    {
 +        if (!(ir->nstcalcenergy > 0 ||
 +              ir->nstlist > 0 ||
 +              ir->etc != etcNO ||
 +              ir->epc != epcNO))
 +        {
 +            nstglobalcomm = 10;
 +            if (ir->nstenergy > 0 && ir->nstenergy < nstglobalcomm)
 +            {
 +                nstglobalcomm = ir->nstenergy;
 +            }
 +        }
 +        else
 +        {
 +            /* Ensure that we do timely global communication for
 +             * (possibly) each of the four following options.
 +             */
 +            nstglobalcomm = lcd4(ir->nstcalcenergy,
 +                                 ir->nstlist,
 +                                 ir->etc != etcNO ? ir->nsttcouple : 0,
 +                                 ir->epc != epcNO ? ir->nstpcouple : 0);
 +        }
 +    }
 +    else
 +    {
 +        if (ir->nstlist > 0 &&
 +            nstglobalcomm > ir->nstlist && nstglobalcomm % ir->nstlist != 0)
 +        {
 +            nstglobalcomm = (nstglobalcomm / ir->nstlist)*ir->nstlist;
 +            md_print_warn(cr,fplog,"WARNING: nstglobalcomm is larger than nstlist, but not a multiple, setting it to %d\n",nstglobalcomm);
 +        }
 +        if (ir->nstcalcenergy > 0)
 +        {
 +            check_nst_param(fplog,cr,"-gcom",nstglobalcomm,
 +                            "nstcalcenergy",&ir->nstcalcenergy);
 +        }
 +        if (ir->etc != etcNO && ir->nsttcouple > 0)
 +        {
 +            check_nst_param(fplog,cr,"-gcom",nstglobalcomm,
 +                            "nsttcouple",&ir->nsttcouple);
 +        }
 +        if (ir->epc != epcNO && ir->nstpcouple > 0)
 +        {
 +            check_nst_param(fplog,cr,"-gcom",nstglobalcomm,
 +                            "nstpcouple",&ir->nstpcouple);
 +        }
 +
 +        check_nst_param(fplog,cr,"-gcom",nstglobalcomm,
 +                        "nstenergy",&ir->nstenergy);
 +
 +        check_nst_param(fplog,cr,"-gcom",nstglobalcomm,
 +                        "nstlog",&ir->nstlog);
 +    }
 +
 +    if (ir->comm_mode != ecmNO && ir->nstcomm < nstglobalcomm)
 +    {
 +        md_print_warn(cr,fplog,"WARNING: Changing nstcomm from %d to %d\n",
 +                      ir->nstcomm,nstglobalcomm);
 +        ir->nstcomm = nstglobalcomm;
 +    }
 +
 +    return nstglobalcomm;
 +}
 +
 +void check_ir_old_tpx_versions(t_commrec *cr,FILE *fplog,
 +                               t_inputrec *ir,gmx_mtop_t *mtop)
 +{
 +    /* Check required for old tpx files */
 +    if (IR_TWINRANGE(*ir) && ir->nstlist > 1 &&
 +        ir->nstcalcenergy % ir->nstlist != 0)
 +    {
 +        md_print_warn(cr,fplog,"Old tpr file with twin-range settings: modifying energy calculation and/or T/P-coupling frequencies\n");
 +
 +        if (gmx_mtop_ftype_count(mtop,F_CONSTR) +
 +            gmx_mtop_ftype_count(mtop,F_CONSTRNC) > 0 &&
 +            ir->eConstrAlg == econtSHAKE)
 +        {
 +            md_print_warn(cr,fplog,"With twin-range cut-off's and SHAKE the virial and pressure are incorrect\n");
 +            if (ir->epc != epcNO)
 +            {
 +                gmx_fatal(FARGS,"Can not do pressure coupling with twin-range cut-off's and SHAKE");
 +            }
 +        }
 +        check_nst_param(fplog,cr,"nstlist",ir->nstlist,
 +                        "nstcalcenergy",&ir->nstcalcenergy);
 +        if (ir->epc != epcNO)
 +        {
 +            check_nst_param(fplog,cr,"nstlist",ir->nstlist,
 +                            "nstpcouple",&ir->nstpcouple);
 +        }
 +        check_nst_param(fplog,cr,"nstcalcenergy",ir->nstcalcenergy,
 +                        "nstenergy",&ir->nstenergy);
 +        check_nst_param(fplog,cr,"nstcalcenergy",ir->nstcalcenergy,
 +                        "nstlog",&ir->nstlog);
 +        if (ir->efep != efepNO)
 +        {
 +            check_nst_param(fplog,cr,"nstcalcenergy",ir->nstcalcenergy,
 +                            "nstdhdl",&ir->fepvals->nstdhdl);
 +        }
 +    }
 +}
 +
 +void rerun_parallel_comm(t_commrec *cr,t_trxframe *fr,
 +                         gmx_bool *bNotLastFrame)
 +{
 +    gmx_bool bAlloc;
 +    rvec *xp,*vp;
 +
 +    bAlloc = (fr->natoms == 0);
 +
 +    if (MASTER(cr) && !*bNotLastFrame)
 +    {
 +        fr->natoms = -1;
 +    }
 +    xp = fr->x;
 +    vp = fr->v;
 +    gmx_bcast(sizeof(*fr),fr,cr);
 +    fr->x = xp;
 +    fr->v = vp;
 +
 +    *bNotLastFrame = (fr->natoms >= 0);
 +
 +    if (*bNotLastFrame && PARTDECOMP(cr))
 +    {
 +        /* x and v are the only variable size quantities stored in trr
 +         * that are required for rerun (f is not needed).
 +         */
 +        if (bAlloc)
 +        {
 +            snew(fr->x,fr->natoms);
 +            snew(fr->v,fr->natoms);
 +        }
 +        if (fr->bX)
 +        {
 +            gmx_bcast(fr->natoms*sizeof(fr->x[0]),fr->x[0],cr);
 +        }
 +        if (fr->bV)
 +        {
 +            gmx_bcast(fr->natoms*sizeof(fr->v[0]),fr->v[0],cr);
 +        }
 +    }
 +}
index 826df3160a4f7b579ffdfbc087cb0ca409884c53,0000000000000000000000000000000000000000..d41e94dfe8e992dcff8687c0b0e94cebb161cb1f
mode 100644,000000..100644
--- /dev/null
@@@ -1,330 -1,0 +1,328 @@@
-     if (ir->bAdress)
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include "typedefs.h"
 +#include "mdatoms.h"
 +#include "smalloc.h"
 +#include "main.h"
 +#include "qmmm.h"
 +#include "mtop_util.h"
 +#include "gmx_omp_nthreads.h"
 +
 +#define ALMOST_ZERO 1e-30
 +
 +t_mdatoms *init_mdatoms(FILE *fp,gmx_mtop_t *mtop,gmx_bool bFreeEnergy)
 +{
 +  int    mb,a,g,nmol;
 +  double tmA,tmB;
 +  t_atom *atom;
 +  t_mdatoms *md;
 +  gmx_mtop_atomloop_all_t aloop;
 +  t_ilist *ilist;
 +
 +  snew(md,1);
 +
 +  md->nenergrp = mtop->groups.grps[egcENER].nr;
 +  md->bVCMgrps = FALSE;
 +  tmA = 0.0;
 +  tmB = 0.0;
 +
 +  aloop = gmx_mtop_atomloop_all_init(mtop);
 +  while(gmx_mtop_atomloop_all_next(aloop,&a,&atom)) {
 +    if (ggrpnr(&mtop->groups,egcVCM,a) > 0)
 +      md->bVCMgrps = TRUE;
 +    
 +    if (bFreeEnergy && PERTURBED(*atom)) {
 +      md->nPerturbed++;
 +      if (atom->mB != atom->m)
 +      md->nMassPerturbed++;
 +      if (atom->qB != atom->q)
 +      md->nChargePerturbed++;
 +    }
 +    
 +    tmA += atom->m;
 +    tmB += atom->mB;
 +  }
 +
 +  md->tmassA = tmA;
 +  md->tmassB = tmB;
 +  
 +  if (bFreeEnergy && fp)
 +    fprintf(fp,
 +          "There are %d atoms and %d charges for free energy perturbation\n",
 +          md->nPerturbed,md->nChargePerturbed);
 +
 +  md->bOrires = gmx_mtop_ftype_count(mtop,F_ORIRES);
 +
 +  return md;
 +}
 +
 +void atoms2md(gmx_mtop_t *mtop,t_inputrec *ir,
 +            int nindex,int *index,
 +            int start,int homenr,
 +            t_mdatoms *md) /* Fill the per-atom arrays of md from the topology.
 + *
 + * md->nr is set to mtop->natoms when index is NULL and homenr or start is
 + * positive, otherwise to nindex. When md->nr exceeds md->nalloc the arrays
 + * are grown with srenew; several of them are only allocated when the
 + * corresponding feature is in use, so they may stay NULL.
 + * For local atom i the topology atom is index[i], or i itself when index
 + * is NULL. On return md->start and md->homenr hold the arguments of the
 + * same name and md->lambda is reset to 0.
 + */
 +{
 +  gmx_mtop_atomlookup_t alook;
 +  int       i;
 +  t_grpopts *opts;
 +  gmx_groups_t *groups;
 +  gmx_molblock_t *molblock;
 +
 +  opts = &ir->opts;
 +
 +  groups = &mtop->groups;
 +
 +  molblock = mtop->molblock;
 +
 +  /* Index==NULL indicates particle decomposition,
 +   * unless we have an empty DD node, so also check for homenr and start.
 +   * This should be signaled properly with an extra parameter or nindex==-1.
 +   */
 +  if (index == NULL && (homenr > 0 || start > 0)) {
 +    md->nr = mtop->natoms;
 +  } else {
 +    md->nr = nindex;
 +  }
 +
 +  if (md->nr > md->nalloc) { /* grow every per-atom array that is in use */
 +    md->nalloc = over_alloc_dd(md->nr);
 +
 +    if (md->nMassPerturbed) {
 +      srenew(md->massA,md->nalloc);
 +      srenew(md->massB,md->nalloc);
 +    }
 +    srenew(md->massT,md->nalloc);
 +    srenew(md->invmass,md->nalloc);
 +    srenew(md->chargeA,md->nalloc);
 +    if (md->nPerturbed) {
 +      srenew(md->chargeB,md->nalloc);
 +    }
 +    srenew(md->typeA,md->nalloc);
 +    if (md->nPerturbed) {
 +      srenew(md->typeB,md->nalloc);
 +    }
 +    srenew(md->ptype,md->nalloc);
 +    if (opts->ngtc > 1) {
 +      srenew(md->cTC,md->nalloc);
 +      /* We always copy cTC with domain decomposition */
 +    }
 +    srenew(md->cENER,md->nalloc);
 +    if (opts->ngacc > 1)
 +      srenew(md->cACC,md->nalloc);
 +    if (opts->nFreeze &&
 +      (opts->ngfrz > 1 ||
 +       opts->nFreeze[0][XX] || opts->nFreeze[0][YY] || opts->nFreeze[0][ZZ]))
 +      srenew(md->cFREEZE,md->nalloc);
 +    if (md->bVCMgrps)
 +      srenew(md->cVCM,md->nalloc);
 +    if (md->bOrires)
 +      srenew(md->cORF,md->nalloc);
 +    if (md->nPerturbed)
 +      srenew(md->bPerturbed,md->nalloc);
 +    
 +    /* Note that these user t_mdatoms array pointers are NULL
 +     * when there is only one group present.
 +     * Therefore, when adding code, the user should use something like:
 +     * gprnrU1 = (md->cU1==NULL ? 0 : md->cU1[localatindex])
 +     */
 +    if (mtop->groups.grpnr[egcUser1] != NULL)
 +      srenew(md->cU1,md->nalloc);
 +    if (mtop->groups.grpnr[egcUser2] != NULL)
 +      srenew(md->cU2,md->nalloc);
 +    
 +    if (ir->bQMMM)
 +      srenew(md->bQM,md->nalloc);
-       md->purecg = FALSE;
-       md->pureex = FALSE;
++    if (ir->bAdress) {
 +      srenew(md->wf,md->nalloc);
 +      srenew(md->tf_table_index,md->nalloc);
++    }
 +  }
 +
 +  alook = gmx_mtop_atomlookup_init(mtop);
 +
 +#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntDefault)) schedule(static)
 +  for(i=0; i<md->nr; i++) { /* iteration i writes only element i of the md arrays */
 +    int     g,ag,molb;
 +    real    mA,mB,fac;
 +    t_atom  *atom;
 +
 +    if (index == NULL) {
 +      ag = i;
 +    } else {
 +      ag   = index[i];
 +    }
 +    gmx_mtop_atomnr_to_atom(alook,ag,&atom);
 +
 +    if (md->cFREEZE) {
 +      md->cFREEZE[i] = ggrpnr(groups,egcFREEZE,ag);
 +    }
 +        if (EI_ENERGY_MINIMIZATION(ir->eI))
 +        {
 +            /* Displacement is proportional to F, masses used for constraints */
 +            mA = 1.0;
 +            mB = 1.0;
 +        }
 +        else if (ir->eI == eiBD)
 +        {
 +            /* With BD the physical masses are irrelevant.
 +             * To keep the code simple we use most of the normal MD code path
 +             * for BD. Thus for constraining the masses should be proportional
 +             * to the friction coefficient. We set the absolute value such that
 +             * m/2<(dx/dt)^2> = m/2*2kT/fric*dt = kT/2 => m=fric*dt/2
 +             * Then if we set the (meaningless) velocity to v=dx/dt, we get the
 +             * correct kinetic energy and temperature using the usual code path.
 +             * Thus with BD v*dt will give the displacement and the reported
 +             * temperature can signal bad integration (too large time step).
 +             */
 +            if (ir->bd_fric > 0)
 +            {
 +                mA = 0.5*ir->bd_fric*ir->delta_t;
 +                mB = 0.5*ir->bd_fric*ir->delta_t;
 +            }
 +            else
 +            {
 +                /* The friction coefficient is mass/tau_t */
 +                fac = ir->delta_t/opts->tau_t[md->cTC ? groups->grpnr[egcTC][ag] : 0];
 +                mA = 0.5*atom->m*fac;
 +                mB = 0.5*atom->mB*fac;
 +            }
 +        }
 +        else
 +        {
 +            mA = atom->m;
 +            mB = atom->mB;
 +        }
 +    if (md->nMassPerturbed) {
 +      md->massA[i]    = mA;
 +      md->massB[i]    = mB;
 +    }
 +    md->massT[i]      = mA;
 +    if (mA == 0.0) {
 +      md->invmass[i]    = 0;
 +    } else if (md->cFREEZE) {
 +      g = md->cFREEZE[i];
 +      if (opts->nFreeze[g][XX] && opts->nFreeze[g][YY] && opts->nFreeze[g][ZZ])
 +      /* Set the inverse mass of completely frozen particles to ALMOST_ZERO
 +       * instead of 0 to avoid div by zero in lincs or shake.
 +       * Note that constraints can still move a partially frozen particle.
 +       */
 +      md->invmass[i]  = ALMOST_ZERO;
 +      else
 +      md->invmass[i]  = 1.0/mA;
 +    } else {
 +      md->invmass[i]  = 1.0/mA;
 +    }
 +    md->chargeA[i]    = atom->q;
 +    md->typeA[i]      = atom->type;
 +    if (md->nPerturbed) {
 +      md->chargeB[i]  = atom->qB;
 +      md->typeB[i]    = atom->typeB;
 +      md->bPerturbed[i] = PERTURBED(*atom);
 +    }
 +    md->ptype[i]      = atom->ptype;
 +    if (md->cTC)
 +      md->cTC[i]      = groups->grpnr[egcTC][ag];
 +    md->cENER[i]      =
 +      (groups->grpnr[egcENER] ? groups->grpnr[egcENER][ag] : 0);
 +    if (md->cACC)
 +      md->cACC[i]     = groups->grpnr[egcACC][ag];
 +    if (md->cVCM)
 +      md->cVCM[i]             = groups->grpnr[egcVCM][ag];
 +    if (md->cORF)
 +      md->cORF[i]             = groups->grpnr[egcORFIT][ag];
 +
 +    if (md->cU1)
 +      md->cU1[i]      = groups->grpnr[egcUser1][ag];
 +    if (md->cU2)
 +      md->cU2[i]      = groups->grpnr[egcUser2][ag];
 +
 +    if (ir->bQMMM) {
 +      if (groups->grpnr[egcQMMM] == 0 || 
 +        groups->grpnr[egcQMMM][ag] < groups->grps[egcQMMM].nr-1) {
 +      md->bQM[i]      = TRUE;
 +      } else {
 +      md->bQM[i]      = FALSE;
 +      }
 +    }
 +    /* Initialize the AdResS weighting function to 1.0 */
 +    if (ir->bAdress){
 +       md->wf[i]           = 1.0;
 +        /* if no tf table groups specified, use default table */
 +       md->tf_table_index[i] = DEFAULT_TF_TABLE;
 +       if (ir->adress->n_tf_grps > 0){
 +            /* if tf table groups specified, tf is only applied to those energy groups */
 +            md->tf_table_index[i] = NO_TF_TABLE;
 +            /* check whether atom is in one of the relevant energy groups and assign a table index;
 +             * the loop does not stop at the first match, so the last matching group wins */
 +            for (g=0; g<ir->adress->n_tf_grps; g++){
 +                if (md->cENER[i] == ir->adress->tf_table_index[g]){
 +                   md->tf_table_index[i] = g;
 +                }
 +            }
 +        }
 +    }
 +  }
 +
 +  gmx_mtop_atomlookup_destroy(alook);
 +
 +  md->start  = start;
 +  md->homenr = homenr;
 +  md->lambda = 0;
 +}
 +
 +void update_mdatoms(t_mdatoms *md,real lambda)
 +{
 +  int    al,end;
 +  real   L1=1.0-lambda;
 +  
 +  end=md->nr;
 +
 +  if (md->nMassPerturbed) {
 +    for(al=0; (al<end); al++) {
 +      if (md->bPerturbed[al]) {
 +      md->massT[al] = L1*md->massA[al]+ lambda*md->massB[al];
 +      if (md->invmass[al] > 1.1*ALMOST_ZERO)
 +        md->invmass[al] = 1.0/md->massT[al];
 +      }
 +    }
 +    md->tmass = L1*md->tmassA + lambda*md->tmassB;
 +  } else {
 +    md->tmass = md->tmassA;
 +  }
 +  md->lambda = lambda;
 +}
index a1e17220b22d167ae29da9161417b958130ed84b,0000000000000000000000000000000000000000..aa6cfad96f77e7aefa10760a013bd47e5c1e7a5b
mode 100644,000000..100644
--- /dev/null
@@@ -1,1452 -1,0 +1,1450 @@@
-         else if (i==F_VTEMP)
-             md->bEner[i] =  (EI_DYNAMICS(ir->eI) && getenv("GMX_VIRIAL_TEMPERATURE"));
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <string.h>
 +#include <float.h>
 +#include "typedefs.h"
 +#include "string2.h"
 +#include "mdebin.h"
 +#include "smalloc.h"
 +#include "physics.h"
 +#include "enxio.h"
 +#include "vec.h"
 +#include "disre.h"
 +#include "main.h"
 +#include "network.h"
 +#include "names.h"
 +#include "orires.h"
 +#include "constr.h"
 +#include "mtop_util.h"
 +#include "xvgr.h"
 +#include "gmxfio.h"
 +#include "macros.h"
 +#include "mdrun.h"
 +#include "mdebin_bar.h"
 +
 +
 +static const char *conrmsd_nm[] = { "Constr. rmsd", "Constr.2 rmsd" }; /* names of the md->nCrmsd constraint-rmsd entries */
 +
 +static const char *boxs_nm[] = { "Box-X", "Box-Y", "Box-Z" }; /* box entry names used when md->bTricl is false */
 +
 +static const char *tricl_boxs_nm[] = {
 +    "Box-XX", "Box-YY", "Box-ZZ",
 +    "Box-YX", "Box-ZX", "Box-ZY"
 +}; /* box entry names used when md->bTricl is true */
 +
 +static const char *vol_nm[] = { "Volume" }; /* registered with unit_volume for dynamic boxes */
 +
 +static const char *dens_nm[] = {"Density" }; /* registered with unit_density_SI for dynamic boxes */
 +
 +static const char *pv_nm[] = {"pV" }; /* registered with unit_energy when md->bDiagPres is set */
 +
 +static const char *enthalpy_nm[] = {"Enthalpy" }; /* presumably registered next to pV; its use is not visible here - confirm */
 +
 +static const char *boxvel_nm[] = {
 +    "Box-Vel-XX", "Box-Vel-YY", "Box-Vel-ZZ",
 +    "Box-Vel-YX", "Box-Vel-ZX", "Box-Vel-ZY"
 +}; /* presumably the box velocity entry names; their use is not visible here - confirm */
 +
 +#define NBOXS asize(boxs_nm) /* entry count passed to get_ebin_space with boxs_nm */
 +#define NTRICLBOXS asize(tricl_boxs_nm) /* entry count passed to get_ebin_space with tricl_boxs_nm */
 +
 +t_mdebin *init_mdebin(ener_file_t fp_ene,
 +                      const gmx_mtop_t *mtop,
 +                      const t_inputrec *ir,
 +                      FILE *fp_dhdl)
 +{
 +    const char *ener_nm[F_NRE];
 +    static const char *vir_nm[] = {
 +        "Vir-XX", "Vir-XY", "Vir-XZ",
 +        "Vir-YX", "Vir-YY", "Vir-YZ",
 +        "Vir-ZX", "Vir-ZY", "Vir-ZZ"
 +    };
 +    static const char *sv_nm[] = {
 +        "ShakeVir-XX", "ShakeVir-XY", "ShakeVir-XZ",
 +        "ShakeVir-YX", "ShakeVir-YY", "ShakeVir-YZ",
 +        "ShakeVir-ZX", "ShakeVir-ZY", "ShakeVir-ZZ"
 +    };
 +    static const char *fv_nm[] = {
 +        "ForceVir-XX", "ForceVir-XY", "ForceVir-XZ",
 +        "ForceVir-YX", "ForceVir-YY", "ForceVir-YZ",
 +        "ForceVir-ZX", "ForceVir-ZY", "ForceVir-ZZ"
 +    };
 +    static const char *pres_nm[] = {
 +        "Pres-XX","Pres-XY","Pres-XZ",
 +        "Pres-YX","Pres-YY","Pres-YZ",
 +        "Pres-ZX","Pres-ZY","Pres-ZZ"
 +    };
 +    static const char *surft_nm[] = {
 +        "#Surf*SurfTen"
 +    };
 +    static const char *mu_nm[] = {
 +        "Mu-X", "Mu-Y", "Mu-Z"
 +    };
 +    static const char *vcos_nm[] = {
 +        "2CosZ*Vel-X"
 +    };
 +    static const char *visc_nm[] = {
 +        "1/Viscosity"
 +    };
 +    static const char *baro_nm[] = {
 +        "Barostat"
 +    };
 +
 +    char     **grpnms;
 +    const gmx_groups_t *groups;
 +    char     **gnm;
 +    char     buf[256];
 +    const char     *bufi;
 +    t_mdebin *md;
 +    int      i,j,ni,nj,n,nh,k,kk,ncon,nset;
 +    gmx_bool     bBHAM,bNoseHoover,b14;
 +
 +    snew(md,1);
 +
 +    md->bVir=TRUE;
 +    md->bPress=TRUE;
 +    md->bSurft=TRUE;
 +    md->bMu=TRUE;
 +
 +    if (EI_DYNAMICS(ir->eI))
 +    {
 +        md->delta_t = ir->delta_t;
 +    }
 +    else
 +    {
 +        md->delta_t = 0;
 +    }
 +
 +    groups = &mtop->groups;
 +
 +    bBHAM = (mtop->ffparams.functype[0] == F_BHAM);
 +    b14   = (gmx_mtop_ftype_count(mtop,F_LJ14) > 0 ||
 +             gmx_mtop_ftype_count(mtop,F_LJC14_Q) > 0);
 +
 +    ncon = gmx_mtop_ftype_count(mtop,F_CONSTR);
 +    nset = gmx_mtop_ftype_count(mtop,F_SETTLE);
 +    md->bConstr    = (ncon > 0 || nset > 0);
 +    md->bConstrVir = FALSE;
 +    if (md->bConstr) {
 +        if (ncon > 0 && ir->eConstrAlg == econtLINCS) {
 +            if (ir->eI == eiSD2)
 +                md->nCrmsd = 2;
 +            else
 +                md->nCrmsd = 1;
 +        }
 +        md->bConstrVir = (getenv("GMX_CONSTRAINTVIR") != NULL);
 +    } else {
 +        md->nCrmsd = 0;
 +    }
 +
 +    /* Energy monitoring */
 +    for(i=0;i<egNR;i++)
 +    {
 +        md->bEInd[i]=FALSE;
 +    }
 +
 +#ifndef GMX_OPENMM
 +    for(i=0; i<F_NRE; i++)
 +    {
 +        md->bEner[i] = FALSE;
 +        if (i == F_LJ)
 +            md->bEner[i] = !bBHAM;
 +        else if (i == F_BHAM)
 +            md->bEner[i] = bBHAM;
 +        else if (i == F_EQM)
 +            md->bEner[i] = ir->bQMMM;
 +        else if (i == F_COUL_LR)
 +            md->bEner[i] = (ir->rcoulomb > ir->rlist);
 +        else if (i == F_LJ_LR)
 +            md->bEner[i] = (!bBHAM && ir->rvdw > ir->rlist);
 +        else if (i == F_BHAM_LR)
 +            md->bEner[i] = (bBHAM && ir->rvdw > ir->rlist);
 +        else if (i == F_RF_EXCL)
 +            md->bEner[i] = (EEL_RF(ir->coulombtype) && ir->coulombtype != eelRF_NEC && ir->cutoff_scheme == ecutsGROUP);
 +        else if (i == F_COUL_RECIP)
 +            md->bEner[i] = EEL_FULL(ir->coulombtype);
 +        else if (i == F_LJ14)
 +            md->bEner[i] = b14;
 +        else if (i == F_COUL14)
 +            md->bEner[i] = b14;
 +        else if (i == F_LJC14_Q || i == F_LJC_PAIRS_NB)
 +            md->bEner[i] = FALSE;
 +        else if ((i == F_DVDL_COUL && ir->fepvals->separate_dvdl[efptCOUL]) ||
 +                 (i == F_DVDL_VDW  && ir->fepvals->separate_dvdl[efptVDW]) ||
 +                 (i == F_DVDL_BONDED && ir->fepvals->separate_dvdl[efptBONDED]) ||
 +                 (i == F_DVDL_RESTRAINT && ir->fepvals->separate_dvdl[efptRESTRAINT]) ||
 +                 (i == F_DKDL && ir->fepvals->separate_dvdl[efptMASS]) ||
 +                 (i == F_DVDL && ir->fepvals->separate_dvdl[efptFEP]))
 +            md->bEner[i] = (ir->efep != efepNO);
 +        else if ((interaction_function[i].flags & IF_VSITE) ||
 +                 (i == F_CONSTR) || (i == F_CONSTRNC) || (i == F_SETTLE))
 +            md->bEner[i] = FALSE;
 +        else if ((i == F_COUL_SR) || (i == F_EPOT) || (i == F_PRES)  || (i==F_EQM))
 +            md->bEner[i] = TRUE;
 +        else if ((i == F_GBPOL) && ir->implicit_solvent==eisGBSA)
 +            md->bEner[i] = TRUE;
 +        else if ((i == F_NPSOLVATION) && ir->implicit_solvent==eisGBSA && (ir->sa_algorithm != esaNO))
 +            md->bEner[i] = TRUE;
 +        else if ((i == F_GB12) || (i == F_GB13) || (i == F_GB14))
 +            md->bEner[i] = FALSE;
 +        else if ((i == F_ETOT) || (i == F_EKIN) || (i == F_TEMP))
 +            md->bEner[i] = EI_DYNAMICS(ir->eI);
 +        else if (i == F_DISPCORR || i == F_PDISPCORR)
 +            md->bEner[i] = (ir->eDispCorr != edispcNO);
 +        else if (i == F_DISRESVIOL)
 +            md->bEner[i] = (gmx_mtop_ftype_count(mtop,F_DISRES) > 0);
 +        else if (i == F_ORIRESDEV)
 +            md->bEner[i] = (gmx_mtop_ftype_count(mtop,F_ORIRES) > 0);
 +        else if (i == F_CONNBONDS)
 +            md->bEner[i] = FALSE;
 +        else if (i == F_COM_PULL)
 +            md->bEner[i] = (ir->ePull == epullUMBRELLA || ir->ePull == epullCONST_F || ir->bRot);
 +        else if (i == F_ECONSERVED)
 +            md->bEner[i] = ((ir->etc == etcNOSEHOOVER || ir->etc == etcVRESCALE) &&
 +                            (ir->epc == epcNO || ir->epc==epcMTTK));
 +        else
 +            md->bEner[i] = (gmx_mtop_ftype_count(mtop,i) > 0);
 +    }
 +#else
 +    /* OpenMM always produces only the following 4 energy terms */
 +    md->bEner[F_EPOT] = TRUE;
 +    md->bEner[F_EKIN] = TRUE;
 +    md->bEner[F_ETOT] = TRUE;
 +    md->bEner[F_TEMP] = TRUE;
 +#endif
 +
 +    /* for adress simulations, most energy terms are not meaningfull, and thus disabled*/
 +    if (ir->bAdress && !debug) {
 +        for (i = 0; i < F_NRE; i++) {
 +            md->bEner[i] = FALSE;
 +            if(i == F_EKIN){ md->bEner[i] = TRUE;}
 +            if(i == F_TEMP){ md->bEner[i] = TRUE;}
 +        }
 +        md->bVir=FALSE;
 +        md->bPress=FALSE;
 +        md->bSurft=FALSE;
 +        md->bMu=FALSE;
 +    }
 +
 +    md->f_nre=0;
 +    for(i=0; i<F_NRE; i++)
 +    {
 +        if (md->bEner[i])
 +        {
 +            ener_nm[md->f_nre]=interaction_function[i].longname;
 +            md->f_nre++;
 +        }
 +    }
 +
 +    md->epc = ir->epc;
 +    md->bDiagPres = !TRICLINIC(ir->ref_p);
 +    md->ref_p = (ir->ref_p[XX][XX]+ir->ref_p[YY][YY]+ir->ref_p[ZZ][ZZ])/DIM;
 +    md->bTricl = TRICLINIC(ir->compress) || TRICLINIC(ir->deform);
 +    md->bDynBox = DYNAMIC_BOX(*ir);
 +    md->etc = ir->etc;
 +    md->bNHC_trotter = IR_NVT_TROTTER(ir);
 +    md->bPrintNHChains = ir-> bPrintNHChains;
 +    md->bMTTK = (IR_NPT_TROTTER(ir) || IR_NPH_TROTTER(ir));
 +    md->bMu = NEED_MUTOT(*ir);
 +
 +    md->ebin  = mk_ebin();
 +    /* Pass NULL for unit to let get_ebin_space determine the units
 +     * for interaction_function[i].longname
 +     */
 +    md->ie    = get_ebin_space(md->ebin,md->f_nre,ener_nm,NULL);
 +    if (md->nCrmsd)
 +    {
 +        /* This should be called directly after the call for md->ie,
 +         * such that md->iconrmsd follows directly in the list.
 +         */
 +        md->iconrmsd = get_ebin_space(md->ebin,md->nCrmsd,conrmsd_nm,"");
 +    }
 +    if (md->bDynBox)
 +    {
 +        md->ib    = get_ebin_space(md->ebin,
 +                                   md->bTricl ? NTRICLBOXS : NBOXS,
 +                                   md->bTricl ? tricl_boxs_nm : boxs_nm,
 +                                   unit_length);
 +        md->ivol  = get_ebin_space(md->ebin, 1, vol_nm,  unit_volume);
 +        md->idens = get_ebin_space(md->ebin, 1, dens_nm, unit_density_SI);
 +        if (md->bDiagPres)
 +        {
 +            md->ipv   = get_ebin_space(md->ebin, 1, pv_nm,   unit_energy);
 +            md->ienthalpy = get_ebin_space(md->ebin, 1, enthalpy_nm,   unit_energy);
 +        }
 +    }
 +    if (md->bConstrVir)
 +    {
 +        md->isvir = get_ebin_space(md->ebin,asize(sv_nm),sv_nm,unit_energy);
 +        md->ifvir = get_ebin_space(md->ebin,asize(fv_nm),fv_nm,unit_energy);
 +    }
 +    if (md->bVir)
 +        md->ivir   = get_ebin_space(md->ebin,asize(vir_nm),vir_nm,unit_energy);
 +    if (md->bPress)
 +        md->ipres  = get_ebin_space(md->ebin,asize(pres_nm),pres_nm,unit_pres_bar);
 +    if (md->bSurft)
 +        md->isurft = get_ebin_space(md->ebin,asize(surft_nm),surft_nm,
 +                                unit_surft_bar);
 +    if (md->epc == epcPARRINELLORAHMAN || md->epc == epcMTTK)
 +    {
 +        md->ipc = get_ebin_space(md->ebin,md->bTricl ? 6 : 3,
 +                                 boxvel_nm,unit_vel);
 +    }
 +    if (md->bMu)
 +    {
 +        md->imu    = get_ebin_space(md->ebin,asize(mu_nm),mu_nm,unit_dipole_D);
 +    }
 +    if (ir->cos_accel != 0)
 +    {
 +        md->ivcos = get_ebin_space(md->ebin,asize(vcos_nm),vcos_nm,unit_vel);
 +        md->ivisc = get_ebin_space(md->ebin,asize(visc_nm),visc_nm,
 +                                   unit_invvisc_SI);
 +    }
 +
 +    /* Energy monitoring */
 +    for(i=0;i<egNR;i++)
 +    {
 +        md->bEInd[i] = FALSE;
 +    }
 +    md->bEInd[egCOULSR] = TRUE;
 +    md->bEInd[egLJSR  ] = TRUE;
 +
 +    if (ir->rcoulomb > ir->rlist)
 +    {
 +        md->bEInd[egCOULLR] = TRUE;
 +    }
 +    if (!bBHAM)
 +    {
 +        if (ir->rvdw > ir->rlist)
 +        {
 +            md->bEInd[egLJLR]   = TRUE;
 +        }
 +    }
 +    else
 +    {
 +        md->bEInd[egLJSR]   = FALSE;
 +        md->bEInd[egBHAMSR] = TRUE;
 +        if (ir->rvdw > ir->rlist)
 +        {
 +            md->bEInd[egBHAMLR]   = TRUE;
 +        }
 +    }
 +    if (b14)
 +    {
 +        md->bEInd[egLJ14] = TRUE;
 +        md->bEInd[egCOUL14] = TRUE;
 +    }
 +    md->nEc=0;
 +    for(i=0; (i<egNR); i++)
 +    {
 +        if (md->bEInd[i])
 +        {
 +            md->nEc++;
 +        }
 +    }
 +
 +    n=groups->grps[egcENER].nr;
 +    /* For AdResS simulations most energy terms are not meaningful and are therefore disabled. */
 +    if (!ir->bAdress){
 +        /*standard simulation*/
 +        md->nEg=n;
 +        md->nE=(n*(n+1))/2;
 +    }
 +    else if (!debug) {
 +        /*AdResS simulation*/
 +       md->nU=0;
 +       md->nEg=0;
 +       md->nE=0;
 +       md->nEc=0;
 +       md->isvir=FALSE;
 +    }
 +    snew(md->igrp,md->nE);
 +    if (md->nE > 1)
 +    {
 +        n=0;
 +        snew(gnm,md->nEc);
 +        for(k=0; (k<md->nEc); k++)
 +        {
 +            snew(gnm[k],STRLEN);
 +        }
 +        for(i=0; (i<groups->grps[egcENER].nr); i++)
 +        {
 +            ni=groups->grps[egcENER].nm_ind[i];
 +            for(j=i; (j<groups->grps[egcENER].nr); j++)
 +            {
 +                nj=groups->grps[egcENER].nm_ind[j];
 +                for(k=kk=0; (k<egNR); k++)
 +                {
 +                    if (md->bEInd[k])
 +                    {
 +                        sprintf(gnm[kk],"%s:%s-%s",egrp_nm[k],
 +                                *(groups->grpname[ni]),*(groups->grpname[nj]));
 +                        kk++;
 +                    }
 +                }
 +                md->igrp[n]=get_ebin_space(md->ebin,md->nEc,
 +                                           (const char **)gnm,unit_energy);
 +                n++;
 +            }
 +        }
 +        for(k=0; (k<md->nEc); k++)
 +        {
 +            sfree(gnm[k]);
 +        }
 +        sfree(gnm);
 +
 +        if (n != md->nE)
 +        {
 +            gmx_incons("Number of energy terms wrong");
 +        }
 +    }
 +
 +    md->nTC=groups->grps[egcTC].nr;
 +    md->nNHC = ir->opts.nhchainlength; /* shorthand for number of NH chains */
 +    if (md->bMTTK)
 +    {
 +        md->nTCP = 1;  /* assume only one possible coupling system for barostat
 +                          for now */
 +    }
 +    else
 +    {
 +        md->nTCP = 0;
 +    }
 +    if (md->etc == etcNOSEHOOVER)
 +    {
 +        if (md->bNHC_trotter)
 +        {
 +            md->mde_n = 2*md->nNHC*md->nTC;
 +        }
 +        else
 +        {
 +            md->mde_n = 2*md->nTC;
 +        }
 +        if (md->epc == epcMTTK)
 +        {
 +            md->mdeb_n = 2*md->nNHC*md->nTCP;
 +        }
 +    } else {
 +        md->mde_n = md->nTC;
 +        md->mdeb_n = 0;
 +    }
 +
 +    snew(md->tmp_r,md->mde_n);
 +    snew(md->tmp_v,md->mde_n);
 +    snew(md->grpnms,md->mde_n);
 +    grpnms = md->grpnms;
 +
 +    for(i=0; (i<md->nTC); i++)
 +    {
 +        ni=groups->grps[egcTC].nm_ind[i];
 +        sprintf(buf,"T-%s",*(groups->grpname[ni]));
 +        grpnms[i]=strdup(buf);
 +    }
 +    md->itemp=get_ebin_space(md->ebin,md->nTC,(const char **)grpnms,
 +                             unit_temp_K);
 +
 +    if (md->etc == etcNOSEHOOVER)
 +    {
 +        if (md->bPrintNHChains)
 +        {
 +            if (md->bNHC_trotter)
 +            {
 +                for(i=0; (i<md->nTC); i++)
 +                {
 +                    ni=groups->grps[egcTC].nm_ind[i];
 +                    bufi = *(groups->grpname[ni]);
 +                    for(j=0; (j<md->nNHC); j++)
 +                    {
 +                        sprintf(buf,"Xi-%d-%s",j,bufi);
 +                        grpnms[2*(i*md->nNHC+j)]=strdup(buf);
 +                        sprintf(buf,"vXi-%d-%s",j,bufi);
 +                        grpnms[2*(i*md->nNHC+j)+1]=strdup(buf);
 +                    }
 +                }
 +                md->itc=get_ebin_space(md->ebin,md->mde_n,
 +                                       (const char **)grpnms,unit_invtime);
 +                if (md->bMTTK)
 +                {
 +                    for(i=0; (i<md->nTCP); i++)
 +                    {
 +                        bufi = baro_nm[0];  /* All barostat DOF's together for now. */
 +                        for(j=0; (j<md->nNHC); j++)
 +                        {
 +                            sprintf(buf,"Xi-%d-%s",j,bufi);
 +                            grpnms[2*(i*md->nNHC+j)]=strdup(buf);
 +                            sprintf(buf,"vXi-%d-%s",j,bufi);
 +                            grpnms[2*(i*md->nNHC+j)+1]=strdup(buf);
 +                        }
 +                    }
 +                    md->itcb=get_ebin_space(md->ebin,md->mdeb_n,
 +                                            (const char **)grpnms,unit_invtime);
 +                }
 +            }
 +            else
 +            {
 +                for(i=0; (i<md->nTC); i++)
 +                {
 +                    ni=groups->grps[egcTC].nm_ind[i];
 +                    bufi = *(groups->grpname[ni]);
 +                    sprintf(buf,"Xi-%s",bufi);
 +                    grpnms[2*i]=strdup(buf);
 +                    sprintf(buf,"vXi-%s",bufi);
 +                    grpnms[2*i+1]=strdup(buf);
 +                }
 +                md->itc=get_ebin_space(md->ebin,md->mde_n,
 +                                       (const char **)grpnms,unit_invtime);
 +            }
 +        }
 +    }
 +    else if (md->etc == etcBERENDSEN || md->etc == etcYES ||
 +             md->etc == etcVRESCALE)
 +    {
 +        for(i=0; (i<md->nTC); i++)
 +        {
 +            ni=groups->grps[egcTC].nm_ind[i];
 +            sprintf(buf,"Lamb-%s",*(groups->grpname[ni]));
 +            grpnms[i]=strdup(buf);
 +        }
 +        md->itc=get_ebin_space(md->ebin,md->mde_n,(const char **)grpnms,"");
 +    }
 +
 +    sfree(grpnms);
 +
 +
 +    md->nU=groups->grps[egcACC].nr;
 +    if (md->nU > 1)
 +    {
 +        snew(grpnms,3*md->nU);
 +        for(i=0; (i<md->nU); i++)
 +        {
 +            ni=groups->grps[egcACC].nm_ind[i];
 +            sprintf(buf,"Ux-%s",*(groups->grpname[ni]));
 +            grpnms[3*i+XX]=strdup(buf);
 +            sprintf(buf,"Uy-%s",*(groups->grpname[ni]));
 +            grpnms[3*i+YY]=strdup(buf);
 +            sprintf(buf,"Uz-%s",*(groups->grpname[ni]));
 +            grpnms[3*i+ZZ]=strdup(buf);
 +        }
 +        md->iu=get_ebin_space(md->ebin,3*md->nU,(const char **)grpnms,unit_vel);
 +        sfree(grpnms);
 +    }
 +
 +    if ( fp_ene )
 +    {
 +        do_enxnms(fp_ene,&md->ebin->nener,&md->ebin->enm);
 +    }
 +
 +    md->print_grpnms=NULL;
 +
 +    /* check whether we're going to write dh histograms */
 +    md->dhc=NULL;
 +    if (ir->fepvals->separate_dhdl_file == esepdhdlfileNO )
 +    {
 +        /* Currently dh histograms are only written with dynamics */
 +        if (EI_DYNAMICS(ir->eI))
 +        {
 +            snew(md->dhc, 1);
 +
 +            mde_delta_h_coll_init(md->dhc, ir);
 +        }
 +        md->fp_dhdl = NULL;
 +    }
 +    else
 +    {
 +        md->fp_dhdl = fp_dhdl;
 +    }
 +    if (ir->bSimTemp) {
 +        int i;
 +        snew(md->temperatures,ir->fepvals->n_lambda);
 +        for (i=0;i<ir->fepvals->n_lambda;i++)
 +        {
 +            md->temperatures[i] = ir->simtempvals->temperatures[i];
 +        }
 +    }
 +    return md;
 +}
 +
 +extern FILE *open_dhdl(const char *filename,const t_inputrec *ir,
 +                       const output_env_t oenv)
 +{
 +    FILE *fp;
 +    const char *dhdl="dH/d\\lambda",*deltag="\\DeltaH",*lambda="\\lambda",
 +        *lambdastate="\\lambda state",*remain="remaining";
 +    char title[STRLEN],label_x[STRLEN],label_y[STRLEN];
 +    int  i,np,nps,nsets,nsets_de,nsetsbegin;
 +    t_lambda *fep;
 +    char **setname;
 +    char buf[STRLEN];
 +    int bufplace=0;
 +
 +    int nsets_dhdl = 0;
 +    int s = 0;
 +    int nsetsextend;
 +
 +    /* for simplicity */
 +    fep = ir->fepvals;
 +
 +    if (fep->n_lambda == 0)
 +    {
 +        sprintf(title,"%s",dhdl);
 +        sprintf(label_x,"Time (ps)");
 +        sprintf(label_y,"%s (%s %s)",
 +                dhdl,unit_energy,"[\\lambda]\\S-1\\N");
 +    }
 +    else
 +    {
 +        sprintf(title,"%s and %s",dhdl,deltag);
 +        sprintf(label_x,"Time (ps)");
 +        sprintf(label_y,"%s and %s (%s %s)",
 +                dhdl,deltag,unit_energy,"[\\8l\\4]\\S-1\\N");
 +    }
 +    fp = gmx_fio_fopen(filename,"w+");
 +    xvgr_header(fp,title,label_x,label_y,exvggtXNY,oenv);
 +
 +    if (!(ir->bSimTemp))
 +    {
 +        bufplace = sprintf(buf,"T = %g (K) ",
 +                ir->opts.ref_t[0]);
 +    }
 +    if (ir->efep != efepSLOWGROWTH)
 +    {
 +        if (fep->n_lambda == 0)
 +        {
 +            sprintf(&(buf[bufplace]),"%s = %g",
 +                    lambda,fep->init_lambda);
 +        }
 +        else
 +        {
 +            sprintf(&(buf[bufplace]),"%s = %d",
 +                    lambdastate,fep->init_fep_state);
 +        }
 +    }
 +    xvgr_subtitle(fp,buf,oenv);
 +
 +    for (i=0;i<efptNR;i++)
 +    {
 +        if (fep->separate_dvdl[i]) {nsets_dhdl++;}
 +    }
 +
 +    /* count the number of delta_g states */
 +    nsets_de = fep->n_lambda;
 +
 +    nsets = nsets_dhdl + nsets_de; /* dhdl + fep differences */
 +
 +    if (fep->n_lambda>0 && ir->bExpanded)
 +    {
 +        nsets += 1;   /*add fep state for expanded ensemble */
 +    }
 +
 +    if (fep->bPrintEnergy)
 +    {
 +        nsets += 1;  /* add energy to the dhdl as well */
 +    }
 +
 +    nsetsextend = nsets;
 +    if ((ir->epc!=epcNO) && (fep->n_lambda>0))
 +    {
 +        nsetsextend += 1; /* for PV term, other terms possible if required for the reduced potential (only needed with foreign lambda) */
 +    }
 +    snew(setname,nsetsextend);
 +
 +    if (ir->bExpanded)
 +    {
 +        /* state for the fep_vals, if we have alchemical sampling */
 +        sprintf(buf,"%s","Thermodynamic state");
 +        setname[s] = strdup(buf);
 +        s+=1;
 +    }
 +
 +    if (fep->bPrintEnergy)
 +    {
 +        sprintf(buf,"%s (%s)","Energy",unit_energy);
 +        setname[s] = strdup(buf);
 +        s+=1;
 +    }
 +
 +    for (i=0;i<efptNR;i++)
 +    {
 +        if (fep->separate_dvdl[i]) {
 +            sprintf(buf,"%s (%s)",dhdl,efpt_names[i]);
 +            setname[s] = strdup(buf);
 +            s+=1;
 +        }
 +    }
 +
 +    if (fep->n_lambda > 0)
 +    {
 +        /* g_bar has to determine the lambda values used in this simulation
 +         * from this xvg legend.
 +         */
 +
 +        if (ir->bExpanded) {
 +            nsetsbegin = 1;  /* for including the expanded ensemble */
 +        } else {
 +            nsetsbegin = 0;
 +        }
 +
 +        if (fep->bPrintEnergy)
 +        {
 +            nsetsbegin += 1;
 +        }
 +        nsetsbegin += nsets_dhdl;
 +
 +        for(s=nsetsbegin; s<nsets; s++)
 +        {
 +            nps = sprintf(buf,"%s %s (",deltag,lambda);
 +            for (i=0;i<efptNR;i++)
 +            {
 +                if (fep->separate_dvdl[i])
 +                {
 +                    np = sprintf(&buf[nps],"%g,",fep->all_lambda[i][s-(nsetsbegin)]);
 +                    nps += np;
 +                }
 +            }
 +            if (ir->bSimTemp)
 +            {
 +                /* print the temperature for this state if doing simulated annealing */
 +                sprintf(&buf[nps],"T = %g (%s))",ir->simtempvals->temperatures[s-(nsetsbegin)],unit_temp_K);
 +            }
 +            else
 +            {
 +                sprintf(&buf[nps-1],")");  /* -1 to overwrite the last comma */
 +            }
 +            setname[s] = strdup(buf);
 +        }
 +        if (ir->epc!=epcNO) {
 +            np = sprintf(buf,"pV (%s)",unit_energy);
 +            setname[nsetsextend-1] = strdup(buf);  /* the first entry after nsets */
 +        }
 +
 +        xvgr_legend(fp,nsetsextend,(const char **)setname,oenv);
 +
 +        for(s=0; s<nsetsextend; s++)
 +        {
 +            sfree(setname[s]);
 +        }
 +        sfree(setname);
 +    }
 +
 +    return fp;
 +}
 +
 +static void copy_energy(t_mdebin *md, real e[],real ecpy[])
 +{
 +    int i,j;
 +
 +    for(i=j=0; (i<F_NRE); i++)
 +        if (md->bEner[i])
 +            ecpy[j++] = e[i];
 +    if (j != md->f_nre)
 +        gmx_incons("Number of energy terms wrong");
 +}
 +
 +void upd_mdebin(t_mdebin *md,
 +                gmx_bool bDoDHDL,
 +                gmx_bool bSum,
 +                double time,
 +                real tmass,
 +                gmx_enerdata_t *enerd,
 +                t_state *state,
 +                t_lambda *fep,
 +                t_expanded *expand,
 +                matrix  box,
 +                tensor svir,
 +                tensor fvir,
 +                tensor vir,
 +                tensor pres,
 +                gmx_ekindata_t *ekind,
 +                rvec mu_tot,
 +                gmx_constr_t constr)
 +{
 +    int    i,j,k,kk,m,n,gid;
 +    real   crmsd[2],tmp6[6];
 +    real   bs[NTRICLBOXS],vol,dens,pv,enthalpy;
 +    real   eee[egNR];
 +    real   ecopy[F_NRE];
 +    double store_dhdl[efptNR];
 +    double *dE=NULL;
 +    real   store_energy=0;
 +    real   tmp;
 +
 +    /* Do NOT use the box in the state variable, but the separate box provided
 +     * as an argument. This is because we sometimes need to write the box from
 +     * the last timestep to match the trajectory frames.
 +     */
 +    copy_energy(md, enerd->term,ecopy);
 +    add_ebin(md->ebin,md->ie,md->f_nre,ecopy,bSum);
 +    if (md->nCrmsd)
 +    {
 +        crmsd[0] = constr_rmsd(constr,FALSE);
 +        if (md->nCrmsd > 1)
 +        {
 +            crmsd[1] = constr_rmsd(constr,TRUE);
 +        }
 +        add_ebin(md->ebin,md->iconrmsd,md->nCrmsd,crmsd,FALSE);
 +    }
 +    if (md->bDynBox)
 +    {
 +        int nboxs;
 +        if(md->bTricl)
 +        {
 +            bs[0] = box[XX][XX];
 +            bs[1] = box[YY][YY];
 +            bs[2] = box[ZZ][ZZ];
 +            bs[3] = box[YY][XX];
 +            bs[4] = box[ZZ][XX];
 +            bs[5] = box[ZZ][YY];
 +            nboxs=NTRICLBOXS;
 +        }
 +        else
 +        {
 +            bs[0] = box[XX][XX];
 +            bs[1] = box[YY][YY];
 +            bs[2] = box[ZZ][ZZ];
 +            nboxs=NBOXS;
 +        }
 +        vol  = box[XX][XX]*box[YY][YY]*box[ZZ][ZZ];
 +        dens = (tmass*AMU)/(vol*NANO*NANO*NANO);
 +        add_ebin(md->ebin,md->ib   ,nboxs,bs   ,bSum);
 +        add_ebin(md->ebin,md->ivol ,1    ,&vol ,bSum);
 +        add_ebin(md->ebin,md->idens,1    ,&dens,bSum);
 +
 +        if (md->bDiagPres)
 +        {
 +            /* This is pV (in kJ/mol).  The pressure is the reference pressure,
 +               not the instantaneous pressure */
 +            pv = vol*md->ref_p/PRESFAC;
 +
 +            add_ebin(md->ebin,md->ipv  ,1    ,&pv  ,bSum);
 +            enthalpy = pv + enerd->term[F_ETOT];
 +            add_ebin(md->ebin,md->ienthalpy  ,1    ,&enthalpy  ,bSum);
 +        }
 +    }
 +    if (md->bConstrVir)
 +    {
 +        add_ebin(md->ebin,md->isvir,9,svir[0],bSum);
 +        add_ebin(md->ebin,md->ifvir,9,fvir[0],bSum);
 +    }
 +    if (md->bVir)
 +        add_ebin(md->ebin,md->ivir,9,vir[0],bSum);
 +    if (md->bPress)
 +        add_ebin(md->ebin,md->ipres,9,pres[0],bSum);
 +    if (md->bSurft){
 +        tmp = (pres[ZZ][ZZ]-(pres[XX][XX]+pres[YY][YY])*0.5)*box[ZZ][ZZ];
 +        add_ebin(md->ebin,md->isurft,1,&tmp,bSum);
 +    }
 +    if (md->epc == epcPARRINELLORAHMAN || md->epc == epcMTTK)
 +    {
 +        tmp6[0] = state->boxv[XX][XX];
 +        tmp6[1] = state->boxv[YY][YY];
 +        tmp6[2] = state->boxv[ZZ][ZZ];
 +        tmp6[3] = state->boxv[YY][XX];
 +        tmp6[4] = state->boxv[ZZ][XX];
 +        tmp6[5] = state->boxv[ZZ][YY];
 +        add_ebin(md->ebin,md->ipc,md->bTricl ? 6 : 3,tmp6,bSum);
 +    }
 +    if (md->bMu)
 +    {
 +        add_ebin(md->ebin,md->imu,3,mu_tot,bSum);
 +    }
 +    if (ekind && ekind->cosacc.cos_accel != 0)
 +    {
 +        vol  = box[XX][XX]*box[YY][YY]*box[ZZ][ZZ];
 +        dens = (tmass*AMU)/(vol*NANO*NANO*NANO);
 +        add_ebin(md->ebin,md->ivcos,1,&(ekind->cosacc.vcos),bSum);
 +        /* 1/viscosity, unit 1/(kg m^-1 s^-1) */
 +        tmp = 1/(ekind->cosacc.cos_accel/(ekind->cosacc.vcos*PICO)
 +                 *dens*vol*sqr(box[ZZ][ZZ]*NANO/(2*M_PI)));
 +        add_ebin(md->ebin,md->ivisc,1,&tmp,bSum);
 +    }
 +    if (md->nE > 1)
 +    {
 +        n=0;
 +        for(i=0; (i<md->nEg); i++)
 +        {
 +            for(j=i; (j<md->nEg); j++)
 +            {
 +                gid=GID(i,j,md->nEg);
 +                for(k=kk=0; (k<egNR); k++)
 +                {
 +                    if (md->bEInd[k])
 +                    {
 +                        eee[kk++] = enerd->grpp.ener[k][gid];
 +                    }
 +                }
 +                add_ebin(md->ebin,md->igrp[n],md->nEc,eee,bSum);
 +                n++;
 +            }
 +        }
 +    }
 +
 +    if (ekind)
 +    {
 +        for(i=0; (i<md->nTC); i++)
 +        {
 +            md->tmp_r[i] = ekind->tcstat[i].T;
 +        }
 +        add_ebin(md->ebin,md->itemp,md->nTC,md->tmp_r,bSum);
 +
 +        if (md->etc == etcNOSEHOOVER)
 +        {
 +            /* whether to print Nose-Hoover chains: */
 +            if (md->bPrintNHChains)
 +            {
 +                if (md->bNHC_trotter)
 +                {
 +                    for(i=0; (i<md->nTC); i++)
 +                    {
 +                        for (j=0;j<md->nNHC;j++)
 +                        {
 +                            k = i*md->nNHC+j;
 +                            md->tmp_r[2*k] = state->nosehoover_xi[k];
 +                            md->tmp_r[2*k+1] = state->nosehoover_vxi[k];
 +                        }
 +                    }
 +                    add_ebin(md->ebin,md->itc,md->mde_n,md->tmp_r,bSum);
 +
 +                    if (md->bMTTK) {
 +                        for(i=0; (i<md->nTCP); i++)
 +                        {
 +                            for (j=0;j<md->nNHC;j++)
 +                            {
 +                                k = i*md->nNHC+j;
 +                                md->tmp_r[2*k] = state->nhpres_xi[k];
 +                                md->tmp_r[2*k+1] = state->nhpres_vxi[k];
 +                            }
 +                        }
 +                        add_ebin(md->ebin,md->itcb,md->mdeb_n,md->tmp_r,bSum);
 +                    }
 +                }
 +                else
 +                {
 +                    for(i=0; (i<md->nTC); i++)
 +                    {
 +                        md->tmp_r[2*i] = state->nosehoover_xi[i];
 +                        md->tmp_r[2*i+1] = state->nosehoover_vxi[i];
 +                    }
 +                    add_ebin(md->ebin,md->itc,md->mde_n,md->tmp_r,bSum);
 +                }
 +            }
 +        }
 +        else if (md->etc == etcBERENDSEN || md->etc == etcYES ||
 +                 md->etc == etcVRESCALE)
 +        {
 +            for(i=0; (i<md->nTC); i++)
 +            {
 +                md->tmp_r[i] = ekind->tcstat[i].lambda;
 +            }
 +            add_ebin(md->ebin,md->itc,md->nTC,md->tmp_r,bSum);
 +        }
 +    }
 +
 +    if (ekind && md->nU > 1)
 +    {
 +        for(i=0; (i<md->nU); i++)
 +        {
 +            copy_rvec(ekind->grpstat[i].u,md->tmp_v[i]);
 +        }
 +        add_ebin(md->ebin,md->iu,3*md->nU,md->tmp_v[0],bSum);
 +    }
 +
 +    ebin_increase_count(md->ebin,bSum);
 +
 +    /* BAR + thermodynamic integration values */
 +    if ((md->fp_dhdl || md->dhc) && bDoDHDL && (enerd->n_lambda > 0))
 +    {
 +        snew(dE,enerd->n_lambda-1);
 +        for(i=0; i<enerd->n_lambda-1; i++) {
 +            dE[i] = enerd->enerpart_lambda[i+1]-enerd->enerpart_lambda[0];  /* zero for simulated tempering */
 +            if (md->temperatures!=NULL)
 +            {
 +                /* MRS: is this right, given the way we have defined the exchange probabilities? */
 +                /* is this even useful to have at all? */
 +                dE[i] += (md->temperatures[i]/md->temperatures[state->fep_state]-1.0)*enerd->term[F_EKIN];
 +            }
 +        }
 +    }
 +
 +    if (md->fp_dhdl && bDoDHDL)
 +    {
 +        fprintf(md->fp_dhdl,"%.4f",time);
 +        /* the current free energy state */
 +
 +        /* print the current state if we are doing expanded ensemble */
 +        if (expand->elmcmove > elmcmoveNO) {
 +            fprintf(md->fp_dhdl," %4d",state->fep_state);
 +        }
 +        /* total energy (for if the temperature changes */
 +        if (fep->bPrintEnergy)
 +        {
 +            store_energy = enerd->term[F_ETOT];
 +            fprintf(md->fp_dhdl," %#.8g",store_energy);
 +        }
 +
 +        for (i=0;i<efptNR;i++)
 +        {
 +            if (fep->separate_dvdl[i])
 +            {
 +                fprintf(md->fp_dhdl," %#.8g",enerd->term[F_DVDL+i]); /* assumes F_DVDL is first */
 +            }
 +        }
 +        for(i=1; i<enerd->n_lambda; i++)
 +        {
 +            fprintf(md->fp_dhdl," %#.8g",dE[i-1]);
 +
 +        }
 +        if ((md->epc!=epcNO)  && (enerd->n_lambda > 0))
 +        {
 +            fprintf(md->fp_dhdl," %#.8g",pv);   /* PV term only needed when there are alternate state lambda */
 +        }
 +        fprintf(md->fp_dhdl,"\n");
 +        /* and the binary free energy output */
 +    }
 +    if (md->dhc && bDoDHDL)
 +    {
 +        int idhdl = 0;
 +        for (i=0;i<efptNR;i++)
 +        {
 +            if (fep->separate_dvdl[i])
 +            {
 +                store_dhdl[idhdl] = enerd->term[F_DVDL+i]; /* assumes F_DVDL is first */
 +                idhdl+=1;
 +            }
 +        }
 +        /* store_dh is dE */
 +        mde_delta_h_coll_add_dh(md->dhc,
 +                                (double)state->fep_state,
 +                                store_energy,
 +                                pv,
 +                                (expand->elamstats>elamstatsNO),
 +                                (fep->bPrintEnergy),
 +                                (md->epc!=epcNO),
 +                                idhdl,
 +                                fep->n_lambda,
 +                                store_dhdl,
 +                                dE,
 +                                time);
 +    }
 +    if ((md->fp_dhdl || md->dhc) && bDoDHDL && (enerd->n_lambda >0))
 +    {
 +        sfree(dE);
 +    }
 +}
 +
 +
 +void upd_mdebin_step(t_mdebin *md)
 +{
 +    ebin_increase_count(md->ebin,FALSE);
 +}
 +
 +static void npr(FILE *log,int n,char c)
 +{
 +    for(; (n>0); n--) fprintf(log,"%c",c);
 +}
 +
 +static void pprint(FILE *log,const char *s,t_mdebin *md)
 +{
 +    char CHAR='#';
 +    int  slen;
 +    char buf1[22],buf2[22];
 +
 +    slen = strlen(s);
 +    fprintf(log,"\t<======  ");
 +    npr(log,slen,CHAR);
 +    fprintf(log,"  ==>\n");
 +    fprintf(log,"\t<====  %s  ====>\n",s);
 +    fprintf(log,"\t<==  ");
 +    npr(log,slen,CHAR);
 +    fprintf(log,"  ======>\n\n");
 +
 +    fprintf(log,"\tStatistics over %s steps using %s frames\n",
 +            gmx_step_str(md->ebin->nsteps_sim,buf1),
 +            gmx_step_str(md->ebin->nsum_sim,buf2));
 +    fprintf(log,"\n");
 +}
 +
 +void print_ebin_header(FILE *log,gmx_large_int_t steps,double time,real lambda)
 +{
 +    char buf[22];
 +
 +    fprintf(log,"   %12s   %12s   %12s\n"
 +            "   %12s   %12.5f   %12.5f\n\n",
 +            "Step","Time","Lambda",gmx_step_str(steps,buf),time,lambda);
 +}
 +
 +void print_ebin(ener_file_t fp_ene,gmx_bool bEne,gmx_bool bDR,gmx_bool bOR,
 +                FILE *log,
 +                gmx_large_int_t step,double time,
 +                int mode,gmx_bool bCompact,
 +                t_mdebin *md,t_fcdata *fcd,
 +                gmx_groups_t *groups,t_grpopts *opts)
 +{
 +    /*static char **grpnms=NULL;*/
 +    char        buf[246];
 +    int         i,j,n,ni,nj,ndr,nor,b;
 +    int         ndisre=0;
 +    real        *disre_rm3tav, *disre_rt;
 +
 +    /* these are for the old-style blocks (1 subblock, only reals), because
 +       there can be only one per ID for these */
 +    int         nr[enxNR];
 +    int         id[enxNR];
 +    real        *block[enxNR];
 +
 +    /* temporary arrays for the lambda values to write out */
 +    double      enxlambda_data[2];
 +
 +    t_enxframe  fr;
 +
 +    switch (mode)
 +    {
 +        case eprNORMAL:
 +            init_enxframe(&fr);
 +            fr.t            = time;
 +            fr.step         = step;
 +            fr.nsteps       = md->ebin->nsteps;
 +            fr.dt           = md->delta_t;
 +            fr.nsum         = md->ebin->nsum;
 +            fr.nre          = (bEne) ? md->ebin->nener : 0;
 +            fr.ener         = md->ebin->e;
 +            ndisre          = bDR ? fcd->disres.npair : 0;
 +            disre_rm3tav    = fcd->disres.rm3tav;
 +            disre_rt        = fcd->disres.rt;
 +            /* Optional additional old-style (real-only) blocks. */
 +            for(i=0; i<enxNR; i++)
 +            {
 +                nr[i] = 0;
 +            }
 +            if (fcd->orires.nr > 0 && bOR)
 +            {
 +                diagonalize_orires_tensors(&(fcd->orires));
 +                nr[enxOR]     = fcd->orires.nr;
 +                block[enxOR]  = fcd->orires.otav;
 +                id[enxOR]     = enxOR;
 +                nr[enxORI]    = (fcd->orires.oinsl != fcd->orires.otav) ?
 +                          fcd->orires.nr : 0;
 +                block[enxORI] = fcd->orires.oinsl;
 +                id[enxORI]    = enxORI;
 +                nr[enxORT]    = fcd->orires.nex*12;
 +                block[enxORT] = fcd->orires.eig;
 +                id[enxORT]    = enxORT;
 +            }
 +
 +            /* whether we are going to write anything out: */
 +            if (fr.nre || ndisre || nr[enxOR] || nr[enxORI])
 +            {
 +
 +                /* the old-style blocks go first */
 +                fr.nblock = 0;
 +                for(i=0; i<enxNR; i++)
 +                {
 +                    if (nr[i] > 0)
 +                    {
 +                        fr.nblock = i + 1;
 +                    }
 +                }
 +                add_blocks_enxframe(&fr, fr.nblock);
 +                for(b=0;b<fr.nblock;b++)
 +                {
 +                    add_subblocks_enxblock(&(fr.block[b]), 1);
 +                    fr.block[b].id=id[b];
 +                    fr.block[b].sub[0].nr = nr[b];
 +#ifndef GMX_DOUBLE
 +                    fr.block[b].sub[0].type = xdr_datatype_float;
 +                    fr.block[b].sub[0].fval = block[b];
 +#else
 +                    fr.block[b].sub[0].type = xdr_datatype_double;
 +                    fr.block[b].sub[0].dval = block[b];
 +#endif
 +                }
 +
 +                /* check for disre block & fill it. */
 +                if (ndisre>0)
 +                {
 +                    int db = fr.nblock;
 +                    fr.nblock+=1;
 +                    add_blocks_enxframe(&fr, fr.nblock);
 +
 +                    add_subblocks_enxblock(&(fr.block[db]), 2);
 +                    fr.block[db].id=enxDISRE;
 +                    fr.block[db].sub[0].nr=ndisre;
 +                    fr.block[db].sub[1].nr=ndisre;
 +#ifndef GMX_DOUBLE
 +                    fr.block[db].sub[0].type=xdr_datatype_float;
 +                    fr.block[db].sub[1].type=xdr_datatype_float;
 +                    fr.block[db].sub[0].fval=disre_rt;
 +                    fr.block[db].sub[1].fval=disre_rm3tav;
 +#else
 +                    fr.block[db].sub[0].type=xdr_datatype_double;
 +                    fr.block[db].sub[1].type=xdr_datatype_double;
 +                    fr.block[db].sub[0].dval=disre_rt;
 +                    fr.block[db].sub[1].dval=disre_rm3tav;
 +#endif
 +                }
 +                /* here we can put new-style blocks */
 +
 +                /* Free energy perturbation blocks */
 +                if (md->dhc)
 +                {
 +                    mde_delta_h_coll_handle_block(md->dhc, &fr, fr.nblock);
 +                }
 +
 +                /* we can now free & reset the data in the blocks */
 +                if (md->dhc)
 +                {
 +                    mde_delta_h_coll_reset(md->dhc);
 +                }
 +
 +                /* do the actual I/O */
 +                do_enx(fp_ene,&fr);
 +                gmx_fio_check_file_position(enx_file_pointer(fp_ene));
 +                if (fr.nre)
 +                {
 +                    /* We have stored the sums, so reset the sum history */
 +                    reset_ebin_sums(md->ebin);
 +                }
 +            }
 +            free_enxframe(&fr);
 +            break;
 +        case eprAVER:
 +            if (log)
 +            {
 +                pprint(log,"A V E R A G E S",md);
 +            }
 +            break;
 +        case eprRMS:
 +            if (log)
 +            {
 +                pprint(log,"R M S - F L U C T U A T I O N S",md);
 +            }
 +            break;
 +        default:
 +            gmx_fatal(FARGS,"Invalid print mode (%d)",mode);
 +    }
 +
 +    if (log)
 +    {
 +        for(i=0;i<opts->ngtc;i++)
 +        {
 +            if(opts->annealing[i]!=eannNO)
 +            {
 +                fprintf(log,"Current ref_t for group %s: %8.1f\n",
 +                        *(groups->grpname[groups->grps[egcTC].nm_ind[i]]),
 +                        opts->ref_t[i]);
 +            }
 +        }
 +        if (mode==eprNORMAL && fcd->orires.nr>0)
 +        {
 +            print_orires_log(log,&(fcd->orires));
 +        }
 +        fprintf(log,"   Energies (%s)\n",unit_energy);
 +        pr_ebin(log,md->ebin,md->ie,md->f_nre+md->nCrmsd,5,mode,TRUE);
 +        fprintf(log,"\n");
 +
 +        if (!bCompact)
 +        {
 +            if (md->bDynBox)
 +            {
 +                pr_ebin(log,md->ebin,md->ib, md->bTricl ? NTRICLBOXS : NBOXS,5,
 +                        mode,TRUE);
 +                fprintf(log,"\n");
 +            }
 +            if (md->bConstrVir)
 +            {
 +                fprintf(log,"   Constraint Virial (%s)\n",unit_energy);
 +                pr_ebin(log,md->ebin,md->isvir,9,3,mode,FALSE);
 +                fprintf(log,"\n");
 +                fprintf(log,"   Force Virial (%s)\n",unit_energy);
 +                pr_ebin(log,md->ebin,md->ifvir,9,3,mode,FALSE);
 +                fprintf(log,"\n");
 +            }
 +            if (md->bVir)
 +            {
 +                fprintf(log,"   Total Virial (%s)\n",unit_energy);
 +                pr_ebin(log,md->ebin,md->ivir,9,3,mode,FALSE);
 +                fprintf(log,"\n");
 +            }
 +            if (md->bPress)
 +            {
 +                fprintf(log,"   Pressure (%s)\n",unit_pres_bar);
 +                pr_ebin(log,md->ebin,md->ipres,9,3,mode,FALSE);
 +                fprintf(log,"\n");
 +            }
 +            if (md->bMu)
 +            {
 +                fprintf(log,"   Total Dipole (%s)\n",unit_dipole_D);
 +                pr_ebin(log,md->ebin,md->imu,3,3,mode,FALSE);
 +                fprintf(log,"\n");
 +            }
 +
 +            if (md->nE > 1)
 +            {
 +                if (md->print_grpnms==NULL)
 +                {
 +                    snew(md->print_grpnms,md->nE);
 +                    n=0;
 +                    for(i=0; (i<md->nEg); i++)
 +                    {
 +                        ni=groups->grps[egcENER].nm_ind[i];
 +                        for(j=i; (j<md->nEg); j++)
 +                        {
 +                            nj=groups->grps[egcENER].nm_ind[j];
 +                            sprintf(buf,"%s-%s",*(groups->grpname[ni]),
 +                                    *(groups->grpname[nj]));
 +                            md->print_grpnms[n++]=strdup(buf);
 +                        }
 +                    }
 +                }
 +                sprintf(buf,"Epot (%s)",unit_energy);
 +                fprintf(log,"%15s   ",buf);
 +                for(i=0; (i<egNR); i++)
 +                {
 +                    if (md->bEInd[i])
 +                    {
 +                        fprintf(log,"%12s   ",egrp_nm[i]);
 +                    }
 +                }
 +                fprintf(log,"\n");
 +                for(i=0; (i<md->nE); i++)
 +                {
 +                    fprintf(log,"%15s",md->print_grpnms[i]);
 +                    pr_ebin(log,md->ebin,md->igrp[i],md->nEc,md->nEc,mode,
 +                            FALSE);
 +                }
 +                fprintf(log,"\n");
 +            }
 +            if (md->nTC > 1)
 +            {
 +                pr_ebin(log,md->ebin,md->itemp,md->nTC,4,mode,TRUE);
 +                fprintf(log,"\n");
 +            }
 +            if (md->nU > 1)
 +            {
 +                fprintf(log,"%15s   %12s   %12s   %12s\n",
 +                        "Group","Ux","Uy","Uz");
 +                for(i=0; (i<md->nU); i++)
 +                {
 +                    ni=groups->grps[egcACC].nm_ind[i];
 +                    fprintf(log,"%15s",*groups->grpname[ni]);
 +                    pr_ebin(log,md->ebin,md->iu+3*i,3,3,mode,FALSE);
 +                }
 +                fprintf(log,"\n");
 +            }
 +        }
 +    }
 +
 +}
 +
 +void update_energyhistory(energyhistory_t * enerhist,t_mdebin * mdebin)
 +{
 +    int i;
 +
 +    enerhist->nsteps     = mdebin->ebin->nsteps;
 +    enerhist->nsum       = mdebin->ebin->nsum;
 +    enerhist->nsteps_sim = mdebin->ebin->nsteps_sim;
 +    enerhist->nsum_sim   = mdebin->ebin->nsum_sim;
 +    enerhist->nener      = mdebin->ebin->nener;
 +
 +    if (mdebin->ebin->nsum > 0)
 +    {
 +        /* Check if we need to allocate first */
 +        if(enerhist->ener_ave == NULL)
 +        {
 +            snew(enerhist->ener_ave,enerhist->nener);
 +            snew(enerhist->ener_sum,enerhist->nener);
 +        }
 +
 +        for(i=0;i<enerhist->nener;i++)
 +        {
 +            enerhist->ener_ave[i] = mdebin->ebin->e[i].eav;
 +            enerhist->ener_sum[i] = mdebin->ebin->e[i].esum;
 +        }
 +    }
 +
 +    if (mdebin->ebin->nsum_sim > 0)
 +    {
 +        /* Check if we need to allocate first */
 +        if(enerhist->ener_sum_sim == NULL)
 +        {
 +            snew(enerhist->ener_sum_sim,enerhist->nener);
 +        }
 +
 +        for(i=0;i<enerhist->nener;i++)
 +        {
 +            enerhist->ener_sum_sim[i] = mdebin->ebin->e_sim[i].esum;
 +        }
 +    }
 +    if (mdebin->dhc)
 +    {
 +        mde_delta_h_coll_update_energyhistory(mdebin->dhc, enerhist);
 +    }
 +}
 +
 +void restore_energyhistory_from_state(t_mdebin * mdebin,
 +                                      energyhistory_t * enerhist)
 +{
 +    int i;
 +
 +    if ((enerhist->nsum > 0 || enerhist->nsum_sim > 0) &&
 +        mdebin->ebin->nener != enerhist->nener)
 +    {
 +        gmx_fatal(FARGS,"Mismatch between number of energies in run input (%d) and checkpoint file (%d).",
 +                  mdebin->ebin->nener,enerhist->nener);
 +    }
 +
 +    mdebin->ebin->nsteps     = enerhist->nsteps;
 +    mdebin->ebin->nsum       = enerhist->nsum;
 +    mdebin->ebin->nsteps_sim = enerhist->nsteps_sim;
 +    mdebin->ebin->nsum_sim   = enerhist->nsum_sim;
 +
 +    for(i=0; i<mdebin->ebin->nener; i++)
 +    {
 +        mdebin->ebin->e[i].eav  =
 +                  (enerhist->nsum > 0 ? enerhist->ener_ave[i] : 0);
 +        mdebin->ebin->e[i].esum =
 +                  (enerhist->nsum > 0 ? enerhist->ener_sum[i] : 0);
 +        mdebin->ebin->e_sim[i].esum =
 +                  (enerhist->nsum_sim > 0 ? enerhist->ener_sum_sim[i] : 0);
 +    }
 +    if (mdebin->dhc)
 +    {
 +        mde_delta_h_coll_restore_energyhistory(mdebin->dhc, enerhist);
 +    }
 +}
index 22c315bdb37719ed5714677cd6f9beebedf0bb28,0000000000000000000000000000000000000000..f7375ce63cbce73e565b4c5e025b2fc5b2e9f27c
mode 100644,000000..100644
--- /dev/null
@@@ -1,1285 -1,0 +1,1302 @@@
-     for(i=i0; i<i1; i++)
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + */
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +#include <string.h>
 +#include "smalloc.h"
 +#include "macros.h"
 +#include "vec.h"
 +#include "nbnxn_consts.h"
 +#include "nbnxn_internal.h"
 +#include "nbnxn_search.h"
 +#include "nbnxn_atomdata.h"
 +#include "gmx_omp_nthreads.h"
 +
 +/* Default nbnxn allocation routine, allocates 32 byte aligned,
 + * which works for plain C and aligned SSE and AVX loads/stores.
 + *
 + * The pointer returned by save_malloc_aligned is stored in *ptr.
 + * The call passes nbytes, 1 and 32 as its last three arguments
 + * (presumably element count, element size and alignment - confirm
 + * against the declaration of save_malloc_aligned).
 + */
 +void nbnxn_alloc_aligned(void **ptr,size_t nbytes)
 +{
 +    *ptr = save_malloc_aligned("ptr",__FILE__,__LINE__,nbytes,1,32);
 +}
 +
 +/* Free function for memory allocated with nbnxn_alloc_aligned:
 + * simply hands ptr to sfree_aligned.
 + */
 +void nbnxn_free_aligned(void *ptr)
 +{
 +    sfree_aligned(ptr);
 +}
 +
 +/* Reallocation wrapper function for nbnxn data structures */
 +void nbnxn_realloc_void(void **ptr,
 +                        int nbytes_copy,int nbytes_new,
 +                        nbnxn_alloc_t *ma,
 +                        nbnxn_free_t  *mf)
 +{
 +    void *ptr_new;
 +
 +    ma(&ptr_new,nbytes_new);
 +
 +    if (nbytes_new > 0 && ptr_new == NULL)
 +    {
 +        gmx_fatal(FARGS, "Allocation of %d bytes failed", nbytes_new);
 +    }
 +
 +    if (nbytes_copy > 0)
 +    {
 +        if (nbytes_new < nbytes_copy)
 +        {
 +            gmx_incons("In nbnxn_realloc_void: new size less than copy size");
 +        }
 +        memcpy(ptr_new,*ptr,nbytes_copy);
 +    }
 +    if (*ptr != NULL)
 +    {
 +        mf(*ptr);
 +    }
 +    *ptr = ptr_new;
 +}
 +
 +/* Reallocate the nbnxn_atomdata_t for a size of n atoms.
 + *
 + * Every array is replaced through nbnxn_realloc_void with nbat->alloc
 + * and nbat->free; the number of bytes to carry over is computed from
 + * the current nbat->natoms. The charge array q is only handled when
 + * XFormat is not nbatXYZQ and energrp only when nenergrp > 1, where
 + * energrp holds one entry per na_c atoms (integer division).
 + * On return nbat->nalloc is set to n.
 + */
 +void nbnxn_atomdata_realloc(nbnxn_atomdata_t *nbat,int n)
 +{
 +    int t;
 +
 +    nbnxn_realloc_void((void **)&nbat->type,
 +                       nbat->natoms*sizeof(*nbat->type),
 +                       n*sizeof(*nbat->type),
 +                       nbat->alloc,nbat->free);
 +    nbnxn_realloc_void((void **)&nbat->lj_comb,
 +                       nbat->natoms*2*sizeof(*nbat->lj_comb),
 +                       n*2*sizeof(*nbat->lj_comb),
 +                       nbat->alloc,nbat->free);
 +    if (nbat->XFormat != nbatXYZQ)
 +    {
 +        nbnxn_realloc_void((void **)&nbat->q,
 +                           nbat->natoms*sizeof(*nbat->q),
 +                           n*sizeof(*nbat->q),
 +                           nbat->alloc,nbat->free);
 +    }
 +    if (nbat->nenergrp > 1)
 +    {
 +        nbnxn_realloc_void((void **)&nbat->energrp,
 +                           nbat->natoms/nbat->na_c*sizeof(*nbat->energrp),
 +                           n/nbat->na_c*sizeof(*nbat->energrp),
 +                           nbat->alloc,nbat->free);
 +    }
 +    nbnxn_realloc_void((void **)&nbat->x,
 +                       nbat->natoms*nbat->xstride*sizeof(*nbat->x),
 +                       n*nbat->xstride*sizeof(*nbat->x),
 +                       nbat->alloc,nbat->free);
 +    for(t=0; t<nbat->nout; t++)
 +    {
 +        /* One force buffer per output, n*fstride elements each.
 +         * NOTE(review): this comment used to announce one extra element
 +         * for possible signaling with CUDA, but the size passed below
 +         * contains no extra element - confirm whether it is still needed.
 +         */
 +        nbnxn_realloc_void((void **)&nbat->out[t].f,
 +                           nbat->natoms*nbat->fstride*sizeof(*nbat->out[t].f),
 +                           n*nbat->fstride*sizeof(*nbat->out[t].f),
 +                           nbat->alloc,nbat->free);
 +    }
 +    nbat->nalloc = n;
 +}
 +
 +/* Initializes an nbnxn_atomdata_output_t data structure */
 +static void nbnxn_atomdata_output_init(nbnxn_atomdata_output_t *out,
 +                                       int nb_kernel_type,
 +                                       int nenergrp,int stride,
 +                                       nbnxn_alloc_t *ma)
 +{
 +    int cj_size;
 +
 +    out->f = NULL;
 +    ma((void **)&out->fshift,SHIFTS*DIM*sizeof(*out->fshift));
 +    out->nV = nenergrp*nenergrp;
 +    ma((void **)&out->Vvdw,out->nV*sizeof(*out->Vvdw));
 +    ma((void **)&out->Vc  ,out->nV*sizeof(*out->Vc  ));
 +
 +    if (nb_kernel_type == nbk4xN_X86_SIMD128 ||
 +        nb_kernel_type == nbk4xN_X86_SIMD256)
 +    {
 +        cj_size = nbnxn_kernel_to_cj_size(nb_kernel_type);
 +        out->nVS = nenergrp*nenergrp*stride*(cj_size>>1)*cj_size;
 +        ma((void **)&out->VSvdw,out->nVS*sizeof(*out->VSvdw));
 +        ma((void **)&out->VSc  ,out->nVS*sizeof(*out->VSc  ));
 +    }
 +    else
 +    {
 +        out->nVS = 0;
 +    }
 +}
 +
 +static void copy_int_to_nbat_int(const int *a,int na,int na_round,
 +                                 const int *in,int fill,int *innb)
 +{
 +    int i,j;
 +
 +    j = 0;
 +    for(i=0; i<na; i++)
 +    {
 +        innb[j++] = in[a[i]];
 +    }
 +    /* Complete the partially filled last cell with fill */
 +    for(; i<na_round; i++)
 +    {
 +        innb[j++] = fill;
 +    }
 +}
 +
 +static void clear_nbat_real(int na,int nbatFormat,real *xnb,int a0)
 +{
 +    int a,d,j,c;
 +
 +    switch (nbatFormat)
 +    {
 +    case nbatXYZ:
 +        for(a=0; a<na; a++)
 +        {
 +            for(d=0; d<DIM; d++)
 +            {
 +                xnb[(a0+a)*STRIDE_XYZ+d] = 0;
 +            }
 +        }
 +        break;
 +    case nbatXYZQ:
 +        for(a=0; a<na; a++)
 +        {
 +            for(d=0; d<DIM; d++)
 +            {
 +                xnb[(a0+a)*STRIDE_XYZQ+d] = 0;
 +            }
 +        }
 +        break;
 +    case nbatX4:
 +        j = X4_IND_A(a0);
 +        c = a0 & (PACK_X4-1);
 +        for(a=0; a<na; a++)
 +        {
 +            xnb[j+XX*PACK_X4] = 0;
 +            xnb[j+YY*PACK_X4] = 0;
 +            xnb[j+ZZ*PACK_X4] = 0;
 +            j++;
 +            c++;
 +            if (c == PACK_X4)
 +            {
 +                j += (DIM-1)*PACK_X4;
 +                c  = 0;
 +            }
 +        }
 +        break;
 +    case nbatX8:
 +        j = X8_IND_A(a0);
 +        c = a0 & (PACK_X8-1);
 +        for(a=0; a<na; a++)
 +        {
 +            xnb[j+XX*PACK_X8] = 0;
 +            xnb[j+YY*PACK_X8] = 0;
 +            xnb[j+ZZ*PACK_X8] = 0;
 +            j++;
 +            c++;
 +            if (c == PACK_X8)
 +            {
 +                j += (DIM-1)*PACK_X8;
 +                c  = 0;
 +            }
 +        }
 +        break;
 +    }
 +}
 +
 +void copy_rvec_to_nbat_real(const int *a,int na,int na_round,
 +                            rvec *x,int nbatFormat,real *xnb,int a0,
 +                            int cx,int cy,int cz)
 +{
 +    int i,j,c;
 +
 +/* Copies the coordinates x[a[i]] for i < na into xnb, starting at atom
 + * index a0, in the layout selected by nbatFormat, and then fills the
 + * entries up to na_round with filler positions.
 + * An unknown nbatFormat is reported with gmx_incons.
 + *
 + * We might need to place filler particles to fill up the cell to na_round.
 + * The coefficients (LJ and q) for such particles are zero.
 + * But we might still get NaN as 0*NaN when distances are too small.
 + * We hope that -107 nm is far enough away from zero
 + * to avoid accidental short distances to particles shifted down for pbc.
 + */
 +#define NBAT_FAR_AWAY 107
 +
 +    switch (nbatFormat)
 +    {
 +    case nbatXYZ:
 +        j = a0*STRIDE_XYZ;
 +        for(i=0; i<na; i++)
 +        {
 +            xnb[j++] = x[a[i]][XX];
 +            xnb[j++] = x[a[i]][YY];
 +            xnb[j++] = x[a[i]][ZZ];
 +        }
 +        /* Complete the partially filled last cell with particles far apart.
 +         * The filler coordinates are built from -NBAT_FAR_AWAY, the
 +         * arguments cx, cy, cz and the atom counter i, so that the
 +         * fillers of one call differ from each other in z.
 +         */
 +        for(; i<na_round; i++)
 +        {
 +            xnb[j++] = -NBAT_FAR_AWAY*(1 + cx);
 +            xnb[j++] = -NBAT_FAR_AWAY*(1 + cy);
 +            xnb[j++] = -NBAT_FAR_AWAY*(1 + cz + i);
 +        }
 +        break;
 +    case nbatXYZQ:
 +        j = a0*STRIDE_XYZQ;
 +        for(i=0; i<na; i++)
 +        {
 +            xnb[j++] = x[a[i]][XX];
 +            xnb[j++] = x[a[i]][YY];
 +            xnb[j++] = x[a[i]][ZZ];
 +            j++; /* the entry after z is skipped, not written here */
 +        }
 +        /* Complete the partially filled last cell with particles far apart */
 +        for(; i<na_round; i++)
 +        {
 +            xnb[j++] = -NBAT_FAR_AWAY*(1 + cx);
 +            xnb[j++] = -NBAT_FAR_AWAY*(1 + cy);
 +            xnb[j++] = -NBAT_FAR_AWAY*(1 + cz + i);
 +            j++; /* the entry after z is skipped, not written here */
 +        }
 +        break;
 +    case nbatX4:
 +        j = X4_IND_A(a0);
 +        c = a0 & (PACK_X4-1);
 +        for(i=0; i<na; i++)
 +        {
 +            xnb[j+XX*PACK_X4] = x[a[i]][XX];
 +            xnb[j+YY*PACK_X4] = x[a[i]][YY];
 +            xnb[j+ZZ*PACK_X4] = x[a[i]][ZZ];
 +            j++;
 +            c++;
 +            if (c == PACK_X4)
 +            {
 +                j += (DIM-1)*PACK_X4; /* pack full: jump over its y and z rows */
 +                c  = 0;
 +            }
 +        }
 +        /* Complete the partially filled last cell with particles far apart */
 +        for(; i<na_round; i++)
 +        {
 +            xnb[j+XX*PACK_X4] = -NBAT_FAR_AWAY*(1 + cx);
 +            xnb[j+YY*PACK_X4] = -NBAT_FAR_AWAY*(1 + cy);
 +            xnb[j+ZZ*PACK_X4] = -NBAT_FAR_AWAY*(1 + cz + i);
 +            j++;
 +            c++;
 +            if (c == PACK_X4)
 +            {
 +                j += (DIM-1)*PACK_X4;
 +                c  = 0;
 +            }
 +        }
 +        break;
 +    case nbatX8:
 +        j = X8_IND_A(a0);
 +        c = a0 & (PACK_X8 - 1);
 +        for(i=0; i<na; i++)
 +        {
 +            xnb[j+XX*PACK_X8] = x[a[i]][XX];
 +            xnb[j+YY*PACK_X8] = x[a[i]][YY];
 +            xnb[j+ZZ*PACK_X8] = x[a[i]][ZZ];
 +            j++;
 +            c++;
 +            if (c == PACK_X8)
 +            {
 +                j += (DIM-1)*PACK_X8; /* pack full: jump over its y and z rows */
 +                c  = 0;
 +            }
 +        }
 +        /* Complete the partially filled last cell with particles far apart */
 +        for(; i<na_round; i++)
 +        {
 +            xnb[j+XX*PACK_X8] = -NBAT_FAR_AWAY*(1 + cx);
 +            xnb[j+YY*PACK_X8] = -NBAT_FAR_AWAY*(1 + cy);
 +            xnb[j+ZZ*PACK_X8] = -NBAT_FAR_AWAY*(1 + cz + i);
 +            j++;
 +            c++;
 +            if (c == PACK_X8)
 +            {
 +                j += (DIM-1)*PACK_X8;
 +                c  = 0;
 +            }
 +        }
 +        break;
 +    default:
 +        gmx_incons("Unsupported stride");
 +    }
 +}
 +
 +/* Determines the combination rule (or none) to be used, stores it,
 + * and sets the LJ parameters required with the rule.
 + */
 +static void set_combination_rule_data(nbnxn_atomdata_t *nbat)
 +{
 +    int  nt,i,j;
 +    real c6,c12;
 +
 +    nt = nbat->ntype;
 +
 +    switch (nbat->comb_rule)
 +    {
 +    case  ljcrGEOM:
 +        nbat->comb_rule = ljcrGEOM;
 +
 +        for(i=0; i<nt; i++)
 +        {
 +            /* Copy the diagonal from the nbfp matrix */
 +            nbat->nbfp_comb[i*2  ] = sqrt(nbat->nbfp[(i*nt+i)*2  ]);
 +            nbat->nbfp_comb[i*2+1] = sqrt(nbat->nbfp[(i*nt+i)*2+1]);
 +        }
 +        break;
 +    case ljcrLB:
 +        for(i=0; i<nt; i++)
 +        {
 +            /* Get 6*C6 and 12*C12 from the diagonal of the nbfp matrix */
 +            c6  = nbat->nbfp[(i*nt+i)*2  ];
 +            c12 = nbat->nbfp[(i*nt+i)*2+1];
 +            if (c6 > 0 && c12 > 0)
 +            {
 +                /* We store 0.5*2^1/6*sigma and sqrt(4*3*eps),
 +                 * so we get 6*C6 and 12*C12 after combining.
 +                 */
 +                nbat->nbfp_comb[i*2  ] = 0.5*pow(c12/c6,1.0/6.0);
 +                nbat->nbfp_comb[i*2+1] = sqrt(c6*c6/c12);
 +            }
 +            else
 +            {
 +                nbat->nbfp_comb[i*2  ] = 0;
 +                nbat->nbfp_comb[i*2+1] = 0;
 +            }
 +        }
 +        break;
 +    case ljcrNONE:
 +        /* In nbfp_s4 we use a stride of 4 for storing two parameters */
 +        nbat->alloc((void **)&nbat->nbfp_s4,nt*nt*4*sizeof(*nbat->nbfp_s4));
 +        for(i=0; i<nt; i++)
 +        {
 +            for(j=0; j<nt; j++)
 +            {
 +                nbat->nbfp_s4[(i*nt+j)*4+0] = nbat->nbfp[(i*nt+j)*2+0];
 +                nbat->nbfp_s4[(i*nt+j)*4+1] = nbat->nbfp[(i*nt+j)*2+1];
 +                nbat->nbfp_s4[(i*nt+j)*4+2] = 0;
 +                nbat->nbfp_s4[(i*nt+j)*4+3] = 0;
 +            }
 +        }
 +        break;
 +    default:
 +        gmx_incons("Unknown combination rule");
 +        break;
 +    }
 +}
 +
 +/* Initializes an nbnxn_atomdata_t data structure */
 +void nbnxn_atomdata_init(FILE *fp,
 +                         nbnxn_atomdata_t *nbat,
 +                         int nb_kernel_type,
 +                         int ntype,const real *nbfp,
 +                         int n_energygroups,
 +                         int nout,
 +                         nbnxn_alloc_t *alloc,
 +                         nbnxn_free_t  *free)
 +{
 +    int  i,j;
 +    real c6,c12,tol;
 +    char *ptr;
 +    gmx_bool simple,bCombGeom,bCombLB;
 +
 +    if (alloc == NULL)
 +    {
 +        nbat->alloc = nbnxn_alloc_aligned;
 +    }
 +    else
 +    {
 +        nbat->alloc = alloc;
 +    }
 +    if (free == NULL)
 +    {
 +        nbat->free = nbnxn_free_aligned;
 +    }
 +    else
 +    {
 +        nbat->free = free;
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"There are %d atom types in the system, adding one for nbnxn_atomdata_t\n",ntype);
 +    }
 +    nbat->ntype = ntype + 1;
 +    nbat->alloc((void **)&nbat->nbfp,
 +                nbat->ntype*nbat->ntype*2*sizeof(*nbat->nbfp));
 +    nbat->alloc((void **)&nbat->nbfp_comb,nbat->ntype*2*sizeof(*nbat->nbfp_comb));
 +
 +    /* A tolerance of 1e-5 seems reasonable for (possibly hand-typed)
 +     * force-field floating point parameters.
 +     */
 +    tol = 1e-5;
 +    ptr = getenv("GMX_LJCOMB_TOL");
 +    if (ptr != NULL)
 +    {
 +        double dbl;
 +
 +        sscanf(ptr,"%lf",&dbl);
 +        tol = dbl;
 +    }
 +    bCombGeom = TRUE;
 +    bCombLB   = TRUE;
 +
 +    /* Temporarily fill nbat->nbfp_comb with sigma and epsilon
 +     * to check for the LB rule.
 +     */
 +    for(i=0; i<ntype; i++)
 +    {
 +        c6  = nbfp[(i*ntype+i)*2  ]/6.0;
 +        c12 = nbfp[(i*ntype+i)*2+1]/12.0;
 +        if (c6 > 0 && c12 > 0)
 +        {
 +            nbat->nbfp_comb[i*2  ] = pow(c12/c6,1.0/6.0);
 +            nbat->nbfp_comb[i*2+1] = 0.25*c6*c6/c12;
 +        }
 +        else if (c6 == 0 && c12 == 0)
 +        {
 +            nbat->nbfp_comb[i*2  ] = 0;
 +            nbat->nbfp_comb[i*2+1] = 0;
 +        }
 +        else
 +        {
 +            /* Can not use LB rule with only dispersion or repulsion */
 +            bCombLB = FALSE;
 +        }
 +    }
 +
 +    for(i=0; i<nbat->ntype; i++)
 +    {
 +        for(j=0; j<nbat->ntype; j++)
 +        {
 +            if (i < ntype && j < ntype)
 +            {
 +                /* fr->nbfp has been updated, so that array too now stores c6/c12 including
 +                 * the 6.0/12.0 prefactors to save 2 flops in the most common case (force-only).
 +                 */
 +                c6  = nbfp[(i*ntype+j)*2  ];
 +                c12 = nbfp[(i*ntype+j)*2+1];
 +                nbat->nbfp[(i*nbat->ntype+j)*2  ] = c6;
 +                nbat->nbfp[(i*nbat->ntype+j)*2+1] = c12;
 +
 +                /* Compare 6*C6 and 12*C12 for geometric cobination rule */
 +                bCombGeom = bCombGeom &&
 +                    gmx_within_tol(c6*c6  ,nbfp[(i*ntype+i)*2  ]*nbfp[(j*ntype+j)*2  ],tol) &&
 +                    gmx_within_tol(c12*c12,nbfp[(i*ntype+i)*2+1]*nbfp[(j*ntype+j)*2+1],tol);
 +
 +                /* Compare C6 and C12 for Lorentz-Berthelot combination rule */
 +                c6  /= 6.0;
 +                c12 /= 12.0;
 +                bCombLB = bCombLB &&
 +                    ((c6 == 0 && c12 == 0 &&
 +                      (nbat->nbfp_comb[i*2+1] == 0 || nbat->nbfp_comb[j*2+1] == 0)) ||
 +                     (c6 > 0 && c12 > 0 &&
 +                      gmx_within_tol(pow(c12/c6,1.0/6.0),0.5*(nbat->nbfp_comb[i*2]+nbat->nbfp_comb[j*2]),tol) &&
 +                      gmx_within_tol(0.25*c6*c6/c12,sqrt(nbat->nbfp_comb[i*2+1]*nbat->nbfp_comb[j*2+1]),tol)));
 +            }
 +            else
 +            {
 +                /* Add zero parameters for the additional dummy atom type */
 +                nbat->nbfp[(i*nbat->ntype+j)*2  ] = 0;
 +                nbat->nbfp[(i*nbat->ntype+j)*2+1] = 0;
 +            }
 +        }
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug,"Combination rules: geometric %d Lorentz-Berthelot %d\n",
 +                bCombGeom,bCombLB);
 +    }
 +
 +    simple = nbnxn_kernel_pairlist_simple(nb_kernel_type);
 +
 +    if (simple)
 +    {
 +        /* We prefer the geometic combination rule,
 +         * as that gives a slightly faster kernel than the LB rule.
 +         */
 +        if (bCombGeom)
 +        {
 +            nbat->comb_rule = ljcrGEOM;
 +        }
 +        else if (bCombLB)
 +        {
 +            nbat->comb_rule = ljcrLB;
 +        }
 +        else
 +        {
 +            nbat->comb_rule = ljcrNONE;
 +
 +            nbat->free(nbat->nbfp_comb);
 +        }
 +
 +        if (fp)
 +        {
 +            if (nbat->comb_rule == ljcrNONE)
 +            {
 +                fprintf(fp,"Using full Lennard-Jones parameter combination matrix\n\n");
 +            }
 +            else
 +            {
 +                fprintf(fp,"Using %s Lennard-Jones combination rule\n\n",
 +                        nbat->comb_rule==ljcrGEOM ? "geometric" : "Lorentz-Berthelot");
 +            }
 +        }
 +
 +        set_combination_rule_data(nbat);
 +    }
 +    else
 +    {
 +        nbat->comb_rule = ljcrNONE;
 +
 +        nbat->free(nbat->nbfp_comb);
 +    }
 +
 +    nbat->natoms  = 0;
 +    nbat->type    = NULL;
 +    nbat->lj_comb = NULL;
 +    if (simple)
 +    {
 +        switch (nb_kernel_type)
 +        {
 +        case nbk4xN_X86_SIMD128:
 +            nbat->XFormat = nbatX4;
 +            break;
 +        case nbk4xN_X86_SIMD256:
 +#ifndef GMX_DOUBLE
 +            nbat->XFormat = nbatX8;
 +#else
 +            nbat->XFormat = nbatX4;
 +#endif
 +            break;
 +        default:
 +            nbat->XFormat = nbatXYZ;
 +            break;
 +        }
 +
 +        nbat->FFormat = nbat->XFormat;
 +    }
 +    else
 +    {
 +        nbat->XFormat = nbatXYZQ;
 +        nbat->FFormat = nbatXYZ;
 +    }
 +    nbat->q       = NULL;
 +    nbat->nenergrp = n_energygroups;
 +    if (!simple)
 +    {
 +        /* Energy groups not supported yet for super-sub lists */
 +        if (n_energygroups > 1 && fp != NULL)
 +        {
 +            fprintf(fp,"\nNOTE: With GPUs, reporting energy group contributions is not supported\n\n");
 +        }
 +        nbat->nenergrp = 1;
 +    }
 +    /* Temporary storage goes as #grp^3*simd_width^2/2, so limit to 64 */
 +    if (nbat->nenergrp > 64)
 +    {
 +        gmx_fatal(FARGS,"With NxN kernels not more than 64 energy groups are supported\n");
 +    }
 +    nbat->neg_2log = 1;
 +    while (nbat->nenergrp > (1<<nbat->neg_2log))
 +    {
 +        nbat->neg_2log++;
 +    }
 +    nbat->energrp = NULL;
 +    nbat->alloc((void **)&nbat->shift_vec,SHIFTS*sizeof(*nbat->shift_vec));
 +    nbat->xstride = (nbat->XFormat == nbatXYZQ ? STRIDE_XYZQ : DIM);
 +    nbat->fstride = (nbat->FFormat == nbatXYZQ ? STRIDE_XYZQ : DIM);
 +    nbat->x       = NULL;
 +    nbat->nout    = nout;
 +    snew(nbat->out,nbat->nout);
 +    nbat->nalloc  = 0;
 +    for(i=0; i<nbat->nout; i++)
 +    {
 +        nbnxn_atomdata_output_init(&nbat->out[i],
 +                                   nb_kernel_type,
 +                                   nbat->nenergrp,1<<nbat->neg_2log,
 +                                   nbat->alloc);
 +    }
++    nbat->buffer_flags.flag        = NULL;
++    nbat->buffer_flags.flag_nalloc = 0;
 +}
 +
 +static void copy_lj_to_nbat_lj_comb_x4(const real *ljparam_type,
 +                                       const int *type,int na,
 +                                       real *ljparam_at)
 +{
 +    int is,k,i;
 +
 +    /* The LJ params follow the combination rule:
 +     * copy the params for the type array to the atom array.
 +     */
 +    for(is=0; is<na; is+=PACK_X4)
 +    {
 +        for(k=0; k<PACK_X4; k++)
 +        {
 +            i = is + k;
 +            ljparam_at[is*2        +k] = ljparam_type[type[i]*2  ];
 +            ljparam_at[is*2+PACK_X4+k] = ljparam_type[type[i]*2+1];
 +        }
 +    }
 +}
 +
 +static void copy_lj_to_nbat_lj_comb_x8(const real *ljparam_type,
 +                                       const int *type,int na,
 +                                       real *ljparam_at)
 +{
 +    int is,k,i;
 +
 +    /* The LJ params follow the combination rule:
 +     * copy the params for the type array to the atom array.
 +     */
 +    for(is=0; is<na; is+=PACK_X8)
 +    {
 +        for(k=0; k<PACK_X8; k++)
 +        {
 +            i = is + k;
 +            ljparam_at[is*2        +k] = ljparam_type[type[i]*2  ];
 +            ljparam_at[is*2+PACK_X8+k] = ljparam_type[type[i]*2+1];
 +        }
 +    }
 +}
 +
 +/* Sets the atom type and LJ data in nbnxn_atomdata_t */
 +static void nbnxn_atomdata_set_atomtypes(nbnxn_atomdata_t *nbat,
 +                                         int ngrid,
 +                                         const nbnxn_search_t nbs,
 +                                         const int *type)
 +{
 +    int g,i,ncz,ash;
 +    const nbnxn_grid_t *grid;
 +
 +    for(g=0; g<ngrid; g++)
 +    {
 +        grid = &nbs->grid[g];
 +
 +        /* Loop over all columns and copy and fill */
 +        for(i=0; i<grid->ncx*grid->ncy; i++)
 +        {
 +            ncz = grid->cxy_ind[i+1] - grid->cxy_ind[i];
 +            ash = (grid->cell0 + grid->cxy_ind[i])*grid->na_sc;
 +
 +            copy_int_to_nbat_int(nbs->a+ash,grid->cxy_na[i],ncz*grid->na_sc,
 +                                 type,nbat->ntype-1,nbat->type+ash);
 +
 +            if (nbat->comb_rule != ljcrNONE)
 +            {
 +                if (nbat->XFormat == nbatX4)
 +                {
 +                    copy_lj_to_nbat_lj_comb_x4(nbat->nbfp_comb,
 +                                               nbat->type+ash,ncz*grid->na_sc,
 +                                               nbat->lj_comb+ash*2);
 +                }
 +                else if (nbat->XFormat == nbatX8)
 +                {
 +                    copy_lj_to_nbat_lj_comb_x8(nbat->nbfp_comb,
 +                                               nbat->type+ash,ncz*grid->na_sc,
 +                                               nbat->lj_comb+ash*2);
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +/* Sets the charges in nbnxn_atomdata_t *nbat.
 + *
 + * For each of the first ngrid grids in nbs, every x,y column is visited.
 + * The charges of the na atoms present in the column are looked up through
 + * the grid atom index nbs->a and copied, in grid order, starting at the
 + * atom offset ash of the column. The remaining entries of the column,
 + * up to na_round, are set to zero.
 + *
 + * With XFormat nbatXYZQ the charges are written into nbat->x, one per
 + * STRIDE_XYZQ reals at offset ZZ+1 of each atom entry. With any other
 + * XFormat they are written to the separate array nbat->q.
 + */
 +static void nbnxn_atomdata_set_charges(nbnxn_atomdata_t *nbat,
 +                                       int ngrid,
 +                                       const nbnxn_search_t nbs,
 +                                       const real *charge)
 +{
 +    int  g,cxy,ncz,ash,na,na_round,i,j; /* ncz and j are not used below */
 +    real *q;
 +    const nbnxn_grid_t *grid;
 +
 +    for(g=0; g<ngrid; g++)
 +    {
 +        grid = &nbs->grid[g];
 +
 +        /* Loop over all columns and copy and fill */
 +        for(cxy=0; cxy<grid->ncx*grid->ncy; cxy++)
 +        {
 +            ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc; /* atom offset of this column */
 +            na  = grid->cxy_na[cxy]; /* atoms present in this column */
 +            na_round = (grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy])*grid->na_sc; /* atom slots in this column */
 +
 +            if (nbat->XFormat == nbatXYZQ)
 +            {
 +                q = nbat->x + ash*STRIDE_XYZQ + ZZ + 1; /* entry after x,y,z of the first atom */
 +                for(i=0; i<na; i++)
 +                {
 +                    *q = charge[nbs->a[ash+i]];
 +                    q += STRIDE_XYZQ;
 +                }
 +                /* Complete the partially filled last cell with zeros */
 +                for(; i<na_round; i++)
 +                {
 +                    *q = 0;
 +                    q += STRIDE_XYZQ;
 +                }
 +            }
 +            else
 +            {
 +                q = nbat->q + ash;
 +                for(i=0; i<na; i++)
 +                {
 +                    *q = charge[nbs->a[ash+i]];
 +                    q++;
 +                }
 +                /* Complete the partially filled last cell with zeros */
 +                for(; i<na_round; i++)
 +                {
 +                    *q = 0;
 +                    q++;
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +/* Copies the energy group indices to a reordered and packed array.
 + *
 + * The atom index array a is processed in groups of na_c entries. For each
 + * group one int is written to innb: the energy group id of entry sa,
 + * GET_CGINFO_GID(in[a[i+sa]]), is shifted left by sa*bit_shift bits and
 + * OR-ed into that int. Entries of a that are negative contribute no bits.
 + * After the groups that start below na, further groups up to na_round
 + * get the value 0.
 + *
 + * NOTE(review): the inner loop reads a[i+sa] for all sa < na_c, also when
 + * i+sa >= na; presumably a is padded with negative indices up to na_round.
 + * TODO confirm against the code that fills nbs->a.
 + */
 +static void copy_egp_to_nbat_egps(const int *a,int na,int na_round,
 +                                  int na_c,int bit_shift,
 +                                  const int *in,int *innb)
 +{
 +    int i,j,sa,at;
 +    int comb;
 +
 +    j = 0; /* index of the next packed int in innb */
 +    for(i=0; i<na; i+=na_c)
 +    {
 +        /* Store na_c energy group numbers into one int */
 +        comb = 0;
 +        for(sa=0; sa<na_c; sa++)
 +        {
 +            at = a[i+sa];
 +            if (at >= 0)
 +            {
 +                comb |= (GET_CGINFO_GID(in[at]) << (sa*bit_shift));
 +            }
 +        }
 +        innb[j++] = comb;
 +    }
 +    /* Complete the partially filled last cell with fill */
 +    for(; i<na_round; i+=na_c)
 +    {
 +        innb[j++] = 0;
 +    }
 +}
 +
 +/* Set the energy group indices for atoms in nbnxn_atomdata_t.
 + *
 + * For each of the first ngrid grids in nbs and each x,y column of that
 + * grid, the energy group ids taken from atinfo are packed by
 + * copy_egp_to_nbat_egps, using nbat->na_c ids per int and nbat->neg_2log
 + * bits per id. The packed ints are written to nbat->energrp starting at
 + * index ash>>grid->na_c_2log, where ash is the atom offset of the column.
 + */
 +static void nbnxn_atomdata_set_energygroups(nbnxn_atomdata_t *nbat,
 +                                            int ngrid,
 +                                            const nbnxn_search_t nbs,
 +                                            const int *atinfo)
 +{
 +    int g,i,ncz,ash;
 +    const nbnxn_grid_t *grid;
 +
 +    for(g=0; g<ngrid; g++)
 +    {
 +        grid = &nbs->grid[g];
 +
 +        /* Loop over all columns and copy and fill */
 +        for(i=0; i<grid->ncx*grid->ncy; i++)
 +        {
 +            ncz = grid->cxy_ind[i+1] - grid->cxy_ind[i]; /* (super-)cells in this column */
 +            ash = (grid->cell0 + grid->cxy_ind[i])*grid->na_sc; /* atom offset of this column */
 +
 +            copy_egp_to_nbat_egps(nbs->a+ash,grid->cxy_na[i],ncz*grid->na_sc,
 +                                  nbat->na_c,nbat->neg_2log,
 +                                  atinfo,nbat->energrp+(ash>>grid->na_c_2log));
 +        }
 +    }
 +}
 +
 +/* Sets all required atom parameter data in nbnxn_atomdata_t.
 + *
 + * With locality eatLocal only the first grid of nbs is processed, with
 + * any other locality all nbs->ngrid grids are processed.
 + * The atom types are taken from mdatoms->typeA and the charges from
 + * mdatoms->chargeA. The energy group indices are taken from atinfo and
 + * are only set when nbat->nenergrp is larger than 1.
 + */
 +void nbnxn_atomdata_set(nbnxn_atomdata_t *nbat,
 +                        int locality,
 +                        const nbnxn_search_t nbs,
 +                        const t_mdatoms *mdatoms,
 +                        const int *atinfo)
 +{
 +    int ngrid; /* number of grids, counted from grid 0, to process */
 +
 +    if (locality == eatLocal)
 +    {
 +        ngrid = 1;
 +    }
 +    else
 +    {
 +        ngrid = nbs->ngrid;
 +    }
 +
 +    nbnxn_atomdata_set_atomtypes(nbat,ngrid,nbs,mdatoms->typeA);
 +
 +    nbnxn_atomdata_set_charges(nbat,ngrid,nbs,mdatoms->chargeA);
 +
 +    if (nbat->nenergrp > 1)
 +    {
 +        nbnxn_atomdata_set_energygroups(nbat,ngrid,nbs,atinfo);
 +    }
 +}
 +
 +/* Copies the shift vector array to nbnxn_atomdata_t.
 + *
 + * Stores bDynamicBox in nbat->bDynamicBox and copies the SHIFTS vectors
 + * shift_vec[0..SHIFTS-1] to nbat->shift_vec.
 + */
 +void nbnxn_atomdata_copy_shiftvec(gmx_bool bDynamicBox,
 +                                   rvec *shift_vec,
 +                                   nbnxn_atomdata_t *nbat)
 +{
 +    int i;
 +
 +    nbat->bDynamicBox = bDynamicBox;
 +    for(i=0; i<SHIFTS; i++)
 +    {
 +        copy_rvec(shift_vec[i],nbat->shift_vec[i]);
 +    }
 +}
 +
 +/* Copies (and reorders) the coordinates to nbnxn_atomdata_t.
 + *
 + * locality selects the grids g0 <= g < g1 of nbs that are copied:
 + *  eatAll:      all grids
 + *  eatLocal:    grid 0 only
 + *  eatNonlocal: grids 1 up to nbs->ngrid-1
 + * For any other value g0 and g1 both stay 0 and no grid is copied.
 + *
 + * When FillLocal is set, nbat->natoms_local is set to the number of atom
 + * slots of grid 0 (nc*na_sc) and the columns of grid 0 are filled up to
 + * their full size; in all other cases only the na atoms present in a
 + * column are written.
 + *
 + * The work is divided over nth = gmx_omp_nthreads_get(emntPairsearch)
 + * OpenMP threads: thread th handles the columns cxy0 <= cxy < cxy1 of
 + * each selected grid.
 + *
 + * NOTE(review): the three trailing 0 arguments passed to
 + * copy_rvec_to_nbat_real are presumably the coordinates used for the
 + * filler entries; confirm against that function's definition.
 + */
 +void nbnxn_atomdata_copy_x_to_nbat_x(const nbnxn_search_t nbs,
 +                                      int locality,
 +                                      gmx_bool FillLocal,
 +                                      rvec *x,
 +                                      nbnxn_atomdata_t *nbat)
 +{
 +    int g0=0,g1=0;
 +    int nth,th;
 +
 +    switch (locality)
 +    {
 +    case eatAll:
 +        g0 = 0;
 +        g1 = nbs->ngrid;
 +        break;
 +    case eatLocal:
 +        g0 = 0;
 +        g1 = 1;
 +        break;
 +    case eatNonlocal:
 +        g0 = 1;
 +        g1 = nbs->ngrid;
 +        break;
 +    }
 +
 +    if (FillLocal)
 +    {
 +        nbat->natoms_local = nbs->grid[0].nc*nbs->grid[0].na_sc;
 +    }
 +
 +    nth = gmx_omp_nthreads_get(emntPairsearch);
 +
 +#pragma omp parallel for num_threads(nth) schedule(static)
 +    for(th=0; th<nth; th++)
 +    {
 +        int g;
 +
 +        for(g=g0; g<g1; g++)
 +        {
 +            const nbnxn_grid_t *grid;
 +            int cxy0,cxy1,cxy;
 +
 +            grid = &nbs->grid[g];
 +
 +            cxy0 = (grid->ncx*grid->ncy* th   +nth-1)/nth; /* first column of thread th, rounded up */
 +            cxy1 = (grid->ncx*grid->ncy*(th+1)+nth-1)/nth; /* one past the last column of thread th */
 +
 +            for(cxy=cxy0; cxy<cxy1; cxy++)
 +            {
 +                int na,ash,na_fill;
 +
 +                na  = grid->cxy_na[cxy]; /* atoms present in this column */
 +                ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc; /* atom offset of this column */
 +
 +                if (g == 0 && FillLocal)
 +                {
 +                    na_fill =
 +                        (grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy])*grid->na_sc;
 +                }
 +                else
 +                {
 +                    /* We fill only the real particle locations.
 +                     * We assume the filling entries at the end have been
 +                     * properly set before during ns.
 +                     */
 +                    na_fill = na;
 +                }
 +                copy_rvec_to_nbat_real(nbs->a+ash,na,na_fill,x,
 +                                       nbat->XFormat,nbat->x,ash,
 +                                       0,0,0);
 +            }
 +        }
 +    }
 +}
 +
++static void
++nbnxn_atomdata_clear_reals(real * gmx_restrict dest,
++                           int i0, int i1)
++{   /* Sets dest[i] to zero for every index i with i0 <= i < i1 */
++    int i;
++
++    for(i=i0; i<i1; i++)
++    {
++        dest[i] = 0;
++    }
++}
++
 +static void
 +nbnxn_atomdata_reduce_reals(real * gmx_restrict dest,
++                            gmx_bool bDestSet,
 +                            real ** gmx_restrict src,
 +                            int nsrc,
 +                            int i0, int i1)
 +{
 +    int i,s;
 +
-         for(s=0; s<nsrc; s++)
++    if (bDestSet)
++    {
++        /* The destination buffer contains data, add to it */
++        for(i=i0; i<i1; i++)
++        {
++            for(s=0; s<nsrc; s++)
++            {
++                dest[i] += src[s][i];
++            }
++        }
++    }
++    else
 +    {
-             dest[i] += src[s][i];
++        /* The destination buffer is uninitialized, set it first */
++        for(i=i0; i<i1; i++)
 +        {
-     if ((i0 & (GMX_X86_SIMD_WIDTH_HERE-1)) ||
-         (i1 & (GMX_X86_SIMD_WIDTH_HERE-1)))
++            dest[i] = src[0][i];
++            for(s=1; s<nsrc; s++)
++            {
++                dest[i] += src[s][i];
++            }
 +        }
 +    }
 +}
 +
 +static void
 +nbnxn_atomdata_reduce_reals_x86_simd(real * gmx_restrict dest,
++                                     gmx_bool bDestSet,
 +                                     real ** gmx_restrict src,
 +                                     int nsrc,
 +                                     int i0, int i1)
 +{
 +#ifdef NBNXN_SEARCH_SSE
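 +/* Use 256-bit SIMD when GMX_X86_AVX_256 is defined and 128-bit SIMD
 + * otherwise.
 + * NOTE(review): this comment previously stated that AVX256 can not be used
 + * when AVX128 kernels are selected, that the reduction is not faster with
 + * AVX256 and that 128-bit SIMD is therefore used; the selection below no
 + * longer matches that. Confirm the AVX128 kernel restriction is handled.
 + */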
++#ifdef GMX_X86_AVX_256
++#define GMX_MM256_HERE
++#else
 +#define GMX_MM128_HERE
++#endif
 +#include "gmx_x86_simd_macros.h"
 +
 +    int       i,s;
 +    gmx_mm_pr dest_SSE,src_SSE;
 +
-         gmx_incons("bounds not a multiple of GMX_X86_SIMD_WIDTH_HERE in nbnxn_atomdata_reduce_reals_x86_simd");
++    if (bDestSet)
 +    {
-     for(i=i0; i<i1; i+=GMX_X86_SIMD_WIDTH_HERE)
++        for(i=i0; i<i1; i+=GMX_X86_SIMD_WIDTH_HERE)
++        {
++            dest_SSE = gmx_load_pr(dest+i);
++            for(s=0; s<nsrc; s++)
++            {
++                src_SSE  = gmx_load_pr(src[s]+i);
++                dest_SSE = gmx_add_pr(dest_SSE,src_SSE);
++            }
++            gmx_store_pr(dest+i,dest_SSE);
++        }
 +    }
-         dest_SSE = gmx_load_pr(dest+i);
-         for(s=0; s<nsrc; s++)
++    else
 +    {
-             src_SSE  = gmx_load_pr(src[s]+i);
-             dest_SSE = gmx_add_pr(dest_SSE,src_SSE);
++        for(i=i0; i<i1; i+=GMX_X86_SIMD_WIDTH_HERE)
 +        {
-         gmx_store_pr(dest+i,dest_SSE);
++            dest_SSE = gmx_load_pr(src[0]+i);
++            for(s=1; s<nsrc; s++)
++            {
++                src_SSE  = gmx_load_pr(src[s]+i);
++                dest_SSE = gmx_add_pr(dest_SSE,src_SSE);
++            }
++            gmx_store_pr(dest+i,dest_SSE);
 +        }
-     gmx_bool bStreamingReduce;
 +    }
 +
 +#undef GMX_MM128_HERE
 +#undef GMX_MM256_HERE
 +#endif
 +}
 +
 +/* Add part of the force array(s) from nbnxn_atomdata_t to f.
 + *
 + * For every atom a with a0 <= a < a1, nbs->cell[a] is used to locate the
 + * atom in the nbat force layout, and the x,y,z force components stored
 + * there in the output buffers out[0..nfa-1].f are added to f[a].
 + * Where the components are located depends on nbat->FFormat:
 + *  nbatXYZ, nbatXYZQ: base index cell[a]*nbat->fstride, components at
 + *                     base, base+1 and base+2
 + *  nbatX4:            base index X4_IND_A(cell[a]), component d at
 + *                     base + d*PACK_X4
 + *  nbatX8:            base index X8_IND_A(cell[a]), component d at
 + *                     base + d*PACK_X8
 + * The case nfa == 1 is handled separately and reads only out[0].f.
 + * For any other FFormat value no case matches and nothing is added.
 + */
 +static void
 +nbnxn_atomdata_add_nbat_f_to_f_part(const nbnxn_search_t nbs,
 +                                    const nbnxn_atomdata_t *nbat,
 +                                    nbnxn_atomdata_output_t *out,
 +                                    int nfa,
 +                                    int a0,int a1,
 +                                    rvec *f)
 +{
 +    int  a,i,fa;
 +    const int  *cell;
 +    const real *fnb;
 +
 +    cell = nbs->cell;
 +
 +    /* Loop over the atoms a0 to a1-1 and add their forces to f,
 +     * with the indexing chosen by the force buffer format.
 +     */
 +    switch (nbat->FFormat)
 +    {
 +    case nbatXYZ:
 +    case nbatXYZQ:
 +        if (nfa == 1)
 +        {
 +            fnb = out[0].f;
 +
 +            for(a=a0; a<a1; a++)
 +            {
 +                i = cell[a]*nbat->fstride;
 +
 +                f[a][XX] += fnb[i];
 +                f[a][YY] += fnb[i+1];
 +                f[a][ZZ] += fnb[i+2];
 +            }
 +        }
 +        else
 +        {
 +            for(a=a0; a<a1; a++)
 +            {
 +                i = cell[a]*nbat->fstride;
 +
 +                for(fa=0; fa<nfa; fa++)
 +                {
 +                    f[a][XX] += out[fa].f[i];
 +                    f[a][YY] += out[fa].f[i+1];
 +                    f[a][ZZ] += out[fa].f[i+2];
 +                }
 +            }
 +        }
 +        break;
 +    case nbatX4:
 +        if (nfa == 1)
 +        {
 +            fnb = out[0].f;
 +
 +            for(a=a0; a<a1; a++)
 +            {
 +                i = X4_IND_A(cell[a]);
 +
 +                f[a][XX] += fnb[i+XX*PACK_X4];
 +                f[a][YY] += fnb[i+YY*PACK_X4];
 +                f[a][ZZ] += fnb[i+ZZ*PACK_X4];
 +            }
 +        }
 +        else
 +        {
 +            for(a=a0; a<a1; a++)
 +            {
 +                i = X4_IND_A(cell[a]);
 +                
 +                for(fa=0; fa<nfa; fa++)
 +                {
 +                    f[a][XX] += out[fa].f[i+XX*PACK_X4];
 +                    f[a][YY] += out[fa].f[i+YY*PACK_X4];
 +                    f[a][ZZ] += out[fa].f[i+ZZ*PACK_X4];
 +                }
 +            }
 +        }
 +        break;
 +    case nbatX8:
 +        if (nfa == 1)
 +        {
 +            fnb = out[0].f;
 +
 +            for(a=a0; a<a1; a++)
 +            {
 +                i = X8_IND_A(cell[a]);
 +
 +                f[a][XX] += fnb[i+XX*PACK_X8];
 +                f[a][YY] += fnb[i+YY*PACK_X8];
 +                f[a][ZZ] += fnb[i+ZZ*PACK_X8];
 +            }
 +        }
 +        else
 +        {
 +            for(a=a0; a<a1; a++)
 +            {
 +                i = X8_IND_A(cell[a]);
 +                
 +                for(fa=0; fa<nfa; fa++)
 +                {
 +                    f[a][XX] += out[fa].f[i+XX*PACK_X8];
 +                    f[a][YY] += out[fa].f[i+YY*PACK_X8];
 +                    f[a][ZZ] += out[fa].f[i+ZZ*PACK_X8];
 +                }
 +            }
 +        }
 +        break;
 +    }
 +}
 +
 +/* Add the force array(s) from nbnxn_atomdata_t to f */
 +void nbnxn_atomdata_add_nbat_f_to_f(const nbnxn_search_t nbs,
 +                                    int locality,
 +                                    const nbnxn_atomdata_t *nbat,
 +                                    rvec *f)
 +{
 +    int a0=0,na=0;
 +    int nth,th;
-     /* Using the two-step streaming reduction is probably always faster */
-     bStreamingReduce = (nbat->nout > 1);
-     if (bStreamingReduce)
 +
 +    nbs_cycle_start(&nbs->cc[enbsCCreducef]);
 +
 +    switch (locality)
 +    {
 +    case eatAll:
 +        a0 = 0;
 +        na = nbs->natoms_nonlocal;
 +        break;
 +    case eatLocal:
 +        a0 = 0;
 +        na = nbs->natoms_local;
 +        break;
 +    case eatNonlocal:
 +        a0 = nbs->natoms_local;
 +        na = nbs->natoms_nonlocal - nbs->natoms_local;
 +        break;
 +    }
 +
 +    nth = gmx_omp_nthreads_get(emntNonbonded);
 +
-             int g0,g1,g;
++    if (nbat->nout > 1)
 +    {
++        if (locality != eatAll)
++        {
++            gmx_incons("add_f_to_f called with nout>1 and locality!=eatAll");
++        }
++
 +        /* Reduce the force thread output buffers into buffer 0, before adding
 +         * them to the, differently ordered, "real" force buffer.
 +         */
 +#pragma omp parallel for num_threads(nth) schedule(static)
 +        for(th=0; th<nth; th++)
 +        {
-             /* For which grids should we reduce the force output? */
-             g0 = ((locality==eatLocal || locality==eatAll) ? 0 : 1);
-             g1 = (locality==eatLocal ? 1 : nbs->ngrid);
++            const nbnxn_buffer_flags_t *flags;
++            int b0,b1,b;
++            int i0,i1;
++            int nfptr;
++            real *fptr[NBNXN_BUFFERFLAG_MAX_THREADS];
++            int out;
 +
-             for(g=g0; g<g1; g++)
-             {
-                 nbnxn_grid_t *grid;
-                 int b0,b1,b;
-                 int c0,c1,i0,i1;
-                 int nfptr;
-                 real *fptr[NBNXN_CELLBLOCK_MAX_THREADS];
-                 int out;
-                 grid = &nbs->grid[g];
++            flags = &nbat->buffer_flags;
 +
-                 /* Calculate the cell-block range for our thread */
-                 b0 = (grid->cellblock_flags.ncb* th   )/nth;
-                 b1 = (grid->cellblock_flags.ncb*(th+1))/nth;
++            /* Calculate the cell-block range for our thread */
++            b0 = (flags->nflag* th   )/nth;
++            b1 = (flags->nflag*(th+1))/nth;
 +
-                 if (grid->cellblock_flags.bUse)
++            for(b=b0; b<b1; b++)
++            {
++                i0 =  b   *NBNXN_BUFFERFLAG_SIZE*nbat->fstride;
++                i1 = (b+1)*NBNXN_BUFFERFLAG_SIZE*nbat->fstride;
 +
-                     for(b=b0; b<b1; b++)
++                nfptr = 0;
++                for(out=1; out<nbat->nout; out++)
 +                {
-                         c0 = b*NBNXN_CELLBLOCK_SIZE;
-                         c1 = min(c0 + NBNXN_CELLBLOCK_SIZE,grid->nc);
-                         i0 = (grid->cell0 + c0)*grid->na_c*nbat->fstride;
-                         i1 = (grid->cell0 + c1)*grid->na_c*nbat->fstride;
-                         nfptr = 0;
-                         for(out=1; out<nbat->nout; out++)
-                         {
-                             if (grid->cellblock_flags.flag[b] & (1U<<out))
-                             {
-                                 fptr[nfptr++] = nbat->out[out].f;
-                             }
-                         }
-                         if (nfptr > 0)
-                         {
- #ifdef NBNXN_SEARCH_SSE
-                             nbnxn_atomdata_reduce_reals_x86_simd
- #else
-                             nbnxn_atomdata_reduce_reals
- #endif
-                                                        (nbat->out[0].f,
-                                                         fptr,nfptr,
-                                                         i0,i1);
-                         }
++                    if (flags->flag[b] & (1U<<out))
 +                    {
-                 else
++                        fptr[nfptr++] = nbat->out[out].f;
 +                    }
 +                }
-                     c0 = b0*NBNXN_CELLBLOCK_SIZE;
-                     c1 = min(b1*NBNXN_CELLBLOCK_SIZE,grid->nc);
-                     i0 = (grid->cell0 + c0)*grid->na_c*nbat->fstride;
-                     i1 = (grid->cell0 + c1)*grid->na_c*nbat->fstride;
-                     nfptr = 0;
-                     for(out=1; out<nbat->nout; out++)
-                     {
-                         fptr[nfptr++] = nbat->out[out].f;
-                     }
++                if (nfptr > 0)
 +                {
-                                             bStreamingReduce ? 1 : nbat->nout,
 +#ifdef NBNXN_SEARCH_SSE
 +                    nbnxn_atomdata_reduce_reals_x86_simd
 +#else
 +                    nbnxn_atomdata_reduce_reals
 +#endif
 +                                               (nbat->out[0].f,
++                                                flags->flag[b] & (1U<<0),
 +                                                fptr,nfptr,
 +                                                i0,i1);
 +                }
++                else if (!(flags->flag[b] & (1U<<0)))
++                {
++                    nbnxn_atomdata_clear_reals(nbat->out[0].f,
++                                               i0,i1);
++                }
 +            }
 +        }
 +    }
 +
 +#pragma omp parallel for num_threads(nth) schedule(static)
 +    for(th=0; th<nth; th++)
 +    {
 +        nbnxn_atomdata_add_nbat_f_to_f_part(nbs,nbat,
 +                                            nbat->out,
++                                            1,
 +                                            a0+((th+0)*na)/nth,
 +                                            a0+((th+1)*na)/nth,
 +                                            f);
 +    }
 +
 +    nbs_cycle_stop(&nbs->cc[enbsCCreducef]);
 +}
 +
 +/* Adds the shift forces from nbnxn_atomdata_t to fshift.
 + *
 + * For each of the SHIFTS shift vectors s, the x,y,z shift force
 + * components stored at index s*DIM in the fshift array of every output
 + * buffer nbat->out[0..nbat->nout-1] are summed, and the sum is added to
 + * fshift[s].
 + */
 +void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat,
 +                                              rvec *fshift)
 +{
 +    const nbnxn_atomdata_output_t *out;
 +    int  th;
 +    int  s;
 +    rvec sum;
 +
 +    out = nbat->out;
 +    
 +    for(s=0; s<SHIFTS; s++)
 +    {
 +        clear_rvec(sum);
 +        for(th=0; th<nbat->nout; th++)
 +        {
 +            sum[XX] += out[th].fshift[s*DIM+XX];
 +            sum[YY] += out[th].fshift[s*DIM+YY];
 +            sum[ZZ] += out[th].fshift[s*DIM+ZZ];
 +        }
 +        rvec_inc(fshift[s],sum);
 +    }
 +}
index bfcf1277e53e71ce4944ca7aca80e39abcc856da,0000000000000000000000000000000000000000..48722c680dd8be55558185ad908efb0b5ac91982
mode 100644,000000..100644
--- /dev/null
@@@ -1,269 -1,0 +1,242 @@@
- #define _nsnxn_internal_h
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustr
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +
 +#ifndef _nbnxn_internal_h
- /* Block size for the non-bonded thread force-buffer reduction,
-  * should be a multiple of 2 in case of AVX256.
-  */
- #define NBNXN_CELLBLOCK_SIZE       4
- #define NBNXN_CELLBLOCK_SIZE_2LOG  2
- /* We currently store the reduction flags as bits in an unsigned int.
-  * In most cases this limits the number of flags to 32.
-  * The reduction will automatically disable the flagging and do a full
-  * reduction when the flags won't fit, but this will lead to very slow
-  * reduction. As we anyhow don't expect reasonable performance with
-  * more than 32 threads, we put in this hard limit.
-  * You can increase this number, but the reduction will be very slow.
-  */
- #define NBNXN_CELLBLOCK_MAX_THREADS  32
- /* Flags for telling if threads write to force output buffers */
- typedef struct {
-     int ncb;         /* The number of cell blocks                         */
-     gmx_bool bUse;   /* Should we use these flags?                        */
-     unsigned *flag;  /* Bit i is set when thread i writes to a cell-block */
-     int flag_nalloc; /* Allocation size of cxy_flag                       */
- } nbnxn_cellblock_flags;
++#define _nbnxn_internal_h
 +
 +#include "typedefs.h"
 +#include "domdec.h"
 +#include "gmx_cyclecounter.h"
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +
 +
 +#ifdef GMX_X86_SSE2
 +#define NBNXN_SEARCH_SSE
 +#endif
 +
 +
-     nbnxn_cellblock_flags cellblock_flags; /* Flags for F output buffers */
 +/* A pair-search grid struct for one domain decomposition zone */
 +typedef struct {
 +    rvec c0;             /* The lower corner of the (local) grid        */
 +    rvec c1;             /* The upper corner of the (local) grid        */
 +    real atom_density;   /* The atom number density for the local grid  */
 +
 +    gmx_bool bSimple;    /* Is this grid simple or super/sub            */
 +    int  na_c;           /* Number of atoms per cluster                 */
 +    int  na_cj;          /* Number of atoms for list j-clusters         */
 +    int  na_sc;          /* Number of atoms per super-cluster           */
 +    int  na_c_2log;      /* 2log of na_c                                */
 +
 +    int  ncx;            /* Number of (super-)cells along x             */
 +    int  ncy;            /* Number of (super-)cells along y             */
 +    int  nc;             /* Total number of (super-)cells               */
 +
 +    real sx;             /* x-size of a (super-)cell                    */
 +    real sy;             /* y-size of a (super-)cell                    */
 +    real inv_sx;         /* 1/sx                                        */
 +    real inv_sy;         /* 1/sy                                        */
 +
 +    int  cell0;          /* Index in nbs->cell corresponding to cell 0  */
 +
 +    int  *cxy_na;        /* The number of atoms for each column in x,y  */
 +    int  *cxy_ind;       /* Grid (super)cell index, offset from cell0   */
 +    int  cxy_nalloc;     /* Allocation size for cxy_na and cxy_ind      */
 +
 +    int   *nsubc;        /* The number of sub cells for each super cell */
 +    float *bbcz;         /* Bounding boxes in z for the super cells     */
 +    float *bb;           /* 3D bounding boxes for the sub cells         */
 +    float *bbj;          /* 3D j-b.boxes for SSE-double or AVX-single   */
 +    int   *flags;        /* Flag for the super cells                    */
 +    int   nc_nalloc;     /* Allocation size for the pointers above      */
 +
 +    float *bbcz_simple;  /* bbcz for simple grid converted from super   */
 +    float *bb_simple;    /* bb for simple grid converted from super     */
 +    int   *flags_simple; /* flags for simple grid converted from super  */
 +    int   nc_nalloc_simple; /* Allocation size for the pointers above   */
 +
-     nbnxn_cellblock_flags gridi_flags; /* Flags for i-grid f buffer     */
-     nbnxn_cellblock_flags gridj_flags; /* Flags for j-grid f buffer     */
 +    int  nsubc_tot;      /* Total number of subcell, used for printing  */
 +} nbnxn_grid_t;
 +
 +#ifdef NBNXN_SEARCH_SSE
 +#define GMX_MM128_HERE
 +#include "gmx_x86_simd_macros.h"
 +typedef struct nbnxn_x_ci_x86_simd128 {
 +    /* The i-cluster coordinates for simple search */
 +    gmx_mm_pr ix_SSE0,iy_SSE0,iz_SSE0;
 +    gmx_mm_pr ix_SSE1,iy_SSE1,iz_SSE1;
 +    gmx_mm_pr ix_SSE2,iy_SSE2,iz_SSE2;
 +    gmx_mm_pr ix_SSE3,iy_SSE3,iz_SSE3;
 +} nbnxn_x_ci_x86_simd128_t;
 +#undef GMX_MM128_HERE
 +#ifdef GMX_X86_AVX_256
 +#define GMX_MM256_HERE
 +#include "gmx_x86_simd_macros.h"
 +typedef struct nbnxn_x_ci_x86_simd256 {
 +    /* The i-cluster coordinates for simple search */
 +    gmx_mm_pr ix_SSE0,iy_SSE0,iz_SSE0;
 +    gmx_mm_pr ix_SSE1,iy_SSE1,iz_SSE1;
 +    gmx_mm_pr ix_SSE2,iy_SSE2,iz_SSE2;
 +    gmx_mm_pr ix_SSE3,iy_SSE3,iz_SSE3;
 +} nbnxn_x_ci_x86_simd256_t;
 +#undef GMX_MM256_HERE
 +#endif
 +#endif
 +
 +/* Working data for the actual i-supercell during pair search */
 +typedef struct nbnxn_list_work {
 +    gmx_cache_protect_t cp0; /* Protect cache between threads               */
 +
 +    float *bb_ci;      /* The bounding boxes, pbc shifted, for each cluster */
 +    real  *x_ci;       /* The coordinates, pbc shifted, for each atom       */
 +#ifdef NBNXN_SEARCH_SSE
 +    nbnxn_x_ci_x86_simd128_t *x_ci_x86_simd128;
 +#ifdef GMX_X86_AVX_256
 +    nbnxn_x_ci_x86_simd256_t *x_ci_x86_simd256;
 +#endif
 +#endif
 +    int  cj_ind;       /* The current cj_ind index for the current list     */
 +    int  cj4_init;     /* The first uninitialized cj4 block                  */
 +
 +    float *d2;         /* Bounding box distance work array                  */
 +
 +    nbnxn_cj_t *cj;    /* The j-cell list                                   */
 +    int  cj_nalloc;    /* Allocation size of cj                             */
 +
 +    int ncj_noq;       /* Nr. of cluster pairs without Coul for flop count  */
 +    int ncj_hlj;       /* Nr. of cluster pairs with 1/2 LJ for flop count   */
 +
 +    gmx_cache_protect_t cp1; /* Protect cache between threads               */
 +} nbnxn_list_work_t;
 +
 +/* Function type for setting the i-atom coordinate working data */
 +typedef void
 +gmx_icell_set_x_t(int ci,
 +                  real shx,real shy,real shz,
 +                  int na_c,
 +                  int stride,const real *x,
 +                  nbnxn_list_work_t *work);
 +
 +static gmx_icell_set_x_t icell_set_x_simple;
 +#ifdef NBNXN_SEARCH_SSE
 +static gmx_icell_set_x_t icell_set_x_simple_x86_simd128;
 +#ifdef GMX_X86_AVX_256
 +static gmx_icell_set_x_t icell_set_x_simple_x86_simd256;
 +#endif
 +#endif
 +static gmx_icell_set_x_t icell_set_x_supersub;
 +#ifdef NBNXN_SEARCH_SSE
 +static gmx_icell_set_x_t icell_set_x_supersub_sse8;
 +#endif
 +
 +/* Local cycle count struct for profiling */
 +typedef struct {
 +    int          count;
 +    gmx_cycles_t c;
 +    gmx_cycles_t start;
 +} nbnxn_cycle_t;
 +
 +/* Local cycle count enum for profiling */
 +enum { enbsCCgrid, enbsCCsearch, enbsCCcombine, enbsCCreducef, enbsCCnr };
 +
 +/* Thread-local work struct, contains part of nbnxn_grid_t */
 +typedef struct {
 +    gmx_cache_protect_t cp0;
 +
 +    int *cxy_na;
 +    int cxy_na_nalloc;
 +
 +    int  *sort_work;
 +    int  sort_work_nalloc;
 +
++    nbnxn_buffer_flags_t buffer_flags; /* Flags for force buffer access */
 +
 +    int  ndistc;         /* Number of distance checks for flop counting */
 +
 +    nbnxn_cycle_t cc[enbsCCnr];
 +
 +    gmx_cache_protect_t cp1;
 +} nbnxn_search_work_t;
 +
 +/* Main pair-search struct, contains the grid(s), not the pair-list(s) */
 +typedef struct nbnxn_search {
 +    int  ePBC;            /* PBC type enum                              */
 +    matrix box;           /* The periodic unit-cell                     */
 +
 +    gmx_bool DomDec;      /* Are we doing domain decomposition?         */
 +    ivec dd_dim;          /* Are we doing DD in x,y,z?                  */
 +    gmx_domdec_zones_t *zones; /* The domain decomposition zones        */
 +
 +    int  ngrid;           /* The number of grids, equal to #DD-zones    */
 +    nbnxn_grid_t *grid;   /* Array of grids, size ngrid                 */
 +    int  *cell;           /* Actual allocated cell array for all grids  */
 +    int  cell_nalloc;     /* Allocation size of cell                    */
 +    int  *a;              /* Atom index for grid, the inverse of cell   */
 +    int  a_nalloc;        /* Allocation size of a                       */
 +
 +    int  natoms_local;    /* The local atoms run from 0 to natoms_local */
 +    int  natoms_nonlocal; /* The non-local atoms run from natoms_local
 +                           * to natoms_nonlocal */
 +
 +    gmx_bool print_cycles;
 +    int      search_count;
 +    nbnxn_cycle_t cc[enbsCCnr];
 +
 +    gmx_icell_set_x_t *icell_set_x; /* Function for setting i-coords    */
 +
 +    int  nthread_max;     /* Maximum number of threads for pair-search  */
 +    nbnxn_search_work_t *work; /* Work array, size nthread_max          */
 +} nbnxn_search_t_t;
 +
 +
 +static void nbs_cycle_start(nbnxn_cycle_t *cc)
 +{   /* Stores the current value of gmx_cycles_read() in cc->start */
 +    cc->start = gmx_cycles_read();
 +}
 +
 +static void nbs_cycle_stop(nbnxn_cycle_t *cc)
 +{   /* Adds the cycles elapsed since cc->start to cc->c and counts the call */
 +    cc->c += gmx_cycles_read() - cc->start;
 +    cc->count++;
 +}
 +
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +#endif
index 442938e675f89af78c9766903e5c74a69f179846,0000000000000000000000000000000000000000..428666a8525499c48b7a5b569bf6185dcaeb81d0
mode 100644,000000..100644
--- /dev/null
@@@ -1,89 -1,0 +1,127 @@@
- void
- clear_f(const nbnxn_atomdata_t *nbat,real *f)
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2009, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + * As a special exception, you may use this file as part of a free software
 + * library without restriction.  Specifically, if other files instantiate
 + * templates or use macros or inline functions from this file, or you compile
 + * this file and link it with other files to produce an executable, this
 + * file does not by itself cause the resulting executable to be covered by
 + * the GNU Lesser General Public License.
 + *
 + * In plain-speak: do not worry about classes/macros/templates either - only
 + * changes to the library have to be LGPL, not an application linking with it.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website!
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include "nbnxn_kernel_common.h"
 +
++static void
++clear_f_all(const nbnxn_atomdata_t *nbat,real *f) /* Zero the complete force buffer f */
 +{
 +    int i;
 +
 +    for(i=0; i<nbat->natoms*nbat->fstride; i++) /* f holds fstride reals per atom, natoms atoms */
 +    {
 +        f[i] = 0;
 +    }
 +}
 +
++static void
++clear_f_flagged(const nbnxn_atomdata_t *nbat,int output_index,real *f) /* Zero only the blocks of f whose buffer flag has bit output_index set */
++{
++    const nbnxn_buffer_flags_t *flags;
++    unsigned our_flag;
++    int g,b,a0,a1,i; /* NOTE(review): g is never used in this function */
++
++    flags = &nbat->buffer_flags;
++
++    our_flag = (1U << output_index); /* NOTE(review): requires 0 <= output_index < bit width of unsigned - confirm with callers */
++
++    for(b=0; b<flags->nflag; b++) /* One flag word per block of NBNXN_BUFFERFLAG_SIZE atoms */
++    {
++        if (flags->flag[b] & our_flag) /* Our bit is set for block b: clear it */
++        {
++            a0 = b*NBNXN_BUFFERFLAG_SIZE;      /* First atom of block b */
++            a1 = a0 + NBNXN_BUFFERFLAG_SIZE;   /* One past the last atom; not clamped to natoms, presumably f is padded - TODO confirm */
++            for(i=a0*nbat->fstride; i<a1*nbat->fstride; i++)
++            {
++                f[i] = 0;
++            }
++        }
++    }
++}
++
++void
++clear_f(const nbnxn_atomdata_t *nbat,int output_index,real *f) /* Clear force buffer f, fully or only its flagged blocks */
++{
++    if (nbat->bUseBufferFlags)
++    {
++        clear_f_flagged(nbat, output_index, f); /* Only the blocks flagged with bit output_index */
++    }
++    else
++    {
++        clear_f_all(nbat, f); /* output_index is not used on this path */
++    }
++}
++
 +void
 +clear_fshift(real *fshift) /* Zero a shift-force array of SHIFTS*DIM reals */
 +{
 +    int i;
 +
 +    for(i=0; i<SHIFTS*DIM; i++) /* fshift must provide at least SHIFTS*DIM elements */
 +    {
 +        fshift[i] = 0;
 +    }
 +}
 +
 +void
 +reduce_energies_over_lists(const nbnxn_atomdata_t     *nbat, /* Sum the energies in nbat->out[0..nlist-1] into Vvdw and Vc */
 +                           int                        nlist,
 +                           real                       *Vvdw, /* Accumulated into (+=), not overwritten */
 +                           real                       *Vc)   /* Accumulated into (+=), not overwritten */
 +{
 +    int nb;
 +    int i,j,ind,indr;
 +
 +    for(nb=0; nb<nlist; nb++) /* One output buffer per list */
 +    {
 +        for(i=0; i<nbat->nenergrp; i++)
 +        {
 +            /* Reduce the diagonal terms: element (i,i) of the nenergrp x nenergrp matrix */
 +            ind = i*nbat->nenergrp + i;
 +            Vvdw[ind] += nbat->out[nb].Vvdw[ind];
 +            Vc[ind]   += nbat->out[nb].Vc[ind];
 +
 +            /* Reduce the off-diagonal terms; only elements with j > i of Vvdw/Vc are written */
 +            for(j=i+1; j<nbat->nenergrp; j++)
 +            {
 +                /* The output should contain only one off-diagonal part, so (i,j) and (j,i) are summed into (i,j) */
 +                ind  = i*nbat->nenergrp + j;
 +                indr = j*nbat->nenergrp + i;
 +                Vvdw[ind] += nbat->out[nb].Vvdw[ind] + nbat->out[nb].Vvdw[indr];
 +                Vc[ind]   += nbat->out[nb].Vc[ind]   + nbat->out[nb].Vc[indr];
 +            }
 +        }
 +    }
 +}
index 716ed7f7187746c53bb96fcb7e0e65b8488ac250,0000000000000000000000000000000000000000..1589d0349ecd6e6e203f980aad88d77710a6a462
mode 100644,000000..100644
--- /dev/null
@@@ -1,65 -1,0 +1,71 @@@
- clear_f(const nbnxn_atomdata_t *nbat,real *f);
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + */
 +
 +#ifndef _nbnxn_kernel_common_h
 +#define _nbnxn_kernel_common_h
 +
 +#include "typedefs.h"
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +#if 0
 +}
 +#endif
 +
++/* Clear the force buffer f: either the whole buffer, or, when
++ * nbat->bUseBufferFlags is set, only the blocks flagged for this output.
++ * In the latter case output_index is the task/thread list/buffer index.
++ */
 +void
++clear_f(const nbnxn_atomdata_t *nbat,int output_index,real *f);
 +
++/* Clear the shift forces: sets SHIFTS*DIM elements of fshift to zero */
 +void
 +clear_fshift(real *fshift);
 +
++/* Reduce the collected energy terms over the pair-lists/threads: adds nbat->out[0..nlist-1] into Vvdw and Vc */
 +void
 +reduce_energies_over_lists(const nbnxn_atomdata_t     *nbat,
 +                           int                        nlist,
 +                           real                       *Vvdw,
 +                           real                       *Vc);
 +
 +#if 0
 +{
 +#endif
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +#endif
index 918c92b039f3a47298a1ff0f313d5f315601ad60,0000000000000000000000000000000000000000..f915050caf92cbe35a2ebd0e607138cc18cd8a9e
mode 100644,000000..100644
--- /dev/null
@@@ -1,377 -1,0 +1,377 @@@
-         clear_f(nbat, f);
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROningen Mixture of Alchemy and Childrens' Stories
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "types/simple.h"
 +#include "maths.h"
 +#include "vec.h"
 +#include "typedefs.h"
 +#include "force.h"
 +#include "nbnxn_kernel_gpu_ref.h"
 +#include "../nbnxn_consts.h"
 +#include "nbnxn_kernel_common.h"
 +
 +#define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER) /* Short local alias: clusters per super-cluster */
 +#define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE) /* Short local alias: atoms per cluster */
 +
 +void /* Plain-C non-bonded kernel over a super-cluster (sci/cj4) pair list; the gmx_fatal text below calls it the GPU reference kernel */
 +nbnxn_kernel_gpu_ref(const nbnxn_pairlist_t     *nbl,
 +                     const nbnxn_atomdata_t     *nbat,
 +                     const interaction_const_t  *iconst,
 +                     rvec                       *shift_vec,
 +                     int                        force_flags, /* Only GMX_FORCE_ENERGY is tested here */
 +                     int                        clearF,      /* enbvClearFYes: zero f before accumulating */
 +                     real *                     f,           /* Forces are accumulated here, fstride reals per atom */
 +                     real *                     fshift,      /* Shift forces are accumulated here, never cleared by this function */
 +                     real *                     Vc,          /* Coulomb energy is added to element 0 when energies are requested */
 +                     real *                     Vvdw)        /* VdW energy is added to element 0 when energies are requested */
 +{
 +    const nbnxn_sci_t *nbln;
 +    const real    *x;
 +    gmx_bool      bEner;
 +    gmx_bool      bEwald;
 +    const real    *Ftab=NULL;
 +    real          rcut2,rvdw2,rlist2;
 +    int           ntype;
 +    real          facel;
 +    int           n;
 +    int           ish3;
 +    int           sci;
 +    int           cj4_ind0,cj4_ind1,cj4_ind;
 +    int           ci,cj;
 +    int           ic,jc,ia,ja,is,ifs,js,jfs,im,jm;
 +    int           n0;
 +    int           ggid;
 +    real          shX,shY,shZ;
 +    real          fscal,tx,ty,tz;
 +    real          rinvsq;
 +    real          iq;
 +    real          qq,vcoul=0,krsq,vctot;
 +    int           nti;
 +    int           tj;
 +    real          rt,r,eps;
 +    real          rinvsix;
 +    real          Vvdwtot;
 +    real          Vvdw_rep,Vvdw_disp;
 +    real          ix,iy,iz,fix,fiy,fiz;
 +    real          jx,jy,jz;
 +    real          dx,dy,dz,rsq,rinv;
 +    int           int_bit;
 +    real          fexcl;
 +    real          c6,c12,cexp1,cexp2,br; /* NOTE(review): cexp1, cexp2 and br are never used in this function */
 +    const real *  shiftvec;
 +    real *        vdwparam;
 +    int *         shift; /* NOTE(review): never used in this function */
 +    int *         type;
 +    const nbnxn_excl_t *excl[2];
 +
 +    int           npair_tot,npair;  /* Statistics only; reported in the debug output at the end */
 +    int           nhwu,nhwu_pruned; /* Half work-unit counters; statistics only */
 +
 +    if (nbl->na_ci != CL_SIZE) /* The loops below hard-code the cluster size */
 +    {
 +        gmx_fatal(FARGS,"The neighborlist cluster size in the GPU reference kernel is %d, expected it to be %d",nbl->na_ci,CL_SIZE);
 +    }
 +
 +    if (clearF == enbvClearFYes)
 +    {
++        clear_f(nbat, 0, f); /* Output index 0 is hard-coded; presumably this kernel always runs with a single output buffer - TODO confirm */
 +    }
 +
 +    bEner = (force_flags & GMX_FORCE_ENERGY);
 +
 +    bEwald = EEL_FULL(iconst->eeltype); /* Otherwise the reaction-field expressions below are used */
 +    if (bEwald)
 +    {
 +        Ftab = iconst->tabq_coul_F; /* Table interpolated linearly in the Ewald branch below */
 +    }
 +
 +    rcut2               = iconst->rcoulomb*iconst->rcoulomb; /* Squared cut-offs are compared against rsq */
 +    rvdw2               = iconst->rvdw*iconst->rvdw;
 +
 +    rlist2              = nbl->rlist*nbl->rlist; /* Only used for the pruning statistics */
 +
 +    type                = nbat->type;
 +    facel               = iconst->epsfac;
 +    shiftvec            = shift_vec[0]; /* View the rvec array as a flat real array */
 +    vdwparam            = nbat->nbfp;
 +    ntype               = nbat->ntype;
 +
 +    x = nbat->x; /* xstride reals per atom: x, y, z and the charge at offset 3 */
 +
 +    npair_tot   = 0;
 +    nhwu        = 0;
 +    nhwu_pruned = 0;
 +
 +    for(n=0; n<nbl->nsci; n++) /* Loop over the i-super-cluster list entries */
 +    {
 +        nbln = &nbl->sci[n];
 +
 +        ish3             = 3*nbln->shift;     
 +        shX              = shiftvec[ish3];  
 +        shY              = shiftvec[ish3+1];
 +        shZ              = shiftvec[ish3+2];
 +        cj4_ind0         = nbln->cj4_ind_start;      
 +        cj4_ind1         = nbln->cj4_ind_end;    
 +        sci              = nbln->sci;
 +        vctot            = 0;              
 +        Vvdwtot          = 0;              
 +
 +        if (nbln->shift == CENTRAL &&
 +            nbl->cj4[cj4_ind0].cj[0] == sci*NCL_PER_SUPERCL) /* First j-cluster equals the first i-cluster of this super-cluster */
 +        {
 +            /* we have the diagonal:
 +             * add the charge self interaction energy term
 +             */
 +            for(im=0; im<NCL_PER_SUPERCL; im++)
 +            {
 +                ci = sci*NCL_PER_SUPERCL + im;
 +                for (ic=0; ic<CL_SIZE; ic++)
 +                {
 +                    ia     = ci*CL_SIZE + ic;
 +                    iq     = x[ia*nbat->xstride+3];
 +                    vctot += iq*iq; /* Sum of squared charges, scaled below */
 +                }
 +            }
 +            if (!bEwald)
 +            {
 +                vctot *= -facel*0.5*iconst->c_rf; /* Reaction-field self term */
 +            }
 +            else
 +            {
 +                /* last factor 1/sqrt(pi) */
 +                vctot *= -facel*iconst->ewaldcoeff*M_1_SQRTPI;
 +            }
 +        }
 +        
 +        for(cj4_ind=cj4_ind0; (cj4_ind<cj4_ind1); cj4_ind++) /* Loop over the groups of 4 j-clusters */
 +        {
 +            excl[0]           = &nbl->excl[nbl->cj4[cj4_ind].imei[0].excl_ind];
 +            excl[1]           = &nbl->excl[nbl->cj4[cj4_ind].imei[1].excl_ind];
 +
 +            for(jm=0; jm<4; jm++)
 +            {
 +                cj               = nbl->cj4[cj4_ind].cj[jm];
 +
 +                for(im=0; im<NCL_PER_SUPERCL; im++)
 +                {
 +                    /* We're only using the first imask,
 +                     * but here imei[1].imask is identical.
 +                     */
 +                    if ((nbl->cj4[cj4_ind].imei[0].imask >> (jm*NCL_PER_SUPERCL+im)) & 1) /* Bit set: cluster pair (im,jm) is in the list */
 +                    {
 +                        gmx_bool within_rlist;
 +
 +                        ci               = sci*NCL_PER_SUPERCL + im;
 +
 +                        within_rlist     = FALSE;
 +                        npair            = 0;
 +                        for(ic=0; ic<CL_SIZE; ic++)
 +                        {
 +                            ia               = ci*CL_SIZE + ic;
 +                    
 +                            is               = ia*nbat->xstride;
 +                            ifs              = ia*nbat->fstride;
 +                            ix               = shX + x[is+0]; /* i-coordinates include the periodic shift */
 +                            iy               = shY + x[is+1];
 +                            iz               = shZ + x[is+2];
 +                            iq               = facel*x[is+3];
 +                            nti              = ntype*2*type[ia]; /* Row offset into vdwparam: two parameters per type pair */
 +                    
 +                            fix              = 0;
 +                            fiy              = 0;
 +                            fiz              = 0;
 +
 +                            for(jc=0; jc<CL_SIZE; jc++)
 +                            {
 +                                ja               = cj*CL_SIZE + jc;
 +
 +                                if (nbln->shift == CENTRAL &&
 +                                    ci == cj && ja <= ia) /* Within one cluster, unshifted: keep only ja > ia, so no self or double-counted pairs */
 +                                {
 +                                    continue;
 +                                }
 +                        
 +                                int_bit = ((excl[jc>>2]->pair[(jc & 3)*CL_SIZE+ic] >> (jm*NCL_PER_SUPERCL+im)) & 1); 
 +
 +                                js               = ja*nbat->xstride;
 +                                jfs              = ja*nbat->fstride;
 +                                jx               = x[js+0];      
 +                                jy               = x[js+1];      
 +                                jz               = x[js+2];      
 +                                dx               = ix - jx;      
 +                                dy               = iy - jy;      
 +                                dz               = iz - jz;      
 +                                rsq              = dx*dx + dy*dy + dz*dz;
 +                                if (rsq < rlist2)
 +                                {
 +                                    within_rlist = TRUE; /* Statistics only: this half work-unit survives pruning */
 +                                }
 +                                if (rsq >= rcut2) /* Beyond the Coulomb cut-off: no interaction at all */
 +                                {
 +                                    continue;
 +                                }
 +
 +                                if (type[ia] != ntype-1 && type[ja] != ntype-1) /* Presumably type ntype-1 marks filler atoms - TODO confirm */
 +                                {
 +                                    npair++;
 +                                }
 +
 +                                /* avoid NaN for excluded pairs at r=0 */
 +                                rsq             += (1.0 - int_bit)*NBNXN_AVOID_SING_R2_INC;
 +
 +                                rinv             = gmx_invsqrt(rsq);
 +                                rinvsq           = rinv*rinv;  
 +                                fscal            = 0;
 +                        
 +                                qq               = iq*x[js+3];
 +                                if (!bEwald)
 +                                {
 +                                    /* Reaction-field */
 +                                    krsq  = iconst->k_rf*rsq;
 +                                    fscal = qq*(int_bit*rinv - 2*krsq)*rinvsq;
 +                                    if (bEner)
 +                                    {
 +                                        vcoul = qq*(int_bit*rinv + krsq - iconst->c_rf);
 +                                    }
 +                                }
 +                                else
 +                                {
 +                                    r     = rsq*rinv;
 +                                    rt    = r*iconst->tabq_scale;
 +                                    n0    = rt;      /* Truncation to the lower table index */
 +                                    eps   = rt - n0; /* Fractional position between table points */
 +
 +                                    fexcl = (1 - eps)*Ftab[n0] + eps*Ftab[n0+1]; /* Linear interpolation in the table */
 +
 +                                    fscal = qq*(int_bit*rinvsq - fexcl)*rinv;
 +
 +                                    if (bEner)
 +                                    {
 +                                        vcoul = qq*((int_bit - gmx_erf(iconst->ewaldcoeff*r))*rinv - int_bit*iconst->sh_ewald);
 +                                    }
 +                                }
 +
 +                                if (rsq < rvdw2)
 +                                {
 +                                    tj        = nti + 2*type[ja];
 +
 +                                    /* Vanilla Lennard-Jones cutoff */
 +                                    c6        = vdwparam[tj];
 +                                    c12       = vdwparam[tj+1];
 +                                
 +                                    rinvsix   = int_bit*rinvsq*rinvsq*rinvsq; /* int_bit zeroes the LJ terms for excluded pairs */
 +                                    Vvdw_disp = c6*rinvsix;     
 +                                    Vvdw_rep  = c12*rinvsix*rinvsix;
 +                                    fscal    += (Vvdw_rep - Vvdw_disp)*rinvsq;
 +
 +                                    if (bEner)
 +                                    {
 +                                        vctot   += vcoul; /* NOTE(review): only added when rsq < rvdw2; presumably rvdw == rcoulomb is assumed here - confirm */
 +
 +                                        Vvdwtot +=
 +                                            (Vvdw_rep - int_bit*c12*iconst->sh_invrc6*iconst->sh_invrc6)/12 -
 +                                            (Vvdw_disp - int_bit*c6*iconst->sh_invrc6)/6;
 +                                    }
 +                                }
 +                                
 +                                tx        = fscal*dx;
 +                                ty        = fscal*dy;
 +                                tz        = fscal*dz;
 +                                fix       = fix + tx;
 +                                fiy       = fiy + ty;
 +                                fiz       = fiz + tz;
 +                                f[jfs+0] -= tx; /* Equal and opposite force on the j-atom */
 +                                f[jfs+1] -= ty;
 +                                f[jfs+2] -= tz;
 +                            }
 +                            
 +                            f[ifs+0]        += fix;
 +                            f[ifs+1]        += fiy;
 +                            f[ifs+2]        += fiz;
 +                            fshift[ish3]     = fshift[ish3]   + fix; /* The i-force also goes to the shift force of this list entry */
 +                            fshift[ish3+1]   = fshift[ish3+1] + fiy;
 +                            fshift[ish3+2]   = fshift[ish3+2] + fiz;
 +
 +                            /* Count in half work-units.
 +                             * In CUDA one work-unit is 2 warps.
 +                             */
 +                            if ((ic+1) % (CL_SIZE/2) == 0) /* Every CL_SIZE/2 i-atoms complete one half work-unit */
 +                            {
 +                                npair_tot += npair;
 +
 +                                nhwu++;
 +                                if (within_rlist)
 +                                {
 +                                    nhwu_pruned++;
 +                                }
 +
 +                                within_rlist = FALSE;
 +                                npair        = 0;
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +        
 +        if (bEner)
 +        {
 +            ggid = 0; /* All energies go to element 0: no energy-group split in this kernel */
 +            Vc[ggid]         = Vc[ggid]   + vctot;
 +            Vvdw[ggid]       = Vvdw[ggid] + Vvdwtot;
 +        }
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"number of half %dx%d atom pairs: %d after pruning: %d fraction %4.2f\n",
 +                nbl->na_ci,nbl->na_ci,
 +                nhwu,nhwu_pruned,nhwu_pruned/(double)nhwu); /* NOTE(review): nhwu is 0 when no cluster pair was processed, giving a division by zero */
 +        fprintf(debug,"generic kernel pair interactions:            %d\n",
 +                nhwu*nbl->na_ci/2*nbl->na_ci);
 +        fprintf(debug,"generic kernel post-prune pair interactions: %d\n",
 +                nhwu_pruned*nbl->na_ci/2*nbl->na_ci);
 +        fprintf(debug,"generic kernel non-zero pair interactions:   %d\n",
 +                npair_tot);
 +        fprintf(debug,"ratio non-zero/post-prune pair interactions: %4.2f\n",
 +                npair_tot/(double)(nhwu_pruned*nbl->na_ci/2*nbl->na_ci)); /* NOTE(review): same division-by-zero hazard when nhwu_pruned is 0 */
 +    }
 +}
index b3b95c9c1c9af93b0a72b536427bd4b60162d4e5,0000000000000000000000000000000000000000..05e3e494473f48d31df5931adc21c8442fb9bd4d
mode 100644,000000..100644
--- /dev/null
@@@ -1,255 -1,0 +1,255 @@@
-             clear_f(nbat,out->f);
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2009, The GROMACS Development Team
 + *
 + * Gromacs is a library for molecular simulation and trajectory analysis,
 + * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 + * a full list of developers and information, check out http://www.gromacs.org
 + *
 + * This program is free software; you can redistribute it and/or modify it under
 + * the terms of the GNU Lesser General Public License as published by the Free
 + * Software Foundation; either version 2 of the License, or (at your option) any
 + * later version.
 + * As a special exception, you may use this file as part of a free software
 + * library without restriction.  Specifically, if other files instantiate
 + * templates or use macros or inline functions from this file, or you compile
 + * this file and link it with other files to produce an executable, this
 + * file does not by itself cause the resulting executable to be covered by
 + * the GNU Lesser General Public License.
 + *
 + * In plain-speak: do not worry about classes/macros/templates either - only
 + * changes to the library have to be LGPL, not an application linking with it.
 + *
 + * To help fund GROMACS development, we humbly ask that you cite
 + * the papers people have written on it - you can find them on the website!
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "typedefs.h"
 +#include "vec.h"
 +#include "smalloc.h"
 +#include "force.h"
 +#include "gmx_omp_nthreads.h"
 +#include "nbnxn_kernel_ref.h"
 +#include "../nbnxn_consts.h"
 +#include "nbnxn_kernel_common.h"
 +
 +/* Analytical reaction-field kernels: nbnxn_kernel_ref_outer.h is included once per macro combination */
 +#define CALC_COUL_RF
 +
 +/* Include the force+energy kernels */
 +#define CALC_ENERGIES
 +#include "nbnxn_kernel_ref_outer.h"
 +#undef CALC_ENERGIES
 +
 +/* Include the force+energygroups kernels */
 +#define CALC_ENERGIES
 +#define ENERGY_GROUPS
 +#include "nbnxn_kernel_ref_outer.h"
 +#undef ENERGY_GROUPS
 +#undef CALC_ENERGIES
 +
 +/* Include the force only kernels (neither CALC_ENERGIES nor ENERGY_GROUPS is defined here) */
 +#include "nbnxn_kernel_ref_outer.h"
 +
 +#undef CALC_COUL_RF
 +
 +
 +/* Tabulated exclusion interaction electrostatics kernels */
 +#define CALC_COUL_TAB
 +
 +/* Include the force+energy kernels */
 +#define CALC_ENERGIES
 +#include "nbnxn_kernel_ref_outer.h"
 +#undef CALC_ENERGIES
 +
 +/* Include the force+energygroups kernels */
 +#define CALC_ENERGIES
 +#define ENERGY_GROUPS
 +#include "nbnxn_kernel_ref_outer.h"
 +#undef ENERGY_GROUPS
 +#undef CALC_ENERGIES
 +
 +/* Include the force only kernels */
 +#include "nbnxn_kernel_ref_outer.h"
 +
 +/* Twin-range cut-off kernels; CALC_COUL_TAB is still defined for these includes */
 +#define VDW_CUTOFF_CHECK
 +
 +/* Include the force+energy kernels */
 +#define CALC_ENERGIES
 +#include "nbnxn_kernel_ref_outer.h"
 +#undef CALC_ENERGIES
 +
 +/* Include the force+energygroups kernels */
 +#define CALC_ENERGIES
 +#define ENERGY_GROUPS
 +#include "nbnxn_kernel_ref_outer.h"
 +#undef ENERGY_GROUPS
 +#undef CALC_ENERGIES
 +
 +/* Include the force only kernels */
 +#include "nbnxn_kernel_ref_outer.h"
 +
 +#undef VDW_CUTOFF_CHECK
 +
 +#undef CALC_COUL_TAB
 +
 +
 +typedef void (*p_nbk_func_ener)(const nbnxn_pairlist_t     *nbl, /* Kernel signature with energy output; used for the ener and energrp tables */
 +                                const nbnxn_atomdata_t     *nbat,
 +                                const interaction_const_t  *ic,
 +                                rvec                       *shift_vec,
 +                                real                       *f,
 +                                real                       *fshift,
 +                                real                       *Vvdw,
 +                                real                       *Vc);
 +
 +typedef void (*p_nbk_func_noener)(const nbnxn_pairlist_t     *nbl, /* Kernel signature without energy output: same as p_nbk_func_ener minus Vvdw and Vc */
 +                                  const nbnxn_atomdata_t     *nbat,
 +                                  const interaction_const_t  *ic,
 +                                  rvec                       *shift_vec,
 +                                  real                       *f,
 +                                  real                       *fshift);
 +
 +enum { coultRF, coultTAB, coultTAB_TWIN, coultNR }; /* Index into the kernel tables below; coultNR is the table size */
 +
 +p_nbk_func_ener p_nbk_c_ener[coultNR] = /* Force+energy kernels, in coult* order. NOTE(review): non-static global; check whether external linkage is needed */
 +{ nbnxn_kernel_ref_rf_ener,
 +  nbnxn_kernel_ref_tab_ener,
 +  nbnxn_kernel_ref_tab_twin_ener };
 +
 +p_nbk_func_ener p_nbk_c_energrp[coultNR] = /* Force+energy-group kernels, in coult* order */
 +{ nbnxn_kernel_ref_rf_energrp,
 +  nbnxn_kernel_ref_tab_energrp,
 +  nbnxn_kernel_ref_tab_twin_energrp};
 +
 +p_nbk_func_noener p_nbk_c_noener[coultNR] = /* Force-only kernels, in coult* order */
 +{ nbnxn_kernel_ref_rf_noener,
 +  nbnxn_kernel_ref_tab_noener,
 +  nbnxn_kernel_ref_tab_twin_noener };
 +
 +void /* Run the plain-C reference kernel on every pair list of nbl_list, one OpenMP loop iteration per list */
 +nbnxn_kernel_ref(const nbnxn_pairlist_set_t *nbl_list,
 +                 const nbnxn_atomdata_t     *nbat,
 +                 const interaction_const_t  *ic,
 +                 rvec                       *shift_vec,
 +                 int                        force_flags, /* GMX_FORCE_ENERGY and GMX_FORCE_VIRIAL are tested */
 +                 int                        clearF,      /* enbvClearFYes: clear the per-list output buffers first */
 +                 real                       *fshift,     /* Written directly only when the virial is requested and there is one list */
 +                 real                       *Vc,         /* Receives the reduced energies when GMX_FORCE_ENERGY is set */
 +                 real                       *Vvdw)
 +{
 +    int              nnbl;
 +    nbnxn_pairlist_t **nbl;
 +    int coult;
 +    int nb;
 +
 +    nnbl = nbl_list->nnbl;
 +    nbl  = nbl_list->nbl;
 +
 +    if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT) /* Choose the kernel flavor, an index into the p_nbk_c_* tables */
 +    {
 +        coult = coultRF;
 +    }
 +    else
 +    {
 +        if (ic->rcoulomb == ic->rvdw)
 +        {
 +            coult = coultTAB;
 +        }
 +        else
 +        {
 +            coult = coultTAB_TWIN; /* Different Coulomb and VdW cut-offs */
 +        }
 +    }
 +
 +#pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded))
 +    for(nb=0; nb<nnbl; nb++)
 +    {
 +        nbnxn_atomdata_output_t *out;
 +        real *fshift_p;
 +
 +        out = &nbat->out[nb]; /* Each list accumulates into its own output buffer */
 +
 +        if (clearF == enbvClearFYes)
 +        {
++            clear_f(nbat,nb,out->f); /* nb doubles as the buffer-flag bit index */
 +        }
 +
 +        if ((force_flags & GMX_FORCE_VIRIAL) && nnbl == 1)
 +        {
 +            fshift_p = fshift; /* NOTE(review): not cleared here even when clearF is set; presumably the caller clears it - confirm */
 +        }
 +        else
 +        {
 +            fshift_p = out->fshift; /* NOTE(review): not reduced into fshift in this function; presumably done elsewhere - confirm */
 +
 +            if (clearF == enbvClearFYes)
 +            {
 +                clear_fshift(fshift_p);
 +            }
 +        }
 +
 +        if (!(force_flags & GMX_FORCE_ENERGY))
 +        {
 +            /* Don't calculate energies */
 +            p_nbk_c_noener[coult](nbl[nb],nbat,
 +                                  ic,
 +                                  shift_vec,
 +                                  out->f,
 +                                  fshift_p);
 +        }
 +        else if (out->nV == 1)
 +        {
 +            /* No energy groups */
 +            out->Vvdw[0] = 0; /* Energy outputs are always zeroed, independent of clearF */
 +            out->Vc[0]   = 0;
 +
 +            p_nbk_c_ener[coult](nbl[nb],nbat,
 +                                ic,
 +                                shift_vec,
 +                                out->f,
 +                                fshift_p,
 +                                out->Vvdw,
 +                                out->Vc);
 +        }
 +        else
 +        {
 +            /* Calculate energy group contributions */
 +            int i;
 +
 +            for(i=0; i<out->nV; i++)
 +            {
 +                out->Vvdw[i] = 0;
 +            }
 +            for(i=0; i<out->nV; i++)
 +            {
 +                out->Vc[i] = 0;
 +            }
 +
 +            p_nbk_c_energrp[coult](nbl[nb],nbat,
 +                                   ic,
 +                                   shift_vec,
 +                                   out->f,
 +                                   fshift_p,
 +                                   out->Vvdw,
 +                                   out->Vc);
 +        }
 +    }
 +
 +    if (force_flags & GMX_FORCE_ENERGY)
 +    {
 +        reduce_energies_over_lists(nbat,nnbl,Vvdw,Vc); /* Adds the per-list energies into the caller's Vvdw and Vc */
 +    }
 +}
index bdbe504e12bd4155bd60e2a997c2321da4b8aba7,0000000000000000000000000000000000000000..64fec48dc280e6cc86e3c6bad82e035e2dd34d07
mode 100644,000000..100644
--- /dev/null
@@@ -1,322 -1,0 +1,322 @@@
-             clear_f(nbat,out->f);
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "typedefs.h"
 +#include "vec.h"
 +#include "smalloc.h"
 +#include "force.h"
 +#include "gmx_omp_nthreads.h"
 +#include "../nbnxn_consts.h"
 +#include "nbnxn_kernel_common.h"
 +
 +#ifdef GMX_X86_SSE2
 +
 +#include "nbnxn_kernel_x86_simd128.h"
 +
 +/* Include all flavors of the 128-bit SSE or AVX kernel loops */
 +
 +#define GMX_MM128_HERE
 +
 +/* Analytical reaction-field kernels */
 +#define CALC_COUL_RF
 +
 +#include "nbnxn_kernel_x86_simd_includes.h"
 +
 +#undef CALC_COUL_RF
 +
 +/* Tabulated exclusion interaction electrostatics kernels */
 +#define CALC_COUL_TAB
 +
 +/* Single cut-off: rcoulomb = rvdw */
 +#include "nbnxn_kernel_x86_simd_includes.h"
 +
 +/* Twin cut-off: rcoulomb >= rvdw */
 +#define VDW_CUTOFF_CHECK
 +#include "nbnxn_kernel_x86_simd_includes.h"
 +#undef VDW_CUTOFF_CHECK
 +
 +#undef CALC_COUL_TAB
 +
 +/* Analytical Ewald exclusion interaction electrostatics kernels */
 +#define CALC_COUL_EWALD
 +
 +/* Single cut-off: rcoulomb = rvdw */
 +#include "nbnxn_kernel_x86_simd_includes.h"
 +
 +/* Twin cut-off: rcoulomb >= rvdw */
 +#define VDW_CUTOFF_CHECK
 +#include "nbnxn_kernel_x86_simd_includes.h"
 +#undef VDW_CUTOFF_CHECK
 +
 +#undef CALC_COUL_EWALD
 +
 +
 +typedef void (*p_nbk_func_ener)(const nbnxn_pairlist_t     *nbl,
 +                                const nbnxn_atomdata_t     *nbat,
 +                                const interaction_const_t  *ic,
 +                                rvec                       *shift_vec,
 +                                real                       *f,
 +                                real                       *fshift,
 +                                real                       *Vvdw,
 +                                real                       *Vc); /* kernel signature that takes energy arrays Vvdw/Vc */
 +
 +typedef void (*p_nbk_func_noener)(const nbnxn_pairlist_t     *nbl,
 +                                  const nbnxn_atomdata_t     *nbat,
 +                                  const interaction_const_t  *ic,
 +                                  rvec                       *shift_vec,
 +                                  real                       *f,
 +                                  real                       *fshift); /* kernel signature without energy arrays */
 +
 +enum { coultRF, coultTAB, coultTAB_TWIN, coultEWALD, coultEWALD_TWIN, coultNR }; /* electrostatics flavor: first index of the tables below */
 +
 +#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd128_##elec##_comb_##ljcomb##_ener
 +static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] = /* "_ener" kernels, indexed [coult][LJ combination rule] */
 +{ { NBK_FN(rf        ,geom), NBK_FN(rf        ,lb), NBK_FN(rf        ,none) }, /* NOTE(review): columns assume ljcr* order geom, lb, none - confirm against the enum */
 +  { NBK_FN(tab       ,geom), NBK_FN(tab       ,lb), NBK_FN(tab       ,none) },
 +  { NBK_FN(tab_twin  ,geom), NBK_FN(tab_twin  ,lb), NBK_FN(tab_twin  ,none) },
 +  { NBK_FN(ewald     ,geom), NBK_FN(ewald     ,lb), NBK_FN(ewald     ,none) },
 +  { NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } };
 +#undef NBK_FN
 +
 +#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd128_##elec##_comb_##ljcomb##_energrp
 +static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] = /* "_energrp" kernels, same indexing; rows follow the coult* enum order */
 +{ { NBK_FN(rf        ,geom), NBK_FN(rf        ,lb), NBK_FN(rf        ,none) },
 +  { NBK_FN(tab       ,geom), NBK_FN(tab       ,lb), NBK_FN(tab       ,none) },
 +  { NBK_FN(tab_twin  ,geom), NBK_FN(tab_twin  ,lb), NBK_FN(tab_twin  ,none) },
 +  { NBK_FN(ewald     ,geom), NBK_FN(ewald     ,lb), NBK_FN(ewald     ,none) },
 +  { NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } };
 +#undef NBK_FN
 +
 +#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd128_##elec##_comb_##ljcomb##_noener
 +static p_nbk_func_noener p_nbk_noener[coultNR][ljcrNR] = /* "_noener" kernels, same indexing; rows follow the coult* enum order */
 +{ { NBK_FN(rf        ,geom), NBK_FN(rf        ,lb), NBK_FN(rf        ,none) },
 +  { NBK_FN(tab       ,geom), NBK_FN(tab       ,lb), NBK_FN(tab       ,none) },
 +  { NBK_FN(tab_twin  ,geom), NBK_FN(tab_twin  ,lb), NBK_FN(tab_twin  ,none) },
 +  { NBK_FN(ewald     ,geom), NBK_FN(ewald     ,lb), NBK_FN(ewald     ,none) },
 +  { NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } };
 +#undef NBK_FN
 +
 +
 +static void reduce_group_energies(int ng,int ng_2log,
 +                                  const real *VSvdw,const real *VSc,
 +                                  real *Vvdw,real *Vc)
 +{   /* Sums the SIMD-layout buffers VSvdw/VSc into the ng*ng arrays Vvdw/Vc */
 +    int ng_p2,i,j,j0,j1,c,s;
 +
 +#define SIMD_WIDTH       (GMX_X86_SIMD_WIDTH_HERE)
 +#define SIMD_WIDTH_HALF  (GMX_X86_SIMD_WIDTH_HERE/2)
 +
 +    ng_p2 = (1<<ng_2log); /* 2^ng_2log: scales the (i,j1) part of the buffer index */
 +
 +    /* The size of the x86 SIMD energy group buffer array is:
 +     * ng*ng*ng_p2*SIMD_WIDTH_HALF*SIMD_WIDTH
 +     */
 +    for(i=0; i<ng; i++)
 +    {
 +        for(j=0; j<ng; j++) /* clear output row i before accumulating into it */
 +        {
 +            Vvdw[i*ng+j] = 0;
 +            Vc[i*ng+j]   = 0;
 +        }
 +
 +        for(j1=0; j1<ng; j1++)
 +        {
 +            for(j0=0; j0<ng; j0++)
 +            {
 +                c = ((i*ng + j1)*ng_p2 + j0)*SIMD_WIDTH_HALF*SIMD_WIDTH; /* first buffer entry for (i,j1,j0) */
 +                for(s=0; s<SIMD_WIDTH_HALF; s++)
 +                {
 +                    Vvdw[i*ng+j0] += VSvdw[c+0]; /* entry c+0 is added to output (i,j0) */
 +                    Vvdw[i*ng+j1] += VSvdw[c+1]; /* entry c+1 is added to output (i,j1) */
 +                    Vc  [i*ng+j0] += VSc  [c+0];
 +                    Vc  [i*ng+j1] += VSc  [c+1];
 +                    c += SIMD_WIDTH + 2; /* NOTE(review): stride comes from the kernels' buffer layout, not visible here - confirm */
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +#endif /* GMX_X86_SSE2 */
 +
 +void
 +nbnxn_kernel_x86_simd128(nbnxn_pairlist_set_t       *nbl_list,
 +                         const nbnxn_atomdata_t     *nbat,
 +                         const interaction_const_t  *ic,
 +                         int                        ewald_excl,
 +                         rvec                       *shift_vec, 
 +                         int                        force_flags,
 +                         int                        clearF,
 +                         real                       *fshift,
 +                         real                       *Vc,
 +                         real                       *Vvdw)
 +#ifdef GMX_X86_SSE2
 +{   /* Selects and runs a 128-bit SIMD kernel for every pair list in nbl_list */
 +    int              nnbl;
 +    nbnxn_pairlist_t **nbl;
 +    int coult;
 +    int nb;
 +
 +    nnbl = nbl_list->nnbl;
 +    nbl  = nbl_list->nbl;
 +
 +    if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
 +    {
 +        coult = coultRF; /* plain cut-off uses the reaction-field kernels too */
 +    }
 +    else
 +    {
 +        if (ewald_excl == ewaldexclTable)
 +        {
 +            if (ic->rcoulomb == ic->rvdw) /* single cut-off */
 +            {
 +                coult = coultTAB;
 +            }
 +            else
 +            {
 +                coult = coultTAB_TWIN; /* cut-offs differ: twin cut-off kernel */
 +            }
 +        }
 +        else
 +        {
 +            if (ic->rcoulomb == ic->rvdw) /* single cut-off */
 +            {
 +                coult = coultEWALD;
 +            }
 +            else
 +            {
 +                coult = coultEWALD_TWIN; /* cut-offs differ: twin cut-off kernel */
 +            }
 +        }
 +    }
 +
 +#pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded))
 +    for(nb=0; nb<nnbl; nb++) /* one iteration per pair list, each with its own nbat->out[nb] */
 +    {
 +        nbnxn_atomdata_output_t *out;
 +        real *fshift_p;
 +
 +        out = &nbat->out[nb];
 +
 +        if (clearF == enbvClearFYes)
 +        {
++            clear_f(nbat,nb,out->f);
 +        }
 +
 +        if ((force_flags & GMX_FORCE_VIRIAL) && nnbl == 1)
 +        {
 +            fshift_p = fshift; /* single list with virial: hand the caller's array to the kernel */
 +        }
 +        else
 +        {
 +            fshift_p = out->fshift; /* otherwise use this list's own shift-force buffer */
 +
 +            if (clearF == enbvClearFYes)
 +            {
 +                clear_fshift(fshift_p);
 +            }
 +        }
 +
 +        /* With Ewald type electrostatics the forces for excluded atom pairs
 +         * should not contribute to the virial sum. The exclusion forces
 +         * are not calculated in the energy kernels, but are in _noener.
 +         */
 +        if (!((force_flags & GMX_FORCE_ENERGY) ||
 +              (EEL_FULL(ic->eeltype) && (force_flags & GMX_FORCE_VIRIAL))))
 +        {
 +            /* Don't calculate energies */
 +            p_nbk_noener[coult][nbat->comb_rule](nbl[nb],nbat,
 +                                                 ic,
 +                                                 shift_vec,
 +                                                 out->f,
 +                                                 fshift_p);
 +        }
 +        else if (out->nV == 1 || !(force_flags & GMX_FORCE_ENERGY))
 +        {
 +            /* No energy groups, or no energies requested but the Ewald virial case above applies */
 +            out->Vvdw[0] = 0;
 +            out->Vc[0]   = 0;
 +
 +            p_nbk_ener[coult][nbat->comb_rule](nbl[nb],nbat,
 +                                               ic,
 +                                               shift_vec,
 +                                               out->f,
 +                                               fshift_p,
 +                                               out->Vvdw,
 +                                               out->Vc);
 +        }
 +        else
 +        {
 +            /* Calculate energy group contributions */
 +            int i;
 +
 +            for(i=0; i<out->nVS; i++) /* zero the SIMD-layout group buffers before the kernel runs */
 +            {
 +                out->VSvdw[i] = 0;
 +            }
 +            for(i=0; i<out->nVS; i++)
 +            {
 +                out->VSc[i] = 0;
 +            }
 +
 +            p_nbk_energrp[coult][nbat->comb_rule](nbl[nb],nbat,
 +                                                  ic,
 +                                                  shift_vec,
 +                                                  out->f,
 +                                                  fshift_p,
 +                                                  out->VSvdw,
 +                                                  out->VSc);
 +
 +            reduce_group_energies(nbat->nenergrp,nbat->neg_2log,
 +                                  out->VSvdw,out->VSc,
 +                                  out->Vvdw,out->Vc); /* fold the SIMD-layout buffers into out->Vvdw/out->Vc */
 +        }
 +    }
 +
 +    if (force_flags & GMX_FORCE_ENERGY)
 +    {
 +        reduce_energies_over_lists(nbat,nnbl,Vvdw,Vc); /* NOTE(review): presumably sums per-list energies into Vvdw/Vc; helper not visible here */
 +    }
 +}
 +#else
 +{   /* Stub for builds without GMX_X86_SSE2: only reports the inconsistent call */
 +    gmx_incons("nbnxn_kernel_x86_simd128 called while GROMACS was configured without SSE enabled");
 +}
 +#endif
index 89aac413cbcf1d313eaef2ce60b048aa230298c5,0000000000000000000000000000000000000000..3711d7bd4fae33bcefcc9d3cbefe930c8193bcb4
mode 100644,000000..100644
--- /dev/null
@@@ -1,322 -1,0 +1,322 @@@
-             clear_f(nbat,out->f);
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +
 +#include "typedefs.h"
 +#include "vec.h"
 +#include "smalloc.h"
 +#include "force.h"
 +#include "gmx_omp_nthreads.h"
 +#include "../nbnxn_consts.h"
 +#include "nbnxn_kernel_common.h"
 +
 +#ifdef GMX_X86_AVX_256
 +
 +#include "nbnxn_kernel_x86_simd256.h"
 +
 +/* Include all flavors of the 256-bit AVX kernel loops */
 +
 +#define GMX_MM256_HERE
 +
 +/* Analytical reaction-field kernels */
 +#define CALC_COUL_RF
 +
 +#include "nbnxn_kernel_x86_simd_includes.h"
 +
 +#undef CALC_COUL_RF
 +
 +/* Tabulated exclusion interaction electrostatics kernels */
 +#define CALC_COUL_TAB
 +
 +/* Single cut-off: rcoulomb = rvdw */
 +#include "nbnxn_kernel_x86_simd_includes.h"
 +
 +/* Twin cut-off: rcoulomb >= rvdw */
 +#define VDW_CUTOFF_CHECK
 +#include "nbnxn_kernel_x86_simd_includes.h"
 +#undef VDW_CUTOFF_CHECK
 +
 +#undef CALC_COUL_TAB
 +
 +/* Analytical Ewald exclusion interaction electrostatics kernels */
 +#define CALC_COUL_EWALD
 +
 +/* Single cut-off: rcoulomb = rvdw */
 +#include "nbnxn_kernel_x86_simd_includes.h"
 +
 +/* Twin cut-off: rcoulomb >= rvdw */
 +#define VDW_CUTOFF_CHECK
 +#include "nbnxn_kernel_x86_simd_includes.h"
 +#undef VDW_CUTOFF_CHECK
 +
 +#undef CALC_COUL_EWALD
 +
 +
 +typedef void (*p_nbk_func_ener)(const nbnxn_pairlist_t     *nbl,
 +                                const nbnxn_atomdata_t     *nbat,
 +                                const interaction_const_t  *ic,
 +                                rvec                       *shift_vec,
 +                                real                       *f,
 +                                real                       *fshift,
 +                                real                       *Vvdw,
 +                                real                       *Vc); /* kernel signature that takes energy arrays Vvdw/Vc */
 +
 +typedef void (*p_nbk_func_noener)(const nbnxn_pairlist_t     *nbl,
 +                                  const nbnxn_atomdata_t     *nbat,
 +                                  const interaction_const_t  *ic,
 +                                  rvec                       *shift_vec,
 +                                  real                       *f,
 +                                  real                       *fshift); /* kernel signature without energy arrays */
 +
 +enum { coultRF, coultTAB, coultTAB_TWIN, coultEWALD, coultEWALD_TWIN, coultNR }; /* electrostatics flavor: first index of the tables below */
 +
 +#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd256_##elec##_comb_##ljcomb##_ener
 +static p_nbk_func_ener p_nbk_ener[coultNR][ljcrNR] = /* "_ener" kernels, indexed [coult][LJ combination rule] */
 +{ { NBK_FN(rf        ,geom), NBK_FN(rf        ,lb), NBK_FN(rf        ,none) }, /* NOTE(review): columns assume ljcr* order geom, lb, none - confirm against the enum */
 +  { NBK_FN(tab       ,geom), NBK_FN(tab       ,lb), NBK_FN(tab       ,none) },
 +  { NBK_FN(tab_twin  ,geom), NBK_FN(tab_twin  ,lb), NBK_FN(tab_twin  ,none) },
 +  { NBK_FN(ewald     ,geom), NBK_FN(ewald     ,lb), NBK_FN(ewald     ,none) },
 +  { NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } };
 +#undef NBK_FN
 +
 +#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd256_##elec##_comb_##ljcomb##_energrp
 +static p_nbk_func_ener p_nbk_energrp[coultNR][ljcrNR] = /* "_energrp" kernels, same indexing; rows follow the coult* enum order */
 +{ { NBK_FN(rf        ,geom), NBK_FN(rf        ,lb), NBK_FN(rf        ,none) },
 +  { NBK_FN(tab       ,geom), NBK_FN(tab       ,lb), NBK_FN(tab       ,none) },
 +  { NBK_FN(tab_twin  ,geom), NBK_FN(tab_twin  ,lb), NBK_FN(tab_twin  ,none) },
 +  { NBK_FN(ewald     ,geom), NBK_FN(ewald     ,lb), NBK_FN(ewald     ,none) },
 +  { NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } };
 +#undef NBK_FN
 +
 +#define NBK_FN(elec,ljcomb) nbnxn_kernel_x86_simd256_##elec##_comb_##ljcomb##_noener
 +static p_nbk_func_noener p_nbk_noener[coultNR][ljcrNR] = /* "_noener" kernels, same indexing; rows follow the coult* enum order */
 +{ { NBK_FN(rf        ,geom), NBK_FN(rf        ,lb), NBK_FN(rf        ,none) },
 +  { NBK_FN(tab       ,geom), NBK_FN(tab       ,lb), NBK_FN(tab       ,none) },
 +  { NBK_FN(tab_twin  ,geom), NBK_FN(tab_twin  ,lb), NBK_FN(tab_twin  ,none) },
 +  { NBK_FN(ewald     ,geom), NBK_FN(ewald     ,lb), NBK_FN(ewald     ,none) },
 +  { NBK_FN(ewald_twin,geom), NBK_FN(ewald_twin,lb), NBK_FN(ewald_twin,none) } };
 +#undef NBK_FN
 +
 +
 +static void reduce_group_energies(int ng,int ng_2log,
 +                                  const real *VSvdw,const real *VSc,
 +                                  real *Vvdw,real *Vc)
 +{   /* Sums the SIMD-layout buffers VSvdw/VSc into the ng*ng arrays Vvdw/Vc */
 +    int ng_p2,i,j,j0,j1,c,s;
 +
 +#define SIMD_WIDTH       (GMX_X86_SIMD_WIDTH_HERE)
 +#define SIMD_WIDTH_HALF  (GMX_X86_SIMD_WIDTH_HERE/2)
 +
 +    ng_p2 = (1<<ng_2log); /* 2^ng_2log: scales the (i,j1) part of the buffer index */
 +
 +    /* The size of the x86 SIMD energy group buffer array is:
 +     * ng*ng*ng_p2*SIMD_WIDTH_HALF*SIMD_WIDTH
 +     */
 +    for(i=0; i<ng; i++)
 +    {
 +        for(j=0; j<ng; j++) /* clear output row i before accumulating into it */
 +        {
 +            Vvdw[i*ng+j] = 0;
 +            Vc[i*ng+j]   = 0;
 +        }
 +
 +        for(j1=0; j1<ng; j1++)
 +        {
 +            for(j0=0; j0<ng; j0++)
 +            {
 +                c = ((i*ng + j1)*ng_p2 + j0)*SIMD_WIDTH_HALF*SIMD_WIDTH; /* first buffer entry for (i,j1,j0) */
 +                for(s=0; s<SIMD_WIDTH_HALF; s++)
 +                {
 +                    Vvdw[i*ng+j0] += VSvdw[c+0]; /* entry c+0 is added to output (i,j0) */
 +                    Vvdw[i*ng+j1] += VSvdw[c+1]; /* entry c+1 is added to output (i,j1) */
 +                    Vc  [i*ng+j0] += VSc  [c+0];
 +                    Vc  [i*ng+j1] += VSc  [c+1];
 +                    c += SIMD_WIDTH + 2; /* NOTE(review): stride comes from the kernels' buffer layout, not visible here - confirm */
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +#endif /* GMX_X86_AVX_256 */
 +
 +void
 +nbnxn_kernel_x86_simd256(nbnxn_pairlist_set_t       *nbl_list,
 +                         const nbnxn_atomdata_t     *nbat,
 +                         const interaction_const_t  *ic,
 +                         int                        ewald_excl,
 +                         rvec                       *shift_vec, 
 +                         int                        force_flags,
 +                         int                        clearF,
 +                         real                       *fshift,
 +                         real                       *Vc,
 +                         real                       *Vvdw)
 +#ifdef GMX_X86_AVX_256
 +{   /* Selects and runs a 256-bit SIMD kernel for every pair list in nbl_list */
 +    int              nnbl;
 +    nbnxn_pairlist_t **nbl;
 +    int coult;
 +    int nb;
 +
 +    nnbl = nbl_list->nnbl;
 +    nbl  = nbl_list->nbl;
 +
 +    if (EEL_RF(ic->eeltype) || ic->eeltype == eelCUT)
 +    {
 +        coult = coultRF; /* plain cut-off uses the reaction-field kernels too */
 +    }
 +    else
 +    {
 +        if (ewald_excl == ewaldexclTable)
 +        {
 +            if (ic->rcoulomb == ic->rvdw) /* single cut-off */
 +            {
 +                coult = coultTAB;
 +            }
 +            else
 +            {
 +                coult = coultTAB_TWIN; /* cut-offs differ: twin cut-off kernel */
 +            }
 +        }
 +        else
 +        {
 +            if (ic->rcoulomb == ic->rvdw) /* single cut-off */
 +            {
 +                coult = coultEWALD;
 +            }
 +            else
 +            {
 +                coult = coultEWALD_TWIN; /* cut-offs differ: twin cut-off kernel */
 +            }
 +        }
 +    }
 +
 +#pragma omp parallel for schedule(static) num_threads(gmx_omp_nthreads_get(emntNonbonded))
 +    for(nb=0; nb<nnbl; nb++) /* one iteration per pair list, each with its own nbat->out[nb] */
 +    {
 +        nbnxn_atomdata_output_t *out;
 +        real *fshift_p;
 +
 +        out = &nbat->out[nb];
 +
 +        if (clearF == enbvClearFYes)
 +        {
++            clear_f(nbat,nb,out->f);
 +        }
 +
 +        if ((force_flags & GMX_FORCE_VIRIAL) && nnbl == 1)
 +        {
 +            fshift_p = fshift; /* single list with virial: hand the caller's array to the kernel */
 +        }
 +        else
 +        {
 +            fshift_p = out->fshift; /* otherwise use this list's own shift-force buffer */
 +
 +            if (clearF == enbvClearFYes)
 +            {
 +                clear_fshift(fshift_p);
 +            }
 +        }
 +
 +        /* With Ewald type electrostatics the forces for excluded atom pairs
 +         * should not contribute to the virial sum. The exclusion forces
 +         * are not calculated in the energy kernels, but are in _noener.
 +         */
 +        if (!((force_flags & GMX_FORCE_ENERGY) ||
 +              (EEL_FULL(ic->eeltype) && (force_flags & GMX_FORCE_VIRIAL))))
 +        {
 +            /* Don't calculate energies */
 +            p_nbk_noener[coult][nbat->comb_rule](nbl[nb],nbat,
 +                                                 ic,
 +                                                 shift_vec,
 +                                                 out->f,
 +                                                 fshift_p);
 +        }
 +        else if (out->nV == 1 || !(force_flags & GMX_FORCE_ENERGY))
 +        {
 +            /* No energy groups, or no energies requested but the Ewald virial case above applies */
 +            out->Vvdw[0] = 0;
 +            out->Vc[0]   = 0;
 +
 +            p_nbk_ener[coult][nbat->comb_rule](nbl[nb],nbat,
 +                                               ic,
 +                                               shift_vec,
 +                                               out->f,
 +                                               fshift_p,
 +                                               out->Vvdw,
 +                                               out->Vc);
 +        }
 +        else
 +        {
 +            /* Calculate energy group contributions */
 +            int i;
 +
 +            for(i=0; i<out->nVS; i++) /* zero the SIMD-layout group buffers before the kernel runs */
 +            {
 +                out->VSvdw[i] = 0;
 +            }
 +            for(i=0; i<out->nVS; i++)
 +            {
 +                out->VSc[i] = 0;
 +            }
 +
 +            p_nbk_energrp[coult][nbat->comb_rule](nbl[nb],nbat,
 +                                                  ic,
 +                                                  shift_vec,
 +                                                  out->f,
 +                                                  fshift_p,
 +                                                  out->VSvdw,
 +                                                  out->VSc);
 +
 +            reduce_group_energies(nbat->nenergrp,nbat->neg_2log,
 +                                  out->VSvdw,out->VSc,
 +                                  out->Vvdw,out->Vc); /* fold the SIMD-layout buffers into out->Vvdw/out->Vc */
 +        }
 +    }
 +
 +    if (force_flags & GMX_FORCE_ENERGY)
 +    {
 +        reduce_energies_over_lists(nbat,nnbl,Vvdw,Vc); /* NOTE(review): presumably sums per-list energies into Vvdw/Vc; helper not visible here */
 +    }
 +}
 +#else
 +{   /* Stub for builds without GMX_X86_AVX_256: only reports the inconsistent call */
 +    gmx_incons("nbnxn_kernel_x86_simd256 called while GROMACS was configured without AVX enabled");
 +}
 +#endif
index 8441ccce62fbdb827953dd20a6d85e6b5abdca77,0000000000000000000000000000000000000000..56ac987c6ee04bb8809973264a8d7dac10bfdfbb
mode 100644,000000..100644
--- /dev/null
@@@ -1,4944 -1,0 +1,4925 @@@
-                           "grid cell cx %d cy %d out of range (max %d %d)",
-                           cx,cy,grid->ncx,grid->ncy);
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + */
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +#include <string.h>
 +#include "sysstuff.h"
 +#include "smalloc.h"
 +#include "macros.h"
 +#include "maths.h"
 +#include "vec.h"
 +#include "pbc.h"
 +#include "nbnxn_consts.h"
 +#include "nbnxn_internal.h"
 +#include "nbnxn_atomdata.h"
 +#include "nbnxn_search.h"
 +#include "gmx_cyclecounter.h"
 +#include "gmxfio.h"
 +#include "gmx_omp_nthreads.h"
 +#include "nrnb.h"
 +
 +
 +/* Pair search box lower and upper corner in x,y,z.
 + * Store this in 4 instead of 3 reals, which is useful with SSE.
 + * To avoid complicating the code we also use 4 without SSE.
 + */
 +#define NNBSBB_C         4
 +#define NNBSBB_B         (2*NNBSBB_C)
 +/* Pair search box lower and upper bound in z only. */
 +#define NNBSBB_D         2
 +/* Pair search box lower and upper corner x,y,z indices */
 +#define BBL_X  0
 +#define BBL_Y  1
 +#define BBL_Z  2
 +#define BBU_X  4
 +#define BBU_Y  5
 +#define BBU_Z  6
 +
 +
 +#ifdef NBNXN_SEARCH_SSE
 +
 +#ifndef GMX_DOUBLE
 +#define NBNXN_SEARCH_SSE_SINGLE
 +#include "gmx_x86_simd_single.h"
 +#else
 +#include "gmx_x86_simd_double.h"
 +#endif
 +
 +#if defined NBNXN_SEARCH_SSE_SINGLE && GPU_NSUBCELL == 8
 +#define NBNXN_8BB_SSE
 +#endif
 +
 +/* The width of SSE/AVX128 with single precision for bounding boxes with GPU.
 + * Here AVX-256 turns out to be slightly slower than AVX-128.
 + */
 +#define STRIDE_8BB        4
 +#define STRIDE_8BB_2LOG   2
 +
 +
 +/* The functions below are macros as they are performance sensitive */
 +
 +/* 4x4 list, pack=4: no complex conversion required */
 +/* i-cluster to j-cluster conversion */
 +#define CI_TO_CJ_J4(ci)   (ci)
 +/* cluster index to coordinate array index conversion */
 +#define X_IND_CI_J4(ci)  ((ci)*STRIDE_P4)
 +#define X_IND_CJ_J4(cj)  ((cj)*STRIDE_P4)
 +
 +/* 4x2 list, pack=4: j-cluster size is half the packing width */
 +/* i-cluster to j-cluster conversion */
 +#define CI_TO_CJ_J2(ci)  ((ci)<<1)
 +/* cluster index to coordinate array index conversion */
 +#define X_IND_CI_J2(ci)  ((ci)*STRIDE_P4)
 +#define X_IND_CJ_J2(cj)  (((cj)>>1)*STRIDE_P4 + ((cj) & 1)*(PACK_X4>>1))
 +
 +/* 4x8 list, pack=8: i-cluster size is half the packing width */
 +/* i-cluster to j-cluster conversion */
 +#define CI_TO_CJ_J8(ci)  ((ci)>>1)
 +/* cluster index to coordinate array index conversion */
 +#define X_IND_CI_J8(ci)  (((ci)>>1)*STRIDE_P8 + ((ci) & 1)*(PACK_X8>>1))
 +#define X_IND_CJ_J8(cj)  ((cj)*STRIDE_P8)
 +
 +/* The j-cluster size is matched to the SIMD width */
 +#ifndef GMX_DOUBLE
 +/* 128 bits can hold 4 floats */
 +#define CI_TO_CJ_S128(ci)  CI_TO_CJ_J4(ci)
 +#define X_IND_CI_S128(ci)  X_IND_CI_J4(ci)
 +#define X_IND_CJ_S128(cj)  X_IND_CJ_J4(cj)
 +/* 256 bits can hold 8 floats */
 +#define CI_TO_CJ_S256(ci)  CI_TO_CJ_J8(ci)
 +#define X_IND_CI_S256(ci)  X_IND_CI_J8(ci)
 +#define X_IND_CJ_S256(cj)  X_IND_CJ_J8(cj)
 +#else
 +/* 128 bits can hold 2 doubles */
 +#define CI_TO_CJ_S128(ci)  CI_TO_CJ_J2(ci)
 +#define X_IND_CI_S128(ci)  X_IND_CI_J2(ci)
 +#define X_IND_CJ_S128(cj)  X_IND_CJ_J2(cj)
 +/* 256 bits can hold 4 doubles */
 +#define CI_TO_CJ_S256(ci)  CI_TO_CJ_J4(ci)
 +#define X_IND_CI_S256(ci)  X_IND_CI_J4(ci)
 +#define X_IND_CJ_S256(cj)  X_IND_CJ_J4(cj)
 +#endif
 +
 +#endif /* NBNXN_SEARCH_SSE */
 +
 +
 +/* Interaction masks for 4xN atom interactions.
 + * Bit i*CJ_SIZE + j tells if atom i and j interact.
 + */
 +/* The all-interactions mask is the same for all kernels */
 +#define NBNXN_INT_MASK_ALL        0xffffffff
 +/* 4x4 kernel diagonal mask */
 +#define NBNXN_INT_MASK_DIAG       0x08ce
 +/* 4x2 kernel diagonal masks */
 +#define NBNXN_INT_MASK_DIAG_J2_0  0x0002
 +#define NBNXN_INT_MASK_DIAG_J2_1  0x002F
 +/* 4x8 kernel diagonal masks */
 +#define NBNXN_INT_MASK_DIAG_J8_0  0xf0f8fcfe
 +#define NBNXN_INT_MASK_DIAG_J8_1  0x0080c0e0
 +
 +
 +#ifdef NBNXN_SEARCH_SSE
 +/* Store bounding boxes corners as quadruplets: xxxxyyyyzzzz */
 +#define NBNXN_BBXXXX
 +/* Size of bounding box corners quadruplet */
 +#define NNBSBB_XXXX      (NNBSBB_D*DIM*STRIDE_8BB)
 +#endif
 +
 +/* We shift the i-particles backward for PBC.
 + * This leads to more conditionals than shifting forward.
 + * We do this to get more balanced pair lists.
 + */
 +#define NBNXN_SHIFT_BACKWARD
 +
 +
 +/* This define is a lazy way to avoid interdependence of the grid
 + * and searching data structures.
 + */
 +#define NBNXN_NA_SC_MAX (GPU_NSUBCELL*NBNXN_GPU_CLUSTER_SIZE)
 +
 +
 +static void nbs_cycle_clear(nbnxn_cycle_t *cc)
 +{   /* Resets the count and c fields of all enbsCCnr entries of cc to zero */
 +    int i;
 +
 +    for(i=0; i<enbsCCnr; i++)
 +    {
 +        cc[i].count = 0;
 +        cc[i].c     = 0;
 +    }
 +}
 +
 +static double Mcyc_av(const nbnxn_cycle_t *cc)
 +{   /* Returns cc->c scaled by 1e-6 and divided by cc->count, i.e. the average per count in millions */
 +    return (double)cc->c*1e-6/cc->count; /* NOTE(review): no guard here for count == 0 */
 +}
 +
 +static void nbs_cycle_print(FILE *fp,const nbnxn_search_t nbs)
 +{   /* Writes a one-line summary of the averaged cycle counters in nbs to fp */
 +    int n; /* NOTE(review): not used in this function */
 +    int t;
 +
 +    fprintf(fp,"\n");
 +    fprintf(fp,"ns %4d grid %4.1f search %4.1f red.f %5.3f",
 +            nbs->cc[enbsCCgrid].count,
 +            Mcyc_av(&nbs->cc[enbsCCgrid]),
 +            Mcyc_av(&nbs->cc[enbsCCsearch]),
 +            Mcyc_av(&nbs->cc[enbsCCreducef]));
 +
 +    if (nbs->nthread_max > 1) /* with several threads, append combine and per-thread search averages */
 +    {
 +        if (nbs->cc[enbsCCcombine].count > 0) /* skip the combine average when it was never counted */
 +        {
 +            fprintf(fp," comb %5.2f",
 +                    Mcyc_av(&nbs->cc[enbsCCcombine]));
 +        }
 +        fprintf(fp," s. th");
 +        for(t=0; t<nbs->nthread_max; t++) /* one search average per thread work area */
 +        {
 +            fprintf(fp," %4.1f",
 +                    Mcyc_av(&nbs->work[t].cc[enbsCCsearch]));
 +        }
 +    }
 +    fprintf(fp,"\n");
 +}
 +
 +static void nbnxn_grid_init(nbnxn_grid_t * grid)
 +{   /* Sets the grid's array pointers to NULL and its allocation counts to 0 */
 +    grid->cxy_na      = NULL;
 +    grid->cxy_ind     = NULL;
 +    grid->cxy_nalloc  = 0;
 +    grid->bb          = NULL;
 +    grid->bbj         = NULL;
 +    grid->nc_nalloc   = 0;
 +}
 +
 +static int get_2log(int n)
 +{   /* Returns the base-2 logarithm of n; calls gmx_fatal when n is not a power of 2 */
 +    int log2;
 +
 +    log2 = 0;
 +    while ((1<<log2) < n) /* find the smallest log2 with 2^log2 >= n */
 +    {
 +        log2++;
 +    }
 +    if ((1<<log2) != n) /* n lies strictly between two powers of 2, or is below 1 */
 +    {
 +        gmx_fatal(FARGS,"nbnxn na_c (%d) is not a power of 2",n);
 +    }
 +
 +    return log2;
 +}
 +
 +static int nbnxn_kernel_to_ci_size(int nb_kernel_type)
 +{   /* Maps a kernel type to its cluster size constant */
 +    switch (nb_kernel_type)
 +    {
 +    case nbk4x4_PlainC:
 +    case nbk4xN_X86_SIMD128:
 +    case nbk4xN_X86_SIMD256:
 +        return NBNXN_CPU_CLUSTER_I_SIZE; /* all 4x4 and 4xN kernel types share this size */
 +    case nbk8x8x8_CUDA:
 +    case nbk8x8x8_PlainC:
 +        /* The cluster size for super/sub lists is only set here.
 +         * Any value should work for the pair-search and atomdata code.
 +         * The kernels, of course, might require a particular value.
 +         */
 +        return NBNXN_GPU_CLUSTER_SIZE;
 +    default:
 +        gmx_incons("unknown kernel type");
 +    }
 +
 +    return 0; /* NOTE(review): reached only if gmx_incons returns - confirm */
 +}
 +
 +int nbnxn_kernel_to_cj_size(int nb_kernel_type)
 +{   /* Maps a kernel type to its j-cluster size */
 +    switch (nb_kernel_type)
 +    {
 +    case nbk4x4_PlainC:
 +        return NBNXN_CPU_CLUSTER_I_SIZE; /* plain C 4x4: same constant as the i-cluster size */
 +    case nbk4xN_X86_SIMD128:
 +        /* Number of reals that fit in SIMD (128 bits = 16 bytes) */
 +        return 16/sizeof(real);
 +    case nbk4xN_X86_SIMD256:
 +        /* Number of reals that fit in SIMD (256 bits = 32 bytes) */
 +        return 32/sizeof(real);
 +    case nbk8x8x8_CUDA:
 +    case nbk8x8x8_PlainC:
 +        return nbnxn_kernel_to_ci_size(nb_kernel_type); /* 8x8x8 types: j size equals the i-cluster size */
 +    default:
 +        gmx_incons("unknown kernel type");
 +    }
 +
 +    return 0; /* NOTE(review): reached only if gmx_incons returns - confirm */
 +}
 +
 +static int ci_to_cj(int na_cj_2log,int ci)
 +{
 +    switch (na_cj_2log)
 +    {
 +    case 2: return  ci;     break;
 +    case 1: return (ci<<1); break;
 +    case 3: return (ci>>1); break;
 +    }
 +
 +    return 0;
 +}
 +
 +gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type)
 +{
 +    if (nb_kernel_type == nbkNotSet)
 +    {
 +        gmx_fatal(FARGS, "Non-bonded kernel type not set for Verlet-style pair-list.");
 +    }
 +
 +    switch (nb_kernel_type)
 +    {
 +    case nbk8x8x8_CUDA:
 +    case nbk8x8x8_PlainC:
 +        return FALSE;
 +
 +    case nbk4x4_PlainC:
 +    case nbk4xN_X86_SIMD128:
 +    case nbk4xN_X86_SIMD256:
 +        return TRUE;
 +
 +    default:
 +        gmx_incons("Invalid nonbonded kernel type passed!");
 +        return FALSE;
 +    }
 +}
 +
 +void nbnxn_init_search(nbnxn_search_t * nbs_ptr,
 +                       ivec *n_dd_cells,
 +                       gmx_domdec_zones_t *zones,
 +                       int nthread_max)
 +{
 +    nbnxn_search_t nbs;
 +    int d,g,t;
 +
 +    snew(nbs,1);
 +    *nbs_ptr = nbs;
 +
 +    nbs->DomDec = (n_dd_cells != NULL);
 +
 +    clear_ivec(nbs->dd_dim);
 +    nbs->ngrid = 1;
 +    if (nbs->DomDec)
 +    {
 +        nbs->zones = zones;
 +
 +        for(d=0; d<DIM; d++)
 +        {
 +            if ((*n_dd_cells)[d] > 1)
 +            {
 +                nbs->dd_dim[d] = 1;
 +                /* Each grid matches a DD zone */
 +                nbs->ngrid *= 2;
 +            }
 +        }
 +    }
 +
 +    snew(nbs->grid,nbs->ngrid);
 +    for(g=0; g<nbs->ngrid; g++)
 +    {
 +        nbnxn_grid_init(&nbs->grid[g]);
 +    }
 +    nbs->cell        = NULL;
 +    nbs->cell_nalloc = 0;
 +    nbs->a           = NULL;
 +    nbs->a_nalloc    = 0;
 +
 +    nbs->nthread_max = nthread_max;
 +
 +    /* Initialize the work data structures for each thread */
 +    snew(nbs->work,nbs->nthread_max);
 +    for(t=0; t<nbs->nthread_max; t++)
 +    {
 +        nbs->work[t].cxy_na           = NULL;
 +        nbs->work[t].cxy_na_nalloc    = 0;
 +        nbs->work[t].sort_work        = NULL;
 +        nbs->work[t].sort_work_nalloc = 0;
 +    }
 +
 +    /* Initialize detailed nbsearch cycle counting */
 +    nbs->print_cycles = (getenv("GMX_NBNXN_CYCLE") != 0);
 +    nbs->search_count = 0;
 +    nbs_cycle_clear(nbs->cc);
 +    for(t=0; t<nbs->nthread_max; t++)
 +    {
 +        nbs_cycle_clear(nbs->work[t].cc);
 +    }
 +}
 +
 +static real grid_atom_density(int n,rvec corner0,rvec corner1)
 +{
 +    rvec size;
 +
 +    rvec_sub(corner1,corner0,size);
 +
 +    return n/(size[XX]*size[YY]*size[ZZ]);
 +}
 +
 +/* Sets the number of grid columns along x and y (grid->ncx, grid->ncy)
 + * for n atoms in the region between corner0 and corner1, and grows
 + * the per-column buffers, the per-thread cxy_na work buffers and
 + * the per-cell buffers of the grid when they are too small.
 + * Also stores the corners, the column sizes sx, sy and their inverses.
 + *
 + * Returns nc_max, the cell count upper bound used for the allocation.
 + *
 + * NOTE(review): the parameter XFormat and the local adens are not used
 + * in this function; nc_max holds an integer count but is declared real.
 + */
 +static int set_grid_size_xy(const nbnxn_search_t nbs,
 +                            nbnxn_grid_t *grid,
 +                            int n,rvec corner0,rvec corner1,
 +                            real atom_density,
 +                            int XFormat)
 +{
 +    rvec size;
 +    int  na_c;
 +    real adens,tlen,tlen_x,tlen_y,nc_max;
 +    int  t;
 +
 +    rvec_sub(corner1,corner0,size);
 +
 +    if (n > grid->na_sc)
 +    {
 +        /* target cell length */
 +        if (grid->bSimple)
 +        {
 +            /* To minimize the zero interactions, we should make
 +             * the largest of the i/j cell cubic.
 +             */
 +            na_c = max(grid->na_c,grid->na_cj);
 +
 +            /* Approximately cubic cells */
 +            tlen   = pow(na_c/atom_density,1.0/3.0);
 +            tlen_x = tlen;
 +            tlen_y = tlen;
 +        }
 +        else
 +        {
 +            /* Approximately cubic sub cells */
 +            tlen   = pow(grid->na_c/atom_density,1.0/3.0);
 +            tlen_x = tlen*GPU_NSUBCELL_X;
 +            tlen_y = tlen*GPU_NSUBCELL_Y;
 +        }
 +        /* We round ncx and ncy down, because we get less cell pairs
 +         * in the nblist when the fixed cell dimensions (x,y) are
 +         * larger than the variable one (z) than the other way around.
 +         */
 +        grid->ncx = max(1,(int)(size[XX]/tlen_x));
 +        grid->ncy = max(1,(int)(size[YY]/tlen_y));
 +    }
 +    else
 +    {
 +        /* Not more atoms than fit in one cell of na_sc atoms: one column */
 +        grid->ncx = 1;
 +        grid->ncy = 1;
 +    }
 +
 +    /* We need one additional cell entry for particles moved by DD */
 +    if (grid->ncx*grid->ncy+1 > grid->cxy_nalloc)
 +    {
 +        grid->cxy_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
 +        srenew(grid->cxy_na,grid->cxy_nalloc);
 +        /* cxy_ind gets one element more than cxy_na */
 +        srenew(grid->cxy_ind,grid->cxy_nalloc+1);
 +    }
 +    for(t=0; t<nbs->nthread_max; t++)
 +    {
 +        if (grid->ncx*grid->ncy+1 > nbs->work[t].cxy_na_nalloc)
 +        {
 +            nbs->work[t].cxy_na_nalloc = over_alloc_large(grid->ncx*grid->ncy+1);
 +            srenew(nbs->work[t].cxy_na,nbs->work[t].cxy_na_nalloc);
 +        }
 +    }
 +
 +    /* Worst case scenario of 1 atom in each last cell */
 +    if (grid->na_cj <= grid->na_c)
 +    {
 +        nc_max = n/grid->na_sc + grid->ncx*grid->ncy;
 +    }
 +    else
 +    {
 +        nc_max = n/grid->na_sc + grid->ncx*grid->ncy*grid->na_cj/grid->na_c;
 +    }
 +
 +    if (nc_max > grid->nc_nalloc)
 +    {
 +        int bb_nalloc;
 +
 +        grid->nc_nalloc = over_alloc_large(nc_max);
 +        srenew(grid->nsubc,grid->nc_nalloc);
 +        srenew(grid->bbcz,grid->nc_nalloc*NNBSBB_D);
 +#ifdef NBNXN_8BB_SSE
 +        bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX;
 +#else
 +        bb_nalloc = grid->nc_nalloc*GPU_NSUBCELL*NNBSBB_B;
 +#endif
 +        /* The old bounding boxes are not preserved: free, then allocate */
 +        sfree_aligned(grid->bb);
 +        /* This snew also zeros the contents, this avoids possible
 +         * floating exceptions in SSE with the unused bb elements.
 +         */
 +        snew_aligned(grid->bb,bb_nalloc,16);
 +
 +        if (grid->bSimple)
 +        {
 +            if (grid->na_cj == grid->na_c)
 +            {
 +                /* Equal i and j cluster sizes: bbj is an alias of bb */
 +                grid->bbj = grid->bb;
 +            }
 +            else
 +            {
 +                sfree_aligned(grid->bbj);
 +                snew_aligned(grid->bbj,bb_nalloc*grid->na_c/grid->na_cj,16);
 +            }
 +        }
 +
 +        srenew(grid->flags,grid->nc_nalloc);
 +    }
 +
 +    copy_rvec(corner0,grid->c0);
 +    copy_rvec(corner1,grid->c1);
 +    grid->sx = size[XX]/grid->ncx;
 +    grid->sy = size[YY]/grid->ncy;
 +    grid->inv_sx = 1/grid->sx;
 +    grid->inv_sy = 1/grid->sy;
 +
 +    return nc_max;
 +}
 +
 +/* Oversize factor of the sorting grid used with sort_atoms */
 +#define SORT_GRID_OVERSIZE 2
 +/* Callers in this file pass nsort as a multiple of SGSF */
 +#define SGSF (SORT_GRID_OVERSIZE + 1)
 +
 +/* Sorts the n atom indices in a on coordinate x[..][dim].
 + *
 + * Each atom is put in slot (int)((x - h0)*invh) of the scratch array
 + * sort, which has nsort slots. When a slot is taken, the atom is
 + * inserted in order of coordinate and then atom index, shifting
 + * later entries up. The filled slots are then copied back to a,
 + * from low to high slot, or from high to low when Backwards is set.
 + * Calls gmx_incons when fewer than n atoms are read back.
 + *
 + * NOTE(review): the slot index is only range-checked when
 + * DEBUG_NBNXN_GRIDDING is defined.
 + */
 +static void sort_atoms(int dim,gmx_bool Backwards,
 +                       int *a,int n,rvec *x,
 +                       real h0,real invh,int nsort,int *sort)
 +{
 +    int i,c;
 +    int zi,zim;
 +    int cp,tmp;
 +
 +    if (n <= 1)
 +    {
 +        /* Nothing to do */
 +        return;
 +    }
 +
 +    /* For small oversize factors clearing the whole area is fastest.
 +     * For large oversize we should clear the used elements after use.
 +     */
 +    for(i=0; i<nsort; i++)
 +    {
 +        sort[i] = -1;
 +    }
 +    /* Sort the particles using a simple index sort */
 +    for(i=0; i<n; i++)
 +    {
 +        /* The cast takes care of float-point rounding effects below zero.
 +         * This code assumes particles are less than 1/SORT_GRID_OVERSIZE
 +         * times the box height out of the box.
 +         */
 +        zi = (int)((x[a[i]][dim] - h0)*invh);
 +
 +#ifdef DEBUG_NBNXN_GRIDDING
 +        if (zi < 0 || zi >= nsort)
 +        {
 +            gmx_fatal(FARGS,"(int)((x[%d][%c]=%f - %f)*%f) = %d, not in 0 - %d\n",
 +                      a[i],'x'+dim,x[a[i]][dim],h0,invh,zi,nsort);
 +        }
 +#endif
 +
 +        /* Ideally this particle should go in sort cell zi,
 +         * but that might already be in use,
 +         * in that case find the first empty cell higher up
 +         */
 +        if (sort[zi] < 0)
 +        {
 +            sort[zi] = a[i];
 +        }
 +        else
 +        {
 +            /* We have multiple atoms in the same sorting slot.
 +             * Sort on real z for minimal bounding box size.
 +             * There is an extra check for identical z to ensure
 +             * well-defined output order, independent of input order
 +             * to ensure binary reproducibility after restarts.
 +             */
 +            while(sort[zi] >= 0 && ( x[a[i]][dim] >  x[sort[zi]][dim] ||
 +                                    (x[a[i]][dim] == x[sort[zi]][dim] &&
 +                                     a[i] > sort[zi])))
 +            {
 +                zi++;
 +            }
 +
 +            if (sort[zi] >= 0)
 +            {
 +                /* Shift all elements by one slot until we find an empty slot */
 +                cp = sort[zi];
 +                zim = zi + 1;
 +                while (sort[zim] >= 0)
 +                {
 +                    tmp = sort[zim];
 +                    sort[zim] = cp;
 +                    cp  = tmp;
 +                    zim++;
 +                }
 +                sort[zim] = cp;
 +            }
 +            sort[zi] = a[i];
 +        }
 +    }
 +
 +    /* Read the filled slots back into a, c counts the atoms written */
 +    c = 0;
 +    if (!Backwards)
 +    {
 +        for(zi=0; zi<nsort; zi++)
 +        {
 +            if (sort[zi] >= 0)
 +            {
 +                a[c++] = sort[zi];
 +            }
 +        }
 +    }
 +    else
 +    {
 +        for(zi=nsort-1; zi>=0; zi--)
 +        {
 +            if (sort[zi] >= 0)
 +            {
 +                a[c++] = sort[zi];
 +            }
 +        }
 +    }
 +    if (c < n)
 +    {
 +        gmx_incons("Lost particles while sorting");
 +    }
 +}
 +
 +/* Conversion of a real to float for storing bounding boxes.
 + * With GMX_DOUBLE the value is scaled by (1 -/+ GMX_FLOAT_EPS) before
 + * the cast, such that R2F_D moves it towards lower values and
 + * R2F_U towards higher values. Otherwise both macros are the identity.
 + */
 +#ifdef GMX_DOUBLE
 +#define R2F_D(x) ((float)((x) >= 0 ? ((1-GMX_FLOAT_EPS)*(x)) : ((1+GMX_FLOAT_EPS)*(x))))
 +#define R2F_U(x) ((float)((x) >= 0 ? ((1+GMX_FLOAT_EPS)*(x)) : ((1-GMX_FLOAT_EPS)*(x))))
 +#else
 +#define R2F_D(x) (x)
 +#define R2F_U(x) (x)
 +#endif
 +
 +/* Coordinate order x,y,z, bb order xyz0 */
 +static void calc_bounding_box(int na,int stride,const real *x,float *bb)
 +{
 +    int  i,j;
 +    real xl,xh,yl,yh,zl,zh;
 +
 +    i = 0;
 +    xl = x[i+XX];
 +    xh = x[i+XX];
 +    yl = x[i+YY];
 +    yh = x[i+YY];
 +    zl = x[i+ZZ];
 +    zh = x[i+ZZ];
 +    i += stride;
 +    for(j=1; j<na; j++)
 +    {
 +        xl = min(xl,x[i+XX]);
 +        xh = max(xh,x[i+XX]);
 +        yl = min(yl,x[i+YY]);
 +        yh = max(yh,x[i+YY]);
 +        zl = min(zl,x[i+ZZ]);
 +        zh = max(zh,x[i+ZZ]);
 +        i += stride;
 +    }
 +    /* Note: possible double to float conversion here */
 +    bb[BBL_X] = R2F_D(xl);
 +    bb[BBL_Y] = R2F_D(yl);
 +    bb[BBL_Z] = R2F_D(zl);
 +    bb[BBU_X] = R2F_U(xh);
 +    bb[BBU_Y] = R2F_U(yh);
 +    bb[BBU_Z] = R2F_U(zh);
 +}
 +
 +/* Packed coordinates, bb order xyz0 */
 +static void calc_bounding_box_x_x4(int na,const real *x,float *bb)
 +{
 +    int  j;
 +    real xl,xh,yl,yh,zl,zh;
 +
 +    xl = x[XX*PACK_X4];
 +    xh = x[XX*PACK_X4];
 +    yl = x[YY*PACK_X4];
 +    yh = x[YY*PACK_X4];
 +    zl = x[ZZ*PACK_X4];
 +    zh = x[ZZ*PACK_X4];
 +    for(j=1; j<na; j++)
 +    {
 +        xl = min(xl,x[j+XX*PACK_X4]);
 +        xh = max(xh,x[j+XX*PACK_X4]);
 +        yl = min(yl,x[j+YY*PACK_X4]);
 +        yh = max(yh,x[j+YY*PACK_X4]);
 +        zl = min(zl,x[j+ZZ*PACK_X4]);
 +        zh = max(zh,x[j+ZZ*PACK_X4]);
 +    }
 +    /* Note: possible double to float conversion here */
 +    bb[BBL_X] = R2F_D(xl);
 +    bb[BBL_Y] = R2F_D(yl);
 +    bb[BBL_Z] = R2F_D(zl);
 +    bb[BBU_X] = R2F_U(xh);
 +    bb[BBU_Y] = R2F_U(yh);
 +    bb[BBU_Z] = R2F_U(zh);
 +}
 +
 +/* Packed coordinates, bb order xyz0 */
 +static void calc_bounding_box_x_x8(int na,const real *x,float *bb)
 +{
 +    int  j;
 +    real xl,xh,yl,yh,zl,zh;
 +
 +    xl = x[XX*PACK_X8];
 +    xh = x[XX*PACK_X8];
 +    yl = x[YY*PACK_X8];
 +    yh = x[YY*PACK_X8];
 +    zl = x[ZZ*PACK_X8];
 +    zh = x[ZZ*PACK_X8];
 +    for(j=1; j<na; j++)
 +    {
 +        xl = min(xl,x[j+XX*PACK_X8]);
 +        xh = max(xh,x[j+XX*PACK_X8]);
 +        yl = min(yl,x[j+YY*PACK_X8]);
 +        yh = max(yh,x[j+YY*PACK_X8]);
 +        zl = min(zl,x[j+ZZ*PACK_X8]);
 +        zh = max(zh,x[j+ZZ*PACK_X8]);
 +    }
 +    /* Note: possible double to float conversion here */
 +    bb[BBL_X] = R2F_D(xl);
 +    bb[BBL_Y] = R2F_D(yl);
 +    bb[BBL_Z] = R2F_D(zl);
 +    bb[BBU_X] = R2F_U(xh);
 +    bb[BBU_Y] = R2F_U(yh);
 +    bb[BBU_Z] = R2F_U(zh);
 +}
 +
 +#ifdef NBNXN_SEARCH_SSE
 +
 +/* Packed coordinates, bb order xyz0 */
 +static void calc_bounding_box_x_x4_halves(int na,const real *x,
 +                                          float *bb,float *bbj)
 +{
 +    calc_bounding_box_x_x4(min(na,2),x,bbj);
 +
 +    if (na > 2)
 +    {
 +        calc_bounding_box_x_x4(min(na-2,2),x+(PACK_X4>>1),bbj+NNBSBB_B);
 +    }
 +    else
 +    {
 +        /* Set the "empty" bounding box to the same as the first one,
 +         * so we don't need to treat special cases in the rest of the code.
 +         */
 +        _mm_store_ps(bbj+NNBSBB_B         ,_mm_load_ps(bbj));
 +        _mm_store_ps(bbj+NNBSBB_B+NNBSBB_C,_mm_load_ps(bbj+NNBSBB_C));
 +    }
 +
 +    _mm_store_ps(bb         ,_mm_min_ps(_mm_load_ps(bbj),
 +                                        _mm_load_ps(bbj+NNBSBB_B)));
 +    _mm_store_ps(bb+NNBSBB_C,_mm_max_ps(_mm_load_ps(bbj+NNBSBB_C),
 +                                        _mm_load_ps(bbj+NNBSBB_B+NNBSBB_C)));
 +}
 +
 +/* Coordinate order xyz, bb order xxxxyyyyzzzz */
 +static void calc_bounding_box_xxxx(int na,int stride,const real *x,float *bb)
 +{
 +    int  i,j;
 +    real xl,xh,yl,yh,zl,zh;
 +
 +    i = 0;
 +    xl = x[i+XX];
 +    xh = x[i+XX];
 +    yl = x[i+YY];
 +    yh = x[i+YY];
 +    zl = x[i+ZZ];
 +    zh = x[i+ZZ];
 +    i += stride;
 +    for(j=1; j<na; j++)
 +    {
 +        xl = min(xl,x[i+XX]);
 +        xh = max(xh,x[i+XX]);
 +        yl = min(yl,x[i+YY]);
 +        yh = max(yh,x[i+YY]);
 +        zl = min(zl,x[i+ZZ]);
 +        zh = max(zh,x[i+ZZ]);
 +        i += stride;
 +    }
 +    /* Note: possible double to float conversion here */
 +    bb[0*STRIDE_8BB] = R2F_D(xl);
 +    bb[1*STRIDE_8BB] = R2F_D(yl);
 +    bb[2*STRIDE_8BB] = R2F_D(zl);
 +    bb[3*STRIDE_8BB] = R2F_U(xh);
 +    bb[4*STRIDE_8BB] = R2F_U(yh);
 +    bb[5*STRIDE_8BB] = R2F_U(zh);
 +}
 +
 +#endif /* NBNXN_SEARCH_SSE */
 +
 +#ifdef NBNXN_SEARCH_SSE_SINGLE
 +
 +/* Coordinate order xyz?, bb order xyz0 */
 +static void calc_bounding_box_sse(int na,const float *x,float *bb)
 +{
 +    __m128 bb_0_SSE,bb_1_SSE;
 +    __m128 x_SSE;
 +
 +    int  i;
 +
 +    bb_0_SSE = _mm_load_ps(x);
 +    bb_1_SSE = bb_0_SSE;
 +
 +    for(i=1; i<na; i++)
 +    {
 +        x_SSE    = _mm_load_ps(x+i*NNBSBB_C);
 +        bb_0_SSE = _mm_min_ps(bb_0_SSE,x_SSE);
 +        bb_1_SSE = _mm_max_ps(bb_1_SSE,x_SSE);
 +    }
 +
 +    _mm_store_ps(bb  ,bb_0_SSE);
 +    _mm_store_ps(bb+4,bb_1_SSE);
 +}
 +
 +/* Coordinate order xyz?, bb order xxxxyyyyzzzz */
 +static void calc_bounding_box_xxxx_sse(int na,const float *x,
 +                                       float *bb_work,
 +                                       real *bb)
 +{
 +    calc_bounding_box_sse(na,x,bb_work);
 +
 +    bb[0*STRIDE_8BB] = bb_work[BBL_X];
 +    bb[1*STRIDE_8BB] = bb_work[BBL_Y];
 +    bb[2*STRIDE_8BB] = bb_work[BBL_Z];
 +    bb[3*STRIDE_8BB] = bb_work[BBU_X];
 +    bb[4*STRIDE_8BB] = bb_work[BBU_Y];
 +    bb[5*STRIDE_8BB] = bb_work[BBU_Z];
 +}
 +
 +#endif /* NBNXN_SEARCH_SSE_SINGLE */
 +
 +#ifdef NBNXN_SEARCH_SSE
 +
 +/* Combines pairs of consecutive bounding boxes.
 + * For each grid column, every two consecutive boxes in bb are merged
 + * (minimum of the lower corners, maximum of the upper corners) into
 + * one box in grid->bbj. With an odd number of boxes in a column,
 + * the last box is copied to grid->bbj unchanged.
 + */
 +static void combine_bounding_box_pairs(nbnxn_grid_t *grid,const float *bb)
 +{
 +    int    i,j,sc2,nc2,c2;
 +    __m128 min_SSE,max_SSE;
 +
 +    for(i=0; i<grid->ncx*grid->ncy; i++)
 +    {
 +        /* Starting bb in a column is expected to be 2-aligned */
 +        sc2 = grid->cxy_ind[i]>>1;
 +        /* For odd numbers skip the last bb here */
 +        /* (na+3)>>2 is the number of 4-atom boxes, one more shift halves it */
 +        nc2 = (grid->cxy_na[i]+3)>>(2+1);
 +        for(c2=sc2; c2<sc2+nc2; c2++)
 +        {
 +            min_SSE = _mm_min_ps(_mm_load_ps(bb+(c2*4+0)*NNBSBB_C),
 +                                 _mm_load_ps(bb+(c2*4+2)*NNBSBB_C));
 +            max_SSE = _mm_max_ps(_mm_load_ps(bb+(c2*4+1)*NNBSBB_C),
 +                                 _mm_load_ps(bb+(c2*4+3)*NNBSBB_C));
 +            _mm_store_ps(grid->bbj+(c2*2+0)*NNBSBB_C,min_SSE);
 +            _mm_store_ps(grid->bbj+(c2*2+1)*NNBSBB_C,max_SSE);
 +        }
 +        if (((grid->cxy_na[i]+3)>>2) & 1)
 +        {
 +            /* Copy the last bb for odd bb count in this column */
 +            /* c2 is sc2+nc2 here: the pair index after the last full pair */
 +            for(j=0; j<NNBSBB_C; j++)
 +            {
 +                grid->bbj[(c2*2+0)*NNBSBB_C+j] = bb[(c2*4+0)*NNBSBB_C+j];
 +                grid->bbj[(c2*2+1)*NNBSBB_C+j] = bb[(c2*4+1)*NNBSBB_C+j];
 +            }
 +        }
 +    }
 +}
 +
 +#endif
 +
 +
 +/* Prints the average bb size, used for debug output */
 +static void print_bbsizes_simple(FILE *fp,
 +                                 const nbnxn_search_t nbs,
 +                                 const nbnxn_grid_t *grid)
 +{
 +    int  c,d;
 +    dvec ba;
 +
 +    clear_dvec(ba);
 +    for(c=0; c<grid->nc; c++)
 +    {
 +        for(d=0; d<DIM; d++)
 +        {
 +            ba[d] += grid->bb[c*NNBSBB_B+NNBSBB_C+d] - grid->bb[c*NNBSBB_B+d];
 +        }
 +    }
 +    dsvmul(1.0/grid->nc,ba,ba);
 +
 +    fprintf(fp,"ns bb: %4.2f %4.2f %4.2f  %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
 +            nbs->box[XX][XX]/grid->ncx,
 +            nbs->box[YY][YY]/grid->ncy,
 +            nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/grid->nc,
 +            ba[XX],ba[YY],ba[ZZ],
 +            ba[XX]*grid->ncx/nbs->box[XX][XX],
 +            ba[YY]*grid->ncy/nbs->box[YY][YY],
 +            ba[ZZ]*grid->nc/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
 +}
 +
 +/* Prints the average bb size, used for debug output.
 + * The bounding box extents are summed over the sub-cells of all
 + * grid->nc cells and divided by ns, the sum of grid->nsubc.
 + * The bb array is read in xxxxyyyyzzzz layout with NBNXN_BBXXXX
 + * defined, otherwise in xyz0 layout.
 + */
 +static void print_bbsizes_supersub(FILE *fp,
 +                                   const nbnxn_search_t nbs,
 +                                   const nbnxn_grid_t *grid)
 +{
 +    int  ns,c,s;
 +    dvec ba;
 +
 +    clear_dvec(ba);
 +    ns = 0;
 +    for(c=0; c<grid->nc; c++)
 +    {
 +#ifdef NBNXN_BBXXXX
 +        /* NOTE(review): all STRIDE_8BB entries of a group are added,
 +         * also when nsubc[c] is not a multiple of STRIDE_8BB.
 +         */
 +        for(s=0; s<grid->nsubc[c]; s+=STRIDE_8BB)
 +        {
 +            int cs_w,i,d;
 +
 +            cs_w = (c*GPU_NSUBCELL + s)/STRIDE_8BB;
 +            for(i=0; i<STRIDE_8BB; i++)
 +            {
 +                for(d=0; d<DIM; d++)
 +                {
 +                    ba[d] +=
 +                        grid->bb[cs_w*NNBSBB_XXXX+(DIM+d)*STRIDE_8BB+i] -
 +                        grid->bb[cs_w*NNBSBB_XXXX+     d *STRIDE_8BB+i];
 +                }
 +            }
 +        }
 +#else
 +        for(s=0; s<grid->nsubc[c]; s++)
 +        {
 +            int cs,d;
 +
 +            cs = c*GPU_NSUBCELL + s;
 +            for(d=0; d<DIM; d++)
 +            {
 +                ba[d] +=
 +                    grid->bb[cs*NNBSBB_B+NNBSBB_C+d] -
 +                    grid->bb[cs*NNBSBB_B         +d];
 +            }
 +        }
 +#endif
 +        ns += grid->nsubc[c];
 +    }
 +    /* Turn the sums into averages over the ns sub-cells */
 +    dsvmul(1.0/ns,ba,ba);
 +
 +    fprintf(fp,"ns bb: %4.2f %4.2f %4.2f  %4.2f %4.2f %4.2f rel %4.2f %4.2f %4.2f\n",
 +            nbs->box[XX][XX]/(grid->ncx*GPU_NSUBCELL_X),
 +            nbs->box[YY][YY]/(grid->ncy*GPU_NSUBCELL_Y),
 +            nbs->box[ZZ][ZZ]*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z),
 +            ba[XX],ba[YY],ba[ZZ],
 +            ba[XX]*grid->ncx*GPU_NSUBCELL_X/nbs->box[XX][XX],
 +            ba[YY]*grid->ncy*GPU_NSUBCELL_Y/nbs->box[YY][YY],
 +            ba[ZZ]*grid->nc*GPU_NSUBCELL_Z/(grid->ncx*grid->ncy*nbs->box[ZZ][ZZ]));
 +}
 +
 +/* Potentially sorts atoms on LJ coefficients !=0 and ==0.
 + * Also sets interaction flags.
 + */
 +void sort_on_lj(nbnxn_atomdata_t *nbat,int na_c,
 +                int a0,int a1,const int *atinfo,
 +                int *order,
 +                int *flags)
 +{
 +    int subc,s,a,n1,n2,a_lj_max,i,j;
 +    int sort1[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
 +    int sort2[NBNXN_NA_SC_MAX/GPU_NSUBCELL];
 +    gmx_bool haveQ;
 +
 +    *flags = 0;
 +
 +    subc = 0;
 +    for(s=a0; s<a1; s+=na_c)
 +    {
 +        /* Make lists for this (sub-)cell on atoms with and without LJ */
 +        n1 = 0;
 +        n2 = 0;
 +        haveQ = FALSE;
 +        a_lj_max = -1;
 +        for(a=s; a<min(s+na_c,a1); a++)
 +        {
 +            haveQ = haveQ || GET_CGINFO_HAS_Q(atinfo[order[a]]);
 +
 +            if (GET_CGINFO_HAS_VDW(atinfo[order[a]]))
 +            {
 +                sort1[n1++] = order[a];
 +                a_lj_max = a;
 +            }
 +            else
 +            {
 +                sort2[n2++] = order[a];
 +            }
 +        }
 +
 +        /* If we don't have atom with LJ, there's nothing to sort */
 +        if (n1 > 0)
 +        {
 +            *flags |= NBNXN_CI_DO_LJ(subc);
 +
 +            if (2*n1 <= na_c)
 +            {
 +                /* Only sort when strictly necessary. Ordering particles
 +                 * Ordering particles can lead to less accurate summation
 +                 * due to rounding, both for LJ and Coulomb interactions.
 +                 */
 +                if (2*(a_lj_max - s) >= na_c)
 +                {
 +                    for(i=0; i<n1; i++)
 +                    {
 +                        order[a0+i] = sort1[i];
 +                    }
 +                    for(j=0; j<n2; j++)
 +                    {
 +                        order[a0+n1+j] = sort2[j];
 +                    }
 +                }
 +
 +                *flags |= NBNXN_CI_HALF_LJ(subc);
 +            }
 +        }
 +        if (haveQ)
 +        {
 +            *flags |= NBNXN_CI_DO_COUL(subc);
 +        }
 +        subc++;
 +    }
 +}
 +
 +/* Fill a pair search cell with atoms.
 + * Potentially sorts atoms and sets the interaction flags.
 + *
 + * The cell holds the atoms nbs->a[a0] up to, not including, nbs->a[a1].
 + * For simple grids sort_on_lj is called first; it may reorder these
 + * entries of nbs->a and it sets the grid->flags entry of the cell.
 + * Then the position of each atom is stored in nbs->cell, the coordinates
 + * are copied to nbat->x with copy_rvec_to_nbat_real and the bounding box
 + * of the cell is computed in the layout that matches nbat->XFormat
 + * and the grid type.
 + *
 + * sx,sy,sz are passed on to copy_rvec_to_nbat_real and printed in
 + * the debug output.
 + * bb_work is only used as work space for calc_bounding_box_xxxx_sse.
 + */
 +void fill_cell(const nbnxn_search_t nbs,
 +               nbnxn_grid_t *grid,
 +               nbnxn_atomdata_t *nbat,
 +               int a0,int a1,
 +               const int *atinfo,
 +               rvec *x,
 +               int sx,int sy, int sz,
 +               float *bb_work)
 +{
 +    int    na,a;
 +    size_t offset;
 +    float  *bb_ptr;
 +
 +    na = a1 - a0;
 +
 +    if (grid->bSimple)
 +    {
 +        sort_on_lj(nbat,grid->na_c,a0,a1,atinfo,nbs->a,
 +                   grid->flags+(a0>>grid->na_c_2log)-grid->cell0);
 +    }
 +
 +    /* Now we have sorted the atoms, set the cell indices */
 +    for(a=a0; a<a1; a++)
 +    {
 +        nbs->cell[nbs->a[a]] = a;
 +    }
 +
 +    copy_rvec_to_nbat_real(nbs->a+a0,a1-a0,grid->na_c,x,
 +                           nbat->XFormat,nbat->x,a0,
 +                           sx,sy,sz);
 +
 +    if (nbat->XFormat == nbatX4)
 +    {
 +        /* Store the bounding boxes as xyz.xyz. */
 +        offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
 +        bb_ptr = grid->bb + offset;
 +
 +#if defined GMX_DOUBLE && defined NBNXN_SEARCH_SSE
 +        if (2*grid->na_cj == grid->na_c)
 +        {
 +            /* Two j-clusters per i-cluster: also fill the two bbj boxes */
 +            calc_bounding_box_x_x4_halves(na,nbat->x+X4_IND_A(a0),bb_ptr,
 +                                          grid->bbj+offset*2);
 +        }
 +        else
 +#endif
 +        {
 +            calc_bounding_box_x_x4(na,nbat->x+X4_IND_A(a0),bb_ptr);
 +        }
 +    }
 +    else if (nbat->XFormat == nbatX8)
 +    {
 +        /* Store the bounding boxes as xyz.xyz. */
 +        offset = ((a0 - grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
 +        bb_ptr = grid->bb + offset;
 +
 +        calc_bounding_box_x_x8(na,nbat->x+X8_IND_A(a0),bb_ptr);
 +    }
 +#ifdef NBNXN_BBXXXX
 +    else if (!grid->bSimple)
 +    {
 +        /* Store the bounding boxes in a format convenient
 +         * for SSE calculations: xxxxyyyyzzzz...
 +         */
 +        /* Group of STRIDE_8BB boxes first, then the position in the group */
 +        bb_ptr =
 +            grid->bb +
 +            ((a0-grid->cell0*grid->na_sc)>>(grid->na_c_2log+STRIDE_8BB_2LOG))*NNBSBB_XXXX +
 +            (((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log) & (STRIDE_8BB-1));
 +
 +#ifdef NBNXN_SEARCH_SSE_SINGLE
 +        if (nbat->XFormat == nbatXYZQ)
 +        {
 +            calc_bounding_box_xxxx_sse(na,nbat->x+a0*nbat->xstride,
 +                                       bb_work,bb_ptr);
 +        }
 +        else
 +#endif
 +        {
 +            calc_bounding_box_xxxx(na,nbat->xstride,nbat->x+a0*nbat->xstride,
 +                                   bb_ptr);
 +        }
 +        if (gmx_debug_at)
 +        {
 +            fprintf(debug,"%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
 +                    sx,sy,sz,
 +                    bb_ptr[0*STRIDE_8BB],bb_ptr[3*STRIDE_8BB],
 +                    bb_ptr[1*STRIDE_8BB],bb_ptr[4*STRIDE_8BB],
 +                    bb_ptr[2*STRIDE_8BB],bb_ptr[5*STRIDE_8BB]);
 +        }
 +    }
 +#endif
 +    else
 +    {
 +        /* Store the bounding boxes as xyz.xyz. */
 +        bb_ptr = grid->bb+((a0-grid->cell0*grid->na_sc)>>grid->na_c_2log)*NNBSBB_B;
 +
 +        calc_bounding_box(na,nbat->xstride,nbat->x+a0*nbat->xstride,
 +                          bb_ptr);
 +
 +        if (gmx_debug_at)
 +        {
 +            int bbo;
 +            bbo = (a0 - grid->cell0*grid->na_sc)/grid->na_c;
 +            fprintf(debug,"%2d %2d %2d bb %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f\n",
 +                    sx,sy,sz,
 +                    (grid->bb+bbo*NNBSBB_B)[BBL_X],
 +                    (grid->bb+bbo*NNBSBB_B)[BBU_X],
 +                    (grid->bb+bbo*NNBSBB_B)[BBL_Y],
 +                    (grid->bb+bbo*NNBSBB_B)[BBU_Y],
 +                    (grid->bb+bbo*NNBSBB_B)[BBL_Z],
 +                    (grid->bb+bbo*NNBSBB_B)[BBU_Z]);
 +        }
 +    }
 +}
 +
 +/* Spatially sort the atoms within one grid column.
 + * Handles the columns cxy_start up to, not including, cxy_end:
 + * the atoms of each column are sorted on z with sort_atoms and
 + * then put in cells of grid->na_sc atoms with fill_cell.
 + * The z-bounds of each cell are copied to grid->bbcz and the unused
 + * atom index slots of the column are set to -1.
 + * a0 and a1 are only used for the debug output; the local a is unused.
 + * sort_work is the work array for sort_atoms.
 + */
 +static void sort_columns_simple(const nbnxn_search_t nbs,
 +                                int dd_zone,
 +                                nbnxn_grid_t *grid,
 +                                int a0,int a1,
 +                                const int *atinfo,
 +                                rvec *x,
 +                                nbnxn_atomdata_t *nbat,
 +                                int cxy_start,int cxy_end,
 +                                int *sort_work)
 +{
 +    int  cxy;
 +    int  cx,cy,cz,ncz,cfilled,c;
 +    int  na,ash,ind,a;
 +    int  na_c,ash_c;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"cell0 %d sorting columns %d - %d, atoms %d - %d\n",
 +                grid->cell0,cxy_start,cxy_end,a0,a1);
 +    }
 +
 +    /* Sort the atoms within each x,y column in 3 dimensions */
 +    for(cxy=cxy_start; cxy<cxy_end; cxy++)
 +    {
 +        /* The column index runs fastest over y */
 +        cx = cxy/grid->ncy;
 +        cy = cxy - cx*grid->ncy;
 +
 +        na  = grid->cxy_na[cxy];
 +        ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
 +        /* Offset of the first atom slot of this column in nbs->a */
 +        ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
 +
 +        /* Sort the atoms within each x,y column on z coordinate */
 +        sort_atoms(ZZ,FALSE,
 +                   nbs->a+ash,na,x,
 +                   grid->c0[ZZ],
 +                   ncz*grid->na_sc*SORT_GRID_OVERSIZE/nbs->box[ZZ][ZZ],
 +                   ncz*grid->na_sc*SGSF,sort_work);
 +
 +        /* Fill the ncz cells in this column */
 +        cfilled = grid->cxy_ind[cxy];
 +        for(cz=0; cz<ncz; cz++)
 +        {
 +            c  = grid->cxy_ind[cxy] + cz ;
 +
 +            ash_c = ash + cz*grid->na_sc;
 +            /* Atoms left for this cell, at most a full cell */
 +            na_c  = min(grid->na_sc,na-(ash_c-ash));
 +
 +            fill_cell(nbs,grid,nbat,
 +                      ash_c,ash_c+na_c,atinfo,x,
 +                      grid->na_sc*cx + (dd_zone >> 2),
 +                      grid->na_sc*cy + (dd_zone & 3),
 +                      grid->na_sc*cz,
 +                      NULL);
 +
 +            /* This copy to bbcz is not really necessary.
 +             * But it allows to use the same grid search code
 +             * for the simple and supersub cell setups.
 +             */
 +            if (na_c > 0)
 +            {
 +                cfilled = c;
 +            }
 +            /* Cells without atoms get the bounds of the last filled cell.
 +             * NOTE(review): offsets 2 and 6 are presumably BBL_Z and BBU_Z.
 +             */
 +            grid->bbcz[c*NNBSBB_D  ] = grid->bb[cfilled*NNBSBB_B+2];
 +            grid->bbcz[c*NNBSBB_D+1] = grid->bb[cfilled*NNBSBB_B+6];
 +        }
 +
 +        /* Set the unused atom indices to -1 */
 +        for(ind=na; ind<ncz*grid->na_sc; ind++)
 +        {
 +            nbs->a[ash+ind] = -1;
 +        }
 +    }
 +}
 +
 +/* Spatially sort the atoms within one grid column.
 + * Handles the columns cxy_start up to, not including, cxy_end:
 + * the atoms of each column are sorted on z, then per z-slab on y and
 + * per y-row on x (when there is more than one sub-cell along y, x),
 + * and each resulting sub-cell of grid->na_c atoms is passed to fill_cell.
 + * Per super-cell the sub-cell count and the z-bounds are stored in
 + * grid->nsubc and grid->bbcz. The unused atom index slots of
 + * the column are set to -1.
 + * a0 and a1 are only used for the debug output; the local a is unused.
 + * sort_work is the work array for sort_atoms.
 + */
 +static void sort_columns_supersub(const nbnxn_search_t nbs,
 +                                  int dd_zone,
 +                                  nbnxn_grid_t *grid,
 +                                  int a0,int a1,
 +                                  const int *atinfo,
 +                                  rvec *x,
 +                                  nbnxn_atomdata_t *nbat,
 +                                  int cxy_start,int cxy_end,
 +                                  int *sort_work)
 +{
 +    int  cxy;
 +    int  cx,cy,cz=-1,c=-1,ncz;
 +    int  na,ash,na_c,ind,a;
 +    int  subdiv_z,sub_z,na_z,ash_z;
 +    int  subdiv_y,sub_y,na_y,ash_y;
 +    int  subdiv_x,sub_x,na_x,ash_x;
 +
 +    /* cppcheck-suppress unassignedVariable */
 +    float bb_work_array[NNBSBB_B+3],*bb_work_align;
 +
 +    /* Round the address 3 floats into the array down to a multiple
 +     * of 16 bytes; the 3 extra floats leave room for the shift.
 +     */
 +    bb_work_align = (float *)(((size_t)(bb_work_array+3)) & (~((size_t)15)));
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"cell0 %d sorting columns %d - %d, atoms %d - %d\n",
 +                grid->cell0,cxy_start,cxy_end,a0,a1);
 +    }
 +
 +    /* Number of atom slots per sub-cell, per row of sub-cells along x
 +     * and per slab of sub-cells along x and y.
 +     */
 +    subdiv_x = grid->na_c;
 +    subdiv_y = GPU_NSUBCELL_X*subdiv_x;
 +    subdiv_z = GPU_NSUBCELL_Y*subdiv_y;
 +
 +    /* Sort the atoms within each x,y column in 3 dimensions */
 +    for(cxy=cxy_start; cxy<cxy_end; cxy++)
 +    {
 +        /* The column index runs fastest over y */
 +        cx = cxy/grid->ncy;
 +        cy = cxy - cx*grid->ncy;
 +
 +        na  = grid->cxy_na[cxy];
 +        ncz = grid->cxy_ind[cxy+1] - grid->cxy_ind[cxy];
 +        /* Offset of the first atom slot of this column in nbs->a */
 +        ash = (grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc;
 +
 +        /* Sort the atoms within each x,y column on z coordinate */
 +        sort_atoms(ZZ,FALSE,
 +                   nbs->a+ash,na,x,
 +                   grid->c0[ZZ],
 +                   ncz*grid->na_sc*SORT_GRID_OVERSIZE/nbs->box[ZZ][ZZ],
 +                   ncz*grid->na_sc*SGSF,sort_work);
 +
 +        /* This loop goes over the supercells and subcells along z at once */
 +        for(sub_z=0; sub_z<ncz*GPU_NSUBCELL_Z; sub_z++)
 +        {
 +            ash_z = ash + sub_z*subdiv_z;
 +            na_z  = min(subdiv_z,na-(ash_z-ash));
 +
 +            /* We have already sorted on z */
 +
 +            if (sub_z % GPU_NSUBCELL_Z == 0)
 +            {
 +                /* First z-slab of a supercell: set the supercell data */
 +                cz = sub_z/GPU_NSUBCELL_Z;
 +                c  = grid->cxy_ind[cxy] + cz ;
 +
 +                /* The number of atoms in this supercell */
 +                na_c = min(grid->na_sc,na-(ash_z-ash));
 +
 +                grid->nsubc[c] = min(GPU_NSUBCELL,(na_c+grid->na_c-1)/grid->na_c);
 +
 +                /* Store the z-boundaries of the super cell */
 +                grid->bbcz[c*NNBSBB_D  ] = x[nbs->a[ash_z]][ZZ];
 +                grid->bbcz[c*NNBSBB_D+1] = x[nbs->a[ash_z+na_c-1]][ZZ];
 +            }
 +
 +#if GPU_NSUBCELL_Y > 1
 +            /* Sort the atoms along y */
 +            /* The direction alternates with sub_z */
 +            sort_atoms(YY,(sub_z & 1),
 +                       nbs->a+ash_z,na_z,x,
 +                       grid->c0[YY]+cy*grid->sy,grid->inv_sy,
 +                       subdiv_y*SGSF,sort_work);
 +#endif
 +
 +            for(sub_y=0; sub_y<GPU_NSUBCELL_Y; sub_y++)
 +            {
 +                ash_y = ash_z + sub_y*subdiv_y;
 +                na_y  = min(subdiv_y,na-(ash_y-ash));
 +
 +#if GPU_NSUBCELL_X > 1
 +                /* Sort the atoms along x */
 +                /* The direction alternates with cz*GPU_NSUBCELL_Y + sub_y */
 +                sort_atoms(XX,((cz*GPU_NSUBCELL_Y + sub_y) & 1),
 +                           nbs->a+ash_y,na_y,x,
 +                           grid->c0[XX]+cx*grid->sx,grid->inv_sx,
 +                           subdiv_x*SGSF,sort_work);
 +#endif
 +
 +                for(sub_x=0; sub_x<GPU_NSUBCELL_X; sub_x++)
 +                {
 +                    ash_x = ash_y + sub_x*subdiv_x;
 +                    na_x  = min(subdiv_x,na-(ash_x-ash));
 +
 +                    fill_cell(nbs,grid,nbat,
 +                              ash_x,ash_x+na_x,atinfo,x,
 +                              grid->na_c*(cx*GPU_NSUBCELL_X+sub_x) + (dd_zone >> 2),
 +                              grid->na_c*(cy*GPU_NSUBCELL_Y+sub_y) + (dd_zone & 3),
 +                              grid->na_c*sub_z,
 +                              bb_work_align);
 +                }
 +            }
 +        }
 +
 +        /* Set the unused atom indices to -1 */
 +        for(ind=na; ind<ncz*grid->na_sc; ind++)
 +        {
 +            nbs->a[ash+ind] = -1;
 +        }
 +    }
 +}
 +
 +/* Determine in which grid column atoms should go.
 + *
 + * Handles the part of the atom range a0 - a1 that belongs to thread
 + * index thread out of nthread threads. For every atom i in that part
 + * the column index cx*grid->ncy + cy is stored in cell[i] and the
 + * counter of that column in cxy_na is incremented.
 + * When move is not NULL, atoms with move[i] < 0 are assigned to the
 + * extra column with index grid->ncx*grid->ncy.
 + * cxy_na is zeroed here over grid->ncx*grid->ncy+1 entries before counting.
 + */
 +static void calc_column_indices(nbnxn_grid_t *grid,
 +                                int a0,int a1,
 +                                rvec *x,const int *move,
 +                                int thread,int nthread,
 +                                int *cell,
 +                                int *cxy_na)
 +{
 +    int  n0,n1,i;
 +    int  cx,cy;
 +
 +    /* We add one extra cell for particles which moved during DD */
 +    for(i=0; i<grid->ncx*grid->ncy+1; i++)
 +    {
 +        cxy_na[i] = 0;
 +    }
 +
 +    n0 = a0 + (int)((thread+0)*(a1 - a0))/nthread; /* start of this thread's atom range */
 +    n1 = a0 + (int)((thread+1)*(a1 - a0))/nthread; /* end (exclusive) of this thread's atom range */
 +    for(i=n0; i<n1; i++)
 +    {
 +        if (move == NULL || move[i] >= 0)
 +        {
 +            /* We need to be careful with rounding,
 +             * particles might be a few bits outside the local box.
 +             * The int cast takes care of the lower bound,
 +             * we need to explicitly take care of the upper bound.
 +             * Only an index exactly equal to the number of columns
 +             * is moved back to the last column.
 +             */
 +            cx = (int)((x[i][XX] - grid->c0[XX])*grid->inv_sx);
 +            if (cx == grid->ncx)
 +            {
 +                cx = grid->ncx - 1;
 +            }
 +            cy = (int)((x[i][YY] - grid->c0[YY])*grid->inv_sy);
 +            if (cy == grid->ncy)
 +            {
 +                cy = grid->ncy - 1;
 +            }
 +            /* For the moment cell contains only the, grid local,
 +             * x and y indices, not z.
 +             */
 +            cell[i] = cx*grid->ncy + cy;
 +
 +#ifdef DEBUG_NBNXN_GRIDDING
 +            if (cell[i] < 0 || cell[i] >= grid->ncx*grid->ncy)
 +            {
 +                gmx_fatal(FARGS,
- static void init_grid_flags(nbnxn_cellblock_flags *flags,
-                             const nbnxn_grid_t *grid)
++                          "grid cell cx %d cy %d out of range (max %d %d)\n"
++                          "atom %f %f %f, grid->c0 %f %f",
++                          cx,cy,grid->ncx,grid->ncy,
++                          x[i][XX],x[i][YY],x[i][ZZ],grid->c0[XX],grid->c0[YY]);
 +            }
 +#endif
 +        }
 +        else
 +        {
 +            /* Put this moved particle after the end of the grid,
 +             * so we can process it later without using conditionals.
 +             */
 +            cell[i] = grid->ncx*grid->ncy;
 +        }
 +
 +        cxy_na[cell[i]]++;
 +    }
 +}
 +
 +/* Determine in which grid cells the atoms should go.
 + *
 + * The column index of every atom in a0 - a1 is computed with
 + * calc_column_indices, with one count array per thread in nbs->work.
 + * The per-thread counts are summed to build the column start array
 + * grid->cxy_ind, the atom index array nbs->a is filled column by column
 + * and the columns are then sorted with sort_columns_simple or
 + * sort_columns_supersub, depending on grid->bSimple.
 + * Sets grid->nc and nbat->natoms, and grid->nsubc_tot when the grid
 + * is not simple. The sort work arrays in nbs->work are enlarged when
 + * they are too small for the largest column.
 + */
 +static void calc_cell_indices(const nbnxn_search_t nbs,
 +                              int dd_zone,
 +                              nbnxn_grid_t *grid,
 +                              int a0,int a1,
 +                              const int *atinfo,
 +                              rvec *x,
 +                              const int *move,
 +                              nbnxn_atomdata_t *nbat)
 +{
 +    int  n0,n1,i;
 +    int  cx,cy,cxy,ncz_max,ncz;
 +    int  nthread,thread;
 +    int  *cxy_na,cxy_na_i; /* NOTE(review): the local pointer cxy_na is not referenced below */
 +
 +    nthread = gmx_omp_nthreads_get(emntPairsearch);
 +
 +#pragma omp parallel for num_threads(nthread) schedule(static)
 +    for(thread=0; thread<nthread; thread++)
 +    {
 +        calc_column_indices(grid,a0,a1,x,move,thread,nthread,
 +                            nbs->cell,nbs->work[thread].cxy_na);
 +    }
 +
 +    /* Make the cell index as a function of x and y */
 +    ncz_max = 0;
 +    ncz = 0;
 +    grid->cxy_ind[0] = 0;
 +    for(i=0; i<grid->ncx*grid->ncy+1; i++)
 +    {
 +        /* We set ncz_max at the beginning of the loop instead of at the end
 +         * to skip i=grid->ncx*grid->ncy which are moved particles
 +         * that do not need to be ordered on the grid.
 +         */
 +        if (ncz > ncz_max)
 +        {
 +            ncz_max = ncz;
 +        }
 +        cxy_na_i = nbs->work[0].cxy_na[i];
 +        for(thread=1; thread<nthread; thread++)
 +        {
 +            cxy_na_i += nbs->work[thread].cxy_na[i];
 +        }
 +        ncz = (cxy_na_i + grid->na_sc - 1)/grid->na_sc; /* atom count divided by na_sc, rounded up */
 +        if (nbat->XFormat == nbatX8)
 +        {
 +            /* Make the number of cells a multiple of 2 */
 +            ncz = (ncz + 1) & ~1;
 +        }
 +        grid->cxy_ind[i+1] = grid->cxy_ind[i] + ncz;
 +        /* Clear cxy_na, so we can reuse the array below */
 +        grid->cxy_na[i] = 0;
 +    }
 +    grid->nc = grid->cxy_ind[grid->ncx*grid->ncy] - grid->cxy_ind[0];
 +
 +    nbat->natoms = (grid->cell0 + grid->nc)*grid->na_sc;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"ns na_sc %d na_c %d super-cells: %d x %d y %d z %.1f maxz %d\n",
 +                grid->na_sc,grid->na_c,grid->nc,
 +                grid->ncx,grid->ncy,grid->nc/((double)(grid->ncx*grid->ncy)),
 +                ncz_max);
 +        if (gmx_debug_at)
 +        {
 +            i = 0;
 +            for(cy=0; cy<grid->ncy; cy++)
 +            {
 +                for(cx=0; cx<grid->ncx; cx++)
 +                {
 +                    fprintf(debug," %2d",grid->cxy_ind[i+1]-grid->cxy_ind[i]);
 +                    i++;
 +                }
 +                fprintf(debug,"\n");
 +            }
 +        }
 +    }
 +
 +    /* Make sure the work array for sorting is large enough */
 +    if (ncz_max*grid->na_sc*SGSF > nbs->work[0].sort_work_nalloc)
 +    {
 +        for(thread=0; thread<nbs->nthread_max; thread++)
 +        {
 +            nbs->work[thread].sort_work_nalloc =
 +                over_alloc_large(ncz_max*grid->na_sc*SGSF);
 +            srenew(nbs->work[thread].sort_work,
 +                   nbs->work[thread].sort_work_nalloc);
 +        }
 +    }
 +
 +    /* Now we know the dimensions we can fill the grid.
 +     * This is the first, unsorted fill. We sort the columns after this.
 +     */
 +    for(i=a0; i<a1; i++)
 +    {
 +        /* At this point nbs->cell contains the local grid x,y indices */
 +        cxy = nbs->cell[i];
 +        nbs->a[(grid->cell0 + grid->cxy_ind[cxy])*grid->na_sc + grid->cxy_na[cxy]++] = i;
 +    }
 +
 +    /* Set the cell indices for the moved particles */
 +    n0 = grid->nc*grid->na_sc;
 +    n1 = grid->nc*grid->na_sc+grid->cxy_na[grid->ncx*grid->ncy];
 +    for(i=n0; i<n1; i++)
 +    {
 +        nbs->cell[nbs->a[i]] = i;
 +    }
 +
 +    /* Sort the super-cell columns along z into the sub-cells.
 +     * NOTE(review): this loop runs over nbs->nthread_max threads while
 +     * the column ranges passed below are divided by nthread; this looks
 +     * like it relies on both values being equal - confirm.
 +     */
 +#pragma omp parallel for num_threads(nbs->nthread_max) schedule(static)
 +    for(thread=0; thread<nbs->nthread_max; thread++)
 +    {
 +        if (grid->bSimple)
 +        {
 +            sort_columns_simple(nbs,dd_zone,grid,a0,a1,atinfo,x,nbat,
 +                                ((thread+0)*grid->ncx*grid->ncy)/nthread,
 +                                ((thread+1)*grid->ncx*grid->ncy)/nthread,
 +                                nbs->work[thread].sort_work);
 +        }
 +        else
 +        {
 +            sort_columns_supersub(nbs,dd_zone,grid,a0,a1,atinfo,x,nbat,
 +                                  ((thread+0)*grid->ncx*grid->ncy)/nthread,
 +                                  ((thread+1)*grid->ncx*grid->ncy)/nthread,
 +                                  nbs->work[thread].sort_work);
 +        }
 +    }
 +
 +#ifdef NBNXN_SEARCH_SSE
 +    if (grid->bSimple && nbat->XFormat == nbatX8)
 +    {
 +        combine_bounding_box_pairs(grid,grid->bb);
 +    }
 +#endif
 +
 +    if (!grid->bSimple)
 +    {
 +        grid->nsubc_tot = 0;
 +        for(i=0; i<grid->nc; i++)
 +        {
 +            grid->nsubc_tot += grid->nsubc[i];
 +        }
 +    }
 +
 +    if (debug)
 +    {
 +        if (grid->bSimple)
 +        {
 +            print_bbsizes_simple(debug,nbs,grid);
 +        }
 +        else
 +        {
 +            fprintf(debug,"ns non-zero sub-cells: %d average atoms %.2f\n",
 +                    grid->nsubc_tot,(a1-a0)/(double)grid->nsubc_tot);
 +
 +            print_bbsizes_supersub(debug,nbs,grid);
 +        }
 +    }
 +}
 +
-     int cb;
++static void init_buffer_flags(nbnxn_buffer_flags_t *flags,
++                              int natoms)
 +{   /* Sets flags->nflag to natoms divided by NBNXN_BUFFERFLAG_SIZE,
 +     * rounded up, enlarges flags->flag when it is too small and
 +     * sets the first flags->nflag entries of flags->flag to 0.
 +     */
-     flags->ncb = (grid->nc + NBNXN_CELLBLOCK_SIZE - 1)/NBNXN_CELLBLOCK_SIZE;
-     if (flags->ncb > flags->flag_nalloc)
++    int b;
 +
-         flags->flag_nalloc = over_alloc_large(flags->ncb);
++    flags->nflag = (natoms + NBNXN_BUFFERFLAG_SIZE - 1)/NBNXN_BUFFERFLAG_SIZE;
++    if (flags->nflag > flags->flag_nalloc)
 +    {
-     for(cb=0; cb<flags->ncb; cb++)
++        flags->flag_nalloc = over_alloc_large(flags->nflag);
 +        srenew(flags->flag,flags->flag_nalloc);
 +    }
-         flags->flag[cb] = 0;
++    for(b=0; b<flags->nflag; b++)
 +    {
-     flags->bUse = TRUE;
++        flags->flag[b] = 0;
 +    }
-     if (nc_max*grid->na_sc > nbat->nalloc)
 +}
 +
 +/* Sets up a grid and puts the atoms on the grid.
 + * This function only operates on one domain of the domain decomposition.
 + * Note that without domain decomposition there is only one domain.
 + *
 + * The grid used is nbs->grid[dd_zone]; its cluster sizes are derived
 + * from nb_kernel_type. The atoms handled are a0 - a1.
 + * For dd_zone 0, ePBC and box are stored in nbs and the atom density of
 + * the grid is taken from atom_density when that is >= 0, otherwise it is
 + * computed with grid_atom_density from the atom count minus nmoved and
 + * the corners corner0, corner1.
 + * For other zones box is not read here and the density of grid 0 is used.
 + * nbs->cell, nbs->a and nbat are enlarged when needed before
 + * calc_cell_indices assigns the atoms to cells.
 + * The time spent is accounted in nbs->cc[enbsCCgrid].
 + */
 +void nbnxn_put_on_grid(nbnxn_search_t nbs,
 +                       int ePBC,matrix box,
 +                       int dd_zone,
 +                       rvec corner0,rvec corner1,
 +                       int a0,int a1,
 +                       real atom_density,
 +                       const int *atinfo,
 +                       rvec *x,
 +                       int nmoved,int *move,
 +                       int nb_kernel_type,
 +                       nbnxn_atomdata_t *nbat)
 +{
 +    nbnxn_grid_t *grid;
 +    int n;
 +    int nc_max_grid,nc_max;
 +
 +    grid = &nbs->grid[dd_zone];
 +
 +    nbs_cycle_start(&nbs->cc[enbsCCgrid]);
 +
 +    grid->bSimple = nbnxn_kernel_pairlist_simple(nb_kernel_type);
 +
 +    grid->na_c      = nbnxn_kernel_to_ci_size(nb_kernel_type);
 +    grid->na_cj     = nbnxn_kernel_to_cj_size(nb_kernel_type);
 +    grid->na_sc     = (grid->bSimple ? 1 : GPU_NSUBCELL)*grid->na_c;
 +    grid->na_c_2log = get_2log(grid->na_c);
 +
 +    nbat->na_c = grid->na_c;
 +
 +    if (dd_zone == 0)
 +    {
 +        grid->cell0 = 0;
 +    }
 +    else
 +    {
 +        grid->cell0 = /* end of the previous zone's cells, converted to units of this grid's na_sc */
 +            (nbs->grid[dd_zone-1].cell0 + nbs->grid[dd_zone-1].nc)*
 +            nbs->grid[dd_zone-1].na_sc/grid->na_sc;
 +    }
 +
 +    n = a1 - a0;
 +
 +    if (dd_zone == 0)
 +    {
 +        nbs->ePBC = ePBC;
 +        copy_mat(box,nbs->box);
 +
 +        if (atom_density >= 0)
 +        {
 +            grid->atom_density = atom_density;
 +        }
 +        else
 +        {
 +            grid->atom_density = grid_atom_density(n-nmoved,corner0,corner1);
 +        }
 +
 +        grid->cell0 = 0; /* repeats the assignment made above for dd_zone 0 */
 +
 +        nbs->natoms_local    = a1 - nmoved;
 +        /* We assume that nbnxn_put_on_grid is called first
 +         * for the local atoms (dd_zone=0).
 +         */
 +        nbs->natoms_nonlocal = a1 - nmoved;
 +    }
 +    else
 +    {
 +        nbs->natoms_nonlocal = max(nbs->natoms_nonlocal,a1);
 +    }
 +
 +    nc_max_grid = set_grid_size_xy(nbs,grid,n-nmoved,corner0,corner1,
 +                                   nbs->grid[0].atom_density,
 +                                   nbat->XFormat);
 +
 +    nc_max = grid->cell0 + nc_max_grid;
 +
 +    if (a1 > nbs->cell_nalloc)
 +    {
 +        nbs->cell_nalloc = over_alloc_large(a1);
 +        srenew(nbs->cell,nbs->cell_nalloc);
 +    }
 +
 +    /* To avoid conditionals we store the moved particles at the end of a,
 +     * make sure we have enough space.
 +     */
 +    if (nc_max*grid->na_sc + nmoved > nbs->a_nalloc)
 +    {
 +        nbs->a_nalloc = over_alloc_large(nc_max*grid->na_sc + nmoved);
 +        srenew(nbs->a,nbs->a_nalloc);
 +    }
 +
-         nbnxn_atomdata_realloc(nbat,nc_max*grid->na_sc);
++    /* We need padding up to a multiple of the buffer flag size: simply add */
++    if (nc_max*grid->na_sc + NBNXN_BUFFERFLAG_SIZE > nbat->nalloc)
 +    {
-     init_grid_flags(&grid->cellblock_flags,grid);
++        nbnxn_atomdata_realloc(nbat,nc_max*grid->na_sc+NBNXN_BUFFERFLAG_SIZE);
 +    }
 +
 +    calc_cell_indices(nbs,dd_zone,grid,a0,a1,atinfo,x,move,nbat);
 +
 +    if (dd_zone == 0)
 +    {
 +        nbat->natoms_local = nbat->natoms;
 +    }
 +
-         nbl_list->nnbl > NBNXN_CELLBLOCK_MAX_THREADS)
 +    nbs_cycle_stop(&nbs->cc[enbsCCgrid]);
 +}
 +
 +/* Calls nbnxn_put_on_grid for all non-local domains.
 + *
 + * Loops over the zones 1 up to zones->n-1. For each zone the grid
 + * corners are copied from zones->size[zone].bb_x0 and bb_x1 and the
 + * atom range is zones->cg_range[zone] - zones->cg_range[zone+1].
 + * The call passes nbs->ePBC, a NULL box, an atom density of -1,
 + * nmoved 0 and a NULL move array.
 + */
 +void nbnxn_put_on_grid_nonlocal(nbnxn_search_t nbs,
 +                                const gmx_domdec_zones_t *zones,
 +                                const int *atinfo,
 +                                rvec *x,
 +                                int nb_kernel_type,
 +                                nbnxn_atomdata_t *nbat)
 +{
 +    int  zone,d;
 +    rvec c0,c1;
 +
 +    for(zone=1; zone<zones->n; zone++)
 +    {
 +        for(d=0; d<DIM; d++)
 +        {
 +            c0[d] = zones->size[zone].bb_x0[d];
 +            c1[d] = zones->size[zone].bb_x1[d];
 +        }
 +
 +        nbnxn_put_on_grid(nbs,nbs->ePBC,NULL,
 +                          zone,c0,c1,
 +                          zones->cg_range[zone],
 +                          zones->cg_range[zone+1],
 +                          -1,
 +                          atinfo,
 +                          x,
 +                          0,NULL,
 +                          nb_kernel_type,
 +                          nbat);
 +    }
 +}
 +
 +/* Add simple grid type information to the local super/sub grid.
 + *
 + * Operates on nbs->grid[0] and calls gmx_incons when that grid is
 + * already simple. Each super-cell is divided into
 + * grid->na_sc/NBNXN_CPU_CLUSTER_I_SIZE clusters. For every cluster
 + * that has atoms left after trimming, a bounding box is computed from
 + * nbat->x into grid->bb_simple, its z-bounds are copied to
 + * grid->bbcz_simple and grid->flags_simple is set; clusters without
 + * atoms get flags 0. The arrays are enlarged here when needed.
 + * NOTE(review): the gmx_incons message names nbnxn_grid_simple, not
 + * nbnxn_grid_add_simple - confirm whether that is intended.
 + */
 +void nbnxn_grid_add_simple(nbnxn_search_t nbs,
 +                           nbnxn_atomdata_t *nbat)
 +{
 +    nbnxn_grid_t *grid;
 +    float *bbcz,*bb;
 +    int ncd,sc;
 +
 +    grid = &nbs->grid[0];
 +
 +    if (grid->bSimple)
 +    {
 +        gmx_incons("nbnxn_grid_simple called with a simple grid");
 +    }
 +
 +    ncd = grid->na_sc/NBNXN_CPU_CLUSTER_I_SIZE; /* number of simple clusters per super-cell */
 +
 +    if (grid->nc*ncd > grid->nc_nalloc_simple)
 +    {
 +        grid->nc_nalloc_simple = over_alloc_large(grid->nc*ncd);
 +        srenew(grid->bbcz_simple,grid->nc_nalloc_simple*NNBSBB_D);
 +        srenew(grid->bb_simple,grid->nc_nalloc_simple*NNBSBB_B);
 +        srenew(grid->flags_simple,grid->nc_nalloc_simple);
 +        if (nbat->XFormat) /* taken for every non-zero XFormat value */
 +        {
 +            sfree_aligned(grid->bbj);
 +            snew_aligned(grid->bbj,grid->nc_nalloc_simple/2,16);
 +        }
 +    }
 +
 +    bbcz = grid->bbcz_simple;
 +    bb   = grid->bb_simple;
 +
 +#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
 +    for(sc=0; sc<grid->nc; sc++)
 +    {
 +        int c,tx,na;
 +
 +        for(c=0; c<ncd; c++)
 +        {
 +            tx = sc*ncd + c; /* index of this cluster in the simple arrays */
 +
 +            na = NBNXN_CPU_CLUSTER_I_SIZE; /* reduced below while the last atom has type nbat->ntype-1 */
 +            while (na > 0 &&
 +                   nbat->type[tx*NBNXN_CPU_CLUSTER_I_SIZE+na-1] == nbat->ntype-1)
 +            {
 +                na--;
 +            }
 +
 +            if (na > 0)
 +            {
 +                switch (nbat->XFormat)
 +                {
 +                case nbatX4:
 +                    /* PACK_X4==NBNXN_CPU_CLUSTER_I_SIZE, so this is simple */
 +                    calc_bounding_box_x_x4(na,nbat->x+tx*STRIDE_P4,
 +                                           bb+tx*NNBSBB_B);
 +                    break;
 +                case nbatX8:
 +                    /* PACK_X8>NBNXN_CPU_CLUSTER_I_SIZE, more complicated */
 +                    calc_bounding_box_x_x8(na,nbat->x+X8_IND_A(tx*NBNXN_CPU_CLUSTER_I_SIZE),
 +                                           bb+tx*NNBSBB_B);
 +                    break;
 +                default:
 +                    calc_bounding_box(na,nbat->xstride,
 +                                      nbat->x+tx*NBNXN_CPU_CLUSTER_I_SIZE*nbat->xstride,
 +                                      bb+tx*NNBSBB_B);
 +                    break;
 +                }
 +                bbcz[tx*NNBSBB_D+0] = bb[tx*NNBSBB_B         +ZZ];
 +                bbcz[tx*NNBSBB_D+1] = bb[tx*NNBSBB_B+NNBSBB_C+ZZ];
 +
 +                /* No interaction optimization yet here */
 +                grid->flags_simple[tx] = NBNXN_CI_DO_LJ(0) | NBNXN_CI_DO_COUL(0);
 +            }
 +            else
 +            {
 +                grid->flags_simple[tx] = 0;
 +            }
 +        }
 +    }
 +
 +#ifdef NBNXN_SEARCH_SSE
 +    if (grid->bSimple && nbat->XFormat == nbatX8) /* NOTE(review): requires grid->bSimple, for which gmx_incons is called above - confirm this branch is reachable */
 +    {
 +        combine_bounding_box_pairs(grid,grid->bb_simple);
 +    }
 +#endif
 +}
 +
 +void nbnxn_get_ncells(nbnxn_search_t nbs,int *ncx,int *ncy)
 +{   /* Returns in *ncx and *ncy the values of nbs->grid[0].ncx
 +     * and nbs->grid[0].ncy.
 +     */
 +    *ncx = nbs->grid[0].ncx;
 +    *ncy = nbs->grid[0].ncy;
 +}
 +
 +void nbnxn_get_atomorder(nbnxn_search_t nbs,int **a,int *n)
 +{   /* Returns the atom index array nbs->a in *a (the pointer itself,
 +     * not a copy) and an entry count for grid 0 in *n.
 +     */
 +    const nbnxn_grid_t *grid;
 +
 +    grid = &nbs->grid[0];
 +
 +    /* Return the atom order for the home cell (index 0) */
 +    *a  = nbs->a;
 +
 +    *n = grid->cxy_ind[grid->ncx*grid->ncy]*grid->na_sc; /* cell index after the last column, times na_sc */
 +}
 +
 +void nbnxn_set_atomorder(nbnxn_search_t nbs)
 +{   /* Renumbers the atoms of grid 0 in grid order: the columns are
 +     * visited with cx as outer and cy as inner loop and, within each
 +     * column, the first grid->cxy_na[cxy] entries of nbs->a get
 +     * consecutive indices; nbs->cell gets the inverse mapping.
 +     */
 +    nbnxn_grid_t *grid;
 +    int ao,cx,cy,cxy,cz,j;
 +
 +    /* Set the atom order for the home cell (index 0) */
 +    grid = &nbs->grid[0];
 +
 +    ao = 0;
 +    for(cx=0; cx<grid->ncx; cx++)
 +    {
 +        for(cy=0; cy<grid->ncy; cy++)
 +        {
 +            cxy = cx*grid->ncy + cy;
 +            j   = grid->cxy_ind[cxy]*grid->na_sc; /* first entry of this column in nbs->a */
 +            for(cz=0; cz<grid->cxy_na[cxy]; cz++) /* cz counts the atoms in this column here */
 +            {
 +                nbs->a[j]     = ao;
 +                nbs->cell[ao] = j;
 +                ao++;
 +                j++;
 +            }
 +        }
 +    }
 +}
 +
 +/* Determines the cell range along one dimension that
 + * the bounding box b0 - b1 sees.
 + *
 + * The first cell is returned in cf and the last cell in cl.
 + * cf starts at the cell index of b0 relative to c0, not below 0, and
 + * is decreased while d2 plus the squared distance from b0 to the
 + * upper boundary of the cell below is smaller than r2.
 + * cl starts at the cell index of b1, not above nc-1, and is increased
 + * while d2 plus the squared distance from b1 to the lower boundary of
 + * the cell above is smaller than r2.
 + * s is used as the cell size; invs is presumably 1/s - confirm
 + * against the callers.
 + */
 +static void get_cell_range(real b0,real b1,
 +                           int nc,real c0,real s,real invs,
 +                           real d2,real r2,int *cf,int *cl)
 +{
 +    *cf = max((int)((b0 - c0)*invs),0);
 +
 +    while (*cf > 0 && d2 + sqr((b0 - c0) - (*cf-1+1)*s) < r2)
 +    {
 +        (*cf)--;
 +    }
 +
 +    *cl = min((int)((b1 - c0)*invs),nc-1);
 +    while (*cl < nc-1 && d2 + sqr((*cl+1)*s - (b1 - c0)) < r2)
 +    {
 +        (*cl)++;
 +    }
 +}
 +
 +/* Reference code calculating the distance^2 between two bounding boxes.
 + *
 + * The first box is given by the pairs bx0,bx1, by0,by1 and bz0,bz1,
 + * the second box by bb, which is read at the indices BBL_X, BBL_Y,
 + * BBL_Z, BBU_X, BBU_Y and BBU_Z.
 + * For each dimension the larger of the two differences is taken,
 + * negative values are replaced by 0 and the squares are summed.
 + */
 +static float box_dist2(float bx0,float bx1,float by0,
 +                       float by1,float bz0,float bz1,
 +                       const float *bb)
 +{
 +    float d2;
 +    float dl,dh,dm,dm0;
 +
 +    d2 = 0;
 +
 +    dl  = bx0 - bb[BBU_X];
 +    dh  = bb[BBL_X] - bx1;
 +    dm  = max(dl,dh);
 +    dm0 = max(dm,0);
 +    d2 += dm0*dm0;
 +
 +    dl  = by0 - bb[BBU_Y];
 +    dh  = bb[BBL_Y] - by1;
 +    dm  = max(dl,dh);
 +    dm0 = max(dm,0);
 +    d2 += dm0*dm0;
 +
 +    dl  = bz0 - bb[BBU_Z];
 +    dh  = bb[BBL_Z] - bz1;
 +    dm  = max(dl,dh);
 +    dm0 = max(dm,0);
 +    d2 += dm0*dm0;
 +
 +    return d2;
 +}
 +
 +/* Plain C code calculating the distance^2 between two bounding boxes.
 + *
 + * The i-box is entry si of bb_i_ci and the j-box is entry csj of
 + * bb_j_all; entries are NNBSBB_B floats apart.
 + * For each dimension the larger of the two lower-minus-upper
 + * differences is taken, negative values are replaced by 0 and
 + * the squares are summed.
 + */
 +static float subc_bb_dist2(int si,const float *bb_i_ci,
 +                           int csj,const float *bb_j_all)
 +{
 +    const float *bb_i,*bb_j;
 +    float d2;
 +    float dl,dh,dm,dm0;
 +
 +    bb_i = bb_i_ci  +  si*NNBSBB_B;
 +    bb_j = bb_j_all + csj*NNBSBB_B;
 +
 +    d2 = 0;
 +
 +    dl  = bb_i[BBL_X] - bb_j[BBU_X];
 +    dh  = bb_j[BBL_X] - bb_i[BBU_X];
 +    dm  = max(dl,dh);
 +    dm0 = max(dm,0);
 +    d2 += dm0*dm0;
 +
 +    dl  = bb_i[BBL_Y] - bb_j[BBU_Y];
 +    dh  = bb_j[BBL_Y] - bb_i[BBU_Y];
 +    dm  = max(dl,dh);
 +    dm0 = max(dm,0);
 +    d2 += dm0*dm0;
 +
 +    dl  = bb_i[BBL_Z] - bb_j[BBU_Z];
 +    dh  = bb_j[BBL_Z] - bb_i[BBU_Z];
 +    dm  = max(dl,dh);
 +    dm0 = max(dm,0);
 +    d2 += dm0*dm0;
 +
 +    return d2;
 +}
 +
 +#ifdef NBNXN_SEARCH_SSE
 +
 +/* SSE code for bb distance for bb format xyz0.
 + *
 + * The i-box is entry si of bb_i_ci and the j-box is entry csj of
 + * bb_j_all; entries are NNBSBB_B floats apart. Each box is read with
 + * two aligned loads, at offset 0 and at offset NNBSBB_C.
 + * Returns the sum over components 0,1,2 of the squared, non-negative
 + * differences. The parameter na_c is not used in this function.
 + * Without GMX_X86_SSE4_1 the sum is formed from a stored temporary,
 + * with it the SSE4.1 dot product instruction is used.
 + */
 +static float subc_bb_dist2_sse(int na_c,
 +                              int si,const float *bb_i_ci,
 +                              int csj,const float *bb_j_all)
 +{
 +    const float *bb_i,*bb_j;
 +
 +    __m128 bb_i_SSE0,bb_i_SSE1;
 +    __m128 bb_j_SSE0,bb_j_SSE1;
 +    __m128 dl_SSE;
 +    __m128 dh_SSE;
 +    __m128 dm_SSE;
 +    __m128 dm0_SSE;
 +    __m128 d2_SSE;
 +#ifndef GMX_X86_SSE4_1
 +    float d2_array[7],*d2_align;
 +
 +    d2_align = (float *)(((size_t)(d2_array+3)) & (~((size_t)15))); /* address of d2_array+3 rounded down to a multiple of 16 */
 +#else
 +    float d2;
 +#endif
 +
 +    bb_i = bb_i_ci  +  si*NNBSBB_B;
 +    bb_j = bb_j_all + csj*NNBSBB_B;
 +
 +    bb_i_SSE0 = _mm_load_ps(bb_i);
 +    bb_i_SSE1 = _mm_load_ps(bb_i+NNBSBB_C);
 +    bb_j_SSE0 = _mm_load_ps(bb_j);
 +    bb_j_SSE1 = _mm_load_ps(bb_j+NNBSBB_C);
 +
 +    dl_SSE    = _mm_sub_ps(bb_i_SSE0,bb_j_SSE1);
 +    dh_SSE    = _mm_sub_ps(bb_j_SSE0,bb_i_SSE1);
 +
 +    dm_SSE    = _mm_max_ps(dl_SSE,dh_SSE);
 +    dm0_SSE   = _mm_max_ps(dm_SSE,_mm_setzero_ps());
 +#ifndef GMX_X86_SSE4_1
 +    d2_SSE    = _mm_mul_ps(dm0_SSE,dm0_SSE);
 +
 +    _mm_store_ps(d2_align,d2_SSE);
 +
 +    return d2_align[0] + d2_align[1] + d2_align[2];
 +#else
 +    /* SSE4.1 dot product of components 0,1,2 */
 +    d2_SSE    = _mm_dp_ps(dm0_SSE,dm0_SSE,0x71);
 +
 +    _mm_store_ss(&d2,d2_SSE);
 +
 +    return d2;
 +#endif
 +}
 +
 +/* Calculate bb bounding distances of bb_i[si,...,si+3] and store them in d2.
 + *
 + * The expansion uses xi_l,yi_l,zi_l, xi_h,yi_h,zi_h, xj_l,yj_l,zj_l,
 + * xj_h,yj_h,zj_h and zero without declaring them, so these have to be
 + * declared where the macro is expanded; the xi/yi/zi values are loaded
 + * here from bb_i, the others are only read.
 + * The six loads start at offset si*NNBSBB_D*DIM in bb_i and are
 + * STRIDE_8BB floats apart. The four results are written with an
 + * aligned store to d2+si.
 + */
 +#define SUBC_BB_DIST2_SSE_XXXX_INNER(si,bb_i,d2) \
 +{                                                \
 +    int    shi;                                  \
 +                                                 \
 +    __m128 dx_0,dy_0,dz_0;                       \
 +    __m128 dx_1,dy_1,dz_1;                       \
 +                                                 \
 +    __m128 mx,my,mz;                             \
 +    __m128 m0x,m0y,m0z;                          \
 +                                                 \
 +    __m128 d2x,d2y,d2z;                          \
 +    __m128 d2s,d2t;                              \
 +                                                 \
 +    shi = si*NNBSBB_D*DIM;                       \
 +                                                 \
 +    xi_l = _mm_load_ps(bb_i+shi+0*STRIDE_8BB);   \
 +    yi_l = _mm_load_ps(bb_i+shi+1*STRIDE_8BB);   \
 +    zi_l = _mm_load_ps(bb_i+shi+2*STRIDE_8BB);   \
 +    xi_h = _mm_load_ps(bb_i+shi+3*STRIDE_8BB);   \
 +    yi_h = _mm_load_ps(bb_i+shi+4*STRIDE_8BB);   \
 +    zi_h = _mm_load_ps(bb_i+shi+5*STRIDE_8BB);   \
 +                                                 \
 +    dx_0 = _mm_sub_ps(xi_l,xj_h);                \
 +    dy_0 = _mm_sub_ps(yi_l,yj_h);                \
 +    dz_0 = _mm_sub_ps(zi_l,zj_h);                \
 +                                                 \
 +    dx_1 = _mm_sub_ps(xj_l,xi_h);                \
 +    dy_1 = _mm_sub_ps(yj_l,yi_h);                \
 +    dz_1 = _mm_sub_ps(zj_l,zi_h);                \
 +                                                 \
 +    mx   = _mm_max_ps(dx_0,dx_1);                \
 +    my   = _mm_max_ps(dy_0,dy_1);                \
 +    mz   = _mm_max_ps(dz_0,dz_1);                \
 +                                                 \
 +    m0x  = _mm_max_ps(mx,zero);                  \
 +    m0y  = _mm_max_ps(my,zero);                  \
 +    m0z  = _mm_max_ps(mz,zero);                  \
 +                                                 \
 +    d2x  = _mm_mul_ps(m0x,m0x);                  \
 +    d2y  = _mm_mul_ps(m0y,m0y);                  \
 +    d2z  = _mm_mul_ps(m0z,m0z);                  \
 +                                                 \
 +    d2s  = _mm_add_ps(d2x,d2y);                  \
 +    d2t  = _mm_add_ps(d2s,d2z);                  \
 +                                                 \
 +    _mm_store_ps(d2+si,d2t);                     \
 +}
 +
 +/* SSE code for nsi bb distances for bb format xxxxyyyyzzzz.
 + *
 + * The six bounds of the j-box are read from bb_j at multiples of
 + * STRIDE_8BB and each is broadcast to a full SSE register.
 + * SUBC_BB_DIST2_SSE_XXXX_INNER is then expanded for si=0 and,
 + * when STRIDE_8BB < nsi, once more for si=STRIDE_8BB.
 + * The results are written to d2.
 + */
 +static void subc_bb_dist2_sse_xxxx(const float *bb_j,
 +                                   int nsi,const float *bb_i,
 +                                   float *d2)
 +{
 +    __m128 xj_l,yj_l,zj_l;
 +    __m128 xj_h,yj_h,zj_h;
 +    __m128 xi_l,yi_l,zi_l;
 +    __m128 xi_h,yi_h,zi_h;
 +
 +    __m128 zero;
 +
 +    zero = _mm_setzero_ps();
 +
 +    xj_l = _mm_set1_ps(bb_j[0*STRIDE_8BB]);
 +    yj_l = _mm_set1_ps(bb_j[1*STRIDE_8BB]);
 +    zj_l = _mm_set1_ps(bb_j[2*STRIDE_8BB]);
 +    xj_h = _mm_set1_ps(bb_j[3*STRIDE_8BB]);
 +    yj_h = _mm_set1_ps(bb_j[4*STRIDE_8BB]);
 +    zj_h = _mm_set1_ps(bb_j[5*STRIDE_8BB]);
 +
 +    /* Here we "loop" over si (0,STRIDE_8BB) from 0 to nsi with step STRIDE_8BB.
 +     * But as we know the number of iterations is 1 or 2, we unroll manually.
 +     */
 +    SUBC_BB_DIST2_SSE_XXXX_INNER(0,bb_i,d2);
 +    if (STRIDE_8BB < nsi)
 +    {
 +        SUBC_BB_DIST2_SSE_XXXX_INNER(STRIDE_8BB,bb_i,d2);
 +    }
 +}
 +
 +#endif /* NBNXN_SEARCH_SSE */
 +
 +/* Plain C function which determines if any atom pair between two cells
 + * is within distance sqrt(rl2).
 + *
 + * All na_c atoms of i-cell si are compared with all na_c atoms of
 + * j-cell csj. The i-coordinates are read from x_i with DIM values per
 + * atom, the j-coordinates from x_j with stride values per atom.
 + * Returns TRUE at the first pair with a squared distance below rl2,
 + * FALSE when there is no such pair.
 + */
 +static gmx_bool subc_in_range_x(int na_c,
 +                                int si,const real *x_i,
 +                                int csj,int stride,const real *x_j,
 +                                real rl2)
 +{
 +    int  i,j,i0,j0;
 +    real d2;
 +
 +    for(i=0; i<na_c; i++)
 +    {
 +        i0 = (si*na_c + i)*DIM;
 +        for(j=0; j<na_c; j++)
 +        {
 +            j0 = (csj*na_c + j)*stride;
 +
 +            d2 = sqr(x_i[i0  ] - x_j[j0  ]) +
 +                 sqr(x_i[i0+1] - x_j[j0+1]) +
 +                 sqr(x_i[i0+2] - x_j[j0+2]);
 +
 +            if (d2 < rl2)
 +            {
 +                return TRUE;
 +            }
 +        }
 +    }
 +
 +    return FALSE;
 +}
 +
 +/* SSE function which determines if any atom pair between two cells,
 + * both with 8 atoms, is within distance sqrt(rl2).
 + *
 + * With NBNXN_SEARCH_SSE_SINGLE defined, the coordinates of i-cell si
 + * are loaded from x_i into six SSE registers and compared with two
 + * j-atoms of cell csj per iteration, read from x_j with stride values
 + * per atom. Returns TRUE as soon as any squared distance is below rl2
 + * and FALSE when the loop finishes without such a pair.
 + * Without NBNXN_SEARCH_SSE_SINGLE this calls gmx_incons and
 + * returns TRUE.
 + */
 +static gmx_bool subc_in_range_sse8(int na_c,
 +                                   int si,const real *x_i,
 +                                   int csj,int stride,const real *x_j,
 +                                   real rl2)
 +{
 +#ifdef NBNXN_SEARCH_SSE_SINGLE
 +    __m128 ix_SSE0,iy_SSE0,iz_SSE0;
 +    __m128 ix_SSE1,iy_SSE1,iz_SSE1;
 +
 +    __m128 rc2_SSE;
 +
 +    int na_c_sse;
 +    int j0,j1;
 +
 +    rc2_SSE   = _mm_set1_ps(rl2);
 +
 +    na_c_sse = NBNXN_GPU_CLUSTER_SIZE/STRIDE_8BB;
 +    ix_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+0)*STRIDE_8BB);
 +    iy_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+1)*STRIDE_8BB);
 +    iz_SSE0 = _mm_load_ps(x_i+(si*na_c_sse*DIM+2)*STRIDE_8BB);
 +    ix_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+3)*STRIDE_8BB);
 +    iy_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+4)*STRIDE_8BB);
 +    iz_SSE1 = _mm_load_ps(x_i+(si*na_c_sse*DIM+5)*STRIDE_8BB);
 +
 +    /* We loop from the outer to the inner particles to maximize
 +     * the chance that we find a pair in range quickly and return.
 +     * j0 runs up from the first and j1 down from the last j-atom,
 +     * until they meet.
 +     */
 +    j0 = csj*na_c;
 +    j1 = j0 + na_c - 1;
 +    while (j0 < j1)
 +    {
 +        __m128 jx0_SSE,jy0_SSE,jz0_SSE;
 +        __m128 jx1_SSE,jy1_SSE,jz1_SSE;
 +
 +        __m128 dx_SSE0,dy_SSE0,dz_SSE0;
 +        __m128 dx_SSE1,dy_SSE1,dz_SSE1;
 +        __m128 dx_SSE2,dy_SSE2,dz_SSE2;
 +        __m128 dx_SSE3,dy_SSE3,dz_SSE3;
 +
 +        __m128 rsq_SSE0;
 +        __m128 rsq_SSE1;
 +        __m128 rsq_SSE2;
 +        __m128 rsq_SSE3;
 +
 +        __m128 wco_SSE0;
 +        __m128 wco_SSE1;
 +        __m128 wco_SSE2;
 +        __m128 wco_SSE3;
 +        __m128 wco_any_SSE01,wco_any_SSE23,wco_any_SSE;
 +
 +        jx0_SSE = _mm_load1_ps(x_j+j0*stride+0);
 +        jy0_SSE = _mm_load1_ps(x_j+j0*stride+1);
 +        jz0_SSE = _mm_load1_ps(x_j+j0*stride+2);
 +
 +        jx1_SSE = _mm_load1_ps(x_j+j1*stride+0);
 +        jy1_SSE = _mm_load1_ps(x_j+j1*stride+1);
 +        jz1_SSE = _mm_load1_ps(x_j+j1*stride+2);
 +
 +        /* Calculate distance */
 +        dx_SSE0            = _mm_sub_ps(ix_SSE0,jx0_SSE);
 +        dy_SSE0            = _mm_sub_ps(iy_SSE0,jy0_SSE);
 +        dz_SSE0            = _mm_sub_ps(iz_SSE0,jz0_SSE);
 +        dx_SSE1            = _mm_sub_ps(ix_SSE1,jx0_SSE);
 +        dy_SSE1            = _mm_sub_ps(iy_SSE1,jy0_SSE);
 +        dz_SSE1            = _mm_sub_ps(iz_SSE1,jz0_SSE);
 +        dx_SSE2            = _mm_sub_ps(ix_SSE0,jx1_SSE);
 +        dy_SSE2            = _mm_sub_ps(iy_SSE0,jy1_SSE);
 +        dz_SSE2            = _mm_sub_ps(iz_SSE0,jz1_SSE);
 +        dx_SSE3            = _mm_sub_ps(ix_SSE1,jx1_SSE);
 +        dy_SSE3            = _mm_sub_ps(iy_SSE1,jy1_SSE);
 +        dz_SSE3            = _mm_sub_ps(iz_SSE1,jz1_SSE);
 +
 +        /* rsq = dx*dx+dy*dy+dz*dz */
 +        rsq_SSE0           = gmx_mm_calc_rsq_ps(dx_SSE0,dy_SSE0,dz_SSE0);
 +        rsq_SSE1           = gmx_mm_calc_rsq_ps(dx_SSE1,dy_SSE1,dz_SSE1);
 +        rsq_SSE2           = gmx_mm_calc_rsq_ps(dx_SSE2,dy_SSE2,dz_SSE2);
 +        rsq_SSE3           = gmx_mm_calc_rsq_ps(dx_SSE3,dy_SSE3,dz_SSE3);
 +
 +        wco_SSE0           = _mm_cmplt_ps(rsq_SSE0,rc2_SSE);
 +        wco_SSE1           = _mm_cmplt_ps(rsq_SSE1,rc2_SSE);
 +        wco_SSE2           = _mm_cmplt_ps(rsq_SSE2,rc2_SSE);
 +        wco_SSE3           = _mm_cmplt_ps(rsq_SSE3,rc2_SSE);
 +
 +        wco_any_SSE01      = _mm_or_ps(wco_SSE0,wco_SSE1);
 +        wco_any_SSE23      = _mm_or_ps(wco_SSE2,wco_SSE3);
 +        wco_any_SSE        = _mm_or_ps(wco_any_SSE01,wco_any_SSE23);
 +
 +        if (_mm_movemask_ps(wco_any_SSE))
 +        {
 +            return TRUE;
 +        }
 +
 +        j0++;
 +        j1--;
 +    }
 +    return FALSE;
 +
 +#else
 +    /* No SSE */
 +    gmx_incons("SSE function called without SSE support");
 +
 +    return TRUE;
 +#endif
 +}
 +
 +/* Returns the j sub-cell for index cj_ind.
 + * The entries are stored in groups of 4: cj_ind>>2 selects
 + * the element of nbl->cj4 and cj_ind & 3 the slot in its cj array.
 + */
 +static int nbl_cj(const nbnxn_pairlist_t *nbl,int cj_ind)
 +{
 +    return nbl->cj4[cj_ind>>2].cj[cj_ind & 3];
 +}
 +
 +/* Returns the i-interaction mask of the j sub-cell for index cj_ind.
 + * The mask is read from imei[0] of the nbl->cj4 element cj_ind>>2;
 + * other imei entries are not looked at.
 + */
 +static unsigned nbl_imask0(const nbnxn_pairlist_t *nbl,int cj_ind)
 +{
 +    return nbl->cj4[cj_ind>>2].imei[0].imask;
 +}
 +
 +/* Ensures there is enough space for extra additional exclusion masks.
 + * When nbl->nexcl+extra exceeds nbl->excl_nalloc, nbl->excl is
 + * reallocated through nbl->alloc and nbl->free with the first
 + * nbl->nexcl entries passed as the old size.
 + */
 +static void check_excl_space(nbnxn_pairlist_t *nbl,int extra)
 +{
 +    if (nbl->nexcl+extra > nbl->excl_nalloc)
 +    {
 +        nbl->excl_nalloc = over_alloc_small(nbl->nexcl+extra);
 +        nbnxn_realloc_void((void **)&nbl->excl,
 +                           nbl->nexcl*sizeof(*nbl->excl),
 +                           nbl->excl_nalloc*sizeof(*nbl->excl),
 +                           nbl->alloc,nbl->free);
 +    }
 +}
 +
 +/* Ensures there is enough space for ncell extra j-cells in the list.
 + * When nbl->ncj+ncell exceeds nbl->cj_nalloc, nbl->cj is reallocated
 + * through nbl->alloc and nbl->free with the first nbl->ncj entries
 + * passed as the old size.
 + */
 +static void check_subcell_list_space_simple(nbnxn_pairlist_t *nbl,
 +                                            int ncell)
 +{
 +    int cj_max;
 +
 +    cj_max = nbl->ncj + ncell;
 +
 +    if (cj_max > nbl->cj_nalloc)
 +    {
 +        nbl->cj_nalloc = over_alloc_small(cj_max);
 +        nbnxn_realloc_void((void **)&nbl->cj,
 +                           nbl->ncj*sizeof(*nbl->cj),
 +                           nbl->cj_nalloc*sizeof(*nbl->cj),
 +                           nbl->alloc,nbl->free);
 +    }
 +}
 +
 +/* Ensures there is enough space for the j-subcells of nsupercell extra
 + * super-cells in the list.
 + * nbl->cj4 is reallocated when too small, with the first
 + * nbl->work->cj4_init entries passed as the old size. All entries from
 + * nbl->work->cj4_init up to the new maximum get imask 0 and
 + * excl_ind 0 for each of the NWARP imei elements, after which
 + * nbl->work->cj4_init is set to the new maximum.
 + * The macros NWARP and WARP_SIZE are defined inside this function and
 + * are not undefined afterwards.
 + */
 +static void check_subcell_list_space_supersub(nbnxn_pairlist_t *nbl,
 +                                              int nsupercell)
 +{
 +    int ncj4_max,j4,j,w,t; /* NOTE(review): j and t are not used in this function */
 +
 +#define NWARP       2
 +#define WARP_SIZE  32
 +
 +    /* We can have maximally nsupercell*GPU_NSUBCELL sj lists */
 +    /* We can store 4 j-subcell - i-supercell pairs in one struct.
 +     * since we round down, we need one extra entry.
 +     */
 +    ncj4_max = ((nbl->work->cj_ind + nsupercell*GPU_NSUBCELL + 4-1) >> 2);
 +
 +    if (ncj4_max > nbl->cj4_nalloc)
 +    {
 +        nbl->cj4_nalloc = over_alloc_small(ncj4_max);
 +        nbnxn_realloc_void((void **)&nbl->cj4,
 +                           nbl->work->cj4_init*sizeof(*nbl->cj4),
 +                           nbl->cj4_nalloc*sizeof(*nbl->cj4),
 +                           nbl->alloc,nbl->free);
 +    }
 +
 +    if (ncj4_max > nbl->work->cj4_init)
 +    {
 +        for(j4=nbl->work->cj4_init; j4<ncj4_max; j4++)
 +        {
 +            /* No i-subcells and no excl's in the list initially */
 +            for(w=0; w<NWARP; w++)
 +            {
 +                nbl->cj4[j4].imei[w].imask    = 0U;
 +                nbl->cj4[j4].imei[w].excl_ind = 0;
 +
 +            }
 +        }
 +        nbl->work->cj4_init = ncj4_max;
 +    }
 +}
 +
 +/* Set all excl masks for one GPU warp no exclusions */
 +static void set_no_excls(nbnxn_excl_t *excl)
 +{
 +    int t;
 +
 +    for(t=0; t<WARP_SIZE; t++)
 +    {
 +        /* Turn all interaction bits on */
 +        excl->pair[t] = NBNXN_INT_MASK_ALL;
 +    }
 +}
 +
 +/* Initializes a single nbnxn_pairlist_t data structure */
 +static void nbnxn_init_pairlist(nbnxn_pairlist_t *nbl,
 +                                gmx_bool bSimple,
 +                                nbnxn_alloc_t *alloc,
 +                                nbnxn_free_t  *free)
 +{
 +    if (alloc == NULL)
 +    {
 +        nbl->alloc = nbnxn_alloc_aligned;
 +    }
 +    else
 +    {
 +        nbl->alloc = alloc;
 +    }
 +    if (free == NULL)
 +    {
 +        nbl->free = nbnxn_free_aligned;
 +    }
 +    else
 +    {
 +        nbl->free = free;
 +    }
 +
 +    nbl->bSimple     = bSimple;
 +    nbl->na_sc       = 0;
 +    nbl->na_ci       = 0;
 +    nbl->na_cj       = 0;
 +    nbl->nci         = 0;
 +    nbl->ci          = NULL;
 +    nbl->ci_nalloc   = 0;
 +    nbl->ncj         = 0;
 +    nbl->cj          = NULL;
 +    nbl->cj_nalloc   = 0;
 +    nbl->ncj4        = 0;
 +    /* We need one element extra in sj, so alloc initially with 1 */
 +    nbl->cj4_nalloc  = 0;
 +    nbl->cj4         = NULL;
 +    nbl->nci_tot     = 0;
 +
 +    if (!nbl->bSimple)
 +    {
 +        nbl->excl        = NULL;
 +        nbl->excl_nalloc = 0;
 +        nbl->nexcl       = 0;
 +        check_excl_space(nbl,1);
 +        nbl->nexcl       = 1;
 +        set_no_excls(&nbl->excl[0]);
 +    }
 +
 +    snew(nbl->work,1);
 +#ifdef NBNXN_BBXXXX
 +    snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL/STRIDE_8BB*NNBSBB_XXXX,16);
 +#else
 +    snew_aligned(nbl->work->bb_ci,GPU_NSUBCELL*NNBSBB_B,16);
 +#endif
 +    snew_aligned(nbl->work->x_ci,NBNXN_NA_SC_MAX*DIM,16);
 +#ifdef NBNXN_SEARCH_SSE
 +    snew_aligned(nbl->work->x_ci_x86_simd128,1,16);
 +#ifdef GMX_X86_AVX_256
 +    snew_aligned(nbl->work->x_ci_x86_simd256,1,32);
 +#endif
 +#endif
 +    snew_aligned(nbl->work->d2,GPU_NSUBCELL,16);
 +}
 +/* Initializes the pair-list set nbl_list.
 + * Stores bSimple and bCombined, takes the number of lists nnbl from
 + * gmx_omp_nthreads_get(emntNonbonded) and allocates and initializes
 + * nnbl pair lists. alloc and free are only passed on to list 0,
 + * all other lists are initialized with NULL for both.
 + * Calls gmx_fatal when the lists are not combined and nnbl is larger
 + * than NBNXN_BUFFERFLAG_MAX_THREADS.
 + */
 +void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
 +                             gmx_bool bSimple, gmx_bool bCombined,
 +                             nbnxn_alloc_t *alloc,
 +                             nbnxn_free_t  *free)
 +{
 +    int i;
 +
 +    nbl_list->bSimple   = bSimple;
 +    nbl_list->bCombined = bCombined;
 +
 +    nbl_list->nnbl = gmx_omp_nthreads_get(emntNonbonded);
 +
 +    if (!nbl_list->bCombined &&
-                   nbl_list->nnbl,NBNXN_CELLBLOCK_MAX_THREADS,NBNXN_CELLBLOCK_MAX_THREADS);
++        nbl_list->nnbl > NBNXN_BUFFERFLAG_MAX_THREADS)
 +    {
 +        gmx_fatal(FARGS,"%d OpenMP threads were requested. Since the non-bonded force buffer reduction is prohibitively slow with more than %d threads, we do not allow this. Use %d or less OpenMP threads.",
-         bb_ci[BBL_X] = bb[ia+BBL_X] + shx;
-         bb_ci[BBL_Y] = bb[ia+BBL_Y] + shy;
-         bb_ci[BBL_Z] = bb[ia+BBL_Z] + shz;
-         bb_ci[BBU_X] = bb[ia+BBU_X] + shx;
-         bb_ci[BBU_Y] = bb[ia+BBU_Y] + shy;
-         bb_ci[BBU_Z] = bb[ia+BBU_Z] + shz;
++                  nbl_list->nnbl,NBNXN_BUFFERFLAG_MAX_THREADS,NBNXN_BUFFERFLAG_MAX_THREADS);
 +    }
 +
 +    snew(nbl_list->nbl,nbl_list->nnbl);
 +    /* Execute in order to avoid memory interleaving between threads.
 +     * The loop runs with nnbl threads and a static schedule.
 +     */
 +#pragma omp parallel for num_threads(nbl_list->nnbl) schedule(static)
 +    for(i=0; i<nbl_list->nnbl; i++)
 +    {
 +        /* Allocate the nblist data structure locally on each thread
 +         * to optimize memory access for NUMA architectures.
 +         */
 +        snew(nbl_list->nbl[i],1);
 +
 +        /* Only list 0 is used on the GPU, use normal allocation for i>0.
 +         * Passing NULL makes nbnxn_init_pairlist pick its default
 +         * allocation and free functions.
 +         */
 +        if (i == 0)
 +        {
 +            nbnxn_init_pairlist(nbl_list->nbl[i],nbl_list->bSimple,alloc,free);
 +        }
 +        else
 +        {
 +            nbnxn_init_pairlist(nbl_list->nbl[i],nbl_list->bSimple,NULL,NULL);
 +        }
 +    }
 +}
 +
 +/* Print statistics of a pair list, used for debug output.
 + * Writes to fp: the list sizes nci and ncj, averages of ncj per cell
 + * computed with grid 0 of nbs, the list radius rl and the box of nbs,
 + * the number of j-entries counted as carrying exclusions and
 + * the number of j-entries per shift index.
 + * The divisions by nci, ncj and grid->nc are not guarded against zero.
 + */
 +static void print_nblist_statistics_simple(FILE *fp,const nbnxn_pairlist_t *nbl,
 +                                           const nbnxn_search_t nbs,real rl)
 +{
 +    const nbnxn_grid_t *grid;
 +    int cs[SHIFTS];
 +    int s,i,j;
 +    int npexcl;
 +
 +    /* This code only produces correct statistics with domain decomposition.
 +     * Only the first grid of nbs is used for the normalization below.
 +     */
 +    grid = &nbs->grid[0];
 +
 +    fprintf(fp,"nbl nci %d ncj %d\n",
 +            nbl->nci,nbl->ncj);
 +    fprintf(fp,"nbl na_sc %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
 +            nbl->na_sc,rl,nbl->ncj,nbl->ncj/(double)grid->nc,
 +            nbl->ncj/(double)grid->nc*grid->na_sc,
 +            nbl->ncj/(double)grid->nc*grid->na_sc/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nc*grid->na_sc/det(nbs->box)));
 +
 +    fprintf(fp,"nbl average j cell list length %.1f\n",
 +            0.25*nbl->ncj/(double)nbl->nci); /* NOTE(review): the factor 0.25
 +                                              * matches the supersub variant,
 +                                              * check if it is intended here */
 +
 +    for(s=0; s<SHIFTS; s++)
 +    {
 +        cs[s] = 0;
 +    }
 +    npexcl = 0;
 +    for(i=0; i<nbl->nci; i++)
 +    {
 +        cs[nbl->ci[i].shift & NBNXN_CI_SHIFT] +=
 +            nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start;
 +
 +        j = nbl->ci[i].cj_ind_start; /* Only the leading run of j-entries
 +                                      * with a mask unequal to
 +                                      * NBNXN_INT_MASK_ALL is counted;
 +                                      * presumably the j-list is sorted on
 +                                      * exclusions at this point - confirm */
 +        while (j < nbl->ci[i].cj_ind_end &&
 +               nbl->cj[j].excl != NBNXN_INT_MASK_ALL)
 +        {
 +            npexcl++;
 +            j++;
 +        }
 +    }
 +    fprintf(fp,"nbl cell pairs, total: %d excl: %d %.1f%%\n",
 +            nbl->ncj,npexcl,100*npexcl/(double)nbl->ncj);
 +    for(s=0; s<SHIFTS; s++)
 +    {
 +        if (cs[s] > 0)
 +        {
 +            fprintf(fp,"nbl shift %2d ncj %3d\n",s,cs[s]);
 +        }
 +    }
 +}
 +
 +/* Print statistics of a super/sub-cell pair list, used for debug output.
 + * Writes to fp: the list sizes nsci, ncj4, nci_tot and nexcl, averages of
 + * nci_tot per sub-cell computed with grid 0 of nbs, the list radius rl
 + * and the box of nbs, average list lengths and a histogram c[b] of
 + * the number of j-entries that have b i-subcell bits set in the
 + * imei[0] interaction mask.
 + * The divisions by nsci, ncj4 and grid->nsubc_tot are not guarded
 + * against zero.
 + */
 +static void print_nblist_statistics_supersub(FILE *fp,const nbnxn_pairlist_t *nbl,
 +                                             const nbnxn_search_t nbs,real rl)
 +{
 +    const nbnxn_grid_t *grid;
 +    int i,j4,j,si,b;
 +    int c[GPU_NSUBCELL+1];
 +
 +    /* This code only produces correct statistics with domain decomposition.
 +     * Only the first grid of nbs is used for the normalization below.
 +     */
 +    grid = &nbs->grid[0];
 +
 +    fprintf(fp,"nbl nsci %d ncj4 %d nsi %d excl4 %d\n",
 +            nbl->nsci,nbl->ncj4,nbl->nci_tot,nbl->nexcl);
 +    fprintf(fp,"nbl na_c %d rl %g ncp %d per cell %.1f atoms %.1f ratio %.2f\n",
 +            nbl->na_ci,rl,nbl->nci_tot,nbl->nci_tot/(double)grid->nsubc_tot,
 +            nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c,
 +            nbl->nci_tot/(double)grid->nsubc_tot*grid->na_c/(0.5*4.0/3.0*M_PI*rl*rl*rl*grid->nsubc_tot*grid->na_c/det(nbs->box)));
 +
 +    fprintf(fp,"nbl average j super cell list length %.1f\n",
 +            0.25*nbl->ncj4/(double)nbl->nsci);
 +    fprintf(fp,"nbl average i sub cell list length %.1f\n",
 +            nbl->nci_tot/(0.25*nbl->ncj4));
 +
 +    for(si=0; si<=GPU_NSUBCELL; si++)
 +    {
 +        c[si] = 0;
 +    }
 +    for(i=0; i<nbl->nsci; i++)
 +    {
 +        for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
 +        {
 +            for(j=0; j<4; j++) /* the 4 j-slots of one cj4 entry */
 +            {
 +                b = 0; /* number of i-subcells flagged for j-slot j */
 +                for(si=0; si<GPU_NSUBCELL; si++)
 +                {
 +                    if (nbl->cj4[j4].imei[0].imask & (1U << (j*GPU_NSUBCELL + si)))
 +                    {
 +                        b++;
 +                    }
 +                }
 +                c[b]++;
 +            }
 +        }
 +    }
 +    for(b=0; b<=GPU_NSUBCELL; b++)
 +    {
 +        fprintf(fp,"nbl j-list #i-subcell %d %7d %4.1f\n",
 +                b,c[b],100.0*c[b]/(double)(nbl->ncj4*NBNXN_GPU_JGROUP_SIZE));
 +    }
 +}
 +
 +/* Print the full pair list, used for debug output */
 +static void print_supersub_nsp(const char *fn,
 +                               const nbnxn_pairlist_t *nbl,
 +                               int iloc)
 +{
 +    char buf[STRLEN];
 +    FILE *fp;
 +    int i,nsp,j4,p;
 +
 +    sprintf(buf,"%s_%s.xvg",fn,NONLOCAL_I(iloc) ? "nl" : "l");
 +    fp = ffopen(buf,"w");
 +
 +    for(i=0; i<nbl->nci; i++)
 +    {
 +        nsp = 0;
 +        for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
 +        {
 +            for(p=0; p<NBNXN_GPU_JGROUP_SIZE*GPU_NSUBCELL; p++)
 +            {
 +                nsp += (nbl->cj4[j4].imei[0].imask >> p) & 1;
 +            }
 +        }
 +        fprintf(fp,"%4d %3d %3d\n",
 +                i,
 +                nsp,
 +                nbl->sci[i].cj4_ind_end-nbl->sci[i].cj4_ind_start);
 +    }
 +
 +    fclose(fp);
 +}
 +
 +/* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp */
 +static void low_get_nbl_exclusions(nbnxn_pairlist_t *nbl,int cj4,
 +                                   int warp,nbnxn_excl_t **excl)
 +{
 +    if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
 +    {
 +        /* No exclusions set, make a new list entry */
 +        nbl->cj4[cj4].imei[warp].excl_ind = nbl->nexcl;
 +        nbl->nexcl++;
 +        *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
 +        set_no_excls(*excl);
 +    }
 +    else
 +    {
 +        /* We already have some exclusions, new ones can be added to the list */
 +        *excl = &nbl->excl[nbl->cj4[cj4].imei[warp].excl_ind];
 +    }
 +}
 +
 +/* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp,
 + * allocates extra memory, if necessary.
 + */
 +static void get_nbl_exclusions_1(nbnxn_pairlist_t *nbl,int cj4,
 +                                 int warp,nbnxn_excl_t **excl)
 +{
 +    if (nbl->cj4[cj4].imei[warp].excl_ind == 0)
 +    {
 +        /* We need to make a new list entry, check if we have space */
 +        check_excl_space(nbl,1);
 +    }
 +    low_get_nbl_exclusions(nbl,cj4,warp,excl);
 +}
 +
 +/* Returns pointers to the exclusion mask for cj4-unit cj4 for both warps,
 + * allocates extra memory, if necessary.
 + */
 +static void get_nbl_exclusions_2(nbnxn_pairlist_t *nbl,int cj4,
 +                                 nbnxn_excl_t **excl_w0,
 +                                 nbnxn_excl_t **excl_w1)
 +{
 +    /* Check for space we might need */
 +    check_excl_space(nbl,2);
 +
 +    low_get_nbl_exclusions(nbl,cj4,0,excl_w0);
 +    low_get_nbl_exclusions(nbl,cj4,1,excl_w1);
 +}
 +
 +/* Sets the self exclusions i=j and pair exclusions i>j */
 +static void set_self_and_newton_excls_supersub(nbnxn_pairlist_t *nbl,
 +                                               int cj4_ind,int sj_offset,
 +                                               int si)
 +{
 +    nbnxn_excl_t *excl[2];
 +    int  ei,ej,w;
 +
 +    /* Here we only set the set self and double pair exclusions */
 +
 +    get_nbl_exclusions_2(nbl,cj4_ind,&excl[0],&excl[1]);
 +
 +    /* Only minor < major bits set */
 +    for(ej=0; ej<nbl->na_ci; ej++)
 +    {
 +        w = (ej>>2);
 +        for(ei=ej; ei<nbl->na_ci; ei++)
 +        {
 +            excl[w]->pair[(ej&(4-1))*nbl->na_ci+ei] &=
 +                ~(1U << (sj_offset*GPU_NSUBCELL+si));
 +        }
 +    }
 +}
 +
 +/* Returns a diagonal or off-diagonal interaction mask for plain C lists */
 +static unsigned int get_imask(gmx_bool rdiag,int ci,int cj)
 +{
 +    return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
 +}
 +
 +#ifdef NBNXN_SEARCH_SSE
 +/* Returns a diagonal or off-diagonal interaction mask for SIMD128 lists */
 +static unsigned int get_imask_x86_simd128(gmx_bool rdiag,int ci,int cj)
 +{
 +#ifndef GMX_DOUBLE /* cj-size = 4 */
 +    return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
 +#else              /* cj-size = 2 */
 +    return (rdiag && ci*2 == cj ? NBNXN_INT_MASK_DIAG_J2_0 :
 +            (rdiag && ci*2+1 == cj ? NBNXN_INT_MASK_DIAG_J2_1 :
 +             NBNXN_INT_MASK_ALL));
 +#endif
 +}
 +
 +#ifdef GMX_X86_AVX_256
 +/* Returns a diagonal or off-diagonal interaction mask for SIMD256 lists */
 +static unsigned int get_imask_x86_simd256(gmx_bool rdiag,int ci,int cj)
 +{
 +#ifndef GMX_DOUBLE /* cj-size = 8 */
 +    return (rdiag && ci == cj*2 ? NBNXN_INT_MASK_DIAG_J8_0 :
 +            (rdiag && ci == cj*2+1 ? NBNXN_INT_MASK_DIAG_J8_1 :
 +             NBNXN_INT_MASK_ALL));
 +#else              /* cj-size = 2 */
 +    return (rdiag && ci == cj ? NBNXN_INT_MASK_DIAG : NBNXN_INT_MASK_ALL);
 +#endif
 +}
 +#endif
 +#endif /* NBNXN_SEARCH_SSE */
 +
 +/* Plain C code for making a pair list of cell ci vs cell cjf-cjl.
 + * Checks bounding box distances and possibly atom pair distances.
 + *
 + * The range cjf..cjl (inclusive) is first shrunk from the front and then
 + * from the back until a j-cell is found that is in range; all j-cells
 + * left in between are appended to nbl->cj without further checks
 + * and cj_ind_end of the ci entry at index nbl->nci is updated.
 + * A j-cell is in range when its bounding box distance squared to
 + * nbl->work->bb_ci is below rbb2, or below rl2 with at least one atom
 + * pair within rl2, using the i-coordinates in nbl->work->x_ci and
 + * the j-coordinates in x_j.
 + * With remove_sub_diag set, an entry with cj equal to ci gets the
 + * diagonal interaction mask.
 + * *ndistc is increased by a count of the distance checks.
 + * No space check is done on nbl->cj in this function.
 + */
 +static void make_cluster_list_simple(const nbnxn_grid_t *gridj,
 +                                     nbnxn_pairlist_t *nbl,
 +                                     int ci,int cjf,int cjl,
 +                                     gmx_bool remove_sub_diag,
 +                                     const real *x_j,
 +                                     real rl2,float rbb2,
 +                                     int *ndistc)
 +{
 +    const nbnxn_list_work_t *work;
 +
 +    const float *bb_ci;
 +    const real  *x_ci;
 +
 +    gmx_bool   InRange;
 +    real       d2;
 +    int        cjf_gl,cjl_gl,cj;
 +
 +    work = nbl->work; /* NOTE(review): work is not read below */
 +
 +    bb_ci = nbl->work->bb_ci;
 +    x_ci  = nbl->work->x_ci;
 +
 +    /* Move the front cjf up to the first j-cell that is in range */
 +    InRange = FALSE;
 +    while (!InRange && cjf <= cjl)
 +    {
 +        d2 = subc_bb_dist2(0,bb_ci,cjf,gridj->bb);
 +        *ndistc += 2;
 +
 +        /* Check if the distance is within the distance where
 +         * we use only the bounding box distance rbb,
 +         * or within the cut-off and there is at least one atom pair
 +         * within the cut-off.
 +         */
 +        if (d2 < rbb2)
 +        {
 +            InRange = TRUE;
 +        }
 +        else if (d2 < rl2)
 +        {
 +            int i,j;
 +
 +            cjf_gl = gridj->cell0 + cjf; /* j-cell index including cell0 */
 +            for(i=0; i<NBNXN_CPU_CLUSTER_I_SIZE && !InRange; i++)
 +            {
 +                for(j=0; j<NBNXN_CPU_CLUSTER_I_SIZE; j++) /* the i-size is used for j as well */
 +                {
 +                    InRange = InRange ||
 +                        (sqr(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+XX]) +
 +                         sqr(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+YY]) +
 +                         sqr(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjf_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+ZZ]) < rl2);
 +                }
 +            }
 +            *ndistc += NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
 +        }
 +        if (!InRange)
 +        {
 +            cjf++;
 +        }
 +    }
 +    if (!InRange)
 +    {
 +        return; /* no j-cell in cjf..cjl is in range */
 +    }
 +
 +    /* Move the back cjl down to the last j-cell that is in range;
 +     * cjf itself is known to be in range and is not checked again.
 +     */
 +    InRange = FALSE;
 +    while (!InRange && cjl > cjf)
 +    {
 +        d2 = subc_bb_dist2(0,bb_ci,cjl,gridj->bb);
 +        *ndistc += 2;
 +
 +        /* Check if the distance is within the distance where
 +         * we use only the bounding box distance rbb,
 +         * or within the cut-off and there is at least one atom pair
 +         * within the cut-off.
 +         */
 +        if (d2 < rbb2)
 +        {
 +            InRange = TRUE;
 +        }
 +        else if (d2 < rl2)
 +        {
 +            int i,j;
 +
 +            cjl_gl = gridj->cell0 + cjl; /* j-cell index including cell0 */
 +            for(i=0; i<NBNXN_CPU_CLUSTER_I_SIZE && !InRange; i++)
 +            {
 +                for(j=0; j<NBNXN_CPU_CLUSTER_I_SIZE; j++) /* the i-size is used for j as well */
 +                {
 +                    InRange = InRange ||
 +                        (sqr(x_ci[i*STRIDE_XYZ+XX] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+XX]) +
 +                         sqr(x_ci[i*STRIDE_XYZ+YY] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+YY]) +
 +                         sqr(x_ci[i*STRIDE_XYZ+ZZ] - x_j[(cjl_gl*NBNXN_CPU_CLUSTER_I_SIZE+j)*STRIDE_XYZ+ZZ]) < rl2);
 +                }
 +            }
 +            *ndistc += NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
 +        }
 +        if (!InRange)
 +        {
 +            cjl--;
 +        }
 +    }
 +
 +    if (cjf <= cjl)
 +    {
 +        for(cj=cjf; cj<=cjl; cj++)
 +        {
 +            /* Store cj and the interaction mask.
 +             * The stored cj includes the grid offset cell0, the mask
 +             * is determined with the cj without that offset.
 +             */
 +            nbl->cj[nbl->ncj].cj   = gridj->cell0 + cj;
 +            nbl->cj[nbl->ncj].excl = get_imask(remove_sub_diag,ci,cj);
 +            nbl->ncj++;
 +        }
 +        /* Increase the closing index in i super-cell list */
 +        nbl->ci[nbl->nci].cj_ind_end = nbl->ncj;
 +    }
 +}
 +
 +#ifdef NBNXN_SEARCH_SSE
 +/* Include nbnxn_search_x86_simd.h with the 128-bit SIMD setup
 + * (GMX_MM128_HERE defined, STRIDE_S set to PACK_X4); presumably this
 + * provides make_cluster_list_x86_simd128 - the header is not shown here.
 + */
 +#define GMX_MM128_HERE
 +#include "gmx_x86_simd_macros.h"
 +#define STRIDE_S  PACK_X4
 +#include "nbnxn_search_x86_simd.h"
 +#undef STRIDE_S
 +#undef GMX_MM128_HERE
 +#ifdef GMX_X86_AVX_256
 +/* Include the same header a second time with the 256-bit SIMD setup
 + * (GMX_MM256_HERE defined, STRIDE_S set to GMX_X86_SIMD_WIDTH_HERE);
 + * presumably this provides make_cluster_list_x86_simd256.
 + */
 +#define GMX_MM256_HERE
 +#include "gmx_x86_simd_macros.h"
 +#define STRIDE_S  GMX_X86_SIMD_WIDTH_HERE
 +#include "nbnxn_search_x86_simd.h"
 +#undef STRIDE_S
 +#undef GMX_MM256_HERE
 +#endif
 +#endif
 +
 +/* Plain C or SSE code for making a pair list of super-cell sci vs scj.
 + * Checks bounding box distances and possibly atom pair distances.
 + *
 + * For every j-subcell cjo of scj a bit mask is built with one bit per
 + * i-subcell ci whose bounding box distance squared (to the boxes in
 + * nbl->work->bb_ci) is below rl2. With sci_equals_scj only i-subcells
 + * ci <= cjo are considered. When exactly one i-subcell passed and
 + * its distance is not below rbb2, an atom pair distance check decides
 + * if it is kept (PRUNE_LIST_CPU_ONE, always defined here).
 + * A j-subcell with at least one i-subcell left is kept at flat index
 + * nbl->work->cj_ind, which is then advanced; its mask is or'ed into
 + * the masks of all NWARP warps, nci_tot is increased and cj4_ind_end
 + * of the sci entry at index nbl->nsci is updated.
 + * No space check is done on nbl->cj4 in this function.
 + * NOTE(review): nbs is not used in this function.
 + */
 +static void make_cluster_list_supersub(const nbnxn_search_t nbs,
 +                                       const nbnxn_grid_t *gridi,
 +                                       const nbnxn_grid_t *gridj,
 +                                       nbnxn_pairlist_t *nbl,
 +                                       int sci,int scj,
 +                                       gmx_bool sci_equals_scj,
 +                                       int stride,const real *x,
 +                                       real rl2,float rbb2,
 +                                       int *ndistc)
 +{
 +    int  na_c;
 +    int  npair;
 +    int  cjo,ci1,ci,cj,cj_gl;
 +    int  cj4_ind,cj_offset;
 +    unsigned imask;
 +    nbnxn_cj4_t *cj4;
 +    const float *bb_ci;
 +    const real *x_ci;
 +    float *d2l,d2;
 +    int  w;
 +#define PRUNE_LIST_CPU_ONE
 +#ifdef PRUNE_LIST_CPU_ONE
 +    int  ci_last=-1;
 +#endif
 +
 +    d2l = nbl->work->d2;
 +
 +    bb_ci = nbl->work->bb_ci;
 +    x_ci  = nbl->work->x_ci;
 +
 +    na_c = gridj->na_c;
 +
 +    for(cjo=0; cjo<gridj->nsubc[scj]; cjo++)
 +    {
 +        cj4_ind   = (nbl->work->cj_ind >> 2);
 +        cj_offset = nbl->work->cj_ind - cj4_ind*NBNXN_GPU_JGROUP_SIZE;
 +        cj4       = &nbl->cj4[cj4_ind];
 +
 +        cj = scj*GPU_NSUBCELL + cjo;
 +
 +        cj_gl = gridj->cell0*GPU_NSUBCELL + cj;
 +
 +        /* Initialize this j-subcell i-subcell list.
 +         * The slot is written before we know if it will be kept:
 +         * cj_ind is only advanced below when npair > 0, otherwise
 +         * the next j-subcell overwrites this slot.
 +         */
 +        cj4->cj[cj_offset] = cj_gl;
 +        imask              = 0;
 +
 +        if (sci_equals_scj)
 +        {
 +            ci1 = cjo + 1; /* only i-subcells up to and including cjo */
 +        }
 +        else
 +        {
 +            ci1 = gridi->nsubc[sci];
 +        }
 +
 +#ifdef NBNXN_BBXXXX
 +        /* Determine all ci1 bb distances in one call with SSE */
 +        subc_bb_dist2_sse_xxxx(gridj->bb+(cj>>STRIDE_8BB_2LOG)*NNBSBB_XXXX+(cj & (STRIDE_8BB-1)),
 +                               ci1,bb_ci,d2l);
 +        *ndistc += na_c*2;
 +#endif
 +
 +        npair = 0;
 +        /* We use a fixed upper-bound instead of ci1 to help optimization */
 +        for(ci=0; ci<GPU_NSUBCELL; ci++)
 +        {
 +            if (ci == ci1)
 +            {
 +                break;
 +            }
 +
 +#ifndef NBNXN_BBXXXX
 +            /* Determine the bb distance between ci and cj */
 +            d2l[ci] = subc_bb_dist2(ci,bb_ci,cj,gridj->bb);
 +            *ndistc += 2;
 +#endif
 +            d2 = d2l[ci];
 +
 +#ifdef PRUNE_LIST_CPU_ALL
 +            /* Check if the distance is within the distance where
 +             * we use only the bounding box distance rbb,
 +             * or within the cut-off and there is at least one atom pair
 +             * within the cut-off. This check is very costly.
 +             */
 +            *ndistc += na_c*na_c;
 +            if (d2 < rbb2 ||
 +                (d2 < rl2 && subc_in_range_x(na_c,ci,x_ci,cj_gl,stride,x,rl2)))
 +#else
 +            /* Check if the distance between the two bounding boxes
 +             * is within the pair-list cut-off.
 +             */
 +            if (d2 < rl2)
 +#endif
 +            {
 +                /* Flag this i-subcell to be taken into account */
 +                imask |= (1U << (cj_offset*GPU_NSUBCELL+ci));
 +
 +#ifdef PRUNE_LIST_CPU_ONE
 +                ci_last = ci;
 +#endif
 +
 +                npair++;
 +            }
 +        }
 +
 +#ifdef PRUNE_LIST_CPU_ONE
 +        /* If we only found 1 pair, check if any atoms are actually
 +         * within the cut-off, so we could get rid of it.
 +         * With npair == 1, ci_last was set in the loop above
 +         * for this j-subcell.
 +         */
 +        if (npair == 1 && d2l[ci_last] >= rbb2)
 +        {
 +            /* Avoid using function pointers here, as it's slower */
 +            if (
 +#ifdef NBNXN_8BB_SSE
 +                !subc_in_range_sse8
 +#else
 +                !subc_in_range_x
 +#endif
 +                                (na_c,ci_last,x_ci,cj_gl,stride,x,rl2))
 +            {
 +                imask &= ~(1U << (cj_offset*GPU_NSUBCELL+ci_last));
 +                npair--;
 +            }
 +        }
 +#endif
 +
 +        if (npair > 0)
 +        {
 +            /* We have a useful sj entry, close it now */
 +
 +            /* Set the exclusions for the ci== sj entry.
 +             * Here we don't bother to check if this entry is actually flagged,
 +             * as it will nearly always be in the list.
 +             */
 +            if (sci_equals_scj)
 +            {
 +                set_self_and_newton_excls_supersub(nbl,cj4_ind,cj_offset,cjo);
 +            }
 +
 +            /* Copy the cluster interaction mask to the list */
 +            for(w=0; w<NWARP; w++)
 +            {
 +                cj4->imei[w].imask |= imask;
 +            }
 +
 +            nbl->work->cj_ind++;
 +
 +            /* Keep the count */
 +            nbl->nci_tot += npair;
 +
 +            /* Increase the closing index in i super-cell list */
 +            nbl->sci[nbl->nsci].cj4_ind_end = ((nbl->work->cj_ind+4-1)>>2);
 +        }
 +    }
 +}
 +
 +/* Set all atom-pair exclusions from the topology stored in excl
 + * as masks in the pair-list for simple list i-entry nbl_ci.
 + *
 + * The j-entries searched run from nbl_ci->cj_ind_start up to
 + * the current end of the j-list (nbl->ncj-1), so nbl_ci is presumably
 + * the most recently built entry - confirm against the caller.
 + * For every excluded atom pair the j-cluster entry is located,
 + * directly for the first ndirect contiguous entries and otherwise
 + * by bisection, which requires the cj values in the remainder of
 + * the list to be in ascending order. For an entry that is found,
 + * bit (inner_i<<na_cj_2log) + inner_e is cleared in its excl mask.
 + * na_ci_2log and na_cj_2log are used as shift counts to convert atom
 + * to i- and j-cluster indices. With diagRemoved set, excluded atoms
 + * with a cell index not above that of the i-atom are skipped.
 + * NOTE(review): Found_si, si_ind and nbl_excl are not used here.
 + */
 +static void set_ci_top_excls(const nbnxn_search_t nbs,
 +                             nbnxn_pairlist_t *nbl,
 +                             gmx_bool diagRemoved,
 +                             int na_ci_2log,
 +                             int na_cj_2log,
 +                             const nbnxn_ci_t *nbl_ci,
 +                             const t_blocka *excl)
 +{
 +    const int *cell;
 +    int ci;
 +    int cj_ind_first,cj_ind_last;
 +    int cj_first,cj_last;
 +    int ndirect;
 +    int i,ai,aj,si,eind,ge,se;
 +    int found,cj_ind_0,cj_ind_1,cj_ind_m;
 +    int cj_m;
 +    gmx_bool Found_si;
 +    int si_ind;
 +    nbnxn_excl_t *nbl_excl;
 +    int inner_i,inner_e;
 +
 +    cell = nbs->cell;
 +
 +    if (nbl_ci->cj_ind_end == nbl_ci->cj_ind_start)
 +    {
 +        /* Empty list */
 +        return;
 +    }
 +
 +    ci = nbl_ci->ci;
 +
 +    cj_ind_first = nbl_ci->cj_ind_start;
 +    cj_ind_last  = nbl->ncj - 1;
 +
 +    cj_first = nbl->cj[cj_ind_first].cj;
 +    cj_last  = nbl->cj[cj_ind_last].cj;
 +
 +    /* Determine how many contiguous j-cells we have starting
 +     * from the first i-cell. This number can be used to directly
 +     * calculate j-cell indices for excluded atoms.
 +     */
 +    ndirect = 0;
 +    if (na_ci_2log == na_cj_2log)
 +    {
 +        while (cj_ind_first + ndirect <= cj_ind_last &&
 +               nbl->cj[cj_ind_first+ndirect].cj == ci + ndirect)
 +        {
 +            ndirect++;
 +        }
 +    }
 +#ifdef NBNXN_SEARCH_SSE
 +    else
 +    {
 +        while (cj_ind_first + ndirect <= cj_ind_last &&
 +               nbl->cj[cj_ind_first+ndirect].cj == ci_to_cj(na_cj_2log,ci) + ndirect)
 +        {
 +            ndirect++;
 +        }
 +    }
 +#endif
 +
 +    /* Loop over the atoms in the i super-cell */
 +    for(i=0; i<nbl->na_sc; i++)
 +    {
 +        ai = nbs->a[ci*nbl->na_sc+i];
 +        if (ai >= 0) /* negative entries in nbs->a are skipped */
 +        {
 +            si  = (i>>na_ci_2log);
 +
 +            /* Loop over the topology-based exclusions for this i-atom */
 +            for(eind=excl->index[ai]; eind<excl->index[ai+1]; eind++)
 +            {
 +                aj = excl->a[eind];
 +
 +                if (aj == ai)
 +                {
 +                    /* The self exclusion are already set, save some time */
 +                    continue;
 +                }
 +
 +                ge = cell[aj];
 +
 +                /* Without shifts we only calculate interactions j>i
 +                 * for one-way pair-lists.
 +                 */
 +                if (diagRemoved && ge <= ci*nbl->na_sc + i)
 +                {
 +                    continue;
 +                }
 +
 +                se = (ge >> na_cj_2log);
 +
 +                /* Could the cluster se be in our list? */
 +                if (se >= cj_first && se <= cj_last)
 +                {
 +                    if (se < cj_first + ndirect)
 +                    {
 +                        /* We can calculate cj_ind directly from se */
 +                        found = cj_ind_first + se - cj_first;
 +                    }
 +                    else
 +                    {
 +                        /* Search for se using bisection
 +                         * in the half-open range [cj_ind_0,cj_ind_1).
 +                         */
 +                        found = -1;
 +                        cj_ind_0 = cj_ind_first + ndirect;
 +                        cj_ind_1 = cj_ind_last + 1;
 +                        while (found == -1 && cj_ind_0 < cj_ind_1)
 +                        {
 +                            cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;
 +
 +                            cj_m = nbl->cj[cj_ind_m].cj;
 +
 +                            if (se == cj_m)
 +                            {
 +                                found = cj_ind_m;
 +                            }
 +                            else if (se < cj_m)
 +                            {
 +                                cj_ind_1 = cj_ind_m;
 +                            }
 +                            else
 +                            {
 +                                cj_ind_0 = cj_ind_m + 1;
 +                            }
 +                        }
 +                    }
 +
 +                    if (found >= 0)
 +                    {
 +                        inner_i = i  - (si << na_ci_2log); /* index within the i-cluster */
 +                        inner_e = ge - (se << na_cj_2log); /* index within the j-cluster */
 +
 +                        nbl->cj[found].excl &= ~(1U<<((inner_i<<na_cj_2log) + inner_e));
 +                    }
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +/* Set all atom-pair exclusions from the topology stored in excl
 + * as masks in the pair-list for i-super-cell entry nbl_sci.
 + *
 + * The j-entries searched run from the first slot of cj4 entry
 + * nbl_sci->cj4_ind_start up to flat index nbl->work->cj_ind-1,
 + * so nbl_sci is presumably the most recently built entry - confirm
 + * against the caller.
 + * For every excluded atom pair the j-cluster entry is located,
 + * directly for the first ndirect contiguous entries and otherwise
 + * by bisection, which requires the cj values in the remainder of
 + * the list to be in ascending order. When the cluster pair is flagged
 + * in the imei[0] interaction mask, the bit of that cluster pair is
 + * cleared in the exclusion mask of the atom pair; an exclusion mask
 + * entry is created (and space allocated) when there is none yet.
 + * na_c_2log is used as the shift count to convert atom to cluster
 + * indices. With diagRemoved set, excluded atoms with a cell index
 + * not above that of the i-atom are skipped.
 + * NOTE(review): Found_si and si_ind are not used here.
 + */
 +static void set_sci_top_excls(const nbnxn_search_t nbs,
 +                              nbnxn_pairlist_t *nbl,
 +                              gmx_bool diagRemoved,
 +                              int na_c_2log,
 +                              const nbnxn_sci_t *nbl_sci,
 +                              const t_blocka *excl)
 +{
 +    const int *cell;
 +    int na_c;
 +    int sci;
 +    int cj_ind_first,cj_ind_last;
 +    int cj_first,cj_last;
 +    int ndirect;
 +    int i,ai,aj,si,eind,ge,se;
 +    int found,cj_ind_0,cj_ind_1,cj_ind_m;
 +    int cj_m;
 +    gmx_bool Found_si;
 +    int si_ind;
 +    nbnxn_excl_t *nbl_excl;
 +    int inner_i,inner_e,w;
 +
 +    cell = nbs->cell;
 +
 +    na_c = nbl->na_ci;
 +
 +    if (nbl_sci->cj4_ind_end == nbl_sci->cj4_ind_start)
 +    {
 +        /* Empty list */
 +        return;
 +    }
 +
 +    sci = nbl_sci->sci;
 +
 +    cj_ind_first = nbl_sci->cj4_ind_start*NBNXN_GPU_JGROUP_SIZE;
 +    cj_ind_last  = nbl->work->cj_ind - 1;
 +
 +    cj_first = nbl->cj4[nbl_sci->cj4_ind_start].cj[0];
 +    cj_last  = nbl_cj(nbl,cj_ind_last);
 +
 +    /* Determine how many contiguous j-clusters we have starting
 +     * from the first i-cluster. This number can be used to directly
 +     * calculate j-cluster indices for excluded atoms.
 +     */
 +    ndirect = 0;
 +    while (cj_ind_first + ndirect <= cj_ind_last &&
 +           nbl_cj(nbl,cj_ind_first+ndirect) == sci*GPU_NSUBCELL + ndirect)
 +    {
 +        ndirect++;
 +    }
 +
 +    /* Loop over the atoms in the i super-cell */
 +    for(i=0; i<nbl->na_sc; i++)
 +    {
 +        ai = nbs->a[sci*nbl->na_sc+i];
 +        if (ai >= 0) /* negative entries in nbs->a are skipped */
 +        {
 +            si  = (i>>na_c_2log);
 +
 +            /* Loop over the topology-based exclusions for this i-atom */
 +            for(eind=excl->index[ai]; eind<excl->index[ai+1]; eind++)
 +            {
 +                aj = excl->a[eind];
 +
 +                if (aj == ai)
 +                {
 +                    /* The self exclusion are already set, save some time */
 +                    continue;
 +                }
 +
 +                ge = cell[aj];
 +
 +                /* Without shifts we only calculate interactions j>i
 +                 * for one-way pair-lists.
 +                 */
 +                if (diagRemoved && ge <= sci*nbl->na_sc + i)
 +                {
 +                    continue;
 +                }
 +
 +                se = ge>>na_c_2log;
 +                /* Could the cluster se be in our list? */
 +                if (se >= cj_first && se <= cj_last)
 +                {
 +                    if (se < cj_first + ndirect)
 +                    {
 +                        /* We can calculate cj_ind directly from se */
 +                        found = cj_ind_first + se - cj_first;
 +                    }
 +                    else
 +                    {
 +                        /* Search for se using bisection
 +                         * in the half-open range [cj_ind_0,cj_ind_1).
 +                         */
 +                        found = -1;
 +                        cj_ind_0 = cj_ind_first + ndirect;
 +                        cj_ind_1 = cj_ind_last + 1;
 +                        while (found == -1 && cj_ind_0 < cj_ind_1)
 +                        {
 +                            cj_ind_m = (cj_ind_0 + cj_ind_1)>>1;
 +
 +                            cj_m = nbl_cj(nbl,cj_ind_m);
 +
 +                            if (se == cj_m)
 +                            {
 +                                found = cj_ind_m;
 +                            }
 +                            else if (se < cj_m)
 +                            {
 +                                cj_ind_1 = cj_ind_m;
 +                            }
 +                            else
 +                            {
 +                                cj_ind_0 = cj_ind_m + 1;
 +                            }
 +                        }
 +                    }
 +
 +                    if (found >= 0)
 +                    {
 +                        inner_i = i  - si*na_c; /* index within the i-cluster */
 +                        inner_e = ge - se*na_c; /* index within the j-cluster */
 +
 +/* Macro for getting the index of atom a within a cluster.
 + * NOTE(review): below this is applied to the flat j-index found (slot in
 + * the cj4 entry) and to inner_e; that only gives the intended result when
 + * NBNXN_CPU_CLUSTER_I_SIZE equals the 4 slots per cj4 entry - confirm.
 + */
 +#define AMODI(a)  ((a) & (NBNXN_CPU_CLUSTER_I_SIZE - 1))
 +/* Macro for converting an atom number to a cluster number.
 + * NOTE(review): below this converts the flat j-index found to
 + * a cj4 index, with the same assumption on the sizes.
 + */
 +#define A2CI(a)   ((a) >> NBNXN_CPU_CLUSTER_I_SIZE_2LOG)
 +
 +                        if (nbl_imask0(nbl,found) & (1U << (AMODI(found)*GPU_NSUBCELL + si)))
 +                        {
 +                            w       = (inner_e >> 2); /* warp index used for imei[] */
 +
 +                            get_nbl_exclusions_1(nbl,A2CI(found),w,&nbl_excl);
 +
 +                            nbl_excl->pair[AMODI(inner_e)*nbl->na_ci+inner_i] &=
 +                                ~(1U << (AMODI(found)*GPU_NSUBCELL + si));
 +                        }
 +
 +#undef AMODI
 +#undef A2CI
 +                    }
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +/* Reallocate the simple ci list for at least n entries */
 +static void nb_realloc_ci(nbnxn_pairlist_t *nbl,int n)
 +{
 +    nbl->ci_nalloc = over_alloc_small(n);
 +    nbnxn_realloc_void((void **)&nbl->ci,
 +                       nbl->nci*sizeof(*nbl->ci),
 +                       nbl->ci_nalloc*sizeof(*nbl->ci),
 +                       nbl->alloc,nbl->free);
 +}
 +
 +/* Reallocate the super-cell sci list for at least n entries */
 +static void nb_realloc_sci(nbnxn_pairlist_t *nbl,int n)
 +{
 +    nbl->sci_nalloc = over_alloc_small(n);
 +    nbnxn_realloc_void((void **)&nbl->sci,
 +                       nbl->nsci*sizeof(*nbl->sci),
 +                       nbl->sci_nalloc*sizeof(*nbl->sci),
 +                       nbl->alloc,nbl->free);
 +}
 +
 +/* Make a new ci entry at index nbl->nci */
 +static void new_ci_entry(nbnxn_pairlist_t *nbl,int ci,int shift,int flags,
 +                         nbnxn_list_work_t *work)
 +{
 +    if (nbl->nci + 1 > nbl->ci_nalloc)
 +    {
 +        nb_realloc_ci(nbl,nbl->nci+1);
 +    }
 +    nbl->ci[nbl->nci].ci            = ci;
 +    nbl->ci[nbl->nci].shift         = shift;
 +    /* Store the interaction flags along with the shift */
 +    nbl->ci[nbl->nci].shift        |= flags;
 +    nbl->ci[nbl->nci].cj_ind_start  = nbl->ncj;
 +    nbl->ci[nbl->nci].cj_ind_end    = nbl->ncj;
 +}
 +
 +/* Make a new sci entry at index nbl->nsci */
 +static void new_sci_entry(nbnxn_pairlist_t *nbl,int sci,int shift,int flags,
 +                          nbnxn_list_work_t *work)
 +{
 +    if (nbl->nsci + 1 > nbl->sci_nalloc)
 +    {
 +        nb_realloc_sci(nbl,nbl->nsci+1);
 +    }
 +    nbl->sci[nbl->nsci].sci           = sci;
 +    nbl->sci[nbl->nsci].shift         = shift;
 +    nbl->sci[nbl->nsci].cj4_ind_start = nbl->ncj4;
 +    nbl->sci[nbl->nsci].cj4_ind_end   = nbl->ncj4;
 +}
 +
 +/* Sort the simple j-list cj on exclusions.
 + * Entries with exclusions will all be sorted to the beginning of the list.
 + */
 +static void sort_cj_excl(nbnxn_cj_t *cj,int ncj,
 +                         nbnxn_list_work_t *work)
 +{
 +    int jnew,j;
 +
 +    if (ncj > work->cj_nalloc)
 +    {
 +        work->cj_nalloc = over_alloc_large(ncj);
 +        srenew(work->cj,work->cj_nalloc);
 +    }
 +
 +    /* Make a list of the j-cells involving exclusions */
 +    jnew = 0;
 +    for(j=0; j<ncj; j++)
 +    {
 +        if (cj[j].excl != NBNXN_INT_MASK_ALL)
 +        {
 +            work->cj[jnew++] = cj[j];
 +        }
 +    }
 +    /* Check if there are exclusions at all or not just the first entry */
 +    if (!((jnew == 0) ||
 +          (jnew == 1 && cj[0].excl != NBNXN_INT_MASK_ALL)))
 +    {
 +        for(j=0; j<ncj; j++)
 +        {
 +            if (cj[j].excl == NBNXN_INT_MASK_ALL)
 +            {
 +                work->cj[jnew++] = cj[j];
 +            }
 +        }
 +        for(j=0; j<ncj; j++)
 +        {
 +            cj[j] = work->cj[j];
 +        }
 +    }
 +}
 +
 +/* Close this simple list i entry */
 +static void close_ci_entry_simple(nbnxn_pairlist_t *nbl)
 +{
 +    int jlen;
 +
 +    /* All content of the new ci entry have already been filled correctly,
 +     * we only need to increase the count here (for non empty lists).
 +     */
 +    jlen = nbl->ci[nbl->nci].cj_ind_end - nbl->ci[nbl->nci].cj_ind_start;
 +    if (jlen > 0)
 +    {
 +        sort_cj_excl(nbl->cj+nbl->ci[nbl->nci].cj_ind_start,jlen,nbl->work);
 +
 +        if (nbl->ci[nbl->nci].shift & NBNXN_CI_HALF_LJ(0))
 +        {
 +            nbl->work->ncj_hlj += jlen;
 +        }
 +        else if (!(nbl->ci[nbl->nci].shift & NBNXN_CI_DO_COUL(0)))
 +        {
 +            nbl->work->ncj_noq += jlen;
 +        }
 +
 +        nbl->nci++;
 +    }
 +}
 +
 +/* Split sci entry for load balancing on the GPU.
 + * As we only know the current count on our own thread,
 + * we will need to estimate the current total amount of i-entries.
 + * As the lists get concatenated later, this estimate depends
 + * both on nthread and our own thread index thread.
 + *
 + * The entry considered is the last one, nbl->sci[nbl->nsci-1].
 + * It is only split when it holds more than one cj4 and
 + * j4len*GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE exceeds the target nsp_max.
 + * nsp_max is nsp_max_av or, with progBal set, a value derived from
 + * nsp_max_av, nc_bal and the estimated total sci count.
 + * The pair count of a cj4 is the number of bits set in imei[0].imask.
 + */
 +static void split_sci_entry(nbnxn_pairlist_t *nbl,
 +                            int nsp_max_av,gmx_bool progBal,int nc_bal,
 +                            int thread,int nthread)
 +{
 +    int nsci_est;
 +    int nsp_max;
 +    int cj4_start,cj4_end,j4len,cj4;
 +    int sci;
 +    int nsp,nsp_sci,nsp_cj4,nsp_cj4_e,nsp_cj4_p;
 +    int p;
 +
 +    /* Estimate the total number of sci's of the nblist combined
 +     * over all threads using the target number of sci's.
 +     */
 +    nsci_est = nc_bal*thread/nthread + nbl->nsci;
 +    if (progBal)
 +    {
 +        /* The first sci blocks should be larger, to avoid overhead.
 +         * The last sci blocks should be smaller, to improve load balancing.
 +         */
 +        nsp_max = max(1,
 +                      nsp_max_av*nc_bal*3/(2*(nsci_est - 1 + nc_bal)));
 +    }
 +    else
 +    {
 +        nsp_max = nsp_max_av;
 +    }
 +
 +    cj4_start = nbl->sci[nbl->nsci-1].cj4_ind_start;
 +    cj4_end   = nbl->sci[nbl->nsci-1].cj4_ind_end;
 +    j4len = cj4_end - cj4_start;
 +
 +    if (j4len > 1 && j4len*GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE > nsp_max)
 +    {
 +        /* Remove the last sci entry and process the cj4's again */
 +        nbl->nsci -= 1;
 +
 +        sci        = nbl->nsci;
 +        cj4        = cj4_start;
 +        nsp        = 0;
 +        nsp_sci    = 0;
 +        nsp_cj4_e  = 0;
 +        nsp_cj4    = 0;
 +        while (cj4 < cj4_end)
 +        {
 +            nsp_cj4_p = nsp_cj4;
 +            nsp_cj4   = 0;
 +            for(p=0; p<GPU_NSUBCELL*NBNXN_GPU_JGROUP_SIZE; p++)
 +            {
 +                nsp_cj4 += (nbl->cj4[cj4].imei[0].imask >> p) & 1;
 +            }
 +            nsp += nsp_cj4;
 +
 +            if (nsp > nsp_max && nsp > nsp_cj4)
 +            {
 +                nbl->sci[sci].cj4_ind_end = cj4;
 +                sci++;
 +                nbl->nsci++;
 +                if (nbl->nsci+1 > nbl->sci_nalloc)
 +                {
 +                    nb_realloc_sci(nbl,nbl->nsci+1);
 +                }
 +                nbl->sci[sci].sci           = nbl->sci[nbl->nsci-1].sci;
 +                nbl->sci[sci].shift         = nbl->sci[nbl->nsci-1].shift;
 +                nbl->sci[sci].cj4_ind_start = cj4;
 +                nsp_sci   = nsp - nsp_cj4;
 +                nsp_cj4_e = nsp_cj4_p;
 +                nsp       = nsp_cj4;
 +            }
 +
 +            cj4++;
 +        }
 +
 +        /* Put the remaining cj4's in the last sci entry */
 +        nbl->sci[sci].cj4_ind_end = cj4_end;
 +
 +        /* Possibly balance out the last two sci's
 +         * by moving the last cj4 of the second last sci.
 +         */
 +        if (nsp_sci - nsp_cj4_e >= nsp + nsp_cj4_e)
 +        {
 +            nbl->sci[sci-1].cj4_ind_end--;
 +            nbl->sci[sci].cj4_ind_start--;
 +        }
 +
 +        sci++;
 +        nbl->nsci++;
 +    }
 +}
 +
 +/* Clost this super/sub list i entry */
 +static void close_ci_entry_supersub(nbnxn_pairlist_t *nbl,
 +                                    int nsp_max_av,
 +                                    gmx_bool progBal,int nc_bal,
 +                                    int thread,int nthread)
 +{
 +    int j4len,tlen;
 +    int nb,b;
 +
 +    /* All content of the new ci entry have already been filled correctly,
 +     * we only need to increase the count here (for non empty lists).
 +     */
 +    j4len = nbl->sci[nbl->nsci].cj4_ind_end - nbl->sci[nbl->nsci].cj4_ind_start;
 +    if (j4len > 0)
 +    {
 +        /* We can only have complete blocks of 4 j-entries in a list,
 +         * so round the count up before closing.
 +         */
 +        nbl->ncj4         = ((nbl->work->cj_ind + 4-1) >> 2);
 +        nbl->work->cj_ind = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
 +
 +        nbl->nsci++;
 +
 +        if (nsp_max_av > 0)
 +        {
 +            split_sci_entry(nbl,nsp_max_av,progBal,nc_bal,thread,nthread);
 +        }
 +    }
 +}
 +
 +/* Syncs the working array before adding another grid pair to the list */
 +static void sync_work(nbnxn_pairlist_t *nbl)
 +{
 +    if (!nbl->bSimple)
 +    {
 +        nbl->work->cj_ind   = nbl->ncj4*NBNXN_GPU_JGROUP_SIZE;
 +        nbl->work->cj4_init = nbl->ncj4;
 +    }
 +}
 +
 +/* Clears an nbnxn_pairlist_t data structure */
 +static void clear_pairlist(nbnxn_pairlist_t *nbl)
 +{
 +    nbl->nci           = 0;
 +    nbl->nsci          = 0;
 +    nbl->ncj           = 0;
 +    nbl->ncj4          = 0;
 +    nbl->nci_tot       = 0;
 +    nbl->nexcl         = 1;
 +
 +    nbl->work->ncj_noq = 0;
 +    nbl->work->ncj_hlj = 0;
 +}
 +
 +/* Sets a simple list i-cell bounding box, including PBC shift */
 +static void set_icell_bb_simple(const float *bb,int ci,
 +                                real shx,real shy,real shz,
 +                                float *bb_ci)
 +{
 +    int ia;
 +
 +    ia = ci*NNBSBB_B;
 +    bb_ci[BBL_X] = bb[ia+BBL_X] + shx;
 +    bb_ci[BBL_Y] = bb[ia+BBL_Y] + shy;
 +    bb_ci[BBL_Z] = bb[ia+BBL_Z] + shz;
 +    bb_ci[BBU_X] = bb[ia+BBU_X] + shx;
 +    bb_ci[BBU_Y] = bb[ia+BBU_Y] + shy;
 +    bb_ci[BBU_Z] = bb[ia+BBU_Z] + shz;
 +}
 +
 +/* Sets a super-cell and sub cell bounding boxes, including PBC shift.
 + * The boxes of super-cell ci are copied from bb to bb_ci with the shift
 + * shx,shy,shz added to both the lower and upper bounds.
 + * With NBNXN_BBXXXX defined the boxes are stored in groups of STRIDE_8BB
 + * boxes with the same component of all boxes in a group stored together;
 + * otherwise each of the GPU_NSUBCELL boxes takes NNBSBB_B floats.
 + */
 +static void set_icell_bb_supersub(const float *bb,int ci,
 +                                  real shx,real shy,real shz,
 +                                  float *bb_ci)
 +{
 +    int ia,m,i;
 +
 +#ifdef NBNXN_BBXXXX
 +    ia = ci*(GPU_NSUBCELL>>STRIDE_8BB_2LOG)*NNBSBB_XXXX;
 +    for(m=0; m<(GPU_NSUBCELL>>STRIDE_8BB_2LOG)*NNBSBB_XXXX; m+=NNBSBB_XXXX)
 +    {
 +        for(i=0; i<STRIDE_8BB; i++)
 +        {
 +            bb_ci[m+0*STRIDE_8BB+i] = bb[ia+m+0*STRIDE_8BB+i] + shx;
 +            bb_ci[m+1*STRIDE_8BB+i] = bb[ia+m+1*STRIDE_8BB+i] + shy;
 +            bb_ci[m+2*STRIDE_8BB+i] = bb[ia+m+2*STRIDE_8BB+i] + shz;
 +            bb_ci[m+3*STRIDE_8BB+i] = bb[ia+m+3*STRIDE_8BB+i] + shx;
 +            bb_ci[m+4*STRIDE_8BB+i] = bb[ia+m+4*STRIDE_8BB+i] + shy;
 +            bb_ci[m+5*STRIDE_8BB+i] = bb[ia+m+5*STRIDE_8BB+i] + shz;
 +        }
 +    }
 +#else
 +    ia = ci*GPU_NSUBCELL*NNBSBB_B;
 +    for(i=0; i<GPU_NSUBCELL*NNBSBB_B; i+=NNBSBB_B)
 +    {
-                              gmx_bool bDomDec, int nth,
-                              gmx_bool *bFBufferFlag)
++        bb_ci[i+BBL_X] = bb[ia+i+BBL_X] + shx;
++        bb_ci[i+BBL_Y] = bb[ia+i+BBL_Y] + shy;
++        bb_ci[i+BBL_Z] = bb[ia+i+BBL_Z] + shz;
++        bb_ci[i+BBU_X] = bb[ia+i+BBU_X] + shx;
++        bb_ci[i+BBU_Y] = bb[ia+i+BBU_Y] + shy;
++        bb_ci[i+BBU_Z] = bb[ia+i+BBU_Z] + shz;
 +    }
 +#endif
 +}
 +
 +/* Copies PBC shifted i-cell atom coordinates x,y,z to working array */
 +static void icell_set_x_simple(int ci,
 +                               real shx,real shy,real shz,
 +                               int na_c,
 +                               int stride,const real *x,
 +                               nbnxn_list_work_t *work)
 +{
 +    int  ia,i;
 +
 +    ia = ci*NBNXN_CPU_CLUSTER_I_SIZE;
 +
 +    for(i=0; i<NBNXN_CPU_CLUSTER_I_SIZE; i++)
 +    {
 +        work->x_ci[i*STRIDE_XYZ+XX] = x[(ia+i)*stride+XX] + shx;
 +        work->x_ci[i*STRIDE_XYZ+YY] = x[(ia+i)*stride+YY] + shy;
 +        work->x_ci[i*STRIDE_XYZ+ZZ] = x[(ia+i)*stride+ZZ] + shz;
 +    }
 +}
 +
 +/* Copies PBC shifted super-cell atom coordinates x,y,z to working array */
 +static void icell_set_x_supersub(int ci,
 +                                 real shx,real shy,real shz,
 +                                 int na_c,
 +                                 int stride,const real *x,
 +                                 nbnxn_list_work_t *work)
 +{
 +    int  ia,i;
 +    real *x_ci;
 +
 +    x_ci = work->x_ci;
 +
 +    ia = ci*GPU_NSUBCELL*na_c;
 +    for(i=0; i<GPU_NSUBCELL*na_c; i++)
 +    {
 +        x_ci[i*DIM + XX] = x[(ia+i)*stride + XX] + shx;
 +        x_ci[i*DIM + YY] = x[(ia+i)*stride + YY] + shy;
 +        x_ci[i*DIM + ZZ] = x[(ia+i)*stride + ZZ] + shz;
 +    }
 +}
 +
 +#ifdef NBNXN_SEARCH_SSE
 +/* Copies PBC shifted super-cell packed atom coordinates to working array */
 +static void icell_set_x_supersub_sse8(int ci,
 +                                      real shx,real shy,real shz,
 +                                      int na_c,
 +                                      int stride,const real *x,
 +                                      nbnxn_list_work_t *work)
 +{
 +    int  si,io,ia,i,j;
 +    real *x_ci;
 +
 +    x_ci = work->x_ci;
 +
 +    for(si=0; si<GPU_NSUBCELL; si++)
 +    {
 +        for(i=0; i<na_c; i+=STRIDE_8BB)
 +        {
 +            io = si*na_c + i;
 +            ia = ci*GPU_NSUBCELL*na_c + io;
 +            for(j=0; j<STRIDE_8BB; j++)
 +            {
 +                x_ci[io*DIM + j + XX*STRIDE_8BB] = x[(ia+j)*stride+XX] + shx;
 +                x_ci[io*DIM + j + YY*STRIDE_8BB] = x[(ia+j)*stride+YY] + shy;
 +                x_ci[io*DIM + j + ZZ*STRIDE_8BB] = x[(ia+j)*stride+ZZ] + shz;
 +            }
 +        }
 +    }
 +}
 +#endif
 +
 +static real nbnxn_rlist_inc_nonloc_fac = 0.6;
 +
 +/* Due to the cluster size the effective pair-list is longer than
 + * that of a simple atom pair-list. This function gives the extra distance.
 + */
 +real nbnxn_get_rlist_effective_inc(int cluster_size,real atom_density)
 +{
 +    return ((0.5 + nbnxn_rlist_inc_nonloc_fac)*sqr(((cluster_size) - 1.0)/(cluster_size))*pow((cluster_size)/(atom_density),1.0/3.0));
 +}
 +
 +/* Estimates the interaction volume^2 for non-local interactions */
 +static real nonlocal_vol2(const gmx_domdec_zones_t *zones,rvec ls,real r)
 +{
 +    int  z,d;
 +    real cl,ca,za;
 +    real vold_est;
 +    real vol2_est_tot;
 +
 +    vol2_est_tot = 0;
 +
 +    /* Here we simply add up the volumes of 1, 2 or 3 1D decomposition
 +     * not home interaction volume^2. As these volumes are not additive,
 +     * this is an overestimate, but it would only be significant in the limit
 +     * of small cells, where we anyhow need to split the lists into
 +     * as small parts as possible.
 +     */
 +
 +    for(z=0; z<zones->n; z++)
 +    {
 +        if (zones->shift[z][XX] + zones->shift[z][YY] + zones->shift[z][ZZ] == 1)
 +        {
 +            cl = 0;
 +            ca = 1;
 +            za = 1;
 +            for(d=0; d<DIM; d++)
 +            {
 +                if (zones->shift[z][d] == 0)
 +                {
 +                    cl += 0.5*ls[d];
 +                    ca *= ls[d];
 +                    za *= zones->size[z].x1[d] - zones->size[z].x0[d];
 +                }
 +            }
 +
 +            /* 4 octants of a sphere */
 +            vold_est  = 0.25*M_PI*r*r*r*r;
 +            /* 4 quarter pie slices on the edges */
 +            vold_est += 4*cl*M_PI/6.0*r*r*r;
 +            /* One rectangular volume on a face */
 +            vold_est += ca*0.5*r*r;
 +
 +            vol2_est_tot += vold_est*za;
 +        }
 +    }
 +
 +    return vol2_est_tot;
 +}
 +
 +/* Estimates the average size of a full j-list for super/sub setup.
 + * The estimate is based on grid 0 of nbs, the list range rlist and,
 + * for non-local interactions, the domain decomposition zones.
 + * Returns -1 when no limit is needed, which is the case when
 + * min_ci_balanced <= 0, grid->nc >= min_ci_balanced or grid->nc == 0.
 + * Otherwise returns the target maximum number of sub-cell pairs per list.
 + */
 +static int get_nsubpair_max(const nbnxn_search_t nbs,
 +                            int iloc,
 +                            real rlist,
 +                            int min_ci_balanced)
 +{
 +    const nbnxn_grid_t *grid;
 +    rvec ls;
 +    real xy_diag2,r_eff_sup,vol_est,nsp_est,nsp_est_nl;
 +    int  nsubpair_max;
 +
 +    grid = &nbs->grid[0];
 +
 +    ls[XX] = (grid->c1[XX] - grid->c0[XX])/(grid->ncx*GPU_NSUBCELL_X);
 +    ls[YY] = (grid->c1[YY] - grid->c0[YY])/(grid->ncy*GPU_NSUBCELL_Y);
 +    ls[ZZ] = (grid->c1[ZZ] - grid->c0[ZZ])*grid->ncx*grid->ncy/(grid->nc*GPU_NSUBCELL_Z);
 +
 +    /* The average squared length of the diagonal of a sub cell */
 +    xy_diag2 = ls[XX]*ls[XX] + ls[YY]*ls[YY] + ls[ZZ]*ls[ZZ];
 +
 +    /* The formulas below are a heuristic estimate of the average nsj per si */
 +    r_eff_sup = rlist + nbnxn_rlist_inc_nonloc_fac*sqr((grid->na_c - 1.0)/grid->na_c)*sqrt(xy_diag2/3);
 +
 +    if (!nbs->DomDec || nbs->zones->n == 1)
 +    {
 +        nsp_est_nl = 0;
 +    }
 +    else
 +    {
 +        nsp_est_nl =
 +            sqr(grid->atom_density/grid->na_c)*
 +            nonlocal_vol2(nbs->zones,ls,r_eff_sup);
 +    }
 +
 +    if (LOCAL_I(iloc))
 +    {
 +        /* Sub-cell interacts with itself */
 +        vol_est  = ls[XX]*ls[YY]*ls[ZZ];
 +        /* 6/2 rectangular volume on the faces */
 +        vol_est += (ls[XX]*ls[YY] + ls[XX]*ls[ZZ] + ls[YY]*ls[ZZ])*r_eff_sup;
 +        /* 12/2 quarter pie slices on the edges */
 +        vol_est += 2*(ls[XX] + ls[YY] + ls[ZZ])*0.25*M_PI*sqr(r_eff_sup);
 +        /* 4 octants of a sphere */
 +        vol_est += 0.5*4.0/3.0*M_PI*pow(r_eff_sup,3);
 +
 +        nsp_est = grid->nsubc_tot*vol_est*grid->atom_density/grid->na_c;
 +
 +        /* Subtract the non-local pair count */
 +        nsp_est -= nsp_est_nl;
 +
 +        if (debug)
 +        {
 +            fprintf(debug,"nsp_est local %5.1f non-local %5.1f\n",
 +                    nsp_est,nsp_est_nl);
 +        }
 +    }
 +    else
 +    {
 +        nsp_est = nsp_est_nl;
 +    }
 +
 +    if (min_ci_balanced <= 0 || grid->nc >= min_ci_balanced || grid->nc == 0)
 +    {
 +        /* We don't need to worry */
 +        nsubpair_max = -1;
 +    }
 +    else
 +    {
 +        /* Thus the (average) maximum j-list size should be as follows */
 +        nsubpair_max = max(1,(int)(nsp_est/min_ci_balanced+0.5));
 +
 +        /* Since the target value is a maximum (this avoids high outliers,
 +         * which lead to load imbalance), not an average, we get more lists
 +         * than we ask for (to compensate we need to add GPU_NSUBCELL*4/4).
 +         * But more importantly, the optimal GPU performance moves
 +         * to a lower number of blocks for very small blocks.
 +         * To compensate we add the maximum pair count per cj4.
 +         */
 +        nsubpair_max += GPU_NSUBCELL*NBNXN_CPU_CLUSTER_I_SIZE;
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"nbl nsp estimate %.1f, nsubpair_max %d\n",
 +                nsp_est,nsubpair_max);
 +    }
 +
 +    return nsubpair_max;
 +}
 +
 +/* Debug list print function */
 +static void print_nblist_ci_cj(FILE *fp,const nbnxn_pairlist_t *nbl)
 +{
 +    int i,j;
 +
 +    for(i=0; i<nbl->nci; i++)
 +    {
 +        fprintf(fp,"ci %4d  shift %2d  ncj %3d\n",
 +                nbl->ci[i].ci,nbl->ci[i].shift,
 +                nbl->ci[i].cj_ind_end - nbl->ci[i].cj_ind_start);
 +
 +        for(j=nbl->ci[i].cj_ind_start; j<nbl->ci[i].cj_ind_end; j++)
 +        {
 +            fprintf(fp,"  cj %5d  imask %x\n",
 +                    nbl->cj[j].cj,
 +                    nbl->cj[j].excl);
 +        }
 +    }
 +}
 +
 +/* Debug list print function */
 +static void print_nblist_sci_cj(FILE *fp,const nbnxn_pairlist_t *nbl)
 +{
 +    int i,j4,j;
 +
 +    for(i=0; i<nbl->nsci; i++)
 +    {
 +        fprintf(fp,"ci %4d  shift %2d  ncj4 %2d\n",
 +                nbl->sci[i].sci,nbl->sci[i].shift,
 +                nbl->sci[i].cj4_ind_end - nbl->sci[i].cj4_ind_start);
 +
 +        for(j4=nbl->sci[i].cj4_ind_start; j4<nbl->sci[i].cj4_ind_end; j4++)
 +        {
 +            for(j=0; j<4; j++)
 +            {
 +                fprintf(fp,"  sj %5d  imask %x\n",
 +                        nbl->cj4[j4].cj[j],
 +                        nbl->cj4[j4].imei[0].imask);
 +            }
 +        }
 +    }
 +}
 +
 +/* Combine the nnbl pair lists *nbl, generated on multiple threads,
 + * into nblc. The sci, cj4 and excl data of the lists are appended
 + * to the data already present in nblc, with the cj4 and exclusion
 + * indices shifted by the offsets in the combined arrays.
 + * Only super/sub lists are supported: gmx_incons is called
 + * when nblc is a simple list.
 + */
 +static void combine_nblists(int nnbl,nbnxn_pairlist_t **nbl,
 +                            nbnxn_pairlist_t *nblc)
 +{
 +    int nsci,ncj4,nexcl;
 +    int n,i;
 +
 +    if (nblc->bSimple)
 +    {
 +        gmx_incons("combine_nblists does not support simple lists");
 +    }
 +
 +    nsci  = nblc->nsci;
 +    ncj4  = nblc->ncj4;
 +    nexcl = nblc->nexcl;
 +    for(i=0; i<nnbl; i++)
 +    {
 +        nsci  += nbl[i]->nsci;
 +        ncj4  += nbl[i]->ncj4;
 +        nexcl += nbl[i]->nexcl;
 +    }
 +
 +    if (nsci > nblc->sci_nalloc)
 +    {
 +        nb_realloc_sci(nblc,nsci);
 +    }
 +    if (ncj4 > nblc->cj4_nalloc)
 +    {
 +        nblc->cj4_nalloc = over_alloc_small(ncj4);
 +        nbnxn_realloc_void((void **)&nblc->cj4,
 +                           nblc->ncj4*sizeof(*nblc->cj4),
 +                           nblc->cj4_nalloc*sizeof(*nblc->cj4),
 +                           nblc->alloc,nblc->free);
 +    }
 +    if (nexcl > nblc->excl_nalloc)
 +    {
 +        nblc->excl_nalloc = over_alloc_small(nexcl);
 +        nbnxn_realloc_void((void **)&nblc->excl,
 +                           nblc->nexcl*sizeof(*nblc->excl),
 +                           nblc->excl_nalloc*sizeof(*nblc->excl),
 +                           nblc->alloc,nblc->free);
 +    }
 +
 +    /* Each thread should copy its own data to the combined arrays,
 +     * as otherwise data will go back and forth between different caches.
 +     */
 +#pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static)
 +    for(n=0; n<nnbl; n++)
 +    {
 +        int sci_offset;
 +        int cj4_offset;
 +        int ci_offset;
 +        int excl_offset;
 +        int i,j4;
 +        const nbnxn_pairlist_t *nbli;
 +
 +        /* Determine the offset in the combined data for our thread:
 +         * the counts already in nblc plus those of all lists before ours.
 +         * Note that ci_offset is computed but not used in the copy below.
 +         */
 +        sci_offset  = nblc->nsci;
 +        cj4_offset  = nblc->ncj4;
 +        ci_offset   = nblc->nci_tot;
 +        excl_offset = nblc->nexcl;
 +
 +        for(i=0; i<n; i++)
 +        {
 +            sci_offset  += nbl[i]->nsci;
 +            cj4_offset  += nbl[i]->ncj4;
 +            ci_offset   += nbl[i]->nci_tot;
 +            excl_offset += nbl[i]->nexcl;
 +        }
 +
 +        nbli = nbl[n];
 +
 +        for(i=0; i<nbli->nsci; i++)
 +        {
 +            nblc->sci[sci_offset+i]                = nbli->sci[i];
 +            nblc->sci[sci_offset+i].cj4_ind_start += cj4_offset;
 +            nblc->sci[sci_offset+i].cj4_ind_end   += cj4_offset;
 +        }
 +
 +        for(j4=0; j4<nbli->ncj4; j4++)
 +        {
 +            nblc->cj4[cj4_offset+j4] = nbli->cj4[j4];
 +            nblc->cj4[cj4_offset+j4].imei[0].excl_ind += excl_offset;
 +            nblc->cj4[cj4_offset+j4].imei[1].excl_ind += excl_offset;
 +        }
 +
 +        for(j4=0; j4<nbli->nexcl; j4++)
 +        {
 +            nblc->excl[excl_offset+j4] = nbli->excl[j4];
 +        }
 +    }
 +
 +    for(n=0; n<nnbl; n++)
 +    {
 +        nblc->nsci    += nbl[n]->nsci;
 +        nblc->ncj4    += nbl[n]->ncj4;
 +        nblc->nci_tot += nbl[n]->nci_tot;
 +        nblc->nexcl   += nbl[n]->nexcl;
 +    }
 +}
 +
 +/* Returns the next ci to be processes by our thread */
 +static gmx_bool next_ci(const nbnxn_grid_t *grid,
 +                        int conv,
 +                        int nth,int ci_block,
 +                        int *ci_x,int *ci_y,
 +                        int *ci_b,int *ci)
 +{
 +    (*ci_b)++;
 +    (*ci)++;
 +
 +    if (*ci_b == ci_block)
 +    {
 +        /* Jump to the next block assigned to this task */
 +        *ci   += (nth - 1)*ci_block;
 +        *ci_b  = 0;
 +    }
 +
 +    if (*ci >= grid->nc*conv)
 +    {
 +        return FALSE;
 +    }
 +
 +    while (*ci >= grid->cxy_ind[*ci_x*grid->ncy + *ci_y + 1]*conv)
 +    {
 +        *ci_y += 1;
 +        if (*ci_y == grid->ncy)
 +        {
 +            *ci_x += 1;
 +            *ci_y  = 0;
 +        }
 +    }
 +
 +    return TRUE;
 +}
 +
 +/* Returns the distance^2 for which we put cell pairs in the list
 + * without checking atom pair distances. This is usually < rlist^2.
 + * The value is the square of rlist minus 0.5 times the diagonal of the
 + * average x/y cell size of gridi and gridj, with a lower bound of 0.
 + * When simple is FALSE the sub-cell size is used instead.
 + */
 +static float boundingbox_only_distance2(const nbnxn_grid_t *gridi,
 +                                        const nbnxn_grid_t *gridj,
 +                                        real rlist,
 +                                        gmx_bool simple)
 +{
 +    /* If the distance between two sub-cell bounding boxes is less
 +     * than this distance, do not check the distance between
 +     * all particle pairs in the sub-cell, since then it is likely
 +     * that the box pair has atom pairs within the cut-off.
 +     * We use the nblist cut-off minus 0.5 times the average x/y diagonal
 +     * spacing of the sub-cells. Around 40% of the checked pairs are pruned.
 +     * Using more than 0.5 gains at most 0.5%.
 +     * If forces are calculated more than twice, the performance gain
 +     * in the force calculation outweighs the cost of checking.
 +     * Note that with subcell lists, the atom-pair distance check
 +     * is only performed when only 1 out of 8 sub-cells is within range,
 +     * this is because the GPU is much faster than the CPU.
 +     */
 +    real bbx,bby;
 +    real rbb2;
 +
 +    bbx = 0.5*(gridi->sx + gridj->sx);
 +    bby = 0.5*(gridi->sy + gridj->sy);
 +    if (!simple)
 +    {
 +        bbx /= GPU_NSUBCELL_X;
 +        bby /= GPU_NSUBCELL_Y;
 +    }
 +
 +    rbb2 = sqr(max(0,rlist - 0.5*sqrt(bbx*bbx + bby*bby)));
 +
 +#ifndef GMX_DOUBLE
 +    return rbb2;
 +#else
 +    return (float)((1+GMX_FLOAT_EPS)*rbb2);
 +#endif
 +}
 +
 +static int get_ci_block_size(const nbnxn_grid_t *gridi,
-         /* With non-interleaved blocks it makes sense to flag which
-          * part of the force output thread buffer we access.
-          * We use bit flags, so we have to check if it fits.
-          */
-         *bFBufferFlag = (nth > 1 && nth <= sizeof(unsigned int)*8);
-     }
-     else
-     {
-         *bFBufferFlag = FALSE;
++                             gmx_bool bDomDec, int nth)
 +{
 +    const int ci_block_enum = 5;
 +    const int ci_block_denom = 11;
 +    const int ci_block_min_atoms = 16;
 +    int ci_block;
 +
 +    /* Here we decide how to distribute the blocks over the threads
 +     * and return the number of i-cells per block.
 +     * We use prime numbers to try to avoid that the grid size becomes
 +     * a multiple of the number of threads, which would lead to some
 +     * threads getting "inner" pairs and others getting boundary pairs,
 +     * which in turn will lead to load imbalance between threads.
 +     * Set the block size as 5/11/nth (ci_block_enum is the numerator,
 +     * ci_block_denom the denominator) times the average number of cells
 +     * in a y,z slab. This should ensure a quite uniform distribution
 +     * of the grid parts of the different threads along all three grid
 +     * zone boundaries with 3D domain decomposition. At the same time
 +     * the blocks will not become too small.
 +     */
 +    ci_block = (gridi->nc*ci_block_enum)/(ci_block_denom*gridi->ncx*nth);
 +
 +    /* Ensure the blocks are not too small: avoids cache invalidation.
 +     * A block should contain at least ci_block_min_atoms atoms.
 +     */
 +    if (ci_block*gridi->na_sc < ci_block_min_atoms)
 +    {
 +        ci_block = (ci_block_min_atoms + gridi->na_sc - 1)/gridi->na_sc;
 +    }
 +    
 +    /* Without domain decomposition
 +     * or with less than 3 blocks per task, divide in nth blocks.
 +     */
 +    if (!bDomDec || ci_block*3*nth > gridi->nc)
 +    {
 +        ci_block = (gridi->nc + nth - 1)/nth;
-     int  gridj_flag_shift=0,cj_offset=0;
 +    }
 +
 +    return ci_block;
 +}
 +
 +/* Generates the part of pair-list nbl assigned to our thread */
 +static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs,
 +                                     const nbnxn_grid_t *gridi,
 +                                     const nbnxn_grid_t *gridj,
 +                                     nbnxn_search_work_t *work,
 +                                     const nbnxn_atomdata_t *nbat,
 +                                     const t_blocka *excl,
 +                                     real rlist,
 +                                     int nb_kernel_type,
 +                                     int ci_block,
 +                                     gmx_bool bFBufferFlag,
 +                                     int nsubpair_max,
 +                                     gmx_bool progBal,
 +                                     int min_ci_balanced,
 +                                     int th,int nth,
 +                                     nbnxn_pairlist_t *nbl)
 +{
 +    int  na_cj_2log;
 +    matrix box;
 +    real rl2;
 +    float rbb2;
 +    int  d;
 +    int  ci_b,ci,ci_x,ci_y,ci_xy,cj;
 +    ivec shp;
 +    int  tx,ty,tz;
 +    int  shift;
 +    gmx_bool bMakeList;
 +    real shx,shy,shz;
 +    int  conv_i,cell0_i;
 +    const float *bb_i,*bbcz_i,*bbcz_j;
 +    const int *flags_i;
 +    real bx0,bx1,by0,by1,bz0,bz1;
 +    real bz1_frac;
 +    real d2cx,d2z,d2z_cx,d2z_cy,d2zx,d2zxy,d2xy;
 +    int  cxf,cxl,cyf,cyf_x,cyl;
 +    int  cx,cy;
 +    int  c0,c1,cs,cf,cl;
 +    int  ndistc;
 +    int  ncpcheck;
-         init_grid_flags(&work->gridi_flags,gridi);
-         init_grid_flags(&work->gridj_flags,gridj);
-         /* To flag j-blocks for gridj, we need to convert j-clusters to flag blocks */
++    int  gridi_flag_shift=0,gridj_flag_shift=0;
 +    unsigned *gridj_flag=NULL;
 +    int  ncj_old_i,ncj_old_j;
 +
 +    nbs_cycle_start(&work->cc[enbsCCsearch]);
 +
 +    if (gridj->bSimple != nbl->bSimple)
 +    {
 +        gmx_incons("Grid incompatible with pair-list");
 +    }
 +
 +    sync_work(nbl);
 +    nbl->na_sc = gridj->na_sc;
 +    nbl->na_ci = gridj->na_c;
 +    nbl->na_cj = nbnxn_kernel_to_cj_size(nb_kernel_type);
 +    na_cj_2log = get_2log(nbl->na_cj);
 +
 +    nbl->rlist  = rlist;
 +
 +    if (bFBufferFlag)
 +    {
-         while ((nbl->na_cj<<gridj_flag_shift) < NBNXN_CELLBLOCK_SIZE*nbl->na_ci)
++        /* Determine conversion of clusters to flag blocks */
++        gridi_flag_shift = 0;
++        while ((nbl->na_ci<<gridi_flag_shift) < NBNXN_BUFFERFLAG_SIZE)
++        {
++            gridi_flag_shift++;
++        }
 +        gridj_flag_shift = 0;
-         /* We will subtract the cell offset, which is not a multiple of the block size */
-         cj_offset = ci_to_cj(get_2log(nbl->na_cj),gridj->cell0);
++        while ((nbl->na_cj<<gridj_flag_shift) < NBNXN_BUFFERFLAG_SIZE)
 +        {
 +            gridj_flag_shift++;
 +        }
-         gridj_flag = work->gridj_flags.flag;
 +
-                                         cbf = (nbl->cj[ncj_old_j].cj - cj_offset) >> gridj_flag_shift;
-                                         cbl = (nbl->cj[nbl->ncj-1].cj - cj_offset) >> gridj_flag_shift;
++        gridj_flag = work->buffer_flags.flag;
 +    }
 +
 +    copy_mat(nbs->box,box);
 +
 +    rl2 = nbl->rlist*nbl->rlist;
 +
 +    rbb2 = boundingbox_only_distance2(gridi,gridj,nbl->rlist,nbl->bSimple);
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"nbl bounding box only distance %f\n",sqrt(rbb2));
 +    }
 +
 +    /* Set the shift range */
 +    for(d=0; d<DIM; d++)
 +    {
 +        /* Check if we need periodicity shifts.
 +         * Without PBC or with domain decomposition we don't need them.
 +         */
 +        if (d >= ePBC2npbcdim(nbs->ePBC) || nbs->dd_dim[d])
 +        {
 +            shp[d] = 0;
 +        }
 +        else
 +        {
 +            if (d == XX &&
 +                box[XX][XX] - fabs(box[YY][XX]) - fabs(box[ZZ][XX]) < sqrt(rl2))
 +            {
 +                shp[d] = 2;
 +            }
 +            else
 +            {
 +                shp[d] = 1;
 +            }
 +        }
 +    }
 +
 +    if (nbl->bSimple && !gridi->bSimple)
 +    {
 +        conv_i  = gridi->na_sc/gridj->na_sc;
 +        bb_i    = gridi->bb_simple;
 +        bbcz_i  = gridi->bbcz_simple;
 +        flags_i = gridi->flags_simple;
 +    }
 +    else
 +    {
 +        conv_i  = 1;
 +        bb_i    = gridi->bb;
 +        bbcz_i  = gridi->bbcz;
 +        flags_i = gridi->flags;
 +    }
 +    cell0_i = gridi->cell0*conv_i;
 +
 +    bbcz_j = gridj->bbcz;
 +
 +    if (conv_i != 1)
 +    {
 +        /* Blocks of the conversion factor - 1 give a large repeat count
 +         * combined with a small block size. This should result in good
 +         * load balancing for both small and large domains.
 +         */
 +        ci_block = conv_i - 1;
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug,"nbl nc_i %d col.av. %.1f ci_block %d\n",
 +                gridi->nc,gridi->nc/(double)(gridi->ncx*gridi->ncy),ci_block);
 +    }
 +
 +    ndistc = 0;
 +    ncpcheck = 0;
 +
 +    /* Initially set ci_b and ci to 1 before where we want them to start,
 +     * as they will both be incremented in next_ci.
 +     */
 +    ci_b = -1;
 +    ci   = th*ci_block - 1;
 +    ci_x = 0;
 +    ci_y = 0;
 +    while (next_ci(gridi,conv_i,nth,ci_block,&ci_x,&ci_y,&ci_b,&ci))
 +    {
 +        if (nbl->bSimple && flags_i[ci] == 0)
 +        {
 +            continue;
 +        }
 +
 +        ncj_old_i = nbl->ncj;
 +
 +        d2cx = 0;
 +        if (gridj != gridi && shp[XX] == 0)
 +        {
 +            if (nbl->bSimple)
 +            {
 +                bx1 = bb_i[ci*NNBSBB_B+NNBSBB_C+XX];
 +            }
 +            else
 +            {
 +                bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx;
 +            }
 +            if (bx1 < gridj->c0[XX])
 +            {
 +                d2cx = sqr(gridj->c0[XX] - bx1);
 +
 +                if (d2cx >= rl2)
 +                {
 +                    continue;
 +                }
 +            }
 +        }
 +
 +        ci_xy = ci_x*gridi->ncy + ci_y;
 +
 +        /* Loop over shift vectors in three dimensions */
 +        for (tz=-shp[ZZ]; tz<=shp[ZZ]; tz++)
 +        {
 +            shz = tz*box[ZZ][ZZ];
 +
 +            bz0 = bbcz_i[ci*NNBSBB_D  ] + shz;
 +            bz1 = bbcz_i[ci*NNBSBB_D+1] + shz;
 +
 +            if (tz == 0)
 +            {
 +                d2z = 0;
 +            }
 +            else if (tz < 0)
 +            {
 +                d2z = sqr(bz1);
 +            }
 +            else
 +            {
 +                d2z = sqr(bz0 - box[ZZ][ZZ]);
 +            }
 +
 +            d2z_cx = d2z + d2cx;
 +
 +            if (d2z_cx >= rl2)
 +            {
 +                continue;
 +            }
 +
 +            bz1_frac =
 +                bz1/((real)(gridi->cxy_ind[ci_xy+1] - gridi->cxy_ind[ci_xy]));
 +            if (bz1_frac < 0)
 +            {
 +                bz1_frac = 0;
 +            }
 +            /* The check with bz1_frac close to or larger than 1 comes later */
 +
 +            for (ty=-shp[YY]; ty<=shp[YY]; ty++)
 +            {
 +                shy = ty*box[YY][YY] + tz*box[ZZ][YY];
 +
 +                if (nbl->bSimple)
 +                {
 +                    by0 = bb_i[ci*NNBSBB_B         +YY] + shy;
 +                    by1 = bb_i[ci*NNBSBB_B+NNBSBB_C+YY] + shy;
 +                }
 +                else
 +                {
 +                    by0 = gridi->c0[YY] + (ci_y  )*gridi->sy + shy;
 +                    by1 = gridi->c0[YY] + (ci_y+1)*gridi->sy + shy;
 +                }
 +
 +                get_cell_range(by0,by1,
 +                               gridj->ncy,gridj->c0[YY],gridj->sy,gridj->inv_sy,
 +                               d2z_cx,rl2,
 +                               &cyf,&cyl);
 +
 +                if (cyf > cyl)
 +                {
 +                    continue;
 +                }
 +
 +                d2z_cy = d2z;
 +                if (by1 < gridj->c0[YY])
 +                {
 +                    d2z_cy += sqr(gridj->c0[YY] - by1);
 +                }
 +                else if (by0 > gridj->c1[YY])
 +                {
 +                    d2z_cy += sqr(by0 - gridj->c1[YY]);
 +                }
 +
 +                for (tx=-shp[XX]; tx<=shp[XX]; tx++)
 +                {
 +                    shift = XYZ2IS(tx,ty,tz);
 +
 +#ifdef NBNXN_SHIFT_BACKWARD
 +                    if (gridi == gridj && shift > CENTRAL)
 +                    {
 +                        continue;
 +                    }
 +#endif
 +
 +                    shx = tx*box[XX][XX] + ty*box[YY][XX] + tz*box[ZZ][XX];
 +
 +                    if (nbl->bSimple)
 +                    {
 +                        bx0 = bb_i[ci*NNBSBB_B         +XX] + shx;
 +                        bx1 = bb_i[ci*NNBSBB_B+NNBSBB_C+XX] + shx;
 +                    }
 +                    else
 +                    {
 +                        bx0 = gridi->c0[XX] + (ci_x  )*gridi->sx + shx;
 +                        bx1 = gridi->c0[XX] + (ci_x+1)*gridi->sx + shx;
 +                    }
 +
 +                    get_cell_range(bx0,bx1,
 +                                   gridj->ncx,gridj->c0[XX],gridj->sx,gridj->inv_sx,
 +                                   d2z_cy,rl2,
 +                                   &cxf,&cxl);
 +
 +                    if (cxf > cxl)
 +                    {
 +                        continue;
 +                    }
 +
 +                    if (nbl->bSimple)
 +                    {
 +                        new_ci_entry(nbl,cell0_i+ci,shift,flags_i[ci],
 +                                     nbl->work);
 +                    }
 +                    else
 +                    {
 +                        new_sci_entry(nbl,cell0_i+ci,shift,flags_i[ci],
 +                                      nbl->work);
 +                    }
 +
 +#ifndef NBNXN_SHIFT_BACKWARD
 +                    if (cxf < ci_x)
 +#else
 +                    if (shift == CENTRAL && gridi == gridj &&
 +                        cxf < ci_x)
 +#endif
 +                    {
 +                        /* Leave the pairs with i > j.
 +                         * x is the major index, so skip half of it.
 +                         */
 +                        cxf = ci_x;
 +                    }
 +
 +                    if (nbl->bSimple)
 +                    {
 +                        set_icell_bb_simple(bb_i,ci,shx,shy,shz,
 +                                            nbl->work->bb_ci);
 +                    }
 +                    else
 +                    {
 +                        set_icell_bb_supersub(bb_i,ci,shx,shy,shz,
 +                                              nbl->work->bb_ci);
 +                    }
 +
 +                    nbs->icell_set_x(cell0_i+ci,shx,shy,shz,
 +                                     gridi->na_c,nbat->xstride,nbat->x,
 +                                     nbl->work);
 +
 +                    for(cx=cxf; cx<=cxl; cx++)
 +                    {
 +                        d2zx = d2z;
 +                        if (gridj->c0[XX] + cx*gridj->sx > bx1)
 +                        {
 +                            d2zx += sqr(gridj->c0[XX] + cx*gridj->sx - bx1);
 +                        }
 +                        else if (gridj->c0[XX] + (cx+1)*gridj->sx < bx0)
 +                        {
 +                            d2zx += sqr(gridj->c0[XX] + (cx+1)*gridj->sx - bx0);
 +                        }
 +
 +#ifndef NBNXN_SHIFT_BACKWARD
 +                        if (gridi == gridj &&
 +                            cx == 0 && cyf < ci_y)
 +#else
 +                        if (gridi == gridj &&
 +                            cx == 0 && shift == CENTRAL && cyf < ci_y)
 +#endif
 +                        {
 +                            /* Leave the pairs with i > j.
 +                             * Skip half of y when i and j have the same x.
 +                             */
 +                            cyf_x = ci_y;
 +                        }
 +                        else
 +                        {
 +                            cyf_x = cyf;
 +                        }
 +
 +                        for(cy=cyf_x; cy<=cyl; cy++)
 +                        {
 +                            c0 = gridj->cxy_ind[cx*gridj->ncy+cy];
 +                            c1 = gridj->cxy_ind[cx*gridj->ncy+cy+1];
 +#ifdef NBNXN_SHIFT_BACKWARD
 +                            if (gridi == gridj &&
 +                                shift == CENTRAL && c0 < ci)
 +                            {
 +                                c0 = ci;
 +                            }
 +#endif
 +
 +                            d2zxy = d2zx;
 +                            if (gridj->c0[YY] + cy*gridj->sy > by1)
 +                            {
 +                                d2zxy += sqr(gridj->c0[YY] + cy*gridj->sy - by1);
 +                            }
 +                            else if (gridj->c0[YY] + (cy+1)*gridj->sy < by0)
 +                            {
 +                                d2zxy += sqr(gridj->c0[YY] + (cy+1)*gridj->sy - by0);
 +                            }
 +                            if (c1 > c0 && d2zxy < rl2)
 +                            {
 +                                cs = c0 + (int)(bz1_frac*(c1 - c0));
 +                                if (cs >= c1)
 +                                {
 +                                    cs = c1 - 1;
 +                                }
 +
 +                                d2xy = d2zxy - d2z;
 +
 +                                /* Find the lowest cell that can possibly
 +                                 * be within range.
 +                                 */
 +                                cf = cs;
 +                                while(cf > c0 &&
 +                                      (bbcz_j[cf*NNBSBB_D+1] >= bz0 ||
 +                                       d2xy + sqr(bbcz_j[cf*NNBSBB_D+1] - bz0) < rl2))
 +                                {
 +                                    cf--;
 +                                }
 +
 +                                /* Find the highest cell that can possibly
 +                                 * be within range.
 +                                 */
 +                                cl = cs;
 +                                while(cl < c1-1 &&
 +                                      (bbcz_j[cl*NNBSBB_D] <= bz1 ||
 +                                       d2xy + sqr(bbcz_j[cl*NNBSBB_D] - bz1) < rl2))
 +                                {
 +                                    cl++;
 +                                }
 +
 +#ifdef NBNXN_REFCODE
 +                                {
 +                                    /* Simple reference code, for debugging,
 +                                     * overrides the more complex code above.
 +                                     */
 +                                    int k;
 +                                    cf = c1;
 +                                    cl = -1;
 +                                    for(k=c0; k<c1; k++)
 +                                    {
 +                                        if (box_dist2(bx0,bx1,by0,by1,bz0,bz1,
 +                                                      bb+k*NNBSBB_B) < rl2 &&
 +                                            k < cf)
 +                                        {
 +                                            cf = k;
 +                                        }
 +                                        if (box_dist2(bx0,bx1,by0,by1,bz0,bz1,
 +                                                      bb+k*NNBSBB_B) < rl2 &&
 +                                            k > cl)
 +                                        {
 +                                            cl = k;
 +                                        }
 +                                    }
 +                                }
 +#endif
 +
 +                                if (gridi == gridj)
 +                                {
 +                                    /* We want each atom/cell pair only once,
 +                                     * only use cj >= ci.
 +                                     */
 +#ifndef NBNXN_SHIFT_BACKWARD
 +                                    cf = max(cf,ci);
 +#else
 +                                    if (shift == CENTRAL)
 +                                    {
 +                                        cf = max(cf,ci);
 +                                    }
 +#endif
 +                                }
 +
 +                                if (cf <= cl)
 +                                {
 +                                    /* For f buffer flags with simple lists */
 +                                    ncj_old_j = nbl->ncj;
 +
 +                                    switch (nb_kernel_type)
 +                                    {
 +                                    case nbk4x4_PlainC:
 +                                        check_subcell_list_space_simple(nbl,cl-cf+1);
 +
 +                                        make_cluster_list_simple(gridj,
 +                                                                 nbl,ci,cf,cl,
 +                                                                 (gridi == gridj && shift == CENTRAL),
 +                                                                 nbat->x,
 +                                                                 rl2,rbb2,
 +                                                                 &ndistc);
 +                                        break;
 +#ifdef NBNXN_SEARCH_SSE
 +                                    case nbk4xN_X86_SIMD128:
 +                                        check_subcell_list_space_simple(nbl,ci_to_cj(na_cj_2log,cl-cf)+2);
 +                                        make_cluster_list_x86_simd128(gridj,
 +                                                                      nbl,ci,cf,cl,
 +                                                                      (gridi == gridj && shift == CENTRAL),
 +                                                                      nbat->x,
 +                                                                      rl2,rbb2,
 +                                                                      &ndistc);
 +                                        break;
 +#ifdef GMX_X86_AVX_256
 +                                    case nbk4xN_X86_SIMD256:
 +                                        check_subcell_list_space_simple(nbl,ci_to_cj(na_cj_2log,cl-cf)+2);
 +                                        make_cluster_list_x86_simd256(gridj,
 +                                                                      nbl,ci,cf,cl,
 +                                                                      (gridi == gridj && shift == CENTRAL),
 +                                                                      nbat->x,
 +                                                                      rl2,rbb2,
 +                                                                      &ndistc);
 +                                        break;
 +#endif
 +#endif
 +                                    case nbk8x8x8_PlainC:
 +                                    case nbk8x8x8_CUDA:
 +                                        check_subcell_list_space_supersub(nbl,cl-cf+1);
 +                                        for(cj=cf; cj<=cl; cj++)
 +                                        {
 +                                            make_cluster_list_supersub(nbs,gridi,gridj,
 +                                                                       nbl,ci,cj,
 +                                                                       (gridi == gridj && shift == CENTRAL && ci == cj),
 +                                                                       nbat->xstride,nbat->x,
 +                                                                       rl2,rbb2,
 +                                                                       &ndistc);
 +                                        }
 +                                        break;
 +                                    }
 +                                    ncpcheck += cl - cf + 1;
 +
 +                                    if (bFBufferFlag && nbl->ncj > ncj_old_j)
 +                                    {
 +                                        int cbf,cbl,cb;
 +
-             work->gridi_flags.flag[ci>>NBNXN_CELLBLOCK_SIZE_2LOG] = 1U<<th;
++                                        cbf = nbl->cj[ncj_old_j].cj >> gridj_flag_shift;
++                                        cbl = nbl->cj[nbl->ncj-1].cj >> gridj_flag_shift;
 +                                        for(cb=cbf; cb<=cbl; cb++)
 +                                        {
 +                                            gridj_flag[cb] = 1U<<th;
 +                                        }
 +                                    }
 +                                }
 +                            }
 +                        }
 +                    }
 +
 +                    /* Set the exclusions for this ci list */
 +                    if (nbl->bSimple)
 +                    {
 +                        set_ci_top_excls(nbs,
 +                                         nbl,
 +                                         shift == CENTRAL && gridi == gridj,
 +                                         gridj->na_c_2log,
 +                                         na_cj_2log,
 +                                         &(nbl->ci[nbl->nci]),
 +                                         excl);
 +                    }
 +                    else
 +                    {
 +                        set_sci_top_excls(nbs,
 +                                          nbl,
 +                                          shift == CENTRAL && gridi == gridj,
 +                                          gridj->na_c_2log,
 +                                          &(nbl->sci[nbl->nsci]),
 +                                          excl);
 +                    }
 +
 +                    /* Close this ci list */
 +                    if (nbl->bSimple)
 +                    {
 +                        close_ci_entry_simple(nbl);
 +                    }
 +                    else
 +                    {
 +                        close_ci_entry_supersub(nbl,
 +                                                nsubpair_max,
 +                                                progBal,min_ci_balanced,
 +                                                th,nth);
 +                    }
 +                }
 +            }
 +        }
 +
 +        if (bFBufferFlag && nbl->ncj > ncj_old_i)
 +        {
- static void reduce_cellblock_flags(const nbnxn_search_t nbs,
-                                    int nnbl,
-                                    const nbnxn_grid_t *gridi,
-                                    const nbnxn_grid_t *gridj)
++            work->buffer_flags.flag[(gridi->cell0+ci)>>gridi_flag_shift] = 1U<<th;
 +        }
 +    }
 +
 +    work->ndistc = ndistc;
 +
 +    nbs_cycle_stop(&work->cc[enbsCCsearch]);
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"number of distance checks %d\n",ndistc);
 +        fprintf(debug,"ncpcheck %s %d\n",gridi==gridj ? "local" : "non-local",
 +                ncpcheck);
 +
 +        if (nbl->bSimple)
 +        {
 +            print_nblist_statistics_simple(debug,nbl,nbs,rlist);
 +        }
 +        else
 +        {
 +            print_nblist_statistics_supersub(debug,nbl,nbs,rlist);
 +        }
 +
 +    }
 +}
 +
-     int nbl,cb;
++static void reduce_buffer_flags(const nbnxn_search_t nbs,
++                                int nsrc,
++                                const nbnxn_buffer_flags_t *dest)
 +{
-     if (gridi->cellblock_flags.bUse)
++    int s,b;
 +    const unsigned *flag;
 +
-         for(nbl=0; nbl<nnbl; nbl++)
-         {
-             flag = nbs->work[nbl].gridi_flags.flag;
-             
-             for(cb=0; cb<gridi->cellblock_flags.ncb; cb++)
-             {
-                 gridi->cellblock_flags.flag[cb] |= flag[cb];
-             }
-         }
-     }
-     if (gridj->cellblock_flags.bUse)
-     {
-         for(nbl=0; nbl<nnbl; nbl++)
-         {
-             flag = nbs->work[nbl].gridj_flags.flag;
++    for(s=0; s<nsrc; s++)
 +    {
-             for(cb=0; cb<gridj->cellblock_flags.ncb; cb++)
-             {
-                 gridj->cellblock_flags.flag[cb] |= flag[cb];
-             }
++        flag = nbs->work[s].buffer_flags.flag;
 +
- static void print_reduction_cost(const nbnxn_grid_t *grids,int ngrid,int nnbl)
++        for(b=0; b<dest->nflag; b++)
++        {
++            dest->flag[b] |= flag[b];
 +        }
 +    }
 +}
 +
-     int g,c0,c,cb,nbl;
-     const nbnxn_grid_t *grid;
++static void print_reduction_cost(const nbnxn_buffer_flags_t *flags,int nout)
 +{
-     for(g=0; g<ngrid; g++)
++    int nelem,nkeep,ncopy,nred,b,c,out;
 +
-         grid = &grids[g];
-         c0 = 0;
-         if (grid->cellblock_flags.bUse)
++    nelem = 0;
++    nkeep = 0;
++    ncopy = 0;
++    nred  = 0;
++    for(b=0; b<flags->nflag; b++)
 +    {
-             c  = 0;
-             for(cb=0; cb<grid->cellblock_flags.ncb; cb++)
++        if (flags->flag[b] == 1)
 +        {
-                 for(nbl=0; nbl<nnbl; nbl++)
++            /* Only flag 0 is set, no copy or reduction required */
++            nelem++;
++            nkeep++;
++        }
++        else if (flags->flag[b] > 0)
++        {
++            c = 0;
++            for(out=0; out<nout; out++)
 +            {
-                     if (grid->cellblock_flags.flag[cb] == 1)
-                     {
-                         c0++;
-                     }
-                     else if (grid->cellblock_flags.flag[cb] & (1U<<nbl))
-                     {
-                         c++;
-                     }
++                if (flags->flag[b] & (1U<<out))
 +                {
-         else
-         {
-             c = nnbl*grid->cellblock_flags.ncb;
-         }
-         fprintf(debug,"nbnxn reduction buffers, grid %d: %d flag %d only buf. 0: %4.2f av. reduction: %4.2f\n",
-                 g,nnbl,grid->cellblock_flags.bUse,
-                 c0/(double)(grid->cellblock_flags.ncb),
-                 c/(double)(grid->cellblock_flags.ncb));
++                    c++;
 +                }
 +            }
++            nelem += c;
++            if (c == 1)
++            {
++                ncopy++;
++            }
++            else
++            {
++                nred += c;
++            }
 +        }
-                          const nbnxn_atomdata_t *nbat,
 +    }
++
++    fprintf(debug,"nbnxn reduction: #flag %d #list %d elem %4.2f, keep %4.2f copy %4.2f red %4.2f\n",
++            flags->nflag,nout,
++            nelem/(double)(flags->nflag),
++            nkeep/(double)(flags->nflag),
++            ncopy/(double)(flags->nflag),
++            nred/(double)(flags->nflag));
 +}
 +
 +/* Make a local or non-local pair-list, depending on iloc */
 +void nbnxn_make_pairlist(const nbnxn_search_t nbs,
-     gmx_bool CombineNBLists,bFBufferFlag;
++                         nbnxn_atomdata_t *nbat,
 +                         const t_blocka *excl,
 +                         real rlist,
 +                         int min_ci_balanced,
 +                         nbnxn_pairlist_set_t *nbl_list,
 +                         int iloc,
 +                         int nb_kernel_type,
 +                         t_nrnb *nrnb)
 +{
 +    nbnxn_grid_t *gridi,*gridj;
 +    int nzi,zi,zj0,zj1,zj;
 +    int nsubpair_max;
 +    int th;
 +    int nnbl;
 +    nbnxn_pairlist_t **nbl;
 +    int ci_block;
-                 bFBufferFlag = FALSE;
++    gmx_bool CombineNBLists;
 +    int np_tot,np_noq,np_hlj,nap;
 +
 +    nnbl            = nbl_list->nnbl;
 +    nbl             = nbl_list->nbl;
 +    CombineNBLists  = nbl_list->bCombined;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"ns making %d nblists\n", nnbl);
 +    }
 +
++    nbat->bUseBufferFlags = (nbat->nout > 1);
++    if (nbat->bUseBufferFlags && LOCAL_I(iloc))
++    {
++        init_buffer_flags(&nbat->buffer_flags,nbat->natoms);
++    }
++
 +    if (nbl_list->bSimple)
 +    {
 +        switch (nb_kernel_type)
 +        {
 +#ifdef NBNXN_SEARCH_SSE
 +        case nbk4xN_X86_SIMD128:
 +            nbs->icell_set_x = icell_set_x_x86_simd128;
 +            break;
 +#ifdef GMX_X86_AVX_256
 +        case nbk4xN_X86_SIMD256:
 +            nbs->icell_set_x = icell_set_x_x86_simd256;
 +            break;
 +#endif
 +#endif
 +        default:
 +            nbs->icell_set_x = icell_set_x_simple;
 +            break;
 +        }
 +    }
 +    else
 +    {
 +#ifdef NBNXN_SEARCH_SSE
 +        nbs->icell_set_x = icell_set_x_supersub_sse8;
 +#else
 +        nbs->icell_set_x = icell_set_x_supersub;
 +#endif
 +    }
 +
 +    if (LOCAL_I(iloc))
 +    {
 +        /* Only zone (grid) 0 vs 0 */
 +        nzi = 1;
 +        zj0 = 0;
 +        zj1 = 1;
 +    }
 +    else
 +    {
 +        nzi = nbs->zones->nizone;
 +    }
 +
 +    if (!nbl_list->bSimple && min_ci_balanced > 0)
 +    {
 +        nsubpair_max = get_nsubpair_max(nbs,iloc,rlist,min_ci_balanced);
 +    }
 +    else
 +    {
 +        nsubpair_max = 0;
 +    }
 +
 +    /* Clear all pair-lists */
 +    for(th=0; th<nnbl; th++)
 +    {
 +        clear_pairlist(nbl[th]);
 +    }
 +
 +    for(zi=0; zi<nzi; zi++)
 +    {
 +        gridi = &nbs->grid[zi];
 +
 +        if (NONLOCAL_I(iloc))
 +        {
 +            zj0 = nbs->zones->izone[zi].j0;
 +            zj1 = nbs->zones->izone[zi].j1;
 +            if (zi == 0)
 +            {
 +                zj0++;
 +            }
 +        }
 +        for(zj=zj0; zj<zj1; zj++)
 +        {
 +            gridj = &nbs->grid[zj];
 +
 +            if (debug)
 +            {
 +                fprintf(debug,"ns search grid %d vs %d\n",zi,zj);
 +            }
 +
 +            nbs_cycle_start(&nbs->cc[enbsCCsearch]);
 +
 +            if (nbl[0]->bSimple && !gridi->bSimple)
 +            {
 +                /* Hybrid list, determine blocking later */
 +                ci_block = 0;
-                 ci_block = get_ci_block_size(gridi,nbs->DomDec,nnbl,
-                                              &bFBufferFlag);
-                 if (CombineNBLists)
-                 {
-                     bFBufferFlag = FALSE;
-                 }
-             }
-             if (debug != NULL)
-             {
-                 fprintf(debug,"grid %d %d F buffer flags %d\n",
-                         zi,zj,bFBufferFlag);
 +            }
 +            else
 +            {
-                                          bFBufferFlag,
++                ci_block = get_ci_block_size(gridi,nbs->DomDec,nnbl);
 +            }
 +
 +#pragma omp parallel for num_threads(nnbl) schedule(static)
 +            for(th=0; th<nnbl; th++)
 +            {
++                if (nbat->bUseBufferFlags && zi == 0 && zj == 0)
++                {
++                    init_buffer_flags(&nbs->work[th].buffer_flags,nbat->natoms);
++                }
++
 +                if (CombineNBLists && th > 0)
 +                {
 +                    clear_pairlist(nbl[th]);
 +                }
 +
 +                /* Divide the i super cell equally over the nblists */
 +                nbnxn_make_pairlist_part(nbs,gridi,gridj,
 +                                         &nbs->work[th],nbat,excl,
 +                                         rlist,
 +                                         nb_kernel_type,
 +                                         ci_block,
-             if (bFBufferFlag)
-             {
-                 reduce_cellblock_flags(nbs,nnbl,gridi,gridj);
-             }
-             else
-             {
-                 gridi->cellblock_flags.bUse = FALSE;
-                 gridj->cellblock_flags.bUse = FALSE;
-             }
++                                         nbat->bUseBufferFlags,
 +                                         nsubpair_max,
 +                                         (LOCAL_I(iloc) || nbs->zones->n <= 2),
 +                                         min_ci_balanced,
 +                                         th,nnbl,
 +                                         nbl[th]);
 +            }
 +            nbs_cycle_stop(&nbs->cc[enbsCCsearch]);
 +
 +            np_tot = 0;
 +            np_noq = 0;
 +            np_hlj = 0;
 +            for(th=0; th<nnbl; th++)
 +            {
 +                inc_nrnb(nrnb,eNR_NBNXN_DIST2,nbs->work[th].ndistc);
 +
 +                if (nbl_list->bSimple)
 +                {
 +                    np_tot += nbl[th]->ncj;
 +                    np_noq += nbl[th]->work->ncj_noq;
 +                    np_hlj += nbl[th]->work->ncj_hlj;
 +                }
 +                else
 +                {
 +                    /* This count ignores potential subsequent pair pruning */
 +                    np_tot += nbl[th]->nci_tot;
 +                }
 +            }
 +            nap = nbl[0]->na_ci*nbl[0]->na_cj;
 +            nbl_list->natpair_ljq = (np_tot - np_noq)*nap - np_hlj*nap/2;
 +            nbl_list->natpair_lj  = np_noq*nap;
 +            nbl_list->natpair_q   = np_hlj*nap/2;
 +
 +            if (CombineNBLists && nnbl > 1)
 +            {
 +                nbs_cycle_start(&nbs->cc[enbsCCcombine]);
 +
 +                combine_nblists(nnbl-1,nbl+1,nbl[0]);
 +
 +                nbs_cycle_stop(&nbs->cc[enbsCCcombine]);
 +            }
-     if (gmx_debug_at)
 +        }
 +    }
 +
++    if (nbat->bUseBufferFlags)
++    {
++        reduce_buffer_flags(nbs,nnbl,&nbat->buffer_flags);
++    }
++
 +    /*
 +    print_supersub_nsp("nsubpair",nbl[0],iloc);
 +    */
 +
 +    /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */
 +    if (LOCAL_I(iloc))
 +    {
 +        nbs->search_count++;
 +    }
 +    if (nbs->print_cycles &&
 +        (!nbs->DomDec || (nbs->DomDec && !LOCAL_I(iloc))) &&
 +        nbs->search_count % 100 == 0)
 +    {
 +        nbs_cycle_print(stderr,nbs);
 +    }
 +
 +    if (debug && (CombineNBLists && nnbl > 1))
 +    {
 +        if (nbl[0]->bSimple)
 +        {
 +            print_nblist_statistics_simple(debug,nbl[0],nbs,rlist);
 +        }
 +        else
 +        {
 +            print_nblist_statistics_supersub(debug,nbl[0],nbs,rlist);
 +        }
 +    }
 +
-         if (nbl[0]->bSimple)
++    if (debug)
 +    {
-             print_nblist_ci_cj(debug,nbl[0]);
++        if (gmx_debug_at)
 +        {
-         else
++            if (nbl[0]->bSimple)
++            {
++                print_nblist_ci_cj(debug,nbl[0]);
++            }
++            else
++            {
++                print_nblist_sci_cj(debug,nbl[0]);
++            }
 +        }
-             print_nblist_sci_cj(debug,nbl[0]);
++
++        if (nbat->bUseBufferFlags)
 +        {
-         print_reduction_cost(nbs->grid,nbs->ngrid,nnbl);
++            print_reduction_cost(&nbat->buffer_flags,nnbl);
 +        }
 +    }
 +}
index 891376c61773584fe74f146399c37aa787fb3f53,0000000000000000000000000000000000000000..ddd87698318091eaad12cc9ffa7cf9c032bb03fe
mode 100644,000000..100644
--- /dev/null
@@@ -1,136 -1,0 +1,136 @@@
-                        const nbnxn_atomdata_t *nbat,
-                        const t_blocka *excl,
-                        real rlist,
-                        int min_ci_balanced,
-                        nbnxn_pairlist_set_t *nbl_list,
-                        int iloc,
-                        int nb_kernel_type,
-                        t_nrnb *nrnb);
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustr
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2012, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +
 +#ifndef _nbnxn_search_h
 +#define _nsnxn_search_h
 +
 +#include "typedefs.h"
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +
 +
 +/* Returns the j-cluster size for kernel of type nb_kernel_type */
 +int nbnxn_kernel_to_cj_size(int nb_kernel_type);
 +
 +/* Tells if the pair-list corresponding to nb_kernel_type is simple.
 + * Returns FALSE for super-sub type pair-list.
 + */
 +gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type);
 +
 +/* Due to the cluster size the effective pair-list is longer than
 + * that of a simple atom pair-list. This function gives the extra distance.
 + */
 +real nbnxn_get_rlist_effective_inc(int cluster_size,real atom_density);
 +
 +/* Allocates and initializes a pair search data structure */
 +void nbnxn_init_search(nbnxn_search_t * nbs_ptr,
 +                       ivec *n_dd_cells,
 +                       gmx_domdec_zones_t *zones,
 +                       int nthread_max);
 +
 +/* Put the atoms on the pair search grid.
 + * Only atoms a0 to a1 in x are put on the grid.
 + * The atom_density is used to determine the grid size.
 + * When atom_density=-1, the density is determined from a1-a0 and the corners.
 + * With domain decomposition part of the n particles might have migrated,
 + * but have not been removed yet. This count is given by nmoved.
 + * When move[i] < 0 particle i has migrated and will not be put on the grid.
 + * Without domain decomposition move will be NULL.
 + */
 +void nbnxn_put_on_grid(nbnxn_search_t nbs,
 +                       int ePBC,matrix box,
 +                       int dd_zone,
 +                       rvec corner0,rvec corner1,
 +                       int a0,int a1,
 +                       real atom_density,
 +                       const int *atinfo,
 +                       rvec *x,
 +                       int nmoved,int *move,
 +                       int nb_kernel_type,
 +                       nbnxn_atomdata_t *nbat);
 +
 +/* As nbnxn_put_on_grid, but for the non-local atoms
 + * with domain decomposition. Should be called after calling
 + * nbnxn_put_on_grid for the local atoms / home zone.
 + */
 +void nbnxn_put_on_grid_nonlocal(nbnxn_search_t nbs,
 +                                const gmx_domdec_zones_t *zones,
 +                                const int *atinfo,
 +                                rvec *x,
 +                              int nb_kernel_type,
 +                                nbnxn_atomdata_t *nbat);
 +
 +/* Add simple grid type information to the local super/sub grid */
 +void nbnxn_grid_add_simple(nbnxn_search_t nbs,
 +                         nbnxn_atomdata_t *nbat);
 +
 +/* Return the number of x and y cells in the local grid */
 +void nbnxn_get_ncells(nbnxn_search_t nbs,int *ncx,int *ncy);
 +
 +/* Return the order indices *a of the atoms on the ns grid, size n */
 +void nbnxn_get_atomorder(nbnxn_search_t nbs,int **a,int *n);
 +
 +/* Renumber the atom indices on the grid to consecutive order */
 +void nbnxn_set_atomorder(nbnxn_search_t nbs);
 +
 +/* Initializes a set of pair lists stored in nbnxn_pairlist_set_t */
 +void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list,
 +                             gmx_bool simple, gmx_bool combined,
 +                             nbnxn_alloc_t *alloc,
 +                             nbnxn_free_t  *free);
 +
 +/* Make a pair-list with radius rlist, store it in nbl.
 + * The parameter min_ci_balanced sets the minimum required
 + * number of roughly equally sized ci blocks in nbl.
 + * When set >0 ci lists will be chopped up when the estimate
 + * for the number of equally sized lists is below min_ci_balanced.
 + */
 +void nbnxn_make_pairlist(const nbnxn_search_t nbs,
++                         nbnxn_atomdata_t *nbat,
++                         const t_blocka *excl,
++                         real rlist,
++                         int min_ci_balanced,
++                         nbnxn_pairlist_set_t *nbl_list,
++                         int iloc,
++                         int nb_kernel_type,
++                         t_nrnb *nrnb);
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +#endif
index b0dd488e5a55798017a6deeabb0a7351f42cfb22,0000000000000000000000000000000000000000..10d909706e7cd87232a290a693e708b29bae236a
mode 100644,000000..100644
--- /dev/null
@@@ -1,2660 -1,0 +1,2895 @@@
-         fprintf(debug,"reallocating neigborlist (ielec=%d, ivdw=%d, igeometry=%d, free_energy=%d), maxnri=%d\n",
-                 nl->ielec,nl->ivdw,nl->igeometry,nl->free_energy,nl->maxnri);
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +#include <string.h>
 +#include "sysstuff.h"
 +#include "smalloc.h"
 +#include "macros.h"
 +#include "maths.h"
 +#include "vec.h"
 +#include "network.h"
 +#include "nsgrid.h"
 +#include "force.h"
 +#include "nonbonded.h"
 +#include "ns.h"
 +#include "pbc.h"
 +#include "names.h"
 +#include "gmx_fatal.h"
 +#include "nrnb.h"
 +#include "txtdump.h"
 +#include "mtop_util.h"
 +
 +#include "domdec.h"
 +#include "adress.h"
 +
 +
 +/* 
 + *    E X C L U S I O N   H A N D L I N G
 + *
 + * Bit-mask helpers on a t_excl array: bit i of e[j] is set (SETEXCL),
 + * cleared (RMEXCL) or tested (ISEXCL / NOTEXCL).
 + * With DEBUG defined they are functions whose names carry a trailing
 + * underscore, otherwise they are macros.
 + * NOTE(review): the DEBUG variant NOTEXCL_ calls ISEXCL, which is only
 + * defined in the non-DEBUG branch here - confirm that a DEBUG build
 + * resolves it.
 + */
 +
 +#ifdef DEBUG
 +static void SETEXCL_(t_excl e[],atom_id i,atom_id j)
 +{   e[j] = e[j] | (1<<i); }
 +static void RMEXCL_(t_excl e[],atom_id i,atom_id j) 
 +{ e[j]=e[j] & ~(1<<i); }
 +static gmx_bool ISEXCL_(t_excl e[],atom_id i,atom_id j) 
 +{ return (gmx_bool)(e[j] & (1<<i)); }
 +static gmx_bool NOTEXCL_(t_excl e[],atom_id i,atom_id j)
 +{  return !(ISEXCL(e,i,j)); }
 +#else
 +#define SETEXCL(e,i,j) (e)[((atom_id) (j))] |= (1<<((atom_id) (i)))
 +#define RMEXCL(e,i,j)  (e)[((atom_id) (j))] &= (~(1<<((atom_id) (i))))
 +#define ISEXCL(e,i,j)  (gmx_bool) ((e)[((atom_id) (j))] & (1<<((atom_id) (i))))
 +#define NOTEXCL(e,i,j) !(ISEXCL(e,i,j))
 +#endif
 +
 +static int
 +round_up_to_simd_width(int length, int simd_width)
 +{
 +    int offset,newlength;
 +    
 +    offset = (simd_width>0) ? length % simd_width : 0;
 +
 +    return (offset==0) ? length : length-offset+simd_width;
 +}
 +/************************************************
 + *
 + *  U T I L I T I E S    F O R    N S
 + *
 + ************************************************/
 +/* Resizes the per-i-entry arrays of nl to nl->maxnri elements:
 + * iinr, gid, shift, jindex (one extra element) and, for the
 + * charge-group geometry only, iinr_end.
 + * The caller is expected to have set nl->maxnri beforehand.
 + */
 +static void reallocate_nblist(t_nblist *nl)
 +{
 +    if (gmx_debug_at)
 +    {
-                         gmx_bool bfree, int igeometry)
++        fprintf(debug,"reallocating neigborlist (ielec=%d, ivdw=%d, igeometry=%d, type=%d), maxnri=%d\n",
++                nl->ielec,nl->ivdw,nl->igeometry,nl->type,nl->maxnri);
 +    }
 +    srenew(nl->iinr,   nl->maxnri);
 +    if (nl->igeometry == GMX_NBLIST_GEOMETRY_CG_CG)
 +    {
 +        srenew(nl->iinr_end,nl->maxnri);
 +    }
 +    srenew(nl->gid,    nl->maxnri);
 +    srenew(nl->shift,  nl->maxnri);
 +    srenew(nl->jindex, nl->maxnri+1);
 +}
 +
 +/* Initializes a short-range (nl_sr) and a long-range (nl_lr) neighbor list
 + * with the given VdW/electrostatics settings, geometry and interaction type.
 + * Either list pointer may be NULL, in which case that list is skipped.
 + * maxsr and maxlr size the initial i-entry allocation (4 entries per atom).
 + * Lists of type GMX_NBLIST_INTERACTION_FREE_ENERGY always get the
 + * particle-particle geometry, whatever igeometry was passed.
 + * log is only handed to the kernel setup for the short-range list.
 + */
 +static void init_nblist(FILE *log, t_nblist *nl_sr,t_nblist *nl_lr,
 +                        int maxsr,int maxlr,
 +                        int ivdw, int ivdwmod,
 +                        int ielec, int ielecmod,
-         nl->free_energy = bfree;
++                        int igeometry, int type)
 +{
 +    t_nblist *nl;
 +    int      homenr;
 +    int      i,nn;
 +    
 +    for(i=0; (i<2); i++)
 +    {
 +        nl     = (i == 0) ? nl_sr : nl_lr;
 +        homenr = (i == 0) ? maxsr : maxlr;
 +
 +        if (nl == NULL)
 +        {
 +            continue;
 +        }
 +
 +
 +        /* Set coul/vdw in neighborlist, and for the normal loops we determine
 +         * an index of which one to call.
 +         */
 +        nl->ivdw        = ivdw;
 +        nl->ivdwmod     = ivdwmod;
 +        nl->ielec       = ielec;
 +        nl->ielecmod    = ielecmod;
-         if (bfree)
++        nl->type        = type;
 +        nl->igeometry   = igeometry;
 +
-         
++        if (nl->type==GMX_NBLIST_INTERACTION_FREE_ENERGY)
 +        {
 +            nl->igeometry  = GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE;
 +        }
-             fprintf(debug,"Initiating neighbourlist (ielec=%d, ivdw=%d, free=%d) for %s interactions,\nwith %d SR, %d LR atoms.\n",
-                     nl->ielec,nl->ivdw,nl->free_energy,gmx_nblist_geometry_names[nl->igeometry],maxsr,maxlr);
++
 +        /* This will also set the simd_padding_width field */
 +        gmx_nonbonded_set_kernel_pointers( (i==0) ? log : NULL,nl);
 +        
 +        /* maxnri is influenced by the number of shifts (maximum is 8)
 +         * and the number of energy groups.
 +         * If it is not enough, nl memory will be reallocated during the run.
 +         * 4 seems to be a reasonable factor, which only causes reallocation
 +         * during runs with tiny and many energygroups.
 +         */
 +        nl->maxnri      = homenr*4;
 +        nl->maxnrj      = 0;
 +        nl->maxlen      = 0;
 +        nl->nri         = -1;
 +        nl->nrj         = 0;
 +        nl->iinr        = NULL;
 +        nl->gid         = NULL;
 +        nl->shift       = NULL;
 +        nl->jindex      = NULL;
 +        reallocate_nblist(nl);
 +        nl->jindex[0] = 0;
 +
 +        if(debug)
 +        {
-    int ielec,ielecf,ivdw,ielecmod,ielecmodf,ivdwmod;
++            fprintf(debug,"Initiating neighbourlist (ielec=%d, ivdw=%d, type=%d) for %s interactions,\nwith %d SR, %d LR atoms.\n",
++                    nl->ielec,nl->ivdw,nl->type,gmx_nblist_geometry_names[nl->igeometry],maxsr,maxlr);
 +        }
 +    }
 +}
 +/* Initializes all neighbor lists in fr for homenr home atoms.
 + * For every entry of fr->nblists the standard and water-optimized lists
 + * are set up, plus the free-energy lists when fr->efep != efepNO.
 + * With AdResS active, the second half of fr->nblists gets the AdResS
 + * interaction type. The QMMM list is set up when fr->bQMMM is set and
 + * the scheme is not ONIOM.
 + * Solvent optimization is switched off when a water kernel pointer is
 + * missing. Sets fr->ns.bCGlist from the GMX_NBLISTCG environment variable
 + * and finally marks fr->ns.nblist_initialized.
 + */
 +void init_neighbor_list(FILE *log,t_forcerec *fr,int homenr)
 +{
 +   /* Make maxlr tunable! (does not seem to be a big difference though) 
 +    * This parameter determines the number of i particles in a long range 
 +    * neighbourlist. Too few means many function calls, too many means
 +    * cache thrashing.
 +    */
 +   int maxsr,maxsr_wat,maxlr,maxlr_wat;
-                    maxsr,maxlr,ivdw,ivdwmod,ielec,ielecmod,FALSE,igeometry_def);
++   int ielec,ielecf,ivdw,ielecmod,ielecmodf,ivdwmod,type;
 +   int solvent; /* NOTE(review): not referenced anywhere in this function */
 +   int igeometry_def,igeometry_w,igeometry_ww;
 +   int i;
 +   t_nblists *nbl;
 +
 +   /* maxsr     = homenr-fr->nWatMol*3; */
 +   maxsr     = homenr;
 +
 +   if (maxsr < 0)
 +   {
 +     gmx_fatal(FARGS,"%s, %d: Negative number of short range atoms.\n"
 +               "Call your Gromacs dealer for assistance.",__FILE__,__LINE__);
 +   }
 +   /* This is just for initial allocation, so we do not reallocate
 +    * all the nlist arrays many times in a row.
 +    * The numbers seem very accurate, but they are uncritical.
 +    */
 +   maxsr_wat = min(fr->nWatMol,(homenr+2)/3); 
 +   if (fr->bTwinRange) 
 +   {
 +       maxlr     = 50;
 +       maxlr_wat = min(maxsr_wat,maxlr);
 +   }
 +   else
 +   {
 +     maxlr = maxlr_wat = 0;
 +   }  
 +
 +   /* Determine the values for ielec/ivdw. */
 +   ielec = fr->nbkernel_elec_interaction;
 +   ivdw  = fr->nbkernel_vdw_interaction;
 +   ielecmod = fr->nbkernel_elec_modifier;
 +   ivdwmod  = fr->nbkernel_vdw_modifier;
++   type     = GMX_NBLIST_INTERACTION_STANDARD;
 +
 +   fr->ns.bCGlist = (getenv("GMX_NBLISTCG") != 0);
 +   if (!fr->ns.bCGlist)
 +   {
 +       igeometry_def = GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE;
 +   }
 +   else
 +   {
 +       igeometry_def = GMX_NBLIST_GEOMETRY_CG_CG;
 +       if (log != NULL)
 +       {
 +           fprintf(log,"\nUsing charge-group - charge-group neighbor lists and kernels\n\n");
 +       }
 +   }
 +   
 +   if (fr->solvent_opt == esolTIP4P) {
 +       igeometry_w  = GMX_NBLIST_GEOMETRY_WATER4_PARTICLE;
 +       igeometry_ww = GMX_NBLIST_GEOMETRY_WATER4_WATER4;
 +   } else {
 +       igeometry_w  = GMX_NBLIST_GEOMETRY_WATER3_PARTICLE;
 +       igeometry_ww = GMX_NBLIST_GEOMETRY_WATER3_WATER3;
 +   }
 +
 +   for(i=0; i<fr->nnblists; i++) 
 +   {
 +       nbl = &(fr->nblists[i]);
++
++       if ((fr->adress_type!=eAdressOff) && (i>=fr->nnblists/2)){
++           type=GMX_NBLIST_INTERACTION_ADRESS;
++       }
 +       init_nblist(log,&nbl->nlist_sr[eNL_VDWQQ],&nbl->nlist_lr[eNL_VDWQQ],
-                    maxsr,maxlr,ivdw,ivdwmod,GMX_NBKERNEL_ELEC_NONE,eintmodNONE,FALSE,igeometry_def);
++                   maxsr,maxlr,ivdw,ivdwmod,ielec,ielecmod,igeometry_def, type);
 +       init_nblist(log,&nbl->nlist_sr[eNL_VDW],&nbl->nlist_lr[eNL_VDW],
-                    maxsr,maxlr,GMX_NBKERNEL_VDW_NONE,eintmodNONE,ielec,ielecmod,FALSE,igeometry_def);
++                   maxsr,maxlr,ivdw,ivdwmod,GMX_NBKERNEL_ELEC_NONE,eintmodNONE,igeometry_def, type);
 +       init_nblist(log,&nbl->nlist_sr[eNL_QQ],&nbl->nlist_lr[eNL_QQ],
-                    maxsr_wat,maxlr_wat,ivdw,ivdwmod,ielec,ielecmod, FALSE,igeometry_w);
++                   maxsr,maxlr,GMX_NBKERNEL_VDW_NONE,eintmodNONE,ielec,ielecmod,igeometry_def, type);
 +       init_nblist(log,&nbl->nlist_sr[eNL_VDWQQ_WATER],&nbl->nlist_lr[eNL_VDWQQ_WATER],
-                    maxsr_wat,maxlr_wat,GMX_NBKERNEL_VDW_NONE,eintmodNONE,ielec,ielecmod, FALSE,igeometry_w);
++                   maxsr_wat,maxlr_wat,ivdw,ivdwmod,ielec,ielecmod, igeometry_w, type);
 +       init_nblist(log,&nbl->nlist_sr[eNL_QQ_WATER],&nbl->nlist_lr[eNL_QQ_WATER],
-                    maxsr_wat,maxlr_wat,ivdw,ivdwmod,ielec,ielecmod, FALSE,igeometry_ww);
++                   maxsr_wat,maxlr_wat,GMX_NBKERNEL_VDW_NONE,eintmodNONE,ielec,ielecmod, igeometry_w, type);
 +       init_nblist(log,&nbl->nlist_sr[eNL_VDWQQ_WATERWATER],&nbl->nlist_lr[eNL_VDWQQ_WATERWATER],
-                    maxsr_wat,maxlr_wat,GMX_NBKERNEL_VDW_NONE,eintmodNONE,ielec,ielecmod, FALSE,igeometry_ww);
++                   maxsr_wat,maxlr_wat,ivdw,ivdwmod,ielec,ielecmod, igeometry_ww, type);
 +       init_nblist(log,&nbl->nlist_sr[eNL_QQ_WATERWATER],&nbl->nlist_lr[eNL_QQ_WATERWATER],
-                        maxsr,maxlr,ivdw,ivdwmod,ielecf,ielecmod,TRUE,GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE);
++                   maxsr_wat,maxlr_wat,GMX_NBKERNEL_VDW_NONE,eintmodNONE,ielec,ielecmod, igeometry_ww, type);
 +
 +       /* Did we get the solvent loops so we can use optimized water kernels? */
 +       if(nbl->nlist_sr[eNL_VDWQQ_WATER].kernelptr_vf==NULL
 +          || nbl->nlist_sr[eNL_QQ_WATER].kernelptr_vf==NULL
 +#ifndef DISABLE_WATERWATER_NLIST
 +          || nbl->nlist_sr[eNL_VDWQQ_WATERWATER].kernelptr_vf==NULL
 +          || nbl->nlist_sr[eNL_QQ_WATERWATER].kernelptr_vf==NULL
 +#endif
 +          )
 +       { /* NOTE(review): log is used below without the NULL check applied elsewhere in this function */
 +           fr->solvent_opt = esolNO;
 +           fprintf(log,"Note: The available nonbonded kernels do not support water optimization - disabling.\n");
 +       }
 +       
 +       if (fr->efep != efepNO) 
 +       {
 +           if ((fr->bEwald) && (fr->sc_alphacoul > 0)) /* need to handle long range differently if using softcore */
 +           {
 +               ielecf = GMX_NBKERNEL_ELEC_EWALD;
 +               ielecmodf = eintmodNONE; /* NOTE(review): ielecmodf is assigned but the calls below pass ielecmod - confirm intent */
 +           }
 +           else
 +           {
 +               ielecf = ielec;
 +               ielecmodf = ielecmod;
 +           }
 +
 +           init_nblist(log,&nbl->nlist_sr[eNL_VDWQQ_FREE],&nbl->nlist_lr[eNL_VDWQQ_FREE],
-                        maxsr,maxlr,ivdw,ivdwmod,GMX_NBKERNEL_ELEC_NONE,eintmodNONE,TRUE,GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE);
++                       maxsr,maxlr,ivdw,ivdwmod,ielecf,ielecmod,GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE, GMX_NBLIST_INTERACTION_FREE_ENERGY);
 +           init_nblist(log,&nbl->nlist_sr[eNL_VDW_FREE],&nbl->nlist_lr[eNL_VDW_FREE],
-                        maxsr,maxlr,GMX_NBKERNEL_VDW_NONE,eintmodNONE,ielecf,ielecmod,TRUE,GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE);
++                       maxsr,maxlr,ivdw,ivdwmod,GMX_NBKERNEL_ELEC_NONE,eintmodNONE,GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE, GMX_NBLIST_INTERACTION_FREE_ENERGY);
 +           init_nblist(log,&nbl->nlist_sr[eNL_QQ_FREE],&nbl->nlist_lr[eNL_QQ_FREE],
-                    maxsr,maxlr,0,0,ielec,ielecmod,FALSE,GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE);
++                       maxsr,maxlr,GMX_NBKERNEL_VDW_NONE,eintmodNONE,ielecf,ielecmod,GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE, GMX_NBLIST_INTERACTION_FREE_ENERGY);
 +       }  
 +   }
 +   /* QMMM MM list */
 +   if (fr->bQMMM && fr->qr->QMMMscheme != eQMMMschemeoniom)
 +   {
 +       init_nblist(log,&fr->QMMMlist,NULL,
-             fprintf(debug,"Increasing %s nblist (ielec=%d,ivdw=%d,free=%d,igeometry=%d) j size to %d\n",
-                     bLR ? "LR" : "SR",nlist->ielec,nlist->ivdw,nlist->free_energy,nlist->igeometry,nlist->maxnrj);
++                   maxsr,maxlr,0,0,ielec,ielecmod,GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE, GMX_NBLIST_INTERACTION_STANDARD);
 +   }
 +
 +   if(log!=NULL)
 +   {
 +       fprintf(log,"\n");
 +   }
 +
 +   fr->ns.nblist_initialized=TRUE;
 +}
 +/* Marks nl as empty without freeing memory: nri becomes -1 (no i-entry),
 + * nrj and maxlen become 0. jindex[0] is zeroed only when jindex is non-NULL.
 + */
 +static void reset_nblist(t_nblist *nl)
 +{
 +     nl->nri       = -1;
 +     nl->nrj       = 0;
 +     nl->maxlen    = 0;
 +     if (nl->jindex)
 +     {
 +         nl->jindex[0] = 0;
 +     }
 +}
 +/* Resets the neighbor lists of fr before a new search.
 + * bResetSR / bResetLR select whether the short-range and/or long-range
 + * lists of every fr->nblists entry are reset.
 + * The QMMM list is reset whenever fr->bQMMM is set, independent of the flags.
 + */
 +static void reset_neighbor_lists(t_forcerec *fr,gmx_bool bResetSR, gmx_bool bResetLR)
 +{
 +    int n,i;
 +  
 +    if (fr->bQMMM)
 +    {
 +        /* only reset the short-range nblist */
 +        reset_nblist(&(fr->QMMMlist));
 +    }
 +
 +    for(n=0; n<fr->nnblists; n++)
 +    {
 +        for(i=0; i<eNL_NR; i++)
 +        {
 +            if(bResetSR)
 +            {
 +                reset_nblist( &(fr->nblists[n].nlist_sr[i]) );
 +            }
 +            if(bResetLR)
 +            {
 +                reset_nblist( &(fr->nblists[n].nlist_lr[i]) );
 +            }
 +        }
 +    }
 +}
 +
 +
 +
 +/* Opens an i-entry in nlist for the triple (i_atom, shift, gid).
 + * Nothing happens when the current entry already has exactly this triple.
 + * Otherwise the entry counter is advanced only if the list has no entry yet,
 + * or if the current entry received j-atoms and its gid is not -1; in all
 + * other cases the current entry is overwritten in place.
 + * The i-arrays are grown when the counter reaches maxnri.
 + * bLR and the locals i, k and nshift are not used in this function.
 + */
 +static inline void new_i_nblist(t_nblist *nlist,
 +                                gmx_bool bLR,atom_id i_atom,int shift,int gid)
 +{
 +    int    i,k,nri,nshift;
 +    
 +    nri = nlist->nri;
 +    
 +    /* Check whether we have to increase the i counter */
 +    if ((nri == -1) ||
 +        (nlist->iinr[nri]  != i_atom) || 
 +        (nlist->shift[nri] != shift) || 
 +        (nlist->gid[nri]   != gid))
 +    {
 +        /* This is something else. Now see if any entries have 
 +         * been added in the list of the previous atom.
 +         */
 +        if ((nri == -1) ||
 +            ((nlist->jindex[nri+1] > nlist->jindex[nri]) && 
 +             (nlist->gid[nri] != -1)))
 +        {
 +            /* If so increase the counter */
 +            nlist->nri++;
 +            nri++;
 +            if (nlist->nri >= nlist->maxnri)
 +            {
 +                nlist->maxnri += over_alloc_large(nlist->nri);
 +                reallocate_nblist(nlist);
 +            }
 +        }
 +        /* Set the number of neighbours and the atom number */
 +        nlist->jindex[nri+1] = nlist->jindex[nri];
 +        nlist->iinr[nri]     = i_atom;
 +        nlist->gid[nri]      = gid;
 +        nlist->shift[nri]    = shift;
 +    }
 +}
 +/* Finishes the current i-entry of nlist; does nothing when nri < 0.
 + * jjnr is padded with -4 entries until nrj is a multiple of
 + * simd_padding_width, the end index is stored in jindex[nri+1] and
 + * maxlen is raised to the padded length of this entry if that is larger.
 + * NOTE(review): assumes simd_padding_width is non-zero (it is the divisor
 + * of the modulo below) - confirm it is always set before this is called.
 + */
 +static inline void close_i_nblist(t_nblist *nlist) 
 +{
 +    int nri = nlist->nri;
 +    int len;
 +    
 +    if (nri >= 0)
 +    {
 +        /* Add elements up to padding. Since we allocate memory in units
 +         * of the simd_padding width, we do not have to check for possible
 +         * list reallocation here.
 +         */
 +        while((nlist->nrj % nlist->simd_padding_width)!=0)
 +        {
 +            /* Use -4 here, so we can write forces for 4 atoms before real data */
 +            nlist->jjnr[nlist->nrj++]=-4;
 +        }
 +        nlist->jindex[nri+1] = nlist->nrj;
 +        
 +        len=nlist->nrj -  nlist->jindex[nri];
 +        
 +        /* nlist length for water i molecules is treated statically 
 +         * in the innerloops 
 +         */
 +        if (len > nlist->maxlen)
 +        {
 +            nlist->maxlen = len;
 +        }
 +    }
 +}
 +/* Finalizes nlist after the search.
 + * A list without j-entries (nrj == 0) gets nri = 0; otherwise nri is
 + * incremented, turning the index of the last i-entry into the entry count.
 + */
 +static inline void close_nblist(t_nblist *nlist)
 +{
 +    /* Only close this nblist when it has been initialized.
 +     * Avoid the creation of i-lists with no j-particles.
 +     */
 +    if (nlist->nrj == 0)
 +    {
 +        /* Some assembly kernels do not support empty lists,
 +         * make sure here that we don't generate any empty lists.
 +         * With the current ns code this branch is taken in two cases:
 +         * No i-particles at all: nri=-1 here
 +         * There are i-particles, but no j-particles; nri=0 here
 +         */
 +        nlist->nri = 0;
 +    }
 +    else
 +    {
 +        /* Close list number nri by incrementing the count */
 +        nlist->nri++;
 +    }
 +}
 +/* Calls close_nblist on every short-range and long-range list of every
 + * fr->nblists entry, and on the QMMM list when bMakeQMMMnblist is set.
 + */
 +static inline void close_neighbor_lists(t_forcerec *fr,gmx_bool bMakeQMMMnblist)
 +{
 +    int n,i;
 +    
 +    if (bMakeQMMMnblist)
 +    {
 +            close_nblist(&(fr->QMMMlist));
 +    }
 +
 +    for(n=0; n<fr->nnblists; n++)
 +    {
 +        for(i=0; (i<eNL_NR); i++)
 +        {
 +            close_nblist(&(fr->nblists[n].nlist_sr[i]));
 +            close_nblist(&(fr->nblists[n].nlist_lr[i]));
 +        }
 +    }
 +}
 +
 +/* Appends j_atom to the j-list of nlist.
 + * When the list is full, jjnr is grown to an over-allocated size rounded
 + * up to a multiple of simd_padding_width.
 + * bLR is only used to label the debug output as "LR" or "SR".
 + */
 +static inline void add_j_to_nblist(t_nblist *nlist,atom_id j_atom,gmx_bool bLR)
 +{
 +    int nrj=nlist->nrj;
 +    
 +    if (nlist->nrj >= nlist->maxnrj)
 +    {
 +        nlist->maxnrj = round_up_to_simd_width(over_alloc_small(nlist->nrj + 1),nlist->simd_padding_width);
 +        
 +        if (gmx_debug_at)
-             fprintf(debug,"Increasing %s nblist (ielec=%d,ivdw=%d,free=%d,igeometry=%d) j size to %d\n",
-                     bLR ? "LR" : "SR",nlist->ielec,nlist->ivdw,nlist->free_energy,nlist->igeometry,nlist->maxnrj);
++            fprintf(debug,"Increasing %s nblist (ielec=%d,ivdw=%d,type=%d,igeometry=%d) j size to %d\n",
++                    bLR ? "LR" : "SR",nlist->ielec,nlist->ivdw,nlist->type,nlist->igeometry,nlist->maxnrj);
 +        
 +        srenew(nlist->jjnr,nlist->maxnrj);
 +    }
 +
 +    nlist->jjnr[nrj] = j_atom;
 +    nlist->nrj ++;
 +}
 +/* Appends the j charge group with atoms j_start up to, but not including,
 + * j_end to a charge-group neighbor list.
 + * The exclusion mask bexcl[j] of every atom in the group is copied into
 + * nlist->excl, which reserves MAX_CGCGSIZE masks per j-entry.
 + * When i_is_j is set, the bits below j are additionally set in the mask of
 + * the j-th atom of the group, to avoid double counting within the group.
 + * A group with more than MAX_CGCGSIZE atoms is a fatal error.
 + * bLR is only used to label the debug output as "LR" or "SR".
 + */
 +static inline void add_j_to_nblist_cg(t_nblist *nlist,
 +                                      atom_id j_start,int j_end,
 +                                      t_excl *bexcl,gmx_bool i_is_j,
 +                                      gmx_bool bLR)
 +{
 +    int nrj=nlist->nrj;
 +    int j;
 +
 +    if (nlist->nrj >= nlist->maxnrj)
 +    {
 +        nlist->maxnrj = over_alloc_small(nlist->nrj + 1);
 +        if (gmx_debug_at)
-                 
++            fprintf(debug,"Increasing %s nblist (ielec=%d,ivdw=%d,type=%d,igeometry=%d) j size to %d\n",
++                    bLR ? "LR" : "SR",nlist->ielec,nlist->ivdw,nlist->type,nlist->igeometry,nlist->maxnrj);
 +        
 +        srenew(nlist->jjnr    ,nlist->maxnrj);
 +        srenew(nlist->jjnr_end,nlist->maxnrj);
 +        srenew(nlist->excl    ,nlist->maxnrj*MAX_CGCGSIZE);
 +    }
 +
 +    nlist->jjnr[nrj]     = j_start;
 +    nlist->jjnr_end[nrj] = j_end;
 +
 +    if (j_end - j_start > MAX_CGCGSIZE)
 +    {
 +        gmx_fatal(FARGS,"The charge-group - charge-group neighborlist do not support charge groups larger than %d, found a charge group of size %d",MAX_CGCGSIZE,j_end-j_start);
 +    }
 +
 +    /* Set the exclusions */
 +    for(j=j_start; j<j_end; j++)
 +    {
 +        nlist->excl[nrj*MAX_CGCGSIZE + j - j_start] = bexcl[j];
 +    }
 +    if (i_is_j)
 +    {
 +        /* Avoid double counting of intra-cg interactions */
 +        for(j=1; j<j_end-j_start; j++)
 +        {
 +            nlist->excl[nrj*MAX_CGCGSIZE + j] |= (1<<j) - 1;
 +        }
 +    }
 +
 +    nlist->nrj ++;
 +}
 +/* Function type shared by the put_in_list_* routines (put_in_list_at below
 + * has this signature). Such a routine receives an i charge group icg and
 + * nj j charge groups in jjcg[], together with the settings that decide
 + * which neighbor lists the pairs go into.
 + */
 +typedef void
 +put_in_list_t(gmx_bool              bHaveVdW[],
 +              int               ngid,
 +              t_mdatoms *       md,
 +              int               icg,
 +              int               jgid,
 +              int               nj,
 +              atom_id           jjcg[],
 +              atom_id           index[],
 +              t_excl            bExcl[],
 +              int               shift,
 +              t_forcerec *      fr,
 +              gmx_bool          bLR,
 +              gmx_bool          bDoVdW,
 +              gmx_bool          bDoCoul,
 +              int               solvent_opt);
 +
 +static void 
 +put_in_list_at(gmx_bool              bHaveVdW[],
 +               int               ngid,
 +               t_mdatoms *       md,
 +               int               icg,
 +               int               jgid,
 +               int               nj,
 +               atom_id           jjcg[],
 +               atom_id           index[],
 +               t_excl            bExcl[],
 +               int               shift,
 +               t_forcerec *      fr,
 +               gmx_bool          bLR,
 +               gmx_bool          bDoVdW,
 +               gmx_bool          bDoCoul,
 +               int               solvent_opt)
 +{
 +    /* The a[] index has been removed,
 +     * to put it back in i_atom should be a[i0] and jj should be a[jj].
 +     */
 +    t_nblist *   vdwc;
 +    t_nblist *   vdw;
 +    t_nblist *   coul;
 +    t_nblist *   vdwc_free  = NULL;
 +    t_nblist *   vdw_free   = NULL;
 +    t_nblist *   coul_free  = NULL;
 +    t_nblist *   vdwc_ww    = NULL;
 +    t_nblist *   coul_ww    = NULL;
 +    
 +    int           i,j,jcg,igid,gid,nbl_ind,ind_ij;
 +    atom_id   jj,jj0,jj1,i_atom;
 +    int       i0,nicg,len;
 +    
 +    int       *cginfo;
 +    int       *type,*typeB;
 +    real      *charge,*chargeB;
 +    real      qi,qiB,qq,rlj;
 +    gmx_bool      bFreeEnergy,bFree,bFreeJ,bNotEx,*bPert;
 +    gmx_bool      bDoVdW_i,bDoCoul_i,bDoCoul_i_sol;
 +    int       iwater,jwater;
 +    t_nblist  *nlist;
 +    
 +    /* Copy some pointers */
 +    cginfo  = fr->cginfo;
 +    charge  = md->chargeA;
 +    chargeB = md->chargeB;
 +    type    = md->typeA;
 +    typeB   = md->typeB;
 +    bPert   = md->bPerturbed;
 +    
 +    /* Get atom range */
 +    i0     = index[icg];
 +    nicg   = index[icg+1]-i0;
 +    
 +    /* Get the i charge group info */
 +    igid   = GET_CGINFO_GID(cginfo[icg]);
 +
 +    iwater = (solvent_opt!=esolNO) ? GET_CGINFO_SOLOPT(cginfo[icg]) : esolNO;
 +    
 +    bFreeEnergy = FALSE;
 +    if (md->nPerturbed) 
 +    {
 +        /* Check if any of the particles involved are perturbed. 
 +         * If not we can do the cheaper normal put_in_list
 +         * and use more solvent optimization.
 +         */
 +        for(i=0; i<nicg; i++)
 +        {
 +            bFreeEnergy |= bPert[i0+i];
 +        }
 +        /* Loop over the j charge groups */
 +        for(j=0; (j<nj && !bFreeEnergy); j++) 
 +        {
 +            jcg = jjcg[j];
 +            jj0 = index[jcg];
 +            jj1 = index[jcg+1];
 +            /* Finally loop over the atoms in the j-charge group */   
 +            for(jj=jj0; jj<jj1; jj++)
 +            {
 +                bFreeEnergy |= bPert[jj];
 +            }
 +        }
 +    }
 +    
 +    /* Unpack pointers to neighbourlist structs */
 +    if (fr->nnblists == 1)
 +    {
 +        nbl_ind = 0;
 +    }
 +    else
 +    {
 +        nbl_ind = fr->gid2nblists[GID(igid,jgid,ngid)];
 +    }
 +    if (bLR)
 +    {
 +        nlist = fr->nblists[nbl_ind].nlist_lr;
 +    }
 +    else
 +    {
 +        nlist = fr->nblists[nbl_ind].nlist_sr;
 +    }
 +    
 +    if (iwater != esolNO)
 +    {
 +        vdwc = &nlist[eNL_VDWQQ_WATER];
 +        vdw  = &nlist[eNL_VDW];
 +        coul = &nlist[eNL_QQ_WATER];
 +#ifndef DISABLE_WATERWATER_NLIST
 +        vdwc_ww = &nlist[eNL_VDWQQ_WATERWATER];
 +        coul_ww = &nlist[eNL_QQ_WATERWATER];
 +#endif
 +    } 
 +    else 
 +    {
 +        vdwc = &nlist[eNL_VDWQQ];
 +        vdw  = &nlist[eNL_VDW];
 +        coul = &nlist[eNL_QQ];
 +    }
 +    
 +    if (!bFreeEnergy) 
 +    {
 +        if (iwater != esolNO) 
 +        {
 +            /* Loop over the atoms in the i charge group */    
 +            i_atom  = i0;
 +            gid     = GID(igid,jgid,ngid);
 +            /* Create new i_atom for each energy group */
 +            if (bDoCoul && bDoVdW)
 +            {
 +                new_i_nblist(vdwc,bLR,i_atom,shift,gid);
 +#ifndef DISABLE_WATERWATER_NLIST
 +                new_i_nblist(vdwc_ww,bLR,i_atom,shift,gid);
 +#endif
 +            }
 +            if (bDoVdW)
 +            {
 +                new_i_nblist(vdw,bLR,i_atom,shift,gid);
 +            }
 +            if (bDoCoul) 
 +            {
 +                new_i_nblist(coul,bLR,i_atom,shift,gid);
 +#ifndef DISABLE_WATERWATER_NLIST
 +                new_i_nblist(coul_ww,bLR,i_atom,shift,gid);
 +#endif
 +            }      
 +        /* Loop over the j charge groups */
 +            for(j=0; (j<nj); j++) 
 +            {
 +                jcg=jjcg[j];
 +                
 +                if (jcg == icg)
 +                {
 +                    continue;
 +                }
-                 else if (iwater == esolTIP4P && jwater == esolTIP4P) 
++
 +                jj0 = index[jcg];
 +                jwater = GET_CGINFO_SOLOPT(cginfo[jcg]);
 +                
 +                if (iwater == esolSPC && jwater == esolSPC)
 +                {
 +                    /* Interaction between two SPC molecules */
 +                    if (!bDoCoul)
 +                    {
 +                        /* VdW only - only first atoms in each water interact */
 +                        add_j_to_nblist(vdw,jj0,bLR);
 +                    }
 +                    else 
 +                    {
 +#ifdef DISABLE_WATERWATER_NLIST       
 +                        /* Add entries for the three atoms - only do VdW if we need to */
 +                        if (!bDoVdW)
 +                        {
 +                            add_j_to_nblist(coul,jj0,bLR);
 +                        }
 +                        else
 +                        {
 +                            add_j_to_nblist(vdwc,jj0,bLR);
 +                        }
 +                        add_j_to_nblist(coul,jj0+1,bLR);
 +                        add_j_to_nblist(coul,jj0+2,bLR);          
 +#else
 +                        /* One entry for the entire water-water interaction */
 +                        if (!bDoVdW)
 +                        {
 +                            add_j_to_nblist(coul_ww,jj0,bLR);
 +                        }
 +                        else
 +                        {
 +                            add_j_to_nblist(vdwc_ww,jj0,bLR);
 +                        }
 +#endif
 +                    }  
 +                } 
-     
-     if (!fr->ns.bCGlist)
-     {
-         put_in_list = put_in_list_at;
-     }
-     else
-     {
-         put_in_list = put_in_list_cg;
++                else if (iwater == esolTIP4P && jwater == esolTIP4P)
 +                {
 +                    /* Interaction between two TIP4p molecules */
 +                    if (!bDoCoul)
 +                    {
 +                        /* VdW only - only first atoms in each water interact */
 +                        add_j_to_nblist(vdw,jj0,bLR);
 +                    }
 +                    else 
 +                    {
 +#ifdef DISABLE_WATERWATER_NLIST       
 +                        /* Add entries for the four atoms - only do VdW if we need to */
 +                        if (bDoVdW)
 +                        {
 +                            add_j_to_nblist(vdw,jj0,bLR);
 +                        }
 +                        add_j_to_nblist(coul,jj0+1,bLR);
 +                        add_j_to_nblist(coul,jj0+2,bLR);          
 +                        add_j_to_nblist(coul,jj0+3,bLR);          
 +#else
 +                        /* One entry for the entire water-water interaction */
 +                        if (!bDoVdW)
 +                        {
 +                            add_j_to_nblist(coul_ww,jj0,bLR);
 +                        }
 +                        else
 +                        {
 +                            add_j_to_nblist(vdwc_ww,jj0,bLR);
 +                        }
 +#endif
 +                    }                                         
 +                }
 +                else 
 +                {
 +                    /* j charge group is not water, but i is.
 +                     * Add entries to the water-other_atom lists; the geometry of the water
 +                     * molecule doesn't matter - that is taken care of in the nonbonded kernel,
 +                     * so we don't care if it is SPC or TIP4P...
 +                     */
 +                    
 +                    jj1 = index[jcg+1];
 +                    
 +                    if (!bDoVdW) 
 +                    {
 +                        for(jj=jj0; (jj<jj1); jj++) 
 +                        {
 +                            if (charge[jj] != 0)
 +                            {
 +                                add_j_to_nblist(coul,jj,bLR);
 +                            }
 +                        }
 +                    }
 +                    else if (!bDoCoul)
 +                    {
 +                        for(jj=jj0; (jj<jj1); jj++)
 +                        {
 +                            if (bHaveVdW[type[jj]])
 +                            {
 +                                add_j_to_nblist(vdw,jj,bLR);
 +                            }
 +                        }
 +                    }
 +                    else 
 +                    {
 +                        /* _charge_ _groups_ interact with both coulomb and LJ */
 +                        /* Check which atoms we should add to the lists!       */
 +                        for(jj=jj0; (jj<jj1); jj++) 
 +                        {
 +                            if (bHaveVdW[type[jj]]) 
 +                            {
 +                                if (charge[jj] != 0)
 +                                {
 +                                    add_j_to_nblist(vdwc,jj,bLR);
 +                                }
 +                                else
 +                                {
 +                                    add_j_to_nblist(vdw,jj,bLR);
 +                                }
 +                            }
 +                            else if (charge[jj] != 0)
 +                            {
 +                                add_j_to_nblist(coul,jj,bLR);
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +            close_i_nblist(vdw); 
 +            close_i_nblist(coul); 
 +            close_i_nblist(vdwc);  
 +#ifndef DISABLE_WATERWATER_NLIST
 +            close_i_nblist(coul_ww);
 +            close_i_nblist(vdwc_ww); 
 +#endif
 +        } 
 +        else
 +        { 
 +            /* no solvent as i charge group */
 +            /* Loop over the atoms in the i charge group */    
 +            for(i=0; i<nicg; i++) 
 +            {
 +                i_atom  = i0+i;
 +                gid     = GID(igid,jgid,ngid);
 +                qi      = charge[i_atom];
 +                
 +                /* Create new i_atom for each energy group */
 +                if (bDoVdW && bDoCoul)
 +                {
 +                    new_i_nblist(vdwc,bLR,i_atom,shift,gid);
 +                }
 +                if (bDoVdW)
 +                {
 +                    new_i_nblist(vdw,bLR,i_atom,shift,gid);
 +                }
 +                if (bDoCoul)
 +                {
 +                    new_i_nblist(coul,bLR,i_atom,shift,gid);
 +                }
 +                bDoVdW_i  = (bDoVdW  && bHaveVdW[type[i_atom]]);
 +                bDoCoul_i = (bDoCoul && qi!=0);
 +                
 +                if (bDoVdW_i || bDoCoul_i) 
 +                {
 +                    /* Loop over the j charge groups */
 +                    for(j=0; (j<nj); j++) 
 +                    {
 +                        jcg=jjcg[j];
 +                        
 +                        /* Check for large charge groups */
 +                        if (jcg == icg)
 +                        {
 +                            jj0 = i0 + i + 1;
 +                        }
 +                        else
 +                        {
 +                            jj0 = index[jcg];
 +                        }
 +                        
 +                        jj1=index[jcg+1];
 +                        /* Finally loop over the atoms in the j-charge group */       
 +                        for(jj=jj0; jj<jj1; jj++) 
 +                        {
 +                            bNotEx = NOTEXCL(bExcl,i,jj);
 +                            
 +                            if (bNotEx) 
 +                            {
 +                                if (!bDoVdW_i) 
 +                                { 
 +                                    if (charge[jj] != 0)
 +                                    {
 +                                        add_j_to_nblist(coul,jj,bLR);
 +                                    }
 +                                }
 +                                else if (!bDoCoul_i) 
 +                                {
 +                                    if (bHaveVdW[type[jj]])
 +                                    {
 +                                        add_j_to_nblist(vdw,jj,bLR);
 +                                    }
 +                                }
 +                                else 
 +                                {
 +                                    if (bHaveVdW[type[jj]]) 
 +                                    {
 +                                        if (charge[jj] != 0)
 +                                        {
 +                                            add_j_to_nblist(vdwc,jj,bLR);
 +                                        }
 +                                        else
 +                                        {
 +                                            add_j_to_nblist(vdw,jj,bLR);
 +                                        }
 +                                    } 
 +                                    else if (charge[jj] != 0)
 +                                    {
 +                                        add_j_to_nblist(coul,jj,bLR);
 +                                    }
 +                                }
 +                            }
 +                        }
 +                    }
 +                }
 +                close_i_nblist(vdw);
 +                close_i_nblist(coul);
 +                close_i_nblist(vdwc);
 +            }
 +        }
 +    }
 +    else
 +    {
 +        /* we are doing free energy */
 +        vdwc_free = &nlist[eNL_VDWQQ_FREE];
 +        vdw_free  = &nlist[eNL_VDW_FREE];
 +        coul_free = &nlist[eNL_QQ_FREE];
 +        /* Loop over the atoms in the i charge group */    
 +        for(i=0; i<nicg; i++) 
 +        {
 +            i_atom  = i0+i;
 +            gid     = GID(igid,jgid,ngid);
 +            qi      = charge[i_atom];
 +            qiB     = chargeB[i_atom];
 +            
 +            /* Create new i_atom for each energy group */
 +            if (bDoVdW && bDoCoul) 
 +                new_i_nblist(vdwc,bLR,i_atom,shift,gid);
 +            if (bDoVdW)   
 +                new_i_nblist(vdw,bLR,i_atom,shift,gid);
 +            if (bDoCoul) 
 +                new_i_nblist(coul,bLR,i_atom,shift,gid);
 +            
 +            new_i_nblist(vdw_free,bLR,i_atom,shift,gid);
 +            new_i_nblist(coul_free,bLR,i_atom,shift,gid);
 +            new_i_nblist(vdwc_free,bLR,i_atom,shift,gid);
 +            
 +            bDoVdW_i  = (bDoVdW  &&
 +                         (bHaveVdW[type[i_atom]] || bHaveVdW[typeB[i_atom]]));
 +            bDoCoul_i = (bDoCoul && (qi!=0 || qiB!=0));
 +            /* For TIP4P the first atom does not have a charge,
 +             * but the last three do. So we should still put an atom
 +             * without LJ but with charge in the water-atom neighborlist
 +             * for a TIP4p i charge group.
 +             * For SPC type water the first atom has LJ and charge,
 +             * so there is no such problem.
 +             */
 +            if (iwater == esolNO)
 +            {
 +                bDoCoul_i_sol = bDoCoul_i;
 +            }
 +            else
 +            {
 +                bDoCoul_i_sol = bDoCoul;
 +            }
 +            
 +            if (bDoVdW_i || bDoCoul_i_sol) 
 +            {
 +                /* Loop over the j charge groups */
 +                for(j=0; (j<nj); j++)
 +                {
 +                    jcg=jjcg[j];
 +                    
 +                    /* Check for large charge groups */
 +                    if (jcg == icg)
 +                    {
 +                        jj0 = i0 + i + 1;
 +                    }
 +                    else
 +                    {
 +                        jj0 = index[jcg];
 +                    }
 +                    
 +                    jj1=index[jcg+1];
 +                    /* Finally loop over the atoms in the j-charge group */   
 +                    bFree = bPert[i_atom];
 +                    for(jj=jj0; (jj<jj1); jj++) 
 +                    {
 +                        bFreeJ = bFree || bPert[jj];
 +                        /* Complicated if, because the water H's should also
 +                         * see perturbed j-particles
 +                         */
 +                        if (iwater==esolNO || i==0 || bFreeJ) 
 +                        {
 +                            bNotEx = NOTEXCL(bExcl,i,jj);
 +                            
 +                            if (bNotEx) 
 +                            {
 +                                if (bFreeJ)
 +                                {
 +                                    if (!bDoVdW_i) 
 +                                    {
 +                                        if (charge[jj]!=0 || chargeB[jj]!=0)
 +                                        {
 +                                            add_j_to_nblist(coul_free,jj,bLR);
 +                                        }
 +                                    }
 +                                    else if (!bDoCoul_i) 
 +                                    {
 +                                        if (bHaveVdW[type[jj]] || bHaveVdW[typeB[jj]])
 +                                        {
 +                                            add_j_to_nblist(vdw_free,jj,bLR);
 +                                        }
 +                                    }
 +                                    else 
 +                                    {
 +                                        if (bHaveVdW[type[jj]] || bHaveVdW[typeB[jj]]) 
 +                                        {
 +                                            if (charge[jj]!=0 || chargeB[jj]!=0)
 +                                            {
 +                                                add_j_to_nblist(vdwc_free,jj,bLR);
 +                                            }
 +                                            else
 +                                            {
 +                                                add_j_to_nblist(vdw_free,jj,bLR);
 +                                            }
 +                                        }
 +                                        else if (charge[jj]!=0 || chargeB[jj]!=0)
 +                                            add_j_to_nblist(coul_free,jj,bLR);
 +                                    }
 +                                }
 +                                else if (!bDoVdW_i) 
 +                                { 
 +                                    /* This is done whether or not bWater is set */
 +                                    if (charge[jj] != 0)
 +                                    {
 +                                        add_j_to_nblist(coul,jj,bLR);
 +                                    }
 +                                }
 +                                else if (!bDoCoul_i_sol) 
 +                                { 
 +                                    if (bHaveVdW[type[jj]])
 +                                    {
 +                                        add_j_to_nblist(vdw,jj,bLR);
 +                                    }
 +                                }
 +                                else 
 +                                {
 +                                    if (bHaveVdW[type[jj]]) 
 +                                    {
 +                                        if (charge[jj] != 0)
 +                                        {
 +                                            add_j_to_nblist(vdwc,jj,bLR);
 +                                        }
 +                                        else
 +                                        {
 +                                            add_j_to_nblist(vdw,jj,bLR);
 +                                        }
 +                                    } 
 +                                    else if (charge[jj] != 0)
 +                                    {
 +                                        add_j_to_nblist(coul,jj,bLR);
 +                                    }
 +                                }
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +            close_i_nblist(vdw);
 +            close_i_nblist(coul);
 +            close_i_nblist(vdwc);
 +            close_i_nblist(vdw_free);
 +            close_i_nblist(coul_free);
 +            close_i_nblist(vdwc_free);
 +        }
 +    }
 +}
 +/* put_in_list_adress: atom-based neighbour list filling for AdResS runs.
++ *
++ * For every atom of i charge group icg, the atoms of the nj j charge groups
++ * listed in jjcg[] are appended to one of six lists: the regular
++ * VdW+Coulomb / VdW / Coulomb lists, or the matching *_adress lists that are
++ * taken from a second nblists entry (nbl_ind_adress). A pair goes to the
++ * *_adress lists when b_hybrid is set, i.e. unless the two weights in
++ * md->wf[] are both exactly 1 or both exactly 0.
++ *
++ * bHaveVdW[]     - looked up by atom type (bHaveVdW[type[atom]]) to decide
++ *                  whether an atom is put in a VdW list
++ * ngid, jgid     - combined with the i group id through GID()
++ * md             - supplies chargeA, typeA, wf and nPerturbed
++ * icg            - the i charge group
++ * nj, jjcg[]     - number of j charge groups and their indices
++ * index[]        - index[cg] is the first atom of a charge group and
++ *                  index[cg+1] is one past its last atom
++ * bExcl[]        - exclusion data, queried with NOTEXCL(bExcl,i,jj)
++ * shift          - handed unchanged to new_i_nblist()
++ * fr             - supplies cginfo, nnblists, gid2nblists and nblists
++ * bLR            - picks nlist_lr instead of nlist_sr; also handed to the
++ *                  list helpers
++ * bDoVdW/bDoCoul - which interaction kinds get i entries and j atoms
++ * solvent_opt    - only used to set iwater, which is not read afterwards
++ *
++ * gmx_fatal() is called when md->nPerturbed is non-zero.
++ */
++static void
++put_in_list_adress(gmx_bool              bHaveVdW[],
++               int               ngid,
++               t_mdatoms *       md,
++               int               icg,
++               int               jgid,
++               int               nj,
++               atom_id           jjcg[],
++               atom_id           index[],
++               t_excl            bExcl[],
++               int               shift,
++               t_forcerec *      fr,
++               gmx_bool          bLR,
++               gmx_bool          bDoVdW,
++               gmx_bool          bDoCoul,
++               int               solvent_opt)
++{
++    /* The a[] index has been removed,
++     * to put it back in i_atom should be a[i0] and jj should be a[jj].
++     */
++    t_nblist *   vdwc;
++    t_nblist *   vdw;
++    t_nblist *   coul;
++    t_nblist *   vdwc_adress  = NULL;
++    t_nblist *   vdw_adress   = NULL;
++    t_nblist *   coul_adress  = NULL;
++    t_nblist *   vdwc_ww    = NULL; /* not used in this function */
++    t_nblist *   coul_ww    = NULL; /* not used in this function */
++
++    int           i,j,jcg,igid,gid,nbl_ind,nbl_ind_adress;
++    atom_id   jj,jj0,jj1,i_atom;
++    int       i0,nicg,len; /* len is not used in this function */
++
++    int       *cginfo;
++    int       *type,*typeB;
++    real      *charge,*chargeB;
++    real      *wf;
++    real      qi,qiB,qq,rlj; /* only qi is used below */
++    gmx_bool      bFreeEnergy,bFree,bFreeJ,bNotEx,*bPert; /* only bNotEx is read; bPert is set but never read */
++    gmx_bool      bDoVdW_i,bDoCoul_i,bDoCoul_i_sol; /* bDoCoul_i_sol is not used */
++    gmx_bool      b_hybrid;
++    gmx_bool      j_all_atom; /* not used in this function */
++    int       iwater,jwater; /* jwater is not used; iwater is set but never read */
++    t_nblist  *nlist, *nlist_adress;
++
++    /* Copy some pointers */
++    cginfo  = fr->cginfo;
++    charge  = md->chargeA;
++    chargeB = md->chargeB;
++    type    = md->typeA;
++    typeB   = md->typeB;
++    bPert   = md->bPerturbed;
++    wf      = md->wf; /* per-atom weights compared against 0 and 1 below */
++
++    /* Get atom range */
++    i0     = index[icg];
++    nicg   = index[icg+1]-i0;
++
++    /* Get the i charge group info */
++    igid   = GET_CGINFO_GID(cginfo[icg]);
++
++    iwater = (solvent_opt!=esolNO) ? GET_CGINFO_SOLOPT(cginfo[icg]) : esolNO;
++
++    if (md->nPerturbed)
++    {
++        gmx_fatal(FARGS,"AdResS does not support free energy pertubation\n"); /* NOTE(review): the message misspells "perturbation" */
++    }
++
++    /* Unpack pointers to neighbourlist structs */
++    if (fr->nnblists == 2)
++    {
++        nbl_ind = 0;
++        nbl_ind_adress = 1;
++    }
++    else
++    {
++        nbl_ind = fr->gid2nblists[GID(igid,jgid,ngid)];
++        nbl_ind_adress = nbl_ind+fr->nnblists/2; /* the *_adress entry sits nnblists/2 places after the regular one */
++    }
++    if (bLR)
++    {
++        nlist = fr->nblists[nbl_ind].nlist_lr;
++        nlist_adress= fr->nblists[nbl_ind_adress].nlist_lr;
++    }
++    else
++    {
++        nlist = fr->nblists[nbl_ind].nlist_sr;
++        nlist_adress = fr->nblists[nbl_ind_adress].nlist_sr;
++    }
++
++    /* Only the plain (non-water) list kinds are used, for both sets */
++        vdwc = &nlist[eNL_VDWQQ];
++        vdw  = &nlist[eNL_VDW];
++        coul = &nlist[eNL_QQ];
++    
++        vdwc_adress = &nlist_adress[eNL_VDWQQ];
++        vdw_adress  = &nlist_adress[eNL_VDW];
++        coul_adress = &nlist_adress[eNL_QQ];
++
++    /* We do not support solvent optimization with AdResS for now.
++      For this we would need hybrid solvent-other kernels */
++    
++            /* no solvent as i charge group */
++            /* Loop over the atoms in the i charge group */
++            for(i=0; i<nicg; i++)
++            {
++                i_atom  = i0+i;
++                gid     = GID(igid,jgid,ngid);
++                qi      = charge[i_atom];
++
++                /* Create new i_atom for each energy group */
++                if (bDoVdW && bDoCoul)
++                {
++                    new_i_nblist(vdwc,bLR,i_atom,shift,gid);
++                    new_i_nblist(vdwc_adress,bLR,i_atom,shift,gid);
++
++                }
++                if (bDoVdW)
++                {
++                    new_i_nblist(vdw,bLR,i_atom,shift,gid);
++                    new_i_nblist(vdw_adress,bLR,i_atom,shift,gid);
++
++                }
++                if (bDoCoul)
++                {
++                    new_i_nblist(coul,bLR,i_atom,shift,gid);
++                    new_i_nblist(coul_adress,bLR,i_atom,shift,gid);
++                }
++                bDoVdW_i  = (bDoVdW  && bHaveVdW[type[i_atom]]);
++                bDoCoul_i = (bDoCoul && qi!=0);
++
++                if (bDoVdW_i || bDoCoul_i)
++                {
++                    /* Loop over the j charge groups */
++                    for(j=0; (j<nj); j++)
++                    {
++                        jcg=jjcg[j];
++
++                        /* Check for large charge groups */
++                        if (jcg == icg)
++                        {
++                            jj0 = i0 + i + 1; /* within the own charge group only atoms after i_atom are visited */
++                        }
++                        else
++                        {
++                            jj0 = index[jcg];
++                        }
++
++                        jj1=index[jcg+1];
++                        /* Finally loop over the atoms in the j-charge group */
++                        for(jj=jj0; jj<jj1; jj++)
++                        {
++                            bNotEx = NOTEXCL(bExcl,i,jj);
++                            /* hybrid unless both weights are exactly 1 or both are exactly 0 */
++                            b_hybrid=!((wf[i_atom]==1&&wf[jj]==1)||(wf[i_atom] ==0 && wf[jj]==0));
++
++                            if (bNotEx)
++                            {
++                                if (!bDoVdW_i) /* i atom takes part in Coulomb only */
++                                {
++                                    if (charge[jj] != 0)
++                                    {
++                                        if(!b_hybrid){
++                                            add_j_to_nblist(coul,jj,bLR);
++                                        }else{
++                                            add_j_to_nblist(coul_adress,jj,bLR);
++                                        }
++                                    }
++                                }
++                                else if (!bDoCoul_i) /* i atom takes part in VdW only */
++                                {
++                                    if (bHaveVdW[type[jj]])
++                                    {
++                                        if(!b_hybrid){
++                                            add_j_to_nblist(vdw,jj,bLR);
++                                        }else{
++                                            add_j_to_nblist(vdw_adress,jj,bLR);
++                                        }
++                                    }
++                                }
++                                else /* i atom takes part in both; the list depends on the j atom */
++                                {
++                                    if (bHaveVdW[type[jj]])
++                                    {
++                                        if (charge[jj] != 0)
++                                        {
++                                            if(!b_hybrid){
++                                                add_j_to_nblist(vdwc,jj,bLR);
++                                            }else{
++                                                add_j_to_nblist(vdwc_adress,jj,bLR);
++                                            }
++                                        }
++                                        else
++                                        {
++                                            if(!b_hybrid){
++                                                add_j_to_nblist(vdw,jj,bLR);
++                                            }else{
++                                                add_j_to_nblist(vdw_adress,jj,bLR);
++                                            }
++
++                                        }
++                                    }
++                                    else if (charge[jj] != 0)
++                                    {
++                                        if(!b_hybrid){
++                                            add_j_to_nblist(coul,jj,bLR);
++                                        }else{
++                                            add_j_to_nblist(coul_adress,jj,bLR);
++                                        }
++
++                                    }
++                                }
++                            }
++                        }
++                    }
++                /* NOTE(review): by brace matching the six calls below are still inside the "if (bDoVdW_i || bDoCoul_i)" block, despite their indentation - confirm that is intended */
++                close_i_nblist(vdw);
++                close_i_nblist(coul);
++                close_i_nblist(vdwc);
++                close_i_nblist(vdw_adress);
++                close_i_nblist(coul_adress);
++                close_i_nblist(vdwc_adress);
++            } /* closes the if (bDoVdW_i || bDoCoul_i) block */
++    } /* closes the loop over i */
++}
++/* put_in_list_qmmm: fill the single list fr->QMMMlist.
 + *
 + * For each atom of i charge group icg a new i entry is opened, and every
 + * atom of every j charge group in jjcg[] other than icg itself is added,
 + * unless NOTEXCL(bExcl,i,jj) reports the pair as excluded.
 + *
 + * The parameter list matches the other put_in_list_* functions; bHaveVdW,
 + * md, bDoVdW, bDoCoul and solvent_opt are not used in this body.
 + */
 +static void 
 +put_in_list_qmmm(gmx_bool              bHaveVdW[],
 +                 int               ngid,
 +                 t_mdatoms *       md,
 +                 int               icg,
 +                 int               jgid,
 +                 int               nj,
 +                 atom_id           jjcg[],
 +                 atom_id           index[],
 +                 t_excl            bExcl[],
 +                 int               shift,
 +                 t_forcerec *      fr,
 +                 gmx_bool          bLR,
 +                 gmx_bool          bDoVdW,
 +                 gmx_bool          bDoCoul,
 +                 int               solvent_opt)
 +{
 +    t_nblist *   coul;
 +    int         i,j,jcg,igid,gid;
 +    atom_id   jj,jj0,jj1,i_atom;
 +    int       i0,nicg;
 +    gmx_bool      bNotEx;
 +    
 +    /* Get atom range */
 +    i0     = index[icg];
 +    nicg   = index[icg+1]-i0;
 +    
 +    /* Get the i charge group info */
 +    igid   = GET_CGINFO_GID(fr->cginfo[icg]);
 +    
 +    coul = &fr->QMMMlist; /* the only list this function writes to */
 +    
 +    /* Loop over atoms in the ith charge group */
 +    for (i=0;i<nicg;i++)
 +    {
 +        i_atom = i0+i;
 +        gid    = GID(igid,jgid,ngid);
 +        /* Create new i_atom for each energy group */
 +        new_i_nblist(coul,bLR,i_atom,shift,gid);
 +        
 +        /* Loop over the j charge groups */
 +        for (j=0;j<nj;j++)
 +        {
 +            jcg=jjcg[j];
 +            
 +            /* Charge groups cannot have QM and MM atoms simultaneously */
 +            if (jcg!=icg)
 +            {
 +                jj0 = index[jcg];
 +                jj1 = index[jcg+1];
 +                /* Finally loop over the atoms in the j-charge group */
 +                for(jj=jj0; jj<jj1; jj++)
 +                {
 +                    bNotEx = NOTEXCL(bExcl,i,jj);
 +                    if(bNotEx)
 +                        add_j_to_nblist(coul,jj,bLR);
 +                }
 +            }
 +        }
 +        close_i_nblist(coul);
 +    }
 +}
 +/* put_in_list_cg: charge-group based neighbour list filling.
 + *
 + * Opens one i entry for the whole charge group icg in the VdW+Coulomb list
 + * (long-range list when bLR is set, short-range list otherwise) and adds
 + * each j charge group from jjcg[] through add_j_to_nblist_cg(). The
 + * icg-icg pair is skipped when GET_CGINFO_EXCL_INTRA(cginfo) is set.
 + *
 + * bHaveVdW, md, bDoVdW, bDoCoul and solvent_opt are not used in this body.
 + */
 +static void 
 +put_in_list_cg(gmx_bool              bHaveVdW[],
 +               int               ngid,
 +               t_mdatoms *       md,
 +               int               icg,
 +               int               jgid,
 +               int               nj,
 +               atom_id           jjcg[],
 +               atom_id           index[],
 +               t_excl            bExcl[],
 +               int               shift,
 +               t_forcerec *      fr,
 +               gmx_bool          bLR,
 +               gmx_bool          bDoVdW,
 +               gmx_bool          bDoCoul,
 +               int               solvent_opt)
 +{
 +    int          cginfo;
 +    int          igid,gid,nbl_ind;
 +    t_nblist *   vdwc;
 +    int          j,jcg;
 +
 +    cginfo = fr->cginfo[icg];
 +
 +    igid = GET_CGINFO_GID(cginfo);
 +    gid  = GID(igid,jgid,ngid);
 +
 +    /* Unpack pointers to neighbourlist structs */
 +    if (fr->nnblists == 1)
 +    {
 +        nbl_ind = 0;
 +    }
 +    else
 +    {
 +        nbl_ind = fr->gid2nblists[gid];
 +    }
 +    if (bLR)
 +    {
 +        vdwc = &fr->nblists[nbl_ind].nlist_lr[eNL_VDWQQ];
 +    }
 +    else
 +    {
 +        vdwc = &fr->nblists[nbl_ind].nlist_sr[eNL_VDWQQ];
 +    }
 +
 +    /* Make a new neighbor list for charge group icg.
 +     * Currently simply one neighbor list is made with LJ and Coulomb.
 +     * If required, zero interactions could be removed here
 +     * or in the force loop.
 +     */
 +    new_i_nblist(vdwc,bLR,index[icg],shift,gid); /* the i entry starts at the first atom of icg */
 +    vdwc->iinr_end[vdwc->nri] = index[icg+1]; /* and its end is recorded as one past the last atom of icg */
 +
 +    for(j=0; (j<nj); j++) 
 +    {
 +        jcg = jjcg[j];
 +        /* Skip the icg-icg pairs if all self interactions are excluded */
 +        if (!(jcg == icg && GET_CGINFO_EXCL_INTRA(cginfo)))
 +        {
 +            /* Here we add the j charge group jcg to the list,
 +             * exclusions are also added to the list.
 +             */
 +            add_j_to_nblist_cg(vdwc,index[jcg],index[jcg+1],bExcl,icg==jcg,bLR);
 +        }
 +    }
 +
 +    close_i_nblist(vdwc);  
 +}
 +/* setexcl: set or clear exclusion marks in bexcl[] for a range of atoms.
 + *
 + * For every atom i in [start,end) the entries excl->a[k], with k running
 + * from excl->index[i] up to but not including excl->index[i+1], are passed
 + * to SETEXCL (b true) or RMEXCL (b false) together with the row i-start.
 + */
 +static void setexcl(atom_id start,atom_id end,t_blocka *excl,gmx_bool b,
 +                    t_excl bexcl[])
 +{
 +    atom_id i,k;
 +    
 +    if (b)
 +    {
 +        for(i=start; i<end; i++)
 +        {
 +            for(k=excl->index[i]; k<excl->index[i+1]; k++)
 +            {
 +                SETEXCL(bexcl,i-start,excl->a[k]); /* row index is relative to start */
 +            }
 +        }
 +    }
 +    else
 +    {
 +        for(i=start; i<end; i++)
 +        {
 +            for(k=excl->index[i]; k<excl->index[i+1]; k++)
 +            {
 +                RMEXCL(bexcl,i-start,excl->a[k]); /* same walk as above, removing the mark */
 +            }
 +        }
 +    }
 +}
 +/* calc_naaj: return either cgtot/2 or 1+cgtot/2, chosen from the parity of
 + * cgtot and of icg.
 + *
 + *   cgtot odd            -> 1+cgtot/2 for every icg
 + *   cgtot multiple of 4  -> 1+cgtot/2 for even icg in the lower half
 + *                           (icg < cgtot/2) and for odd icg in the upper
 + *                           half; cgtot/2 otherwise
 + *   cgtot/2 odd          -> 1+cgtot/2 for even icg, cgtot/2 for odd icg
 + *
 + * All divisions are integer divisions.
 + */
 +int calc_naaj(int icg,int cgtot)
 +{
 +    int naaj;
 +    
 +    if ((cgtot % 2) == 1)
 +    {
 +        /* Odd number of charge groups, easy */
 +        naaj = 1 + (cgtot/2);
 +    }
 +    else if ((cgtot % 4) == 0)
 +    {
 +    /* Multiple of four is hard */
 +        if (icg < cgtot/2)
 +        {
 +            if ((icg % 2) == 0)
 +            {
 +                naaj=1+(cgtot/2);
 +            }
 +            else
 +            {
 +                naaj=cgtot/2;
 +            }
 +        }
 +        else
 +        {
 +            if ((icg % 2) == 1) /* upper half: the parity rule is reversed */
 +            {
 +                naaj=1+(cgtot/2);
 +            }
 +            else
 +            {
 +                naaj=cgtot/2;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        /* cgtot/2 = odd */
 +        if ((icg % 2) == 0)
 +        {
 +            naaj=1+(cgtot/2);
 +        }
 +        else
 +        {
 +            naaj=cgtot/2;
 +        }
 +    }
 +#ifdef DEBUG
 +    fprintf(log,"naaj=%d\n",naaj); /* NOTE(review): "log" is not declared in this function; confirm what it resolves to when DEBUG is defined */
 +#endif
 +
 +    return naaj;
 +}
 +
 +/************************************************
 + *
 + *  S I M P L E      C O R E     S T U F F
 + *
 + ************************************************/
 +/* calc_image_tric: squared distance between xi and the nearest image of xj
 + * for a box with off-diagonal elements, plus the shift index of that image.
 + *
 + * The difference vector xj-xi is reduced one dimension at a time, z first,
 + * then y, then x. Each step rounds the scaled component to an integer shift
 + * and subtracts that many box vectors, including their lower-dimension
 + * components.
 + *
 + * b_inv  - per-dimension scale factors; presumably the inverse box
 + *          diagonal - TODO confirm against the caller
 + * shift  - output: XYZ2IS(tx,ty,tz) of the integer shifts
 + * return - squared length of the reduced difference vector
 + */
 +static real calc_image_tric(rvec xi,rvec xj,matrix box,
 +                            rvec b_inv,int *shift)
 +{
 +    /* This code assumes that the cut-off is smaller than
 +     * a half times the smallest diagonal element of the box.
 +     */
 +    const real h25=2.5;
 +    real dx,dy,dz;
 +    real r2;
 +    int  tx,ty,tz;
 +    
 +    /* Compute diff vector */
 +    dz = xj[ZZ] - xi[ZZ];
 +    dy = xj[YY] - xi[YY];
 +    dx = xj[XX] - xi[XX];
 +    
 +  /* Perform NINT operation, using trunc operation, therefore
 +   * we first add 2.5 then subtract 2 again
 +   */
 +    tz = dz*b_inv[ZZ] + h25; /* the assignment to int truncates */
 +    tz -= 2;
 +    dz -= tz*box[ZZ][ZZ];
 +    dy -= tz*box[ZZ][YY]; /* the z shift also moves y and x */
 +    dx -= tz*box[ZZ][XX];
 +
 +    ty = dy*b_inv[YY] + h25; /* uses dy after the z correction */
 +    ty -= 2;
 +    dy -= ty*box[YY][YY];
 +    dx -= ty*box[YY][XX];
 +    
 +    tx = dx*b_inv[XX]+h25; /* uses dx after the z and y corrections */
 +    tx -= 2;
 +    dx -= tx*box[XX][XX];
 +  
 +    /* Distance squared */
 +    r2 = (dx*dx) + (dy*dy) + (dz*dz);
 +
 +    *shift = XYZ2IS(tx,ty,tz);
 +
 +    return r2;
 +}
 +/* calc_image_rect: squared distance between xi and the nearest image of xj
 + * for a rectangular box, plus the shift index of that image.
 + *
 + * box_size - box length per dimension
 + * b_inv    - per-dimension scale factors; presumably 1/box_size - TODO
 + *            confirm against the caller
 + * shift    - output: XYZ2IS(tx,ty,tz) of the integer shifts
 + * return   - squared length of the shifted difference vector
 + */
 +static real calc_image_rect(rvec xi,rvec xj,rvec box_size,
 +                            rvec b_inv,int *shift)
 +{
 +    const real h15=1.5;
 +    real ddx,ddy,ddz;
 +    real dx,dy,dz;
 +    real r2;
 +    int  tx,ty,tz;
 +    
 +    /* Compute diff vector */
 +    dx = xj[XX] - xi[XX];
 +    dy = xj[YY] - xi[YY];
 +    dz = xj[ZZ] - xi[ZZ];
 +  
 +    /* Perform NINT operation, using trunc operation, therefore
 +     * we first add 1.5 then subtract 1 again
 +     */
 +    tx = dx*b_inv[XX] + h15; /* the assignment to int truncates */
 +    ty = dy*b_inv[YY] + h15;
 +    tz = dz*b_inv[ZZ] + h15;
 +    tx--;
 +    ty--;
 +    tz--;
 +    
 +    /* Correct diff vector for translation */
 +    ddx = tx*box_size[XX] - dx; /* sign is opposite to dx - t*box; only the square is used */
 +    ddy = ty*box_size[YY] - dy;
 +    ddz = tz*box_size[ZZ] - dz;
 +    
 +    /* Distance squared */
 +    r2 = (ddx*ddx) + (ddy*ddy) + (ddz*ddz);
 +    
 +    *shift = XYZ2IS(tx,ty,tz);
 +    
 +    return r2;
 +}
 +
 +static void add_simple(t_ns_buf *nsbuf,int nrj,atom_id cg_j,
 +                       gmx_bool bHaveVdW[],int ngid,t_mdatoms *md,
 +                       int icg,int jgid,t_block *cgs,t_excl bexcl[],
 +                       int shift,t_forcerec *fr,put_in_list_t *put_in_list)
 +{
 +    if (nsbuf->nj + nrj > MAX_CG)
 +    {
 +        put_in_list(bHaveVdW,ngid,md,icg,jgid,nsbuf->ncg,nsbuf->jcg,
 +                    cgs->index,bexcl,shift,fr,FALSE,TRUE,TRUE,fr->solvent_opt);
 +        /* Reset buffer contents */
 +        nsbuf->ncg = nsbuf->nj = 0;
 +    }
 +    nsbuf->jcg[nsbuf->ncg++] = cg_j;
 +    nsbuf->nj += nrj;
 +}
 +
 +static void ns_inner_tric(rvec x[],int icg,int *i_egp_flags,
 +                          int njcg,atom_id jcg[],
 +                          matrix box,rvec b_inv,real rcut2,
 +                          t_block *cgs,t_ns_buf **ns_buf,
 +                          gmx_bool bHaveVdW[],int ngid,t_mdatoms *md,
 +                          t_excl bexcl[],t_forcerec *fr,
 +                          put_in_list_t *put_in_list)
 +{
 +    int      shift;
 +    int      j,nrj,jgid;
 +    int      *cginfo=fr->cginfo;
 +    atom_id  cg_j,*cgindex;
 +    t_ns_buf *nsbuf;
 +    
 +    cgindex = cgs->index;
 +    shift   = CENTRAL;
 +    for(j=0; (j<njcg); j++)
 +    {
 +        cg_j   = jcg[j];
 +        nrj    = cgindex[cg_j+1]-cgindex[cg_j];
 +        if (calc_image_tric(x[icg],x[cg_j],box,b_inv,&shift) < rcut2)
 +        {
 +            jgid  = GET_CGINFO_GID(cginfo[cg_j]);
 +            if (!(i_egp_flags[jgid] & EGP_EXCL))
 +            {
 +                add_simple(&ns_buf[jgid][shift],nrj,cg_j,
 +                           bHaveVdW,ngid,md,icg,jgid,cgs,bexcl,shift,fr,
 +                           put_in_list);
 +            }
 +        }
 +    }
 +}
 +
 +static void ns_inner_rect(rvec x[],int icg,int *i_egp_flags,
 +                          int njcg,atom_id jcg[],
 +                          gmx_bool bBox,rvec box_size,rvec b_inv,real rcut2,
 +                          t_block *cgs,t_ns_buf **ns_buf,
 +                          gmx_bool bHaveVdW[],int ngid,t_mdatoms *md,
 +                          t_excl bexcl[],t_forcerec *fr,
 +                          put_in_list_t *put_in_list)
 +{
 +    int      shift;
 +    int      j,nrj,jgid;
 +    int      *cginfo=fr->cginfo;
 +    atom_id  cg_j,*cgindex;
 +    t_ns_buf *nsbuf;
 +
 +    cgindex = cgs->index;
 +    if (bBox)
 +    {
 +        shift = CENTRAL;
 +        for(j=0; (j<njcg); j++)
 +        {
 +            cg_j   = jcg[j];
 +            nrj    = cgindex[cg_j+1]-cgindex[cg_j];
 +            if (calc_image_rect(x[icg],x[cg_j],box_size,b_inv,&shift) < rcut2)
 +            {
 +                jgid  = GET_CGINFO_GID(cginfo[cg_j]);
 +                if (!(i_egp_flags[jgid] & EGP_EXCL))
 +                {
 +                    add_simple(&ns_buf[jgid][shift],nrj,cg_j,
 +                               bHaveVdW,ngid,md,icg,jgid,cgs,bexcl,shift,fr,
 +                               put_in_list);
 +                }
 +            }
 +        }
 +    }
 +    else
 +    {
 +        for(j=0; (j<njcg); j++)
 +        {
 +            cg_j   = jcg[j];
 +            nrj    = cgindex[cg_j+1]-cgindex[cg_j];
 +            if ((rcut2 == 0) || (distance2(x[icg],x[cg_j]) < rcut2)) {
 +                jgid  = GET_CGINFO_GID(cginfo[cg_j]);
 +                if (!(i_egp_flags[jgid] & EGP_EXCL))
 +                {
 +                    add_simple(&ns_buf[jgid][CENTRAL],nrj,cg_j,
 +                               bHaveVdW,ngid,md,icg,jgid,cgs,bexcl,CENTRAL,fr,
 +                               put_in_list);
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +/* NOTE (2005): ns_simple_core still needs to be adapted for QM/MM */
 +
 +static int ns_simple_core(t_forcerec *fr,
 +                          gmx_localtop_t *top,
 +                          t_mdatoms *md,
 +                          matrix box,rvec box_size,
 +                          t_excl bexcl[],atom_id *aaj,
 +                          int ngid,t_ns_buf **ns_buf,
 +                          put_in_list_t *put_in_list,gmx_bool bHaveVdW[])
 +{
 +    int      naaj,k;
 +    real     rlist2;
 +    int      nsearch,icg,jcg,igid,i0,nri,nn;
 +    int      *cginfo;
 +    t_ns_buf *nsbuf;
 +    /* atom_id  *i_atoms; */
 +    t_block  *cgs=&(top->cgs);
 +    t_blocka *excl=&(top->excls);
 +    rvec     b_inv;
 +    int      m;
 +    gmx_bool     bBox,bTriclinic;
 +    int      *i_egp_flags;
 +    
 +    rlist2 = sqr(fr->rlist);
 +    
 +    bBox = (fr->ePBC != epbcNONE);
 +    if (bBox)
 +    {
 +        for(m=0; (m<DIM); m++)
 +        {
 +            b_inv[m] = divide_err(1.0,box_size[m]);
 +        }
 +        bTriclinic = TRICLINIC(box);
 +    }
 +    else
 +    {
 +        bTriclinic = FALSE;
 +    }
 +    
 +    cginfo = fr->cginfo;
 +    
 +    nsearch=0;
 +    for (icg=fr->cg0; (icg<fr->hcg); icg++)
 +    {
 +        /*
 +          i0        = cgs->index[icg];
 +          nri       = cgs->index[icg+1]-i0;
 +          i_atoms   = &(cgs->a[i0]);
 +          i_eg_excl = fr->eg_excl + ngid*md->cENER[*i_atoms];
 +          setexcl(nri,i_atoms,excl,TRUE,bexcl);
 +        */
 +        igid = GET_CGINFO_GID(cginfo[icg]);
 +        i_egp_flags = fr->egp_flags + ngid*igid;
 +        setexcl(cgs->index[icg],cgs->index[icg+1],excl,TRUE,bexcl);
 +        
 +        naaj=calc_naaj(icg,cgs->nr);
 +        if (bTriclinic)
 +        {
 +            ns_inner_tric(fr->cg_cm,icg,i_egp_flags,naaj,&(aaj[icg]),
 +                          box,b_inv,rlist2,cgs,ns_buf,
 +                          bHaveVdW,ngid,md,bexcl,fr,put_in_list);
 +        }
 +        else
 +        {
 +            ns_inner_rect(fr->cg_cm,icg,i_egp_flags,naaj,&(aaj[icg]),
 +                          bBox,box_size,b_inv,rlist2,cgs,ns_buf,
 +                          bHaveVdW,ngid,md,bexcl,fr,put_in_list);
 +        }
 +        nsearch += naaj;
 +        
 +        for(nn=0; (nn<ngid); nn++)
 +        {
 +            for(k=0; (k<SHIFTS); k++)
 +            {
 +                nsbuf = &(ns_buf[nn][k]);
 +                if (nsbuf->ncg > 0)
 +                {
 +                    put_in_list(bHaveVdW,ngid,md,icg,nn,nsbuf->ncg,nsbuf->jcg,
 +                                cgs->index,bexcl,k,fr,FALSE,TRUE,TRUE,fr->solvent_opt);
 +                    nsbuf->ncg=nsbuf->nj=0;
 +                }
 +            }
 +        }
 +        /* setexcl(nri,i_atoms,excl,FALSE,bexcl); */
 +        setexcl(cgs->index[icg],cgs->index[icg+1],excl,FALSE,bexcl);
 +    }
 +    close_neighbor_lists(fr,FALSE);
 +    
 +    return nsearch;
 +}
 +
 +/************************************************
 + *
 + *    N S 5     G R I D     S T U F F
 + *
 + ************************************************/
 +
 +static inline void get_dx(int Nx,real gridx,real rc2,int xgi,real x,
 +                          int *dx0,int *dx1,real *dcx2)
 +{
 +    real dcx,tmp;
 +    int  xgi0,xgi1,i;
 +    
 +    if (xgi < 0)
 +    {
 +        *dx0 = 0;
 +        xgi0 = -1;
 +        *dx1 = -1;
 +        xgi1 = 0;
 +    }
 +    else if (xgi >= Nx)
 +    {
 +        *dx0 = Nx;
 +        xgi0 = Nx-1;
 +        *dx1 = Nx-1;
 +        xgi1 = Nx;
 +    }
 +    else
 +    {
 +        dcx2[xgi] = 0;
 +        *dx0 = xgi;
 +        xgi0 = xgi-1;
 +        *dx1 = xgi;
 +        xgi1 = xgi+1;
 +    }
 +    
 +    for(i=xgi0; i>=0; i--)
 +    {
 +        dcx = (i+1)*gridx-x;
 +        tmp = dcx*dcx;
 +        if (tmp >= rc2)
 +            break;
 +        *dx0 = i;
 +        dcx2[i] = tmp;
 +    }
 +    for(i=xgi1; i<Nx; i++)
 +    {
 +        dcx = i*gridx-x;
 +        tmp = dcx*dcx;
 +        if (tmp >= rc2)
 +        {
 +            break;
 +        }
 +        *dx1 = i;
 +        dcx2[i] = tmp;
 +    }
 +}
 +
 +static inline void get_dx_dd(int Nx,real gridx,real rc2,int xgi,real x,
 +                             int ncpddc,int shift_min,int shift_max,
 +                             int *g0,int *g1,real *dcx2)
 +{
 +    real dcx,tmp;
 +    int  g_min,g_max,shift_home;
 +    
 +    if (xgi < 0)
 +    {
 +        g_min = 0;
 +        g_max = Nx - 1;
 +        *g0   = 0;
 +        *g1   = -1;
 +    }
 +    else if (xgi >= Nx)
 +    {
 +        g_min = 0;
 +        g_max = Nx - 1;
 +        *g0   = Nx;
 +        *g1   = Nx - 1;
 +    }
 +    else
 +    {
 +        if (ncpddc == 0)
 +        {
 +            g_min = 0;
 +            g_max = Nx - 1;
 +        }
 +        else
 +        {
 +            if (xgi < ncpddc)
 +            {
 +                shift_home = 0;
 +            }
 +            else
 +            {
 +                shift_home = -1;
 +            }
 +            g_min = (shift_min == shift_home ? 0          : ncpddc);
 +            g_max = (shift_max == shift_home ? ncpddc - 1 : Nx - 1);
 +        }
 +        if (shift_min > 0)
 +        {
 +            *g0 = g_min;
 +            *g1 = g_min - 1;
 +        }
 +        else if (shift_max < 0)
 +        {
 +            *g0 = g_max + 1;
 +            *g1 = g_max;
 +        }
 +        else
 +        {
 +            *g0 = xgi;
 +            *g1 = xgi;
 +            dcx2[xgi] = 0;
 +        }
 +    }
 +    
 +    while (*g0 > g_min)
 +    {
 +        /* Check one grid cell down */
 +        dcx = ((*g0 - 1) + 1)*gridx - x;
 +        tmp = dcx*dcx;
 +        if (tmp >= rc2)
 +        {
 +            break;
 +        }
 +        (*g0)--;
 +        dcx2[*g0] = tmp;
 +    }
 +    
 +    while (*g1 < g_max)
 +    {
 +        /* Check one grid cell up */
 +        dcx = (*g1 + 1)*gridx - x;
 +        tmp = dcx*dcx;
 +        if (tmp >= rc2)
 +        {
 +            break;
 +        }
 +        (*g1)++;
 +        dcx2[*g1] = tmp;
 +    }
 +}
 +
 +
 +#define sqr(x) ((x)*(x))
 +#define calc_dx2(XI,YI,ZI,y) (sqr(XI-y[XX]) + sqr(YI-y[YY]) + sqr(ZI-y[ZZ]))
 +#define calc_cyl_dx2(XI,YI,y) (sqr(XI-y[XX]) + sqr(YI-y[YY]))
 +/****************************************************
 + *
 + *    F A S T   N E I G H B O R  S E A R C H I N G
 + *
 + *    Optimized neighboursearching routine using grid 
 + *    at least 1x1x1, see GROMACS manual
 + *
 + ****************************************************/
 +
 +
 +static void get_cutoff2(t_forcerec *fr,gmx_bool bDoLongRange,
 +                        real *rvdw2,real *rcoul2,
 +                        real *rs2,real *rm2,real *rl2)
 +{
 +    *rs2 = sqr(fr->rlist);
 +
 +    if (bDoLongRange && fr->bTwinRange)
 +    {
 +        /* The VdW and elec. LR cut-off's could be different,
 +         * so we can not simply set them to rlistlong.
 +         */
 +        if (EVDW_MIGHT_BE_ZERO_AT_CUTOFF(fr->vdwtype) &&
 +            fr->rvdw > fr->rlist)
 +        {
 +            *rvdw2  = sqr(fr->rlistlong);
 +        }
 +        else
 +        {
 +            *rvdw2  = sqr(fr->rvdw);
 +        }
 +        if (EEL_MIGHT_BE_ZERO_AT_CUTOFF(fr->eeltype) &&
 +            fr->rcoulomb > fr->rlist)
 +        {
 +            *rcoul2 = sqr(fr->rlistlong);
 +        }
 +        else
 +        {
 +            *rcoul2 = sqr(fr->rcoulomb);
 +        }
 +    }
 +    else
 +    {
 +        /* Workaround for a gcc -O3 or -ffast-math problem */
 +        *rvdw2  = *rs2;
 +        *rcoul2 = *rs2;
 +    }
 +    *rm2 = min(*rvdw2,*rcoul2);
 +    *rl2 = max(*rvdw2,*rcoul2);
 +}
 +
 +static void init_nsgrid_lists(t_forcerec *fr,int ngid,gmx_ns_t *ns)
 +{
 +    real rvdw2,rcoul2,rs2,rm2,rl2;
 +    int j;
 +
 +    get_cutoff2(fr,TRUE,&rvdw2,&rcoul2,&rs2,&rm2,&rl2);
 +
 +    /* Short range buffers */
 +    snew(ns->nl_sr,ngid);
 +    /* Counters */
 +    snew(ns->nsr,ngid);
 +    snew(ns->nlr_ljc,ngid);
 +    snew(ns->nlr_one,ngid);
 +    
 +    /* Always allocate both list types, since rcoulomb might now change with PME load balancing */
 +    /* Long range VdW and Coul buffers */
 +    snew(ns->nl_lr_ljc,ngid);
 +    /* Long range VdW or Coul only buffers */
 +    snew(ns->nl_lr_one,ngid);
 +
 +    for(j=0; (j<ngid); j++) {
 +        snew(ns->nl_sr[j],MAX_CG);
 +        snew(ns->nl_lr_ljc[j],MAX_CG);
 +        snew(ns->nl_lr_one[j],MAX_CG);
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug,
 +                "ns5_core: rs2 = %g, rm2 = %g, rl2 = %g (nm^2)\n",
 +                rs2,rm2,rl2);
 +    }
 +}
 +
 +static int nsgrid_core(FILE *log,t_commrec *cr,t_forcerec *fr,
 +                       matrix box,rvec box_size,int ngid,
 +                       gmx_localtop_t *top,
 +                       t_grid *grid,rvec x[],
 +                       t_excl bexcl[],gmx_bool *bExcludeAlleg,
 +                       t_nrnb *nrnb,t_mdatoms *md,
 +                       real *lambda,real *dvdlambda,
 +                       gmx_grppairener_t *grppener,
 +                       put_in_list_t *put_in_list,
 +                       gmx_bool bHaveVdW[],
 +                       gmx_bool bDoLongRange,gmx_bool bMakeQMMMnblist)
 +{
 +    gmx_ns_t *ns;
 +    atom_id **nl_lr_ljc,**nl_lr_one,**nl_sr;
 +    int     *nlr_ljc,*nlr_one,*nsr;
 +    gmx_domdec_t *dd=NULL;
 +    t_block *cgs=&(top->cgs);
 +    int     *cginfo=fr->cginfo;
 +    /* atom_id *i_atoms,*cgsindex=cgs->index; */
 +    ivec    sh0,sh1,shp;
 +    int     cell_x,cell_y,cell_z;
 +    int     d,tx,ty,tz,dx,dy,dz,cj;
 +#ifdef ALLOW_OFFDIAG_LT_HALFDIAG
 +    int     zsh_ty,zsh_tx,ysh_tx;
 +#endif
 +    int     dx0,dx1,dy0,dy1,dz0,dz1;
 +    int     Nx,Ny,Nz,shift=-1,j,nrj,nns,nn=-1;
 +    real    gridx,gridy,gridz,grid_x,grid_y,grid_z;
 +    real    *dcx2,*dcy2,*dcz2;
 +    int     zgi,ygi,xgi;
 +    int     cg0,cg1,icg=-1,cgsnr,i0,igid,nri,naaj,max_jcg;
 +    int     jcg0,jcg1,jjcg,cgj0,jgid;
 +    int     *grida,*gridnra,*gridind;
 +    gmx_bool    rvdw_lt_rcoul,rcoul_lt_rvdw;
 +    rvec    xi,*cgcm,grid_offset;
 +    real    r2,rs2,rvdw2,rcoul2,rm2,rl2,XI,YI,ZI,dcx,dcy,dcz,tmp1,tmp2;
 +    int     *i_egp_flags;
 +    gmx_bool    bDomDec,bTriclinicX,bTriclinicY;
 +    ivec    ncpddc;
 +    
 +    ns = &fr->ns;
 +    
 +    bDomDec = DOMAINDECOMP(cr);
 +    if (bDomDec)
 +    {
 +        dd = cr->dd;
 +    }
 +    
 +    bTriclinicX = ((YY < grid->npbcdim &&
 +                    (!bDomDec || dd->nc[YY]==1) && box[YY][XX] != 0) ||
 +                   (ZZ < grid->npbcdim &&
 +                    (!bDomDec || dd->nc[ZZ]==1) && box[ZZ][XX] != 0));
 +    bTriclinicY =  (ZZ < grid->npbcdim &&
 +                    (!bDomDec || dd->nc[ZZ]==1) && box[ZZ][YY] != 0);
 +    
 +    cgsnr    = cgs->nr;
 +
 +    get_cutoff2(fr,bDoLongRange,&rvdw2,&rcoul2,&rs2,&rm2,&rl2);
 +
 +    rvdw_lt_rcoul = (rvdw2 >= rcoul2);
 +    rcoul_lt_rvdw = (rcoul2 >= rvdw2);
 +    
 +    if (bMakeQMMMnblist)
 +    {
 +        rm2 = rl2;
 +        rs2 = rl2;
 +    }
 +
 +    nl_sr     = ns->nl_sr;
 +    nsr       = ns->nsr;
 +    nl_lr_ljc = ns->nl_lr_ljc;
 +    nl_lr_one = ns->nl_lr_one;
 +    nlr_ljc   = ns->nlr_ljc;
 +    nlr_one   = ns->nlr_one;
 +    
 +    /* Unpack arrays */
 +    cgcm    = fr->cg_cm;
 +    Nx      = grid->n[XX];
 +    Ny      = grid->n[YY];
 +    Nz      = grid->n[ZZ];
 +    grida   = grid->a;
 +    gridind = grid->index;
 +    gridnra = grid->nra;
 +    nns     = 0;
 +    
 +    gridx      = grid->cell_size[XX];
 +    gridy      = grid->cell_size[YY];
 +    gridz      = grid->cell_size[ZZ];
 +    grid_x     = 1/gridx;
 +    grid_y     = 1/gridy;
 +    grid_z     = 1/gridz;
 +    copy_rvec(grid->cell_offset,grid_offset);
 +    copy_ivec(grid->ncpddc,ncpddc);
 +    dcx2       = grid->dcx2;
 +    dcy2       = grid->dcy2;
 +    dcz2       = grid->dcz2;
 +    
 +#ifdef ALLOW_OFFDIAG_LT_HALFDIAG
 +    zsh_ty = floor(-box[ZZ][YY]/box[YY][YY]+0.5);
 +    zsh_tx = floor(-box[ZZ][XX]/box[XX][XX]+0.5);
 +    ysh_tx = floor(-box[YY][XX]/box[XX][XX]+0.5);
 +    if (zsh_tx!=0 && ysh_tx!=0)
 +    {
 +        /* This could happen due to rounding, when both ratios are 0.5 */
 +        ysh_tx = 0;
 +    }
 +#endif
 +    
 +    debug_gmx();
 +
 +    if (fr->n_tpi)
 +    {
 +        /* We only want a list for the test particle */
 +        cg0 = cgsnr - 1;
 +    }
 +    else
 +    {
 +        cg0 = grid->icg0;
 +    }
 +    cg1 = grid->icg1;
 +
 +    /* Set the shift range */
 +    for(d=0; d<DIM; d++)
 +    {
 +        sh0[d] = -1;
 +        sh1[d] = 1;
 +        /* Check if we need periodicity shifts.
 +         * Without PBC or with domain decomposition we don't need them.
 +         */
 +        if (d >= ePBC2npbcdim(fr->ePBC) || (bDomDec && dd->nc[d] > 1))
 +        {
 +            shp[d] = 0;
 +        }
 +        else
 +        {
 +            if (d == XX &&
 +                box[XX][XX] - fabs(box[YY][XX]) - fabs(box[ZZ][XX]) < sqrt(rl2))
 +            {
 +                shp[d] = 2;
 +            }
 +            else
 +            {
 +                shp[d] = 1;
 +            }
 +        }
 +    }
 +    
 +    /* Loop over charge groups */
 +    for(icg=cg0; (icg < cg1); icg++)
 +    {
 +        igid = GET_CGINFO_GID(cginfo[icg]);
 +        /* Skip this charge group if all energy groups are excluded! */
 +        if (bExcludeAlleg[igid])
 +        {
 +            continue;
 +        }
 +        
 +        i0   = cgs->index[icg];
 +        
 +        if (bMakeQMMMnblist)
 +        { 
 +            /* Skip this charge group if it is not a QM atom while making a
 +             * QM/MM neighbourlist
 +             */
 +            if (md->bQM[i0]==FALSE)
 +            {
 +                continue; /* MM particle, go to next particle */ 
 +            }
 +            
 +            /* Compute the number of charge groups that fall within the control
 +             * of this one (icg)
 +             */
 +            naaj    = calc_naaj(icg,cgsnr);
 +            jcg0    = icg;
 +            jcg1    = icg + naaj;
 +            max_jcg = cgsnr;       
 +        } 
 +        else
 +        { 
 +            /* make a normal neighbourlist */
 +            
 +            if (bDomDec)
 +            {
 +                /* Get the j charge-group and dd cell shift ranges */
 +                dd_get_ns_ranges(cr->dd,icg,&jcg0,&jcg1,sh0,sh1);
 +                max_jcg = 0;
 +            }
 +            else
 +            {
 +                /* Compute the number of charge groups that fall within the control
 +                 * of this one (icg)
 +                 */
 +                naaj = calc_naaj(icg,cgsnr);
 +                jcg0 = icg;
 +                jcg1 = icg + naaj;
 +                
 +                if (fr->n_tpi)
 +                {
 +                    /* The i-particle is awlways the test particle,
 +                     * so we want all j-particles
 +                     */
 +                    max_jcg = cgsnr - 1;
 +                }
 +                else
 +                {
 +                    max_jcg  = jcg1 - cgsnr;
 +                }
 +            }
 +        }
 +        
 +        i_egp_flags = fr->egp_flags + igid*ngid;
 +        
 +        /* Set the exclusions for the atoms in charge group icg using a bitmask */
 +        setexcl(i0,cgs->index[icg+1],&top->excls,TRUE,bexcl);
 +        
 +        ci2xyz(grid,icg,&cell_x,&cell_y,&cell_z);
 +        
 +        /* Changed iicg to icg, DvdS 990115 
 +         * (but see consistency check above, DvdS 990330) 
 +         */
 +#ifdef NS5DB
 +        fprintf(log,"icg=%5d, naaj=%5d, cell %d %d %d\n",
 +                icg,naaj,cell_x,cell_y,cell_z);
 +#endif
 +        /* Loop over shift vectors in three dimensions */
 +        for (tz=-shp[ZZ]; tz<=shp[ZZ]; tz++)
 +        {
 +            ZI = cgcm[icg][ZZ]+tz*box[ZZ][ZZ];
 +            /* Calculate range of cells in Z direction that have the shift tz */
 +            zgi = cell_z + tz*Nz;
 +#define FAST_DD_NS
 +#ifndef FAST_DD_NS
 +            get_dx(Nz,gridz,rl2,zgi,ZI,&dz0,&dz1,dcz2);
 +#else
 +            get_dx_dd(Nz,gridz,rl2,zgi,ZI-grid_offset[ZZ],
 +                      ncpddc[ZZ],sh0[ZZ],sh1[ZZ],&dz0,&dz1,dcz2);
 +#endif
 +            if (dz0 > dz1)
 +            {
 +                continue;
 +            }
 +            for (ty=-shp[YY]; ty<=shp[YY]; ty++)
 +            {
 +                YI = cgcm[icg][YY]+ty*box[YY][YY]+tz*box[ZZ][YY];
 +                /* Calculate range of cells in Y direction that have the shift ty */
 +                if (bTriclinicY)
 +                {
 +                    ygi = (int)(Ny + (YI - grid_offset[YY])*grid_y) - Ny;
 +                }
 +                else
 +                {
 +                    ygi = cell_y + ty*Ny;
 +                }
 +#ifndef FAST_DD_NS
 +                get_dx(Ny,gridy,rl2,ygi,YI,&dy0,&dy1,dcy2);
 +#else
 +                get_dx_dd(Ny,gridy,rl2,ygi,YI-grid_offset[YY],
 +                          ncpddc[YY],sh0[YY],sh1[YY],&dy0,&dy1,dcy2);
 +#endif
 +                if (dy0 > dy1)
 +                {
 +                    continue;
 +                }
 +                for (tx=-shp[XX]; tx<=shp[XX]; tx++)
 +                {
 +                    XI = cgcm[icg][XX]+tx*box[XX][XX]+ty*box[YY][XX]+tz*box[ZZ][XX];
 +                    /* Calculate range of cells in X direction that have the shift tx */
 +                    if (bTriclinicX)
 +                    {
 +                        xgi = (int)(Nx + (XI - grid_offset[XX])*grid_x) - Nx;
 +                    }
 +                    else
 +                    {
 +                        xgi = cell_x + tx*Nx;
 +                    }
 +#ifndef FAST_DD_NS
 +                    get_dx(Nx,gridx,rl2,xgi*Nx,XI,&dx0,&dx1,dcx2);
 +#else
 +                    get_dx_dd(Nx,gridx,rl2,xgi,XI-grid_offset[XX],
 +                              ncpddc[XX],sh0[XX],sh1[XX],&dx0,&dx1,dcx2);
 +#endif
 +                    if (dx0 > dx1)
 +                    {
 +                        continue;
 +                    }
 +                    /* Adress: an explicit cg that has a weigthing function of 0 is excluded
 +                     *  from the neigbour list as it will not interact  */
 +                    if (fr->adress_type != eAdressOff){
 +                        if (md->wf[cgs->index[icg]]==0 && egp_explicit(fr, igid)){
 +                            continue;
 +                        }
 +                    }
 +                    /* Get shift vector */      
 +                    shift=XYZ2IS(tx,ty,tz);
 +#ifdef NS5DB
 +                    range_check(shift,0,SHIFTS);
 +#endif
 +                    for(nn=0; (nn<ngid); nn++)
 +                    {
 +                        nsr[nn]      = 0;
 +                        nlr_ljc[nn]  = 0;
 +                        nlr_one[nn] = 0;
 +                    }
 +#ifdef NS5DB
 +                    fprintf(log,"shift: %2d, dx0,1: %2d,%2d, dy0,1: %2d,%2d, dz0,1: %2d,%2d\n",
 +                            shift,dx0,dx1,dy0,dy1,dz0,dz1);
 +                    fprintf(log,"cgcm: %8.3f  %8.3f  %8.3f\n",cgcm[icg][XX],
 +                            cgcm[icg][YY],cgcm[icg][ZZ]);
 +                    fprintf(log,"xi:   %8.3f  %8.3f  %8.3f\n",XI,YI,ZI);
 +#endif
 +                    for (dx=dx0; (dx<=dx1); dx++)
 +                    {
 +                        tmp1 = rl2 - dcx2[dx];
 +                        for (dy=dy0; (dy<=dy1); dy++)
 +                        {
 +                            tmp2 = tmp1 - dcy2[dy];
 +                            if (tmp2 > 0)
 +                            {
 +                                for (dz=dz0; (dz<=dz1); dz++) {
 +                                    if (tmp2 > dcz2[dz]) {
 +                                        /* Find grid-cell cj in which possible neighbours are */
 +                                        cj   = xyz2ci(Ny,Nz,dx,dy,dz);
 +                                        
 +                                        /* Check out how many cgs (nrj) there in this cell */
 +                                        nrj  = gridnra[cj];
 +                                        
 +                                        /* Find the offset in the cg list */
 +                                        cgj0 = gridind[cj];
 +                                        
 +                                        /* Check if all j's are out of range so we
 +                                         * can skip the whole cell.
 +                                         * Should save some time, especially with DD.
 +                                         */
 +                                        if (nrj == 0 ||
 +                                            (grida[cgj0] >= max_jcg &&
 +                                             (grida[cgj0] >= jcg1 || grida[cgj0+nrj-1] < jcg0)))
 +                                        {
 +                                            continue;
 +                                        }
 +                                        
 +                                        /* Loop over cgs */
 +                                        for (j=0; (j<nrj); j++)
 +                                        {
 +                                            jjcg = grida[cgj0+j];
 +                                            
 +                                            /* check whether this guy is in range! */
 +                                            if ((jjcg >= jcg0 && jjcg < jcg1) ||
 +                                                (jjcg < max_jcg))
 +                                            {
 +                                                r2=calc_dx2(XI,YI,ZI,cgcm[jjcg]);
 +                                                if (r2 < rl2) {
 +                                                    /* jgid = gid[cgsatoms[cgsindex[jjcg]]]; */
 +                                                    jgid = GET_CGINFO_GID(cginfo[jjcg]);
 +                                                    /* check energy group exclusions */
 +                                                    if (!(i_egp_flags[jgid] & EGP_EXCL))
 +                                                    {
 +                                                        if (r2 < rs2)
 +                                                        {
 +                                                            if (nsr[jgid] >= MAX_CG)
 +                                                            {
 +                                                                /* Add to short-range list */
 +                                                                put_in_list(bHaveVdW,ngid,md,icg,jgid,
 +                                                                            nsr[jgid],nl_sr[jgid],
 +                                                                            cgs->index,/* cgsatoms, */ bexcl,
 +                                                                            shift,fr,FALSE,TRUE,TRUE,fr->solvent_opt);
 +                                                                nsr[jgid]=0;
 +                                                            }
 +                                                            nl_sr[jgid][nsr[jgid]++]=jjcg;
 +                                                        } 
 +                                                        else if (r2 < rm2)
 +                                                        {
 +                                                            if (nlr_ljc[jgid] >= MAX_CG)
 +                                                            {
 +                                                                /* Add to LJ+coulomb long-range list */
 +                                                                put_in_list(bHaveVdW,ngid,md,icg,jgid,
 +                                                                            nlr_ljc[jgid],nl_lr_ljc[jgid],top->cgs.index,
 +                                                                            bexcl,shift,fr,TRUE,TRUE,TRUE,fr->solvent_opt);
 +                                                                nlr_ljc[jgid]=0;
 +                                                            }
 +                                                            nl_lr_ljc[jgid][nlr_ljc[jgid]++]=jjcg;
 +                                                        }
 +                                                        else
 +                                                        {
 +                                                            if (nlr_one[jgid] >= MAX_CG)
 +                                                            {
 +                                                                /* Add to long-range list with only coul, or only LJ */
 +                                                                put_in_list(bHaveVdW,ngid,md,icg,jgid,
 +                                                                            nlr_one[jgid],nl_lr_one[jgid],top->cgs.index,
 +                                                                            bexcl,shift,fr,TRUE,rvdw_lt_rcoul,rcoul_lt_rvdw,fr->solvent_opt);
 +                                                                nlr_one[jgid]=0;
 +                                                            }
 +                                                            nl_lr_one[jgid][nlr_one[jgid]++]=jjcg;
 +                                                        }
 +                                                    }
 +                                                }
 +                                                nns++;
 +                                            }
 +                                        }
 +                                    }
 +                                }
 +                            }
 +                        }
 +                    }
 +                    /* CHECK whether there is anything left in the buffers */
 +                    for(nn=0; (nn<ngid); nn++)
 +                    {
 +                        if (nsr[nn] > 0)
 +                        {
 +                            put_in_list(bHaveVdW,ngid,md,icg,nn,nsr[nn],nl_sr[nn],
 +                                        cgs->index, /* cgsatoms, */ bexcl,
 +                                        shift,fr,FALSE,TRUE,TRUE,fr->solvent_opt);
 +                        }
 +                        
 +                        if (nlr_ljc[nn] > 0)
 +                        {
 +                            put_in_list(bHaveVdW,ngid,md,icg,nn,nlr_ljc[nn],
 +                                        nl_lr_ljc[nn],top->cgs.index,
 +                                        bexcl,shift,fr,TRUE,TRUE,TRUE,fr->solvent_opt);
 +                        }
 +                        
 +                        if (nlr_one[nn] > 0)
 +                        {
 +                            put_in_list(bHaveVdW,ngid,md,icg,nn,nlr_one[nn],
 +                                        nl_lr_one[nn],top->cgs.index,
 +                                        bexcl,shift,fr,TRUE,rvdw_lt_rcoul,rcoul_lt_rvdw,fr->solvent_opt);
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +        /* setexcl(nri,i_atoms,&top->atoms.excl,FALSE,bexcl); */
 +        setexcl(cgs->index[icg],cgs->index[icg+1],&top->excls,FALSE,bexcl);
 +    }
 +    /* No need to perform any left-over force calculations anymore (as we used to do here)
 +     * since we now save the proper long-range lists for later evaluation.
 +     */
 +
 +    debug_gmx();
 +     
 +    /* Close neighbourlists */
 +    close_neighbor_lists(fr,bMakeQMMMnblist);
 +    
 +    return nns;
 +}
 +
 +void ns_realloc_natoms(gmx_ns_t *ns,int natoms)
 +{
 +    int i;
 +    
 +    if (natoms > ns->nra_alloc)
 +    {
 +        ns->nra_alloc = over_alloc_dd(natoms);
 +        srenew(ns->bexcl,ns->nra_alloc);
 +        for(i=0; i<ns->nra_alloc; i++)
 +        {
 +            ns->bexcl[i] = 0;
 +        }
 +    }
 +}
 +
 +void init_ns(FILE *fplog,const t_commrec *cr,
 +             gmx_ns_t *ns,t_forcerec *fr,
 +             const gmx_mtop_t *mtop,
 +             matrix box)
 +{
 +    int  mt,icg,nr_in_cg,maxcg,i,j,jcg,ngid,ncg;
 +    t_block *cgs;
 +    char *ptr;
 +    
 +    /* Compute largest charge groups size (# atoms) */
 +    nr_in_cg=1;
 +    for(mt=0; mt<mtop->nmoltype; mt++) {
 +        cgs = &mtop->moltype[mt].cgs;
 +        for (icg=0; (icg < cgs->nr); icg++)
 +        {
 +            nr_in_cg=max(nr_in_cg,(int)(cgs->index[icg+1]-cgs->index[icg]));
 +        }
 +    }
 +
 +    /* Verify whether largest charge group is <= max cg.
 +     * This is determined by the type of the local exclusion type 
 +     * Exclusions are stored in bits. (If the type is not large
 +     * enough, enlarge it, unsigned char -> unsigned short -> unsigned long)
 +     */
 +    maxcg = sizeof(t_excl)*8;
 +    if (nr_in_cg > maxcg)
 +    {
 +        gmx_fatal(FARGS,"Max #atoms in a charge group: %d > %d\n",
 +                  nr_in_cg,maxcg);
 +    }
 +    
 +    ngid = mtop->groups.grps[egcENER].nr;
 +    snew(ns->bExcludeAlleg,ngid);
 +    for(i=0; i<ngid; i++) {
 +        ns->bExcludeAlleg[i] = TRUE;
 +        for(j=0; j<ngid; j++)
 +        {
 +            if (!(fr->egp_flags[i*ngid+j] & EGP_EXCL))
 +            {
 +                ns->bExcludeAlleg[i] = FALSE;
 +            }
 +        }
 +    }
 +    
 +    if (fr->bGrid) {
 +        /* Grid search */
 +        ns->grid = init_grid(fplog,fr);
 +        init_nsgrid_lists(fr,ngid,ns);
 +    }
 +    else
 +    {
 +        /* Simple search */
 +        snew(ns->ns_buf,ngid);
 +        for(i=0; (i<ngid); i++)
 +        {
 +            snew(ns->ns_buf[i],SHIFTS);
 +        }
 +        ncg = ncg_mtop(mtop);
 +        snew(ns->simple_aaj,2*ncg);
 +        for(jcg=0; (jcg<ncg); jcg++)
 +        {
 +            ns->simple_aaj[jcg]     = jcg;
 +            ns->simple_aaj[jcg+ncg] = jcg;
 +        }
 +    }
 +    
 +    /* Create array that determines whether or not atoms have VdW */
 +    snew(ns->bHaveVdW,fr->ntype);
 +    for(i=0; (i<fr->ntype); i++)
 +    {
 +        for(j=0; (j<fr->ntype); j++)
 +        {
 +            ns->bHaveVdW[i] = (ns->bHaveVdW[i] || 
 +                               (fr->bBHAM ? 
 +                                ((BHAMA(fr->nbfp,fr->ntype,i,j) != 0) ||
 +                                 (BHAMB(fr->nbfp,fr->ntype,i,j) != 0) ||
 +                                 (BHAMC(fr->nbfp,fr->ntype,i,j) != 0)) :
 +                                ((C6(fr->nbfp,fr->ntype,i,j) != 0) ||
 +                                 (C12(fr->nbfp,fr->ntype,i,j) != 0))));
 +        }
 +    }
 +    if (debug) 
 +        pr_bvec(debug,0,"bHaveVdW",ns->bHaveVdW,fr->ntype,TRUE);
 +    
 +    ns->nra_alloc = 0;
 +    ns->bexcl = NULL;
 +    if (!DOMAINDECOMP(cr))
 +    {
 +        /* This could be reduced with particle decomposition */
 +        ns_realloc_natoms(ns,mtop->natoms);
 +    }
 +
 +    ns->nblist_initialized=FALSE;
 +
 +    /* nbr list debug dump */
 +    {
 +        char *ptr=getenv("GMX_DUMP_NL");
 +        if (ptr)
 +        {
 +            ns->dump_nl=strtol(ptr,NULL,10);
 +            if (fplog)
 +            {
 +                fprintf(fplog, "GMX_DUMP_NL = %d", ns->dump_nl);
 +            }
 +        }
 +        else
 +        {
 +            ns->dump_nl=0;
 +        }
 +    }
 +}
 +
 +                       
 +/* Build the neighbour lists stored in fr and return the value obtained
 + * from the search core(s), which is also added to nrnb under eNR_NS.
 + *
 + * Steps, in order:
 + *  - with PBC, check the cut-off (fr->rlistlong) against the box and call
 + *    gmx_fatal() when the box has become too small;
 + *  - with domain decomposition, resize ns->bexcl for the local atom count;
 + *  - reset the neighbour lists;
 + *  - for grid search with bFillGrid set, (re)fill the grid; otherwise, with
 + *    fr->n_tpi set, only put the last charge group on the grid;
 + *  - select the put_in_list routine and run the grid or simple search core.
 + *
 + * NOTE(review): the locals i, j, ptr, i_egp_flags, cg_start and cg_end and
 + * the parameter bPadListsForKernels are not used in this routine.
 + */
 +int search_neighbours(FILE *log,t_forcerec *fr,
 +                      rvec x[],matrix box,
 +                      gmx_localtop_t *top,
 +                      gmx_groups_t *groups,
 +                      t_commrec *cr,
 +                      t_nrnb *nrnb,t_mdatoms *md,
 +                      real *lambda,real *dvdlambda,
 +                      gmx_grppairener_t *grppener,
 +                      gmx_bool bFillGrid,
 +                      gmx_bool bDoLongRangeNS,
 +                      gmx_bool bPadListsForKernels)
 +{
 +    t_block  *cgs=&(top->cgs);
 +    rvec     box_size,grid_x0,grid_x1;
 +    int      i,j,m,ngid;
 +    real     min_size,grid_dens;
 +    int      nsearch;
 +    gmx_bool     bGrid;
 +    char     *ptr;
 +    gmx_bool     *i_egp_flags;
 +    int      cg_start,cg_end,start,end;
 +    gmx_ns_t *ns;
 +    t_grid   *grid;
 +    gmx_domdec_zones_t *dd_zones;
 +    put_in_list_t *put_in_list;
 +
 +    ns = &fr->ns;
 +
 +    /* Set some local variables */
 +    bGrid = fr->bGrid;
 +    ngid = groups->grps[egcENER].nr;
 +    
 +    /* box_size holds the diagonal elements of the box matrix */
 +    for(m=0; (m<DIM); m++)
 +    {
 +        box_size[m] = box[m][m];
 +    }
 +  
 +    if (fr->ePBC != epbcNONE)
 +    {
 +        if (sqr(fr->rlistlong) >= max_cutoff2(fr->ePBC,box))
 +        {
 +            gmx_fatal(FARGS,"One of the box vectors has become shorter than twice the cut-off length or box_yy-|box_zy| or box_zz has become smaller than the cut-off.");
 +        }
 +        if (!bGrid)
 +        {
 +            /* The simple search has an additional check on the box diagonal */
 +            min_size = min(box_size[XX],min(box_size[YY],box_size[ZZ]));
 +            if (2*fr->rlistlong >= min_size)
 +                gmx_fatal(FARGS,"One of the box diagonal elements has become smaller than twice the cut-off length.");
 +        }
 +    }
 +    
 +    if (DOMAINDECOMP(cr))
 +    {
 +        /* cgs->index[cgs->nr] is the atom index one past the last charge group */
 +        ns_realloc_natoms(ns,cgs->index[cgs->nr]);
 +    }
 +    debug_gmx();
 +    
 +    /* Reset the neighbourlists */
 +    reset_neighbor_lists(fr,TRUE,TRUE);
 +    
 +    if (bGrid && bFillGrid)
 +    {
 +              
 +        grid = ns->grid;
 +        if (DOMAINDECOMP(cr))
 +        {
 +            dd_zones = domdec_zones(cr->dd);
 +        }
 +        else
 +        {
 +            dd_zones = NULL;
 +
 +            get_nsgrid_boundaries(grid->nboundeddim,box,NULL,NULL,NULL,NULL,
 +                                  cgs->nr,fr->cg_cm,grid_x0,grid_x1,&grid_dens);
 +
 +            grid_first(log,grid,NULL,NULL,fr->ePBC,box,grid_x0,grid_x1,
 +                       fr->rlistlong,grid_dens);
 +        }
 +        debug_gmx();
 +        
 +        /* Don't know why this all is... (DvdS 3/99) */
 +#ifndef SEGV
 +        start = 0;
 +        end   = cgs->nr;
 +#else
 +        start = fr->cg0;
 +        end   = (cgs->nr+1)/2;
 +#endif
 +        
 +        if (DOMAINDECOMP(cr))
 +        {
 +            /* All local charge groups go on the grid; the i-range ends at the
 +             * cg1 value of the last i-zone.
 +             */
 +            end = cgs->nr;
 +            fill_grid(log,dd_zones,grid,end,-1,end,fr->cg_cm);
 +            grid->icg0 = 0;
 +            grid->icg1 = dd_zones->izone[dd_zones->nizone-1].cg1;
 +        }
 +        else
 +        {
 +            fill_grid(log,NULL,grid,cgs->nr,fr->cg0,fr->hcg,fr->cg_cm);
 +            grid->icg0 = fr->cg0;
 +            grid->icg1 = fr->hcg;
 +            debug_gmx();
 +            
 +            if (PARTDECOMP(cr))
 +                mv_grid(cr,grid);
 +            debug_gmx();
 +        }
 +        
 +        calc_elemnr(log,grid,start,end,cgs->nr);
 +        calc_ptrs(grid);
 +        grid_last(log,grid,start,end,cgs->nr);
 +        
 +        if (gmx_debug_at)
 +        {
 +            check_grid(debug,grid);
 +            print_grid(debug,grid);
 +        }
 +    }
 +    else if (fr->n_tpi)
 +    {
 +        /* Set the grid cell index for the test particle only.
 +         * The cell to cg index is not corrected, but that does not matter.
 +         */
 +        fill_grid(log,NULL,ns->grid,fr->hcg,fr->hcg-1,fr->hcg,fr->cg_cm);
 +    }
 +    debug_gmx();
++
++    /* Pick the routine that stores pairs in the lists: the AdResS variant
++     * when AdResS is on, otherwise the charge-group or atom based variant
++     * depending on fr->ns.bCGlist.
++     */
++    if (fr->adress_type == eAdressOff){
++        if (!fr->ns.bCGlist)
++        {
++            put_in_list = put_in_list_at;
++        }
++        else
++        {
++            put_in_list = put_in_list_cg;
++        }
++    }else{
++         put_in_list = put_in_list_adress;
 +    }
 +
 +    /* Do the core! */
 +    if (bGrid)
 +    {
 +        grid = ns->grid;
 +        nsearch = nsgrid_core(log,cr,fr,box,box_size,ngid,top,
 +                              grid,x,ns->bexcl,ns->bExcludeAlleg,
 +                              nrnb,md,lambda,dvdlambda,grppener,
 +                              put_in_list,ns->bHaveVdW,
 +                              bDoLongRangeNS,FALSE);
 +        
 +        /* neighbour searching without QMMM! QM atoms have zero charge in
 +         * the classical calculation. The charge-charge interaction
 +         * between QM and MM atoms is handled in the QMMM core calculation
 +         * (see QMMM.c). The VDW however, we'd like to compute classically
 +         * and the QM MM atom pairs have just been put in the
 +         * corresponding neighbourlists. in case of QMMM we still need to
 +         * fill a special QMMM neighbourlist that contains all neighbours
 +         * of the QM atoms. If bQMMM is true, this list will now be made: 
 +         */
 +        if (fr->bQMMM && fr->qr->QMMMscheme!=eQMMMschemeoniom)
 +        {
 +            nsearch += nsgrid_core(log,cr,fr,box,box_size,ngid,top,
 +                                   grid,x,ns->bexcl,ns->bExcludeAlleg,
 +                                   nrnb,md,lambda,dvdlambda,grppener,
 +                                   put_in_list_qmmm,ns->bHaveVdW,
 +                                   bDoLongRangeNS,TRUE);
 +        }
 +    }
 +    else 
 +    {
 +        nsearch = ns_simple_core(fr,top,md,box,box_size,
 +                                 ns->bexcl,ns->simple_aaj,
 +                                 ngid,ns->ns_buf,put_in_list,ns->bHaveVdW);
 +    }
 +    debug_gmx();
 +
 +#ifdef DEBUG
 +    pr_nsblock(log);
 +#endif
 +    
 +    /* Account for the search in the flop counters */
 +    inc_nrnb(nrnb,eNR_NS,nsearch);
 +    /* inc_nrnb(nrnb,eNR_LR,fr->nlr); */
 +    
 +    return nsearch;
 +}
 +
 +int natoms_beyond_ns_buffer(t_inputrec *ir,t_forcerec *fr,t_block *cgs,
 +                            matrix scale_tot,rvec *x)
 +{
 +    int  cg0,cg1,cg,a0,a1,a,i,j;
 +    real rint,hbuf2,scale;
 +    rvec *cg_cm,cgsc;
 +    gmx_bool bIsotropic;
 +    int  nBeyond;
 +    
 +    nBeyond = 0;
 +    
 +    rint = max(ir->rcoulomb,ir->rvdw);
 +    if (ir->rlist < rint)
 +    {
 +        gmx_fatal(FARGS,"The neighbor search buffer has negative size: %f nm",
 +                  ir->rlist - rint);
 +    }
 +    cg_cm = fr->cg_cm;
 +    
 +    cg0 = fr->cg0;
 +    cg1 = fr->hcg;
 +    
 +    if (!EI_DYNAMICS(ir->eI) || !DYNAMIC_BOX(*ir))
 +    {
 +        hbuf2 = sqr(0.5*(ir->rlist - rint));
 +        for(cg=cg0; cg<cg1; cg++)
 +        {
 +            a0 = cgs->index[cg];
 +            a1 = cgs->index[cg+1];
 +            for(a=a0; a<a1; a++)
 +            {
 +                if (distance2(cg_cm[cg],x[a]) > hbuf2)
 +                {
 +                    nBeyond++;
 +                }
 +            }
 +        }
 +    }
 +    else
 +    {
 +        bIsotropic = TRUE;
 +        scale = scale_tot[0][0];
 +        for(i=1; i<DIM; i++)
 +        {
 +            /* With anisotropic scaling, the original spherical ns volumes become
 +             * ellipsoids. To avoid costly transformations we use the minimum
 +             * eigenvalue of the scaling matrix for determining the buffer size.
 +             * Since the lower half is 0, the eigenvalues are the diagonal elements.
 +             */
 +            scale = min(scale,scale_tot[i][i]);
 +            if (scale_tot[i][i] != scale_tot[i-1][i-1])
 +            {
 +                bIsotropic = FALSE;
 +            }
 +            for(j=0; j<i; j++)
 +            {
 +                if (scale_tot[i][j] != 0)
 +                {
 +                    bIsotropic = FALSE;
 +                }
 +            }
 +        }
 +        hbuf2 = sqr(0.5*(scale*ir->rlist - rint));
 +        if (bIsotropic)
 +        {
 +            for(cg=cg0; cg<cg1; cg++)
 +            {
 +                svmul(scale,cg_cm[cg],cgsc);
 +                a0 = cgs->index[cg];
 +                a1 = cgs->index[cg+1];
 +                for(a=a0; a<a1; a++)
 +                {
 +                    if (distance2(cgsc,x[a]) > hbuf2)
 +                    {                    
 +                        nBeyond++;
 +                    }
 +                }
 +            }
 +        }
 +        else
 +        {
 +            /* Anistropic scaling */
 +            for(cg=cg0; cg<cg1; cg++)
 +            {
 +                /* Since scale_tot contains the transpose of the scaling matrix,
 +                 * we need to multiply with the transpose.
 +                 */
 +                tmvmul_ur0(scale_tot,cg_cm[cg],cgsc);
 +                a0 = cgs->index[cg];
 +                a1 = cgs->index[cg+1];
 +                for(a=a0; a<a1; a++)
 +                {
 +                    if (distance2(cgsc,x[a]) > hbuf2)
 +                    {
 +                        nBeyond++;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +    
 +    return nBeyond;
 +}
index 88dbfef23d5e410620e159ccd2548fe6bf4e53f6,0000000000000000000000000000000000000000..058025fb0a8d6e1f6af4ba1c7f60e36c11e40c3a
mode 100644,000000..100644
--- /dev/null
@@@ -1,714 -1,0 +1,713 @@@
-         case F_VTEMP:
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROningen Mixture of Alchemy and Childrens' Stories
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <string.h>
 +#include <stdio.h>
 +#include "typedefs.h"
 +#include "sysstuff.h"
 +#include "gmx_fatal.h"
 +#include "network.h"
 +#include "txtdump.h"
 +#include "names.h"
 +#include "physics.h"
 +#include "vec.h"
 +#include "maths.h"
 +#include "mvdata.h"
 +#include "main.h"
 +#include "force.h"
 +#include "vcm.h"
 +#include "smalloc.h"
 +#include "futil.h"
 +#include "network.h"
 +#include "rbin.h"
 +#include "tgroup.h"
 +#include "xtcio.h"
 +#include "gmxfio.h"
 +#include "trnio.h"
 +#include "statutil.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "constr.h"
 +#include "checkpoint.h"
 +#include "xvgr.h"
 +#include "md_support.h"
 +#include "mdrun.h"
 +#include "sim_util.h"
 +
 +/* Working storage for global_stat(); created by global_stat_init() and
 + * released by global_stat_destroy().
 + */
 +typedef struct gmx_global_stat
 +{
 +    t_bin *rb;    /* bin from mk_bin(); the data to be summed is packed in here */
 +    int   *itc0;  /* ir->opts.ngtc entries: bin index of each tcstat[].ekinh_old */
 +    int   *itc1;  /* ir->opts.ngtc entries: bin index of each tcstat[].ekinf or ekinh */
 +} t_gmx_global_stat;
 +
 +gmx_global_stat_t global_stat_init(t_inputrec *ir)
 +{
 +    gmx_global_stat_t gs;
 +
 +    snew(gs,1);
 +    
 +    gs->rb = mk_bin();
 +    snew(gs->itc0,ir->opts.ngtc);
 +    snew(gs->itc1,ir->opts.ngtc);
 +
 +    return gs;
 +}
 +
 +void global_stat_destroy(gmx_global_stat_t gs)
 +{
 +    destroy_bin(gs->rb);
 +    sfree(gs->itc0);
 +    sfree(gs->itc1);
 +    sfree(gs);
 +}
 +
 +static int filter_enerdterm(real *afrom, gmx_bool bToBuffer, real *ato,
 +                            gmx_bool bTemp, gmx_bool bPres, gmx_bool bEner) {
 +    int i,to,from;
 +
 +    from = 0;
 +    to   = 0;
 +    for (i=0;i<F_NRE;i++)
 +    {
 +        if (bToBuffer)
 +        {
 +            from = i;
 +        }
 +        else
 +        {
 +            to = i;
 +        }
 +        switch (i) {
 +        case F_EKIN:
 +        case F_TEMP:
 +        case F_DKDL:
 +            if (bTemp)
 +            {
 +                ato[to++] = afrom[from++];
 +            }
 +            break;
 +        case F_PRES:    
 +        case F_PDISPCORR:
 +            if (bPres)
 +            {
 +                ato[to++] = afrom[from++];
 +            }
 +            break;
 +        default:
 +            if (bEner)
 +            {
 +                ato[to++] = afrom[from++];
 +            }
 +            break;
 +        }
 +    }
 +
 +    return to;
 +}
 +
 +/* Sum a set of quantities over the nodes in cr.
 + *
 + * The routine works in three phases:
 + *  1. the selected data are added to the bin gs->rb with add_binr() and
 + *     add_bind(); the returned bin indices are remembered;
 + *  2. sum_bin(rb,cr) performs the summation;
 + *  3. the summed values are copied back into the same arrays with
 + *     extract_binr() and extract_bind(), using the remembered indices.
 + *
 + * Which data take part is controlled by the CGLO_* bits in flags, by the
 + * integrator (inputrec->eI) and by which of the optional arguments
 + * (ekind, constr, vcm, nsig/sig) are set.
 + */
 +void global_stat(FILE *fplog,gmx_global_stat_t gs,
 +                 t_commrec *cr,gmx_enerdata_t *enerd,
 +                 tensor fvir,tensor svir,rvec mu_tot,
 +                 t_inputrec *inputrec,
 +                 gmx_ekindata_t *ekind,gmx_constr_t constr,
 +                 t_vcm *vcm,
 +                 int nsig,real *sig,
 +                 gmx_mtop_t *top_global, t_state *state_local, 
 +                 gmx_bool bSumEkinhOld, int flags)
 +/* instead of current system, gmx_booleans for summing virial, kinetic energy, and other terms */
 +{
 +  t_bin  *rb;
 +  int    *itc0,*itc1;
 +  /* Bin indices returned by the add_bin* calls */
 +  int    ie=0,ifv=0,isv=0,irmsd=0,imu=0;
 +  int    idedl=0,idvdll=0,idvdlnl=0,iepl=0,icm=0,imass=0,ica=0,inb=0;
 +  int    isig=-1;
 +  int    icj=-1,ici=-1,icx=-1;
 +  int    inn[egNR];
 +  real   copyenerd[F_NRE];
 +  int    nener,j;
 +  real   *rmsd_data=NULL;
 +  double nb;
 +  gmx_bool   bVV,bTemp,bEner,bPres,bConstrVir,bEkinAveVel,bFirstIterate,bReadEkin;
 +
 +  bVV           = EI_VV(inputrec->eI);
 +  bTemp         = flags & CGLO_TEMPERATURE;
 +  bEner         = flags & CGLO_ENERGY;
 +  bPres         = (flags & CGLO_PRESSURE); 
 +  bConstrVir    = (flags & CGLO_CONSTRAINT);
 +  bFirstIterate = (flags & CGLO_FIRSTITERATE);
 +  bEkinAveVel   = (inputrec->eI==eiVV || (inputrec->eI==eiVVAK && bPres));
 +  bReadEkin     = (flags & CGLO_READEKIN);
 +
 +  rb   = gs->rb;
 +  itc0 = gs->itc0;
 +  itc1 = gs->itc1;
 +  
 +
 +  reset_bin(rb);
 +  /* This routine copies all the data to be summed to one big buffer
 +   * using the t_bin struct. 
 +   */
 +
 +  /* First, we need to identify which enerd->term should be
 +     communicated.  Temperature and pressure terms should only be
 +     communicated and summed when they need to be, to avoid repeating
 +     the sums and overcounting. */
 +
 +  /* copyenerd receives the selected terms packed contiguously */
 +  nener = filter_enerdterm(enerd->term,TRUE,copyenerd,bTemp,bPres,bEner);
 +  
 +  /* First, the data that needs to be communicated with velocity verlet every time
 +     This is just the constraint virial.*/
 +  if (bConstrVir) {
 +      isv = add_binr(rb,DIM*DIM,svir[0]);
 +      where();
 +  }
 +  
 +/* We need the force virial and the kinetic energy for the first time through with velocity verlet */
 +  if (bTemp || !bVV)
 +  {
 +      if (ekind) 
 +      {
 +          for(j=0; (j<inputrec->opts.ngtc); j++) 
 +          {
 +              if (bSumEkinhOld) 
 +              {
 +                  itc0[j]=add_binr(rb,DIM*DIM,ekind->tcstat[j].ekinh_old[0]);
 +              }
 +              /* Either ekinf or ekinh is summed, and neither when bReadEkin is set */
 +              if (bEkinAveVel && !bReadEkin) 
 +              {
 +                  itc1[j]=add_binr(rb,DIM*DIM,ekind->tcstat[j].ekinf[0]);
 +              } 
 +              else if (!bReadEkin)
 +              {
 +                  itc1[j]=add_binr(rb,DIM*DIM,ekind->tcstat[j].ekinh[0]);
 +              }
 +          }
 +          /* these probably need to be put into one of these categories */
 +          where();
 +          idedl = add_binr(rb,1,&(ekind->dekindl));
 +          where();
 +          ica   = add_binr(rb,1,&(ekind->cosacc.mvcos));
 +          where();
 +      }  
 +  }      
 +  where();
 +  
 +  if ((bPres || !bVV) && bFirstIterate)
 +  {
 +      ifv = add_binr(rb,DIM*DIM,fvir[0]);
 +  }
 +
 +
 +  if (bEner) 
 +  { 
 +      where();
 +      if (bFirstIterate) 
 +      {
 +          ie  = add_binr(rb,nener,copyenerd);
 +      }
 +      where();
 +      /* NOTE(review): the constraint rmsd data and mu_tot are added here
 +       * whenever bEner is set, but they are only extracted below when
 +       * bFirstIterate is set as well.
 +       */
 +      if (constr) 
 +      {
 +          rmsd_data = constr_rmsd_data(constr);
 +          if (rmsd_data) 
 +          {
 +              irmsd = add_binr(rb,inputrec->eI==eiSD2 ? 3 : 2,rmsd_data);
 +          }
 +      } 
 +      if (!NEED_MUTOT(*inputrec)) 
 +      {
 +          imu = add_binr(rb,DIM,mu_tot);
 +          where();
 +      }
 +      
 +      if (bFirstIterate) 
 +      {
 +          for(j=0; (j<egNR); j++)
 +          {
 +              inn[j]=add_binr(rb,enerd->grpp.nener,enerd->grpp.ener[j]);
 +          }
 +          where();
 +          if (inputrec->efep != efepNO) 
 +          {
 +              idvdll  = add_bind(rb,efptNR,enerd->dvdl_lin);
 +              idvdlnl = add_bind(rb,efptNR,enerd->dvdl_nonlin);
 +              if (enerd->n_lambda > 0) 
 +              {
 +                  iepl = add_bind(rb,enerd->n_lambda,enerd->enerpart_lambda);
 +              }
 +          }
 +      }
 +  }
 +
 +  if (vcm)
 +  {
 +      icm   = add_binr(rb,DIM*vcm->nr,vcm->group_p[0]);
 +      where();
 +      imass = add_binr(rb,vcm->nr,vcm->group_mass);
 +      where();
 +      if (vcm->mode == ecmANGULAR)
 +      {
 +          icj   = add_binr(rb,DIM*vcm->nr,vcm->group_j[0]);
 +          where();
 +          icx   = add_binr(rb,DIM*vcm->nr,vcm->group_x[0]);
 +          where();
 +          ici   = add_binr(rb,DIM*DIM*vcm->nr,vcm->group_i[0][0]);
 +          where();
 +      }
 +  }
 +
 +  if (DOMAINDECOMP(cr)) 
 +  {
 +      /* The local bonded interaction count is summed as a double */
 +      nb = cr->dd->nbonded_local;
 +      inb = add_bind(rb,1,&nb);
 +      }
 +  where();
 +  if (nsig > 0) 
 +  {
 +      isig = add_binr(rb,nsig,sig);
 +  }
 +
 +  /* Global sum it all */
 +  if (debug)
 +  {
 +      fprintf(debug,"Summing %d energies\n",rb->maxreal);
 +  }
 +  sum_bin(rb,cr);
 +  where();
 +
 +  /* Extract all the data locally */
 +
 +  if (bConstrVir) 
 +  {
 +      extract_binr(rb,isv ,DIM*DIM,svir[0]);
 +  }
 +
 +  /* We need the force virial and the kinetic energy for the first time through with velocity verlet */
 +  if (bTemp || !bVV)
 +  {
 +      if (ekind) 
 +      {
 +          for(j=0; (j<inputrec->opts.ngtc); j++) 
 +          {
 +              if (bSumEkinhOld)
 +              {
 +                  extract_binr(rb,itc0[j],DIM*DIM,ekind->tcstat[j].ekinh_old[0]);
 +              }
 +              if (bEkinAveVel && !bReadEkin) {
 +                  extract_binr(rb,itc1[j],DIM*DIM,ekind->tcstat[j].ekinf[0]);
 +              }
 +              else if (!bReadEkin)
 +              {
 +                  extract_binr(rb,itc1[j],DIM*DIM,ekind->tcstat[j].ekinh[0]);              
 +              }
 +          }
 +          extract_binr(rb,idedl,1,&(ekind->dekindl));
 +          extract_binr(rb,ica,1,&(ekind->cosacc.mvcos));
 +          where();
 +      }
 +  }
 +  if ((bPres || !bVV) && bFirstIterate)
 +  {
 +      extract_binr(rb,ifv ,DIM*DIM,fvir[0]);
 +  }
 +
 +  if (bEner) 
 +  {
 +      if (bFirstIterate) 
 +      {
 +          extract_binr(rb,ie,nener,copyenerd);
 +          if (rmsd_data) 
 +          {
 +              extract_binr(rb,irmsd,inputrec->eI==eiSD2 ? 3 : 2,rmsd_data);
 +          }
 +          if (!NEED_MUTOT(*inputrec))
 +          {
 +              extract_binr(rb,imu,DIM,mu_tot);
 +          }
 +
 +          for(j=0; (j<egNR); j++)
 +          {
 +              extract_binr(rb,inn[j],enerd->grpp.nener,enerd->grpp.ener[j]);
 +          }
 +          if (inputrec->efep != efepNO) 
 +          {
 +              extract_bind(rb,idvdll ,efptNR,enerd->dvdl_lin);
 +              extract_bind(rb,idvdlnl,efptNR,enerd->dvdl_nonlin);
 +              if (enerd->n_lambda > 0) 
 +              {
 +                  extract_bind(rb,iepl,enerd->n_lambda,enerd->enerpart_lambda);
 +              }
 +          }
 +          if (DOMAINDECOMP(cr)) 
 +          {
 +              /* Compare the summed, rounded bonded interaction count with
 +               * the expected global count and report any mismatch.
 +               */
 +              extract_bind(rb,inb,1,&nb);
 +              if ((int)(nb + 0.5) != cr->dd->nbonded_global) 
 +              {
 +                  dd_print_missing_interactions(fplog,cr,(int)(nb + 0.5),top_global,state_local);
 +              }
 +          }
 +          where();
 +
 +          /* Unpack the summed terms back into their slots in enerd->term */
 +          filter_enerdterm(copyenerd,FALSE,enerd->term,bTemp,bPres,bEner);    
 +      }
 +  }
 +
 +  if (vcm)
 +  {
 +      extract_binr(rb,icm,DIM*vcm->nr,vcm->group_p[0]);
 +      where();
 +      extract_binr(rb,imass,vcm->nr,vcm->group_mass);
 +      where();
 +      if (vcm->mode == ecmANGULAR)
 +      {
 +          extract_binr(rb,icj,DIM*vcm->nr,vcm->group_j[0]);
 +          where();
 +          extract_binr(rb,icx,DIM*vcm->nr,vcm->group_x[0]);
 +          where();
 +          extract_binr(rb,ici,DIM*DIM*vcm->nr,vcm->group_i[0][0]);
 +          where();
 +      }
 +  }
 +
 +  if (nsig > 0) 
 +  {
 +      extract_binr(rb,isig,nsig,sig);
 +  }
 +  where();
 +}
 +
 +int do_per_step(gmx_large_int_t step,gmx_large_int_t nstep)
 +{
 +  if (nstep != 0) 
 +    return ((step % nstep)==0); 
 +  else 
 +    return 0;
 +}
 +
 +static void moveit(t_commrec *cr,
 +                 int left,int right,const char *s,rvec xx[])
 +{
 +  if (!xx) 
 +    return;
 +
 +  move_rvecs(cr,FALSE,FALSE,left,right,
 +           xx,NULL,(cr->nnodes-cr->npmenodes)-1,NULL);
 +}
 +
 +gmx_mdoutf_t *init_mdoutf(int nfile,const t_filenm fnm[],int mdrun_flags,
 +                          const t_commrec *cr,const t_inputrec *ir,
 +                          const output_env_t oenv)
 +{
 +    gmx_mdoutf_t *of;
 +    char filemode[3];
 +    gmx_bool bAppendFiles;
 +
 +    snew(of,1);
 +
 +    of->fp_trn   = NULL;
 +    of->fp_ene   = NULL;
 +    of->fp_xtc   = NULL;
 +    of->fp_dhdl  = NULL;
 +    of->fp_field = NULL;
 +    
 +    of->eIntegrator     = ir->eI;
 +    of->bExpanded       = ir->bExpanded;
 +    of->elamstats       = ir->expandedvals->elamstats;
 +    of->simulation_part = ir->simulation_part;
 +
 +    if (MASTER(cr))
 +    {
 +        bAppendFiles = (mdrun_flags & MD_APPENDFILES);
 +
 +        of->bKeepAndNumCPT = (mdrun_flags & MD_KEEPANDNUMCPT);
 +
 +        sprintf(filemode, bAppendFiles ? "a+" : "w+");  
 +        
 +        if ((EI_DYNAMICS(ir->eI) || EI_ENERGY_MINIMIZATION(ir->eI))
 +#ifndef GMX_FAHCORE
 +            &&
 +            !(EI_DYNAMICS(ir->eI) &&
 +              ir->nstxout == 0 &&
 +              ir->nstvout == 0 &&
 +              ir->nstfout == 0)
 +#endif
 +          )
 +        {
 +            of->fp_trn = open_trn(ftp2fn(efTRN,nfile,fnm), filemode);
 +        }
 +        if (EI_DYNAMICS(ir->eI) &&
 +            ir->nstxtcout > 0)
 +        {
 +            of->fp_xtc = open_xtc(ftp2fn(efXTC,nfile,fnm), filemode);
 +            of->xtc_prec = ir->xtcprec;
 +        }
 +        if (EI_DYNAMICS(ir->eI) || EI_ENERGY_MINIMIZATION(ir->eI))
 +        {
 +            of->fp_ene = open_enx(ftp2fn(efEDR,nfile,fnm), filemode);
 +        }
 +        of->fn_cpt = opt2fn("-cpo",nfile,fnm);
 +        
 +        if ((ir->efep != efepNO || ir->bSimTemp) && ir->fepvals->nstdhdl > 0 &&
 +            (ir->fepvals->separate_dhdl_file == esepdhdlfileYES ) &&
 +            EI_DYNAMICS(ir->eI))
 +        {
 +            if (bAppendFiles)
 +            {
 +                of->fp_dhdl = gmx_fio_fopen(opt2fn("-dhdl",nfile,fnm),filemode);
 +            }
 +            else
 +            {
 +                of->fp_dhdl = open_dhdl(opt2fn("-dhdl",nfile,fnm),ir,oenv);
 +            }
 +        }
 +        
 +        if (opt2bSet("-field",nfile,fnm) &&
 +            (ir->ex[XX].n || ir->ex[YY].n || ir->ex[ZZ].n))
 +        {
 +            if (bAppendFiles)
 +            {
 +                of->fp_dhdl = gmx_fio_fopen(opt2fn("-field",nfile,fnm),
 +                                            filemode);
 +            }
 +            else
 +            {                           
 +                of->fp_field = xvgropen(opt2fn("-field",nfile,fnm),
 +                                        "Applied electric field","Time (ps)",
 +                                        "E (V/nm)",oenv);
 +            }
 +        }
 +    }
 +
 +    return of;
 +}
 +
 +void done_mdoutf(gmx_mdoutf_t *of)
 +{
 +    if (of->fp_ene != NULL)
 +    {
 +        close_enx(of->fp_ene);
 +    }
 +    if (of->fp_xtc)
 +    {
 +        close_xtc(of->fp_xtc);
 +    }
 +    if (of->fp_trn)
 +    {
 +        close_trn(of->fp_trn);
 +    }
 +    if (of->fp_dhdl != NULL)
 +    {
 +        gmx_fio_fclose(of->fp_dhdl);
 +    }
 +    if (of->fp_field != NULL)
 +    {
 +        gmx_fio_fclose(of->fp_field);
 +    }
 +
 +    sfree(of);
 +}
 +
 +/* Collect the requested data and let the master node write it.
 + *
 + * mdof_flags is a combination of MDOF_X, MDOF_V, MDOF_F, MDOF_XTC and
 + * MDOF_CPT and selects what is collected and written:
 + *  - MDOF_CPT             a checkpoint via write_checkpoint();
 + *  - MDOF_X/MDOF_V/MDOF_F a frame in of->fp_trn via fwrite_trn();
 + *  - MDOF_XTC             a frame in of->fp_xtc via write_xtc().
 + *
 + * n_xtc and x_xtc cache the xtc atom selection between calls: when *n_xtc
 + * is -1 on entry the number of atoms with ggrpnr(groups,egcXTC,i) == 0 is
 + * counted and, if that is not all atoms, *x_xtc is allocated.
 + */
 +void write_traj(FILE *fplog,t_commrec *cr,
 +                gmx_mdoutf_t *of,
 +                int mdof_flags,
 +                gmx_mtop_t *top_global,
 +                gmx_large_int_t step,double t,
 +                t_state *state_local,t_state *state_global,
 +                rvec *f_local,rvec *f_global,
 +                int *n_xtc,rvec **x_xtc)
 +{
 +    int     i,j;
 +    gmx_groups_t *groups;
 +    rvec    *xxtc;
 +    rvec *local_v;
 +    rvec *global_v;
 +    
 +/* Shorthand for moveit() between the left and right neighbour node */
 +#define MX(xvf) moveit(cr,GMX_LEFT,GMX_RIGHT,#xvf,xvf)
 +
 +    /* MRS -- defining these variables is to manage the difference
 +     * between half step and full step velocities, but there must be a better way . . . */
 +
 +    local_v  = state_local->v;
 +    global_v = state_global->v;
 +    
 +    if (DOMAINDECOMP(cr))
 +    {
 +        /* A checkpoint collects the complete state; otherwise only the
 +         * vectors that will be written are collected.
 +         */
 +        if (mdof_flags & MDOF_CPT)
 +        {
 +            dd_collect_state(cr->dd,state_local,state_global);
 +        }
 +        else
 +        {
 +            if (mdof_flags & (MDOF_X | MDOF_XTC))
 +            {
 +                dd_collect_vec(cr->dd,state_local,state_local->x,
 +                               state_global->x);
 +            }
 +            if (mdof_flags & MDOF_V)
 +            {
 +                dd_collect_vec(cr->dd,state_local,local_v,
 +                               global_v);
 +            }
 +        }
 +        if (mdof_flags & MDOF_F)
 +        {
 +            dd_collect_vec(cr->dd,state_local,f_local,f_global);
 +        }
 +    }
 +    else
 +    {
 +        if (mdof_flags & MDOF_CPT)
 +        {
 +            /* All pointers in state_local are equal to state_global,
 +             * but we need to copy the non-pointer entries.
 +             */
 +            state_global->lambda = state_local->lambda;
 +            state_global->veta = state_local->veta;
 +            state_global->vol0 = state_local->vol0;
 +            copy_mat(state_local->box,state_global->box);
 +            copy_mat(state_local->boxv,state_global->boxv);
 +            copy_mat(state_local->svir_prev,state_global->svir_prev);
 +            copy_mat(state_local->fvir_prev,state_global->fvir_prev);
 +            copy_mat(state_local->pres_prev,state_global->pres_prev);
 +        }
 +        if (cr->nnodes > 1)
 +        {
 +            /* Particle decomposition, collect the data on the master node */
 +            if (mdof_flags & MDOF_CPT)
 +            {
 +                if (state_local->flags & (1<<estX))   MX(state_global->x);
 +                if (state_local->flags & (1<<estV))   MX(state_global->v);
 +                if (state_local->flags & (1<<estSDX)) MX(state_global->sd_X);
 +                if (state_global->nrngi > 1) {
 +                    /* The random number generator data are gathered as raw
 +                     * bytes; without GMX_MPI nothing is done here.
 +                     */
 +                    if (state_local->flags & (1<<estLD_RNG)) {
 +#ifdef GMX_MPI
 +                        MPI_Gather(state_local->ld_rng ,
 +                                   state_local->nrng*sizeof(state_local->ld_rng[0]),MPI_BYTE,
 +                                   state_global->ld_rng,
 +                                   state_local->nrng*sizeof(state_local->ld_rng[0]),MPI_BYTE,
 +                                   MASTERRANK(cr),cr->mpi_comm_mygroup);
 +#endif
 +                    }
 +                    if (state_local->flags & (1<<estLD_RNGI))
 +                    {
 +#ifdef GMX_MPI
 +                        MPI_Gather(state_local->ld_rngi,
 +                                   sizeof(state_local->ld_rngi[0]),MPI_BYTE,
 +                                   state_global->ld_rngi,
 +                                   sizeof(state_local->ld_rngi[0]),MPI_BYTE,
 +                                   MASTERRANK(cr),cr->mpi_comm_mygroup);
 +#endif
 +                    }
 +                }
 +            }
 +            else
 +            {
 +                if (mdof_flags & (MDOF_X | MDOF_XTC)) MX(state_global->x);
 +                if (mdof_flags & MDOF_V)              MX(global_v);
 +            }
 +            if (mdof_flags & MDOF_F) MX(f_global);
 +         }
 +     }
 +
 +     /* Only the master node writes */
 +     if (MASTER(cr))
 +     {
 +         if (mdof_flags & MDOF_CPT)
 +         {
 +             write_checkpoint(of->fn_cpt,of->bKeepAndNumCPT,
 +                              fplog,cr,of->eIntegrator,of->simulation_part,
 +                              of->bExpanded,of->elamstats,step,t,state_global);
 +         }
 +
 +         if (mdof_flags & (MDOF_X | MDOF_V | MDOF_F))
 +         {
 +            /* Arrays that were not requested are passed as NULL */
 +            fwrite_trn(of->fp_trn,step,t,state_local->lambda[efptFEP],
 +                       state_local->box,top_global->natoms,
 +                       (mdof_flags & MDOF_X) ? state_global->x : NULL,
 +                       (mdof_flags & MDOF_V) ? global_v : NULL,
 +                       (mdof_flags & MDOF_F) ? f_global : NULL);
 +            if (gmx_fio_flush(of->fp_trn) != 0)
 +            {
 +                gmx_file("Cannot write trajectory; maybe you are out of disk space?");
 +            }
 +            gmx_fio_check_file_position(of->fp_trn);
 +        }      
 +        if (mdof_flags & MDOF_XTC) {
 +            groups = &top_global->groups;
 +            if (*n_xtc == -1)
 +            {
 +                /* First call: count the atoms in xtc group 0 and allocate
 +                 * a separate buffer only when that is a subset of all atoms.
 +                 */
 +                *n_xtc = 0;
 +                for(i=0; (i<top_global->natoms); i++)
 +                {
 +                    if (ggrpnr(groups,egcXTC,i) == 0)
 +                    {
 +                        (*n_xtc)++;
 +                    }
 +                }
 +                if (*n_xtc != top_global->natoms)
 +                {
 +                    snew(*x_xtc,*n_xtc);
 +                }
 +            }
 +            if (*n_xtc == top_global->natoms)
 +            {
 +                /* All atoms are written, use the global coordinates directly */
 +                xxtc = state_global->x;
 +            }
 +            else
 +            {
 +                /* Pack the selected atoms into the separate buffer */
 +                xxtc = *x_xtc;
 +                j = 0;
 +                for(i=0; (i<top_global->natoms); i++)
 +                {
 +                    if (ggrpnr(groups,egcXTC,i) == 0)
 +                    {
 +                        copy_rvec(state_global->x[i],xxtc[j++]);
 +                    }
 +                }
 +            }
 +            if (write_xtc(of->fp_xtc,*n_xtc,step,t,
 +                          state_local->box,xxtc,of->xtc_prec) == 0)
 +            {
 +                gmx_fatal(FARGS,"XTC error - maybe you are out of disk space?");
 +            }
 +            gmx_fio_check_file_position(of->fp_xtc);
 +        }
 +    }
 +}
 +
index 86e364424e75d222ad0af75405842cd29028e761,0000000000000000000000000000000000000000..1a2a9ffc32428e2ef3b2e53dd4a69c6a890bae4c
mode 100644,000000..100644
--- /dev/null
@@@ -1,2072 -1,0 +1,2079 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <stdio.h>
 +#include "typedefs.h"
 +#include "vsite.h"
 +#include "macros.h"
 +#include "smalloc.h"
 +#include "nrnb.h"
 +#include "vec.h"
 +#include "mvdata.h"
 +#include "network.h"
 +#include "mshift.h"
 +#include "pbc.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "mtop_util.h"
 +#include "gmx_omp_nthreads.h"
 +#include "gmx_omp.h"
 +
 +/* Routines to send/receive coordinates and forces
 + * of constructing atoms.
 + */
 +/* Exchange coordinates of constructing atoms with both neighbors: one pulse to GMX_LEFT, then one to GMX_RIGHT; the imported entries of x are overwritten. */
 +static void move_construct_x(t_comm_vsites *vsitecomm, rvec x[], t_commrec *cr)
 +{
 +      rvec *sendbuf;
 +      rvec *recvbuf;
 +      int i,ia;
 +      
 +      sendbuf = vsitecomm->send_buf;
 +      recvbuf = vsitecomm->recv_buf;
 +      /* The buffers are passed to gmx_tx_rx_real as real arrays, hence the factor 3 per rvec in the counts */
 +
 +              /* Prepare pulse left by copying to send buffer */
 +              for(i=0;i<vsitecomm->left_export_nconstruct;i++)
 +              {
 +                      ia = vsitecomm->left_export_construct[i];
 +                      copy_rvec(x[ia],sendbuf[i]);
 +              }
 +      
 +              /* Pulse coordinates left */
 +              gmx_tx_rx_real(cr,GMX_LEFT,(real *)sendbuf,3*vsitecomm->left_export_nconstruct,GMX_RIGHT,(real *)recvbuf,3*vsitecomm->right_import_nconstruct);
 +              
 +              /* Copy from receive buffer to coordinate array */
 +              for(i=0;i<vsitecomm->right_import_nconstruct;i++)
 +              {
 +                      ia = vsitecomm->right_import_construct[i];
 +                      copy_rvec(recvbuf[i],x[ia]);
 +              }
 +
 +              /* Prepare pulse right by copying to send buffer */
 +              for(i=0;i<vsitecomm->right_export_nconstruct;i++)
 +              {
 +                      ia = vsitecomm->right_export_construct[i];
 +                      copy_rvec(x[ia],sendbuf[i]);
 +              }
 +              
 +              /* Pulse coordinates right */
 +              gmx_tx_rx_real(cr,GMX_RIGHT,(real *)sendbuf,3*vsitecomm->right_export_nconstruct,GMX_LEFT,(real *)recvbuf,3*vsitecomm->left_import_nconstruct);
 +              
 +              /* Copy from receive buffer to coordinate array */
 +              for(i=0;i<vsitecomm->left_import_nconstruct;i++)
 +              {
 +                      ia = vsitecomm->left_import_construct[i];
 +                      copy_rvec(recvbuf[i],x[ia]);
 +              }
 +}
 +
 +/* Reverse of move_construct_x for forces: send the forces on imported constructing atoms to the neighbor they came from, zero them locally, and add the received forces to the exported atoms in f. */
 +static void move_construct_f(t_comm_vsites *vsitecomm, rvec f[], t_commrec *cr)
 +{
 +      rvec *sendbuf;
 +      rvec *recvbuf;
 +      int i,ia;
 +
 +      sendbuf = vsitecomm->send_buf;
 +      recvbuf = vsitecomm->recv_buf;  
 +
 +              /* Prepare pulse right by copying to send buffer */
 +              for(i=0;i<vsitecomm->right_import_nconstruct;i++)
 +              {
 +                      ia = vsitecomm->right_import_construct[i];
 +                      copy_rvec(f[ia],sendbuf[i]);
 +                      clear_rvec(f[ia]); /* Zero it here after moving, just to simplify debug book-keeping... */
 +              }
 +              
 +              /* Pulse forces right */
 +              gmx_tx_rx_real(cr,GMX_RIGHT,(real *)sendbuf,3*vsitecomm->right_import_nconstruct,GMX_LEFT,(real *)recvbuf,3*vsitecomm->left_export_nconstruct);
 +              
 +              /* Add the forces in the receive buffer to the force array */
 +              for(i=0;i<vsitecomm->left_export_nconstruct;i++)
 +              {
 +                      ia = vsitecomm->left_export_construct[i];
 +                      rvec_inc(f[ia],recvbuf[i]);
 +              }
 +
 +              /* Prepare pulse left by copying to send buffer */
 +              for(i=0;i<vsitecomm->left_import_nconstruct;i++)
 +              {
 +                      ia = vsitecomm->left_import_construct[i];
 +                      copy_rvec(f[ia],sendbuf[i]);
 +                      clear_rvec(f[ia]); /* Zero it here after moving, just to simplify debug book-keeping... */
 +              }
 +              
 +              /* Pulse forces left */
 +              gmx_tx_rx_real(cr,GMX_LEFT,(real *)sendbuf,3*vsitecomm->left_import_nconstruct,GMX_RIGHT,(real *)recvbuf,3*vsitecomm->right_export_nconstruct);
 +              
 +              /* Add the forces in the receive buffer to the force array */
 +              for(i=0;i<vsitecomm->right_export_nconstruct;i++)
 +              {
 +                      ia = vsitecomm->right_export_construct[i];
 +                      rvec_inc(f[ia],recvbuf[i]);
 +              }
 +              
 +      /* All forces are now on the home processors */
 +}
 +
 +/* Zero the entries of f for all constructing atoms listed as imported from the left and from the right neighbor. */
 +static void
 +pd_clear_nonlocal_constructs(t_comm_vsites *vsitecomm, rvec f[])
 +{
 +      int i,ia;
 +      
 +      for(i=0;i<vsitecomm->left_import_nconstruct;i++)
 +      {
 +              ia = vsitecomm->left_import_construct[i];
 +              clear_rvec(f[ia]); 
 +      }
 +      for(i=0;i<vsitecomm->right_import_nconstruct;i++)
 +      {
 +              ia = vsitecomm->right_import_construct[i];
 +              clear_rvec(f[ia]); 
 +      }
 +}
 +
 +
 +/* Store xi - xj in dx: via pbc_dx_aiuc when pbc is non-NULL (its return value is passed on; callers use it as an fshift index), otherwise by plain subtraction, returning CENTRAL. */
 +static int pbc_rvec_sub(const t_pbc *pbc,const rvec xi,const rvec xj,rvec dx)
 +{
 +  if (pbc) {
 +    return pbc_dx_aiuc(pbc,xi,xj,dx);
 +  }
 +  else {
 +    rvec_sub(xi,xj,dx);
 +    return CENTRAL;
 +  }
 +}
 +
 +/* Vsite construction routines */
 +/* Place vsite x on the line through xi and xj, x = (1-a)*xi + a*xj; with pbc the vector from xi to xj comes from pbc_dx_aiuc. */
 +static void constr_vsite2(rvec xi,rvec xj,rvec x,real a,t_pbc *pbc)
 +{
 +  real b;
 +  rvec dx;
 +
 +  b=1.0-a;
 +  /* 1 flop */
 +  
 +  if (pbc) {
 +    pbc_dx_aiuc(pbc,xj,xi,dx);
 +    x[XX] = xi[XX] + a*dx[XX];
 +    x[YY] = xi[YY] + a*dx[YY];
 +    x[ZZ] = xi[ZZ] + a*dx[ZZ];
 +  } else {
 +    x[XX] = b*xi[XX] + a*xj[XX];
 +    x[YY] = b*xi[YY] + a*xj[YY];
 +    x[ZZ] = b*xi[ZZ] + a*xj[ZZ];
 +    /* 9 Flops */
 +  }
 +  
 +  /* TOTAL: 10 flops */
 +}
 +/* Place vsite x as the linear combination x = (1-a-b)*xi + a*xj + b*xk; with pbc the vectors from xi to xj and xk come from pbc_dx_aiuc. */
 +static void constr_vsite3(rvec xi,rvec xj,rvec xk,rvec x,real a,real b,
 +                        t_pbc *pbc)
 +{
 +  real c;
 +  rvec dxj,dxk;
 +
 +  c=1.0-a-b;
 +  /* 2 flops */
 +  
 +  if (pbc) {
 +    pbc_dx_aiuc(pbc,xj,xi,dxj);
 +    pbc_dx_aiuc(pbc,xk,xi,dxk);
 +    x[XX] = xi[XX] + a*dxj[XX] + b*dxk[XX];
 +    x[YY] = xi[YY] + a*dxj[YY] + b*dxk[YY];
 +    x[ZZ] = xi[ZZ] + a*dxj[ZZ] + b*dxk[ZZ];
 +  } else {
 +    x[XX] = c*xi[XX] + a*xj[XX] + b*xk[XX];
 +    x[YY] = c*xi[YY] + a*xj[YY] + b*xk[YY];
 +    x[ZZ] = c*xi[ZZ] + a*xj[ZZ] + b*xk[ZZ];
 +  /* 15 Flops */
 +  }
 +  
 +  /* TOTAL: 17 flops */
 +}
 +/* Place vsite x at xi + b*temp/|temp|, where temp = xij + a*xjk points from i to a point on the line through j and k (fixed distance b from i). */
 +static void constr_vsite3FD(rvec xi,rvec xj,rvec xk,rvec x,real a,real b,
 +                          t_pbc *pbc)
 +{
 +  rvec xij,xjk,temp;
 +  real c;
 +  
 +  pbc_rvec_sub(pbc,xj,xi,xij);
 +  pbc_rvec_sub(pbc,xk,xj,xjk);
 +  /* 6 flops */
 +
 +  /* temp goes from i to a point on the line jk */  
 +  temp[XX] = xij[XX] + a*xjk[XX];
 +  temp[YY] = xij[YY] + a*xjk[YY];
 +  temp[ZZ] = xij[ZZ] + a*xjk[ZZ];
 +  /* 6 flops */
 +  
 +  c=b*gmx_invsqrt(iprod(temp,temp));
 +  /* 6 + 10 flops */
 +  
 +  x[XX] = xi[XX] + c*temp[XX];
 +  x[YY] = xi[YY] + c*temp[YY];
 +  x[ZZ] = xi[ZZ] + c*temp[ZZ];
 +  /* 6 Flops */
 +  
 +  /* TOTAL: 34 flops */
 +}
 +/* Place vsite x at xi + a*xij/|xij| + b*xp/|xp|, where xp is the component of xjk perpendicular to xij. */
 +static void constr_vsite3FAD(rvec xi,rvec xj,rvec xk,rvec x,real a,real b, t_pbc *pbc)
 +{
 +  rvec xij,xjk,xp;
 +  real a1,b1,c1,invdij;
 +  
 +  pbc_rvec_sub(pbc,xj,xi,xij);
 +  pbc_rvec_sub(pbc,xk,xj,xjk);
 +  /* 6 flops */
 +
 +  invdij = gmx_invsqrt(iprod(xij,xij));
 +  c1 = invdij * invdij * iprod(xij,xjk);
 +  xp[XX] = xjk[XX] - c1*xij[XX];
 +  xp[YY] = xjk[YY] - c1*xij[YY];
 +  xp[ZZ] = xjk[ZZ] - c1*xij[ZZ];
 +  a1 = a*invdij;
 +  b1 = b*gmx_invsqrt(iprod(xp,xp));
 +  /* 45 */
 +  
 +  x[XX] = xi[XX] + a1*xij[XX] + b1*xp[XX];
 +  x[YY] = xi[YY] + a1*xij[YY] + b1*xp[YY];
 +  x[ZZ] = xi[ZZ] + a1*xij[ZZ] + b1*xp[ZZ];
 +  /* 12 Flops */
 +  
 +  /* TOTAL: 63 flops */
 +}
 +/* Place vsite x at xi + a*xij + b*xik + c*temp, where temp is the cprod() of xij and xik (the out-of-plane direction). */
 +static void constr_vsite3OUT(rvec xi,rvec xj,rvec xk,rvec x,
 +                           real a,real b,real c,t_pbc *pbc)
 +{
 +  rvec xij,xik,temp;
 +  
 +  pbc_rvec_sub(pbc,xj,xi,xij);
 +  pbc_rvec_sub(pbc,xk,xi,xik);
 +  cprod(xij,xik,temp);
 +  /* 15 Flops */
 +  
 +  x[XX] = xi[XX] + a*xij[XX] + b*xik[XX] + c*temp[XX];
 +  x[YY] = xi[YY] + a*xij[YY] + b*xik[YY] + c*temp[YY];
 +  x[ZZ] = xi[ZZ] + a*xij[ZZ] + b*xik[ZZ] + c*temp[ZZ];
 +  /* 18 Flops */
 +  
 +  /* TOTAL: 33 flops */
 +}
 +/* Place vsite x at xi + c*temp/|temp|, where temp = xij + a*xjk + b*xjl points from i to a point in the plane through j, k and l. */
 +static void constr_vsite4FD(rvec xi,rvec xj,rvec xk,rvec xl,rvec x,
 +                            real a,real b,real c,t_pbc *pbc)
 +{
 +  rvec xij,xjk,xjl,temp;
 +  real d;
 +  
 +  pbc_rvec_sub(pbc,xj,xi,xij);
 +  pbc_rvec_sub(pbc,xk,xj,xjk);
 +  pbc_rvec_sub(pbc,xl,xj,xjl);
 +  /* 9 flops */
 +
 +  /* temp goes from i to a point on the plane jkl */  
 +  temp[XX] = xij[XX] + a*xjk[XX] + b*xjl[XX];
 +  temp[YY] = xij[YY] + a*xjk[YY] + b*xjl[YY];
 +  temp[ZZ] = xij[ZZ] + a*xjk[ZZ] + b*xjl[ZZ];
 +  /* 12 flops */
 +  
 +  d=c*gmx_invsqrt(iprod(temp,temp));
 +  /* 6 + 10 flops */
 +  
 +  x[XX] = xi[XX] + d*temp[XX];
 +  x[YY] = xi[YY] + d*temp[YY];
 +  x[ZZ] = xi[ZZ] + d*temp[ZZ];
 +  /* 6 Flops */
 +  
 +  /* TOTAL: 43 flops */
 +}
 +
 +/* Place vsite x at xi + c*rm/|rm|, where rm is the cprod() of (a*xik - xij) and (b*xil - xij). */
 +static void constr_vsite4FDN(rvec xi,rvec xj,rvec xk,rvec xl,rvec x,
 +                             real a,real b,real c,t_pbc *pbc)
 +{
 +    rvec xij,xik,xil,ra,rb,rja,rjb,rm;
 +    real d;
 +    
 +    pbc_rvec_sub(pbc,xj,xi,xij);
 +    pbc_rvec_sub(pbc,xk,xi,xik);
 +    pbc_rvec_sub(pbc,xl,xi,xil);
 +    /* 9 flops */
 +
 +    ra[XX] = a*xik[XX];
 +    ra[YY] = a*xik[YY];
 +    ra[ZZ] = a*xik[ZZ];
 +    
 +    rb[XX] = b*xil[XX];
 +    rb[YY] = b*xil[YY];
 +    rb[ZZ] = b*xil[ZZ];
 +
 +    /* 6 flops */
 +
 +    rvec_sub(ra,xij,rja);
 +    rvec_sub(rb,xij,rjb);
 +    /* 6 flops */
 +    
 +    cprod(rja,rjb,rm);
 +    /* 9 flops */
 +    
 +    d=c*gmx_invsqrt(norm2(rm));
 +    /* 5+5+1 flops */
 +    
 +    x[XX] = xi[XX] + d*rm[XX];
 +    x[YY] = xi[YY] + d*rm[YY];
 +    x[ZZ] = xi[ZZ] + d*rm[ZZ];
 +    /* 6 Flops */
 +    
 +    /* TOTAL: 47 flops */
 +}
 +
 +/* Construct vsite ia[1] as x[ia[2]] plus the weighted sum of the vectors from that atom to the other constructing atoms; returns 3*n, which the caller uses as the stride through the iatoms list. */
 +static int constr_vsiten(t_iatom *ia, t_iparams ip[],
 +                       rvec *x, t_pbc *pbc)
 +{
 +  rvec xs,x1,dx;
 +  dvec dsum;
 +  int  n3,av,ai,i;
 +  real a;
 +
 +  n3 = 3*ip[ia[0]].vsiten.n;
 +  av = ia[1];
 +  ai = ia[2];
 +  copy_rvec(x[ai],x1);
 +  clear_dvec(dsum);
 +  for(i=3; i<n3; i+=3) { /* entries come in triplets; the first triplet only supplies the reference atom x1 */
 +    ai = ia[i+2];
 +    a = ip[ia[i]].vsiten.a;
 +    if (pbc) {
 +      pbc_dx_aiuc(pbc,x[ai],x1,dx);
 +    } else {
 +      rvec_sub(x[ai],x1,dx);
 +    }
 +    dsum[XX] += a*dx[XX];
 +    dsum[YY] += a*dx[YY];
 +    dsum[ZZ] += a*dx[ZZ];
 +    /* 9 Flops */
 +  }
 +
 +  x[av][XX] = x1[XX] + dsum[XX];
 +  x[av][YY] = x1[YY] + dsum[YY];
 +  x[av][ZZ] = x1[ZZ] + dsum[ZZ];
 +
 +  return n3;
 +}
 +
 +/* Construct the positions of all virtual sites in ilist (every interaction type flagged IF_VSITE) and, when v != NULL, set their velocities to displacement/dt. NOTE(review): nrnb is not used in this function. */
 +void construct_vsites_thread(gmx_vsite_t *vsite,
 +                             rvec x[],t_nrnb *nrnb,
 +                             real dt,rvec *v,
 +                             t_iparams ip[],t_ilist ilist[],
 +                             t_pbc *pbc_null)
 +{
 +    gmx_bool  bPBCAll;
 +    rvec      xpbc,xv,vv,dx;
 +    real      a1,b1,c1,inv_dt;
 +    int       i,inc,ii,nra,nr,tp,ftype;
 +    t_iatom   avsite,ai,aj,ak,al,pbc_atom;
 +    t_iatom   *ia;
 +    t_pbc     *pbc_null2;
 +    int       *vsite_pbc,ishift;
 +    rvec      reftmp,vtmp,rtmp;
 +
 +    if (v != NULL)
 +    {
 +        inv_dt = 1.0/dt;
 +    }
 +    else
 +    {
 +        inv_dt = 1.0;
 +    }
 +
 +    bPBCAll = (pbc_null != NULL && !vsite->bHaveChargeGroups);
 +
 +    pbc_null2 = NULL;
 +    vsite_pbc = NULL;
 +    for(ftype=0; (ftype<F_NRE); ftype++)
 +    {
 +        if ((interaction_function[ftype].flags & IF_VSITE) &&
 +            ilist[ftype].nr > 0)
 +        {
 +            nra    = interaction_function[ftype].nratoms;
 +            inc    = 1 + nra;
 +            nr     = ilist[ftype].nr;
 +            ia     = ilist[ftype].iatoms;
 +
 +            if (bPBCAll)
 +            {
 +                pbc_null2 = pbc_null;
 +            }
 +            else if (pbc_null != NULL)
 +            {
 +                vsite_pbc = vsite->vsite_pbc_loc[ftype-F_VSITE2];
 +            }
 +
 +            for(i=0; i<nr; )
 +            {
 +                tp   = ia[0];
 +
 +                /* The vsite and constructing atoms */
 +                avsite = ia[1];
 +                ai   = ia[2];
 +                aj   = ia[3];
 +
 +                /* Constants for constructing vsites */
 +                a1   = ip[tp].vsite.a;
 +                /* Check what kind of pbc we need to use */
 +                if (bPBCAll)
 +                {
 +                    /* No charge groups, vsite follows its own pbc */
 +                    pbc_atom = avsite;
 +                    copy_rvec(x[avsite],xpbc);
 +                }
 +                else if (vsite_pbc != NULL)
 +                {
 +                    pbc_atom = vsite_pbc[i/(1+nra)];
 +                    if (pbc_atom > -2) /* -1: pbc for the construction only; >= 0: also re-image against atom pbc_atom below */
 +                    {
 +                        if (pbc_atom >= 0)
 +                        {
 +                            /* We need to copy the coordinates here,
 +                             * since for single atom cg's pbc_atom
 +                             * is the vsite itself.
 +                             */
 +                            copy_rvec(x[pbc_atom],xpbc);
 +                        }
 +                        pbc_null2 = pbc_null;
 +                    }
 +                    else
 +                    {
 +                        pbc_null2 = NULL;
 +                    }
 +                }
 +                else
 +                {
 +                    pbc_atom = -2;
 +                }
 +                /* Copy the old position */
 +                copy_rvec(x[avsite],xv);
 +
 +                /* Construct the vsite depending on type */
 +                switch (ftype)
 +                {
 +                case F_VSITE2:
 +                    constr_vsite2(x[ai],x[aj],x[avsite],a1,pbc_null2);
 +                    break;
 +                case F_VSITE3:
 +                    ak = ia[4];
 +                    b1 = ip[tp].vsite.b;
 +                    constr_vsite3(x[ai],x[aj],x[ak],x[avsite],a1,b1,pbc_null2);
 +                    break;
 +                case F_VSITE3FD:
 +                    ak = ia[4];
 +                    b1 = ip[tp].vsite.b;
 +                    constr_vsite3FD(x[ai],x[aj],x[ak],x[avsite],a1,b1,pbc_null2);
 +                    break;
 +                case F_VSITE3FAD:
 +                    ak = ia[4];
 +                    b1 = ip[tp].vsite.b;
 +                    constr_vsite3FAD(x[ai],x[aj],x[ak],x[avsite],a1,b1,pbc_null2);
 +                    break;
 +                case F_VSITE3OUT:
 +                    ak = ia[4];
 +                    b1 = ip[tp].vsite.b;
 +                    c1 = ip[tp].vsite.c;
 +                    constr_vsite3OUT(x[ai],x[aj],x[ak],x[avsite],a1,b1,c1,pbc_null2);
 +                    break;
 +                case F_VSITE4FD:
 +                    ak = ia[4];
 +                    al = ia[5];
 +                    b1 = ip[tp].vsite.b;
 +                    c1 = ip[tp].vsite.c;
 +                    constr_vsite4FD(x[ai],x[aj],x[ak],x[al],x[avsite],a1,b1,c1,
 +                                    pbc_null2);
 +                    break;
 +                case F_VSITE4FDN:
 +                    ak = ia[4];
 +                    al = ia[5];
 +                    b1 = ip[tp].vsite.b;
 +                    c1 = ip[tp].vsite.c;
 +                    constr_vsite4FDN(x[ai],x[aj],x[ak],x[al],x[avsite],a1,b1,c1,
 +                                     pbc_null2);
 +                    break;
 +                case F_VSITEN:
 +                    inc = constr_vsiten(ia,ip,x,pbc_null2); /* variable-length entry: the stride is returned by the constructor */
 +                    break;
 +                default:
 +                    gmx_fatal(FARGS,"No such vsite type %d in %s, line %d",
 +                              ftype,__FILE__,__LINE__);
 +                }
 +
 +                if (pbc_atom >= 0)
 +                {
 +                    /* Match the pbc of this vsite to the rest of its charge group */
 +                    ishift = pbc_dx_aiuc(pbc_null,x[avsite],xpbc,dx);
 +                    if (ishift != CENTRAL)
 +                    {
 +                        rvec_add(xpbc,dx,x[avsite]);
 +                    }
 +                }
 +                if (v != NULL)
 +                {
 +                    /* Calculate velocity of vsite... */
 +                    rvec_sub(x[avsite],xv,vv);
 +                    svmul(inv_dt,vv,v[avsite]);
 +                }
 +
 +                /* Increment loop variables */
 +                i  += inc;
 +                ia += inc;
 +            }
 +        }
 +    }
 +}
 +/* Construct all virtual sites: set up pbc when needed, communicate constructing-atom coordinates when cr is given, then build the vsites (inside an OpenMP parallel region when vsite->nthreads != 1). */
 +void construct_vsites(FILE *log,gmx_vsite_t *vsite,
 +                      rvec x[],t_nrnb *nrnb,
 +                      real dt,rvec *v,
 +                      t_iparams ip[],t_ilist ilist[],
 +                      int ePBC,gmx_bool bMolPBC,t_graph *graph,
 +                      t_commrec *cr,matrix box)
 +{
 +    t_pbc     pbc,*pbc_null;
 +    gmx_bool  bDomDec;
 +    int       nthreads; /* NOTE(review): not used in this function */
 +
 +    bDomDec = cr && DOMAINDECOMP(cr);
 +              
 +    /* We only need to do pbc when we have inter-cg vsites */
 +    if (ePBC != epbcNONE && (bDomDec || bMolPBC) && vsite->n_intercg_vsite)
 +    {
 +        /* This is wasting some CPU time as we now do this multiple times
 +         * per MD step. But how often do we have vsites with full pbc?
 +         */
 +        pbc_null = set_pbc_dd(&pbc,ePBC,cr!=NULL ? cr->dd : NULL,FALSE,box);
 +    }
 +    else
 +    {
 +        pbc_null = NULL;
 +    }
 +
 +    if (cr)
 +    {
 +        if (bDomDec)
 +        {
 +            dd_move_x_vsites(cr->dd,box,x);
 +        }
 +        else if (vsite->bPDvsitecomm)
 +        {
 +            /* I'm not sure whether the periodicity and shift are guaranteed
 +             * to be consistent between different nodes when running e.g. polymers
 +             * in parallel. In this special case we thus unshift/shift,
 +             * but only when necessary. This is to make sure the coordinates
 +             * we move don't end up a box away...
 +             */
 +            if (graph != NULL)
 +            {
 +                unshift_self(graph,box,x);
 +            }
 +
 +            move_construct_x(vsite->vsitecomm,x,cr);
 +
 +            if (graph != NULL)
 +            {
 +                shift_self(graph,box,x);
 +            }
 +        }
 +    }
 +
 +    if (vsite->nthreads == 1)
 +    {
 +        construct_vsites_thread(vsite,
 +                                x,nrnb,dt,v,
 +                                ip,ilist,
 +                                pbc_null);
 +    }
 +    else
 +    {
 +#pragma omp parallel num_threads(vsite->nthreads)
 +        {
 +            construct_vsites_thread(vsite,
 +                                    x,nrnb,dt,v,
 +                                    ip,vsite->tdata[gmx_omp_get_thread_num()].ilist,
 +                                    pbc_null);
 +        }
 +        /* Now we can construct the vsites that might depend on other vsites */
 +        construct_vsites_thread(vsite,
 +                                x,nrnb,dt,v,
 +                                ip,vsite->tdata[vsite->nthreads].ilist,
 +                                pbc_null);
 +    }
 +}
 +/* Spread the force on vsite ia[1] over its two constructing atoms ia[2] and ia[3] with weights (1-a) and a; shift forces are updated when fshift is non-NULL and a non-central shift is found. */
 +static void spread_vsite2(t_iatom ia[],real a,
 +                          rvec x[],rvec f[],rvec fshift[],
 +                          t_pbc *pbc,t_graph *g)
 +{
 +  rvec    fi,fj,dx;
 +  t_iatom av,ai,aj;
 +  ivec    di;
 +  real    b; /* NOTE(review): not used in this function */
 +  int     siv,sij;
 +  
 +  av = ia[1];
 +  ai = ia[2];
 +  aj = ia[3];
 +  
 +  svmul(1-a,f[av],fi);
 +  svmul(  a,f[av],fj);
 +  /* 7 flop */
 +  
 +  rvec_inc(f[ai],fi);
 +  rvec_inc(f[aj],fj);
 +  /* 6 Flops */
 +
 +  if (g) {
 +    ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,av),di);
 +    siv = IVEC2IS(di);
 +    ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,aj),di);
 +    sij = IVEC2IS(di);
 +  } else if (pbc) {
 +    siv = pbc_dx_aiuc(pbc,x[ai],x[av],dx);
 +    sij = pbc_dx_aiuc(pbc,x[ai],x[aj],dx);
 +  } else {
 +    siv = CENTRAL;
 +    sij = CENTRAL;
 +  }
 +
 +  if (fshift && (siv != CENTRAL || sij != CENTRAL)) {
 +    rvec_inc(fshift[siv],f[av]);
 +    rvec_dec(fshift[CENTRAL],fi);
 +    rvec_dec(fshift[sij],fj);
 +  }
 +
 +  /* TOTAL: 13 flops */
 +}
 +/* Construct the virtual sites molecule by molecule for a whole mtop, calling construct_vsites with epbcNONE and without nrnb, velocities, graph, commrec or box. */
 +void construct_vsites_mtop(FILE *log,gmx_vsite_t *vsite,
 +                         gmx_mtop_t *mtop,rvec x[])
 +{
 +  int as,mb,mol;
 +  gmx_molblock_t *molb;
 +  gmx_moltype_t  *molt;
 +
 +  as = 0; /* offset of the current molecule's first atom in x */
 +  for(mb=0; mb<mtop->nmolblock; mb++) {
 +    molb = &mtop->molblock[mb];
 +    molt = &mtop->moltype[molb->type];          
 +    for(mol=0; mol<molb->nmol; mol++) {
 +      construct_vsites(log,vsite,x+as,NULL,0.0,NULL,
 +                     mtop->ffparams.iparams,molt->ilist,
 +                     epbcNONE,TRUE,NULL,NULL,NULL);
 +      as += molt->atoms.nr;
 +    }
 +  }
 +}
 +/* Spread the force on vsite ia[1] over its three constructing atoms ia[2..4] with weights (1-a-b), a and b; shift forces are updated when fshift is non-NULL and a non-central shift is found. */
 +static void spread_vsite3(t_iatom ia[],real a,real b,
 +                          rvec x[],rvec f[],rvec fshift[],
 +                          t_pbc *pbc,t_graph *g)
 +{
 +  rvec    fi,fj,fk,dx;
 +  atom_id av,ai,aj,ak;
 +  ivec    di;
 +  int     siv,sij,sik;
 +
 +  av = ia[1];
 +  ai = ia[2];
 +  aj = ia[3];
 +  ak = ia[4];
 +  
 +  svmul(1-a-b,f[av],fi);
 +  svmul(    a,f[av],fj);
 +  svmul(    b,f[av],fk);
 +  /* 11 flops */
 +
 +  rvec_inc(f[ai],fi);
 +  rvec_inc(f[aj],fj);
 +  rvec_inc(f[ak],fk);
 +  /* 9 Flops */
 +  
 +  if (g) {
 +    ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,ia[1]),di);
 +    siv = IVEC2IS(di);
 +    ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,aj),di);
 +    sij = IVEC2IS(di);
 +    ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,ak),di);
 +    sik = IVEC2IS(di);
 +  } else if (pbc) {
 +    siv = pbc_dx_aiuc(pbc,x[ai],x[av],dx);
 +    sij = pbc_dx_aiuc(pbc,x[ai],x[aj],dx);
 +    sik = pbc_dx_aiuc(pbc,x[ai],x[ak],dx);
 +  } else {
 +    siv = CENTRAL;
 +    sij = CENTRAL;
 +    sik = CENTRAL;
 +  }
 +
 +  if (fshift && (siv!=CENTRAL || sij!=CENTRAL || sik!=CENTRAL)) {
 +    rvec_inc(fshift[siv],f[av]);
 +    rvec_dec(fshift[CENTRAL],fi);
 +    rvec_dec(fshift[sij],fj);
 +    rvec_dec(fshift[sik],fk);
 +  }
 +
 +  /* TOTAL: 20 flops */
 +}
 +/* Spread the force on a 3FD vsite ia[1] over atoms ia[2..4], update the shift forces when fshift is non-NULL, and when VirCorr is set accumulate the virial correction into dxdf. */
 +static void spread_vsite3FD(t_iatom ia[],real a,real b,
 +                            rvec x[],rvec f[],rvec fshift[],
 +                            gmx_bool VirCorr,matrix dxdf,
 +                            t_pbc *pbc,t_graph *g)
 +{
 +  real fx,fy,fz,c,invl,fproj,a1;
 +  rvec xvi,xij,xjk,xix,fv,temp;
 +  t_iatom av,ai,aj,ak;
 +  int     svi,sji,skj,d;
 +  ivec    di;
 +
 +  av = ia[1];
 +  ai = ia[2];
 +  aj = ia[3];
 +  ak = ia[4];
 +  copy_rvec(f[av],fv);
 +  
 +  sji = pbc_rvec_sub(pbc,x[aj],x[ai],xij);
 +  skj = pbc_rvec_sub(pbc,x[ak],x[aj],xjk);
 +  /* 6 flops */
 +
 +  /* xix goes from i to point x on the line jk */  
 +  xix[XX]=xij[XX]+a*xjk[XX];
 +  xix[YY]=xij[YY]+a*xjk[YY];
 +  xix[ZZ]=xij[ZZ]+a*xjk[ZZ];
 +  /* 6 flops */
 +  
 +  invl=gmx_invsqrt(iprod(xix,xix));
 +  c=b*invl;
 +  /* 4 + ?10? flops */
 +  
 +  fproj=iprod(xix,fv)*invl*invl; /* = (xix . f)/(xix . xix) */
 +  
 +  temp[XX]=c*(fv[XX]-fproj*xix[XX]);
 +  temp[YY]=c*(fv[YY]-fproj*xix[YY]);
 +  temp[ZZ]=c*(fv[ZZ]-fproj*xix[ZZ]);
 +  /* 16 */
 +  
 +  /* c is already calculated in constr_vsite3FD
 +     storing c somewhere will save 26 flops!     */
 +  
 +  a1=1-a;
 +  f[ai][XX] += fv[XX] - temp[XX];
 +  f[ai][YY] += fv[YY] - temp[YY];
 +  f[ai][ZZ] += fv[ZZ] - temp[ZZ];
 +  f[aj][XX] += a1*temp[XX];
 +  f[aj][YY] += a1*temp[YY];
 +  f[aj][ZZ] += a1*temp[ZZ];
 +  f[ak][XX] += a*temp[XX];
 +  f[ak][YY] += a*temp[YY];
 +  f[ak][ZZ] += a*temp[ZZ];
 +  /* 19 Flops */
 +
 +  if (g) {
 +    ivec_sub(SHIFT_IVEC(g,ia[1]),SHIFT_IVEC(g,ai),di);
 +    svi = IVEC2IS(di);
 +    ivec_sub(SHIFT_IVEC(g,aj),SHIFT_IVEC(g,ai),di);
 +    sji = IVEC2IS(di);
 +    ivec_sub(SHIFT_IVEC(g,ak),SHIFT_IVEC(g,aj),di);
 +    skj = IVEC2IS(di);
 +  } else if (pbc) {
 +    svi = pbc_rvec_sub(pbc,x[av],x[ai],xvi); /* sji and skj keep the values returned by pbc_rvec_sub above */
 +  } else {
 +    svi = CENTRAL;
 +  }
 +
 +  if (fshift && (svi!=CENTRAL || sji!=CENTRAL || skj!=CENTRAL)) {
 +    rvec_dec(fshift[svi],fv);
 +    fshift[CENTRAL][XX] += fv[XX] - (1 + a)*temp[XX];
 +    fshift[CENTRAL][YY] += fv[YY] - (1 + a)*temp[YY];
 +    fshift[CENTRAL][ZZ] += fv[ZZ] - (1 + a)*temp[ZZ];
 +    fshift[    sji][XX] += temp[XX];
 +    fshift[    sji][YY] += temp[YY];
 +    fshift[    sji][ZZ] += temp[ZZ];
 +    fshift[    skj][XX] += a*temp[XX];
 +    fshift[    skj][YY] += a*temp[YY];
 +    fshift[    skj][ZZ] += a*temp[ZZ];
 +  }
 +
 +    if (VirCorr)
 +    {
 +        /* When VirCorr=TRUE, the virial for the current forces is not
 +         * calculated from the redistributed forces. This means that
 +         * the effect of non-linear virtual site constructions on the virial
 +         * needs to be added separately. This contribution can be calculated
 +         * in many ways, but the simplest and cheapest way is to use
 +         * the first constructing atom ai as a reference position in space:
 +         * subtract (xv-xi)*fv and add (xj-xi)*fj + (xk-xi)*fk.
 +         */
 +        rvec xiv;
 +        int  i,j;
 +
 +        pbc_rvec_sub(pbc,x[av],x[ai],xiv);
 +
 +        for(i=0; i<DIM; i++)
 +        {
 +            for(j=0; j<DIM; j++)
 +            {
 +                /* As xix is a linear combination of j and k, use that here */
 +                dxdf[i][j] += -xiv[i]*fv[j] + xix[i]*temp[j];
 +            }
 +        }
 +    }
 +
 +  /* TOTAL: 61 flops */
 +}
 +/* Spread the force on a 3FAD vsite ia[1] over atoms ia[2..4], update the shift forces when fshift is non-NULL, and when VirCorr is set accumulate the virial correction into dxdf. */
 +static void spread_vsite3FAD(t_iatom ia[],real a,real b,
 +                             rvec x[],rvec f[],rvec fshift[],
 +                             gmx_bool VirCorr,matrix dxdf,
 +                             t_pbc *pbc,t_graph *g)
 +{
 +  rvec    xvi,xij,xjk,xperp,Fpij,Fppp,fv,f1,f2,f3;
 +  real    a1,b1,c1,c2,invdij,invdij2,invdp,fproj;
 +  t_iatom av,ai,aj,ak;
 +  int     svi,sji,skj,d;
 +  ivec    di;
 +  
 +  av = ia[1];
 +  ai = ia[2];
 +  aj = ia[3];
 +  ak = ia[4];
 +  copy_rvec(f[ia[1]],fv);
 +
 +  sji = pbc_rvec_sub(pbc,x[aj],x[ai],xij);
 +  skj = pbc_rvec_sub(pbc,x[ak],x[aj],xjk);
 +  /* 6 flops */
 +  
 +  invdij = gmx_invsqrt(iprod(xij,xij));
 +  invdij2 = invdij * invdij;
 +  c1 = iprod(xij,xjk) * invdij2;
 +  xperp[XX] = xjk[XX] - c1*xij[XX];
 +  xperp[YY] = xjk[YY] - c1*xij[YY];
 +  xperp[ZZ] = xjk[ZZ] - c1*xij[ZZ];
 +  /* xperp in plane ijk, perp. to ij */
 +  invdp = gmx_invsqrt(iprod(xperp,xperp));
 +  a1 = a*invdij;
 +  b1 = b*invdp;
 +  /* 45 flops */
 +  
 +  /* a1, b1 and c1 are already calculated in constr_vsite3FAD
 +     storing them somewhere will save 45 flops!     */
 +  
 +  fproj=iprod(xij  ,fv)*invdij2;
 +  svmul(fproj,                      xij,  Fpij); /* proj. f on xij */
 +  svmul(iprod(xperp,fv)*invdp*invdp,xperp,Fppp); /* proj. f on xperp */
 +  svmul(b1*fproj,                   xperp,f3);
 +  /* 23 flops */
 +  
 +  rvec_sub(fv,Fpij,f1); /* f1 = f - Fpij */
 +  rvec_sub(f1,Fppp,f2); /* f2 = f - Fpij - Fppp */
 +  for (d=0; (d<DIM); d++) {
 +    f1[d]*=a1;
 +    f2[d]*=b1;
 +  }
 +  /* 12 flops */
 +  
 +  c2=1+c1;
 +  f[ai][XX] += fv[XX] - f1[XX] + c1*f2[XX] + f3[XX];
 +  f[ai][YY] += fv[YY] - f1[YY] + c1*f2[YY] + f3[YY];
 +  f[ai][ZZ] += fv[ZZ] - f1[ZZ] + c1*f2[ZZ] + f3[ZZ];
 +  f[aj][XX] +=          f1[XX] - c2*f2[XX] - f3[XX];
 +  f[aj][YY] +=          f1[YY] - c2*f2[YY] - f3[YY];
 +  f[aj][ZZ] +=          f1[ZZ] - c2*f2[ZZ] - f3[ZZ];
 +  f[ak][XX] +=                      f2[XX];
 +  f[ak][YY] +=                      f2[YY];
 +  f[ak][ZZ] +=                      f2[ZZ];
 +  /* 30 Flops */
 +
 +  if (g) {
 +    ivec_sub(SHIFT_IVEC(g,ia[1]),SHIFT_IVEC(g,ai),di);
 +    svi = IVEC2IS(di);
 +    ivec_sub(SHIFT_IVEC(g,aj),SHIFT_IVEC(g,ai),di);
 +    sji = IVEC2IS(di);
 +    ivec_sub(SHIFT_IVEC(g,ak),SHIFT_IVEC(g,aj),di);
 +    skj = IVEC2IS(di);
 +  } else if (pbc) {
 +    svi = pbc_rvec_sub(pbc,x[av],x[ai],xvi); /* sji and skj keep the values returned by pbc_rvec_sub above */
 +  } else {
 +    svi = CENTRAL;
 +  }
 +
 +  if (fshift && (svi!=CENTRAL || sji!=CENTRAL || skj!=CENTRAL)) {
 +    rvec_dec(fshift[svi],fv);
 +    fshift[CENTRAL][XX] += fv[XX] - f1[XX] - (1-c1)*f2[XX] + f3[XX];
 +    fshift[CENTRAL][YY] += fv[YY] - f1[YY] - (1-c1)*f2[YY] + f3[YY];
 +    fshift[CENTRAL][ZZ] += fv[ZZ] - f1[ZZ] - (1-c1)*f2[ZZ] + f3[ZZ];
 +    fshift[    sji][XX] +=          f1[XX] -    c1 *f2[XX] - f3[XX];
 +    fshift[    sji][YY] +=          f1[YY] -    c1 *f2[YY] - f3[YY];
 +    fshift[    sji][ZZ] +=          f1[ZZ] -    c1 *f2[ZZ] - f3[ZZ];
 +    fshift[    skj][XX] +=                          f2[XX];
 +    fshift[    skj][YY] +=                          f2[YY];
 +    fshift[    skj][ZZ] +=                          f2[ZZ];
 +  }
 +
 +    if (VirCorr)
 +    {
 +        rvec xiv;
 +        int  i,j;
 +
 +        pbc_rvec_sub(pbc,x[av],x[ai],xiv);
 +
 +        for(i=0; i<DIM; i++)
 +        {
 +            for(j=0; j<DIM; j++)
 +            {
 +                /* Note that xik=xij+xjk, so we have to add xij*f2 */
 +                dxdf[i][j] +=
 +                    - xiv[i]*fv[j]
 +                    + xij[i]*(f1[j] + (1 - c2)*f2[j] - f3[j])
 +                    + xjk[i]*f2[j];
 +            }
 +        }
 +    }
 +  
 +  /* TOTAL: 113 flops */
 +}
 +/* Spread the force on a 3OUT vsite ia[1] over atoms ia[2..4], update the shift forces when fshift is non-NULL, and when VirCorr is set accumulate the virial correction into dxdf. */
 +static void spread_vsite3OUT(t_iatom ia[],real a,real b,real c,
 +                             rvec x[],rvec f[],rvec fshift[],
 +                             gmx_bool VirCorr,matrix dxdf,
 +                             t_pbc *pbc,t_graph *g)
 +{
 +  rvec    xvi,xij,xik,fv,fj,fk;
 +  real    cfx,cfy,cfz;
 +  atom_id av,ai,aj,ak;
 +  ivec    di;
 +  int     svi,sji,ski;
 +  
 +  av = ia[1];
 +  ai = ia[2];
 +  aj = ia[3];
 +  ak = ia[4];
 +
 +  sji = pbc_rvec_sub(pbc,x[aj],x[ai],xij);
 +  ski = pbc_rvec_sub(pbc,x[ak],x[ai],xik);
 +  /* 6 Flops */
 +  
 +  copy_rvec(f[av],fv);
 +
 +  cfx = c*fv[XX];
 +  cfy = c*fv[YY];
 +  cfz = c*fv[ZZ];
 +  /* 3 Flops */
 +  
 +  fj[XX] = a*fv[XX]     -  xik[ZZ]*cfy +  xik[YY]*cfz;
 +  fj[YY] =  xik[ZZ]*cfx + a*fv[YY]     -  xik[XX]*cfz;
 +  fj[ZZ] = -xik[YY]*cfx +  xik[XX]*cfy + a*fv[ZZ];
 +  
 +  fk[XX] = b*fv[XX]     +  xij[ZZ]*cfy -  xij[YY]*cfz;
 +  fk[YY] = -xij[ZZ]*cfx + b*fv[YY]     +  xij[XX]*cfz;
 +  fk[ZZ] =  xij[YY]*cfx -  xij[XX]*cfy + b*fv[ZZ];
 +  /* 30 Flops */
 +    
 +  f[ai][XX] += fv[XX] - fj[XX] - fk[XX];
 +  f[ai][YY] += fv[YY] - fj[YY] - fk[YY];
 +  f[ai][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ];
 +  rvec_inc(f[aj],fj);
 +  rvec_inc(f[ak],fk);
 +  /* 15 Flops */
 +
 +  if (g) {
 +    ivec_sub(SHIFT_IVEC(g,ia[1]),SHIFT_IVEC(g,ai),di);
 +    svi = IVEC2IS(di);
 +    ivec_sub(SHIFT_IVEC(g,aj),SHIFT_IVEC(g,ai),di);
 +    sji = IVEC2IS(di);
 +    ivec_sub(SHIFT_IVEC(g,ak),SHIFT_IVEC(g,ai),di);
 +    ski = IVEC2IS(di);
 +  } else if (pbc) {
 +    svi = pbc_rvec_sub(pbc,x[av],x[ai],xvi); /* sji and ski keep the values returned by pbc_rvec_sub above */
 +  } else {
 +    svi = CENTRAL;
 +  }
 +
 +  if (fshift && (svi!=CENTRAL || sji!=CENTRAL || ski!=CENTRAL)) {
 +    rvec_dec(fshift[svi],fv);
 +    fshift[CENTRAL][XX] += fv[XX] - fj[XX] - fk[XX];
 +    fshift[CENTRAL][YY] += fv[YY] - fj[YY] - fk[YY];
 +    fshift[CENTRAL][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ];
 +    rvec_inc(fshift[sji],fj);
 +    rvec_inc(fshift[ski],fk);
 +  }
 +
 +    if (VirCorr)
 +    {
 +        rvec xiv;
 +        int  i,j;
 +
 +        pbc_rvec_sub(pbc,x[av],x[ai],xiv);
 +
 +        for(i=0; i<DIM; i++)
 +        {
 +            for(j=0; j<DIM; j++)
 +            {
 +                dxdf[i][j] += -xiv[i]*fv[j] + xij[i]*fj[j] + xik[i]*fk[j];
 +            }
 +        }
 +    }
 +  
 +  /* TOTAL: 54 flops */
 +}
 +
 +/* Spread the force on a "4FD" virtual site over its constructing atoms.
 + *
 + * ia[1] is the virtual site (av); ia[2]..ia[5] are the constructing
 + * atoms i, j, k and l. a, b and c are the construction parameters.
 + * The force f[av] is read and distributed over f[ai], f[aj], f[ak] and
 + * f[al]; f[av] itself is not modified in this function.
 + * When fshift is non-NULL and at least one shift index is not CENTRAL,
 + * the shift forces are updated as well.
 + * When VirCorr is set, a contribution is accumulated into dxdf.
 + */
 +static void spread_vsite4FD(t_iatom ia[],real a,real b,real c,
 +                            rvec x[],rvec f[],rvec fshift[],
 +                            gmx_bool VirCorr,matrix dxdf,
 +                            t_pbc *pbc,t_graph *g)
 +{
 +  real    d,invl,fproj,a1;
 +  rvec    xvi,xij,xjk,xjl,xix,fv,temp;
 +  atom_id av,ai,aj,ak,al;
 +  ivec    di;
 +  int     svi,sji,skj,slj,m;
 +
 +  av = ia[1];
 +  ai = ia[2];
 +  aj = ia[3];
 +  ak = ia[4];
 +  al = ia[5];
 + 
 +  /* Distance vectors i->j, j->k and j->l; the return values are kept as
 +   * shift indices (unless overridden below when a graph is present).
 +   */
 +  sji = pbc_rvec_sub(pbc,x[aj],x[ai],xij);
 +  skj = pbc_rvec_sub(pbc,x[ak],x[aj],xjk);
 +  slj = pbc_rvec_sub(pbc,x[al],x[aj],xjl);
 +  /* 9 flops */
 +  
 +  /* xix goes from i to point x on the plane jkl */  
 +  for(m=0; m<DIM; m++)
 +    xix[m] = xij[m] + a*xjk[m] + b*xjl[m];
 +  /* 12 flops */
 +  
 +  /* invl = 1/|xix|, d = c/|xix| */
 +  invl=gmx_invsqrt(iprod(xix,xix));
 +  d=c*invl;
 +  /* 4 + ?10? flops */
 +
 +  /* Work on a local copy of the force on the virtual site */
 +  copy_rvec(f[av],fv);
 +
 +  fproj=iprod(xix,fv)*invl*invl; /* = (xix . f)/(xix . xix) */
 +
 +  /* temp = d times the component of fv perpendicular to xix */
 +  for(m=0; m<DIM; m++)
 +    temp[m] = d*(fv[m] - fproj*xix[m]);
 +  /* 16 */
 +  
 +  /* c is already calculated in constr_vsite3FD
 +     storing c somewhere will save 35 flops!     */
 +  /* NOTE(review): the remark above looks carried over from the 3FD
 +   * routine; in this function c is an input parameter and the derived
 +   * scale factor is d - confirm which quantity is meant.
 +   */
 +  
 +  a1 = 1 - a - b;
 +  for(m=0; m<DIM; m++) {
 +    f[ai][m] += fv[m] - temp[m];
 +    f[aj][m] += a1*temp[m];
 +    f[ak][m] += a*temp[m];
 +    f[al][m] += b*temp[m];
 +  }
 +  /* 26 Flops */
 +  
 +  /* Determine the shift indices: from the graph shift vectors when a
 +   * graph is given (this replaces sji, skj and slj set above), otherwise
 +   * only the site-to-i shift still has to be determined.
 +   */
 +  if (g) {
 +    ivec_sub(SHIFT_IVEC(g,ia[1]),SHIFT_IVEC(g,ai),di);
 +    svi = IVEC2IS(di);
 +    ivec_sub(SHIFT_IVEC(g,aj),SHIFT_IVEC(g,ai),di);
 +    sji = IVEC2IS(di);
 +    ivec_sub(SHIFT_IVEC(g,ak),SHIFT_IVEC(g,aj),di);
 +    skj = IVEC2IS(di);
 +    ivec_sub(SHIFT_IVEC(g,al),SHIFT_IVEC(g,aj),di);
 +    slj = IVEC2IS(di);
 +  } else if (pbc) {
 +    svi = pbc_rvec_sub(pbc,x[av],x[ai],xvi);
 +  } else {
 +    svi = CENTRAL;
 +  }
 +
 +  /* The terms added to fshift below sum to zero over all shift entries */
 +  if (fshift &&
 +      (svi!=CENTRAL || sji!=CENTRAL || skj!=CENTRAL || slj!=CENTRAL)) {
 +    rvec_dec(fshift[svi],fv);
 +    for(m=0; m<DIM; m++) {
 +      fshift[CENTRAL][m] += fv[m] - (1 + a + b)*temp[m];
 +      fshift[    sji][m] += temp[m];
 +      fshift[    skj][m] += a*temp[m];
 +      fshift[    slj][m] += b*temp[m];
 +    }
 +  }
 +
 +    if (VirCorr)
 +    {
 +        rvec xiv;
 +        int  i,j;
 +
 +        /* Vector from atom i to the virtual site */
 +        pbc_rvec_sub(pbc,x[av],x[ai],xiv);
 +
 +        for(i=0; i<DIM; i++)
 +        {
 +            for(j=0; j<DIM; j++)
 +            {
 +                dxdf[i][j] += -xiv[i]*fv[j] + xix[i]*temp[j];
 +            }
 +        }
 +    }
 +
 +  /* TOTAL: 77 flops */
 +}
 +
 +
 +/* Spread the force on a "4FDN" virtual site over its constructing atoms.
 + *
 + * ia[1] is the virtual site (av); ia[2]..ia[5] are the constructing
 + * atoms i, j, k and l. a, b and c are the construction parameters.
 + * The force f[av] is read and distributed: fj, fk and fl are added to
 + * f[aj], f[ak] and f[al], the remainder fv - fj - fk - fl to f[ai].
 + * f[av] itself is not modified in this function.
 + * When fshift is non-NULL and at least one shift index is not CENTRAL,
 + * the shift forces are updated as well.
 + * When VirCorr is set, a contribution is accumulated into dxdf.
 + */
 +static void spread_vsite4FDN(t_iatom ia[],real a,real b,real c,
 +                             rvec x[],rvec f[],rvec fshift[],
 +                             gmx_bool VirCorr,matrix dxdf,
 +                             t_pbc *pbc,t_graph *g)
 +{
 +    rvec xvi,xij,xik,xil,ra,rb,rja,rjb,rab,rm,rt;
 +    rvec fv,fj,fk,fl;
 +    real invrm,denom;
 +    real cfx,cfy,cfz;
 +    ivec di;
 +    int  av,ai,aj,ak,al;
 +    int  svi,sij,sik,sil;
 +
 +    /* DEBUG: check atom indices */
 +    av = ia[1];
 +    ai = ia[2];
 +    aj = ia[3];
 +    ak = ia[4];
 +    al = ia[5];
 +
 +    /* Work on a local copy of the force on the virtual site */
 +    copy_rvec(f[av],fv);
 +    
 +    /* Distance vectors from i to j, k and l; the return values are kept
 +     * as shift indices (unless overridden below when a graph is present).
 +     */
 +    sij = pbc_rvec_sub(pbc,x[aj],x[ai],xij);
 +    sik = pbc_rvec_sub(pbc,x[ak],x[ai],xik);
 +    sil = pbc_rvec_sub(pbc,x[al],x[ai],xil);
 +    /* 9 flops */
 +    
 +    /* ra = a*xik, rb = b*xil */
 +    ra[XX] = a*xik[XX];
 +    ra[YY] = a*xik[YY];
 +    ra[ZZ] = a*xik[ZZ];
 +    
 +    rb[XX] = b*xil[XX];
 +    rb[YY] = b*xil[YY];
 +    rb[ZZ] = b*xil[ZZ];
 +    
 +    /* 6 flops */
 +    
 +    /* rja = ra - xij, rjb = rb - xij, rab = rb - ra */
 +    rvec_sub(ra,xij,rja);
 +    rvec_sub(rb,xij,rjb);
 +    rvec_sub(rb,ra,rab);
 +    /* 9 flops */
 +    
 +    /* rm = rja x rjb */
 +    cprod(rja,rjb,rm);
 +    /* 9 flops */
 +
 +    /* invrm = 1/|rm|, denom = 1/|rm|^2 */
 +    invrm=gmx_invsqrt(norm2(rm));
 +    denom=invrm*invrm;
 +    /* 5+5+2 flops */
 +    
 +    cfx = c*invrm*fv[XX];
 +    cfy = c*invrm*fv[YY];
 +    cfz = c*invrm*fv[ZZ];
 +    /* 6 Flops */
 +    
 +    /* rt is a scratch vector, reused for each of fj, fk and fl below */
 +    cprod(rm,rab,rt);
 +    /* 9 flops */
 +
 +    rt[XX] *= denom;
 +    rt[YY] *= denom;
 +    rt[ZZ] *= denom;
 +    /* 3flops */
 +    
 +    /* Force contribution for atom j */
 +    fj[XX] = (        -rm[XX]*rt[XX]) * cfx + ( rab[ZZ]-rm[YY]*rt[XX]) * cfy + (-rab[YY]-rm[ZZ]*rt[XX]) * cfz;
 +    fj[YY] = (-rab[ZZ]-rm[XX]*rt[YY]) * cfx + (        -rm[YY]*rt[YY]) * cfy + ( rab[XX]-rm[ZZ]*rt[YY]) * cfz;
 +    fj[ZZ] = ( rab[YY]-rm[XX]*rt[ZZ]) * cfx + (-rab[XX]-rm[YY]*rt[ZZ]) * cfy + (        -rm[ZZ]*rt[ZZ]) * cfz;
 +    /* 30 flops */
 +        
 +    cprod(rjb,rm,rt);
 +    /* 9 flops */
 +
 +    rt[XX] *= denom*a;
 +    rt[YY] *= denom*a;
 +    rt[ZZ] *= denom*a;
 +    /* 3flops */
 +    
 +    /* Force contribution for atom k */
 +    fk[XX] = (          -rm[XX]*rt[XX]) * cfx + (-a*rjb[ZZ]-rm[YY]*rt[XX]) * cfy + ( a*rjb[YY]-rm[ZZ]*rt[XX]) * cfz;
 +    fk[YY] = ( a*rjb[ZZ]-rm[XX]*rt[YY]) * cfx + (          -rm[YY]*rt[YY]) * cfy + (-a*rjb[XX]-rm[ZZ]*rt[YY]) * cfz;
 +    fk[ZZ] = (-a*rjb[YY]-rm[XX]*rt[ZZ]) * cfx + ( a*rjb[XX]-rm[YY]*rt[ZZ]) * cfy + (          -rm[ZZ]*rt[ZZ]) * cfz;
 +    /* 36 flops */
 +    
 +    cprod(rm,rja,rt);
 +    /* 9 flops */
 +    
 +    rt[XX] *= denom*b;
 +    rt[YY] *= denom*b;
 +    rt[ZZ] *= denom*b;
 +    /* 3flops */
 +    
 +    /* Force contribution for atom l */
 +    fl[XX] = (          -rm[XX]*rt[XX]) * cfx + ( b*rja[ZZ]-rm[YY]*rt[XX]) * cfy + (-b*rja[YY]-rm[ZZ]*rt[XX]) * cfz;
 +    fl[YY] = (-b*rja[ZZ]-rm[XX]*rt[YY]) * cfx + (          -rm[YY]*rt[YY]) * cfy + ( b*rja[XX]-rm[ZZ]*rt[YY]) * cfz;
 +    fl[ZZ] = ( b*rja[YY]-rm[XX]*rt[ZZ]) * cfx + (-b*rja[XX]-rm[YY]*rt[ZZ]) * cfy + (          -rm[ZZ]*rt[ZZ]) * cfz;
 +    /* 36 flops */
 +
 +    /* Atom i receives what is left, so the four contributions sum to fv */
 +    f[ai][XX] += fv[XX] - fj[XX] - fk[XX] - fl[XX];
 +    f[ai][YY] += fv[YY] - fj[YY] - fk[YY] - fl[YY];
 +    f[ai][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ] - fl[ZZ];
 +    rvec_inc(f[aj],fj);
 +    rvec_inc(f[ak],fk);
 +    rvec_inc(f[al],fl);
 +    /* 21 flops */
 +
 +    /* Determine the shift indices: from the graph shift vectors when a
 +     * graph is given (this replaces sij, sik and sil set above), otherwise
 +     * only the site-to-i shift still has to be determined.
 +     */
 +    if (g) {
 +        ivec_sub(SHIFT_IVEC(g,av),SHIFT_IVEC(g,ai),di);
 +        svi = IVEC2IS(di);
 +        ivec_sub(SHIFT_IVEC(g,aj),SHIFT_IVEC(g,ai),di);
 +        sij = IVEC2IS(di);
 +        ivec_sub(SHIFT_IVEC(g,ak),SHIFT_IVEC(g,ai),di);
 +        sik = IVEC2IS(di);
 +        ivec_sub(SHIFT_IVEC(g,al),SHIFT_IVEC(g,ai),di);
 +        sil = IVEC2IS(di);
 +    } else if (pbc) {
 +        svi = pbc_rvec_sub(pbc,x[av],x[ai],xvi);
 +    } else {
 +        svi = CENTRAL;
 +    }
 +    
 +    /* The net change of the shift forces is zero: fv is removed at the
 +     * site's shift and redistributed over CENTRAL (atom i), sij, sik, sil.
 +     */
 +    if (fshift && (svi!=CENTRAL || sij!=CENTRAL || sik!=CENTRAL || sil!=CENTRAL)) {
 +        rvec_dec(fshift[svi],fv);
 +        fshift[CENTRAL][XX] += fv[XX] - fj[XX] - fk[XX] - fl[XX];
 +        fshift[CENTRAL][YY] += fv[YY] - fj[YY] - fk[YY] - fl[YY];
 +        fshift[CENTRAL][ZZ] += fv[ZZ] - fj[ZZ] - fk[ZZ] - fl[ZZ];
 +        rvec_inc(fshift[sij],fj);
 +        rvec_inc(fshift[sik],fk);
 +        rvec_inc(fshift[sil],fl);
 +    }
 +
 +    if (VirCorr)
 +    {
 +        rvec xiv;
 +        int  i,j;
 +
 +        /* Vector from atom i to the virtual site */
 +        pbc_rvec_sub(pbc,x[av],x[ai],xiv);
 +
 +        for(i=0; i<DIM; i++)
 +        {
 +            for(j=0; j<DIM; j++)
 +            {
 +                dxdf[i][j] += -xiv[i]*fv[j] + xij[i]*fj[j] + xik[i]*fk[j] + xil[i]*fl[j];
 +            }
 +        }
 +    }
 +    
 +    /* Total: 207 flops (Yuck!) */
 +}
 +
 +
 +static int spread_vsiten(t_iatom ia[],t_iparams ip[],
 +                       rvec x[],rvec f[],rvec fshift[],
 +                       t_pbc *pbc,t_graph *g)
 +{
 +  rvec xv,dx,fi;
 +  int  n3,av,i,ai;
 +  real a;
 +  ivec di;
 +  int  siv;
 +
 +  n3 = 3*ip[ia[0]].vsiten.n;
 +  av = ia[1];
 +  copy_rvec(x[av],xv);
 +  
 +  for(i=0; i<n3; i+=3) {
 +    ai = ia[i+2];
 +    if (g) {
 +      ivec_sub(SHIFT_IVEC(g,ai),SHIFT_IVEC(g,av),di);
 +      siv = IVEC2IS(di);
 +    } else if (pbc) {
 +      siv = pbc_dx_aiuc(pbc,x[ai],xv,dx);
 +    } else {
 +      siv = CENTRAL;
 +    }
 +    a = ip[ia[i]].vsiten.a;
 +    svmul(a,f[av],fi);
 +    rvec_inc(f[ai],fi);
 +    if (fshift && siv != CENTRAL) {
 +      rvec_inc(fshift[siv],fi);
 +      rvec_dec(fshift[CENTRAL],fi);
 +    }
 +    /* 6 Flops */
 +  }
 +
 +  return n3;
 +}
 +
 +
 +static int vsite_count(const t_ilist *ilist,int ftype)
 +{
 +    if (ftype == F_VSITEN)
 +    {
 +        return ilist[ftype].nr/3;
 +    }
 +    else
 +    {
 +        return ilist[ftype].nr/(1 + interaction_function[ftype].nratoms);
 +    }
 +}
 +
 +/* Spread the forces of all virtual sites in ilist over their
 + * constructing atoms; this is the per-list worker called by
 + * spread_vsite_f (once, or once per thread list).
 + *
 + * For every vsite entry the matching spread_vsite* routine is called and
 + * afterwards the force on the virtual site itself, f[ia[1]], is cleared.
 + * When VirCorr is set, dxdf is cleared first and then accumulated by the
 + * routines that take it. fshift, g and pbc_null are passed through to
 + * the spread routines; pbc_null may be NULL.
 + */
 +static void spread_vsite_f_thread(gmx_vsite_t *vsite,
 +                                  rvec x[],rvec f[],rvec *fshift,
 +                                  gmx_bool VirCorr,matrix dxdf,
 +                                  t_iparams ip[],t_ilist ilist[],
 +                                  t_graph *g,t_pbc *pbc_null)
 +{
 +    gmx_bool  bPBCAll;
 +    real      a1,b1,c1;
 +    int       i,inc,m,nra,nr,tp,ftype;
 +    t_iatom   *ia;
 +    t_pbc     *pbc_null2;
 +    int       *vsite_pbc;
 +
 +    if (VirCorr)
 +    {
 +        clear_mat(dxdf);
 +    }
 +
 +    /* With pbc and without charge groups, every vsite uses the full pbc */
 +    bPBCAll = (pbc_null != NULL && !vsite->bHaveChargeGroups);
 +   
 +    /* this loop goes backwards to be able to build *
 +     * higher type vsites from lower types         */
 +    pbc_null2 = NULL;
 +    vsite_pbc = NULL;
 +    for(ftype=F_NRE-1; (ftype>=0); ftype--)
 +    {
 +        if ((interaction_function[ftype].flags & IF_VSITE) &&
 +            ilist[ftype].nr > 0)
 +        {
 +            nra    = interaction_function[ftype].nratoms;
 +            inc    = 1 + nra;
 +            nr     = ilist[ftype].nr;
 +            ia     = ilist[ftype].iatoms;
 +
 +            if (bPBCAll)
 +            {
 +                pbc_null2 = pbc_null;
 +            }
 +            else if (pbc_null != NULL)
 +            {
 +                /* Per-vsite pbc information, selected per entry below */
 +                vsite_pbc = vsite->vsite_pbc_loc[ftype-F_VSITE2];
 +            }
 +
 +            for(i=0; i<nr; )
 +            {
 +                if (vsite_pbc != NULL)
 +                {
 +                    /* Values larger than -2 get pbc treatment, -2 does not */
 +                    if (vsite_pbc[i/(1+nra)] > -2)
 +                    {
 +                        pbc_null2 = pbc_null;
 +                    }
 +                    else
 +                    {
 +                        pbc_null2 = NULL;
 +                    }
 +                }
 +
 +                /* Parameter type index of this vsite entry */
 +                tp   = ia[0];
 +
 +                /* Construction constants, needed for spreading the force */
 +                a1   = ip[tp].vsite.a; 
 +                /* Spread the force depending on the vsite type */
 +                switch (ftype)
 +                {
 +                case F_VSITE2:
 +                    spread_vsite2(ia,a1,x,f,fshift,pbc_null2,g);
 +                    break;
 +                case F_VSITE3:
 +                    b1 = ip[tp].vsite.b;
 +                    spread_vsite3(ia,a1,b1,x,f,fshift,pbc_null2,g);
 +                    break;
 +                case F_VSITE3FD:
 +                    b1 = ip[tp].vsite.b;
 +                    spread_vsite3FD(ia,a1,b1,x,f,fshift,VirCorr,dxdf,pbc_null2,g);
 +                    break;
 +                case F_VSITE3FAD:
 +                    b1 = ip[tp].vsite.b;
 +                    spread_vsite3FAD(ia,a1,b1,x,f,fshift,VirCorr,dxdf,pbc_null2,g);
 +                    break;
 +                case F_VSITE3OUT:
 +                    b1 = ip[tp].vsite.b;
 +                    c1 = ip[tp].vsite.c;
 +                    spread_vsite3OUT(ia,a1,b1,c1,x,f,fshift,VirCorr,dxdf,pbc_null2,g);
 +                    break;
 +                case F_VSITE4FD:
 +                    b1 = ip[tp].vsite.b;
 +                    c1 = ip[tp].vsite.c;
 +                    spread_vsite4FD(ia,a1,b1,c1,x,f,fshift,VirCorr,dxdf,pbc_null2,g);
 +                    break;
 +                case F_VSITE4FDN:
 +                    b1 = ip[tp].vsite.b;
 +                    c1 = ip[tp].vsite.c;
 +                    spread_vsite4FDN(ia,a1,b1,c1,x,f,fshift,VirCorr,dxdf,pbc_null2,g);
 +                    break;
 +                case F_VSITEN:
 +                    /* Variable-length entry: the routine returns its size */
 +                    inc = spread_vsiten(ia,ip,x,f,fshift,pbc_null2,g);
 +                    break;
 +                default:
 +                    gmx_fatal(FARGS,"No such vsite type %d in %s, line %d",
 +                              ftype,__FILE__,__LINE__);
 +                }
 +                /* The force has been spread; zero it on the vsite itself */
 +                clear_rvec(f[ia[1]]);
 +
 +                /* Increment loop variables */
 +                i  += inc;
 +                ia += inc;
 +            }
 +        }
 +    }
 +}
 +
 +/* Spread the forces on all virtual sites over their constructing atoms.
 + *
 + * x and f are the coordinates and forces; fshift may be NULL, in which
 + * case no shift forces are accumulated. When VirCorr is set, -0.5 times
 + * the dxdf matrices accumulated by the spread routines is added to vir.
 + * With a single vsite thread all vsites in idef are handled directly;
 + * otherwise the per-thread lists in vsite->tdata are used, with the list
 + * at index vsite->nthreads being processed first, outside the parallel
 + * region.
 + * The log argument is not used in this function.
 + */
 +void spread_vsite_f(FILE *log,gmx_vsite_t *vsite,
 +                    rvec x[],rvec f[],rvec *fshift,
 +                    gmx_bool VirCorr,matrix vir,
 +                    t_nrnb *nrnb,t_idef *idef,
 +                    int ePBC,gmx_bool bMolPBC,t_graph *g,matrix box,
 +                    t_commrec *cr)
 +{
 +    t_pbc pbc,*pbc_null;
 +    int   th;
 +
 +    /* We only need to do pbc when we have inter-cg vsites */
 +    if ((DOMAINDECOMP(cr) || bMolPBC) && vsite->n_intercg_vsite)
 +    {
 +        /* This is wasting some CPU time as we now do this multiple times
 +         * per MD step. But how often do we have vsites with full pbc?
 +         */
 +        pbc_null = set_pbc_dd(&pbc,ePBC,cr->dd,FALSE,box);
 +    }
 +    else
 +    {
 +        pbc_null = NULL;
 +    }
 +  
 +    if (DOMAINDECOMP(cr)) 
 +    {
 +        dd_clear_f_vsites(cr->dd,f);
 +    } 
 +    else if (PARTDECOMP(cr) && vsite->vsitecomm != NULL)
 +    {
 +        pd_clear_nonlocal_constructs(vsite->vsitecomm,f);
 +    }
 +
 +    if (vsite->nthreads == 1)
 +    {
 +        /* Serial path: spread everything in idef using thread data 0 */
 +        spread_vsite_f_thread(vsite,
 +                              x,f,fshift,
 +                              VirCorr,vsite->tdata[0].dxdf,
 +                              idef->iparams,idef->il,
 +                              g,pbc_null);
 +    }
 +    else
 +    {
 +        /* First spread the vsites that might depend on other vsites */
 +        spread_vsite_f_thread(vsite,
 +                              x,f,fshift,
 +                              VirCorr,vsite->tdata[vsite->nthreads].dxdf,
 +                              idef->iparams,
 +                              vsite->tdata[vsite->nthreads].ilist,
 +                              g,pbc_null);
 +
 +#pragma omp parallel num_threads(vsite->nthreads)
 +        {
 +            int  thread;
 +            rvec *fshift_t;
 +
 +            thread = gmx_omp_get_thread_num();
 +
 +            /* Thread 0 accumulates straight into fshift; the other
 +             * threads use their own buffer, which is reduced below.
 +             */
 +            if (thread == 0 || fshift == NULL)
 +            {
 +                fshift_t = fshift;
 +            }
 +            else
 +            {
++                int i;
++
 +                fshift_t = vsite->tdata[thread].fshift;
++
++                /* The buffer is summed into fshift afterwards, so it
++                 * has to start from zero on every call.
++                 */
++                for(i=0; i<SHIFTS; i++)
++                {
++                    clear_rvec(fshift_t[i]);
++                }
 +            }
 +
 +            spread_vsite_f_thread(vsite,
 +                                  x,f,fshift_t,
 +                                  VirCorr,vsite->tdata[thread].dxdf,
 +                                  idef->iparams,
 +                                  vsite->tdata[thread].ilist,
 +                                  g,pbc_null);
 +        }
 +
 +        if (fshift != NULL)
 +        {
 +            int i;
 +
 +            /* Reduce the shift forces of threads 1 and up into fshift */
 +            for(th=1; th<vsite->nthreads; th++)
 +            {
 +                for(i=0; i<SHIFTS; i++)
 +                {
 +                    rvec_inc(fshift[i],vsite->tdata[th].fshift[i]);
 +                }
 +            }
 +        }
 +    }
 +
 +    if (VirCorr)
 +    {
 +        int i,j;
 +
 +        /* Sum over all thread data used above: only entry 0 in the
 +         * serial case, entries 0..nthreads (inclusive) otherwise.
 +         */
 +        for(th=0; th<(vsite->nthreads==1 ? 1 : vsite->nthreads+1); th++)
 +        {
 +            for(i=0; i<DIM; i++)
 +            {
 +                for(j=0; j<DIM; j++)
 +                {
 +                    vir[i][j] += -0.5*vsite->tdata[th].dxdf[i][j];
 +                }
 +            }
 +        }
 +    }
 +
 +    if (DOMAINDECOMP(cr))
 +    {
 +        dd_move_f_vsites(cr->dd,f,fshift);
 +    }
 +    else if (vsite->bPDvsitecomm)
 +    {
 +        /* We only move forces here, and they are independent of shifts */
 +        move_construct_f(vsite->vsitecomm,f,cr);
 +    }
 +
 +    /* Flop accounting, based on the complete lists in idef */
 +    inc_nrnb(nrnb,eNR_VSITE2,   vsite_count(idef->il,F_VSITE2));
 +    inc_nrnb(nrnb,eNR_VSITE3,   vsite_count(idef->il,F_VSITE3));
 +    inc_nrnb(nrnb,eNR_VSITE3FD, vsite_count(idef->il,F_VSITE3FD));
 +    inc_nrnb(nrnb,eNR_VSITE3FAD,vsite_count(idef->il,F_VSITE3FAD));
 +    inc_nrnb(nrnb,eNR_VSITE3OUT,vsite_count(idef->il,F_VSITE3OUT));
 +    inc_nrnb(nrnb,eNR_VSITE4FD, vsite_count(idef->il,F_VSITE4FD));
 +    inc_nrnb(nrnb,eNR_VSITE4FDN,vsite_count(idef->il,F_VSITE4FDN));
 +    inc_nrnb(nrnb,eNR_VSITEN,   vsite_count(idef->il,F_VSITEN));
 +}
 +
 +static int *atom2cg(t_block *cgs)
 +{
 +  int *a2cg,cg,i;
 +  
 +  snew(a2cg,cgs->index[cgs->nr]);
 +  for(cg=0; cg<cgs->nr; cg++) {
 +    for(i=cgs->index[cg]; i<cgs->index[cg+1]; i++)
 +      a2cg[i] = cg;
 +  }
 +  
 +  return a2cg;
 +}
 +
 +static int count_intercg_vsite(gmx_mtop_t *mtop,
 +                               gmx_bool *bHaveChargeGroups)
 +{
 +    int  mb,mt,ftype,nral,i,cg,a;
 +    gmx_molblock_t *molb;
 +    gmx_moltype_t *molt;
 +    int  *a2cg;
 +    t_ilist *il;
 +    t_iatom *ia;
 +    int  n_intercg_vsite;
 +
 +    *bHaveChargeGroups = FALSE;
 +
 +    n_intercg_vsite = 0;
 +    for(mb=0; mb<mtop->nmolblock; mb++)
 +    {
 +        molb = &mtop->molblock[mb];
 +        molt = &mtop->moltype[molb->type];
 +
 +        if (molt->cgs.nr < molt->atoms.nr)
 +        {
 +            *bHaveChargeGroups = TRUE;
 +        }
 +
 +        a2cg = atom2cg(&molt->cgs);
 +        for(ftype=0; ftype<F_NRE; ftype++)
 +        {
 +            if (interaction_function[ftype].flags & IF_VSITE)
 +            {
 +                nral = NRAL(ftype);
 +                il = &molt->ilist[ftype];
 +                ia  = il->iatoms;
 +                for(i=0; i<il->nr; i+=1+nral)
 +                {
 +                    cg = a2cg[ia[1+i]];
 +                    for(a=1; a<nral; a++)
 +                    {
 +                        if (a2cg[ia[1+a]] != cg) {
 +                            n_intercg_vsite += molb->nmol;
 +                            break;
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +        sfree(a2cg);
 +    }
 +
 +    return n_intercg_vsite;
 +}
 +
 +/* Determine, for every virtual site in ilist, how its periodic image
 + * should be handled.
 + *
 + * Returns a newly allocated array indexed by ftype-F_VSITE2; each entry
 + * is an array with one value per vsite of that type:
 + *   -2   : the vsite and all its constructing atoms are in one charge
 + *          group,
 + *   -1   : some constructing atom is in another charge group and no
 + *          reference atom was selected,
 + *   >= 0 : index of the atom chosen as pbc reference; this is the vsite
 + *          itself when it is the first processed atom of a charge group
 + *          in which no atom has its pbc set yet.
 + * Particle types are taken from atom when it is non-NULL and/or from md
 + * when that is non-NULL. cgs and a2cg describe the charge groups.
 + */
 +static int **get_vsite_pbc(t_iparams *iparams,t_ilist *ilist,
 +                         t_atom *atom,t_mdatoms *md,
 +                         t_block *cgs,int *a2cg)
 +{
 +  int  ftype,nral,i,j,vsi,vsite,cg_v,cg_c,a,nc3=0;
 +  t_ilist *il;
 +  t_iatom *ia;
 +  int  **vsite_pbc,*vsite_pbc_f;
 +  char *pbc_set;
 +  gmx_bool bViteOnlyCG_and_FirstAtom;
 +
 +  /* Make an array that tells if the pbc of an atom is set */
 +  /* NOTE(review): only entries for non-vsites are written here, so this
 +   * relies on snew returning zeroed memory - confirm.
 +   */
 +  snew(pbc_set,cgs->index[cgs->nr]);
 +  /* PBC is set for all non vsites */
 +  for(a=0; a<cgs->index[cgs->nr]; a++) {
 +    if ((atom && atom[a].ptype != eptVSite) ||
 +      (md   && md->ptype[a]  != eptVSite)) {
 +      pbc_set[a] = 1;
 +    }
 +  }
 +
 +  snew(vsite_pbc,F_VSITEN-F_VSITE2+1);
 +  
 +  for(ftype=0; ftype<F_NRE; ftype++) {
 +    if (interaction_function[ftype].flags & IF_VSITE) {
 +      nral = NRAL(ftype);
 +      il = &ilist[ftype];
 +      ia  = il->iatoms;
 +
 +      snew(vsite_pbc[ftype-F_VSITE2],il->nr/(1+nral));
 +      vsite_pbc_f = vsite_pbc[ftype-F_VSITE2];
 +
 +      /* Note: the body of this while loop is not indented further */
 +      i = 0;
 +      while (i < il->nr) {
 +      vsi = i/(1+nral);
 +      vsite = ia[i+1];
 +      cg_v = a2cg[vsite];
 +      /* A value of -2 signals that this vsite and its constructing
 +       * atoms are all within the same cg, so no pbc is required.
 +       */
 +      vsite_pbc_f[vsi] = -2;
 +      /* Check if constructing atoms are outside the vsite's cg */
 +      nc3 = 0;
 +      if (ftype == F_VSITEN) {
 +        /* One (type, vsite, atom) triplet per constructing atom */
 +        nc3 = 3*iparams[ia[i]].vsiten.n;
 +        for(j=0; j<nc3; j+=3) {
 +          if (a2cg[ia[i+j+2]] != cg_v)
 +            vsite_pbc_f[vsi] = -1;
 +        }
 +      } else {
 +        for(a=1; a<nral; a++) {
 +          if (a2cg[ia[i+1+a]] != cg_v)
 +            vsite_pbc_f[vsi] = -1;
 +        }
 +      }
 +      if (vsite_pbc_f[vsi] == -1) {
 +        /* Check if this is the first processed atom of a vsite only cg */
 +        bViteOnlyCG_and_FirstAtom = TRUE;
 +        for(a=cgs->index[cg_v]; a<cgs->index[cg_v+1]; a++) {
 +          /* Non-vsites already have pbc set, so simply check for pbc_set */
 +          if (pbc_set[a]) {
 +            bViteOnlyCG_and_FirstAtom = FALSE;
 +            break;
 +          }
 +        }
 +        if (bViteOnlyCG_and_FirstAtom) {
 +          /* First processed atom of a vsite only charge group.
 +           * The pbc of the input coordinates to construct_vsites
 +           * should be preserved.
 +           */
 +          vsite_pbc_f[vsi] = vsite;
 +        } else if (cg_v != a2cg[ia[1+i+1]]) {
 +          /* This vsite has a different charge group index
 +           * than its first constructing atom
 +           * and the charge group has more than one atom,
 +           * search for the first normal particle
 +           * or vsite that already had its pbc defined.
 +           * If nothing is found, use full pbc for this vsite.
 +           */
 +          for(a=cgs->index[cg_v]; a<cgs->index[cg_v+1]; a++) {
 +            if (a != vsite && pbc_set[a]) {
 +              vsite_pbc_f[vsi] = a;
 +              if (gmx_debug_at)
 +                fprintf(debug,"vsite %d match pbc with atom %d\n",
 +                        vsite+1,a+1);
 +              break;
 +            }
 +          }
 +          if (gmx_debug_at)
 +            fprintf(debug,"vsite atom %d  cg %d - %d pbc atom %d\n",
 +                    vsite+1,cgs->index[cg_v]+1,cgs->index[cg_v+1],
 +                    vsite_pbc_f[vsi]+1);
 +        }
 +      }
 +      if (ftype == F_VSITEN) {
 +        /* The other entries in vsite_pbc_f are not used for center vsites */
 +        i += nc3;
 +      } else {
 +        i += 1+nral;
 +      }
 +      
 +      /* This vsite now has its pbc defined */
 +      pbc_set[vsite] = 1;
 +      }
 +    }
 +  }
 +
 +  sfree(pbc_set);
 +
 +  return vsite_pbc;
 +}
 +
 +
 +gmx_vsite_t *init_vsite(gmx_mtop_t *mtop,t_commrec *cr,
 +                        gmx_bool bSerial_NoPBC)
 +{
 +    int nvsite,i;
 +    int *a2cg,cg;
 +    gmx_vsite_t *vsite;
 +    int mt;
 +    gmx_moltype_t *molt;
 +    int nthreads;
 +
 +    /* check if there are vsites */
 +    nvsite = 0;
 +    for(i=0; i<F_NRE; i++)
 +    {
 +        if (interaction_function[i].flags & IF_VSITE)
 +        {
 +            nvsite += gmx_mtop_ftype_count(mtop,i);
 +        }
 +    }
 +
 +    if (nvsite == 0)
 +    {
 +        return NULL;
 +    }
 +
 +    snew(vsite,1);
 +
 +    vsite->n_intercg_vsite = count_intercg_vsite(mtop,
 +                                                 &vsite->bHaveChargeGroups);
 +
 +    /* If we don't have charge groups, the vsite follows its own pbc */
 +    if (!bSerial_NoPBC &&
 +        vsite->bHaveChargeGroups &&
 +        vsite->n_intercg_vsite > 0 && DOMAINDECOMP(cr))
 +    {
 +        vsite->nvsite_pbc_molt = mtop->nmoltype;
 +        snew(vsite->vsite_pbc_molt,vsite->nvsite_pbc_molt);
 +        for(mt=0; mt<mtop->nmoltype; mt++)
 +        {
 +            molt = &mtop->moltype[mt];
 +            /* Make an atom to charge group index */
 +            a2cg = atom2cg(&molt->cgs);
 +            vsite->vsite_pbc_molt[mt] = get_vsite_pbc(mtop->ffparams.iparams,
 +                                                      molt->ilist,
 +                                                      molt->atoms.atom,NULL,
 +                                                      &molt->cgs,a2cg);
 +            sfree(a2cg);
 +        }
 +   
 +        snew(vsite->vsite_pbc_loc_nalloc,F_VSITEN-F_VSITE2+1);
 +        snew(vsite->vsite_pbc_loc       ,F_VSITEN-F_VSITE2+1);
 +    }
 +
 +    if (bSerial_NoPBC)
 +    {
 +        vsite->nthreads = 1;
 +    }
 +    else
 +    {
 +        vsite->nthreads = gmx_omp_nthreads_get(emntVSITE);
 +    }
 +    if (!bSerial_NoPBC)
 +    {
 +        /* We need one extra thread data structure for the overlap vsites */
 +        snew(vsite->tdata,vsite->nthreads+1);
 +    }
 +
 +    vsite->th_ind        = NULL;
 +    vsite->th_ind_nalloc = 0;
 +
 +    return vsite;
 +}
 +
 +static void prepare_vsite_thread(const t_ilist *ilist,
 +                                 gmx_vsite_thread_t *vsite_th)
 +{
 +    int ftype;
 +
 +    for(ftype=0; ftype<F_NRE; ftype++)
 +    {
 +        if (interaction_function[ftype].flags & IF_VSITE)
 +        {
 +            if (ilist[ftype].nr > vsite_th->ilist[ftype].nalloc)
 +            {
 +                vsite_th->ilist[ftype].nalloc = over_alloc_large(ilist[ftype].nr);
 +                srenew(vsite_th->ilist[ftype].iatoms,vsite_th->ilist[ftype].nalloc);
 +            }
 +
 +            vsite_th->ilist[ftype].nr = 0;
 +        }
 +    }
 +}
 +
 +/* Distribute the virtual sites in ilist over the thread lists in
 + * vsite->tdata.
 + *
 + * Does nothing when vsite->nthreads is 1. Otherwise the atom range is
 + * divided uniformly over the threads and each vsite is copied to the
 + * list of the thread that covers its atom index, unless one of its
 + * constructing atoms is assigned to a different thread; such vsites go
 + * to the extra list at index vsite->nthreads.
 + * With bLimitRange the atom range is limited to the highest atom index
 + * that occurs in a vsite interaction, otherwise mdatoms->homenr is used.
 + */
 +void split_vsites_over_threads(const t_ilist *ilist,
 +                               const t_mdatoms *mdatoms,
 +                               gmx_bool bLimitRange,
 +                               gmx_vsite_t *vsite)
 +{
 +    int th;
 +    int vsite_atom_range,natperthread;
 +    int *th_ind;
 +    int ftype;
 +    t_iatom *iat;
 +    t_ilist *il_th;
 +    int nral1,inc,i,j;
 +
 +    if (vsite->nthreads == 1)
 +    {
 +        /* Nothing to do */
 +        return;
 +    }
 +
 +#pragma omp parallel for num_threads(vsite->nthreads) schedule(static)
 +    for(th=0; th<vsite->nthreads; th++)
 +    {
 +        prepare_vsite_thread(ilist,&vsite->tdata[th]);
 +    }
 +    /* The master thread does the (potential) overlap vsites */
 +    prepare_vsite_thread(ilist,&vsite->tdata[vsite->nthreads]);
 +
 +    /* The current way of distributing the vsites over threads is primitive.
 +     * We divide the atom range 0 - natoms_in_vsite uniformly over threads,
 +     * without taking into account how the vsites are distributed.
 +     * Without domain decomposition we have bLimitRange=TRUE and we at least
 +     * tighten the upper bound of the range (useful for common systems
 +     * such as a vsite-protein in 3-site water).
 +     */
 +    if (bLimitRange)
 +    {
 +        /* Highest atom index (vsite or constructing atom) plus one */
 +        vsite_atom_range = -1;
 +        for(ftype=0; ftype<F_NRE; ftype++)
 +        {
 +            if ((interaction_function[ftype].flags & IF_VSITE) &&
 +                ftype != F_VSITEN)
 +            {
 +                nral1 = 1 + NRAL(ftype);
 +                iat   = ilist[ftype].iatoms;
 +                for(i=0; i<ilist[ftype].nr; i+=nral1)
 +                {
 +                    for(j=i+1; j<i+nral1; j++)
 +                    {
 +                        vsite_atom_range = max(vsite_atom_range,iat[j]);
 +                    }
 +                }
 +            }
 +        }
 +        vsite_atom_range++;
 +    }
 +    else
 +    {
 +        vsite_atom_range = mdatoms->homenr;
 +    }
 +    /* Atoms per thread, rounded up */
 +    natperthread = (vsite_atom_range + vsite->nthreads - 1)/vsite->nthreads;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"virtual site thread dist: natoms %d, range %d, natperthread %d\n",mdatoms->nr,vsite_atom_range,natperthread);
 +    }
 +
 +    /* To simplify the vsite assignment, we make an index which tells us
 +     * to which thread particles, both non-vsites and vsites, are assigned.
 +     */
 +    if (mdatoms->nr > vsite->th_ind_nalloc)
 +    {
 +        vsite->th_ind_nalloc = over_alloc_large(mdatoms->nr);
 +        srenew(vsite->th_ind,vsite->th_ind_nalloc);
 +    }
 +    th_ind = vsite->th_ind;
 +    th = 0;
 +    for(i=0; i<mdatoms->nr; i++)
 +    {
 +        if (mdatoms->ptype[i] == eptVSite)
 +        {
 +            /* vsites are not assigned to a thread yet */
 +            th_ind[i] = -1;
 +        }
 +        else
 +        {
 +            /* assign non-vsite particles to thread th */
 +            th_ind[i] = th;
 +        }
 +        /* NOTE(review): th is advanced after atom i has been assigned,
 +         * so the atom with index (th+1)*natperthread still gets thread th,
 +         * whereas the vsite loop below uses index/natperthread. The
 +         * mismatch can only route extra vsites to the overlap list -
 +         * confirm this is intended.
 +         */
 +        if (i == (th + 1)*natperthread && th < vsite->nthreads)
 +        {
 +            th++;
 +        }
 +    }
 +
 +    /* NOTE(review): F_VSITEN is excluded by the condition below, so the
 +     * F_VSITEN branch inside the loop is never reached and F_VSITEN
 +     * entries are not copied to any thread list here - confirm that
 +     * these vsites are handled elsewhere or cannot occur with threads.
 +     */
 +    for(ftype=0; ftype<F_NRE; ftype++)
 +    {
 +        if ((interaction_function[ftype].flags & IF_VSITE) &&
 +            ftype != F_VSITEN)
 +        {
 +            nral1 = 1 + NRAL(ftype);
 +            inc   = nral1;
 +            iat   = ilist[ftype].iatoms;
 +            for(i=0; i<ilist[ftype].nr; )
 +            {
 +                /* Preferred thread: the one covering the vsite atom index */
 +                th = iat[1+i]/natperthread;
 +                /* We would like to assign this vsite the thread th,
 +                 * but it might depend on atoms outside the atom range of th
 +                 * or on another vsite not assigned to thread th.
 +                 */
 +                if (ftype != F_VSITEN)
 +                {
 +                    for(j=i+2; j<i+nral1; j++)
 +                    {
 +                        if (th_ind[iat[j]] != th)
 +                        {
 +                            /* Some constructing atoms are not assigned to
 +                             * thread th, move this vsite to a separate batch.
 +                             */
 +                            th = vsite->nthreads;
 +                        }
 +                    }
 +                }
 +                else
 +                {
 +                    inc = iat[i];
 +                    for(j=i+2; j<i+inc; j+=3)
 +                    {
 +                        if (th_ind[iat[j]] != th)
 +                        {
 +                            th = vsite->nthreads;
 +                        }
 +                    }
 +                }
 +                /* Copy this vsite to the thread data struct of thread th */
 +                il_th = &vsite->tdata[th].ilist[ftype];
 +                for(j=i; j<i+inc; j++)
 +                {
 +                    il_th->iatoms[il_th->nr++] = iat[j];
 +                }
 +                /* Update this vsite's thread index entry */
 +                th_ind[iat[1+i]] = th;
 +
 +                i += inc;
 +            }
 +        }
 +    }
 +
 +    if (debug)
 +    {
 +        /* Print the number of list entries per thread for each type */
 +        for(ftype=0; ftype<F_NRE; ftype++)
 +        {
 +            if ((interaction_function[ftype].flags & IF_VSITE) &&
 +            ilist[ftype].nr > 0)
 +            {
 +                fprintf(debug,"%-20s thread dist:",
 +                        interaction_function[ftype].longname);
 +                for(th=0; th<vsite->nthreads+1; th++)
 +                {
 +                    fprintf(debug," %4d",vsite->tdata[th].ilist[ftype].nr);
 +                }
 +                fprintf(debug,"\n");
 +            }
 +        }
 +    }
 +}
 +
 +void set_vsite_top(gmx_vsite_t *vsite,gmx_localtop_t *top,t_mdatoms *md,
 +                   t_commrec *cr)
 +{
 +    int *a2cg;
 +    
 +    if (vsite->n_intercg_vsite > 0)
 +    {
 +        if (vsite->bHaveChargeGroups)
 +        {
 +            /* Make an atom to charge group index */
 +            a2cg = atom2cg(&top->cgs);
 +            vsite->vsite_pbc_loc = get_vsite_pbc(top->idef.iparams,
 +                                                 top->idef.il,NULL,md,
 +                                                 &top->cgs,a2cg);
 +            sfree(a2cg);
 +        }
 +
 +        if (PARTDECOMP(cr))
 +        {
 +            snew(vsite->vsitecomm,1);
 +            vsite->bPDvsitecomm =
 +                setup_parallel_vsites(&(top->idef),cr,vsite->vsitecomm);
 +        }
 +    }
 +
 +    if (vsite->nthreads > 1)
 +    {
 +        if (vsite->bHaveChargeGroups || PARTDECOMP(cr))
 +        {
 +            gmx_incons("Can not use threads virtual sites combined with charge groups or particle decomposition");
 +        }
 +
 +        split_vsites_over_threads(top->idef.il,md,!DOMAINDECOMP(cr),vsite);
 +    }
 +}
index a1a4932d25f136349c2f33f856d975d41c29de6e,0000000000000000000000000000000000000000..f1eacea070204f156401feaa5d0becd47824089c
mode 100644,000000..100644
--- /dev/null
@@@ -1,166 -1,0 +1,166 @@@
-     fprintf(out,"ielec: %d, ivdw: %d, free_energy: %d, Solvent opt: %s\n",
-             nblist->ielec,nblist->ivdw,nblist->free_energy,
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +/* This file is completely threadsafe - keep it that way! */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <stdio.h>
 +#include <string.h>
 +#include "string2.h"
 +#include "force.h"
 +#include "smalloc.h"
 +#include "ns.h"
 +#include "nrnb.h"
 +#include "gmx_fatal.h"
 +#include "macros.h"
 +#include "futil.h"
 +#include "names.h"
 +#include "domdec.h"
 +#include "gmxfio.h"
 +
 +#define header "Neighborlist:"
 +
 +/* Writes a text dump of one neighbor list to out.
 + *
 + * Nothing is written when the list has no i-entries (nblist->nri <= 0).
 + * Otherwise a summary line and the i-entry/pair counts are printed.
 + * When dd is non-NULL, the number of j-particles is additionally counted
 + * and printed per domain decomposition zone pair.
 + * nDNL selects the level of detail: with nDNL >= 2 every i-entry and all
 + * of its pairs are listed; with nDNL >= 3 and a geometry other than
 + * particle-particle the pair listing is repeated for the three
 + * consecutive i indices iinr[i], iinr[i]+1 and iinr[i]+2.
 + *
 + * NOTE(review): in the nDNL >= 2 branch ddglatnr() is called with dd
 + * regardless of whether dd is NULL - presumably it handles that; confirm.
 + */
 +static void write_nblist(FILE *out,gmx_domdec_t *dd,t_nblist *nblist,int nDNL)
 +{
 +  int i,nii,ii,j,zi,zj0,zj1,aj,zj,nj;
 +  int ca1[DD_MAXZONE],np[DD_MAXZONE];
 +  gmx_domdec_zones_t *dd_zones;
 +
 +  if (nblist->nri > 0) {  
++    fprintf(out,"ielec: %d, ivdw: %d, type: %d, Solvent opt: %s\n",
++            nblist->ielec,nblist->ivdw,nblist->type,
 +            gmx_nblist_geometry_names[nblist->igeometry]);
 +    fprintf(out,"nri: %d  npair: %d\n",nblist->nri,nblist->nrj);
 +    if (dd) {
 +      dd_zones = domdec_zones(dd);
 +
 +      /* ca1[zi] is used below as the exclusive upper atom index of zone zi */
 +      for(zi=0; zi<dd_zones->n; zi++)
 +      ca1[zi] = dd->cgindex[dd_zones->cg_range[zi+1]];
 +      /* i is not reset per zone: the i-entries are consumed in order,
 +       * each i-zone taking the entries with iinr[i] below its ca1 bound.
 +       */
 +      i = 0;
 +      for(zi=0; zi<dd_zones->nizone; zi++) {
 +      zj0 = dd_zones->izone[zi].j0;
 +      zj1 = dd_zones->izone[zi].j1;
 +      for(zj=zj0; zj<zj1; zj++)
 +        np[zj] = 0;
 +      while(i < nblist->nri && nblist->iinr[i] < ca1[zi]) {
 +        for(j=nblist->jindex[i]; (j<nblist->jindex[i+1]); j++) {
 +          aj = nblist->jjnr[j];
 +          /* Find the first j-zone, starting at zj0, whose bound exceeds aj */
 +          zj = zj0;
 +          while (aj >= ca1[zj])
 +            zj++;
 +          np[zj]++;
 +        }
 +        i++;
 +      }
 +      fprintf(out,"DD zone %d:",zi);
 +      for(zj=zj0; zj<zj1; zj++)
 +        fprintf(out," %d %d",zj,np[zj]);
 +      fprintf(out,"\n");
 +      }
 +    }
 +    if (nDNL >= 2) {
 +      for(i=0; i<nblist->nri; i++) {
 +      /* nii: number of consecutive i indices to list for this entry */
 +      nii = 1;
 +      if (nDNL >= 3 && nblist->igeometry != GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE)
 +        nii = 3;
 +      nj = nblist->jindex[i+1] - nblist->jindex[i];
 +      fprintf(out,"i: %d shift: %d gid: %d nj: %d\n",
 +              ddglatnr(dd,nblist->iinr[i]),
 +              nblist->shift[i],nblist->gid[i],nj);
 +      for(ii=0; ii<nii; ii++) {
 +        for(j=nblist->jindex[i]; (j<nblist->jindex[i+1]); j++) {
 +          fprintf(out,"  i: %5d  j: %5d\n",
 +                  ddglatnr(dd,nblist->iinr[i]+ii),
 +                  ddglatnr(dd,nblist->jjnr[j]));
 +        }
 +      }
 +      }
 +    }
 +    fflush(out);
 +  }
 +}
 +
 +static void set_mat(FILE *fp,int **mat,int i0,int ni,int j0,int nj,
 +                  gmx_bool bSymm,int shift)
 +{
 +  int i,j;
 +  
 +  for(i=i0; (i<i0+ni); i++) {
 +    for(j=j0; (j<j0+nj); j++) {
 +      if (mat[i][j] != 0)
 +      fprintf(fp,"mat[%d][%d] changing from %d to %d\n",
 +              i,j,mat[i][j],shift+1);
 +      mat[i][j] = shift+1;
 +      if (bSymm)
 +      mat[j][i] = 27-shift;
 +    }
 +  }
 +}
 +
 +
 +
 +void dump_nblist(FILE *out,t_commrec *cr,t_forcerec *fr,int nDNL)
 +{
 +#if 0
 +  static FILE *fp=NULL;
 +  char buf[STRLEN];
 +  int  n,i;
 +
 +  if (fp == NULL) {
 +    if (PAR(cr)) {
 +      sprintf(buf,"nlist_n%d.txt",cr->nodeid);
 +    } else {
 +      sprintf(buf,"nlist.txt");
 +    }
 +    fp = gmx_fio_fopen(buf,"w");
 +  }
 +  fprintf(fp,"%s\n",header);
 +
 +  for(n=0; (n<fr->nnblists); n++)
 +    for(i=0; (i<eNL_NR); i++) 
 +      write_nblist(fp,cr->dd,&fr->nblists[n].nlist_sr[i],nDNL);
 +#endif
 +  char buf[STRLEN];
 +  int  n,i;
 +
 +  fprintf(out,"%s\n",header);
 +
 +  for(n=0; (n<fr->nnblists); n++)
 +    for(i=0; (i<eNL_NR); i++) 
 +      write_nblist(out,cr->dd,&fr->nblists[n].nlist_sr[i],nDNL);
 +
 +}
 +
index faa968121be391c11210220999f76cc5bf7c528c,0000000000000000000000000000000000000000..61bab7e51a954cbb3bc5acba7f698257f68d92d0
mode 100644,000000..100644
--- /dev/null
@@@ -1,2005 -1,0 +1,1996 @@@
-             int process_index;
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +#if defined(HAVE_SCHED_H) && defined(HAVE_SCHED_GETAFFINITY)
 +#define _GNU_SOURCE
 +#include <sched.h>
 +#include <sys/syscall.h>
 +#endif
 +#include <signal.h>
 +#include <stdlib.h>
 +#ifdef HAVE_UNISTD_H
 +#include <unistd.h>
 +#endif
 +#include <string.h>
 +#include <assert.h>
 +
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "sysstuff.h"
 +#include "statutil.h"
 +#include "mdrun.h"
 +#include "md_logging.h"
 +#include "md_support.h"
 +#include "network.h"
 +#include "pull.h"
 +#include "pull_rotation.h"
 +#include "names.h"
 +#include "disre.h"
 +#include "orires.h"
 +#include "pme.h"
 +#include "mdatoms.h"
 +#include "repl_ex.h"
 +#include "qmmm.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "coulomb.h"
 +#include "constr.h"
 +#include "mvdata.h"
 +#include "checkpoint.h"
 +#include "mtop_util.h"
 +#include "sighandler.h"
 +#include "tpxio.h"
 +#include "txtdump.h"
 +#include "gmx_detect_hardware.h"
 +#include "gmx_omp_nthreads.h"
 +#include "pull_rotation.h"
 +#include "calc_verletbuf.h"
 +#include "../mdlib/nbnxn_search.h"
 +#include "../mdlib/nbnxn_consts.h"
 +#include "gmx_fatal_collective.h"
 +#include "membed.h"
 +#include "macros.h"
 +#include "gmx_omp.h"
 +
 +#include "thread_mpi/threads.h"
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#endif
 +#ifdef GMX_THREAD_MPI
 +#include "tmpi.h"
 +#endif
 +
 +#ifdef GMX_FAHCORE
 +#include "corewrap.h"
 +#endif
 +
 +#ifdef GMX_OPENMM
 +#include "md_openmm.h"
 +#endif
 +
 +#include "gpu_utils.h"
 +#include "nbnxn_cuda_data_mgmt.h"
 +
 +/* Wrapper around a pointer to an integrator function, used as the
 + * element type of the integrator dispatch table below.
 + */
 +typedef struct { 
 +    gmx_integrator_t *func;
 +} gmx_intp_t;
 +
 +/* The array should match the eI array in include/types/enums.h */
 +/* Both variants list 12 entries. With GMX_OPENMM every entry points
 + * to do_md_openmm.
 + */
 +#ifdef GMX_OPENMM  /* FIXME do_md_openmm needs fixing */
 +const gmx_intp_t integrator[eiNR] = { {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm},{do_md_openmm}};
 +#else
 +const gmx_intp_t integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md},{do_md}};
 +#endif
 +
 +/* NOTE(review): file-scope state that is not static; presumably the
 + * initial step and box used for box deformation, shared between
 + * thread-MPI threads under the mutex below - confirm against the users.
 + */
 +gmx_large_int_t     deform_init_init_step_tpx;
 +matrix              deform_init_box_tpx;
 +#ifdef GMX_THREAD_MPI
 +tMPI_Thread_mutex_t deform_init_box_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
 +#endif
 +
 +
 +#ifdef GMX_THREAD_MPI
 +/* Bundle of the mdrunner() arguments, so they can be handed to a
 + * thread start function through a single void pointer.
 + * The fields up to and including Flags carry the corresponding
 + * mdrunner() argument; ret is an output field.
 + */
 +struct mdrunner_arglist
 +{
 +    gmx_hw_opt_t *hw_opt;
 +    FILE *fplog;
 +    t_commrec *cr;
 +    int nfile;
 +    const t_filenm *fnm;
 +    output_env_t oenv;
 +    gmx_bool bVerbose;
 +    gmx_bool bCompact;
 +    int nstglobalcomm;
 +    ivec ddxyz;
 +    int dd_node_order;
 +    real rdd;
 +    real rconstr;
 +    const char *dddlb_opt;
 +    real dlb_scale;
 +    const char *ddcsx;
 +    const char *ddcsy;
 +    const char *ddcsz;
 +    const char *nbpu_opt;
 +    int nsteps_cmdline;
 +    int nstepout;
 +    int resetstep;
 +    int nmultisim;
 +    int repl_ex_nst;
 +    int repl_ex_nex;
 +    int repl_ex_seed;
 +    real pforce;
 +    real cpt_period;
 +    real max_hours;
 +    const char *deviceOptions;
 +    unsigned long Flags;
 +    int ret; /* return value */
 +};
 +
 +
 +/* The function used for spawning threads. Extracts the mdrunner() 
 +   arguments from its one argument and calls mdrunner(), after making
 +   a commrec. */
 +static void mdrunner_start_fn(void *arg)
 +{
 +    struct mdrunner_arglist *mda=(struct mdrunner_arglist*)arg;
 +    struct mdrunner_arglist mc=*mda; /* copy the arg list to make sure 
 +                                        that it's thread-local. This doesn't
 +                                        copy pointed-to items, of course,
 +                                        but those are all const. */
 +    t_commrec *cr;  /* we need a local version of this */
 +    FILE *fplog=NULL;
 +    t_filenm *fnm;
 +
 +    fnm = dup_tfn(mc.nfile, mc.fnm);
 +
 +    cr = init_par_threads(mc.cr);
 +
 +    if (MASTER(cr))
 +    {
 +        fplog=mc.fplog;
 +    }
 +
 +    mda->ret=mdrunner(mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv, 
 +                      mc.bVerbose, mc.bCompact, mc.nstglobalcomm, 
 +                      mc.ddxyz, mc.dd_node_order, mc.rdd,
 +                      mc.rconstr, mc.dddlb_opt, mc.dlb_scale, 
 +                      mc.ddcsx, mc.ddcsy, mc.ddcsz,
 +                      mc.nbpu_opt,
 +                      mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
 +                      mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce, 
 +                      mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.Flags);
 +}
 +
 +/* called by mdrunner() to start a specific number of threads (including 
 +   the main thread) for thread-parallel runs. This in turn calls mdrunner()
 +   for each thread. 
 +   All options besides nthreads are the same as for mdrunner(). */
 +static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt, 
 +              FILE *fplog,t_commrec *cr,int nfile, 
 +              const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
 +              gmx_bool bCompact, int nstglobalcomm,
 +              ivec ddxyz,int dd_node_order,real rdd,real rconstr,
 +              const char *dddlb_opt,real dlb_scale,
 +              const char *ddcsx,const char *ddcsy,const char *ddcsz,
 +              const char *nbpu_opt,
 +              int nsteps_cmdline, int nstepout,int resetstep,
 +              int nmultisim,int repl_ex_nst,int repl_ex_nex, int repl_ex_seed,
 +              real pforce,real cpt_period, real max_hours, 
 +              const char *deviceOptions, unsigned long Flags)
 +{
 +    int ret;
 +    struct mdrunner_arglist *mda;
 +    t_commrec *crn; /* the new commrec */
 +    t_filenm *fnmn;
 +
 +    /* first check whether we even need to start tMPI */
 +    if (hw_opt->nthreads_tmpi < 2)
 +    {
 +        return cr;
 +    }
 +
 +    /* a few small, one-time, almost unavoidable memory leaks: */
 +    snew(mda,1);
 +    fnmn=dup_tfn(nfile, fnm);
 +
 +    /* fill the data structure to pass as void pointer to thread start fn */
 +    mda->hw_opt=hw_opt;
 +    mda->fplog=fplog;
 +    mda->cr=cr;
 +    mda->nfile=nfile;
 +    mda->fnm=fnmn;
 +    mda->oenv=oenv;
 +    mda->bVerbose=bVerbose;
 +    mda->bCompact=bCompact;
 +    mda->nstglobalcomm=nstglobalcomm;
 +    mda->ddxyz[XX]=ddxyz[XX];
 +    mda->ddxyz[YY]=ddxyz[YY];
 +    mda->ddxyz[ZZ]=ddxyz[ZZ];
 +    mda->dd_node_order=dd_node_order;
 +    mda->rdd=rdd;
 +    mda->rconstr=rconstr;
 +    mda->dddlb_opt=dddlb_opt;
 +    mda->dlb_scale=dlb_scale;
 +    mda->ddcsx=ddcsx;
 +    mda->ddcsy=ddcsy;
 +    mda->ddcsz=ddcsz;
 +    mda->nbpu_opt=nbpu_opt;
 +    mda->nsteps_cmdline=nsteps_cmdline;
 +    mda->nstepout=nstepout;
 +    mda->resetstep=resetstep;
 +    mda->nmultisim=nmultisim;
 +    mda->repl_ex_nst=repl_ex_nst;
 +    mda->repl_ex_nex=repl_ex_nex;
 +    mda->repl_ex_seed=repl_ex_seed;
 +    mda->pforce=pforce;
 +    mda->cpt_period=cpt_period;
 +    mda->max_hours=max_hours;
 +    mda->deviceOptions=deviceOptions;
 +    mda->Flags=Flags;
 +
 +    fprintf(stderr, "Starting %d tMPI threads\n",hw_opt->nthreads_tmpi);
 +    fflush(stderr);
 +    /* now spawn new threads that start mdrunner_start_fn(), while 
 +       the main thread returns */
 +    ret=tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi,
 +                     (hw_opt->bThreadPinning ? TMPI_AFFINITY_ALL_CORES : TMPI_AFFINITY_NONE),
 +                     mdrunner_start_fn, (void*)(mda) );
 +    if (ret!=TMPI_SUCCESS)
 +        return NULL;
 +
 +    /* make a new comm_rec to reflect the new situation */
 +    crn=init_par_threads(cr);
 +    return crn;
 +}
 +
 +
 +static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
 +                                        const gmx_hw_opt_t *hw_opt,
 +                                        int nthreads_tot,
 +                                        int ngpu)
 +{
 +    int nthreads_tmpi;
 +
 +    /* There are no separate PME nodes here, as we ensured in
 +     * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
 +     * and a conditional ensures we would not have ended up here.
 +     * Note that separate PME nodes might be switched on later.
 +     */
 +    if (ngpu > 0)
 +    {
 +        nthreads_tmpi = ngpu;
 +        if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
 +        {
 +            nthreads_tmpi = nthreads_tot;
 +        }
 +    }
 +    else if (hw_opt->nthreads_omp > 0)
 +    {
 +        /* Here we could oversubscribe, when we do, we issue a warning later */
 +        nthreads_tmpi = max(1,nthreads_tot/hw_opt->nthreads_omp);
 +    }
 +    else
 +    {
 +        /* TODO choose nthreads_omp based on hardware topology
 +           when we have a hardware topology detection library */
 +        /* In general, when running up to 4 threads, OpenMP should be faster.
 +         * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
 +         * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
 +         * even on two CPUs it's usually faster (but with many OpenMP threads
 +         * it could be faster not to use HT, currently we always use HT).
 +         * On Nehalem/Westmere we want to avoid running 16 threads over
 +         * two CPUs with HT, so we need a limit<16; thus we use 12.
 +         * A reasonable limit for Intel Sandy and Ivy bridge,
 +         * not knowing the topology, is 16 threads.
 +         */
 +        const int nthreads_omp_always_faster             =  4;
 +        const int nthreads_omp_always_faster_Nehalem     = 12;
 +        const int nthreads_omp_always_faster_SandyBridge = 16;
 +        const int first_model_Nehalem     = 0x1A;
 +        const int first_model_SandyBridge = 0x2A;
 +        gmx_bool bIntel_Family6;
 +
 +        bIntel_Family6 =
 +            (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
 +             gmx_cpuid_family(hwinfo->cpuid_info) == 6);
 +
 +        if (nthreads_tot <= nthreads_omp_always_faster ||
 +            (bIntel_Family6 &&
 +             ((gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_Nehalem && nthreads_tot <= nthreads_omp_always_faster_Nehalem) ||
 +              (gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_SandyBridge && nthreads_tot <= nthreads_omp_always_faster_SandyBridge))))
 +        {
 +            /* Use pure OpenMP parallelization */
 +            nthreads_tmpi = 1;
 +        }
 +        else
 +        {
 +            /* Don't use OpenMP parallelization */
 +            nthreads_tmpi = nthreads_tot;
 +        }
 +    }
 +
 +    return nthreads_tmpi;
 +}
 +
 +
 +/* Get the number of threads to use for thread-MPI based on how many
 + * were requested, which algorithms we're using,
 + * and how many particles there are.
 + * At the point we have already called check_and_update_hw_opt.
 + * Thus all options should be internally consistent and consistent
 + * with the hardware, except that ntmpi could be larger than #GPU.
 + */
 +static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
 +                            gmx_hw_opt_t *hw_opt,
 +                            t_inputrec *inputrec, gmx_mtop_t *mtop,
 +                            const t_commrec *cr,
 +                            FILE *fplog)
 +{
 +    int nthreads_hw,nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu;
 +    int min_atoms_per_mpi_thread;
 +    char *env;
 +    char sbuf[STRLEN];
 +    gmx_bool bCanUseGPU;
 +
 +    if (hw_opt->nthreads_tmpi > 0)
 +    {
 +        /* Trivial, return right away */
 +        return hw_opt->nthreads_tmpi;
 +    }
 +
 +    nthreads_hw = hwinfo->nthreads_hw_avail;
 +
 +    /* How many total (#tMPI*#OpenMP) threads can we start? */ 
 +    if (hw_opt->nthreads_tot > 0)
 +    {
 +        nthreads_tot_max = hw_opt->nthreads_tot;
 +    }
 +    else
 +    {
 +        nthreads_tot_max = nthreads_hw;
 +    }
 +
 +    bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU);
 +    if (bCanUseGPU)
 +    {
 +        ngpu = hwinfo->gpu_info.ncuda_dev_use;
 +    }
 +    else
 +    {
 +        ngpu = 0;
 +    }
 +
 +    nthreads_tmpi =
 +        get_tmpi_omp_thread_division(hwinfo,hw_opt,nthreads_tot_max,ngpu);
 +
 +    if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
 +    {
 +        /* Steps are divided over the nodes iso splitting the atoms */
 +        min_atoms_per_mpi_thread = 0;
 +    }
 +    else
 +    {
 +        if (bCanUseGPU)
 +        {
 +            min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU;
 +        }
 +        else
 +        {
 +            min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD;
 +        }
 +    }
 +
 +    /* Check if an algorithm does not support parallel simulation.  */
 +    if (nthreads_tmpi != 1 &&
 +        ( inputrec->eI == eiLBFGS ||
 +          inputrec->coulombtype == eelEWALD ) )
 +    {
 +        nthreads_tmpi = 1;
 +
 +        md_print_warn(cr,fplog,"The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
 +        if (hw_opt->nthreads_tmpi > nthreads_tmpi)
 +        {
 +            gmx_fatal(FARGS,"You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
 +        }
 +    }
 +    else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread)
 +    {
 +        /* the thread number was chosen automatically, but there are too many
 +           threads (too few atoms per thread) */
 +        nthreads_new = max(1,mtop->natoms/min_atoms_per_mpi_thread);
 +
 +        /* Avoid partial use of Hyper-Threading */
 +        if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
 +            nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw)
 +        {
 +            nthreads_new = nthreads_hw/2;
 +        }
 +
 +        /* Avoid large prime numbers in the thread count */
 +        if (nthreads_new >= 6)
 +        {
 +            /* Use only 6,8,10 with additional factors of 2 */
 +            int fac;
 +
 +            fac = 2;
 +            while (3*fac*2 <= nthreads_new)
 +            {
 +                fac *= 2;
 +            }
 +
 +            nthreads_new = (nthreads_new/fac)*fac;
 +        }
 +        else
 +        {
 +            /* Avoid 5 */
 +            if (nthreads_new == 5)
 +            {
 +                nthreads_new = 4;
 +            }
 +        }
 +
 +        nthreads_tmpi = nthreads_new;
 +
 +        fprintf(stderr,"\n");
 +        fprintf(stderr,"NOTE: Parallelization is limited by the small number of atoms,\n");
 +        fprintf(stderr,"      only starting %d thread-MPI threads.\n",nthreads_tmpi);
 +        fprintf(stderr,"      You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
 +    }
 +
 +    return nthreads_tmpi;
 +}
 +#endif /* GMX_THREAD_MPI */
 +
 +
 +/* Environment variable for setting nstlist */
 +static const char*  NSTLIST_ENVVAR          =  "GMX_NSTLIST";
 +/* Try to increase nstlist when using a GPU with nstlist less than this */
 +static const int    NSTLIST_GPU_ENOUGH      = 20;
 +/* Increase nstlist until the non-bonded cost increases more than this factor */
 +static const float  NBNXN_GPU_LIST_OK_FAC   = 1.25;
 +/* Don't increase nstlist beyond a non-bonded cost increase of this factor */
 +static const float  NBNXN_GPU_LIST_MAX_FAC  = 1.40;
 +
 +/* Try to increase nstlist when running on a GPU */
 +static void increase_nstlist(FILE *fp,t_commrec *cr,
 +                             t_inputrec *ir,const gmx_mtop_t *mtop,matrix box)
 +{
 +    char *env;
 +    int  nstlist_orig,nstlist_prev;
 +    verletbuf_list_setup_t ls;
 +    real rlist_inc,rlist_ok,rlist_max,rlist_new,rlist_prev;
 +    int  i;
 +    t_state state_tmp;
 +    gmx_bool bBox,bDD,bCont;
 +    const char *nstl_fmt="\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
 +    const char *vbd_err="Can not increase nstlist for GPU run because verlet-buffer-drift is not set or used";
 +    const char *box_err="Can not increase nstlist for GPU run because the box is too small";
 +    const char *dd_err ="Can not increase nstlist for GPU run because of domain decomposition limitations";
 +    char buf[STRLEN];
 +
 +    /* Number of + nstlist alternative values to try when switching  */
 +    const int nstl[]={ 20, 25, 40, 50 };
 +#define NNSTL  sizeof(nstl)/sizeof(nstl[0])
 +
 +    env = getenv(NSTLIST_ENVVAR);
 +    if (env == NULL)
 +    {
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,nstl_fmt,ir->nstlist);
 +        }
 +    }
 +
 +    if (ir->verletbuf_drift == 0)
 +    {
 +        gmx_fatal(FARGS,"You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
 +    }
 +
 +    if (ir->verletbuf_drift < 0)
 +    {
 +        if (MASTER(cr))
 +        {
 +            fprintf(stderr,"%s\n",vbd_err);
 +        }
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,"%s\n",vbd_err);
 +        }
 +
 +        return;
 +    }
 +
 +    nstlist_orig = ir->nstlist;
 +    if (env != NULL)
 +    {
 +        sprintf(buf,"Getting nstlist from environment variable GMX_NSTLIST=%s",env);
 +        if (MASTER(cr))
 +        {
 +            fprintf(stderr,"%s\n",buf);
 +        }
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,"%s\n",buf);
 +        }
 +        sscanf(env,"%d",&ir->nstlist);
 +    }
 +
 +    verletbuf_get_list_setup(TRUE,&ls);
 +
 +    /* Allow rlist to make the list double the size of the cut-off sphere */
 +    rlist_inc = nbnxn_get_rlist_effective_inc(NBNXN_GPU_CLUSTER_SIZE,mtop->natoms/det(box));
 +    rlist_ok  = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_OK_FAC,1.0/3.0) - rlist_inc;
 +    rlist_max = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_MAX_FAC,1.0/3.0) - rlist_inc;
 +    if (debug)
 +    {
 +        fprintf(debug,"GPU nstlist tuning: rlist_inc %.3f rlist_max %.3f\n",
 +                rlist_inc,rlist_max);
 +    }
 +
 +    i = 0;
 +    nstlist_prev = nstlist_orig;
 +    rlist_prev   = ir->rlist;
 +    do
 +    {
 +        if (env == NULL)
 +        {
 +            ir->nstlist = nstl[i];
 +        }
 +
 +        /* Set the pair-list buffer size in ir */
 +        calc_verlet_buffer_size(mtop,det(box),ir,ir->verletbuf_drift,&ls,
 +                                NULL,&rlist_new);
 +
 +        /* Does rlist fit in the box? */
 +        bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC,box));
 +        bDD  = TRUE;
 +        if (bBox && DOMAINDECOMP(cr))
 +        {
 +            /* Check if rlist fits in the domain decomposition */
 +            if (inputrec2nboundeddim(ir) < DIM)
 +            {
 +                gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
 +            }
 +            copy_mat(box,state_tmp.box);
 +            bDD = change_dd_cutoff(cr,&state_tmp,ir,rlist_new);
 +        }
 +
 +        bCont = FALSE;
 +
 +        if (env == NULL)
 +        {
 +            if (bBox && bDD && rlist_new <= rlist_max)
 +            {
 +                /* Increase nstlist */
 +                nstlist_prev = ir->nstlist;
 +                rlist_prev   = rlist_new;
 +                bCont = (i+1 < NNSTL && rlist_new < rlist_ok);
 +            }
 +            else
 +            {
 +                /* Stick with the previous nstlist */
 +                ir->nstlist = nstlist_prev;
 +                rlist_new   = rlist_prev;
 +                bBox = TRUE;
 +                bDD  = TRUE;
 +            }
 +        }
 +
 +        i++;
 +    }
 +    while (bCont);
 +
 +    if (!bBox || !bDD)
 +    {
 +        gmx_warning(!bBox ? box_err : dd_err);
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,"\n%s\n",bBox ? box_err : dd_err);
 +        }
 +        ir->nstlist = nstlist_orig;
 +    }
 +    else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
 +    {
 +        sprintf(buf,"Changing nstlist from %d to %d, rlist from %g to %g",
 +                nstlist_orig,ir->nstlist,
 +                ir->rlist,rlist_new);
 +        if (MASTER(cr))
 +        {
 +            fprintf(stderr,"%s\n\n",buf);
 +        }
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,"%s\n\n",buf);
 +        }
 +        ir->rlist     = rlist_new;
 +        ir->rlistlong = rlist_new;
 +    }
 +}
 +
 +static void prepare_verlet_scheme(FILE *fplog,
 +                                  gmx_hw_info_t *hwinfo,
 +                                  t_commrec *cr,
 +                                  gmx_hw_opt_t *hw_opt,
 +                                  const char *nbpu_opt,
 +                                  t_inputrec *ir,
 +                                  const gmx_mtop_t *mtop,
 +                                  matrix box,
 +                                  gmx_bool *bUseGPU)
 +{
 +    /* Here we only check for GPU usage on the MPI master process,
 +     * as here we don't know how many GPUs we will use yet.
 +     * We check for a GPU on all processes later.
 +     */
 +    *bUseGPU = hwinfo->bCanUseGPU || (getenv("GMX_EMULATE_GPU") != NULL);
 +
 +    if (ir->verletbuf_drift > 0)
 +    {
 +        /* Update the Verlet buffer size for the current run setup */
 +        verletbuf_list_setup_t ls;
 +        real rlist_new;
 +
 +        /* Here we assume CPU acceleration is on. But as currently
 +         * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
 +         * and 4x2 gives a larger buffer than 4x4, this is ok.
 +         */
 +        verletbuf_get_list_setup(*bUseGPU,&ls);
 +
 +        calc_verlet_buffer_size(mtop,det(box),ir,
 +                                ir->verletbuf_drift,&ls,
 +                                NULL,&rlist_new);
 +        if (rlist_new != ir->rlist)
 +        {
 +            if (fplog != NULL)
 +            {
 +                fprintf(fplog,"\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
 +                        ir->rlist,rlist_new,
 +                        ls.cluster_size_i,ls.cluster_size_j);
 +            }
 +            ir->rlist     = rlist_new;
 +            ir->rlistlong = rlist_new;
 +        }
 +    }
 +
 +    /* With GPU or emulation we should check nstlist for performance */
 +    if ((EI_DYNAMICS(ir->eI) &&
 +         *bUseGPU &&
 +         ir->nstlist < NSTLIST_GPU_ENOUGH) ||
 +        getenv(NSTLIST_ENVVAR) != NULL)
 +    {
 +        /* Choose a better nstlist */
 +        increase_nstlist(fplog,cr,ir,mtop,box);
 +    }
 +}
 +
 +static void convert_to_verlet_scheme(FILE *fplog,
 +                                     t_inputrec *ir,
 +                                     gmx_mtop_t *mtop,real box_vol)
 +{
 +    char *conv_mesg="Converting input file with group cut-off scheme to the Verlet cut-off scheme";
 +
 +    md_print_warn(NULL,fplog,"%s\n",conv_mesg);
 +
 +    ir->cutoff_scheme   = ecutsVERLET;
 +    ir->verletbuf_drift = 0.005;
 +
 +    if (ir->rcoulomb != ir->rvdw)
 +    {
 +        gmx_fatal(FARGS,"The VdW and Coulomb cut-offs are different, whereas the Verlet scheme only supports equal cut-offs");
 +    }
 +
 +    if (ir->vdwtype == evdwUSER || EEL_USER(ir->coulombtype))
 +    {
 +        gmx_fatal(FARGS,"User non-bonded potentials are not (yet) supported with the Verlet scheme");
 +    }
 +    else if (EVDW_SWITCHED(ir->vdwtype) || EEL_SWITCHED(ir->coulombtype))
 +    {
 +        md_print_warn(NULL,fplog,"Converting switched or shifted interactions to a shifted potential (without force shift), this will lead to slightly different interaction potentials");
 +
 +        if (EVDW_SWITCHED(ir->vdwtype))
 +        {
 +            ir->vdwtype = evdwCUT;
 +        }
 +        if (EEL_SWITCHED(ir->coulombtype))
 +        {
 +            if (EEL_FULL(ir->coulombtype))
 +            {
 +                /* With full electrostatic only PME can be switched */
 +                ir->coulombtype = eelPME;
 +            }
 +            else
 +            {
 +                md_print_warn(NULL,fplog,"NOTE: Replacing %s electrostatics with reaction-field with epsilon-rf=inf\n",eel_names[ir->coulombtype]);
 +                ir->coulombtype = eelRF;
 +                ir->epsilon_rf  = 0.0;
 +            }
 +        }
 +
 +        /* We set the target energy drift to a small number.
 +         * Note that this is only for testing. For production the user
 +         * should think about this and set the mdp options.
 +         */
 +        ir->verletbuf_drift = 1e-4;
 +    }
 +
 +    if (inputrec2nboundeddim(ir) != 3)
 +    {
 +        gmx_fatal(FARGS,"Can only convert old tpr files to the Verlet cut-off scheme with 3D pbc");
 +    }
 +
 +    if (ir->efep != efepNO || ir->implicit_solvent != eisNO)
 +    {
 +        gmx_fatal(FARGS,"Will not convert old tpr files to the Verlet cut-off scheme with free-energy calculations or implicit solvent");
 +    }
 +
 +    if (EI_DYNAMICS(ir->eI) && !(EI_MD(ir->eI) && ir->etc == etcNO))
 +    {
 +        verletbuf_list_setup_t ls;
 +
 +        verletbuf_get_list_setup(FALSE,&ls);
 +        calc_verlet_buffer_size(mtop,box_vol,ir,ir->verletbuf_drift,&ls,
 +                                NULL,&ir->rlist);
 +    }
 +    else
 +    {
 +        ir->verletbuf_drift = -1;
 +        ir->rlist           = 1.05*max(ir->rvdw,ir->rcoulomb);
 +    }
 +
 +    gmx_mtop_remove_chargegroups(mtop);
 +}
 +
 +/* Check the process affinity mask and if it is found to be non-zero,
 + * will honor it and disable mdrun internal affinity setting.
 + * This function should be called first before the OpenMP library gets
 + * initialized with the last argument FALSE (which will detect affinity
 + * set by external tools like taskset), and later, after the OpenMP
 + * initialization, with the last argument TRUE to detect affinity changes
 + * made by the OpenMP library.
 + *
 + * Note that this will only work on Linux as we use a GNU feature. */
 +static void check_cpu_affinity_set(FILE *fplog, const t_commrec *cr,
 +                                   gmx_hw_opt_t *hw_opt, int ncpus,
 +                                   gmx_bool bAfterOpenmpInit)
 +{
 +#ifdef HAVE_SCHED_GETAFFINITY
 +    cpu_set_t mask_current;
 +    int       i, ret, cpu_count, cpu_set;
 +    gmx_bool  bAllSet;
 +
 +    assert(hw_opt);
 +    if (!hw_opt->bThreadPinning)
 +    {
 +        /* internal affinity setting is off, don't bother checking process affinity */
 +        return;
 +    }
 +
 +    CPU_ZERO(&mask_current);
 +    if ((ret = sched_getaffinity(0, sizeof(cpu_set_t), &mask_current)) != 0)
 +    {
 +        /* failed to query affinity mask, will just return */
 +        if (debug)
 +        {
 +            fprintf(debug, "Failed to query affinity mask (error %d)", ret);
 +        }
 +        return;
 +    }
 +
 +    /* Before proceeding with the actual check, make sure that the number of
 +     * detected CPUs is >= the CPUs in the current set.
 +     * We need to check for CPU_COUNT as it was added only in glibc 2.6. */
 +#ifdef CPU_COUNT
 +    if (ncpus < CPU_COUNT(&mask_current))
 +    {
 +        if (debug)
 +        {
 +            fprintf(debug, "%d CPUs detected, but %d was returned by CPU_COUNT",
 +                    ncpus, CPU_COUNT(&mask_current));
 +        }
 +        return;
 +    }
 +#endif /* CPU_COUNT */
 +
 +    bAllSet = TRUE;
 +    for (i = 0; (i < ncpus && i < CPU_SETSIZE); i++)
 +    {
 +        bAllSet = bAllSet && (CPU_ISSET(i, &mask_current) != 0);
 +    }
 +
 +    if (!bAllSet)
 +    {
 +        if (!bAfterOpenmpInit)
 +        {
 +            md_print_warn(cr, fplog,
 +                          "Non-default process affinity set, disabling internal affinity");
 +        }
 +        else
 +        {
 +            md_print_warn(cr, fplog,
 +                          "Non-default process affinity set probably by the OpenMP library, "
 +                          "disabling internal affinity");
 +        }
 +        hw_opt->bThreadPinning = FALSE;
 +
 +        if (debug)
 +        {
 +            fprintf(debug, "Non-default affinity mask found\n");
 +        }
 +    }
 +    else
 +    {
 +        if (debug)
 +        {
 +            fprintf(debug, "Default affinity mask found\n");
 +        }
 +    }
 +#endif /* HAVE_SCHED_GETAFFINITY */
 +}
 +
 +/* Set CPU affinity. Can be important for performance.
 +   On some systems (e.g. Cray) CPU Affinity is set by default.
 +   But default assigning doesn't work (well) with only some ranks
 +   having threads. This causes very low performance.
 +   External tools have cumbersome syntax for setting affinity
 +   in the case that only some ranks have threads.
 +   Thus it is important that GROMACS sets the affinity internally
 +   if only PME is using threads.
 +*/
 +static void set_cpu_affinity(FILE *fplog,
 +                             const t_commrec *cr,
 +                             gmx_hw_opt_t *hw_opt,
 +                             int nthreads_pme,
 +                             const gmx_hw_info_t *hwinfo,
 +                             const t_inputrec *inputrec)
 +{
 +#if defined GMX_THREAD_MPI
 +    /* With the number of TMPI threads equal to the number of cores
 +     * we already pinned in thread-MPI, so don't pin again here.
 +     */
 +    if (hw_opt->nthreads_tmpi == tMPI_Thread_get_hw_number())
 +    {
 +        return;
 +    }
 +#endif
 +
 +#ifndef __APPLE__
 +    /* If the tMPI thread affinity setting is not supported encourage the user
 +     * to report it as it's either a bug or an exotic platform which we might
 +     * want to support. */
 +    if (tMPI_Thread_setaffinity_support() != TMPI_SETAFFINITY_SUPPORT_YES)
 +    {
 +        md_print_warn(NULL, fplog,
 +                      "Can not set thread affinities on the current plarform. On NUMA systems this\n"
 +                      "can cause performance degradation. If you think your platform should support\n"
 +                      "setting affinities, contact the GROMACS developers.");
 +        return;
 +    }
 +#endif /* __APPLE__ */
 +
 +    if (hw_opt->bThreadPinning)
 +    {
 +        int nth_affinity_set, thread_id_node, thread_id,
 +            nthread_local, nthread_node, nthread_hw_max, nphyscore;
 +        int offset;
 +        char *env;
 +
 +        /* threads on this MPI process or TMPI thread */
 +        if (cr->duty & DUTY_PP)
 +        {
 +            nthread_local = gmx_omp_nthreads_get(emntNonbonded);
 +        }
 +        else
 +        {
 +            nthread_local = gmx_omp_nthreads_get(emntPME);
 +        }
 +
 +        /* map the current process to cores */
 +        thread_id_node = 0;
 +        nthread_node = nthread_local;
 +#ifdef GMX_MPI
 +        if (PAR(cr) || MULTISIM(cr))
 +        {
 +            /* We need to determine a scan of the thread counts in this
 +             * compute node.
 +             */
-             process_index = cr->nodeid_intra;
-             if (MULTISIM(cr))
-             {
-                 /* To simplify the code, we shift process indices by nnodes.
-                  * There might be far less processes, but that doesn't matter.
-                  */
-                 process_index += cr->ms->sim*cr->nnodes;
-             }
-             MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),process_index,
 +            MPI_Comm comm_intra;
 +
-                 fprintf(debug, "On node %d, thread %d the affinity setting returned %d\n",
-                         cr->nodeid, gmx_omp_get_thread_num(), setaffinity_ret);
++            MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),cr->rank_intranode,
 +                           &comm_intra);
 +            MPI_Scan(&nthread_local,&thread_id_node,1,MPI_INT,MPI_SUM,comm_intra);
 +            /* MPI_Scan is inclusive, but here we need exclusive */
 +            thread_id_node -= nthread_local;
 +            /* Get the total number of threads on this physical node */
 +            MPI_Allreduce(&nthread_local,&nthread_node,1,MPI_INT,MPI_SUM,comm_intra);
 +            MPI_Comm_free(&comm_intra);
 +        }
 +#endif
 +
 +        offset = 0;
 +        if (hw_opt->core_pinning_offset > 0)
 +        {
 +            offset = hw_opt->core_pinning_offset;
 +            if (SIMMASTER(cr))
 +            {
 +                fprintf(stderr, "Applying core pinning offset %d\n", offset);
 +            }
 +            if (fplog)
 +            {
 +                fprintf(fplog, "Applying core pinning offset %d\n", offset);
 +            }
 +        }
 +
 +        /* With Intel Hyper-Threading enabled, we want to pin consecutive
 +         * threads to physical cores when using more threads than physical
 +         * cores or when the user requests so.
 +         */
 +        nthread_hw_max = hwinfo->nthreads_hw_avail;
 +        nphyscore = -1;
 +        if (hw_opt->bPinHyperthreading ||
 +            (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
 +             nthread_node > nthread_hw_max/2 && getenv("GMX_DISABLE_PINHT") == NULL))
 +        {
 +            if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) != GMX_CPUID_X86_SMT_ENABLED)
 +            {
 +                /* We print to stderr on all processes, as we might have
 +                 * different settings on different physical nodes.
 +                 */
 +                if (gmx_cpuid_vendor(hwinfo->cpuid_info) != GMX_CPUID_VENDOR_INTEL)
 +                {
 +                    md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
 +                                  "but non-Intel CPU detected (vendor: %s)\n",
 +                                  gmx_cpuid_vendor_string[gmx_cpuid_vendor(hwinfo->cpuid_info)]);
 +                }
 +                else
 +                {
 +                    md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
 +                                  "but the CPU detected does not have Intel Hyper-Threading support "
 +                                  "(or it is turned off)\n");
 +                }
 +            }
 +            nphyscore = nthread_hw_max/2;
 +
 +            if (SIMMASTER(cr))
 +            {
 +                fprintf(stderr, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
 +                        nphyscore);
 +            }
 +            if (fplog)
 +            {
 +                fprintf(fplog, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
 +                        nphyscore);
 +            }
 +        }
 +
 +        /* Set the per-thread affinity. In order to be able to check the success
 +         * of affinity settings, we will set nth_affinity_set to 1 on threads
 +         * where the affinity setting succeded and to 0 where it failed.
 +         * Reducing these 0/1 values over the threads will give the total number
 +         * of threads on which we succeeded.
 +         */
 +         nth_affinity_set = 0;
 +#pragma omp parallel firstprivate(thread_id_node) num_threads(nthread_local) \
 +                     reduction(+:nth_affinity_set)
 +        {
 +            int      core;
 +            gmx_bool setaffinity_ret;
 +
 +            thread_id       = gmx_omp_get_thread_num();
 +            thread_id_node += thread_id;
 +            if (nphyscore <= 0)
 +            {
 +                core = offset + thread_id_node;
 +            }
 +            else
 +            {
 +                /* Lock pairs of threads to the same hyperthreaded core */
 +                core = offset + thread_id_node/2 + (thread_id_node % 2)*nphyscore;
 +            }
 +
 +            setaffinity_ret = tMPI_Thread_setaffinity_single(tMPI_Thread_self(), core);
 +
 +            /* store the per-thread success-values of the setaffinity */
 +            nth_affinity_set = (setaffinity_ret == 0);
 +
 +            if (debug)
 +            {
-     /* Initialize per-node process ID and counters. */
-     gmx_init_intra_counters(cr);
++                fprintf(debug, "On rank %2d, thread %2d, core %2d the affinity setting returned %d\n",
++                        cr->nodeid, gmx_omp_get_thread_num(), core, setaffinity_ret);
 +            }
 +        }
 +
 +        if (nth_affinity_set > nthread_local)
 +        {
 +            char msg[STRLEN];
 +
 +            sprintf(msg, "Looks like we have set affinity for more threads than "
 +                    "we have (%d > %d)!\n", nth_affinity_set, nthread_local);
 +            gmx_incons(msg);
 +        }
 +        else
 +        {
 +            /* check if some threads failed to set their affinities */
 +            if (nth_affinity_set != nthread_local)
 +            {
 +                char sbuf[STRLEN];
 +                sbuf[0] = '\0';
 +#ifdef GMX_MPI
 +#ifdef GMX_THREAD_MPI
 +                sprintf(sbuf, "In thread-MPI thread #%d", cr->nodeid);
 +#else /* GMX_LIB_MPI */
 +#endif
 +                sprintf(sbuf, "In MPI process #%d", cr->nodeid);
 +#endif /* GMX_MPI */
 +                md_print_warn(NULL, fplog,
 +                              "%s%d/%d thread%s failed to set their affinities. "
 +                              "This can cause performance degradation!",
 +                              sbuf, nthread_local - nth_affinity_set, nthread_local,
 +                              (nthread_local - nth_affinity_set) > 1 ? "s" : "");
 +            }
 +        }
 +    }
 +}
 +
 +
 +static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt,
 +                                    int cutoff_scheme)
 +{
 +    gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp);
 +
 +#ifndef GMX_THREAD_MPI
 +    if (hw_opt->nthreads_tot > 0)
 +    {
 +        gmx_fatal(FARGS,"Setting the total number of threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
 +    }
 +    if (hw_opt->nthreads_tmpi > 0)
 +    {
 +        gmx_fatal(FARGS,"Setting the number of thread-MPI threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
 +    }
 +#endif
 +
 +    if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
 +    {
 +        /* We have the same number of OpenMP threads for PP and PME processes,
 +         * thus we can perform several consistency checks.
 +         */
 +        if (hw_opt->nthreads_tmpi > 0 &&
 +            hw_opt->nthreads_omp > 0 &&
 +            hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
 +        {
 +            gmx_fatal(FARGS,"The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
 +                      hw_opt->nthreads_tot,hw_opt->nthreads_tmpi,hw_opt->nthreads_omp);
 +        }
 +
 +        if (hw_opt->nthreads_tmpi > 0 &&
 +            hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
 +        {
 +            gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
 +                      hw_opt->nthreads_tot,hw_opt->nthreads_tmpi);
 +        }
 +
 +        if (hw_opt->nthreads_omp > 0 &&
 +            hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
 +        {
 +            gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
 +                      hw_opt->nthreads_tot,hw_opt->nthreads_omp);
 +        }
 +
 +        if (hw_opt->nthreads_tmpi > 0 &&
 +            hw_opt->nthreads_omp <= 0)
 +        {
 +            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
 +        }
 +    }
 +
 +#ifndef GMX_OPENMP
 +    if (hw_opt->nthreads_omp > 1)
 +    {
 +        gmx_fatal(FARGS,"OpenMP threads are requested, but Gromacs was compiled without OpenMP support");
 +    }
 +#endif
 +
 +    if (cutoff_scheme == ecutsGROUP)
 +    {
 +        /* We only have OpenMP support for PME only nodes */
 +        if (hw_opt->nthreads_omp > 1)
 +        {
 +            gmx_fatal(FARGS,"OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
 +                      ecutscheme_names[cutoff_scheme],
 +                      ecutscheme_names[ecutsVERLET]);
 +        }
 +        hw_opt->nthreads_omp = 1;
 +    }
 +
 +    if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
 +    {
 +        gmx_fatal(FARGS,"You need to specify -ntomp in addition to -ntomp_pme");
 +    }
 +
 +    if (hw_opt->nthreads_tot == 1)
 +    {
 +        hw_opt->nthreads_tmpi = 1;
 +
 +        if (hw_opt->nthreads_omp > 1)
 +        {
 +            gmx_fatal(FARGS,"You requested %d OpenMP threads with %d total threads",
 +                      hw_opt->nthreads_tmpi,hw_opt->nthreads_tot);
 +        }
 +        hw_opt->nthreads_omp = 1;
 +    }
 +
 +    if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
 +    {
 +        hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
 +                hw_opt->nthreads_tot,
 +                hw_opt->nthreads_tmpi,
 +                hw_opt->nthreads_omp,
 +                hw_opt->nthreads_omp_pme,
 +                hw_opt->gpu_id!=NULL ? hw_opt->gpu_id : "");
 +                
 +    }
 +}
 +
 +
 +/* Override the value in inputrec with value passed on the command line (if any) */
 +static void override_nsteps_cmdline(FILE *fplog,
 +                                    int nsteps_cmdline,
 +                                    t_inputrec *ir,
 +                                    const t_commrec *cr)
 +{
 +    assert(ir);
 +    assert(cr);
 +
 +    /* override with anything else than the default -2 */
 +    if (nsteps_cmdline > -2)
 +    {
 +        char stmp[STRLEN];
 +
 +        ir->nsteps = nsteps_cmdline;
 +        if (EI_DYNAMICS(ir->eI))
 +        {
 +            sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps, %.3f ps",
 +                    nsteps_cmdline, nsteps_cmdline*ir->delta_t);
 +        }
 +        else
 +        {
 +            sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps",
 +                    nsteps_cmdline);
 +        }
 +
 +        md_print_warn(cr, fplog, "%s\n", stmp);
 +    }
 +}
 +
 +/* Data structure set by SIMMASTER which needs to be passed to all nodes
 + * before the other nodes have read the tpx file and called gmx_detect_hardware.
 + * The field order matters: the struct is initialized positionally
 + * (cutoff_scheme first, bUseGPU second).
 + */
 +typedef struct {
 +    int cutoff_scheme; /* The cutoff scheme, copied from the master's t_inputrec */
 +    gmx_bool bUseGPU;       /* Use GPU or GPU emulation          */
 +} master_inf_t;
 +
 +int mdrunner(gmx_hw_opt_t *hw_opt,
 +             FILE *fplog,t_commrec *cr,int nfile,
 +             const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
 +             gmx_bool bCompact, int nstglobalcomm,
 +             ivec ddxyz,int dd_node_order,real rdd,real rconstr,
 +             const char *dddlb_opt,real dlb_scale,
 +             const char *ddcsx,const char *ddcsy,const char *ddcsz,
 +             const char *nbpu_opt,
 +             int nsteps_cmdline, int nstepout,int resetstep,
 +             int nmultisim,int repl_ex_nst,int repl_ex_nex,
 +             int repl_ex_seed, real pforce,real cpt_period,real max_hours,
 +             const char *deviceOptions, unsigned long Flags)
 +{
 +    gmx_bool   bForceUseGPU,bTryUseGPU;
 +    double     nodetime=0,realtime;
 +    t_inputrec *inputrec;
 +    t_state    *state=NULL;
 +    matrix     box;
 +    gmx_ddbox_t ddbox={0};
 +    int        npme_major,npme_minor;
 +    real       tmpr1,tmpr2;
 +    t_nrnb     *nrnb;
 +    gmx_mtop_t *mtop=NULL;
 +    t_mdatoms  *mdatoms=NULL;
 +    t_forcerec *fr=NULL;
 +    t_fcdata   *fcd=NULL;
 +    real       ewaldcoeff=0;
 +    gmx_pme_t  *pmedata=NULL;
 +    gmx_vsite_t *vsite=NULL;
 +    gmx_constr_t constr;
 +    int        i,m,nChargePerturbed=-1,status,nalloc;
 +    char       *gro;
 +    gmx_wallcycle_t wcycle;
 +    gmx_bool       bReadRNG,bReadEkin;
 +    int        list;
 +    gmx_runtime_t runtime;
 +    int        rc;
 +    gmx_large_int_t reset_counters;
 +    gmx_edsam_t ed=NULL;
 +    t_commrec   *cr_old=cr; 
 +    int         nthreads_pme=1;
 +    int         nthreads_pp=1;
 +    gmx_membed_t membed=NULL;
 +    gmx_hw_info_t *hwinfo=NULL;
 +    master_inf_t minf={-1,FALSE};
 +
 +    /* CAUTION: threads may be started later on in this function, so
 +       cr doesn't reflect the final parallel state right now */
 +    snew(inputrec,1);
 +    snew(mtop,1);
 +    
 +    if (Flags & MD_APPENDFILES) 
 +    {
 +        fplog = NULL;
 +    }
 +
 +    bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
 +    bTryUseGPU   = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
 +
 +    snew(state,1);
 +    if (SIMMASTER(cr)) 
 +    {
 +        /* Read (nearly) all data required for the simulation */
 +        read_tpx_state(ftp2fn(efTPX,nfile,fnm),inputrec,state,NULL,mtop);
 +
 +        if (inputrec->cutoff_scheme != ecutsVERLET &&
 +            ((Flags & MD_TESTVERLET) || getenv("GMX_VERLET_SCHEME") != NULL))
 +        {
 +            convert_to_verlet_scheme(fplog,inputrec,mtop,det(state->box));
 +        }
 +
 +        /* Detect hardware, gather information. With tMPI only thread 0 does it
 +         * and after threads are started broadcasts hwinfo around. */
 +        snew(hwinfo, 1);
 +        gmx_detect_hardware(fplog, hwinfo, cr,
 +                            bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
 +
 +        minf.cutoff_scheme = inputrec->cutoff_scheme;
 +        minf.bUseGPU       = FALSE;
 +
 +        if (inputrec->cutoff_scheme == ecutsVERLET)
 +        {
 +            prepare_verlet_scheme(fplog,hwinfo,cr,hw_opt,nbpu_opt,
 +                                  inputrec,mtop,state->box,
 +                                  &minf.bUseGPU);
 +        }
 +        else if (hwinfo->bCanUseGPU)
 +        {
 +            md_print_warn(cr,fplog,
 +                          "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
 +                          "      To use a GPU, set the mdp option: cutoff-scheme = Verlet\n"
 +                          "      (for quick performance testing you can use the -testverlet option)\n");
 +
 +            if (bForceUseGPU)
 +            {
 +                gmx_fatal(FARGS,"GPU requested, but can't be used without cutoff-scheme=Verlet");
 +            }
 +        }
 +    }
 +#ifndef GMX_THREAD_MPI
 +    if (PAR(cr))
 +    {
 +        gmx_bcast_sim(sizeof(minf),&minf,cr);
 +    }
 +#endif
 +    if (minf.bUseGPU && cr->npmenodes == -1)
 +    {
 +        /* Don't automatically use PME-only nodes with GPUs */
 +        cr->npmenodes = 0;
 +    }
 +
 +    /* Check for externally set OpenMP affinity and turn off internal
 +     * pinning if any is found. We need to do this check early to tell
 +     * thread-MPI whether it should do pinning when spawning threads.
 +     */
 +    gmx_omp_check_thread_affinity(fplog, cr, hw_opt);
 +
 +#ifdef GMX_THREAD_MPI
 +    /* With thread-MPI inputrec is only set here on the master thread */
 +    if (SIMMASTER(cr))
 +#endif
 +    {
 +        check_and_update_hw_opt(hw_opt,minf.cutoff_scheme);
 +
 +#ifdef GMX_THREAD_MPI
 +        /* Early check for externally set process affinity. Can't do over all
 +         * MPI processes because hwinfo is not available everywhere, but with
 +         * thread-MPI it's needed as pinning might get turned off which needs
 +         * to be known before starting thread-MPI. */
 +        check_cpu_affinity_set(fplog,
 +                               NULL,
 +                               hw_opt, hwinfo->nthreads_hw_avail, FALSE);
 +#endif
 +
 +#ifdef GMX_THREAD_MPI
 +        if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
 +        {
 +            gmx_fatal(FARGS,"You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME nodes");
 +        }
 +#endif
 +
 +        if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
 +            cr->npmenodes <= 0)
 +        {
 +            gmx_fatal(FARGS,"You need to explicitly specify the number of PME nodes (-npme) when using different number of OpenMP threads for PP and PME nodes");
 +        }
 +    }
 +
 +#ifdef GMX_THREAD_MPI
 +    if (SIMMASTER(cr))
 +    {
 +        /* NOW the threads will be started: */
 +        hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
 +                                                 hw_opt,
 +                                                 inputrec, mtop,
 +                                                 cr, fplog);
 +        if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
 +        {
 +            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
 +        }
 +
 +        if (hw_opt->nthreads_tmpi > 1)
 +        {
 +            /* now start the threads. */
 +            cr=mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm, 
 +                                      oenv, bVerbose, bCompact, nstglobalcomm, 
 +                                      ddxyz, dd_node_order, rdd, rconstr, 
 +                                      dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
 +                                      nbpu_opt,
 +                                      nsteps_cmdline, nstepout, resetstep, nmultisim, 
 +                                      repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
 +                                      cpt_period, max_hours, deviceOptions, 
 +                                      Flags);
 +            /* the main thread continues here with a new cr. We don't deallocate
 +               the old cr because other threads may still be reading it. */
 +            if (cr == NULL)
 +            {
 +                gmx_comm("Failed to spawn threads");
 +            }
 +        }
 +    }
 +#endif
 +    /* END OF CAUTION: cr is now reliable */
 +
 +    /* g_membed initialisation *
 +     * Because we change the mtop, init_membed is called before the init_parallel *
 +     * (in case we ever want to make it run in parallel) */
 +    if (opt2bSet("-membed",nfile,fnm))
 +    {
 +        if (MASTER(cr))
 +        {
 +            fprintf(stderr,"Initializing membed");
 +        }
 +        membed = init_membed(fplog,nfile,fnm,mtop,inputrec,state,cr,&cpt_period);
 +    }
 +
 +    if (PAR(cr))
 +    {
 +        /* now broadcast everything to the non-master nodes/threads: */
 +        init_parallel(fplog, cr, inputrec, mtop);
 +
 +        /* This check needs to happen after get_nthreads_mpi() */
 +        if (inputrec->cutoff_scheme == ecutsVERLET && (Flags & MD_PARTDEC))
 +        {
 +            gmx_fatal_collective(FARGS,cr,NULL,
 +                                 "The Verlet cut-off scheme is not supported with particle decomposition.\n"
 +                                 "You can achieve the same effect as particle decomposition by running in parallel using only OpenMP threads.");
 +        }
 +    }
 +    if (fplog != NULL)
 +    {
 +        pr_inputrec(fplog,0,"Input Parameters",inputrec,FALSE);
 +    }
 +
 +#if defined GMX_THREAD_MPI
 +    /* With tMPI we detected on thread 0 and we'll just pass the hwinfo pointer
 +     * to the other threads  -- slightly uncool, but works fine, just need to
 +     * make sure that the data doesn't get freed twice. */
 +    if (cr->nnodes > 1)
 +    {
 +        if (!SIMMASTER(cr))
 +        {
 +            snew(hwinfo, 1);
 +        }
 +        gmx_bcast(sizeof(&hwinfo), &hwinfo, cr);
 +    }
 +#else
 +    if (PAR(cr) && !SIMMASTER(cr))
 +    {
 +        /* now we have inputrec on all nodes, can run the detection */
 +        /* TODO: perhaps it's better to propagate within a node instead? */
 +        snew(hwinfo, 1);
 +        gmx_detect_hardware(fplog, hwinfo, cr,
 +                                 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
 +    }
 +
 +    /* Now do the affinity check with MPI/no-MPI (done earlier with thread-MPI). */
 +    check_cpu_affinity_set(fplog, cr,
 +                           hw_opt, hwinfo->nthreads_hw_avail, FALSE);
 +#endif
 +
 +    /* now make sure the state is initialized and propagated */
 +    set_state_entries(state,inputrec,cr->nnodes);
 +
 +    /* remove when vv and rerun works correctly! */
 +    if (PAR(cr) && EI_VV(inputrec->eI) && ((Flags & MD_RERUN) || (Flags & MD_RERUN_VSITE)))
 +    {
 +        gmx_fatal(FARGS,
 +                  "Currently can't do velocity verlet with rerun in parallel.");
 +    }
 +
 +    /* A parallel command line option consistency check that we can
 +       only do after any threads have started. */
 +    if (!PAR(cr) &&
 +        (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
 +    {
 +        gmx_fatal(FARGS,
 +                  "The -dd or -npme option request a parallel simulation, "
 +#ifndef GMX_MPI
 +                  "but %s was compiled without threads or MPI enabled"
 +#else
 +#ifdef GMX_THREAD_MPI
 +                  "but the number of threads (option -nt) is 1"
 +#else
 +                  "but %s was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec"
 +#endif
 +#endif
 +                  , ShortProgram()
 +            );
 +    }
 +
 +    if ((Flags & MD_RERUN) &&
 +        (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
 +    {
 +        gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
 +    }
 +
 +    if (can_use_allvsall(inputrec,mtop,TRUE,cr,fplog) && PAR(cr))
 +    {
 +        /* All-vs-all loops do not work with domain decomposition */
 +        Flags |= MD_PARTDEC;
 +    }
 +
 +    if (!EEL_PME(inputrec->coulombtype) || (Flags & MD_PARTDEC))
 +    {
 +        if (cr->npmenodes > 0)
 +        {
 +            if (!EEL_PME(inputrec->coulombtype))
 +            {
 +                gmx_fatal_collective(FARGS,cr,NULL,
 +                                     "PME nodes are requested, but the system does not use PME electrostatics");
 +            }
 +            if (Flags & MD_PARTDEC)
 +            {
 +                gmx_fatal_collective(FARGS,cr,NULL,
 +                                     "PME nodes are requested, but particle decomposition does not support separate PME nodes");
 +            }
 +        }
 +
 +        cr->npmenodes = 0;
 +    }
 +
 +#ifdef GMX_FAHCORE
 +    fcRegisterSteps(inputrec->nsteps,inputrec->init_step);
 +#endif
 +
 +    /* NMR restraints must be initialized before load_checkpoint,
 +     * since with time averaging the history is added to t_state.
 +     * For proper consistency check we therefore need to extend
 +     * t_state here.
 +     * So the PME-only nodes (if present) will also initialize
 +     * the distance restraints.
 +     */
 +    snew(fcd,1);
 +
 +    /* This needs to be called before read_checkpoint to extend the state */
 +    init_disres(fplog,mtop,inputrec,cr,Flags & MD_PARTDEC,fcd,state);
 +
 +    if (gmx_mtop_ftype_count(mtop,F_ORIRES) > 0)
 +    {
 +        if (PAR(cr) && !(Flags & MD_PARTDEC))
 +        {
 +            gmx_fatal(FARGS,"Orientation restraints do not work (yet) with domain decomposition, use particle decomposition (mdrun option -pd)");
 +        }
 +        /* Orientation restraints */
 +        if (MASTER(cr))
 +        {
 +            init_orires(fplog,mtop,state->x,inputrec,cr->ms,&(fcd->orires),
 +                        state);
 +        }
 +    }
 +
 +    if (DEFORM(*inputrec))
 +    {
 +        /* Store the deform reference box before reading the checkpoint */
 +        if (SIMMASTER(cr))
 +        {
 +            copy_mat(state->box,box);
 +        }
 +        if (PAR(cr))
 +        {
 +            gmx_bcast(sizeof(box),box,cr);
 +        }
 +        /* Because we do not have the update struct available yet
 +         * in which the reference values should be stored,
 +         * we store them temporarily in static variables.
 +         * This should be thread safe, since they are only written once
 +         * and with identical values.
 +         */
 +#ifdef GMX_THREAD_MPI
 +        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
 +#endif
 +        deform_init_init_step_tpx = inputrec->init_step;
 +        copy_mat(box,deform_init_box_tpx);
 +#ifdef GMX_THREAD_MPI
 +        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
 +#endif
 +    }
 +
 +    if (opt2bSet("-cpi",nfile,fnm)) 
 +    {
 +        /* Check if checkpoint file exists before doing continuation.
 +         * This way we can use identical input options for the first and subsequent runs...
 +         */
 +        if( gmx_fexist_master(opt2fn_master("-cpi",nfile,fnm,cr),cr) )
 +        {
 +            load_checkpoint(opt2fn_master("-cpi",nfile,fnm,cr),&fplog,
 +                            cr,Flags & MD_PARTDEC,ddxyz,
 +                            inputrec,state,&bReadRNG,&bReadEkin,
 +                            (Flags & MD_APPENDFILES),
 +                            (Flags & MD_APPENDFILESSET));
 +            
 +            if (bReadRNG)
 +            {
 +                Flags |= MD_READ_RNG;
 +            }
 +            if (bReadEkin)
 +            {
 +                Flags |= MD_READ_EKIN;
 +            }
 +        }
 +    }
 +
 +    if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES))
 +#ifdef GMX_THREAD_MPI
 +        /* With thread MPI only the master node/thread exists in mdrun.c,
 +         * therefore non-master nodes need to open the "seppot" log file here.
 +         */
 +        || (!MASTER(cr) && (Flags & MD_SEPPOT))
 +#endif
 +        )
 +    {
 +        gmx_log_open(ftp2fn(efLOG,nfile,fnm),cr,!(Flags & MD_SEPPOT),
 +                             Flags,&fplog);
 +    }
 +
 +    /* override nsteps with value from cmdline */
 +    override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);
 +
 +    if (SIMMASTER(cr)) 
 +    {
 +        copy_mat(state->box,box);
 +    }
 +
 +    if (PAR(cr)) 
 +    {
 +        gmx_bcast(sizeof(box),box,cr);
 +    }
 +
 +    /* Essential dynamics */
 +    if (opt2bSet("-ei",nfile,fnm))
 +    {
 +        /* Open input and output files, allocate space for ED data structure */
 +        ed = ed_open(nfile,fnm,Flags,cr);
 +    }
 +
 +    if (PAR(cr) && !((Flags & MD_PARTDEC) ||
 +                     EI_TPI(inputrec->eI) ||
 +                     inputrec->eI == eiNM))
 +    {
 +        cr->dd = init_domain_decomposition(fplog,cr,Flags,ddxyz,rdd,rconstr,
 +                                           dddlb_opt,dlb_scale,
 +                                           ddcsx,ddcsy,ddcsz,
 +                                           mtop,inputrec,
 +                                           box,state->x,
 +                                           &ddbox,&npme_major,&npme_minor);
 +
 +        make_dd_communicators(fplog,cr,dd_node_order);
 +
 +        /* Set overallocation to avoid frequent reallocation of arrays */
 +        set_over_alloc_dd(TRUE);
 +    }
 +    else
 +    {
 +        /* PME, if used, is done on all nodes with 1D decomposition */
 +        cr->npmenodes = 0;
 +        cr->duty = (DUTY_PP | DUTY_PME);
 +        npme_major = 1;
 +        npme_minor = 1;
 +        if (!EI_TPI(inputrec->eI))
 +        {
 +            npme_major = cr->nnodes;
 +        }
 +        
 +        if (inputrec->ePBC == epbcSCREW)
 +        {
 +            gmx_fatal(FARGS,
 +                      "pbc=%s is only implemented with domain decomposition",
 +                      epbc_names[inputrec->ePBC]);
 +        }
 +    }
 +
 +    if (PAR(cr))
 +    {
 +        /* After possible communicator splitting in make_dd_communicators,
 +         * we can set up the intra/inter node communication.
 +         */
 +        gmx_setup_nodecomm(fplog,cr);
 +    }
 +
++    /* Initialize per-physical-node MPI process/thread ID and counters. */
++    gmx_init_intranode_counters(cr);
 +
 +#ifdef GMX_MPI
 +    md_print_info(cr,fplog,"Using %d MPI %s\n",
 +                  cr->nnodes,
 +#ifdef GMX_THREAD_MPI
 +                  cr->nnodes==1 ? "thread" : "threads"
 +#else
 +                  cr->nnodes==1 ? "process" : "processes"
 +#endif
 +                  );
 +#endif
 +
 +    gmx_omp_nthreads_init(fplog, cr,
 +                          hwinfo->nthreads_hw_avail,
 +                          hw_opt->nthreads_omp,
 +                          hw_opt->nthreads_omp_pme,
 +                          (cr->duty & DUTY_PP) == 0,
 +                          inputrec->cutoff_scheme == ecutsVERLET);
 +
 +    gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi, minf.bUseGPU);
 +
 +    /* Get the number of PP/PME threads.
 +       PME: the env variable should be read only on one node to make sure
 +       it is identical everywhere.
 +     */
 +    /* TODO nthreads_pp is only used for pinning threads.
 +     * This is a temporary solution until we have a hw topology library.
 +     */
 +    nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
 +    nthreads_pme = gmx_omp_nthreads_get(emntPME);
 +
 +    wcycle = wallcycle_init(fplog,resetstep,cr,nthreads_pp,nthreads_pme);
 +
 +    if (PAR(cr))
 +    {
 +        /* Master synchronizes its value of reset_counters with all nodes 
 +         * including PME only nodes */
 +        reset_counters = wcycle_get_reset_counters(wcycle);
 +        gmx_bcast_sim(sizeof(reset_counters),&reset_counters,cr);
 +        wcycle_set_reset_counters(wcycle, reset_counters);
 +    }
 +
 +    snew(nrnb,1);
 +    if (cr->duty & DUTY_PP)
 +    {
 +        /* For domain decomposition we allocate dynamically
 +         * in dd_partition_system.
 +         */
 +        if (DOMAINDECOMP(cr))
 +        {
 +            bcast_state_setup(cr,state);
 +        }
 +        else
 +        {
 +            if (PAR(cr))
 +            {
 +                bcast_state(cr,state,TRUE);
 +            }
 +        }
 +
 +        /* Initiate forcerecord */
 +        fr = mk_forcerec();
 +        fr->hwinfo = hwinfo;
 +        init_forcerec(fplog,oenv,fr,fcd,inputrec,mtop,cr,box,FALSE,
 +                      opt2fn("-table",nfile,fnm),
 +                      opt2fn("-tabletf",nfile,fnm),
 +                      opt2fn("-tablep",nfile,fnm),
 +                      opt2fn("-tableb",nfile,fnm),
 +                      nbpu_opt,
 +                      FALSE,pforce);
 +
 +        /* version for PCA_NOT_READ_NODE (see md.c) */
 +        /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
 +          "nofile","nofile","nofile","nofile",FALSE,pforce);
 +          */        
 +        fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);
 +
 +        /* Initialize QM-MM */
 +        if(fr->bQMMM)
 +        {
 +            init_QMMMrec(cr,box,mtop,inputrec,fr);
 +        }
 +
 +        /* Initialize the mdatoms structure.
 +         * mdatoms is not filled with atom data,
 +         * as this can not be done now with domain decomposition.
 +         */
 +        mdatoms = init_mdatoms(fplog,mtop,inputrec->efep!=efepNO);
 +
 +        /* Initialize the virtual site communication */
 +        vsite = init_vsite(mtop,cr,FALSE);
 +
 +        calc_shifts(box,fr->shift_vec);
 +
 +        /* With periodic molecules the charge groups should be whole at start up
 +         * and the virtual sites should not be far from their proper positions.
 +         */
 +        if (!inputrec->bContinuation && MASTER(cr) &&
 +            !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
 +        {
 +            /* Make molecules whole at start of run */
 +            if (fr->ePBC != epbcNONE)
 +            {
 +                do_pbc_first_mtop(fplog,inputrec->ePBC,box,mtop,state->x);
 +            }
 +            if (vsite)
 +            {
 +                /* Correct initial vsite positions are required
 +                 * for the initial distribution in the domain decomposition
 +                 * and for the initial shell prediction.
 +                 */
 +                construct_vsites_mtop(fplog,vsite,mtop,state->x);
 +            }
 +        }
 +
 +        if (EEL_PME(fr->eeltype))
 +        {
 +            ewaldcoeff = fr->ewaldcoeff;
 +            pmedata = &fr->pmedata;
 +        }
 +        else
 +        {
 +            pmedata = NULL;
 +        }
 +    }
 +    else
 +    {
 +        /* This is a PME only node */
 +
 +        /* We don't need the state */
 +        done_state(state);
 +
 +        ewaldcoeff = calc_ewaldcoeff(inputrec->rcoulomb, inputrec->ewald_rtol);
 +        snew(pmedata,1);
 +    }
 +
 +    /* Before setting affinity, check whether the affinity has changed
 +     * - which indicates that probably the OpenMP library has changed it since
 +     * we first checked). */
 +    check_cpu_affinity_set(fplog, cr, hw_opt, hwinfo->nthreads_hw_avail, TRUE);
 +
 +    /* Set the CPU affinity */
 +    set_cpu_affinity(fplog,cr,hw_opt,nthreads_pme,hwinfo,inputrec);
 +
 +    /* Initiate PME if necessary,
 +     * either on all nodes or on dedicated PME nodes only. */
 +    if (EEL_PME(inputrec->coulombtype))
 +    {
 +        if (mdatoms)
 +        {
 +            nChargePerturbed = mdatoms->nChargePerturbed;
 +        }
 +        if (cr->npmenodes > 0)
 +        {
 +            /* The PME only nodes need to know nChargePerturbed */
 +            gmx_bcast_sim(sizeof(nChargePerturbed),&nChargePerturbed,cr);
 +        }
 +
 +        if (cr->duty & DUTY_PME)
 +        {
 +            status = gmx_pme_init(pmedata,cr,npme_major,npme_minor,inputrec,
 +                                  mtop ? mtop->natoms : 0,nChargePerturbed,
 +                                  (Flags & MD_REPRODUCIBLE),nthreads_pme);
 +            if (status != 0) 
 +            {
 +                gmx_fatal(FARGS,"Error %d initializing PME",status);
 +            }
 +        }
 +    }
 +
 +
 +    if (integrator[inputrec->eI].func == do_md
 +#ifdef GMX_OPENMM
 +        ||
 +        integrator[inputrec->eI].func == do_md_openmm
 +#endif
 +        )
 +    {
 +        /* Turn on signal handling on all nodes */
 +        /*
 +         * A user signal from the PME nodes (if any)
 +         * is communicated to the PP nodes.
 +         */
 +        signal_handler_install();
 +    }
 +
 +    if (cr->duty & DUTY_PP)
 +    {
 +        if (inputrec->ePull != epullNO)
 +        {
 +            /* Initialize pull code */
 +            init_pull(fplog,inputrec,nfile,fnm,mtop,cr,oenv, inputrec->fepvals->init_lambda,
 +                      EI_DYNAMICS(inputrec->eI) && MASTER(cr),Flags);
 +        }
 +        
 +        if (inputrec->bRot)
 +        {
 +           /* Initialize enforced rotation code */
 +           init_rot(fplog,inputrec,nfile,fnm,cr,state->x,box,mtop,oenv,
 +                    bVerbose,Flags);
 +        }
 +
 +        constr = init_constraints(fplog,mtop,inputrec,ed,state,cr);
 +
 +        if (DOMAINDECOMP(cr))
 +        {
 +            dd_init_bondeds(fplog,cr->dd,mtop,vsite,constr,inputrec,
 +                            Flags & MD_DDBONDCHECK,fr->cginfo_mb);
 +
 +            set_dd_parameters(fplog,cr->dd,dlb_scale,inputrec,fr,&ddbox);
 +
 +            setup_dd_grid(fplog,cr->dd);
 +        }
 +
 +        /* Now do whatever the user wants us to do (how flexible...) */
 +        integrator[inputrec->eI].func(fplog,cr,nfile,fnm,
 +                                      oenv,bVerbose,bCompact,
 +                                      nstglobalcomm,
 +                                      vsite,constr,
 +                                      nstepout,inputrec,mtop,
 +                                      fcd,state,
 +                                      mdatoms,nrnb,wcycle,ed,fr,
 +                                      repl_ex_nst,repl_ex_nex,repl_ex_seed,
 +                                      membed,
 +                                      cpt_period,max_hours,
 +                                      deviceOptions,
 +                                      Flags,
 +                                      &runtime);
 +
 +        if (inputrec->ePull != epullNO)
 +        {
 +            finish_pull(fplog,inputrec->pull);
 +        }
 +        
 +        if (inputrec->bRot)
 +        {
 +            finish_rot(fplog,inputrec->rot);
 +        }
 +
 +    } 
 +    else 
 +    {
 +        /* do PME only */
 +        gmx_pmeonly(*pmedata,cr,nrnb,wcycle,ewaldcoeff,FALSE,inputrec);
 +    }
 +
 +    if (EI_DYNAMICS(inputrec->eI) || EI_TPI(inputrec->eI))
 +    {
 +        /* Some timing stats */  
 +        if (SIMMASTER(cr))
 +        {
 +            if (runtime.proc == 0)
 +            {
 +                runtime.proc = runtime.real;
 +            }
 +        }
 +        else
 +        {
 +            runtime.real = 0;
 +        }
 +    }
 +
 +    wallcycle_stop(wcycle,ewcRUN);
 +
 +    /* Finish up, write some stuff
 +     * if rerunMD, don't write last frame again 
 +     */
 +    finish_run(fplog,cr,ftp2fn(efSTO,nfile,fnm),
 +               inputrec,nrnb,wcycle,&runtime,
 +               fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ?
 +                 nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL,
 +               nthreads_pp, 
 +               EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
 +
 +    if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU)
 +    {
 +        char gpu_err_str[STRLEN];
 +
 +        /* free GPU memory and uninitialize GPU (by destroying the context) */
 +        nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
 +
 +        if (!free_gpu(gpu_err_str))
 +        {
 +            gmx_warning("On node %d failed to free GPU #%d: %s",
 +                        cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
 +        }
 +    }
 +
 +    if (opt2bSet("-membed",nfile,fnm))
 +    {
 +        sfree(membed);
 +    }
 +
 +#ifdef GMX_THREAD_MPI
 +    if (PAR(cr) && SIMMASTER(cr))
 +#endif
 +    {
 +        gmx_hardware_info_free(hwinfo);
 +    }
 +
 +    /* Does what it says */  
 +    print_date_and_time(fplog,cr->nodeid,"Finished mdrun",&runtime);
 +
 +    /* Close logfile already here if we were appending to it */
 +    if (MASTER(cr) && (Flags & MD_APPENDFILES))
 +    {
 +        gmx_log_close(fplog);
 +    } 
 +
 +    rc=(int)gmx_get_stop_condition();
 +
 +#ifdef GMX_THREAD_MPI
 +    /* we need to join all threads. The sub-threads join when they
 +       exit this function, but the master thread needs to be told to 
 +       wait for that. */
 +    if (PAR(cr) && MASTER(cr))
 +    {
 +        tMPI_Finalize();
 +    }
 +#endif
 +
 +    return rc;
 +}
Simple merge
Simple merge