Merge remote-tracking branch 'gerrit/release-4-6' into master
author Roland Schulz <roland@utk.edu>
Sat, 4 Feb 2012 18:26:42 +0000 (13:26 -0500)
committer Roland Schulz <roland@utk.edu>
Sat, 4 Feb 2012 18:29:36 +0000 (13:29 -0500)
Conflicts:
src/gromacs/mdlib/pme.c
src/programs/mdrun/runner.c
src/tools/CMakeLists.txt

Renamed:
   src/mdlib/pme_sse_single.h -> src/gromacs/mdlib/pme_sse_single.h

Modified:
   src/tools/gmx_dyecoupl.c

Change-Id: Ib4bcf839427908736d9bf529075e1c4def3d68ef

43 files changed:
1  2 
CMakeLists.txt
cmake/ThreadMPI.cmake
cmake/gmxCFlags.cmake
src/gromacs/gmxlib/bondfree.c
src/gromacs/gmxlib/calcgrid.c
src/gromacs/gmxlib/copyrite.c
src/gromacs/gmxlib/ifunc.c
src/gromacs/gmxlib/network.c
src/gromacs/gmxlib/nrnb.c
src/gromacs/gmxlib/thread_mpi/pthreads.c
src/gromacs/gmxlib/tpxio.c
src/gromacs/gmxlib/txtdump.c
src/gromacs/gmxpreprocess/pgutil.c
src/gromacs/gmxpreprocess/pgutil.h
src/gromacs/gmxpreprocess/readir.c
src/gromacs/gmxpreprocess/resall.c
src/gromacs/gmxpreprocess/topdirs.c
src/gromacs/gmxpreprocess/toppush.c
src/gromacs/legacyheaders/bondf.h
src/gromacs/legacyheaders/gmx_ana.h
src/gromacs/legacyheaders/gmx_parallel_3dfft.h
src/gromacs/legacyheaders/gmx_wallcycle.h
src/gromacs/legacyheaders/network.h
src/gromacs/legacyheaders/pme.h
src/gromacs/legacyheaders/thread_mpi/atomic/gcc_x86.h
src/gromacs/legacyheaders/types/idef.h
src/gromacs/legacyheaders/types/nrnb.h
src/gromacs/mdlib/fft5d.c
src/gromacs/mdlib/fft5d.h
src/gromacs/mdlib/forcerec.c
src/gromacs/mdlib/gmx_parallel_3dfft.c
src/gromacs/mdlib/gmx_wallcycle.c
src/gromacs/mdlib/pme.c
src/gromacs/mdlib/pme_sse_single.h
src/gromacs/mdlib/shellfc.c
src/gromacs/mdlib/update.c
src/programs/gmxcheck/tpbcmp.c
src/programs/grompp/convparm.c
src/programs/mdrun/md.c
src/programs/mdrun/runner.c
src/tools/CMakeLists.txt
src/tools/gmx_dyecoupl.c
src/tools/gmx_membed.c

diff --cc CMakeLists.txt
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
index 12514a652b196deb38d4498c3f9595e67e953bd3,0000000000000000000000000000000000000000..de400e7426dedc726fb8b372bb80bb4eae270fe5
mode 100644,000000..100644
--- /dev/null
@@@ -1,638 -1,0 +1,654 @@@
-   int  n,rank,resultlen,hostnum,i,j,ng,ni;
- #ifdef GMX_MPI
-   char mpi_hostname[MPI_MAX_PROCESSOR_NAME],num[MPI_MAX_PROCESSOR_NAME];
- #endif
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROningen Mixture of Alchemy and Childrens' Stories
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <string.h>
 +#include "gmx_fatal.h"
 +#include "main.h"
 +#include "smalloc.h"
 +#include "network.h"
 +#include "copyrite.h"
 +#include "statutil.h"
 +#include "ctype.h"
 +#include "macros.h"
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#endif
 +
 +#ifdef GMX_THREADS
 +#include "tmpi.h"
 +#endif
 +
 +
 +/* The source code in this file should be thread-safe. 
 +      Please keep it that way. */
 +/* Report whether MPI has been initialized.
 + *
 + * Without GMX_MPI this always returns 0. Otherwise it returns the flag
 + * that MPI_Initialized() stored in n.
 + */
 +gmx_bool gmx_mpi_initialized(void)
 +{
 +  int n;
 +#ifndef GMX_MPI
 +  return 0;
 +#else
 +  MPI_Initialized(&n);
 +  
 +  return n;
 +#endif
 +}
 +/* Start up MPI for this process and report its rank.
 + *
 + * With GMX_LIB_MPI defined, MPI is initialized from argc/argv (through
 + * fah_MPI_Init when GMX_FAHCORE is also defined) and the node count,
 + * rank and processor name are printed to stderr.
 + * The size of MPI_COMM_WORLD is stored in *nnodes and the rank of this
 + * process in MPI_COMM_WORLD is returned.
 + * Without GMX_MPI, gmx_call("gmx_setup") is invoked and 0 is returned.
 + */
 +int gmx_setup(int *argc,char **argv,int *nnodes)
 +{
 +#ifndef GMX_MPI
 +  gmx_call("gmx_setup");
 +  return 0;
 +#else
 +  char   buf[256];
 +  int    resultlen;               /* actual length of node name      */
 +  int    i,flag;                  /* NOTE(review): buf, i and flag are not used below */
 +  int  mpi_num_nodes;
 +  int  mpi_my_rank;
 +  char mpi_hostname[MPI_MAX_PROCESSOR_NAME];
 +
 +  /* Call the MPI routines */
 +#ifdef GMX_LIB_MPI
 +#ifdef GMX_FAHCORE
 +  (void) fah_MPI_Init(argc,&argv);
 +#else
 +  (void) MPI_Init(argc,&argv);
 +#endif
 +#endif
 +  (void) MPI_Comm_size( MPI_COMM_WORLD, &mpi_num_nodes );
 +  (void) MPI_Comm_rank( MPI_COMM_WORLD, &mpi_my_rank );
 +  (void) MPI_Get_processor_name( mpi_hostname, &resultlen );
 + 
 +#ifdef GMX_LIB_MPI 
 +  fprintf(stderr,"NNODES=%d, MYRANK=%d, HOSTNAME=%s\n",
 +        mpi_num_nodes,mpi_my_rank,mpi_hostname);
 +#endif
 +  
 +  *nnodes=mpi_num_nodes;
 +  
 +  return mpi_my_rank;
 +#endif
 +}
 +/* Return the number of processes in MPI_COMM_WORLD,
 + * or 1 when built without GMX_MPI.
 + */
 +int  gmx_node_num(void)
 +{
 +#ifndef GMX_MPI
 +  return 1;
 +#else
 +  int i;
 +  (void) MPI_Comm_size(MPI_COMM_WORLD, &i);
 +  return i;
 +#endif
 +}
 +/* Return the rank of this process in MPI_COMM_WORLD,
 + * or 0 when built without GMX_MPI.
 + */
 +int gmx_node_rank(void)
 +{
 +#ifndef GMX_MPI
 +  return 0;
 +#else
 +  int i;
 +  (void) MPI_Comm_rank(MPI_COMM_WORLD, &i);
 +  return i;
 +#endif
 +}
 +
++
++/* Derive an integer identifier for this node from its MPI processor name.
++ *
++ * All decimal digits found in the host name before the first '.' are
++ * concatenated, and the last (at most) 9 of them are converted to an int.
++ * Returns 0 when the name contains no digits, or when built without
++ * GMX_MPI. Two nodes can only be told apart when these digit strings
++ * differ.
++ */
++int gmx_hostname_num(void)
++{
++#ifndef GMX_MPI
++  return 0;
++#else
++  int  resultlen,hostnum,i,j;
++  char mpi_hostname[MPI_MAX_PROCESSOR_NAME],hostnum_str[MPI_MAX_PROCESSOR_NAME];
++
++  MPI_Get_processor_name(mpi_hostname,&resultlen);
++  /* This procedure can only differentiate nodes with host names
++   * that end on unique numbers.
++   */
++  i = 0;
++  j = 0;
++  /* Only parse the host name up to the first dot */
++  while(i < resultlen && mpi_hostname[i] != '.') {
++    /* The cast is required: isdigit() has undefined behaviour for
++     * negative values, which a plain char can hold.
++     */
++    if (isdigit((unsigned char)mpi_hostname[i])) {
++      hostnum_str[j++] = mpi_hostname[i];
++    }
++    i++;
++  }
++  hostnum_str[j] = '\0';
++  if (j == 0) {
++    hostnum = 0;
++  } else {
++    /* Use only the last 9 decimals, so we don't overflow an int */
++    hostnum = strtol(hostnum_str + max(0,j-9), NULL, 10);
++  }
++
++  if (debug) {
++    fprintf(debug,"In gmx_hostname_num: hostname '%s', hostnum %d\n",
++        mpi_hostname,hostnum);
++  }
++  return hostnum;
++#endif
++}
++
 +void gmx_setup_nodecomm(FILE *fplog,t_commrec *cr)
 +{
 +  gmx_nodecomm_t *nc;
-     MPI_Get_processor_name(mpi_hostname,&resultlen);
-     /* This procedure can only differentiate nodes with host names
-      * that end on unique numbers.
-      */
-     i = 0;
-     j = 0;
-     /* Only parse the host name up to the first dot */
-     while(i < resultlen && mpi_hostname[i] != '.') {
-       if (isdigit(mpi_hostname[i])) {
-       num[j++] = mpi_hostname[i];
-       }
-       i++;
-     }
-     num[j] = '\0';
-     if (j == 0) {
-       hostnum = 0;
-     } else {
-       /* Use only the last 9 decimals, so we don't overflow an int */
-       hostnum = strtol(num + max(0,j-9), NULL, 10); 
-     }
++  int  n,rank,hostnum,ng,ni;
 +
 +  /* Many MPI implementations do not optimize MPI_Allreduce
 +   * (and probably also other global communication calls)
 +   * for multi-core nodes connected by a network.
 +   * We can optimize such communication by using one MPI call
 +   * within each node and one between the nodes.
 +   * For MVAPICH2 and Intel MPI this reduces the time for
 +   * the global_stat communication by 25%
 +   * for 2x2-core 3 GHz Woodcrest connected by mixed DDR/SDR Infiniband.
 +   * B. Hess, November 2007
 +   */
 +
 +  nc = &cr->nc;
 +
 +  nc->bUse = FALSE;
 +#ifndef GMX_THREADS
 +  if (getenv("GMX_NO_NODECOMM") == NULL) {
 +#ifdef GMX_MPI
 +    MPI_Comm_size(cr->mpi_comm_mygroup,&n);
 +    MPI_Comm_rank(cr->mpi_comm_mygroup,&rank);
-             "In gmx_setup_nodecomm: splitting communicator of size %d\n",
-             n);
-       fprintf(debug,"In gmx_setup_nodecomm: hostname '%s', hostnum %d\n",
-             mpi_hostname,hostnum);
++
++    hostnum = gmx_hostname_num();
 +
 +    if (debug) {
 +      fprintf(debug,
++              "In gmx_setup_nodecomm: splitting communicator of size %d\n",
++              n);
 +    }
 +
++
 +    /* The intra-node communicator, split on node number */
 +    MPI_Comm_split(cr->mpi_comm_mygroup,hostnum,rank,&nc->comm_intra);
 +    MPI_Comm_rank(nc->comm_intra,&nc->rank_intra);
 +    if (debug) {
 +      fprintf(debug,"In gmx_setup_nodecomm: node rank %d rank_intra %d\n",
 +            rank,nc->rank_intra);
 +    }
 +    /* The inter-node communicator, split on rank_intra.
 +     * We actually only need the one for rank=0,
 +     * but it is easier to create them all.
 +     */
 +    MPI_Comm_split(cr->mpi_comm_mygroup,nc->rank_intra,rank,&nc->comm_inter);
 +    /* Check if this really created two step communication */
 +    MPI_Comm_size(nc->comm_inter,&ng);
 +    MPI_Comm_size(nc->comm_intra,&ni);
 +    if (debug) {
 +      fprintf(debug,"In gmx_setup_nodecomm: groups %d, my group size %d\n",
 +            ng,ni);
 +    }
 +    if ((ng > 1 && ng < n) || (ni > 1 && ni < n)) {
 +      nc->bUse = TRUE;
 +      if (fplog)
 +      fprintf(fplog,"Using two step summing over %d groups of on average %.1f processes\n\n",ng,(real)n/(real)ng);
 +      if (nc->rank_intra > 0)
 +      MPI_Comm_free(&nc->comm_inter);
 +    } else {
 +      /* One group or all processes in a separate group, use normal summing */
 +      MPI_Comm_free(&nc->comm_inter);
 +      MPI_Comm_free(&nc->comm_intra);
 +    }
 +#endif
 +  }
 +#endif
 +}
 +/* Block until all processes in cr->mpi_comm_mygroup have reached this call.
 + * Without GMX_MPI only gmx_call("gmx_barrier") is invoked.
 + */
 +void gmx_barrier(const t_commrec *cr)
 +{
 +#ifndef GMX_MPI
 +  gmx_call("gmx_barrier");
 +#else
 +  MPI_Barrier(cr->mpi_comm_mygroup);
 +#endif
 +}
 +/* Print a halting message to stderr and terminate the program.
 + *
 + * Without GMX_MPI only gmx_call("gmx_abort") is invoked.
 + * With GMX_THREADS the message and thanx() are printed and exit(1) is
 + * called. Otherwise the message mentions noderank/nnodes when
 + * nnodes > 1, and MPI_Abort() is called on MPI_COMM_WORLD with errorno
 + * before exit(1).
 + */
 +void gmx_abort(int noderank,int nnodes,int errorno)
 +{
 +#ifndef GMX_MPI
 +  gmx_call("gmx_abort");
 +#else
 +#ifdef GMX_THREADS
 +  fprintf(stderr,"Halting program %s\n",ShortProgram());
 +  thanx(stderr);
 +  exit(1);
 +#else
 +  if (nnodes > 1)
 +  {
 +      fprintf(stderr,"Halting parallel program %s on CPU %d out of %d\n",
 +              ShortProgram(),noderank,nnodes);
 +  }
 +  else
 +  {
 +      fprintf(stderr,"Halting program %s\n",ShortProgram());
 +  }
 +
 +  thanx(stderr);
 +  MPI_Abort(MPI_COMM_WORLD,errorno);
 +  exit(1);
 +#endif
 +#endif
 +}
 +
 +void gmx_bcast(int nbytes,void *b,const t_commrec *cr)
 +{
 +#ifndef GMX_MPI
 +  gmx_call("gmx_bast");
 +#else
 +  MPI_Bcast(b,nbytes,MPI_BYTE,MASTERRANK(cr),cr->mpi_comm_mygroup);
 +#endif
 +}
 +
 +void gmx_bcast_sim(int nbytes,void *b,const t_commrec *cr)
 +{
 +#ifndef GMX_MPI
 +  gmx_call("gmx_bast");
 +#else
 +  MPI_Bcast(b,nbytes,MPI_BYTE,MASTERRANK(cr),cr->mpi_comm_mysim);
 +#endif
 +}
 +/* Sum the nr doubles in r element-wise over the processes of
 + * cr->mpi_comm_mygroup, leaving the totals in r.
 + *
 + * When cr->nc.bUse is set the sum is done in two steps: first within
 + * cr->nc.comm_intra, then between the rank_intra == 0 processes over
 + * cr->nc.comm_inter, followed by a broadcast within comm_intra.
 + * Without MPI_IN_PLACE (and without GMX_THREADS) cr->mpb->dbuf is used
 + * as a scratch buffer and is grown when nr exceeds dbuf_alloc.
 + * Without GMX_MPI only gmx_call("gmx_sumd") is invoked.
 + */
 +void gmx_sumd(int nr,double r[],const t_commrec *cr)
 +{
 +#ifndef GMX_MPI
 +    gmx_call("gmx_sumd");
 +#else
 +#if defined(MPI_IN_PLACE_EXISTS) || defined(GMX_THREADS)
 +    if (cr->nc.bUse) {
 +        if (cr->nc.rank_intra == 0)
 +        {
 +            /* Use two step summing. */
 +            MPI_Reduce(MPI_IN_PLACE,r,nr,MPI_DOUBLE,MPI_SUM,0,
 +                       cr->nc.comm_intra);
 +            /* Sum the roots of the internal (intra) buffers. */
 +            MPI_Allreduce(MPI_IN_PLACE,r,nr,MPI_DOUBLE,MPI_SUM,
 +                          cr->nc.comm_inter);
 +        }
 +        else
 +        {
 +            /* This is here because of the silly MPI specification
 +                that MPI_IN_PLACE should be put in sendbuf instead of recvbuf */
 +            MPI_Reduce(r,NULL,nr,MPI_DOUBLE,MPI_SUM,0,cr->nc.comm_intra);
 +        }
 +        /* Distribute the total from intra rank 0 to the rest of the group */
 +        MPI_Bcast(r,nr,MPI_DOUBLE,0,cr->nc.comm_intra);
 +    } 
 +    else 
 +    {
 +        MPI_Allreduce(MPI_IN_PLACE,r,nr,MPI_DOUBLE,MPI_SUM, 
 +                      cr->mpi_comm_mygroup);
 +    }
 +#else
 +    int i;
 +
 +    /* Make sure the scratch buffer can hold nr elements */
 +    if (nr > cr->mpb->dbuf_alloc) {
 +        cr->mpb->dbuf_alloc = nr;
 +        srenew(cr->mpb->dbuf,cr->mpb->dbuf_alloc);
 +    }
 +    if (cr->nc.bUse) {
 +        /* Use two step summing */
 +        MPI_Allreduce(r,cr->mpb->dbuf,nr,MPI_DOUBLE,MPI_SUM,cr->nc.comm_intra);
 +        if (cr->nc.rank_intra == 0) {
 +            /* Sum with the buffers reversed */
 +            MPI_Allreduce(cr->mpb->dbuf,r,nr,MPI_DOUBLE,MPI_SUM, 
 +                          cr->nc.comm_inter);
 +        }
 +        MPI_Bcast(r,nr,MPI_DOUBLE,0,cr->nc.comm_intra);
 +    } else {
 +        MPI_Allreduce(r,cr->mpb->dbuf,nr,MPI_DOUBLE,MPI_SUM,
 +                      cr->mpi_comm_mygroup);
 +        /* Copy the result back into the caller's array */
 +        for(i=0; i<nr; i++)
 +            r[i] = cr->mpb->dbuf[i];
 +    }
 +#endif
 +#endif
 +}
 +/* Sum the nr floats in r element-wise over the processes of
 + * cr->mpi_comm_mygroup, leaving the totals in r.
 + *
 + * When cr->nc.bUse is set the sum is done in two steps: first within
 + * cr->nc.comm_intra, then between the rank_intra == 0 processes over
 + * cr->nc.comm_inter, followed by a broadcast within comm_intra.
 + * Without MPI_IN_PLACE (and without GMX_THREADS) cr->mpb->fbuf is used
 + * as a scratch buffer and is grown when nr exceeds fbuf_alloc.
 + * Without GMX_MPI only gmx_call("gmx_sumf") is invoked.
 + */
 +void gmx_sumf(int nr,float r[],const t_commrec *cr)
 +{
 +#ifndef GMX_MPI
 +    gmx_call("gmx_sumf");
 +#else
 +#if defined(MPI_IN_PLACE_EXISTS) || defined(GMX_THREADS)
 +    if (cr->nc.bUse) {
 +        /* Use two step summing.  */
 +        if (cr->nc.rank_intra == 0)
 +        {
 +            MPI_Reduce(MPI_IN_PLACE,r,nr,MPI_FLOAT,MPI_SUM,0,
 +                       cr->nc.comm_intra);
 +            /* Sum the roots of the internal (intra) buffers */
 +            MPI_Allreduce(MPI_IN_PLACE,r,nr,MPI_FLOAT,MPI_SUM,
 +                          cr->nc.comm_inter);
 +        }
 +        else
 +        {
 +            /* This is here because of the silly MPI specification
 +                that MPI_IN_PLACE should be put in sendbuf instead of recvbuf */
 +            MPI_Reduce(r,NULL,nr,MPI_FLOAT,MPI_SUM,0,cr->nc.comm_intra);
 +        }
 +        /* Distribute the total from intra rank 0 to the rest of the group */
 +        MPI_Bcast(r,nr,MPI_FLOAT,0,cr->nc.comm_intra);
 +    } 
 +    else 
 +    {
 +        MPI_Allreduce(MPI_IN_PLACE,r,nr,MPI_FLOAT,MPI_SUM,cr->mpi_comm_mygroup);
 +    }
 +#else
 +    int i;
 +
 +    /* Make sure the scratch buffer can hold nr elements */
 +    if (nr > cr->mpb->fbuf_alloc) {
 +        cr->mpb->fbuf_alloc = nr;
 +        srenew(cr->mpb->fbuf,cr->mpb->fbuf_alloc);
 +    }
 +    if (cr->nc.bUse) {
 +        /* Use two step summing */
 +        MPI_Allreduce(r,cr->mpb->fbuf,nr,MPI_FLOAT,MPI_SUM,cr->nc.comm_intra);
 +        if (cr->nc.rank_intra == 0) {
 +            /* Sum with the buffers reversed */
 +            MPI_Allreduce(cr->mpb->fbuf,r,nr,MPI_FLOAT,MPI_SUM, 
 +                          cr->nc.comm_inter);
 +        }
 +        MPI_Bcast(r,nr,MPI_FLOAT,0,cr->nc.comm_intra);
 +    } else {
 +        MPI_Allreduce(r,cr->mpb->fbuf,nr,MPI_FLOAT,MPI_SUM,
 +                      cr->mpi_comm_mygroup);
 +        /* Copy the result back into the caller's array */
 +        for(i=0; i<nr; i++)
 +            r[i] = cr->mpb->fbuf[i];
 +    }
 +#endif
 +#endif
 +}
 +/* Sum the nr ints in r element-wise over the processes of
 + * cr->mpi_comm_mygroup, leaving the totals in r.
 + *
 + * When cr->nc.bUse is set the sum is done in two steps: first within
 + * cr->nc.comm_intra, then between the rank_intra == 0 processes over
 + * cr->nc.comm_inter, followed by a broadcast within comm_intra.
 + * Without MPI_IN_PLACE (and without GMX_THREADS) cr->mpb->ibuf is used
 + * as a scratch buffer and is grown when nr exceeds ibuf_alloc.
 + * Without GMX_MPI only gmx_call("gmx_sumi") is invoked.
 + */
 +void gmx_sumi(int nr,int r[],const t_commrec *cr)
 +{
 +#ifndef GMX_MPI
 +    gmx_call("gmx_sumi");
 +#else
 +#if defined(MPI_IN_PLACE_EXISTS) || defined(GMX_THREADS)
 +    if (cr->nc.bUse) {
 +        /* Use two step summing */
 +        if (cr->nc.rank_intra == 0) 
 +        {
 +            MPI_Reduce(MPI_IN_PLACE,r,nr,MPI_INT,MPI_SUM,0,cr->nc.comm_intra);
 +            /* Sum the roots of the internal (intra) buffers */
 +            MPI_Allreduce(MPI_IN_PLACE,r,nr,MPI_INT,MPI_SUM,cr->nc.comm_inter);
 +        }
 +        else
 +        {
 +            /* This is here because of the silly MPI specification
 +                that MPI_IN_PLACE should be put in sendbuf instead of recvbuf */
 +            MPI_Reduce(r,NULL,nr,MPI_INT,MPI_SUM,0,cr->nc.comm_intra);
 +        }
 +        /* Distribute the total from intra rank 0 to the rest of the group */
 +        MPI_Bcast(r,nr,MPI_INT,0,cr->nc.comm_intra);
 +    } 
 +    else 
 +    {
 +        MPI_Allreduce(MPI_IN_PLACE,r,nr,MPI_INT,MPI_SUM,cr->mpi_comm_mygroup);
 +    }
 +#else
 +    int i;
 +
 +    /* Make sure the scratch buffer can hold nr elements */
 +    if (nr > cr->mpb->ibuf_alloc) {
 +        cr->mpb->ibuf_alloc = nr;
 +        srenew(cr->mpb->ibuf,cr->mpb->ibuf_alloc);
 +    }
 +    if (cr->nc.bUse) {
 +        /* Use two step summing */
 +        MPI_Allreduce(r,cr->mpb->ibuf,nr,MPI_INT,MPI_SUM,cr->nc.comm_intra);
 +        if (cr->nc.rank_intra == 0) {
 +            /* Sum with the buffers reversed */
 +            MPI_Allreduce(cr->mpb->ibuf,r,nr,MPI_INT,MPI_SUM,cr->nc.comm_inter);
 +        }
 +        MPI_Bcast(r,nr,MPI_INT,0,cr->nc.comm_intra);
 +    } else {
 +        MPI_Allreduce(r,cr->mpb->ibuf,nr,MPI_INT,MPI_SUM,cr->mpi_comm_mygroup);
 +        /* Copy the result back into the caller's array */
 +        for(i=0; i<nr; i++)
 +            r[i] = cr->mpb->ibuf[i];
 +    }
 +#endif
 +#endif
 +}
 +/* Sum the nr gmx_large_int_t values in r element-wise over the processes
 + * of cr->mpi_comm_mygroup, leaving the totals in r.
 + *
 + * When cr->nc.bUse is set the sum is done in two steps: first within
 + * cr->nc.comm_intra, then between the rank_intra == 0 processes over
 + * cr->nc.comm_inter, followed by a broadcast within comm_intra.
 + * Without MPI_IN_PLACE (and without GMX_THREADS) cr->mpb->libuf is used
 + * as a scratch buffer and is grown when nr exceeds libuf_alloc.
 + * Without GMX_MPI only gmx_call("gmx_sumli") is invoked.
 + */
 +void gmx_sumli(int nr,gmx_large_int_t r[],const t_commrec *cr)
 +{
 +#ifndef GMX_MPI
 +    gmx_call("gmx_sumli");
 +#else
 +#if defined(MPI_IN_PLACE_EXISTS) || defined(GMX_THREADS)
 +    if (cr->nc.bUse) {
 +        /* Use two step summing */
 +        if (cr->nc.rank_intra == 0) 
 +        {
 +            MPI_Reduce(MPI_IN_PLACE,r,nr,GMX_MPI_LARGE_INT,MPI_SUM,0,
 +                       cr->nc.comm_intra);
 +            /* Sum the roots of the internal (intra) buffers */
 +            MPI_Allreduce(MPI_IN_PLACE,r,nr,GMX_MPI_LARGE_INT,MPI_SUM,
 +                          cr->nc.comm_inter);
 +        }
 +        else
 +        {
 +            /* This is here because of the silly MPI specification
 +                that MPI_IN_PLACE should be put in sendbuf instead of recvbuf */
 +            MPI_Reduce(r,NULL,nr,GMX_MPI_LARGE_INT,MPI_SUM,0,cr->nc.comm_intra);
 +        }
 +        /* Distribute the total from intra rank 0 to the rest of the group */
 +        MPI_Bcast(r,nr,GMX_MPI_LARGE_INT,0,cr->nc.comm_intra);
 +    } 
 +    else 
 +    {
 +        MPI_Allreduce(MPI_IN_PLACE,r,nr,GMX_MPI_LARGE_INT,MPI_SUM,cr->mpi_comm_mygroup);
 +    }
 +#else
 +    int i;
 +
 +    /* Make sure the scratch buffer can hold nr elements */
 +    if (nr > cr->mpb->libuf_alloc) {
 +        cr->mpb->libuf_alloc = nr;
 +        srenew(cr->mpb->libuf,cr->mpb->libuf_alloc);
 +    }
 +    if (cr->nc.bUse) {
 +        /* Use two step summing */
 +        MPI_Allreduce(r,cr->mpb->libuf,nr,GMX_MPI_LARGE_INT,MPI_SUM,
 +                      cr->nc.comm_intra);
 +        if (cr->nc.rank_intra == 0) {
 +            /* Sum with the buffers reversed */
 +            MPI_Allreduce(cr->mpb->libuf,r,nr,GMX_MPI_LARGE_INT,MPI_SUM,
 +                          cr->nc.comm_inter);
 +        }
 +        MPI_Bcast(r,nr,GMX_MPI_LARGE_INT,0,cr->nc.comm_intra);
 +    } else {
 +        MPI_Allreduce(r,cr->mpb->libuf,nr,GMX_MPI_LARGE_INT,MPI_SUM,
 +                      cr->mpi_comm_mygroup);
 +        /* Copy the result back into the caller's array */
 +        for(i=0; i<nr; i++)
 +            r[i] = cr->mpb->libuf[i];
 +    }
 +#endif
 +#endif
 +}
 +
 +
 +/* Sum the nr doubles in r element-wise over all processes of mpi_comm,
 + * leaving the totals in r. Only compiled when GMX_MPI is defined.
 + * Without MPI_IN_PLACE (and without GMX_THREADS) a temporary buffer of
 + * nr elements is allocated with snew() and released with sfree().
 + */
 +#ifdef GMX_MPI
 +void gmx_sumd_comm(int nr,double r[],MPI_Comm mpi_comm)
 +{
 +#if defined(MPI_IN_PLACE_EXISTS) || defined(GMX_THREADS)
 +    MPI_Allreduce(MPI_IN_PLACE,r,nr,MPI_DOUBLE,MPI_SUM,mpi_comm);
 +#else
 +    /* this function is only used in code that is not performance critical,
 +       (during setup, when comm_rec is not the appropriate communication  
 +       structure), so this isn't as bad as it looks. */
 +    double *buf;
 +    int i;
 +
 +    snew(buf, nr);
 +    MPI_Allreduce(r,buf,nr,MPI_DOUBLE,MPI_SUM,mpi_comm);
 +    for(i=0; i<nr; i++)
 +        r[i] = buf[i];
 +    sfree(buf);
 +#endif
 +}
 +#endif
 +/* Sum the nr floats in r element-wise over all processes of mpi_comm,
 + * leaving the totals in r. Only compiled when GMX_MPI is defined.
 + * Without MPI_IN_PLACE (and without GMX_THREADS) a temporary buffer of
 + * nr elements is allocated with snew() and released with sfree().
 + */
 +#ifdef GMX_MPI
 +void gmx_sumf_comm(int nr,float r[],MPI_Comm mpi_comm)
 +{
 +#if defined(MPI_IN_PLACE_EXISTS) || defined(GMX_THREADS)
 +    MPI_Allreduce(MPI_IN_PLACE,r,nr,MPI_FLOAT,MPI_SUM,mpi_comm);
 +#else
 +    /* this function is only used in code that is not performance critical,
 +       (during setup, when comm_rec is not the appropriate communication  
 +       structure), so this isn't as bad as it looks. */
 +    float *buf;
 +    int i;
 +
 +    snew(buf, nr);
 +    MPI_Allreduce(r,buf,nr,MPI_FLOAT,MPI_SUM,mpi_comm);
 +    for(i=0; i<nr; i++)
 +        r[i] = buf[i];
 +    sfree(buf);
 +#endif
 +}
 +#endif
 +/* Sum the nr doubles in r over the processes of ms->mpi_comm_masters
 + * by forwarding to gmx_sumd_comm().
 + * Without GMX_MPI only gmx_call("gmx_sumd_sim") is invoked.
 + */
 +void gmx_sumd_sim(int nr,double r[],const gmx_multisim_t *ms)
 +{
 +#ifndef GMX_MPI
 +  gmx_call("gmx_sumd_sim");
 +#else
 +  gmx_sumd_comm(nr,r,ms->mpi_comm_masters);
 +#endif
 +}
 +/* Sum the nr floats in r over the processes of ms->mpi_comm_masters
 + * by forwarding to gmx_sumf_comm().
 + * Without GMX_MPI only gmx_call("gmx_sumf_sim") is invoked.
 + */
 +void gmx_sumf_sim(int nr,float r[],const gmx_multisim_t *ms)
 +{
 +#ifndef GMX_MPI
 +  gmx_call("gmx_sumf_sim");
 +#else
 +  gmx_sumf_comm(nr,r,ms->mpi_comm_masters);
 +#endif
 +}
 +/* Sum the nr ints in r element-wise over the processes of
 + * ms->mpi_comm_masters, leaving the totals in r.
 + * Without MPI_IN_PLACE (and without GMX_THREADS) ms->mpb->ibuf is used
 + * as a scratch buffer and is grown when nr exceeds ibuf_alloc.
 + * Without GMX_MPI only gmx_call("gmx_sumi_sim") is invoked.
 + */
 +void gmx_sumi_sim(int nr,int r[], const gmx_multisim_t *ms)
 +{
 +#ifndef GMX_MPI
 +    gmx_call("gmx_sumi_sim");
 +#else
 +#if defined(MPI_IN_PLACE_EXISTS) || defined(GMX_THREADS)
 +    MPI_Allreduce(MPI_IN_PLACE,r,nr,MPI_INT,MPI_SUM,ms->mpi_comm_masters);
 +#else
 +    /* this is thread-unsafe, but it will do for now: */
 +    int i;
 +
 +    if (nr > ms->mpb->ibuf_alloc) {
 +        ms->mpb->ibuf_alloc = nr;
 +        srenew(ms->mpb->ibuf,ms->mpb->ibuf_alloc);
 +    }
 +    MPI_Allreduce(r,ms->mpb->ibuf,nr,MPI_INT,MPI_SUM,ms->mpi_comm_masters);
 +    /* Copy the result back into the caller's array */
 +    for(i=0; i<nr; i++)
 +        r[i] = ms->mpb->ibuf[i];
 +#endif
 +#endif
 +}
 +/* Sum the nr gmx_large_int_t values in r element-wise over the processes
 + * of ms->mpi_comm_masters, leaving the totals in r.
 + * Without MPI_IN_PLACE (and without GMX_THREADS) ms->mpb->libuf is used
 + * as a scratch buffer and is grown when nr exceeds libuf_alloc.
 + * Without GMX_MPI only gmx_call("gmx_sumli_sim") is invoked.
 + */
 +void gmx_sumli_sim(int nr,gmx_large_int_t r[], const gmx_multisim_t *ms)
 +{
 +#ifndef GMX_MPI
 +    gmx_call("gmx_sumli_sim");
 +#else
 +#if defined(MPI_IN_PLACE_EXISTS) || defined(GMX_THREADS)
 +    MPI_Allreduce(MPI_IN_PLACE,r,nr,GMX_MPI_LARGE_INT,MPI_SUM,
 +                  ms->mpi_comm_masters);
 +#else
 +    /* this is thread-unsafe, but it will do for now: */
 +    int i;
 +
 +    if (nr > ms->mpb->libuf_alloc) {
 +        ms->mpb->libuf_alloc = nr;
 +        srenew(ms->mpb->libuf,ms->mpb->libuf_alloc);
 +    }
 +    MPI_Allreduce(r,ms->mpb->libuf,nr,GMX_MPI_LARGE_INT,MPI_SUM,
 +                  ms->mpi_comm_masters);
 +    /* Copy the result back into the caller's array */
 +    for(i=0; i<nr; i++)
 +        r[i] = ms->mpb->libuf[i];
 +#endif
 +#endif
 +}
 +
 +/* Shut down MPI for this process.
 + *
 + * Returns immediately when MPI_Finalized() reports that MPI was already
 + * finalized. Otherwise all processes in MPI_COMM_WORLD are synchronized
 + * with a barrier before MPI_Finalize() is called; its return code is
 + * written to the debug file when debugging is on.
 + * Without GMX_MPI only gmx_call("gmx_finalize") is invoked.
 + */
 +void gmx_finalize(void)
 +{
 +#ifndef GMX_MPI
 +  gmx_call("gmx_finalize");
 +#else
 +  int ret;
 +
 +  /* just as a check; we don't want to finalize twice */
 +  int finalized;
 +  MPI_Finalized(&finalized);
 +  if (finalized)
 +      return;
 +
 +  /* We sync the processes here to try to avoid problems
 +   * with buggy MPI implementations that could cause
 +   * unfinished processes to terminate.
 +   */
 +  MPI_Barrier(MPI_COMM_WORLD);
 +
 +  /*
 +  if (DOMAINDECOMP(cr)) {
 +    if (cr->npmenodes > 0 || cr->dd->bCartesian) 
 +      MPI_Comm_free(&cr->mpi_comm_mygroup);
 +    if (cr->dd->bCartesian)
 +      MPI_Comm_free(&cr->mpi_comm_mysim);
 +  }
 +  */
 +
 +  /* Apparently certain mpich implementations cause problems
 +   * with MPI_Finalize. In that case comment out MPI_Finalize.
 +   */
 +  if (debug)
 +    fprintf(debug,"Will call MPI_Finalize now\n");
 +
 +  ret = MPI_Finalize();
 +  if (debug)
 +    fprintf(debug,"Return code from MPI_Finalize = %d\n",ret);
 +#endif
 +}
 +
Simple merge
Simple merge
index 4792ee750e7d948ac637268b3769fb638001e21f,0000000000000000000000000000000000000000..3f49afe890f02e9b9eb4e1fd0005fb834e41a80b
mode 100644,000000..100644
--- /dev/null
@@@ -1,1516 -1,0 +1,1527 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROningen Mixture of Alchemy and Childrens' Stories
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +/* This file is completely threadsafe - please keep it that way! */
 +#ifdef GMX_THREADS
 +#include <thread_mpi.h>
 +#endif
 +
 +
 +#include <stdio.h>
 +#include "smalloc.h"
 +#include "typedefs.h"
 +#include "names.h"
 +#include "txtdump.h"
 +#include "string2.h"
 +#include "vec.h"
 +#include "macros.h"
 +
 +/* Write n space characters to fp (nothing when n <= 0) and return n. */
 +int pr_indent(FILE *fp,int n)
 +{
 +  int i;
 +
 +  for (i=0; i<n; i++) (void) fprintf(fp," ");
 +  return n;
 +}
 +/* Check whether the data pointed to by p is present.
 + *
 + * When p is NULL, "<title>: not available" is written to fp, preceded by
 + * indent spaces when indent > 0. Returns non-zero when p is not NULL
 + * and 0 otherwise.
 + */
 +int available(FILE *fp,void *p,int indent,const char *title)
 +{
 +  if (!p) {
 +    if (indent > 0)
 +      pr_indent(fp,indent);
 +    (void) fprintf(fp,"%s: not available\n",title);
 +  }
 +  return (p!=NULL);
 +}
 +/* Write an indented "<title>:" line to fp.
 + * Returns indent+INDENT, the indentation for the entries that follow.
 + */
 +int pr_title(FILE *fp,int indent,const char *title)
 +{
 +  (void) pr_indent(fp,indent);
 +  (void) fprintf(fp,"%s:\n",title);
 +  return (indent+INDENT);
 +}
 +/* Write an indented "<title> (n):" line to fp.
 + * Returns indent+INDENT, the indentation for the entries that follow.
 + */
 +int pr_title_n(FILE *fp,int indent,const char *title,int n)
 +{
 +  (void) pr_indent(fp,indent);
 +  (void) fprintf(fp,"%s (%d):\n",title,n);
 +  return (indent+INDENT);
 +}
 +/* Write an indented "<title> (n1xn2):" line to fp.
 + * Returns indent+INDENT, the indentation for the entries that follow.
 + */
 +int pr_title_nxn(FILE *fp,int indent,const char *title,int n1,int n2)
 +{
 +  (void) pr_indent(fp,indent);
 +  (void) fprintf(fp,"%s (%dx%d):\n",title,n1,n2);
 +  return (indent+INDENT);
 +}
 +/* Print the n ints of vec to fp, one "<title>[i]=value" line per element,
 + * under a "<title> (n):" header.
 + * When bShowNumbers is false the index is printed as -1.
 + * When vec is NULL only the "not available" line from available() is
 + * written.
 + */
 +void pr_ivec(FILE *fp,int indent,const char *title,int vec[],int n, gmx_bool bShowNumbers)
 +{
 +  int i;
 +
 +  if (available(fp,vec,indent,title))
 +    {
 +      indent=pr_title_n(fp,indent,title,n);
 +      for (i=0; i<n; i++)
 +        {
 +          (void) pr_indent(fp,indent);
 +          (void) fprintf(fp,"%s[%d]=%d\n",title,bShowNumbers?i:-1,vec[i]);
 +        }
 +    }
 +}
 +/* Print the n ints of vec to fp like pr_ivec(), but compress runs.
 + *
 + * A run is a stretch of elements in which each value equals the previous
 + * value plus one. Runs of 3 or more elements are printed on one line as
 + * "<title>[i,...,j] = {first,...,last}"; shorter runs are printed one
 + * element per line. When bShowNumbers is false indices are printed as -1.
 + */
 +void pr_ivec_block(FILE *fp,int indent,const char *title,int vec[],int n, gmx_bool bShowNumbers)
 +{
 +    int i,j;
 +    
 +    if (available(fp,vec,indent,title))
 +    {
 +        indent=pr_title_n(fp,indent,title,n);
 +        i = 0;
 +        while (i < n)
 +        {
 +            /* Find j, one past the end of the run starting at i */
 +            j = i+1;
 +            while (j < n && vec[j] == vec[j-1]+1)
 +            {
 +                j++;
 +            }
 +            /* Print consecutive groups of 3 or more as blocks */
 +            if (j - i < 3)
 +            {
 +                while(i < j)
 +                {
 +                    (void) pr_indent(fp,indent);
 +                    (void) fprintf(fp,"%s[%d]=%d\n",
 +                                   title,bShowNumbers?i:-1,vec[i]);
 +                    i++;
 +                }
 +            }
 +            else
 +            {
 +                (void) pr_indent(fp,indent);
 +                (void) fprintf(fp,"%s[%d,...,%d] = {%d,...,%d}\n",
 +                               title,
 +                               bShowNumbers?i:-1,
 +                               bShowNumbers?j-1:-1,
 +                               vec[i],vec[j-1]); 
 +                i = j;
 +            }
 +        }
 +    }
 +}
 +/* Print the n booleans of vec to fp, one "<title>[i]=value" line per
 + * element, with the value formatted by the BOOL() macro.
 + * When bShowNumbers is false the index is printed as -1.
 + */
 +void pr_bvec(FILE *fp,int indent,const char *title,gmx_bool vec[],int n, gmx_bool bShowNumbers)
 +{
 +  int i;
 +
 +  if (available(fp,vec,indent,title))
 +    {
 +      indent=pr_title_n(fp,indent,title,n);
 +      for (i=0; i<n; i++)
 +        {
 +          (void) pr_indent(fp,indent);
 +          (void) fprintf(fp,"%s[%d]=%s\n",title,bShowNumbers?i:-1,
 +                       BOOL(vec[i]));
 +        }
 +    }
 +}
 +/* Print the n integer vectors of vec to fp, one line per vector in the
 + * form "<title>[i]={a, b, c}" with DIM components each, under a
 + * "<title> (nxDIM):" header.
 + * When bShowNumbers is false the index is printed as -1.
 + */
 +void pr_ivecs(FILE *fp,int indent,const char *title,ivec vec[],int n, gmx_bool bShowNumbers)
 +{
 +  int i,j;
 +
 +  if (available(fp,vec,indent,title))
 +    {  
 +      indent=pr_title_nxn(fp,indent,title,n,DIM);
 +      for (i=0; i<n; i++)
 +        {
 +          (void) pr_indent(fp,indent);
 +          (void) fprintf(fp,"%s[%d]={",title,bShowNumbers?i:-1);
 +          for (j=0; j<DIM; j++)
 +            {
 +              if (j!=0) (void) fprintf(fp,", ");
 +              fprintf(fp,"%d",vec[i][j]);
 +            }
 +          (void) fprintf(fp,"}\n");
 +        }
 +    }
 +}
 +/* Print the n reals of vec to fp, one "<title>[i]=value" line per element
 + * using the "%12.5e" format.
 + * When bShowNumbers is false the index is printed as -1.
 + */
 +void pr_rvec(FILE *fp,int indent,const char *title,real vec[],int n, gmx_bool bShowNumbers)
 +{
 +  int i;
 +
 +  if (available(fp,vec,indent,title))
 +    {  
 +      indent=pr_title_n(fp,indent,title,n);
 +      for (i=0; i<n; i++)
 +        {
 +          pr_indent(fp,indent);
 +          fprintf(fp,"%s[%d]=%12.5e\n",title,bShowNumbers?i:-1,vec[i]);
 +        }
 +    }
 +}
 +/* Print the n doubles of vec to fp, one "<title>[i]=value" line per
 + * element using the "%12.5e" format.
 + * When bShowNumbers is false the index is printed as -1.
 + */
 +void pr_dvec(FILE *fp,int indent,const char *title,double vec[],int n, gmx_bool bShowNumbers)
 +{
 +      int i;
 +      
 +      if (available(fp,vec,indent,title))
 +    {  
 +              indent=pr_title_n(fp,indent,title,n);
 +              for (i=0; i<n; i++)
 +        {
 +                      pr_indent(fp,indent);
 +                      fprintf(fp,"%s[%d]=%12.5e\n",title,bShowNumbers?i:-1,vec[i]);
 +        }
 +    }
 +}
 +
 +
 +/*
 +void pr_mat(FILE *fp,int indent,char *title,matrix m)
 +{
 +  int i,j;
 +  
 +  if (available(fp,m,indent,title)) {  
 +    indent=pr_title_n(fp,indent,title,n);
 +    for(i=0; i<n; i++) {
 +      pr_indent(fp,indent);
 +      fprintf(fp,"%s[%d]=%12.5e %12.5e %12.5e\n",
 +            title,bShowNumbers?i:-1,m[i][XX],m[i][YY],m[i][ZZ]);
 +    }
 +  }
 +}
 +*/
 +/* Print the n real vectors of vec to fp, one line per vector in the form
 + * "<title>[    i]={x, y, z} len=<norm>", where the last field is
 + * norm(vec[i]). Components use the "%12.5e" format.
 + */
 +void pr_rvecs_len(FILE *fp,int indent,const char *title,rvec vec[],int n)
 +{
 +  int i,j;
 +
 +  if (available(fp,vec,indent,title)) {  
 +    indent=pr_title_nxn(fp,indent,title,n,DIM);
 +    for (i=0; i<n; i++) {
 +      (void) pr_indent(fp,indent);
 +      (void) fprintf(fp,"%s[%5d]={",title,i);
 +      for (j=0; j<DIM; j++) {
 +      if (j != 0) 
 +        (void) fprintf(fp,", ");
 +      (void) fprintf(fp,"%12.5e",vec[i][j]);
 +      }
 +      (void) fprintf(fp,"} len=%12.5e\n",norm(vec[i]));
 +    }
 +  }
 +}
 +/* Print the n real vectors of vec to fp, one line per vector in the form
 + * "<title>[    i]={x, y, z}".
 + * Components are printed with "%15.8e" when the environment variable
 + * LONGFORMAT is set, and with "%12.5e" otherwise.
 + */
 +void pr_rvecs(FILE *fp,int indent,const char *title,rvec vec[],int n)
 +{
 +  const char *fshort = "%12.5e";
 +  const char *flong  = "%15.8e";
 +  const char *format;
 +  int i,j;
 +
 +  if (getenv("LONGFORMAT") != NULL)
 +    format = flong;
 +  else
 +    format = fshort;
 +    
 +  if (available(fp,vec,indent,title)) {  
 +    indent=pr_title_nxn(fp,indent,title,n,DIM);
 +    for (i=0; i<n; i++) {
 +      (void) pr_indent(fp,indent);
 +      (void) fprintf(fp,"%s[%5d]={",title,i);
 +      for (j=0; j<DIM; j++) {
 +      if (j != 0) 
 +        (void) fprintf(fp,", ");
 +      (void) fprintf(fp,format,vec[i][j]);
 +      }
 +      (void) fprintf(fp,"}\n");
 +    }
 +  }
 +}
 +
 +/* Print the n reals of vec on a single indented line of fp:
 + * "<title>:" and a tab, then each value with the "  %10g" format.
 + */
 +void pr_reals(FILE *fp,int indent,const char *title,real *vec,int n)
 +{
 +  int i;
 +    
 +  if (available(fp,vec,indent,title)) {  
 +    (void) pr_indent(fp,indent);
 +    (void) fprintf(fp,"%s:\t",title);
 +    for(i=0; i<n; i++)
 +      fprintf(fp,"  %10g",vec[i]);
 +    (void) fprintf(fp,"\n");
 +  }
 +}
 +/* Print the n doubles of vec on a single indented line of fp:
 + * "<title>:" and a tab, then each value with the "  %10g" format.
 + */
 +void pr_doubles(FILE *fp,int indent,const char *title,double *vec,int n)
 +{
 +  int i;
 +    
 +  if (available(fp,vec,indent,title)) {  
 +    (void) pr_indent(fp,indent);
 +    (void) fprintf(fp,"%s:\t",title);
 +    for(i=0; i<n; i++)
 +      fprintf(fp,"  %10g",vec[i]);
 +    (void) fprintf(fp,"\n");
 +  }
 +}
 +/* Write an indented "title = i" line to fp; the title is left-aligned
 + * in a 20 character wide field.
 + */
 +static void pr_int(FILE *fp,int indent,const char *title,int i)
 +{
 +  pr_indent(fp,indent);
 +  fprintf(fp,"%-20s = %d\n",title,i);
 +}
 +/* Write an indented "title = i" line to fp for a gmx_large_int_t value.
 + * The number is formatted by gmx_step_str() using a local buffer of
 + * STEPSTRSIZE characters.
 + */
 +static void pr_gmx_large_int(FILE *fp,int indent,const char *title,gmx_large_int_t i)
 +{
 +  char buf[STEPSTRSIZE];
 +
 +  pr_indent(fp,indent);
 +  fprintf(fp,"%-20s = %s\n",title,gmx_step_str(i,buf));
 +}
 +/* Write an indented "title = r" line to fp using the "%g" format; the
 + * title is left-aligned in a 20 character wide field.
 + */
 +static void pr_real(FILE *fp,int indent,const char *title,real r)
 +{
 +  pr_indent(fp,indent);
 +  fprintf(fp,"%-20s = %g\n",title,r);
 +}
 +/* Write an indented "title = d" line to fp using the "%g" format; the
 + * title is left-aligned in a 20 character wide field.
 + */
 +static void pr_double(FILE *fp,int indent,const char *title,double d)
 +{
 +  pr_indent(fp,indent);
 +  fprintf(fp,"%-20s = %g\n",title,d);
 +}
 +/* Write an indented "title = s" line to fp; the title is left-aligned
 + * in a 20 character wide field.
 + */
 +static void pr_str(FILE *fp,int indent,const char *title,const char *s)
 +{
 +  pr_indent(fp,indent);
 +  fprintf(fp,"%-20s = %s\n",title,s);
 +}
 +
 +void pr_qm_opts(FILE *fp,int indent,const char *title,t_grpopts *opts)
 +{
 +  int i,m,j;
 +
 +  fprintf(fp,"%s:\n",title);
 +  
 +  pr_int(fp,indent,"ngQM",opts->ngQM);
 +  if (opts->ngQM > 0) {
 +    pr_ivec(fp,indent,"QMmethod",opts->QMmethod,opts->ngQM,FALSE);
 +    pr_ivec(fp,indent,"QMbasis",opts->QMbasis,opts->ngQM,FALSE);
 +    pr_ivec(fp,indent,"QMcharge",opts->QMcharge,opts->ngQM,FALSE);
 +    pr_ivec(fp,indent,"QMmult",opts->QMmult,opts->ngQM,FALSE);
 +    pr_bvec(fp,indent,"bSH",opts->bSH,opts->ngQM,FALSE);
 +    pr_ivec(fp,indent,"CASorbitals",opts->CASorbitals,opts->ngQM,FALSE);
 +    pr_ivec(fp,indent,"CASelectrons",opts->CASelectrons,opts->ngQM,FALSE);
 +    pr_rvec(fp,indent,"SAon",opts->SAon,opts->ngQM,FALSE);
 +    pr_rvec(fp,indent,"SAon",opts->SAon,opts->ngQM,FALSE);
 +    pr_ivec(fp,indent,"SAsteps",opts->SAsteps,opts->ngQM,FALSE);
 +    pr_bvec(fp,indent,"bOPT",opts->bOPT,opts->ngQM,FALSE);
 +    pr_bvec(fp,indent,"bTS",opts->bTS,opts->ngQM,FALSE);
 +  }
 +}
 +
 +static void pr_grp_opts(FILE *out,int indent,const char *title,t_grpopts *opts,
 +                      gmx_bool bMDPformat)
 +{
 +  int i,m,j;
 +
 +  if (!bMDPformat)
 +    fprintf(out,"%s:\n",title);
 +  
 +  pr_indent(out,indent);
 +  fprintf(out,"nrdf%s",bMDPformat ? " = " : ":");
 +  for(i=0; (i<opts->ngtc); i++)
 +    fprintf(out,"  %10g",opts->nrdf[i]);
 +  fprintf(out,"\n");
 +  
 +  pr_indent(out,indent);
 +  fprintf(out,"ref-t%s",bMDPformat ? " = " : ":");
 +  for(i=0; (i<opts->ngtc); i++)
 +    fprintf(out,"  %10g",opts->ref_t[i]);
 +  fprintf(out,"\n");
 +
 +  pr_indent(out,indent);
 +  fprintf(out,"tau-t%s",bMDPformat ? " = " : ":");
 +  for(i=0; (i<opts->ngtc); i++)
 +    fprintf(out,"  %10g",opts->tau_t[i]);
 +  fprintf(out,"\n");  
 +  
 +  /* Pretty-print the simulated annealing info */
 +  fprintf(out,"anneal%s",bMDPformat ? " = " : ":");
 +  for(i=0; (i<opts->ngtc); i++)
 +    fprintf(out,"  %10s",EANNEAL(opts->annealing[i]));
 +  fprintf(out,"\n");  
 + 
 +  fprintf(out,"ann-npoints%s",bMDPformat ? " = " : ":");
 +  for(i=0; (i<opts->ngtc); i++)
 +    fprintf(out,"  %10d",opts->anneal_npoints[i]);
 +  fprintf(out,"\n");  
 + 
 +  for(i=0; (i<opts->ngtc); i++) {
 +    if(opts->anneal_npoints[i]>0) {
 +      fprintf(out,"ann. times [%d]:\t",i);
 +      for(j=0; (j<opts->anneal_npoints[i]); j++)
 +      fprintf(out,"  %10.1f",opts->anneal_time[i][j]);
 +      fprintf(out,"\n");  
 +      fprintf(out,"ann. temps [%d]:\t",i);
 +      for(j=0; (j<opts->anneal_npoints[i]); j++)
 +      fprintf(out,"  %10.1f",opts->anneal_temp[i][j]);
 +      fprintf(out,"\n");  
 +    }
 +  }
 +  
 +  pr_indent(out,indent);
 +  fprintf(out,"acc:\t");
 +  for(i=0; (i<opts->ngacc); i++)
 +    for(m=0; (m<DIM); m++)
 +      fprintf(out,"  %10g",opts->acc[i][m]);
 +  fprintf(out,"\n");
 +
 +  pr_indent(out,indent);
 +  fprintf(out,"nfreeze:");
 +  for(i=0; (i<opts->ngfrz); i++)
 +    for(m=0; (m<DIM); m++)
 +      fprintf(out,"  %10s",opts->nFreeze[i][m] ? "Y" : "N");
 +  fprintf(out,"\n");
 +
 +
 +  for(i=0; (i<opts->ngener); i++) {
 +    pr_indent(out,indent);
 +    fprintf(out,"energygrp-flags[%3d]:",i);
 +    for(m=0; (m<opts->ngener); m++)
 +      fprintf(out," %d",opts->egp_flags[opts->ngener*i+m]);
 +    fprintf(out,"\n");
 +  }
 +
 +  fflush(out);
 +}
 +
 +static void pr_matrix(FILE *fp,int indent,const char *title,rvec *m,
 +                    gmx_bool bMDPformat)
 +{
 +  if (bMDPformat)
 +    fprintf(fp,"%-10s    = %g %g %g %g %g %g\n",title,
 +          m[XX][XX],m[YY][YY],m[ZZ][ZZ],m[XX][YY],m[XX][ZZ],m[YY][ZZ]);
 +  else
 +    pr_rvecs(fp,indent,title,m,DIM);
 +}
 +
 +static void pr_cosine(FILE *fp,int indent,const char *title,t_cosines *cos,
 +                    gmx_bool bMDPformat)
 +{
 +  int j;
 +  
 +  if (bMDPformat) {
 +    fprintf(fp,"%s = %d\n",title,cos->n);
 +  }
 +  else {
 +    indent=pr_title(fp,indent,title);
 +    (void) pr_indent(fp,indent);
 +    fprintf(fp,"n = %d\n",cos->n);
 +    if (cos->n > 0) {
 +      (void) pr_indent(fp,indent+2);
 +      fprintf(fp,"a =");
 +      for(j=0; (j<cos->n); j++)
 +      fprintf(fp," %e",cos->a[j]);
 +      fprintf(fp,"\n");
 +      (void) pr_indent(fp,indent+2);
 +      fprintf(fp,"phi =");
 +      for(j=0; (j<cos->n); j++)
 +      fprintf(fp," %e",cos->phi[j]);
 +      fprintf(fp,"\n");
 +    }
 +  }
 +}
 +
 +/* Shorthand wrappers around the scalar printers pr_str, pr_int,
 + * pr_gmx_large_int, pr_real and pr_double.  Each expands to a call that
 + * uses the identifiers fp and indent of the function it is used in.
 + */
 +#define PS(t,s) pr_str(fp,indent,t,s)
 +#define PI(t,s) pr_int(fp,indent,t,s)
 +#define PSTEP(t,s) pr_gmx_large_int(fp,indent,t,s)
 +#define PR(t,s) pr_real(fp,indent,t,s)
 +#define PD(t,s) pr_double(fp,indent,t,s)
 +
 +static void pr_pullgrp(FILE *fp,int indent,int g,t_pullgrp *pg)
 +{
 +  pr_indent(fp,indent);
 +  fprintf(fp,"pull-group %d:\n",g);
 +  indent += 2;
 +  pr_ivec_block(fp,indent,"atom",pg->ind,pg->nat,TRUE);
 +  pr_rvec(fp,indent,"weight",pg->weight,pg->nweight,TRUE);
 +  PI("pbcatom",pg->pbcatom);
 +  pr_rvec(fp,indent,"vec",pg->vec,DIM,TRUE);
 +  pr_rvec(fp,indent,"init",pg->init,DIM,TRUE);
 +  PR("rate",pg->rate);
 +  PR("k",pg->k);
 +  PR("kB",pg->kB);
 +}
 +
 +static void pr_pull(FILE *fp,int indent,t_pull *pull)
 +{
 +  int g;
 +
 +  PS("pull-geometry",EPULLGEOM(pull->eGeom));
 +  pr_ivec(fp,indent,"pull-dim",pull->dim,DIM,TRUE);
 +  PR("pull-r1",pull->cyl_r1);
 +  PR("pull-r0",pull->cyl_r0);
 +  PR("pull-constr-tol",pull->constr_tol);
 +  PI("pull-nstxout",pull->nstxout);
 +  PI("pull-nstfout",pull->nstfout);
 +  PI("pull-ngrp",pull->ngrp);
 +  for(g=0; g<pull->ngrp+1; g++)
 +    pr_pullgrp(fp,indent,g,&pull->grp[g]);
 +}
 +
 +static void pr_rotgrp(FILE *fp,int indent,int g,t_rotgrp *rotg)
 +{
 +  pr_indent(fp,indent);
 +  fprintf(fp,"rotation_group %d:\n",g);
 +  indent += 2;
 +  PS("type",EROTGEOM(rotg->eType));
 +  PS("massw",BOOL(rotg->bMassW));
 +  pr_ivec_block(fp,indent,"atom",rotg->ind,rotg->nat,TRUE);
 +  pr_rvecs(fp,indent,"x_ref",rotg->x_ref,rotg->nat);
 +  pr_rvec(fp,indent,"vec",rotg->vec,DIM,TRUE);
 +  pr_rvec(fp,indent,"pivot",rotg->pivot,DIM,TRUE);
 +  PR("rate",rotg->rate);
 +  PR("k",rotg->k);
 +  PR("slab_dist",rotg->slab_dist);
 +  PR("min_gaussian",rotg->min_gaussian);
 +  PR("epsilon",rotg->eps);
 +  PS("fit_method",EROTFIT(rotg->eFittype));
 +  PI("potfitangle_nstep",rotg->PotAngle_nstep);
 +  PR("potfitangle_step",rotg->PotAngle_step);
 +}
 +
 +static void pr_rot(FILE *fp,int indent,t_rot *rot)
 +{
 +  int g;
 +
 +  PI("rot_nstrout",rot->nstrout);
 +  PI("rot_nstsout",rot->nstsout);
 +  PI("rot_ngrp",rot->ngrp);
 +  for(g=0; g<rot->ngrp; g++)
 +    pr_rotgrp(fp,indent,g,&rot->grp[g]);
 +}
 +
 +void pr_inputrec(FILE *fp,int indent,const char *title,t_inputrec *ir,
 +                 gmx_bool bMDPformat)
 +{
 +  const char *infbuf="inf";
 +  int  i;
 +  
 +  if (available(fp,ir,indent,title)) {
 +    if (!bMDPformat)
 +      indent=pr_title(fp,indent,title);
 +    PS("integrator",EI(ir->eI));
 +    PSTEP("nsteps",ir->nsteps);
 +    PSTEP("init-step",ir->init_step);
 +    PS("ns-type",ENS(ir->ns_type));
 +    PI("nstlist",ir->nstlist);
 +    PI("ndelta",ir->ndelta);
 +    PI("nstcomm",ir->nstcomm);
 +    PS("comm-mode",ECOM(ir->comm_mode));
 +    PI("nstlog",ir->nstlog);
 +    PI("nstxout",ir->nstxout);
 +    PI("nstvout",ir->nstvout);
 +    PI("nstfout",ir->nstfout);
 +    PI("nstcalcenergy",ir->nstcalcenergy);
 +    PI("nstenergy",ir->nstenergy);
 +    PI("nstxtcout",ir->nstxtcout);
 +    PR("init-t",ir->init_t);
 +    PR("delta-t",ir->delta_t);
 +    
 +    PR("xtcprec",ir->xtcprec);
 +    PI("nkx",ir->nkx);
 +    PI("nky",ir->nky);
 +    PI("nkz",ir->nkz);
 +    PI("pme-order",ir->pme_order);
 +    PR("ewald-rtol",ir->ewald_rtol);
 +    PR("ewald-geometry",ir->ewald_geometry);
 +    PR("epsilon-surface",ir->epsilon_surface);
 +    PS("optimize-fft",BOOL(ir->bOptFFT));
 +    PS("ePBC",EPBC(ir->ePBC));
 +    PS("bPeriodicMols",BOOL(ir->bPeriodicMols));
 +    PS("bContinuation",BOOL(ir->bContinuation));
 +    PS("bShakeSOR",BOOL(ir->bShakeSOR));
 +    PS("etc",ETCOUPLTYPE(ir->etc));
 +    PI("nsttcouple",ir->nsttcouple);
 +    PS("epc",EPCOUPLTYPE(ir->epc));
 +    PS("epctype",EPCOUPLTYPETYPE(ir->epct));
 +    PI("nstpcouple",ir->nstpcouple);
 +    PR("tau-p",ir->tau_p);
 +    pr_matrix(fp,indent,"ref-p",ir->ref_p,bMDPformat);
 +    pr_matrix(fp,indent,"compress",ir->compress,bMDPformat);
 +    PS("refcoord-scaling",EREFSCALINGTYPE(ir->refcoord_scaling));
 +    if (bMDPformat)
 +      fprintf(fp,"posres-com  = %g %g %g\n",ir->posres_com[XX],
 +            ir->posres_com[YY],ir->posres_com[ZZ]);
 +    else
 +      pr_rvec(fp,indent,"posres-com",ir->posres_com,DIM,TRUE);
 +    if (bMDPformat)
 +      fprintf(fp,"posres-comB = %g %g %g\n",ir->posres_comB[XX],
 +            ir->posres_comB[YY],ir->posres_comB[ZZ]);
 +    else
 +      pr_rvec(fp,indent,"posres-comB",ir->posres_comB,DIM,TRUE);
 +    PI("andersen-seed",ir->andersen_seed);
 +    PR("rlist",ir->rlist);
 +    PR("rlistlong",ir->rlistlong);
 +    PR("rtpi",ir->rtpi);
 +    PS("coulombtype",EELTYPE(ir->coulombtype));
 +    PR("rcoulomb-switch",ir->rcoulomb_switch);
 +    PR("rcoulomb",ir->rcoulomb);
 +    PS("vdwtype",EVDWTYPE(ir->vdwtype));
 +    PR("rvdw-switch",ir->rvdw_switch);
 +    PR("rvdw",ir->rvdw);
 +    if (ir->epsilon_r != 0)
 +      PR("epsilon-r",ir->epsilon_r);
 +    else
 +      PS("epsilon-r",infbuf);
 +    if (ir->epsilon_rf != 0)
 +      PR("epsilon-rf",ir->epsilon_rf);
 +    else
 +      PS("epsilon-rf",infbuf);
 +    PR("tabext",ir->tabext);
 +    PS("implicit-solvent",EIMPLICITSOL(ir->implicit_solvent));
 +    PS("gb-algorithm",EGBALGORITHM(ir->gb_algorithm));
 +    PR("gb-epsilon-solvent",ir->gb_epsilon_solvent);
 +    PI("nstgbradii",ir->nstgbradii);
 +    PR("rgbradii",ir->rgbradii);
 +    PR("gb-saltconc",ir->gb_saltconc);
 +    PR("gb-obc-alpha",ir->gb_obc_alpha);
 +    PR("gb-obc-beta",ir->gb_obc_beta);
 +    PR("gb-obc-gamma",ir->gb_obc_gamma);
 +    PR("gb-dielectric-offset",ir->gb_dielectric_offset);
 +    PS("sa-algorithm",ESAALGORITHM(ir->gb_algorithm));
 +    PR("sa-surface-tension",ir->sa_surface_tension);
 +        
 +    PS("DispCorr",EDISPCORR(ir->eDispCorr));
 +    PS("free-energy",EFEPTYPE(ir->efep));
 +    PR("init-lambda",ir->init_lambda);
 +    PR("delta-lambda",ir->delta_lambda);
 +    if (!bMDPformat)
 +    {
 +        PI("n-foreign-lambda",ir->n_flambda);
 +    }
 +    if (ir->n_flambda > 0)
 +    {
 +        pr_indent(fp,indent);
 +        fprintf(fp,"foreign-lambda%s",bMDPformat ? " = " : ":");
 +        for(i=0; i<ir->n_flambda; i++)
 +        {
 +            fprintf(fp,"  %10g",ir->flambda[i]);
 +        }
 +        fprintf(fp,"\n");
 +    }
 +    PR("sc-alpha",ir->sc_alpha);
 +    PI("sc-power",ir->sc_power);
 +    PR("sc-sigma",ir->sc_sigma);
 +    PR("sc-sigma-min",ir->sc_sigma_min);
 +    PI("nstdhdl", ir->nstdhdl);
 +    PS("separate-dhdl-file", SEPDHDLFILETYPE(ir->separate_dhdl_file));
 +    PS("dhdl-derivatives", DHDLDERIVATIVESTYPE(ir->dhdl_derivatives));
 +    PI("dh-hist-size", ir->dh_hist_size);
 +    PD("dh-hist-spacing", ir->dh_hist_spacing);
 +
 +    PI("nwall",ir->nwall);
 +    PS("wall-type",EWALLTYPE(ir->wall_type));
 +    PI("wall-atomtype[0]",ir->wall_atomtype[0]);
 +    PI("wall-atomtype[1]",ir->wall_atomtype[1]);
 +    PR("wall-density[0]",ir->wall_density[0]);
 +    PR("wall-density[1]",ir->wall_density[1]);
 +    PR("wall-ewald-zfac",ir->wall_ewald_zfac);
 +
 +    PS("pull",EPULLTYPE(ir->ePull));
 +    if (ir->ePull != epullNO)
 +      pr_pull(fp,indent,ir->pull);
 +    
 +    PS("rotation",BOOL(ir->bRot));
 +    if (ir->bRot)
 +      pr_rot(fp,indent,ir->rot);
 +
 +    PS("disre",EDISRETYPE(ir->eDisre));
 +    PS("disre-weighting",EDISREWEIGHTING(ir->eDisreWeighting));
 +    PS("disre-mixed",BOOL(ir->bDisreMixed));
 +    PR("dr-fc",ir->dr_fc);
 +    PR("dr-tau",ir->dr_tau);
 +    PR("nstdisreout",ir->nstdisreout);
 +    PR("orires-fc",ir->orires_fc);
 +    PR("orires-tau",ir->orires_tau);
 +    PR("nstorireout",ir->nstorireout);
 +
 +    PR("dihre-fc",ir->dihre_fc);
 +    
 +    PR("em-stepsize",ir->em_stepsize);
 +    PR("em-tol",ir->em_tol);
 +    PI("niter",ir->niter);
 +    PR("fc-stepsize",ir->fc_stepsize);
 +    PI("nstcgsteep",ir->nstcgsteep);
 +    PI("nbfgscorr",ir->nbfgscorr);
 +
 +    PS("ConstAlg",ECONSTRTYPE(ir->eConstrAlg));
 +    PR("shake-tol",ir->shake_tol);
 +    PI("lincs-order",ir->nProjOrder);
 +    PR("lincs-warnangle",ir->LincsWarnAngle);
 +    PI("lincs-iter",ir->nLincsIter);
 +    PR("bd-fric",ir->bd_fric);
 +    PI("ld-seed",ir->ld_seed);
 +    PR("cos-accel",ir->cos_accel);
 +    pr_matrix(fp,indent,"deform",ir->deform,bMDPformat);
 +
 +    PS("adress",BOOL(ir->bAdress));
 +    if (ir->bAdress){
 +        PS("adress_type",EADRESSTYPE(ir->adress->type));
 +        PR("adress_const_wf",ir->adress->const_wf);
 +        PR("adress_ex_width",ir->adress->ex_width);
 +        PR("adress_hy_width",ir->adress->hy_width);
 +        PS("adress_interface_correction",EADRESSICTYPE(ir->adress->icor));
 +        PS("adress_site",EADRESSSITETYPE(ir->adress->site));
 +        PR("adress_ex_force_cap",ir->adress->ex_forcecap);
 +        PS("adress_do_hybridpairs", BOOL(ir->adress->do_hybridpairs));
 +
 +        pr_rvec(fp,indent,"adress_reference_coords",ir->adress->refs,DIM,TRUE);
 +    }
 +    PI("userint1",ir->userint1);
 +    PI("userint2",ir->userint2);
 +    PI("userint3",ir->userint3);
 +    PI("userint4",ir->userint4);
 +    PR("userreal1",ir->userreal1);
 +    PR("userreal2",ir->userreal2);
 +    PR("userreal3",ir->userreal3);
 +    PR("userreal4",ir->userreal4);
 +    pr_grp_opts(fp,indent,"grpopts",&(ir->opts),bMDPformat);
 +    pr_cosine(fp,indent,"efield-x",&(ir->ex[XX]),bMDPformat);
 +    pr_cosine(fp,indent,"efield-xt",&(ir->et[XX]),bMDPformat);
 +    pr_cosine(fp,indent,"efield-y",&(ir->ex[YY]),bMDPformat);
 +    pr_cosine(fp,indent,"efield-yt",&(ir->et[YY]),bMDPformat);
 +    pr_cosine(fp,indent,"efield-z",&(ir->ex[ZZ]),bMDPformat);
 +    pr_cosine(fp,indent,"efield-zt",&(ir->et[ZZ]),bMDPformat);
 +    PS("bQMMM",BOOL(ir->bQMMM));
 +    PI("QMconstraints",ir->QMconstraints);
 +    PI("QMMMscheme",ir->QMMMscheme);
 +    PR("scalefactor",ir->scalefactor);
 +    pr_qm_opts(fp,indent,"qm-opts",&(ir->opts));
 +  }
 +}
 +/* Only PS, PR and PI are undefined here; PSTEP and PD remain defined. */
 +#undef PS
 +#undef PR
 +#undef PI
 +
 +static void pr_harm(FILE *fp,t_iparams *iparams,const char *r,const char *kr)
 +{
 +  fprintf(fp,"%sA=%12.5e, %sA=%12.5e, %sB=%12.5e, %sB=%12.5e\n",
 +        r,iparams->harmonic.rA,kr,iparams->harmonic.krA,
 +        r,iparams->harmonic.rB,kr,iparams->harmonic.krB);
 +}
 +
 +void pr_iparams(FILE *fp,t_functype ftype,t_iparams *iparams)
 +{
 +  int i;
 +  real VA[4],VB[4],*rbcA,*rbcB;
 +
 +  switch (ftype) {
 +  case F_ANGLES:
 +  case F_G96ANGLES:
 +    pr_harm(fp,iparams,"th","ct");
 +    break;
 +  case F_CROSS_BOND_BONDS:
 +    fprintf(fp,"r1e=%15.8e, r2e=%15.8e, krr=%15.8e\n",
 +          iparams->cross_bb.r1e,iparams->cross_bb.r2e,
 +          iparams->cross_bb.krr);
 +    break;
 +  case F_CROSS_BOND_ANGLES:
 +    fprintf(fp,"r1e=%15.8e, r1e=%15.8e, r3e=%15.8e, krt=%15.8e\n",
 +          iparams->cross_ba.r1e,iparams->cross_ba.r2e,
 +          iparams->cross_ba.r3e,iparams->cross_ba.krt);
 +    break;
++  case F_LINEAR_ANGLES:
++    fprintf(fp,"klinA=%15.8e, aA=%15.8e, klinB=%15.8e, aB=%15.8e\n",
++            iparams->linangle.klinA,iparams->linangle.aA,
++            iparams->linangle.klinB,iparams->linangle.aB);
++    break;
 +  case F_UREY_BRADLEY:
 +    fprintf(fp,"theta=%15.8e, ktheta=%15.8e, r13=%15.8e, kUB=%15.8e\n",
 +          iparams->u_b.theta,iparams->u_b.ktheta,iparams->u_b.r13,iparams->u_b.kUB);
 +    break;
 +  case F_QUARTIC_ANGLES:
 +    fprintf(fp,"theta=%15.8e",iparams->qangle.theta);
 +    for(i=0; i<5; i++)
 +      fprintf(fp,", c%c=%15.8e",'0'+i,iparams->qangle.c[i]);
 +    fprintf(fp,"\n");
 +    break;
 +  case F_BHAM:
 +    fprintf(fp,"a=%15.8e, b=%15.8e, c=%15.8e\n",
 +          iparams->bham.a,iparams->bham.b,iparams->bham.c);
 +    break;
 +  case F_BONDS:
 +  case F_G96BONDS:
 +  case F_HARMONIC:
 +    pr_harm(fp,iparams,"b0","cb");
 +    break;
 +  case F_IDIHS:
 +    pr_harm(fp,iparams,"xi","cx");
 +    break;
 +  case F_MORSE:
 +    fprintf(fp,"b0=%15.8e, cb=%15.8e, beta=%15.8e\n",
 +          iparams->morse.b0,iparams->morse.cb,iparams->morse.beta);
 +    break;
 +  case F_CUBICBONDS:
 +    fprintf(fp,"b0=%15.8e, kb=%15.8e, kcub=%15.8e\n",
 +          iparams->cubic.b0,iparams->cubic.kb,iparams->cubic.kcub);
 +    break;
 +  case F_CONNBONDS:
 +    fprintf(fp,"\n");
 +    break;
 +  case F_FENEBONDS:
 +    fprintf(fp,"bm=%15.8e, kb=%15.8e\n",iparams->fene.bm,iparams->fene.kb);
 +    break;
 +  case F_RESTRBONDS:
 +      fprintf(fp,"lowA=%15.8e, up1A=%15.8e, up2A=%15.8e, kA=%15.8e, lowB=%15.8e, up1B=%15.8e, up2B=%15.8e, kB=%15.8e,\n",
 +              iparams->restraint.lowA,iparams->restraint.up1A,
 +              iparams->restraint.up2A,iparams->restraint.kA,
 +              iparams->restraint.lowB,iparams->restraint.up1B,
 +              iparams->restraint.up2B,iparams->restraint.kB);
 +      break;
 +  case F_TABBONDS:
 +  case F_TABBONDSNC:
 +  case F_TABANGLES:
 +  case F_TABDIHS:
 +    fprintf(fp,"tab=%d, kA=%15.8e, kB=%15.8e\n",
 +          iparams->tab.table,iparams->tab.kA,iparams->tab.kB);
 +    break;
 +  case F_POLARIZATION:
 +    fprintf(fp,"alpha=%15.8e\n",iparams->polarize.alpha);
 +    break;
++  case F_ANHARM_POL:
++    fprintf(fp,"alpha=%15.8e drcut=%15.8e khyp=%15.8e\n",
++            iparams->anharm_polarize.alpha,
++            iparams->anharm_polarize.drcut,
++            iparams->anharm_polarize.khyp);
++    break;
 +  case F_THOLE_POL:
 +    fprintf(fp,"a=%15.8e, alpha1=%15.8e, alpha2=%15.8e, rfac=%15.8e\n",
 +          iparams->thole.a,iparams->thole.alpha1,iparams->thole.alpha2,
 +          iparams->thole.rfac);
 +    break;
 +  case F_WATER_POL:
 +    fprintf(fp,"al_x=%15.8e, al_y=%15.8e, al_z=%15.8e, rOH=%9.6f, rHH=%9.6f, rOD=%9.6f\n",
 +          iparams->wpol.al_x,iparams->wpol.al_y,iparams->wpol.al_z,
 +          iparams->wpol.rOH,iparams->wpol.rHH,iparams->wpol.rOD);
 +    break;
 +  case F_LJ:
 +    fprintf(fp,"c6=%15.8e, c12=%15.8e\n",iparams->lj.c6,iparams->lj.c12);
 +    break;
 +  case F_LJ14:
 +    fprintf(fp,"c6A=%15.8e, c12A=%15.8e, c6B=%15.8e, c12B=%15.8e\n",
 +          iparams->lj14.c6A,iparams->lj14.c12A,
 +          iparams->lj14.c6B,iparams->lj14.c12B);
 +    break;
 +  case F_LJC14_Q:
 +    fprintf(fp,"fqq=%15.8e, qi=%15.8e, qj=%15.8e, c6=%15.8e, c12=%15.8e\n",
 +          iparams->ljc14.fqq,
 +          iparams->ljc14.qi,iparams->ljc14.qj,
 +          iparams->ljc14.c6,iparams->ljc14.c12);
 +    break;
 +  case F_LJC_PAIRS_NB:
 +    fprintf(fp,"qi=%15.8e, qj=%15.8e, c6=%15.8e, c12=%15.8e\n",
 +          iparams->ljcnb.qi,iparams->ljcnb.qj,
 +          iparams->ljcnb.c6,iparams->ljcnb.c12);
 +    break;
 +  case F_PDIHS:
 +  case F_PIDIHS:
 +  case F_ANGRES:
 +  case F_ANGRESZ:
 +    fprintf(fp,"phiA=%15.8e, cpA=%15.8e, phiB=%15.8e, cpB=%15.8e, mult=%d\n",
 +          iparams->pdihs.phiA,iparams->pdihs.cpA,
 +          iparams->pdihs.phiB,iparams->pdihs.cpB,
 +          iparams->pdihs.mult);
 +    break;
 +  case F_DISRES:
 +    fprintf(fp,"label=%4d, type=%1d, low=%15.8e, up1=%15.8e, up2=%15.8e, fac=%15.8e)\n",
 +          iparams->disres.label,iparams->disres.type,
 +          iparams->disres.low,iparams->disres.up1,
 +          iparams->disres.up2,iparams->disres.kfac);
 +    break;
 +  case F_ORIRES:
 +    fprintf(fp,"ex=%4d, label=%d, power=%4d, c=%15.8e, obs=%15.8e, kfac=%15.8e)\n",
 +          iparams->orires.ex,iparams->orires.label,iparams->orires.power,
 +          iparams->orires.c,iparams->orires.obs,iparams->orires.kfac);
 +    break;
 +  case F_DIHRES:
 +    fprintf(fp,"label=%d, power=%4d phi=%15.8e, dphi=%15.8e, kfac=%15.8e)\n",
 +          iparams->dihres.label,iparams->dihres.power,
 +          iparams->dihres.phi,iparams->dihres.dphi,iparams->dihres.kfac);
 +    break;
 +  case F_POSRES:
 +    fprintf(fp,"pos0A=(%15.8e,%15.8e,%15.8e), fcA=(%15.8e,%15.8e,%15.8e), pos0B=(%15.8e,%15.8e,%15.8e), fcB=(%15.8e,%15.8e,%15.8e)\n",
 +          iparams->posres.pos0A[XX],iparams->posres.pos0A[YY],
 +          iparams->posres.pos0A[ZZ],iparams->posres.fcA[XX],
 +          iparams->posres.fcA[YY],iparams->posres.fcA[ZZ],
 +          iparams->posres.pos0B[XX],iparams->posres.pos0B[YY],
 +          iparams->posres.pos0B[ZZ],iparams->posres.fcB[XX],
 +          iparams->posres.fcB[YY],iparams->posres.fcB[ZZ]);
 +    break;
 +  case F_RBDIHS:
 +    for (i=0; i<NR_RBDIHS; i++) 
 +      fprintf(fp,"%srbcA[%d]=%15.8e",i==0?"":", ",i,iparams->rbdihs.rbcA[i]);
 +    fprintf(fp,"\n");
 +    for (i=0; i<NR_RBDIHS; i++) 
 +      fprintf(fp,"%srbcB[%d]=%15.8e",i==0?"":", ",i,iparams->rbdihs.rbcB[i]);
 +    fprintf(fp,"\n");
 +    break;
 +  case F_FOURDIHS:
 +    /* Use the OPLS -> Ryckaert-Bellemans formula backwards to get the
 +     * OPLS potential constants back.
 +     */
 +    rbcA = iparams->rbdihs.rbcA;
 +    rbcB = iparams->rbdihs.rbcB;
 +
 +    VA[3] = -0.25*rbcA[4];
 +    VA[2] = -0.5*rbcA[3];
 +    VA[1] = 4.0*VA[3]-rbcA[2];
 +    VA[0] = 3.0*VA[2]-2.0*rbcA[1];
 +
 +    VB[3] = -0.25*rbcB[4];
 +    VB[2] = -0.5*rbcB[3];
 +    VB[1] = 4.0*VB[3]-rbcB[2];
 +    VB[0] = 3.0*VB[2]-2.0*rbcB[1];
 +
 +    for (i=0; i<NR_FOURDIHS; i++) 
 +      fprintf(fp,"%sFourA[%d]=%15.8e",i==0?"":", ",i,VA[i]);
 +    fprintf(fp,"\n");
 +    for (i=0; i<NR_FOURDIHS; i++) 
 +      fprintf(fp,"%sFourB[%d]=%15.8e",i==0?"":", ",i,VB[i]);
 +    fprintf(fp,"\n");
 +    break;
 +   
 +  case F_CONSTR:
 +  case F_CONSTRNC:
 +    fprintf(fp,"dA=%15.8e, dB=%15.8e\n",iparams->constr.dA,iparams->constr.dB);
 +    break;
 +  case F_SETTLE:
 +    fprintf(fp,"doh=%15.8e, dhh=%15.8e\n",iparams->settle.doh,
 +          iparams->settle.dhh);
 +    break;
 +  case F_VSITE2:
 +    fprintf(fp,"a=%15.8e\n",iparams->vsite.a);
 +    break;
 +  case F_VSITE3:
 +  case F_VSITE3FD:
 +  case F_VSITE3FAD:
 +    fprintf(fp,"a=%15.8e, b=%15.8e\n",iparams->vsite.a,iparams->vsite.b);
 +    break;
 +  case F_VSITE3OUT:
 +  case F_VSITE4FD:
 +  case F_VSITE4FDN:
 +    fprintf(fp,"a=%15.8e, b=%15.8e, c=%15.8e\n",
 +          iparams->vsite.a,iparams->vsite.b,iparams->vsite.c);
 +    break;
 +  case F_VSITEN:
 +    fprintf(fp,"n=%2d, a=%15.8e\n",iparams->vsiten.n,iparams->vsiten.a);
 +    break;
 +  case F_GB12:
 +  case F_GB13:
 +  case F_GB14:
 +    fprintf(fp, "sar=%15.8e, st=%15.8e, pi=%15.8e, gbr=%15.8e, bmlt=%15.8e\n",iparams->gb.sar,iparams->gb.st,iparams->gb.pi,iparams->gb.gbr,iparams->gb.bmlt);
 +    break;              
 +  case F_CMAP:
 +    fprintf(fp, "cmapA=%1d, cmapB=%1d\n",iparams->cmap.cmapA, iparams->cmap.cmapB);
 +    break;              
 +  default:
 +    gmx_fatal(FARGS,"unknown function type %d (%s) in %s line %d",
 +            ftype,interaction_function[ftype].name,__FILE__,__LINE__);
 +  }
 +}
 +
 +void pr_ilist(FILE *fp,int indent,const char *title,
 +              t_functype *functype,t_ilist *ilist, gmx_bool bShowNumbers)
 +{
 +    int i,j,k,type,ftype;
 +    t_iatom *iatoms;
 +    
 +    if (available(fp,ilist,indent,title) && ilist->nr > 0)
 +    {  
 +        indent=pr_title(fp,indent,title);
 +        (void) pr_indent(fp,indent);
 +        fprintf(fp,"nr: %d\n",ilist->nr);
 +        if (ilist->nr > 0) {
 +            (void) pr_indent(fp,indent);
 +            fprintf(fp,"iatoms:\n");
 +            iatoms=ilist->iatoms;
 +            for (i=j=0; i<ilist->nr;) {
 +#ifndef DEBUG
 +                (void) pr_indent(fp,indent+INDENT);
 +                type=*(iatoms++);
 +                ftype=functype[type];
 +                (void) fprintf(fp,"%d type=%d (%s)",
 +                               bShowNumbers?j:-1,bShowNumbers?type:-1,
 +                               interaction_function[ftype].name);
 +                j++;
 +                for (k=0; k<interaction_function[ftype].nratoms; k++)
 +                    (void) fprintf(fp," %u",*(iatoms++));
 +                (void) fprintf(fp,"\n");
 +                i+=1+interaction_function[ftype].nratoms;
 +#else
 +                fprintf(fp,"%5d%5d\n",i,iatoms[i]);
 +                i++;
 +#endif
 +            }
 +        }
 +    }
 +}
 +
 +static void pr_cmap(FILE *fp, int indent, const char *title,
 +                    gmx_cmap_t *cmap_grid, gmx_bool bShowNumbers)
 +{
 +    int i,j,nelem;
 +    real dx,idx;
 +      
 +    dx    = 360.0 / cmap_grid->grid_spacing;
 +    nelem = cmap_grid->grid_spacing*cmap_grid->grid_spacing;
 +      
 +    if(available(fp,cmap_grid,indent,title))
 +    {
 +        fprintf(fp,"%s\n",title);
 +              
 +        for(i=0;i<cmap_grid->ngrid;i++)
 +        {
 +            idx = -180.0;
 +            fprintf(fp,"%8s %8s %8s %8s\n","V","dVdx","dVdy","d2dV");
 +                      
 +            fprintf(fp,"grid[%3d]={\n",bShowNumbers?i:-1);
 +                      
 +            for(j=0;j<nelem;j++)
 +            {
 +                if( (j%cmap_grid->grid_spacing)==0)
 +                {
 +                    fprintf(fp,"%8.1f\n",idx);
 +                    idx+=dx;
 +                }
 +                              
 +                fprintf(fp,"%8.3f ",cmap_grid->cmapdata[i].cmap[j*4]);
 +                fprintf(fp,"%8.3f ",cmap_grid->cmapdata[i].cmap[j*4+1]);
 +                fprintf(fp,"%8.3f ",cmap_grid->cmapdata[i].cmap[j*4+2]);
 +                fprintf(fp,"%8.3f\n",cmap_grid->cmapdata[i].cmap[j*4+3]);
 +            }
 +            fprintf(fp,"\n");
 +        }
 +    }
 +      
 +}
 +
 +void pr_ffparams(FILE *fp,int indent,const char *title,
 +                 gmx_ffparams_t *ffparams,
 +                 gmx_bool bShowNumbers)
 +{
 +  int i,j;
 +  
 +  indent=pr_title(fp,indent,title);
 +  (void) pr_indent(fp,indent);
 +  (void) fprintf(fp,"atnr=%d\n",ffparams->atnr);
 +  (void) pr_indent(fp,indent);
 +  (void) fprintf(fp,"ntypes=%d\n",ffparams->ntypes);
 +  for (i=0; i<ffparams->ntypes; i++) {
 +      (void) pr_indent(fp,indent+INDENT);
 +      (void) fprintf(fp,"functype[%d]=%s, ",
 +                     bShowNumbers?i:-1,
 +                     interaction_function[ffparams->functype[i]].name);
 +      pr_iparams(fp,ffparams->functype[i],&ffparams->iparams[i]);
 +  }
 +  (void) pr_double(fp,indent,"reppow",ffparams->reppow);
 +  (void) pr_real(fp,indent,"fudgeQQ",ffparams->fudgeQQ);
 +  pr_cmap(fp,indent,"cmap",&ffparams->cmap_grid,bShowNumbers);
 +}
 +
 +void pr_idef(FILE *fp,int indent,const char *title,t_idef *idef, gmx_bool bShowNumbers)
 +{
 +  int i,j;
 +  
 +  if (available(fp,idef,indent,title)) {  
 +    indent=pr_title(fp,indent,title);
 +    (void) pr_indent(fp,indent);
 +    (void) fprintf(fp,"atnr=%d\n",idef->atnr);
 +    (void) pr_indent(fp,indent);
 +    (void) fprintf(fp,"ntypes=%d\n",idef->ntypes);
 +    for (i=0; i<idef->ntypes; i++) {
 +      (void) pr_indent(fp,indent+INDENT);
 +      (void) fprintf(fp,"functype[%d]=%s, ",
 +                   bShowNumbers?i:-1,
 +                   interaction_function[idef->functype[i]].name);
 +      pr_iparams(fp,idef->functype[i],&idef->iparams[i]);
 +    }
 +    (void) pr_real(fp,indent,"fudgeQQ",idef->fudgeQQ);
 +
 +    for(j=0; (j<F_NRE); j++)
 +      pr_ilist(fp,indent,interaction_function[j].longname,
 +               idef->functype,&idef->il[j],bShowNumbers);
 +  }
 +}
 +
 +static int pr_block_title(FILE *fp,int indent,const char *title,t_block *block)
 +{
 +  int i;
 +
 +  if (available(fp,block,indent,title))
 +    {
 +      indent=pr_title(fp,indent,title);
 +      (void) pr_indent(fp,indent);
 +      (void) fprintf(fp,"nr=%d\n",block->nr);
 +    }
 +  return indent;
 +}
 +
 +static int pr_blocka_title(FILE *fp,int indent,const char *title,t_blocka *block)
 +{
 +  int i;
 +
 +  if (available(fp,block,indent,title))
 +    {
 +      indent=pr_title(fp,indent,title);
 +      (void) pr_indent(fp,indent);
 +      (void) fprintf(fp,"nr=%d\n",block->nr);
 +      (void) pr_indent(fp,indent);
 +      (void) fprintf(fp,"nra=%d\n",block->nra);
 +    }
 +  return indent;
 +}
 +
 +static void low_pr_block(FILE *fp,int indent,const char *title,t_block *block, gmx_bool bShowNumbers)
 +{
 +  int i;
 +  
 +  if (available(fp,block,indent,title))
 +    {
 +      indent=pr_block_title(fp,indent,title,block);
 +      for (i=0; i<=block->nr; i++)
 +        {
 +          (void) pr_indent(fp,indent+INDENT);
 +          (void) fprintf(fp,"%s->index[%d]=%u\n",
 +                       title,bShowNumbers?i:-1,block->index[i]);
 +        }
 +    }
 +}
 +
 +static void low_pr_blocka(FILE *fp,int indent,const char *title,t_blocka *block, gmx_bool bShowNumbers)
 +{
 +  int i;
 +  
 +  if (available(fp,block,indent,title))
 +    {
 +      indent=pr_blocka_title(fp,indent,title,block);
 +      for (i=0; i<=block->nr; i++)
 +        {
 +          (void) pr_indent(fp,indent+INDENT);
 +          (void) fprintf(fp,"%s->index[%d]=%u\n",
 +                       title,bShowNumbers?i:-1,block->index[i]);
 +        }
 +      for (i=0; i<block->nra; i++)
 +        {
 +          (void) pr_indent(fp,indent+INDENT);
 +          (void) fprintf(fp,"%s->a[%d]=%u\n",
 +                       title,bShowNumbers?i:-1,block->a[i]);
 +        }
 +    }
 +}
 +
 +void pr_block(FILE *fp,int indent,const char *title,t_block *block,gmx_bool bShowNumbers)
 +{
 +  int i,j,ok,size,start,end;
 +  
 +  if (available(fp,block,indent,title))
 +    {
 +      indent=pr_block_title(fp,indent,title,block);
 +      start=0;
 +      end=start;
 +      if ((ok=(block->index[start]==0))==0)
 +        (void) fprintf(fp,"block->index[%d] should be 0\n",start);
 +      else
 +        for (i=0; i<block->nr; i++)
 +          {
 +            end=block->index[i+1];
 +            size=pr_indent(fp,indent);
 +            if (end<=start)
 +              size+=fprintf(fp,"%s[%d]={}\n",title,i);
 +            else
 +              size+=fprintf(fp,"%s[%d]={%d..%d}\n",
 +                          title,bShowNumbers?i:-1,
 +                          bShowNumbers?start:-1,bShowNumbers?end-1:-1);
 +            start=end;
 +          }
 +    }
 +}
 +
 +void pr_blocka(FILE *fp,int indent,const char *title,t_blocka *block,gmx_bool bShowNumbers)
 +{
 +  int i,j,ok,size,start,end;
 +  
 +  if (available(fp,block,indent,title))
 +    {
 +      indent=pr_blocka_title(fp,indent,title,block);
 +      start=0;
 +      end=start;
 +      if ((ok=(block->index[start]==0))==0)
 +        (void) fprintf(fp,"block->index[%d] should be 0\n",start);
 +      else
 +        for (i=0; i<block->nr; i++)
 +          {
 +            end=block->index[i+1];
 +            size=pr_indent(fp,indent);
 +            if (end<=start)
 +              size+=fprintf(fp,"%s[%d]={",title,i);
 +            else
 +              size+=fprintf(fp,"%s[%d][%d..%d]={",
 +                          title,bShowNumbers?i:-1,
 +                          bShowNumbers?start:-1,bShowNumbers?end-1:-1);
 +            for (j=start; j<end; j++)
 +              {
 +                if (j>start) size+=fprintf(fp,", ");
 +                if ((size)>(USE_WIDTH))
 +                  {
 +                    (void) fprintf(fp,"\n");
 +                    size=pr_indent(fp,indent+INDENT);
 +                  }
 +                size+=fprintf(fp,"%u",block->a[j]);
 +              }
 +            (void) fprintf(fp,"}\n");
 +            start=end;
 +          }
 +      if ((end!=block->nra)||(!ok)) 
 +        {
 +          (void) pr_indent(fp,indent);
 +          (void) fprintf(fp,"tables inconsistent, dumping complete tables:\n");
 +          low_pr_blocka(fp,indent,title,block,bShowNumbers);
 +        }
 +    }
 +}
 +
 +static void pr_strings(FILE *fp,int indent,const char *title,char ***nm,int n, gmx_bool bShowNumbers)
 +{
 +  int i;
 +
 +  if (available(fp,nm,indent,title))
 +    {  
 +      indent=pr_title_n(fp,indent,title,n);
 +      for (i=0; i<n; i++)
 +        {
 +          (void) pr_indent(fp,indent);
 +          (void) fprintf(fp,"%s[%d]={name=\"%s\"}\n",
 +                       title,bShowNumbers?i:-1,*(nm[i]));
 +        }
 +    }
 +}
 +
 +static void pr_strings2(FILE *fp,int indent,const char *title,
 +                      char ***nm,char ***nmB,int n, gmx_bool bShowNumbers)
 +{
 +  int i;
 +
 +  if (available(fp,nm,indent,title))
 +    {  
 +      indent=pr_title_n(fp,indent,title,n);
 +      for (i=0; i<n; i++)
 +        {
 +          (void) pr_indent(fp,indent);
 +          (void) fprintf(fp,"%s[%d]={name=\"%s\",nameB=\"%s\"}\n",
 +                       title,bShowNumbers?i:-1,*(nm[i]),*(nmB[i]));
 +        }
 +    }
 +}
 +
 +static void pr_resinfo(FILE *fp,int indent,const char *title,t_resinfo *resinfo,int n, gmx_bool bShowNumbers)
 +{
 +    int i;
 +    
 +    if (available(fp,resinfo,indent,title))
 +    {  
 +        indent=pr_title_n(fp,indent,title,n);
 +        for (i=0; i<n; i++)
 +        {
 +            (void) pr_indent(fp,indent);
 +            (void) fprintf(fp,"%s[%d]={name=\"%s\", nr=%d, ic='%c'}\n",
 +                           title,bShowNumbers?i:-1,
 +                           *(resinfo[i].name),resinfo[i].nr,
 +                           (resinfo[i].ic == '\0') ? ' ' : resinfo[i].ic);
 +        }
 +    }
 +}
 +
 +static void pr_atom(FILE *fp,int indent,const char *title,t_atom *atom,int n)
 +{
 +  int i,j;
 +  
 +  if (available(fp,atom,indent,title)) {  
 +    indent=pr_title_n(fp,indent,title,n);
 +    for (i=0; i<n; i++) {
 +      (void) pr_indent(fp,indent);
 +      fprintf(fp,"%s[%6d]={type=%3d, typeB=%3d, ptype=%8s, m=%12.5e, "
 +              "q=%12.5e, mB=%12.5e, qB=%12.5e, resind=%5d, atomnumber=%3d}\n",
 +              title,i,atom[i].type,atom[i].typeB,ptype_str[atom[i].ptype],
 +              atom[i].m,atom[i].q,atom[i].mB,atom[i].qB,
 +              atom[i].resind,atom[i].atomnumber);
 +    }
 +  }
 +}
 +
 +static void pr_grps(FILE *fp,int indent,const char *title,t_grps grps[],
 +                  char **grpname[], gmx_bool bShowNumbers)
 +{
 +    int i,j;
 +
 +    for(i=0; (i<egcNR); i++)
 +    {
 +        fprintf(fp,"%s[%-12s] nr=%d, name=[",title,gtypes[i],grps[i].nr);
 +        for(j=0; (j<grps[i].nr); j++)
 +        {
 +            fprintf(fp," %s",*(grpname[grps[i].nm_ind[j]]));
 +        }
 +        fprintf(fp,"]\n");
 +    }
 +}
 +
 +static void pr_groups(FILE *fp,int indent,const char *title,
 +                      gmx_groups_t *groups,
 +                      gmx_bool bShowNumbers)
 +{
 +    int grpnr[egcNR];
 +    int nat_max,i,g;
 +
 +    pr_grps(fp,indent,"grp",groups->grps,groups->grpname,bShowNumbers);
 +    pr_strings(fp,indent,"grpname",groups->grpname,groups->ngrpname,bShowNumbers);
 +
 +    (void) pr_indent(fp,indent);
 +    fprintf(fp,"groups          ");
 +    for(g=0; g<egcNR; g++)
 +    {
 +       printf(" %5.5s",gtypes[g]);
 +    }
 +    printf("\n");
 +
 +    (void) pr_indent(fp,indent);
 +    fprintf(fp,"allocated       ");
 +    nat_max = 0;
 +    for(g=0; g<egcNR; g++)
 +    {
 +        printf(" %5d",groups->ngrpnr[g]);
 +        nat_max = max(nat_max,groups->ngrpnr[g]);
 +    }
 +    printf("\n");
 +
 +    if (nat_max == 0)
 +    {
 +        (void) pr_indent(fp,indent);
 +        fprintf(fp,"groupnr[%5s] =","*");
 +        for(g=0; g<egcNR; g++)
 +        {
 +            fprintf(fp,"  %3d ",0);
 +        }
 +        fprintf(fp,"\n");
 +    }
 +    else
 +    {
 +        for(i=0; i<nat_max; i++)
 +        {
 +            (void) pr_indent(fp,indent);
 +            fprintf(fp,"groupnr[%5d] =",i);
 +            for(g=0; g<egcNR; g++)
 +            {
 +                fprintf(fp,"  %3d ",
 +                        groups->grpnr[g] ? groups->grpnr[g][i] : 0);
 +            }
 +            fprintf(fp,"\n");
 +        }
 +    }
 +}
 +
 +void pr_atoms(FILE *fp,int indent,const char *title,t_atoms *atoms, 
 +            gmx_bool bShownumbers)
 +{
 +  if (available(fp,atoms,indent,title))
 +    {
 +      indent=pr_title(fp,indent,title);
 +      pr_atom(fp,indent,"atom",atoms->atom,atoms->nr);
 +      pr_strings(fp,indent,"atom",atoms->atomname,atoms->nr,bShownumbers);
 +      pr_strings2(fp,indent,"type",atoms->atomtype,atoms->atomtypeB,atoms->nr,bShownumbers);
 +      pr_resinfo(fp,indent,"residue",atoms->resinfo,atoms->nres,bShownumbers);
 +    }
 +}
 +
 +
 +void pr_atomtypes(FILE *fp,int indent,const char *title,t_atomtypes *atomtypes, 
 +                gmx_bool bShowNumbers)
 +{
 +  int i;
 +  if (available(fp,atomtypes,indent,title)) 
 +  {
 +    indent=pr_title(fp,indent,title);
 +    for(i=0;i<atomtypes->nr;i++) {
 +      pr_indent(fp,indent);
 +              fprintf(fp,
 +                              "atomtype[%3d]={radius=%12.5e, volume=%12.5e, gb_radius=%12.5e, surftens=%12.5e, atomnumber=%4d, S_hct=%12.5e)}\n",
 +                              bShowNumbers?i:-1,atomtypes->radius[i],atomtypes->vol[i],
 +                              atomtypes->gb_radius[i],
 +                              atomtypes->surftens[i],atomtypes->atomnumber[i],atomtypes->S_hct[i]);
 +    }
 +  }
 +}
 +
 +static void pr_moltype(FILE *fp,int indent,const char *title,
 +                       gmx_moltype_t *molt,int n,
 +                       gmx_ffparams_t *ffparams,
 +                       gmx_bool bShowNumbers)
 +{
 +    int j;
 +
 +    indent = pr_title_n(fp,indent,title,n);
 +    (void) pr_indent(fp,indent);
 +    (void) fprintf(fp,"name=\"%s\"\n",*(molt->name));
 +    pr_atoms(fp,indent,"atoms",&(molt->atoms),bShowNumbers);
 +    pr_block(fp,indent,"cgs",&molt->cgs, bShowNumbers);
 +    pr_blocka(fp,indent,"excls",&molt->excls, bShowNumbers);
 +    for(j=0; (j<F_NRE); j++) {
 +        pr_ilist(fp,indent,interaction_function[j].longname,
 +                 ffparams->functype,&molt->ilist[j],bShowNumbers);
 +    }
 +}
 +
 +static void pr_molblock(FILE *fp,int indent,const char *title,
 +                        gmx_molblock_t *molb,int n,
 +                        gmx_moltype_t *molt,
 +                        gmx_bool bShowNumbers)
 +{
 +    indent = pr_title_n(fp,indent,title,n);
 +    (void) pr_indent(fp,indent);
 +    (void) fprintf(fp,"%-20s = %d \"%s\"\n",
 +                   "moltype",molb->type,*(molt[molb->type].name));
 +    pr_int(fp,indent,"#molecules",molb->nmol);
 +    pr_int(fp,indent,"#atoms_mol",molb->natoms_mol);
 +    pr_int(fp,indent,"#posres_xA",molb->nposres_xA);
 +    if (molb->nposres_xA > 0) {
 +        pr_rvecs(fp,indent,"posres_xA",molb->posres_xA,molb->nposres_xA);
 +    }
 +    pr_int(fp,indent,"#posres_xB",molb->nposres_xB);
 +    if (molb->nposres_xB > 0) {
 +        pr_rvecs(fp,indent,"posres_xB",molb->posres_xB,molb->nposres_xB);
 +    }
 +}
 +
 +void pr_mtop(FILE *fp,int indent,const char *title,gmx_mtop_t *mtop,
 +             gmx_bool bShowNumbers)
 +{
 +    int mt,mb;
 +
 +    if (available(fp,mtop,indent,title)) {
 +        indent=pr_title(fp,indent,title);
 +        (void) pr_indent(fp,indent);
 +        (void) fprintf(fp,"name=\"%s\"\n",*(mtop->name));
 +        pr_int(fp,indent,"#atoms",mtop->natoms);
 +        for(mb=0; mb<mtop->nmolblock; mb++) {
 +            pr_molblock(fp,indent,"molblock",&mtop->molblock[mb],mb,
 +                        mtop->moltype,bShowNumbers);
 +        }
 +        pr_ffparams(fp,indent,"ffparams",&(mtop->ffparams),bShowNumbers);
 +        pr_atomtypes(fp,indent,"atomtypes",&(mtop->atomtypes),bShowNumbers);
 +        for(mt=0; mt<mtop->nmoltype; mt++) {
 +            pr_moltype(fp,indent,"moltype",&mtop->moltype[mt],mt,
 +                       &mtop->ffparams,bShowNumbers);
 +        }
 +        pr_groups(fp,indent,"groups",&mtop->groups,bShowNumbers);
 +    }
 +}
 +
 +void pr_top(FILE *fp,int indent,const char *title,t_topology *top, gmx_bool bShowNumbers)
 +{
 +  if (available(fp,top,indent,title)) {
 +    indent=pr_title(fp,indent,title);
 +    (void) pr_indent(fp,indent);
 +    (void) fprintf(fp,"name=\"%s\"\n",*(top->name));
 +    pr_atoms(fp,indent,"atoms",&(top->atoms),bShowNumbers);
 +    pr_atomtypes(fp,indent,"atomtypes",&(top->atomtypes),bShowNumbers);
 +    pr_block(fp,indent,"cgs",&top->cgs, bShowNumbers);
 +    pr_block(fp,indent,"mols",&top->mols, bShowNumbers);
 +    pr_blocka(fp,indent,"excls",&top->excls, bShowNumbers);
 +    pr_idef(fp,indent,"idef",&top->idef,bShowNumbers);
 +  }
 +}
 +
 +void pr_header(FILE *fp,int indent,const char *title,t_tpxheader *sh)
 +{
 +  char buf[22];
 +    
 +  if (available(fp,sh,indent,title))
 +    {
 +      indent=pr_title(fp,indent,title);
 +      pr_indent(fp,indent);
 +      fprintf(fp,"bIr    = %spresent\n",sh->bIr?"":"not ");
 +      pr_indent(fp,indent);
 +      fprintf(fp,"bBox   = %spresent\n",sh->bBox?"":"not ");
 +      pr_indent(fp,indent);
 +      fprintf(fp,"bTop   = %spresent\n",sh->bTop?"":"not ");
 +      pr_indent(fp,indent);
 +      fprintf(fp,"bX     = %spresent\n",sh->bX?"":"not ");
 +      pr_indent(fp,indent);
 +      fprintf(fp,"bV     = %spresent\n",sh->bV?"":"not ");
 +      pr_indent(fp,indent);
 +      fprintf(fp,"bF     = %spresent\n",sh->bF?"":"not ");
 +      
 +      pr_indent(fp,indent);
 +      fprintf(fp,"natoms = %d\n",sh->natoms);
 +      pr_indent(fp,indent);
 +      fprintf(fp,"lambda = %e\n",sh->lambda);
 +    }
 +}
 +
 +void pr_commrec(FILE *fp,int indent,t_commrec *cr)
 +{
 +  pr_indent(fp,indent);
 +  fprintf(fp,"commrec:\n");
 +  indent+=2;
 +  pr_indent(fp,indent);
 +  fprintf(fp,"nodeid    = %d\n",cr->nodeid);
 +  pr_indent(fp,indent);
 +  fprintf(fp,"nnodes    = %d\n",cr->nnodes);
 +  pr_indent(fp,indent);
 +  fprintf(fp,"npmenodes = %d\n",cr->npmenodes);
 +  /*
 +  pr_indent(fp,indent);
 +  fprintf(fp,"threadid  = %d\n",cr->threadid);
 +  pr_indent(fp,indent);
 +  fprintf(fp,"nthreads  = %d\n",cr->nthreads);
 +  */
 +}
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
index f912424815ef6b03c0c41b213aa8ad9c97358616,0000000000000000000000000000000000000000..d3742d050e181d06c356262012957b3e5f167956
mode 100644,000000..100644
--- /dev/null
@@@ -1,285 -1,0 +1,286 @@@
-                          :"=r"(ret)
-                          :"m"(a->value), "0"(ret)
 +/*
 +This source code file is part of thread_mpi.  
 +Written by Sander Pronk, Erik Lindahl, and possibly others. 
 +
 +Copyright (c) 2009, Sander Pronk, Erik Lindahl.
 +All rights reserved.
 +
 +Redistribution and use in source and binary forms, with or without
 +modification, are permitted provided that the following conditions are met:
 +1) Redistributions of source code must retain the above copyright
 +   notice, this list of conditions and the following disclaimer.
 +2) Redistributions in binary form must reproduce the above copyright
 +   notice, this list of conditions and the following disclaimer in the
 +   documentation and/or other materials provided with the distribution.
 +3) Neither the name of the copyright holders nor the
 +   names of its contributors may be used to endorse or promote products
 +   derived from this software without specific prior written permission.
 +
 +THIS SOFTWARE IS PROVIDED BY US ''AS IS'' AND ANY
 +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 +DISCLAIMED. IN NO EVENT SHALL WE BE LIABLE FOR ANY
 +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 +
 +If you want to redistribute modifications, please consider that
 +scientific software is very special. Version control is crucial -
 +bugs must be traceable. We will be happy to consider code for
 +inclusion in the official distribution, but derived work should not
 +be called official thread_mpi. Details are found in the README & COPYING
 +files.
 +*/
 +
 +
 +
 +#include <limits.h>
 +#include <stdint.h>
 +/* This code is executed for x86 and x86-64, with these compilers:
 + * GNU
 + * Intel 
 + * Pathscale
 + * All these support GCC-style inline assembly. 
 + * We also use this section for the documentation.
 + */
 +
 +
 +#if 0
 +/* Only gcc and Intel support this check, otherwise set it to true (skip doc) */
 +#if (!defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined DOXYGEN)
 +#define __builtin_constant_p(i) (1)
 +#endif
 +#endif
 +
 +/* we put all of these on their own cache line by specifying an alignment: */
 +typedef struct tMPI_Atomic
 +{
      /* integer payload; the attribute requests 64-byte alignment for it */
 +    int value __attribute__ ((aligned(64))); 
 +} tMPI_Atomic_t;
 +
 +typedef struct tMPI_Atomic_ptr
 +{
      /* pointer payload; the attribute requests 64-byte alignment for it */
 +    void* value __attribute__ ((aligned(64)));   
 +} tMPI_Atomic_ptr_t;
 +
 +typedef struct tMPI_Spinlock
 +{
      /* lock word: the spinlock routines in this header store 0 for "free"
         and 1 for "held"; the attribute requests 64-byte alignment */
 +    unsigned int lock __attribute__ ((aligned(64)));
 +} tMPI_Spinlock_t;
 +
 +
 +#define TMPI_SPINLOCK_INITIALIZER   { 0 }
 +
 +
 +
 +/* plain reads and writes of the value member (no lock prefix); these are guaranteed to be  atomic on x86 and x86_64 */
 +#define tMPI_Atomic_get(a)  ((a)->value) 
 +#define tMPI_Atomic_set(a,i)  (((a)->value) = (i))
 +
 +#define tMPI_Atomic_ptr_get(a)  ((a)->value) 
 +#define tMPI_Atomic_ptr_set(a,i)  (((a)->value) = (void*)(i))
 +
 +
 +/* do the intrinsics. 
 +   
 +   We disable this for 32-bit builds because the target may be 80386, 
 +   which didn't have cmpxchg, etc. (they were introduced only as 'recently' 
 +   as the 486, and gcc on some Linux versions still targets 80386 by default). 
 +  
 +   We also specifically check for icc, because intrinsics are not always
 +   supported there. */
 +#if ( (TMPI_GCC_VERSION >= 40100) && defined(__x86_64__) &&  \
 +     !defined(__INTEL_COMPILER) ) 
 +#include "gcc_intrinsics.h"
 +
 +#else
 +/* older versions of gcc don't support atomic intrinsics */
 +
 +
 +#define tMPI_Atomic_memory_barrier() __asm__ __volatile__("sfence;": : :"memory")
  /* The barrier macro above executes an x86 sfence instruction; its "memory"
     clobber additionally tells the compiler that memory may have changed.
     NOTE(review): sfence is the store fence; confirm that callers do not
     rely on load ordering from this macro. */
 + 
 +static inline int tMPI_Atomic_add_return(tMPI_Atomic_t *a, int i)
 +{
      /* Atomically add i to a->value and return the new value.
         "lock xaddl" adds the register operand to memory and leaves the
         previous memory value in the register, so after the asm i holds the
         old value; adding the saved increment __i gives the value that was
         stored. */
 +    int __i;
 +    
 +    __i = i;
 +    __asm__ __volatile__("lock ; xaddl %0, %1;"
 +                         :"=r"(i) :"m"(a->value), "0"(i) : "memory");
 +    return i + __i;
 +}  
 +
 +static inline int tMPI_Atomic_fetch_add(tMPI_Atomic_t *a, int i)
 +{
      /* Atomically add i to a->value and return the value it had before the
         addition: "lock xaddl" leaves the previous memory value in the
         register operand, which is i. */
 +    __asm__ __volatile__("lock ; xaddl %0, %1;"
 +                         :"=r"(i) :"m"(a->value), "0"(i) : "memory");
 +    return i;
 +}
 +
 +static inline int tMPI_Atomic_cas(tMPI_Atomic_t *a, int oldval, int newval)
 +{
      /* Compare-and-swap: if a->value equals oldval, store newval in it.
         oldval is passed in the accumulator (constraints "0"/"=a"), and
         "lock cmpxchgl" leaves the value found in memory there, so prev is
         what a->value held.  Returns nonzero when that was oldval, i.e.
         when the swap took place, and 0 otherwise. */
 +    int prev;
 +    
 +    __asm__ __volatile__("lock ; cmpxchgl %1,%2"
 +                         : "=a"(prev)
 +                         : "q"(newval), "m"(a->value), "0"(oldval)
 +                         : "memory");
 +    
 +    return prev==oldval;
 +}
 +
 +static inline int tMPI_Atomic_ptr_cas(tMPI_Atomic_ptr_t *a, 
 +                                      void *oldval,
 +                                      void *newval)
 +{
      /* Pointer compare-and-swap: if a->value equals oldval, store newval
         in it.  Returns nonzero when the swap took place, 0 otherwise.
         The 32-bit instruction (cmpxchgl) is used unless __x86_64__ is
         defined, in which case the 64-bit one (cmpxchgq) is used. */
 +    void* prev;
 +#ifndef __x86_64__ 
 +    __asm__ __volatile__("lock ; cmpxchgl %1,%2"
 +                         : "=a"(prev)
 +                         : "q"(newval), "m"(a->value), "0"(oldval)
 +                         : "memory");
 +#else 
 +    __asm__ __volatile__("lock ; cmpxchgq %1,%2"
 +                         : "=a"(prev)
 +                         : "q"(newval), "m"(a->value), "0"(oldval)
 +                         : "memory");
 +#endif
 +    return prev==oldval;
 +}
 +
 +#endif /* end of check for gcc intrinsics */
 +
 +#define TMPI_HAVE_SWAP
 +/* do the swap fns; we told the intrinsics that we have them. */
 +static inline int tMPI_Atomic_swap(tMPI_Atomic_t *a, int b)
 +{
      /* Exchange b with a->value using xchgl and return the value that
         a->value held before.  Both operands are read and written by the
         instruction, which is what the "+r" and "+m" constraints on the
         added lines express. */
 +    volatile int ret=b;
 +    __asm__ __volatile__("\txchgl %0, %1;" 
-                          :"=r"(ret)
-                          :"m"(a->value), "0"(ret)
++                         :"+r"(ret), "+m"(a->value)
++                         : 
 +                         :"memory");
 +    return (int)ret;
 +}
 +
 +static inline void *tMPI_Atomic_ptr_swap(tMPI_Atomic_ptr_t *a, void *b)
 +{
      /* Exchange the pointer b with a->value and return the pointer that
         a->value held before.  xchgl is used unless __x86_64__ is defined,
         in which case xchgq is used.
         NOTE(review): ret is declared as a pointer to a volatile pointer but
         simply carries the value of b (and afterwards the old a->value) in a
         register; it is never dereferenced here. */
 +    void *volatile *ret=(void* volatile*)b;
 +#ifndef __x86_64__ 
 +/*    __asm__ __volatile__("\txchgl %0, %1;" 
 +                         :"=m"(a->value),"=q"(b) 
 +                         :"q"(b)
 +                         :"memory");
 +*/
 +    __asm__ __volatile__("\txchgl %0, %1;" 
-                          :"=r"(ret)
-                          :"m"(a->value), "0"(ret)
++                         :"+r"(ret), "+m"(a->value)
++                         :
 +                         :"memory");
 +
 +#else
 +    __asm__ __volatile__("\txchgq %0, %1;" 
-                          : "=m" (x->lock)         /* input & output var */
++                         :"+r"(ret), "+m"(a->value)
++                         :
 +                         :"memory");
 +#endif
 +    return (void*)ret;
 +}
 +
 +
 +
 +/* spinlocks : */
 +
 +static inline void tMPI_Spinlock_init(tMPI_Spinlock_t *x)
 +{
      /* put the lock in the free state (lock word 0) with a plain store */
 +    x->lock = 0;
 +}
 +
 +
 +
 +static inline void tMPI_Spinlock_lock(tMPI_Spinlock_t *x)
 +{
 +    /* this is a spinlock with a double loop, as recommended by Intel
 +       it pauses in the outer loop (the one that just checks for the 
 +       availability of the lock), and thereby reduces bus contention and
 +       prevents the pipeline from flushing.
 +       The lock word is 0 when free.  Once it reads 0, a 1 is exchanged
 +       into it with xchgl; if the value exchanged out is not 0 somebody
 +       else took the lock first and the whole sequence starts again.
 +       The function only returns once the lock has been taken. */
 +    __asm__ __volatile__("1:\tcmpl $0, %0\n"      /* check the lock */
 +                         "\tje 2f\n"              /* try to lock if it is
 +                                                     free by jumping forward */
 +                         "\tpause\n"              /* otherwise: small pause
 +                                                     as recommended by Intel */
 +                         "\tjmp 1b\n"             /* and jump back */  
 +
 +                         "2:\tmovl $1, %%eax\n"   /* set eax to 1, the locked
 +                                                     value of the lock */
 +                         "\txchgl %%eax, %0\n"    /* atomically exchange 
 +                                                     eax with the lock value */
 +                         "\tcmpl $0, %%eax\n"     /* compare the exchanged
 +                                                     value with 0 */
 +                         "\tjne 1b"               /* jump backward if we didn't
 +                                                     just lock */
-     int old_value;
++                         : "+m" (x->lock)         /* input & output var */
 +                         : 
 +                         : "%eax", "memory"/* we changed memory */
 +                        );
 +}
 +
 +
 +
 +static inline void tMPI_Spinlock_unlock(tMPI_Spinlock_t *  x)
 +{
 +    /* this is apparently all that is needed for unlocking a lock:
 +       a plain movl of 0 (the free value) into the lock word, with a
 +       "memory" clobber for the compiler */
 +    __asm__ __volatile__(
 +                     "\n\tmovl $0, %0\n"
 +                     : "=m"(x->lock) : : "memory" );
 +}
 +
 +
 +
 +static inline int tMPI_Spinlock_trylock(tMPI_Spinlock_t *x)
 +{
      /* Single attempt to take the lock, without spinning: a 1 is exchanged
         into the lock word with xchgl and the value exchanged out is
         returned.  The result is therefore 0 when the lock was free and has
         now been taken, and nonzero when it was already held.
         In the added lines the register is picked by the compiler
         (constraint "+r"), so the eax/rdx names in the inline comments are
         only illustrative. */
-     __asm__ __volatile__("\tmovl $1, %0\n"     /* set eax to 1, the locked
++    int old_value=1;
 +        
-                          :"=r" (old_value), "=m" (x->lock)
-                          : : "memory");
++    __asm__ __volatile__("\tmovl %2, %0\n"     /* set eax to 1, the locked
 +                                                  value of the lock */
 +                         "\txchgl %0, %1\n"    /* atomically exchange 
 +                                                  eax with the address in
 +                                                  rdx. */
-                          : "=m"(x->lock)         /* input & output var */
++                         : "+r"(old_value), "+m" (x->lock)
++                         : "i" (1)
++                         : "memory");
 +    return (old_value);
 +}
 +
 + 
 +
 +static inline int tMPI_Spinlock_islocked(const tMPI_Spinlock_t *x)
 +{
 +    return ( (*((volatile int*)(&(x->lock)))) != 0);
 +}
 +
 +
 +static inline void tMPI_Spinlock_wait(tMPI_Spinlock_t *x)
 +{
 +    /* this is the spinlock without the xchg: it only spins, with a
 +       pause per iteration, until the lock word reads 0, and it never
 +       writes the lock word itself.  */
 +    __asm__ __volatile__("1:\tcmpl $0, %0\n"      /* check the lock */
 +                         "\tje 2f\n"              /* stop waiting if it is
 +                                                     free by jumping forward */
 +                         "\tpause\n"              /* otherwise: small pause
 +                                                     as recommended by Intel */
 +                         "\tjmp 1b\n"             /* and jump back */  
 +                         "2:\tnop\n"              /* jump target for end 
 +                                                     of wait */
++                         : "+m"(x->lock)         /* input & output var */
 +                         : 
 +                         : "memory"/* we changed memory */
 +                        );
 +#if 0
 +    do 
 +    {
 +        tMPI_Atomic_memory_barrier(); 
 +    } 
 +    while(tMPI_Spinlock_islocked(x));
 +#endif
 +}
 +
Simple merge
Simple merge
Simple merge
Simple merge
index aa7cf13009b1747a3acf4c829bce46b409e66306,0000000000000000000000000000000000000000..5198ba6ca0ad383e144f4678d34418cef41d5223
mode 100644,000000..100644
--- /dev/null
@@@ -1,2716 -1,0 +1,4268 @@@
-  * 
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
-  * 
++ *
 + *                This source code is part of
-  * 
++ *
 + *                 G   R   O   M   A   C   S
-  * 
++ *
 + *          GROningen MAchine for Chemical Simulations
-  * 
++ *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
-  * 
++ *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
-  * 
++ *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
-  * 
++ *
 + * For more info, check our website at http://www.gromacs.org
-  */ 
++ *
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +/* IMPORTANT FOR DEVELOPERS:
 + *
 + * Triclinic pme stuff isn't entirely trivial, and we've experienced
 + * some bugs during development (many of them due to me). To avoid
 + * this in the future, please check the following things if you make
 + * changes in this file:
 + *
 + * 1. You should obtain identical (at least to the PME precision)
 + *    energies, forces, and virial for
 + *    a rectangular box and a triclinic one where the z (or y) axis is
 + *    tilted a whole box side. For instance you could use these boxes:
 + *
 + *    rectangular       triclinic
 + *     2  0  0           2  0  0
 + *     0  2  0           0  2  0
 + *     0  0  6           2  2  6
 + *
 + * 2. You should check the energy conservation in a triclinic box.
 + *
 + * It might seem an overkill, but better safe than sorry.
 + * /Erik 001109
-     gmx_bool bSpread;           /* These coordinates are used for spreading */
++ */
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#endif
 +#ifdef GMX_THREADS
 +#include "tmpi.h"
 +#endif
 +
++#ifdef GMX_OPENMP
++#include <omp.h>
++#endif
 +
 +#include <stdio.h>
 +#include <string.h>
 +#include <math.h>
 +#include "typedefs.h"
 +#include "txtdump.h"
 +#include "vec.h"
 +#include "gmxcomplex.h"
 +#include "smalloc.h"
 +#include "futil.h"
 +#include "coulomb.h"
 +#include "gmx_fatal.h"
 +#include "pme.h"
 +#include "network.h"
 +#include "physics.h"
 +#include "nrnb.h"
 +#include "copyrite.h"
 +#include "gmx_wallcycle.h"
 +#include "gmx_parallel_3dfft.h"
 +#include "pdbio.h"
++#include "gmx_cyclecounter.h"
 +#include "macros.h"
 +
 +#if ( !defined(GMX_DOUBLE) && ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_X86_64_SSE2) ) )
 +#include "gmx_sse2_single.h"
++
++#define PME_SSE
++/* Some old AMD processors could have problems with unaligned loads+stores */
++#ifndef GMX_FAHCORE
++#define PME_SSE_UNALIGNED
++#endif
 +#endif
 +
 +#define DFT_TOL 1e-7
 +/* #define PRT_FORCE */
 +/* conditions for on the fly time-measurement */
 +/* #define TAKETIME (step > 1 && timesteps < 10) */
 +#define TAKETIME FALSE
 +
++/* #define PME_TIME_THREADS */
++
 +#ifdef GMX_DOUBLE
 +#define mpi_type MPI_DOUBLE
 +#else
 +#define mpi_type MPI_FLOAT
 +#endif
 +
++/* GMX_CACHE_SEP should be a multiple of 16 to preserve alignment */
++#define GMX_CACHE_SEP 64
++
++/* We only define a maximum to be able to use local arrays without allocation.
++ * An order larger than 12 should never be needed, even for test cases.
++ * If needed it can be changed here.
++ */
++#define PME_ORDER_MAX 12
++
 +/* Internal datastructures */
 +typedef struct {
      /* Send and receive index data for one entry of pme_overlap_t.comm_data.
         NOTE(review): presumably each pair is (first index, number of
         indices) of a grid range - confirm against the code that fills it. */
 +    int send_index0;
 +    int send_nindex;
 +    int recv_index0;
 +    int recv_nindex;
 +} pme_grid_comm_t;
 +
 +typedef struct {
      /* Overlap communication data; gmx_pme keeps two of these in overlap[2],
         indexed on dimension (0=x, 1=y).
         NOTE(review): the meaning of s2g0/s2g1 and the length of the
         send_id/recv_id/comm_data arrays (presumably noverlap_nodes) are not
         visible here - confirm where the structure is initialised. */
 +#ifdef GMX_MPI
 +    MPI_Comm mpi_comm;
 +#endif
 +    int  nnodes,nodeid;
 +    int  *s2g0;
 +    int  *s2g1;
 +    int  noverlap_nodes;
 +    int  *send_id,*recv_id;
 +    pme_grid_comm_t *comm_data;
++    real *sendbuf;
++    real *recvbuf;
 +} pme_overlap_t;
 +
++typedef struct {
      /* Particle list of one thread; pme_atomcomm_t holds an array of these
         in its thread_plist member. */
++    int *n;     /* Cumulative counts of the number of particles per thread */
++    int nalloc; /* Allocation size of i */
++    int *i;     /* Particle indices ordered on thread index (n) */
++} thread_plist_t;
++
++typedef struct {
      /* Spline data; pme_atomcomm_t refers to it through its spline member.
         NOTE(review): presumably theta/dtheta hold the spline coefficients
         and their derivatives for the n particles listed in ind - confirm
         against the spreading/gathering code. */
++    int  n;
++    int  *ind;
++    splinevec theta;
++    splinevec dtheta;
++} splinedata_t;
++
 +typedef struct {
      /* Atom communication data for one decomposition index: gmx_pme holds
         atc[2] ("Indexed on decomposition index") plus a separate atc_energy.
         It carries the send/receive bookkeeping, the local coordinates x,
         charges q and forces f, and the per-thread spreading data. */
 +    int  dimind;            /* The index of the dimension, 0=x, 1=y */
 +    int  nslab;
 +    int  nodeid;
 +#ifdef GMX_MPI
 +    MPI_Comm mpi_comm;
 +#endif
 +
 +    int  *node_dest;        /* The nodes to send x and q to with DD */
 +    int  *node_src;         /* The nodes to receive x and q from with DD */
 +    int  *buf_index;        /* Index for commnode into the buffers */
 +
 +    int  maxshift;
 +
 +    int  npd;
 +    int  pd_nalloc;
 +    int  *pd;
 +    int  *count;            /* The number of atoms to send to each node */
++    int  **count_thread;
 +    int  *rcount;           /* The number of atoms to receive */
 +
 +    int  n;
 +    int  nalloc;
 +    rvec *x;
 +    real *q;
 +    rvec *f;
-     splinevec theta,dtheta;
++    gmx_bool bSpread;       /* These coordinates are used for spreading */
 +    int  pme_order;
-                               * lower cell boundary 
 +    ivec *idx;
 +    rvec *fractx;            /* Fractional coordinate relative to the
-     gmx_bool bPPnode;            /* Node also does particle-particle forces */
-     gmx_bool bFEP;               /* Compute Free energy contribution */
++                              * lower cell boundary
 +                              */
++    int  nthread;
++    int  *thread_idx;        /* Which thread should spread which charge */
++    thread_plist_t *thread_plist;
++    splinedata_t *spline;
 +} pme_atomcomm_t;
 +
++#define FLBS  3
++#define FLBSZ 4
++
++typedef struct {
      /* One spreading grid: used both for the full node grid and for the
         per-thread grids collected in pmegrids_t. */
++    ivec ci;     /* The spatial location of this grid       */
++    ivec n;      /* The size of *grid, including order-1    */
++    ivec offset; /* The grid offset from the full node grid */
++    int  order;  /* PME spreading order                     */
++    real *grid;  /* The grid local thread, size n           */
++} pmegrid_t;
++
++typedef struct {
      /* The node grid together with its decomposition over threads; gmx_pme
         holds one of these for pmegridA and one for pmegridB. */
++    pmegrid_t grid;     /* The full node grid (non thread-local)            */
++    int  nthread;       /* The number of threads operating on this grid     */
++    ivec nc;            /* The local spatial decomposition over the threads */
++    pmegrid_t *grid_th; /* Array of grids for each thread                   */
++    int  **g2t;         /* The grid to thread index                         */
++    ivec nthread_comm;  /* The number of threads to communicate with        */
++} pmegrids_t;
++
++
++typedef struct {
      /* Work data for spreading and gathering (gmx_pme.spline_work); it only
         has real content when PME_SSE is defined. */
++#ifdef PME_SSE
++    /* Masks for SSE aligned spreading and gathering */
++    __m128 mask_SSE0[6],mask_SSE1[6];
++#else
++    int dummy; /* C89 requires that struct has at least one member */
++#endif
++} pme_spline_work_t;
++
++typedef struct {
      /* gmx_pme refers to this through its "work" pointer, which is
         described there as thread local work data for solve_pme.
         NOTE(review): nalloc is presumably the allocated length of the real
         arrays below - confirm in the allocation code. */
++    /* work data for solve_pme */
++    int      nalloc;
++    real *   mhx;
++    real *   mhy;
++    real *   mhz;
++    real *   m2;
++    real *   denom;
++    real *   tmp1_alloc;
++    real *   tmp1;
++    real *   eterm;
++    real *   m2inv;
++
++    real     energy;
++    matrix   vir;
++} pme_work_t;
++
 +typedef struct gmx_pme {
      /* Top-level PME data: node and thread counts, grid dimensions, the
         spreading grids (pmegridA/B), the real and complex FFT grids with
         their parallel 3D-FFT setups, the atom and overlap communication
         structures, and work buffers for solve_pme, PME_redist and
         sum_qgrid.  The lines marked "-" are the pre-merge members that the
         "++" lines replace. */
 +    int  ndecompdim;         /* The number of decomposition dimensions */
 +    int  nodeid;             /* Our nodeid in mpi->mpi_comm */
 +    int  nodeid_major;
 +    int  nodeid_minor;
 +    int  nnodes;             /* The number of nodes doing PME */
 +    int  nnodes_major;
 +    int  nnodes_minor;
 +
 +    MPI_Comm mpi_comm;
 +    MPI_Comm mpi_comm_d[2];  /* Indexed on dimension, 0=x, 1=y */
 +#ifdef GMX_MPI
 +    MPI_Datatype  rvec_mpi;  /* the pme vector's MPI type */
 +#endif
 +
-     real epsilon_r;           
-     
-     real *  pmegridA;  /* Grids on which we do spreading/interpolation, includes overlap */
-     real *  pmegridB;
++    int  nthread;            /* The number of threads doing PME */
++
++    gmx_bool bPPnode;        /* Node also does particle-particle forces */
++    gmx_bool bFEP;           /* Compute Free energy contribution */
 +    int nkx,nky,nkz;         /* Grid dimensions */
 +    int pme_order;
-     int     pmegrid_start_ix,pmegrid_start_iy,pmegrid_start_iz;    
-     
-     real *  pmegrid_sendbuf;
-     real *  pmegrid_recvbuf;
-     
++    real epsilon_r;
++
++    pmegrids_t pmegridA;  /* Grids on which we do spreading/interpolation, includes overlap */
++    pmegrids_t pmegridB;
++    /* The PME charge spreading grid sizes/strides, includes pme_order-1 */
 +    int     pmegrid_nx,pmegrid_ny,pmegrid_nz;
-     
++    /* pmegrid_nz might be larger than strictly necessary to ensure
++     * memory alignment, pmegrid_nz_base gives the real base size.
++     */
++    int     pmegrid_nz_base;
++    /* The local PME grid starting indices */
++    int     pmegrid_start_ix,pmegrid_start_iy,pmegrid_start_iz;
++
++    /* Work data for spreading and gathering */
++    pme_spline_work_t spline_work;
++
 +    real *fftgridA;             /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
 +    real *fftgridB;             /* inside the interpolation grid, but separate for 2D PME decomp. */
 +    int   fftgrid_nx,fftgrid_ny,fftgrid_nz;
-     t_complex *cfftgridB;            
++
 +    t_complex *cfftgridA;             /* Grids for complex FFT data */
-     
++    t_complex *cfftgridB;
 +    int   cfftgrid_nx,cfftgrid_ny,cfftgrid_nz;
-     
++
 +    gmx_parallel_3dfft_t  pfft_setupA;
 +    gmx_parallel_3dfft_t  pfft_setupB;
-     
++
 +    int  *nnx,*nny,*nnz;
 +    real *fshx,*fshy,*fshz;
-     
-     pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */
++
 +    pme_atomcomm_t atc[2];  /* Indexed on decomposition index */
 +    matrix    recipbox;
 +    splinevec bsp_mod;
-     
 +
++    pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */
 +
 +    pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */
-     /* work data for solve_pme */
-     int      work_nalloc;
-     real *   work_mhx;
-     real *   work_mhy;
-     real *   work_mhz;
-     real *   work_m2;
-     real *   work_denom;
-     real *   work_tmp1_alloc;
-     real *   work_tmp1;
-     real *   work_m2inv;
++
 +    rvec *bufv;             /* Communication buffer */
 +    real *bufr;             /* Communication buffer */
 +    int  buf_nalloc;        /* The communication buffer size */
 +
-     gmx_bool     redist_init;
-     int *    scounts; 
++    /* thread local work data for solve_pme */
++    pme_work_t *work;
 +
 +    /* Work data for PME_redist */
-     int *    idxa;    
++    gmx_bool redist_init;
++    int *    scounts;
 +    int *    rcounts;
 +    int *    sdispls;
 +    int *    rdispls;
 +    int *    sidx;
-     
++    int *    idxa;
 +    real *   redist_buf;
 +    int      redist_buf_nalloc;
- static void calc_interpolation_idx(gmx_pme_t pme,pme_atomcomm_t *atc)
++
 +    /* Work data for sum_qgrid */
 +    real *   sum_qgrid_tmp;
 +    real *   sum_qgrid_dd_tmp;
 +} t_gmx_pme;
 +
 +
-     
++static void calc_interpolation_idx(gmx_pme_t pme,pme_atomcomm_t *atc,
++                                   int start,int end,int thread)
 +{
 +    int  i;
 +    int  *idxptr,tix,tiy,tiz;
 +    real *xptr,*fptr,tx,ty,tz;
 +    real rxx,ryx,ryy,rzx,rzy,rzz;
 +    int  nx,ny,nz;
 +    int  start_ix,start_iy,start_iz;
-     
++    int  *g2tx,*g2ty,*g2tz;
++    gmx_bool bThreads;
++    int  *thread_idx=NULL;
++    thread_plist_t *tpl=NULL;
++    int  *tpl_n=NULL;
++    int  thread_i;
++
 +    nx  = pme->nkx;
 +    ny  = pme->nky;
 +    nz  = pme->nkz;
-     
++
 +    start_ix = pme->pmegrid_start_ix;
 +    start_iy = pme->pmegrid_start_iy;
 +    start_iz = pme->pmegrid_start_iz;
-     
-     for(i=0; (i<atc->n); i++) {
++
 +    rxx = pme->recipbox[XX][XX];
 +    ryx = pme->recipbox[YY][XX];
 +    ryy = pme->recipbox[YY][YY];
 +    rzx = pme->recipbox[ZZ][XX];
 +    rzy = pme->recipbox[ZZ][YY];
 +    rzz = pme->recipbox[ZZ][ZZ];
-         
++
++    g2tx = pme->pmegridA.g2t[XX];
++    g2ty = pme->pmegridA.g2t[YY];
++    g2tz = pme->pmegridA.g2t[ZZ];
++
++    bThreads = (atc->nthread > 1);
++    if (bThreads)
++    {
++        thread_idx = atc->thread_idx;
++
++        tpl   = &atc->thread_plist[thread];
++        tpl_n = tpl->n;
++        for(i=0; i<atc->nthread; i++)
++        {
++            tpl_n[i] = 0;
++        }
++    }
++
++    for(i=start; i<end; i++) {
 +        xptr   = atc->x[i];
 +        idxptr = atc->idx[i];
 +        fptr   = atc->fractx[i];
-         
++
 +        /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
 +        tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
 +        ty = ny * (                  xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
 +        tz = nz * (                                   xptr[ZZ] * rzz + 2.0 );
-         
++
 +        tix = (int)(tx);
 +        tiy = (int)(ty);
 +        tiz = (int)(tz);
-         fptr[ZZ] = tz - tiz;   
++
 +        /* Because decomposition only occurs in x and y,
 +         * we never have a fraction correction in z.
 +         */
 +        fptr[XX] = tx - tix + pme->fshx[tix];
 +        fptr[YY] = ty - tiy + pme->fshy[tiy];
-   }  
++        fptr[ZZ] = tz - tiz;
 +
 +        idxptr[XX] = pme->nnx[tix];
 +        idxptr[YY] = pme->nny[tiy];
 +        idxptr[ZZ] = pme->nnz[tiz];
 +
 +#ifdef DEBUG
 +        range_check(idxptr[XX],0,pme->pmegrid_nx);
 +        range_check(idxptr[YY],0,pme->pmegrid_ny);
 +        range_check(idxptr[ZZ],0,pme->pmegrid_nz);
 +#endif
- static void pme_calc_pidx(int natoms, matrix recipbox, rvec x[],
-                           pme_atomcomm_t *atc)
++
++        if (bThreads)
++        {
++            thread_i = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
++            thread_idx[i] = thread_i;
++            tpl_n[thread_i]++;
++        }
++    }
++
++    if (bThreads)
++    {
++        /* Make a list of particle indices sorted on thread */
++
++        /* Get the cumulative count */
++        for(i=1; i<atc->nthread; i++)
++        {
++            tpl_n[i] += tpl_n[i-1];
++        }
++        /* The current implementation distributes particles equally
++         * over the threads, so we could actually allocate for that
++         * in pme_realloc_atomcomm_things.
++         */
++        if (tpl_n[atc->nthread-1] > tpl->nalloc)
++        {
++            tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]);
++            srenew(tpl->i,tpl->nalloc);
++        }
++        /* Set tpl_n to the cumulative start */
++        for(i=atc->nthread-1; i>=1; i--)
++        {
++            tpl_n[i] = tpl_n[i-1];
++        }
++        tpl_n[0] = 0;
++
++        /* Fill our thread local array with indices sorted on thread */
++        for(i=start; i<end; i++)
++        {
++            tpl->i[tpl_n[atc->thread_idx[i]]++] = i;
++        }
++        /* Now tpl_n contains the cumulative count again */
++    }
 +}
 +
-     int *pd,*count;
++static void make_thread_local_ind(pme_atomcomm_t *atc,
++                                  int thread,splinedata_t *spline)
++{
++    int  n,t,i,start,end;
++    thread_plist_t *tpl;
++
++    /* Combine the indices made by each thread into one index */
++
++    n = 0;
++    start = 0;
++    for(t=0; t<atc->nthread; t++)
++    {
++        tpl = &atc->thread_plist[t];
++        /* Copy our part (start - end) from the list of thread t */
++        if (thread > 0)
++        {
++            start = tpl->n[thread-1];
++        }
++        end = tpl->n[thread];
++        for(i=start; i<end; i++)
++        {
++            spline->ind[n++] = tpl->i[i];
++        }
++    }
++
++    spline->n = n;
++}
++
++
++static void pme_calc_pidx(int start, int end,
++                          matrix recipbox, rvec x[],
++                          pme_atomcomm_t *atc, int *count)
 +{
 +    int  nslab,i;
 +    int  si;
 +    real *xptr,s;
 +    real rxx,ryx,rzx,ryy,rzy;
-     
++    int *pd;
 +
 +    /* Calculate PME task index (pidx) for each grid index.
 +     * Here we always assign equally sized slabs to each node
 +     * for load balancing reasons (the PME grid spacing is not used).
 +     */
-     count = atc->count;
++
 +    nslab = atc->nslab;
 +    pd    = atc->pd;
-     
 +
 +    /* Reset the count */
 +    for(i=0; i<nslab; i++)
 +    {
 +        count[i] = 0;
 +    }
-         for(i=0; (i<natoms); i++)
++
 +    if (atc->dimind == 0)
 +    {
 +        rxx = recipbox[XX][XX];
 +        ryx = recipbox[YY][XX];
 +        rzx = recipbox[ZZ][XX];
 +        /* Calculate the node index in x-dimension */
-         for(i=0; (i<natoms); i++)
++        for(i=start; i<end; i++)
 +        {
 +            xptr   = x[i];
 +            /* Fractional coordinates along box vectors */
 +            s = nslab*(xptr[XX]*rxx + xptr[YY]*ryx + xptr[ZZ]*rzx);
 +            si = (int)(s + 2*nslab) % nslab;
 +            pd[i] = si;
 +            count[si]++;
 +        }
 +    }
 +    else
 +    {
 +        ryy = recipbox[YY][YY];
 +        rzy = recipbox[ZZ][YY];
 +        /* Calculate the node index in y-dimension */
-     int nalloc_old,i;
-     
++        for(i=start; i<end; i++)
 +        {
 +            xptr   = x[i];
 +            /* Fractional coordinates along box vectors */
 +            s = nslab*(xptr[YY]*ryy + xptr[ZZ]*rzy);
 +            si = (int)(s + 2*nslab) % nslab;
 +            pd[i] = si;
 +            count[si]++;
 +        }
 +    }
 +}
 +
++static void pme_calc_pidx_wrapper(int natoms, matrix recipbox, rvec x[],
++                                  pme_atomcomm_t *atc)
++{
++    int nthread,thread,slab;
++
++    nthread = atc->nthread;
++
++#pragma omp parallel for num_threads(nthread) schedule(static)
++    for(thread=0; thread<nthread; thread++)
++    {
++        pme_calc_pidx(natoms* thread   /nthread,
++                      natoms*(thread+1)/nthread,
++                      recipbox,x,atc,atc->count_thread[thread]);
++    }
++    /* Non-parallel reduction, since nslab is small */
++
++    for(thread=1; thread<nthread; thread++)
++    {
++        for(slab=0; slab<atc->nslab; slab++)
++        {
++            atc->count_thread[0][slab] += atc->count_thread[thread][slab];
++        }
++    }
++}
++
++static void pme_realloc_splinedata(splinedata_t *spline, pme_atomcomm_t *atc)
++{
++    int i,d;
++
++    srenew(spline->ind,atc->nalloc);
++    /* Initialize the index to identity so it works without threads */
++    for(i=0; i<atc->nalloc; i++)
++    {
++        spline->ind[i] = i;
++    }
++
++    for(d=0;d<DIM;d++)
++    {
++        srenew(spline->theta[d] ,atc->pme_order*atc->nalloc);
++        srenew(spline->dtheta[d],atc->pme_order*atc->nalloc);
++    }
++}
++
 +static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
 +{
-         
++    int nalloc_old,i,j,nalloc_tpl;
++
 +    /* We have to avoid a NULL pointer for atc->x to avoid
 +     * possible fatal errors in MPI routines.
 +     */
 +    if (atc->n > atc->nalloc || atc->nalloc == 0)
 +    {
 +        nalloc_old = atc->nalloc;
 +        atc->nalloc = over_alloc_dd(max(atc->n,1));
-             for(i=0;i<DIM;i++) {
-                 srenew(atc->theta[i] ,atc->pme_order*atc->nalloc); 
-                 srenew(atc->dtheta[i],atc->pme_order*atc->nalloc);
-             }
-             srenew(atc->fractx,atc->nalloc); 
++
 +        if (atc->nslab > 1) {
 +            srenew(atc->x,atc->nalloc);
 +            srenew(atc->q,atc->nalloc);
 +            srenew(atc->f,atc->nalloc);
 +            for(i=nalloc_old; i<atc->nalloc; i++)
 +            {
 +                clear_rvec(atc->f[i]);
 +            }
 +        }
 +        if (atc->bSpread) {
-     
++            srenew(atc->fractx,atc->nalloc);
 +            srenew(atc->idx   ,atc->nalloc);
++
++            if (atc->nthread > 1)
++            {
++                srenew(atc->thread_idx,atc->nalloc);
++            }
++
++            for(i=0; i<atc->nthread; i++)
++            {
++                pme_realloc_splinedata(&atc->spline[i],atc);
++            }
 +        }
 +    }
 +}
 +
 +static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
 +                         int n, gmx_bool bXF, rvec *x_f, real *charge,
 +                         pme_atomcomm_t *atc)
 +/* Redistribute particle data for PME calculation */
 +/* domain decomposition by x coordinate           */
 +{
 +    int *idxa;
 +    int i, ii;
-     
++
 +    if(FALSE == pme->redist_init) {
 +        snew(pme->scounts,atc->nslab);
 +        snew(pme->rcounts,atc->nslab);
 +        snew(pme->sdispls,atc->nslab);
 +        snew(pme->rdispls,atc->nslab);
 +        snew(pme->sidx,atc->nslab);
 +        pme->redist_init = TRUE;
 +    }
 +    if (n > pme->redist_buf_nalloc) {
 +        pme->redist_buf_nalloc = over_alloc_dd(n);
 +        srenew(pme->redist_buf,pme->redist_buf_nalloc*DIM);
 +    }
-         /* forward, redistribution from pp to pme */ 
-         
++
 +    pme->idxa = atc->pd;
 +
 +#ifdef GMX_MPI
 +    if (forw && bXF) {
-         
-         /* Calculate send and receive displacements and index into send 
++        /* forward, redistribution from pp to pme */
++
 +        /* Calculate send counts and exchange them with other nodes */
 +        for(i=0; (i<atc->nslab); i++) pme->scounts[i]=0;
 +        for(i=0; (i<n); i++) pme->scounts[pme->idxa[i]]++;
 +        MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);
-         
++
++        /* Calculate send and receive displacements and index into send
 +           buffer */
 +        pme->sdispls[0]=0;
 +        pme->rdispls[0]=0;
 +        pme->sidx[0]=0;
 +        for(i=1; i<atc->nslab; i++) {
 +            pme->sdispls[i]=pme->sdispls[i-1]+pme->scounts[i-1];
 +            pme->rdispls[i]=pme->rdispls[i-1]+pme->rcounts[i-1];
 +            pme->sidx[i]=pme->sdispls[i];
 +        }
 +        /* Total # of particles to be received */
 +        atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];
-         
++
 +        pme_realloc_atomcomm_things(atc);
-         MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, 
-                       pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls, 
++
 +        /* Copy particle coordinates into send buffer and exchange*/
 +        for(i=0; (i<n); i++) {
 +            ii=DIM*pme->sidx[pme->idxa[i]];
 +            pme->sidx[pme->idxa[i]]++;
 +            pme->redist_buf[ii+XX]=x_f[i][XX];
 +            pme->redist_buf[ii+YY]=x_f[i][YY];
 +            pme->redist_buf[ii+ZZ]=x_f[i][ZZ];
 +        }
-     else { /* backward, redistribution from pme to pp */ 
++        MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
++                      pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
 +                      pme->rvec_mpi, atc->mpi_comm);
 +    }
 +    if (forw) {
 +        /* Copy charge into send buffer and exchange*/
 +        for(i=0; i<atc->nslab; i++) pme->sidx[i]=pme->sdispls[i];
 +        for(i=0; (i<n); i++) {
 +            ii=pme->sidx[pme->idxa[i]];
 +            pme->sidx[pme->idxa[i]]++;
 +            pme->redist_buf[ii]=charge[i];
 +        }
 +        MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, mpi_type,
 +                      atc->q, pme->rcounts, pme->rdispls, mpi_type,
 +                      atc->mpi_comm);
 +    }
-                       pme->redist_buf, pme->scounts, pme->sdispls, 
++    else { /* backward, redistribution from pme to pp */
 +        MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
-         
++                      pme->redist_buf, pme->scounts, pme->sdispls,
 +                      pme->rvec_mpi, atc->mpi_comm);
- #endif 
++
 +        /* Copy data from receive buffer */
 +        for(i=0; i<atc->nslab; i++)
 +            pme->sidx[i] = pme->sdispls[i];
 +        for(i=0; (i<n); i++) {
 +            ii = DIM*pme->sidx[pme->idxa[i]];
 +            x_f[i][XX] += pme->redist_buf[ii+XX];
 +            x_f[i][YY] += pme->redist_buf[ii+YY];
 +            x_f[i][ZZ] += pme->redist_buf[ii+ZZ];
 +            pme->sidx[pme->idxa[i]]++;
 +        }
 +    }
-     
++#endif
 +}
 +
 +static void pme_dd_sendrecv(pme_atomcomm_t *atc,
 +                            gmx_bool bBackward,int shift,
 +                            void *buf_s,int nbyte_s,
 +                            void *buf_r,int nbyte_r)
 +{
 +#ifdef GMX_MPI
 +    int dest,src;
 +    MPI_Status stat;
-     
++
 +    if (bBackward == FALSE) {
 +        dest = atc->node_dest[shift];
 +        src  = atc->node_src[shift];
 +    } else {
 +        dest = atc->node_src[shift];
 +        src  = atc->node_dest[shift];
 +    }
- static void dd_pmeredist_x_q(gmx_pme_t pme, 
++
 +    if (nbyte_s > 0 && nbyte_r > 0) {
 +        MPI_Sendrecv(buf_s,nbyte_s,MPI_BYTE,
 +                     dest,shift,
 +                     buf_r,nbyte_r,MPI_BYTE,
 +                     src,shift,
 +                     atc->mpi_comm,&stat);
 +    } else if (nbyte_s > 0) {
 +        MPI_Send(buf_s,nbyte_s,MPI_BYTE,
 +                 dest,shift,
 +                 atc->mpi_comm);
 +    } else if (nbyte_r > 0) {
 +        MPI_Recv(buf_r,nbyte_r,MPI_BYTE,
 +                 src,shift,
 +                 atc->mpi_comm,&stat);
 +    }
 +#endif
 +}
 +
-     
++static void dd_pmeredist_x_q(gmx_pme_t pme,
 +                             int n, gmx_bool bX, rvec *x, real *charge,
 +                             pme_atomcomm_t *atc)
 +{
 +    int *commnode,*buf_index;
 +    int nnodes_comm,i,nsend,local_pos,buf_pos,node,scount,rcount;
-     
++
 +    commnode  = atc->node_dest;
 +    buf_index = atc->buf_index;
-     
++
 +    nnodes_comm = min(2*atc->maxshift,atc->nslab-1);
-         
++
 +    nsend = 0;
 +    for(i=0; i<nnodes_comm; i++) {
 +        buf_index[commnode[i]] = nsend;
 +        nsend += atc->count[commnode[i]];
 +    }
 +    if (bX) {
 +        if (atc->count[atc->nodeid] + nsend != n)
 +            gmx_fatal(FARGS,"%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
 +                      "This usually means that your system is not well equilibrated.",
 +                      n - (atc->count[atc->nodeid] + nsend),
 +                      pme->nodeid,'x'+atc->dimind);
-         
++
 +        if (nsend > pme->buf_nalloc) {
 +            pme->buf_nalloc = over_alloc_dd(nsend);
 +            srenew(pme->bufv,pme->buf_nalloc);
 +            srenew(pme->bufr,pme->buf_nalloc);
 +        }
-         
++
 +        atc->n = atc->count[atc->nodeid];
 +        for(i=0; i<nnodes_comm; i++) {
 +            scount = atc->count[commnode[i]];
 +            /* Communicate the count */
 +            if (debug)
 +                fprintf(debug,"dimind %d PME node %d send to node %d: %d\n",
 +                        atc->dimind,atc->nodeid,commnode[i],scount);
 +            pme_dd_sendrecv(atc,FALSE,i,
 +                            &scount,sizeof(int),
 +                            &atc->rcount[i],sizeof(int));
 +            atc->n += atc->rcount[i];
 +        }
-     
++
 +        pme_realloc_atomcomm_things(atc);
 +    }
-     
++
 +    local_pos = 0;
 +    for(i=0; i<n; i++) {
 +        node = atc->pd[i];
 +        if (node == atc->nodeid) {
 +            /* Copy direct to the receive buffer */
 +            if (bX) {
 +                copy_rvec(x[i],atc->x[local_pos]);
 +            }
 +            atc->q[local_pos] = charge[i];
 +            local_pos++;
 +        } else {
 +            /* Copy to the send buffer */
 +            if (bX) {
 +                copy_rvec(x[i],pme->bufv[buf_index[node]]);
 +            }
 +            pme->bufr[buf_index[node]] = charge[i];
 +            buf_index[node]++;
 +        }
 +    }
- static void 
++
 +    buf_pos = 0;
 +    for(i=0; i<nnodes_comm; i++) {
 +        scount = atc->count[commnode[i]];
 +        rcount = atc->rcount[i];
 +        if (scount > 0 || rcount > 0) {
 +            if (bX) {
 +                /* Communicate the coordinates */
 +                pme_dd_sendrecv(atc,FALSE,i,
 +                                pme->bufv[buf_pos],scount*sizeof(rvec),
 +                                atc->x[local_pos],rcount*sizeof(rvec));
 +            }
 +            /* Communicate the charges */
 +            pme_dd_sendrecv(atc,FALSE,i,
 +                            pme->bufr+buf_pos,scount*sizeof(real),
 +                            atc->q+local_pos,rcount*sizeof(real));
 +            buf_pos   += scount;
 +            local_pos += atc->rcount[i];
 +        }
 +    }
 +}
 +
 +static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
 +                           int n, rvec *f,
 +                           gmx_bool bAddF)
 +{
 +  int *commnode,*buf_index;
 +  int nnodes_comm,local_pos,buf_pos,i,scount,rcount,node;
 +
 +  commnode  = atc->node_dest;
 +  buf_index = atc->buf_index;
 +
 +  nnodes_comm = min(2*atc->maxshift,atc->nslab-1);
 +
 +  local_pos = atc->count[atc->nodeid];
 +  buf_pos = 0;
 +  for(i=0; i<nnodes_comm; i++) {
 +    scount = atc->rcount[i];
 +    rcount = atc->count[commnode[i]];
 +    if (scount > 0 || rcount > 0) {
 +      /* Communicate the forces */
 +      pme_dd_sendrecv(atc,TRUE,i,
 +                      atc->f[local_pos],scount*sizeof(rvec),
 +                      pme->bufv[buf_pos],rcount*sizeof(rvec));
 +      local_pos += scount;
 +    }
 +    buf_index[commnode[i]] = buf_pos;
 +    buf_pos   += rcount;
 +  }
 +
 +    local_pos = 0;
 +    if (bAddF)
 +    {
 +        for(i=0; i<n; i++)
 +        {
 +            node = atc->pd[i];
 +            if (node == atc->nodeid)
 +            {
 +                /* Add from the local force array */
 +                rvec_inc(f[i],atc->f[local_pos]);
 +                local_pos++;
 +            }
 +            else
 +            {
 +                /* Add from the receive buffer */
 +                rvec_inc(f[i],pme->bufv[buf_index[node]]);
 +                buf_index[node]++;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        for(i=0; i<n; i++)
 +        {
 +            node = atc->pd[i];
 +            if (node == atc->nodeid)
 +            {
 +                /* Copy from the local force array */
 +                copy_rvec(atc->f[local_pos],f[i]);
 +                local_pos++;
 +            }
 +            else
 +            {
 +                /* Copy from the receive buffer */
 +                copy_rvec(pme->bufv[buf_index[node]],f[i]);
 +                buf_index[node]++;
 +            }
 +        }
 +    }
 +}
 +
 +#ifdef GMX_MPI
-     
++static void
 +gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
 +{
 +    pme_overlap_t *overlap;
 +    int send_index0,send_nindex;
 +    int recv_index0,recv_nindex;
 +    MPI_Status stat;
 +    int i,j,k,ix,iy,iz,icnt;
 +    int ipulse,send_id,recv_id,datasize;
 +    real *p;
 +    real *sendptr,*recvptr;
-     
++
 +    /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
 +    overlap = &pme->overlap[1];
-             send_nindex   = overlap->comm_data[ipulse].recv_nindex;            
++
 +    for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
 +    {
 +        /* Since we have already (un)wrapped the overlap in the z-dimension,
 +         * we only have to communicate 0 to nkz (not pmegrid_nz).
 +         */
 +        if (direction==GMX_SUM_QGRID_FORWARD)
 +        {
 +            send_id = overlap->send_id[ipulse];
 +            recv_id = overlap->recv_id[ipulse];
 +            send_index0   = overlap->comm_data[ipulse].send_index0;
 +            send_nindex   = overlap->comm_data[ipulse].send_nindex;
 +            recv_index0   = overlap->comm_data[ipulse].recv_index0;
 +            recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
 +        }
 +        else
 +        {
 +            send_id = overlap->recv_id[ipulse];
 +            recv_id = overlap->send_id[ipulse];
 +            send_index0   = overlap->comm_data[ipulse].recv_index0;
-                     pme->pmegrid_sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
++            send_nindex   = overlap->comm_data[ipulse].recv_nindex;
 +            recv_index0   = overlap->comm_data[ipulse].send_index0;
 +            recv_nindex   = overlap->comm_data[ipulse].send_nindex;
 +        }
 +
 +        /* Copy data to contiguous send buffer */
 +        if (debug)
 +        {
 +            fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
 +                    pme->nodeid,overlap->nodeid,send_id,
 +                    pme->pmegrid_start_iy,
 +                    send_index0-pme->pmegrid_start_iy,
 +                    send_index0-pme->pmegrid_start_iy+send_nindex);
 +        }
 +        icnt = 0;
 +        for(i=0;i<pme->pmegrid_nx;i++)
 +        {
 +            ix = i;
 +            for(j=0;j<send_nindex;j++)
 +            {
 +                iy = j + send_index0 - pme->pmegrid_start_iy;
 +                for(k=0;k<pme->nkz;k++)
 +                {
 +                    iz = k;
-             
++                    overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
 +                }
 +            }
 +        }
-         
-         MPI_Sendrecv(pme->pmegrid_sendbuf,send_nindex*datasize,GMX_MPI_REAL,
++
 +        datasize      = pme->pmegrid_nx * pme->nkz;
-                      pme->pmegrid_recvbuf,recv_nindex*datasize,GMX_MPI_REAL,
++
++        MPI_Sendrecv(overlap->sendbuf,send_nindex*datasize,GMX_MPI_REAL,
 +                     send_id,ipulse,
-         
++                     overlap->recvbuf,recv_nindex*datasize,GMX_MPI_REAL,
 +                     recv_id,ipulse,
 +                     overlap->mpi_comm,&stat);
-                         grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += pme->pmegrid_recvbuf[icnt++];
++
 +        /* Get data from contiguous recv buffer */
 +        if (debug)
 +        {
 +            fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
 +                    pme->nodeid,overlap->nodeid,recv_id,
 +                    pme->pmegrid_start_iy,
 +                    recv_index0-pme->pmegrid_start_iy,
 +                    recv_index0-pme->pmegrid_start_iy+recv_nindex);
 +        }
 +        icnt = 0;
 +        for(i=0;i<pme->pmegrid_nx;i++)
 +        {
 +            ix = i;
 +            for(j=0;j<recv_nindex;j++)
 +            {
 +                iy = j + recv_index0 - pme->pmegrid_start_iy;
 +                for(k=0;k<pme->nkz;k++)
 +                {
 +                    iz = k;
 +                    if(direction==GMX_SUM_QGRID_FORWARD)
 +                    {
-                         grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz]  = pme->pmegrid_recvbuf[icnt++];
++                        grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
 +                    }
 +                    else
 +                    {
-     
++                        grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz]  = overlap->recvbuf[icnt++];
 +                    }
 +                }
 +            }
 +        }
 +    }
-     
++
 +    /* Major dimension is easier, no copying required,
 +     * but we might have to sum to separate array.
 +     * Since we don't copy, we have to communicate up to pmegrid_nz,
 +     * not nkz as for the minor direction.
 +     */
 +    overlap = &pme->overlap[0];
-             recvptr   = pme->pmegrid_recvbuf;
++
 +    for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
 +    {
 +        if(direction==GMX_SUM_QGRID_FORWARD)
 +        {
 +            send_id = overlap->send_id[ipulse];
 +            recv_id = overlap->recv_id[ipulse];
 +            send_index0   = overlap->comm_data[ipulse].send_index0;
 +            send_nindex   = overlap->comm_data[ipulse].send_nindex;
 +            recv_index0   = overlap->comm_data[ipulse].recv_index0;
 +            recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
-             send_nindex   = overlap->comm_data[ipulse].recv_nindex;            
++            recvptr   = overlap->recvbuf;
 +        }
 +        else
 +        {
 +            send_id = overlap->recv_id[ipulse];
 +            recv_id = overlap->send_id[ipulse];
 +            send_index0   = overlap->comm_data[ipulse].recv_index0;
-                 
++            send_nindex   = overlap->comm_data[ipulse].recv_nindex;
 +            recv_index0   = overlap->comm_data[ipulse].send_index0;
 +            recv_nindex   = overlap->comm_data[ipulse].send_nindex;
 +            recvptr   = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
 +        }
-         
++
 +        sendptr       = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
 +        datasize      = pme->pmegrid_ny * pme->pmegrid_nz;
 +
 +        if (debug)
 +        {
 +            fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
 +                    pme->nodeid,overlap->nodeid,send_id,
 +                    pme->pmegrid_start_ix,
 +                    send_index0-pme->pmegrid_start_ix,
 +                    send_index0-pme->pmegrid_start_ix+send_nindex);
 +            fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
 +                    pme->nodeid,overlap->nodeid,recv_id,
 +                    pme->pmegrid_start_ix,
 +                    recv_index0-pme->pmegrid_start_ix,
 +                    recv_index0-pme->pmegrid_start_ix+recv_nindex);
 +        }
 +
 +        MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
 +                     send_id,ipulse,
 +                     recvptr,recv_nindex*datasize,GMX_MPI_REAL,
 +                     recv_id,ipulse,
 +                     overlap->mpi_comm,&stat);
-         {        
++
 +        /* ADD data from contiguous recv buffer */
 +        if(direction==GMX_SUM_QGRID_FORWARD)
-                 p[i] += pme->pmegrid_recvbuf[i];
++        {
 +            p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
 +            for(i=0;i<recv_nindex*datasize;i++)
 +            {
-     
++                p[i] += overlap->recvbuf[i];
 +            }
 +        }
 +    }
 +}
 +#endif
 +
 +
 +static int
 +copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
 +{
 +    ivec    local_fft_ndata,local_fft_offset,local_fft_size;
 +    ivec    local_pme_size;
 +    int     i,ix,iy,iz;
 +    int     pmeidx,fftidx;
 +
 +    /* Dimensions should be identical for A/B grid, so we just use A here */
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
 +                                   local_fft_ndata,
 +                                   local_fft_offset,
 +                                   local_fft_size);
-     
-     /* The fftgrid is always 'justified' to the lower-left corner of the PME grid, 
++
 +    local_pme_size[0] = pme->pmegrid_nx;
 +    local_pme_size[1] = pme->pmegrid_ny;
 +    local_pme_size[2] = pme->pmegrid_nz;
- copy_fftgrid_to_pmegrid(gmx_pme_t pme, real *fftgrid, real *pmegrid)
++
++    /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
 +     the offset is identical, and the PME grid always has more data (due to overlap)
 +     */
 +    {
 +#ifdef DEBUG_PME
 +        FILE *fp,*fp2;
 +        char fn[STRLEN],format[STRLEN];
 +        real val;
 +        sprintf(fn,"pmegrid%d.pdb",pme->nodeid);
 +        fp = ffopen(fn,"w");
 +        sprintf(fn,"pmegrid%d.txt",pme->nodeid);
 +        fp2 = ffopen(fn,"w");
 +     sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
 +#endif
++
 +    for(ix=0;ix<local_fft_ndata[XX];ix++)
 +    {
 +        for(iy=0;iy<local_fft_ndata[YY];iy++)
 +        {
 +            for(iz=0;iz<local_fft_ndata[ZZ];iz++)
 +            {
 +                pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
 +                fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
 +                fftgrid[fftidx] = pmegrid[pmeidx];
 +#ifdef DEBUG_PME
 +                val = 100*pmegrid[pmeidx];
 +                if (pmegrid[pmeidx] != 0)
 +                fprintf(fp,format,"ATOM",pmeidx,"CA","GLY",' ',pmeidx,' ',
 +                        5.0*ix,5.0*iy,5.0*iz,1.0,val);
 +                if (pmegrid[pmeidx] != 0)
 +                    fprintf(fp2,"%-12s  %5d  %5d  %5d  %12.5e\n",
 +                            "qgrid",
 +                            pme->pmegrid_start_ix + ix,
 +                            pme->pmegrid_start_iy + iy,
 +                            pme->pmegrid_start_iz + iz,
 +                            pmegrid[pmeidx]);
 +#endif
 +            }
 +        }
 +    }
 +#ifdef DEBUG_PME
 +    fclose(fp);
 +    fclose(fp2);
 +#endif
 +    }
 +    return 0;
 +}
 +
 +
++static gmx_cycles_t omp_cyc_start()
++{
++    return gmx_cycles_read();
++}
++
++static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
++{
++    return gmx_cycles_read() - c;
++}
++
++
++/* Copy the data part of the FFT grid (local_fft_ndata points per dimension)
++ * into the PME grid. The flattened x/y index range is divided over nthread
++ * threads; this call copies the share that belongs to thread 'thread'.
++ * Always returns 0.
++ */
 +static int
-     int     i,ix,iy,iz;
++copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid,
++                        int nthread,int thread)
 +{
 +    ivec    local_fft_ndata,local_fft_offset,local_fft_size;
 +    ivec    local_pme_size;
-     
++    int     ixy0,ixy1,ixy,ix,iy,iz;
 +    int     pmeidx,fftidx;
-     
-     /* The fftgrid is always 'justified' to the lower-left corner of the PME grid, 
++#ifdef PME_TIME_THREADS
++    gmx_cycles_t c1;
++    static double cs1=0;
++    static int cnt=0;
++#endif
++
++#ifdef PME_TIME_THREADS
++    c1 = omp_cyc_start();
++#endif
 +    /* Dimensions should be identical for A/B grid, so we just use A here */
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
 +                                   local_fft_ndata,
 +                                   local_fft_offset,
 +                                   local_fft_size);
 +
 +    local_pme_size[0] = pme->pmegrid_nx;
 +    local_pme_size[1] = pme->pmegrid_ny;
 +    local_pme_size[2] = pme->pmegrid_nz;
-     for(ix=0;ix<local_fft_ndata[XX];ix++)
++
++    /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
 +     the offset is identical, and the PME grid always has more data (due to overlap)
 +     */
-         for(iy=0;iy<local_fft_ndata[YY];iy++)
++    /* Range [ixy0,ixy1) of flattened x/y indices handled by this thread */
++    ixy0 = ((thread  )*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
++    ixy1 = ((thread+1)*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
++
++    for(ixy=ixy0;ixy<ixy1;ixy++)
 +    {
-             for(iz=0;iz<local_fft_ndata[ZZ];iz++)
-             {
-                 pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
-                 fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
-                 pmegrid[pmeidx] = fftgrid[fftidx];
-             }
++        /* Recover the x and y index from the flattened x/y index */
++        ix = ixy/local_fft_ndata[YY];
++        iy = ixy - ix*local_fft_ndata[YY];
++
++        /* Start of this z-line in each grid; the two grids are indexed
++         * with their own allocated y/z sizes.
++         */
++        pmeidx = (ix*local_pme_size[YY] + iy)*local_pme_size[ZZ];
++        fftidx = (ix*local_fft_size[YY] + iy)*local_fft_size[ZZ];
++        for(iz=0;iz<local_fft_ndata[ZZ];iz++)
 +        {
-     }   
++            pmegrid[pmeidx+iz] = fftgrid[fftidx+iz];
 +        }
-     for(ix=0; ix<pnx; ix++)
++    }
++
++#ifdef PME_TIME_THREADS
++    /* Accumulate the cycle count over calls and print the running total
++     * (scaled by 1e-9) on every 20th call.
++     * NOTE(review): cs1 and cnt are function-level statics, so concurrent
++     * calls from several threads would update them without synchronization.
++     */
++    c1 = omp_cyc_end(c1);
++    cs1 += (double)c1;
++    cnt++;
++    if (cnt % 20 == 0)
++    {
++        printf("copy %.2f\n",cs1*1e-9);
++    }
++#endif
++
 +    return 0;
 +}
 +
 +
++/* Add the periodic overlap region of the PME grid (the pme_order-1 lines
++ * beyond nkx/nky/nkz) onto the corresponding lines at the start of the grid.
++ * z is always wrapped; y is wrapped only when pme->nnodes_minor == 1 and
++ * x only when pme->nnodes_major == 1.
++ */
 +static void
 +wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
 +{
 +    int     nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix,iy,iz;
 +
 +    nx = pme->nkx;
 +    ny = pme->nky;
 +    nz = pme->nkz;
 +
 +    pnx = pme->pmegrid_nx;
 +    pny = pme->pmegrid_ny;
 +    pnz = pme->pmegrid_nz;
 +
 +    overlap = pme->pme_order - 1;
 +
 +    /* Add periodic overlap in z */
-         for(iy=0; iy<pny; iy++)
++    for(ix=0; ix<pme->pmegrid_nx; ix++)
 +    {
-        for(ix=0; ix<pnx; ix++)
++        for(iy=0; iy<pme->pmegrid_ny; iy++)
 +        {
 +            for(iz=0; iz<overlap; iz++)
 +            {
 +                pmegrid[(ix*pny+iy)*pnz+iz] +=
 +                    pmegrid[(ix*pny+iy)*pnz+nz+iz];
 +            }
 +        }
 +    }
 +
++    /* Add periodic overlap in y; z only needs the first nz points here,
++     * since the z overlap was already folded in above.
++     */
 +    if (pme->nnodes_minor == 1)
 +    {
-      
++       for(ix=0; ix<pme->pmegrid_nx; ix++)
 +       {
 +           for(iy=0; iy<overlap; iy++)
 +           {
 +               for(iz=0; iz<nz; iz++)
 +               {
 +                   pmegrid[(ix*pny+iy)*pnz+iz] +=
 +                       pmegrid[(ix*pny+ny+iy)*pnz+iz];
 +               }
 +           }
 +       }
 +    }
-         ny_x = (pme->nnodes_minor == 1 ? ny : pny);
++
++    /* Add periodic overlap in x. The y range is ny when y was wrapped above,
++     * otherwise the full allocated y size of the grid.
++     */
 +    if (pme->nnodes_major == 1)
 +    {
-     int     nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix,iy,iz;
++        ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
 +
 +        for(ix=0; ix<overlap; ix++)
 +        {
 +            for(iy=0; iy<ny_x; iy++)
 +            {
 +                for(iz=0; iz<nz; iz++)
 +                {
 +                    pmegrid[(ix*pny+iy)*pnz+iz] +=
 +                        pmegrid[((nx+ix)*pny+iy)*pnz+iz];
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +
++/* Copy the first pme_order-1 lines of the PME grid into the periodic
++ * overlap region beyond nkx/nky/nkz. The dimensions are handled in the
++ * order x, y, z; x is only copied when pme->nnodes_major == 1 and y only
++ * when pme->nnodes_minor == 1. The y and z loops are OpenMP-parallel over
++ * ix with pme->nthread threads.
++ */
 +static void
 +unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
 +{
-         ny_x = (pme->nnodes_minor == 1 ? ny : pny);
++    int     nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix;
 +
 +    nx = pme->nkx;
 +    ny = pme->nky;
 +    nz = pme->nkz;
 +
 +    pnx = pme->pmegrid_nx;
 +    pny = pme->pmegrid_ny;
 +    pnz = pme->pmegrid_nz;
 +
 +    overlap = pme->pme_order - 1;
 +
++    /* Copy periodic overlap in x */
 +    if (pme->nnodes_major == 1)
 +    {
-        for(ix=0; ix<pnx; ix++)
++        ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
 +
 +        for(ix=0; ix<overlap; ix++)
 +        {
++            int iy,iz;
++
 +            for(iy=0; iy<ny_x; iy++)
 +            {
 +                for(iz=0; iz<nz; iz++)
 +                {
 +                    pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
 +                        pmegrid[(ix*pny+iy)*pnz+iz];
 +                }
 +            }
 +        }
 +    }
 +
++    /* Copy periodic overlap in y; iy and iz are declared inside the loop
++     * body so that each OpenMP thread has its own copies.
++     */
 +    if (pme->nnodes_minor == 1)
 +    {
-     for(ix=0; ix<pnx; ix++)
++#pragma omp parallel for num_threads(pme->nthread) schedule(static)
++       for(ix=0; ix<pme->pmegrid_nx; ix++)
 +       {
++           int iy,iz;
++
 +           for(iy=0; iy<overlap; iy++)
 +           {
 +               for(iz=0; iz<nz; iz++)
 +               {
 +                   pmegrid[(ix*pny+ny+iy)*pnz+iz] =
 +                       pmegrid[(ix*pny+iy)*pnz+iz];
 +               }
 +           }
 +       }
 +    }
 +
 +    /* Copy periodic overlap in z */
-         for(iy=0; iy<pny; iy++)
++#pragma omp parallel for num_threads(pme->nthread) schedule(static)
++    for(ix=0; ix<pme->pmegrid_nx; ix++)
 +    {
- static void spread_q_bsplines(gmx_pme_t pme, pme_atomcomm_t *atc, 
-                               real *grid)
++        int iy,iz;
++
++        for(iy=0; iy<pme->pmegrid_ny; iy++)
 +        {
 +            for(iz=0; iz<overlap; iz++)
 +            {
 +                pmegrid[(ix*pny+iy)*pnz+nz+iz] =
 +                    pmegrid[(ix*pny+iy)*pnz+iz];
 +            }
 +        }
 +    }
 +}
 +
++/* Zero the blocks of grid that have not been cleared yet, for the range of
++ * flag blocks starting at (fx,fy,fz). A block measures FLBS x FLBS x FLBSZ
++ * grid points; flag (dimensions given by fs) holds 0 for blocks that still
++ * need clearing and is set to 1 once a block has been zeroed.
++ * The number of blocks visited per dimension follows from the spline order.
++ * ny and nz are the y and z sizes used for indexing grid; nx is not used.
++ */
++static void clear_grid(int nx,int ny,int nz,real *grid,
++                       ivec fs,int *flag,
++                       int fx,int fy,int fz,
++                       int order)
++{
++    const int nblock_xy = 2 + (order - 2)/FLBS;
++    const int nblock_z  = 2 + (order - 2)/FLBSZ;
++    int bx,by,bz;
++
++    for(bx=fx; bx<fx+nblock_xy; bx++)
++    {
++        for(by=fy; by<fy+nblock_xy; by++)
++        {
++            for(bz=fz; bz<fz+nblock_z; bz++)
++            {
++                const int fi = (bx*fs[YY] + by)*fs[ZZ] + bz;
++                int dx,dy,dz;
++
++                if (flag[fi] != 0)
++                {
++                    /* This block was cleared before */
++                    continue;
++                }
++
++                for(dx=0; dx<FLBS; dx++)
++                {
++                    for(dy=0; dy<FLBS; dy++)
++                    {
++                        /* Start of the z-run for grid line (x,y) of the block */
++                        const int line0 =
++                            ((bx*FLBS + dx)*ny + by*FLBS + dy)*nz + bz*FLBSZ;
++
++                        for(dz=0; dz<FLBSZ; dz++)
++                        {
++                            grid[line0+dz] = 0;
++                        }
++                    }
++                }
++
++                flag[fi] = 1;
++            }
++        }
++    }
++}
 +
++/* DO_BSPLINE(order): spread one charge onto the grid with a B-spline of the
++ * given order. The macro expands to a triple loop and expects these
++ * variables in scope at the point of use:
++ *   qn            charge that is spread
++ *   thx,thy,thz   spline coefficient arrays with (at least) order entries
++ *   i0,j0,k0      grid index of the first point in x, y and z
++ *   pny,pnz       y and z size used for indexing grid
++ *   grid          the grid that is accumulated into
++ *   ithx,ithy,ithz,index_x,index_xy,index_xyz,valx,valxy   scratch variables
++ */
 +/* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
 +#define DO_BSPLINE(order)                            \
 +for(ithx=0; (ithx<order); ithx++)                    \
 +{                                                    \
 +    index_x = (i0+ithx)*pny*pnz;                     \
 +    valx    = qn*thx[ithx];                          \
 +                                                     \
 +    for(ithy=0; (ithy<order); ithy++)                \
 +    {                                                \
 +        valxy    = valx*thy[ithy];                   \
 +        index_xy = index_x+(j0+ithy)*pnz;            \
 +                                                     \
 +        for(ithz=0; (ithz<order); ithz++)            \
 +        {                                            \
 +            index_xyz        = index_xy+(k0+ithz);   \
 +            grid[index_xyz] += valxy*thz[ithz];      \
 +        }                                            \
 +    }                                                \
 +}
 +
 +
-   
++/* Spread the charges of the atoms listed in spline->ind onto the grid of
++ * pmegrid. The whole grid is zeroed first. Grid indices taken from atc->idx
++ * are shifted by pmegrid->offset before use. Atoms with zero charge are
++ * skipped. For orders 4 and 5 with PME_SSE defined the kernel included from
++ * pme_sse_single.h is used, otherwise the DO_BSPLINE macro.
++ * NOTE(review): 'work' is not referenced directly in this function body;
++ * presumably the included SSE kernel uses it - confirm in pme_sse_single.h.
++ */
++static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
++                                     pme_atomcomm_t *atc, splinedata_t *spline,
++                                     pme_spline_work_t *work)
 +{
 +
 +    /* spread charges from home atoms to local grid */
++    real     *grid;
 +    pme_overlap_t *ol;
 +    int      b,i,nn,n,ithx,ithy,ithz,i0,j0,k0;
 +    int *    idxptr;
 +    int      order,norder,index_x,index_xy,index_xyz;
 +    real     valx,valxy,qn;
 +    real     *thx,*thy,*thz;
 +    int      localsize, bndsize;
-   
-     pnx = pme->pmegrid_nx;
-     pny = pme->pmegrid_ny;
-     pnz = pme->pmegrid_nz;
 +    int      pnx,pny,pnz,ndatatot;
-     
++    int      offx,offy,offz;
++
++    pnx = pmegrid->n[XX];
++    pny = pmegrid->n[YY];
++    pnz = pmegrid->n[ZZ];
++
++    offx = pmegrid->offset[XX];
++    offy = pmegrid->offset[YY];
++    offz = pmegrid->offset[ZZ];
++
 +    ndatatot = pnx*pny*pnz;
-     order = pme->pme_order;
++    grid = pmegrid->grid;
++    /* Clear the complete grid before accumulating charges */
 +    for(i=0;i<ndatatot;i++)
 +    {
 +        grid[i] = 0;
 +    }
 +
-     for(nn=0; (nn<atc->n);nn++) 
++    order = pmegrid->order;
 +
-         n      = nn;
-         qn     = atc->q[n];
++    for(nn=0; nn<spline->n; nn++)
 +    {
-         if (qn != 0) 
++        /* n is the atom index in atc, nn the index into the spline data */
++        n  = spline->ind[nn];
++        qn = atc->q[n];
 +
-             norder = n*order;
-             
-             i0   = idxptr[XX]; 
-             j0   = idxptr[YY];
-             k0   = idxptr[ZZ];
-             thx = atc->theta[XX] + norder;
-             thy = atc->theta[YY] + norder;
-             thz = atc->theta[ZZ] + norder;
-             
++        if (qn != 0)
 +        {
 +            idxptr = atc->idx[n];
-             case 4:  DO_BSPLINE(4);     break;
-             case 5:  DO_BSPLINE(5);     break;
-             default: DO_BSPLINE(order); break;
++            /* The spline coefficients are stored per entry of spline->ind,
++             * so they are addressed with nn, not with the atom index n.
++             */
++            norder = nn*order;
++
++            /* Convert to indices relative to the start of this grid */
++            i0   = idxptr[XX] - offx;
++            j0   = idxptr[YY] - offy;
++            k0   = idxptr[ZZ] - offz;
++
++            thx = spline->theta[XX] + norder;
++            thy = spline->theta[YY] + norder;
++            thz = spline->theta[ZZ] + norder;
++
 +            switch (order) {
-     } 
++            case 4:
++#ifdef PME_SSE
++#ifdef PME_SSE_UNALIGNED
++#define PME_SPREAD_SSE_ORDER4
++#else
++#define PME_SPREAD_SSE_ALIGNED
++#define PME_ORDER 4
++#endif
++#include "pme_sse_single.h"
++#else
++                DO_BSPLINE(4);
++#endif
++                break;
++            case 5:
++#ifdef PME_SSE
++#define PME_SPREAD_SSE_ALIGNED
++#define PME_ORDER 5
++#include "pme_sse_single.h"
++#else
++                DO_BSPLINE(5);
++#endif
++                break;
++            default:
++                DO_BSPLINE(order);
++                break;
 +            }
 +        }
- #if ( !defined(GMX_DOUBLE) && ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_X86_64_SSE2) ) )
-     /* Calculate exponentials through SSE in float precision */
- #define CALC_EXPONENTIALS(start,end,r_aligned)      \
-     {                                               \
-         __m128 tmp_sse;                             \
-         for(kx=0; kx<end; kx+=4)                    \
-         {                                           \
-             tmp_sse = _mm_load_ps(r_aligned+kx);    \
-             tmp_sse = gmx_mm_exp_ps(tmp_sse);       \
-             _mm_store_ps(r_aligned+kx,tmp_sse);     \
-         }                                           \
++    }
 +}
 +
 +
- #else
- #define CALC_EXPONENTIALS(start,end,r)          \
-     for(kx=start; kx<end; kx++)                 \
-     {                                           \
-         r[kx] = exp(r[kx]);                     \
++/* Allocate n+8 reals with snew and return two pointers:
++ * *ptr_raw is the pointer as allocated, *ptr is a 16-byte aligned address
++ * inside that allocation (the raw address plus 16, rounded down to a
++ * multiple of 16).
++ */
++static void alloc_real_aligned(int n,real **ptr_raw,real **ptr)
++{
++    size_t addr;
++
++    snew(*ptr_raw,n+8);
++
++    addr  = (size_t) *ptr_raw;
++    addr += 16;
++    addr &= ~((size_t) 15);
++
++    *ptr = (real *) addr;
++}
++/* Adjust the z size of a PME grid for the SSE kernels: with PME_SSE
++ * defined, *pmegrid_nz is rounded up to a multiple of 4 for pme_order 5 and,
++ * unless PME_SSE_UNALIGNED is defined, also for pme_order 4.
++ * Without PME_SSE this function does nothing.
++ */
++static void set_grid_alignment(int *pmegrid_nz,int pme_order)
++{
++#ifdef PME_SSE
++    if (pme_order == 5
++#ifndef PME_SSE_UNALIGNED
++        || pme_order == 4
++#endif
++        )
++    {
++        /* Round nz up to a multiple of 4 to ensure alignment */
++        *pmegrid_nz = ((*pmegrid_nz + 3) & ~3);
 +    }
- static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
-                          real ewaldcoeff,real vol,
-                          gmx_bool bEnerVir,real *mesh_energy,matrix vir)
++#endif
++}
++
++/* Adjust the number of elements to allocate for a PME grid: with PME_SSE
++ * defined and PME_SSE_UNALIGNED not defined, 4 elements are added to
++ * *gridsize for pme_order 4. In all other cases *gridsize is unchanged.
++ */
++static void set_gridsize_alignment(int *gridsize,int pme_order)
++{
++#ifdef PME_SSE
++#ifndef PME_SSE_UNALIGNED
++    if (pme_order == 4)
++    {
++        /* Add extra elements to ensure aligned operations do not go
++         * beyond the allocated grid size.
++         * Note that for pme_order=5, the pme grid z-size alignment
++         * ensures that we will not go beyond the grid size.
++         */
++         *gridsize += 4;
 +    }
 +#endif
++#endif
++}
 +
++/* Initialize a single pmegrid_t.
++ * cx,cy,cz       stored as the cell index grid->ci
++ * x0,y0,z0       stored as grid->offset
++ * x1,y1,z1       upper bounds; the size in each dimension becomes
++ *                (upper - lower) + pme_order - 1
++ * set_alignment  when TRUE the z size is replaced by the value returned by
++ *                set_grid_alignment(); when FALSE the z size must already be
++ *                aligned, otherwise gmx_incons() is called
++ * pme_order      stored as grid->order
++ * ptr            memory to use for the grid; when NULL, memory is allocated
++ *                here with snew_aligned (16-byte alignment)
++ */
++static void pmegrid_init(pmegrid_t *grid,
++                         int cx, int cy, int cz,
++                         int x0, int y0, int z0,
++                         int x1, int y1, int z1,
++                         gmx_bool set_alignment,
++                         int pme_order,
++                         real *ptr)
++{
++    int nz,gridsize;
++
++    grid->ci[XX] = cx;
++    grid->ci[YY] = cy;
++    grid->ci[ZZ] = cz;
++    grid->offset[XX] = x0;
++    grid->offset[YY] = y0;
++    grid->offset[ZZ] = z0;
++    grid->n[XX]      = x1 - x0 + pme_order - 1;
++    grid->n[YY]      = y1 - y0 + pme_order - 1;
++    grid->n[ZZ]      = z1 - z0 + pme_order - 1;
++
++    /* Compute the aligned z size on a copy, so that we can either apply it
++     * or only verify that the current size is already aligned.
++     */
++    nz = grid->n[ZZ];
++    set_grid_alignment(&nz,pme_order);
++    if (set_alignment)
++    {
++        grid->n[ZZ] = nz;
++    }
++    else if (nz != grid->n[ZZ])
++    {
++        gmx_incons("pmegrid_init call with an unaligned z size");
++    }
 +
-     int     nx,ny,nz,iy,iz,kxstart,kxend;
++    grid->order = pme_order;
++    if (ptr == NULL)
++    {
++        /* No memory supplied by the caller: allocate the grid here */
++        gridsize = grid->n[XX]*grid->n[YY]*grid->n[ZZ];
++        set_gridsize_alignment(&gridsize,pme_order);
++        snew_aligned(grid->grid,gridsize,16);
++    }
++    else
++    {
++        grid->grid = ptr;
++    }
++}
++
++/* Integer division of enumerator by denominator with the quotient rounded
++ * up (for a non-negative enumerator and a positive denominator).
++ */
++static int div_round_up(int enumerator,int denominator)
++{
++    int biased;
++
++    /* Adding denominator-1 turns the truncating division into rounding up */
++    biased = enumerator + (denominator - 1);
++
++    return biased/denominator;
++}
++
++/* Determine how a grid is divided over nthread threads.
++ * n        number of grid points per dimension that is divided
++ * ovl      number of overlap points added to each thread-local grid
++ * nthread  total number of threads; the division always uses all of them
++ * nsub     output: number of thread cells along x, y and z
++ *
++ * All factorizations nsx*nsy*nsz == nthread are tried and the one with the
++ * smallest thread-local grid (including overlap) is chosen; ties are broken
++ * in favor of fewer cuts in z and then in y.
++ * The environment variable GMX_PME_THREAD_DIVISION ("nx ny nz") overrides
++ * the automatic choice. It must contain three positive integers, otherwise
++ * a fatal error is issued; the original code ignored the sscanf result, so
++ * a malformed value silently overwrote only part of nsub.
++ * A fatal error is also issued when the product of the division does not
++ * equal nthread.
++ */
++static void make_subgrid_division(const ivec n,int ovl,int nthread,
++                                  ivec nsub)
++{
++    int gsize_opt,gsize;
++    int nsx,nsy,nsz;
++    char *env;
++
++    gsize_opt = -1;
++    for(nsx=1; nsx<=nthread; nsx++)
++    {
++        if (nthread % nsx == 0)
++        {
++            for(nsy=1; nsy<=nthread; nsy++)
++            {
++                if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0)
++                {
++                    nsz = nthread/(nsx*nsy);
++
++                    /* Determine the number of grid points per thread */
++                    gsize =
++                        (div_round_up(n[XX],nsx) + ovl)*
++                        (div_round_up(n[YY],nsy) + ovl)*
++                        (div_round_up(n[ZZ],nsz) + ovl);
++
++                    /* Minimize the number of grids points per thread
++                     * and, secondarily, the number of cuts in minor dimensions.
++                     * gsize_opt == -1 is tested first, so nsub is never read
++                     * before it has been assigned.
++                     */
++                    if (gsize_opt == -1 ||
++                        gsize < gsize_opt ||
++                        (gsize == gsize_opt &&
++                         (nsz < nsub[ZZ] || (nsz == nsub[ZZ] && nsy < nsub[YY]))))
++                    {
++                        nsub[XX] = nsx;
++                        nsub[YY] = nsy;
++                        nsub[ZZ] = nsz;
++                        gsize_opt = gsize;
++                    }
++                }
++            }
++        }
++    }
++
++    env = getenv("GMX_PME_THREAD_DIVISION");
++    if (env != NULL)
++    {
++        int env_nsx,env_nsy,env_nsz;
++
++        /* Parse into temporaries so that nsub is only changed when the
++         * complete user-supplied division could be read.
++         */
++        if (sscanf(env,"%d %d %d",&env_nsx,&env_nsy,&env_nsz) != 3 ||
++            env_nsx < 1 || env_nsy < 1 || env_nsz < 1)
++        {
++            gmx_fatal(FARGS,"GMX_PME_THREAD_DIVISION should contain three positive integers, but is set to '%s'",env);
++        }
++        nsub[XX] = env_nsx;
++        nsub[YY] = env_nsy;
++        nsub[ZZ] = env_nsz;
++    }
++
++    if (nsub[XX]*nsub[YY]*nsub[ZZ] != nthread)
++    {
++        gmx_fatal(FARGS,"PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)",nsub[XX],nsub[YY],nsub[ZZ],nthread);
++    }
++}
++
++/* Set up a pmegrids_t: the full local grid, the thread-local grids (only
++ * when nthread > 1), the grid-line to thread-index tables g2t and the
++ * thread communication ranges nthread_comm.
++ * nx,ny,nz   size of the local PME grid, including the pme_order-1 overlap
++ * nz_base    z size passed to make_subgrid_division() instead of the z size
++ *            derived from nz
++ * overlap_x, overlap_y   number of grid lines used for the communication
++ *            range in x and y; for z pme_order-1 is used
++ * Issues a fatal error when, in a dimension that is divided over threads,
++ * the communication range exceeds the number of thread cells.
++ */
++static void pmegrids_init(pmegrids_t *grids,
++                          int nx,int ny,int nz,int nz_base,
++                          int pme_order,
++                          int nthread,
++                          int overlap_x,
++                          int overlap_y)
++{
++    /* NOTE(review): g0 and g1 are declared but not used in this function */
++    ivec n,n_base,g0,g1;
++    int t,x,y,z,d,i,tfac;
++    int max_comm_lines;
++
++    /* Grid size without the pme_order-1 overlap lines */
++    n[XX] = nx - (pme_order - 1);
++    n[YY] = ny - (pme_order - 1);
++    n[ZZ] = nz - (pme_order - 1);
++
++    copy_ivec(n,n_base);
++    n_base[ZZ] = nz_base;
++
++    /* The full local grid. pmegrid_init adds the overlap back; with FALSE
++     * it requires that the resulting z size is already aligned.
++     */
++    pmegrid_init(&grids->grid,0,0,0,0,0,0,n[XX],n[YY],n[ZZ],FALSE,pme_order,
++                 NULL);
++
++    grids->nthread = nthread;
++
++    make_subgrid_division(n_base,pme_order-1,grids->nthread,grids->nc);
++
++    if (grids->nthread > 1)
++    {
++        ivec nst;
++        int gridsize;
++        real *grid_all;
++
++        /* Upper bound for the size of a thread-local grid */
++        for(d=0; d<DIM; d++)
++        {
++            nst[d] = div_round_up(n[d],grids->nc[d]) + pme_order - 1;
++        }
++        set_grid_alignment(&nst[ZZ],pme_order);
++
++        if (debug)
++        {
++            fprintf(debug,"pmegrid thread local division: %d x %d x %d\n",
++                    grids->nc[XX],grids->nc[YY],grids->nc[ZZ]);
++            fprintf(debug,"pmegrid %d %d %d max thread pmegrid %d %d %d\n",
++                    nx,ny,nz,
++                    nst[XX],nst[YY],nst[ZZ]);
++        }
++
++        snew(grids->grid_th,grids->nthread);
++        t = 0;
++        gridsize = nst[XX]*nst[YY]*nst[ZZ];
++        set_gridsize_alignment(&gridsize,pme_order);
++        /* One allocation holds all thread-local grids, with GMX_CACHE_SEP
++         * elements before, between and after the grids.
++         * NOTE(review): grid_all itself is not stored; only the pointers
++         * into it that are passed to pmegrid_init below are kept.
++         */
++        snew_aligned(grid_all,
++                     grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
++                     16);
++
++        /* Thread index t runs with z fastest, then y, then x */
++        for(x=0; x<grids->nc[XX]; x++)
++        {
++            for(y=0; y<grids->nc[YY]; y++)
++            {
++                for(z=0; z<grids->nc[ZZ]; z++)
++                {
++                    pmegrid_init(&grids->grid_th[t],
++                                 x,y,z,
++                                 (n[XX]*(x  ))/grids->nc[XX],
++                                 (n[YY]*(y  ))/grids->nc[YY],
++                                 (n[ZZ]*(z  ))/grids->nc[ZZ],
++                                 (n[XX]*(x+1))/grids->nc[XX],
++                                 (n[YY]*(y+1))/grids->nc[YY],
++                                 (n[ZZ]*(z+1))/grids->nc[ZZ],
++                                 TRUE,
++                                 pme_order,
++                                 grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP));
++                    t++;
++                }
++            }
++        }
++    }
++
++    /* g2t[d][i] is the thread cell index along d for grid line i, multiplied
++     * by tfac, the product of nc over the dimensions after d. With the
++     * thread ordering used above (z fastest), the sum of the entries for
++     * x, y and z equals the linear thread index.
++     */
++    snew(grids->g2t,DIM);
++    tfac = 1;
++    for(d=DIM-1; d>=0; d--)
++    {
++        snew(grids->g2t[d],n[d]);
++        t = 0;
++        for(i=0; i<n[d]; i++)
++        {
++            /* The second check should match the parameters
++             * of the pmegrid_init call above.
++             */
++            while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d])
++            {
++                t++;
++            }
++            grids->g2t[d][i] = t*tfac;
++        }
++
++        tfac *= grids->nc[d];
++
++        /* d only takes the values XX, YY and ZZ here, so max_comm_lines
++         * is always assigned.
++         */
++        switch (d)
++        {
++        case XX: max_comm_lines = overlap_x;     break;
++        case YY: max_comm_lines = overlap_y;     break;
++        case ZZ: max_comm_lines = pme_order - 1; break;
++        }
++        /* Smallest number of thread cells whose lower grid boundary
++         * reaches max_comm_lines grid lines.
++         */
++        grids->nthread_comm[d] = 0;
++        while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines)
++        {
++            grids->nthread_comm[d]++;
++        }
++        if (debug != NULL)
++        {
++            fprintf(debug,"pmegrid thread grid communication range in %c: %d\n",
++                    'x'+d,grids->nthread_comm[d]);
++        }
++        /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
++         * work, but this is not a problematic restriction.
++         */
++        if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
++        {
++            gmx_fatal(FARGS,"Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME",grids->nthread);
++        }
++    }
++}
++
++
++/* Free the memory allocated by pmegrids_init(). Nothing is done when
++ * grids->grid.grid is NULL, i.e. when the grids were not initialized.
++ *
++ * The releases mirror the allocations in pmegrids_init()/pmegrid_init():
++ * - the full grid is allocated with snew_aligned and is therefore freed
++ *   with sfree_aligned, not sfree;
++ * - the thread-local grids only exist when nthread > 1 and all live in one
++ *   snew_aligned block; grid_th[t].grid are pointers into that block, so
++ *   the block is freed once through its start address instead of freeing
++ *   every interior pointer;
++ * - the g2t tables are freed as well.
++ */
++static void pmegrids_destroy(pmegrids_t *grids)
++{
++    int d;
++
++    if (grids->grid.grid != NULL)
++    {
++        sfree_aligned(grids->grid.grid);
++        grids->grid.grid = NULL;
++
++        if (grids->nthread > 1)
++        {
++            /* pmegrids_init sets grid_th[0].grid to the start of the
++             * shared block plus GMX_CACHE_SEP.
++             */
++            sfree_aligned(grids->grid_th[0].grid - GMX_CACHE_SEP);
++            sfree(grids->grid_th);
++            grids->grid_th = NULL;
++        }
++
++        if (grids->g2t != NULL)
++        {
++            for(d=0; d<DIM; d++)
++            {
++                sfree(grids->g2t[d]);
++            }
++            sfree(grids->g2t);
++            grids->g2t = NULL;
++        }
++    }
++}
++
++
++/* Make sure the work arrays in work can hold at least nkx elements.
++ * Nothing happens when the current allocation (work->nalloc) suffices.
++ *
++ * mhx, mhy, mhz, m2 and m2inv are resized with srenew. denom, tmp1 and
++ * eterm are used with aligned SSE loads/stores and are managed exclusively
++ * with snew_aligned/sfree_aligned: they are freed and allocated anew, so
++ * their contents are not preserved. denom must not be passed to srenew as
++ * well, since that mixes two allocators on the same pointer.
++ */
++static void realloc_work(pme_work_t *work,int nkx)
++{
++    if (nkx > work->nalloc)
++    {
++        work->nalloc = nkx;
++        srenew(work->mhx  ,work->nalloc);
++        srenew(work->mhy  ,work->nalloc);
++        srenew(work->mhz  ,work->nalloc);
++        srenew(work->m2   ,work->nalloc);
++        srenew(work->m2inv,work->nalloc);
++        /* Allocate an aligned pointer for SSE operations, including 3 extra
++         * elements at the end since SSE operates on 4 elements at a time.
++         */
++        sfree_aligned(work->denom);
++        sfree_aligned(work->tmp1);
++        sfree_aligned(work->eterm);
++        snew_aligned(work->denom,work->nalloc+3,16);
++        snew_aligned(work->tmp1 ,work->nalloc+3,16);
++        snew_aligned(work->eterm,work->nalloc+3,16);
++    }
++}
++
++
++/* Release all work arrays held by work. The arrays used for aligned SSE
++ * access (denom, tmp1, eterm) are released with sfree_aligned, all other
++ * arrays with sfree.
++ */
++static void free_work(pme_work_t *work)
++{
++    /* Aligned arrays */
++    sfree_aligned(work->denom);
++    sfree_aligned(work->tmp1);
++    sfree_aligned(work->eterm);
++
++    /* Plain arrays */
++    sfree(work->mhx);
++    sfree(work->mhy);
++    sfree(work->mhz);
++    sfree(work->m2);
++    sfree(work->m2inv);
++}
++
++
++/* calc_exponentials: computes e[kx] = f*exp(r[kx])/d[kx].
++ * Two implementations are provided, selected by PME_SSE; they differ in
++ * the index range they process and in their side effects on d and r.
++ */
++#ifdef PME_SSE
++    /* Calculate exponentials through SSE in float precision */
++/* SSE version: processes 4 elements per iteration, starting at index 0 and
++ * continuing while kx < end; 'start' is not used. All three arrays are
++ * accessed with aligned loads/stores. d_aligned and r_aligned are only
++ * read, the result is stored in e_aligned.
++ */
++inline static void calc_exponentials(int start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
++{
++    {
++        const __m128 two = _mm_set_ps(2.0f,2.0f,2.0f,2.0f);
++        __m128 f_sse;
++        __m128 lu;
++        __m128 tmp_d1,d_inv,tmp_r,tmp_e;
++        int kx;
++        f_sse = _mm_load1_ps(&f);
++        for(kx=0; kx<end; kx+=4)
++        {
++            tmp_d1   = _mm_load_ps(d_aligned+kx);
++            /* Approximate reciprocal, refined as lu*(2 - lu*d) */
++            lu       = _mm_rcp_ps(tmp_d1);
++            d_inv    = _mm_mul_ps(lu,_mm_sub_ps(two,_mm_mul_ps(lu,tmp_d1)));
++            tmp_r    = _mm_load_ps(r_aligned+kx);
++            tmp_r    = gmx_mm_exp_ps(tmp_r);
++            tmp_e    = _mm_mul_ps(f_sse,d_inv);
++            tmp_e    = _mm_mul_ps(tmp_e,tmp_r);
++            _mm_store_ps(e_aligned+kx,tmp_e);
++        }
++    }
++}
++#else
++/* Plain C version: processes indices start <= kx < end.
++ * Unlike the SSE version it works in place: d is overwritten with 1/d and
++ * r with exp(r) before e is computed from them.
++ */
++inline static void calc_exponentials(int start, int end, real f, real *d, real *r, real *e)
++{
++    int kx;
++    for(kx=start; kx<end; kx++)
++    {
++        d[kx] = 1.0/d[kx];
++    }
++    for(kx=start; kx<end; kx++)
++    {
++        r[kx] = exp(r[kx]);
++    }
++    for(kx=start; kx<end; kx++)
++    {
++        e[kx] = f*r[kx]*d[kx];
++    }
++}
++#endif
++
++
++static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
++                         real ewaldcoeff,real vol,
++                         gmx_bool bEnerVir,
++                         int nthread,int thread)
 +{
 +    /* do recip sum over local cells in grid */
 +    /* y major, z middle, x minor or continuous */
 +    t_complex *p0;
 +    int     kx,ky,kz,maxkx,maxky,maxkz;
-     real    eterm,d1,d2,energy=0;
++    int     nx,ny,nz,iyz0,iyz1,iyz,iy,iz,kxstart,kxend;
 +    real    mx,my,mz;
 +    real    factor=M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
 +    real    ets2,struct2,vfactor,ets2vf;
-       real    *mhx,*mhy,*mhz,*m2,*denom,*tmp1,*m2inv;
++    real    d1,d2,energy=0;
 +    real    by,bz;
 +    real    virxx=0,virxy=0,virxz=0,viryy=0,viryz=0,virzz=0;
 +    real    rxx,ryx,ryy,rzx,rzy,rzz;
-     
++    pme_work_t *work;
++    real    *mhx,*mhy,*mhz,*m2,*denom,*tmp1,*eterm,*m2inv;
 +    real    mhxk,mhyk,mhzk,m2k;
 +    real    corner_fac;
 +    ivec    complex_order;
 +    ivec    local_ndata,local_offset,local_size;
-     
++    real    elfac;
++
++    elfac = ONE_4PI_EPS0/pme->epsilon_r;
++
 +    nx = pme->nkx;
 +    ny = pme->nky;
 +    nz = pme->nkz;
-     
++
 +    /* Dimensions should be identical for A/B grid, so we just use A here */
 +    gmx_parallel_3dfft_complex_limits(pme->pfft_setupA,
 +                                      complex_order,
 +                                      local_ndata,
 +                                      local_offset,
 +                                      local_size);
-     
++
 +    rxx = pme->recipbox[XX][XX];
 +    ryx = pme->recipbox[YY][XX];
 +    ryy = pme->recipbox[YY][YY];
 +    rzx = pme->recipbox[ZZ][XX];
 +    rzy = pme->recipbox[ZZ][YY];
 +    rzz = pme->recipbox[ZZ][ZZ];
-       
-       mhx   = pme->work_mhx;
-       mhy   = pme->work_mhy;
-       mhz   = pme->work_mhz;
-       m2    = pme->work_m2;
-       denom = pme->work_denom;
-       tmp1  = pme->work_tmp1;
-       m2inv = pme->work_m2inv;        
++
 +    maxkx = (nx+1)/2;
 +    maxky = (ny+1)/2;
 +    maxkz = nz/2+1;
-     for(iy=0;iy<local_ndata[YY];iy++)
 +
-         
-         if (ky < maxky) 
++    work = &pme->work[thread];
++    mhx   = work->mhx;
++    mhy   = work->mhy;
++    mhz   = work->mhz;
++    m2    = work->m2;
++    denom = work->denom;
++    tmp1  = work->tmp1;
++    eterm = work->eterm;
++    m2inv = work->m2inv;
++
++    iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread   /nthread;
++    iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
++
++    for(iyz=iyz0; iyz<iyz1; iyz++)
 +    {
++        iy = iyz/local_ndata[ZZ];
++        iz = iyz - iy*local_ndata[ZZ];
++
 +        ky = iy + local_offset[YY];
-         else 
++
++        if (ky < maxky)
 +        {
 +            my = ky;
 +        }
-         
++        else
 +        {
 +            my = (ky - ny);
 +        }
-         for(iz=0;iz<local_ndata[ZZ];iz++)
-         {
-             kz = iz + local_offset[ZZ];
-             
-             mz = kz;
-             bz = pme->bsp_mod[ZZ][kz];
-             
-             /* 0.5 correction for corner points */
-                       corner_fac = 1;
-             if (kz == 0)
-                 corner_fac = 0.5;
-             if (kz == (nz+1)/2)
-                 corner_fac = 0.5;
-                       
-             p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
-             
-             /* We should skip the k-space point (0,0,0) */
-             if (local_offset[XX] > 0 ||
-                 local_offset[YY] > 0 || ky > 0 ||
-                 kz > 0)
++
 +        by = M_PI*vol*pme->bsp_mod[YY][ky];
 +
-                 kxstart = local_offset[XX];
++        kz = iz + local_offset[ZZ];
++
++        mz = kz;
++
++        bz = pme->bsp_mod[ZZ][kz];
++
++        /* 0.5 correction for corner points */
++        corner_fac = 1;
++        if (kz == 0 || kz == (nz+1)/2)
++        {
++            corner_fac = 0.5;
++        }
++
++        p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
++
++        /* We should skip the k-space point (0,0,0) */
++        if (local_offset[XX] > 0 || ky > 0 || kz > 0)
++        {
++            kxstart = local_offset[XX];
++        }
++        else
++        {
++            kxstart = local_offset[XX] + 1;
++            p0++;
++        }
++        kxend = local_offset[XX] + local_ndata[XX];
++
++        if (bEnerVir)
++        {
++            /* More expensive inner loop, especially because of the storage
++             * of the mh elements in array's.
++             * Because x is the minor grid index, all mh elements
++             * depend on kx for triclinic unit cells.
++             */
++
++                /* Two explicit loops to avoid a conditional inside the loop */
++            for(kx=kxstart; kx<maxkx; kx++)
 +            {
-             else
++                mx = kx;
++
++                mhxk      = mx * rxx;
++                mhyk      = mx * ryx + my * ryy;
++                mhzk      = mx * rzx + my * rzy + mz * rzz;
++                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
++                mhx[kx]   = mhxk;
++                mhy[kx]   = mhyk;
++                mhz[kx]   = mhzk;
++                m2[kx]    = m2k;
++                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
++                tmp1[kx]  = -factor*m2k;
 +            }
-                 kxstart = local_offset[XX] + 1;
-                 p0++;
++
++            for(kx=maxkx; kx<kxend; kx++)
 +            {
-             kxend = local_offset[XX] + local_ndata[XX];
-                       
-             if (bEnerVir)
++                mx = (kx - nx);
++
++                mhxk      = mx * rxx;
++                mhyk      = mx * ryx + my * ryy;
++                mhzk      = mx * rzx + my * rzy + mz * rzz;
++                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
++                mhx[kx]   = mhxk;
++                mhy[kx]   = mhyk;
++                mhz[kx]   = mhzk;
++                m2[kx]    = m2k;
++                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
++                tmp1[kx]  = -factor*m2k;
 +            }
-                 /* More expensive inner loop, especially because of the storage
-                  * of the mh elements in array's.
-                  * Because x is the minor grid index, all mh elements
-                  * depend on kx for triclinic unit cells.
-                  */
++
++            for(kx=kxstart; kx<kxend; kx++)
 +            {
-                 /* Two explicit loops to avoid a conditional inside the loop */
-                 for(kx=kxstart; kx<maxkx; kx++)
-                 {
-                     mx = kx;
-                     
-                     mhxk      = mx * rxx;
-                     mhyk      = mx * ryx + my * ryy;
-                     mhzk      = mx * rzx + my * rzy + mz * rzz;
-                     m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
-                     mhx[kx]   = mhxk;
-                     mhy[kx]   = mhyk;
-                     mhz[kx]   = mhzk;
-                     m2[kx]    = m2k;
-                     denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
-                     tmp1[kx]  = -factor*m2k;
-                 }
-                 
-                 for(kx=maxkx; kx<kxend; kx++)
-                 {
-                     mx = (kx - nx);
-                     mhxk      = mx * rxx;
-                     mhyk      = mx * ryx + my * ryy;
-                     mhzk      = mx * rzx + my * rzy + mz * rzz;
-                     m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
-                     mhx[kx]   = mhxk;
-                     mhy[kx]   = mhyk;
-                     mhz[kx]   = mhzk;
-                     m2[kx]    = m2k;
-                     denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
-                     tmp1[kx]  = -factor*m2k;
-                 }
-                 
-                 for(kx=kxstart; kx<kxend; kx++)
-                 {
-                     m2inv[kx] = 1.0/m2[kx];
-                 }
-                 for(kx=kxstart; kx<kxend; kx++)
-                 {
-                     denom[kx] = 1.0/denom[kx];
-                 }
++                m2inv[kx] = 1.0/m2[kx];
++            }
 +
-                 CALC_EXPONENTIALS(kxstart,kxend,tmp1);
++            calc_exponentials(kxstart,kxend,elfac,denom,tmp1,eterm);
 +
-                 for(kx=kxstart; kx<kxend; kx++,p0++)
-                 {
-                     d1      = p0->re;
-                     d2      = p0->im;
-                     
-                     eterm    = ONE_4PI_EPS0/pme->epsilon_r*tmp1[kx]*denom[kx];
-                     
-                     p0->re  = d1*eterm;
-                     p0->im  = d2*eterm;
-                     
-                     struct2 = 2.0*(d1*d1+d2*d2);
-                     
-                     tmp1[kx] = eterm*struct2;
-                 }
-                 
-                 for(kx=kxstart; kx<kxend; kx++)
-                 {
-                     ets2     = corner_fac*tmp1[kx];
-                     vfactor  = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
-                     energy  += ets2;
-                     
-                     ets2vf   = ets2*vfactor;
-                     virxx   += ets2vf*mhx[kx]*mhx[kx] - ets2;
-                     virxy   += ets2vf*mhx[kx]*mhy[kx];
-                     virxz   += ets2vf*mhx[kx]*mhz[kx];
-                     viryy   += ets2vf*mhy[kx]*mhy[kx] - ets2;
-                     viryz   += ets2vf*mhy[kx]*mhz[kx];
-                     virzz   += ets2vf*mhz[kx]*mhz[kx] - ets2;
-                 }
++            for(kx=kxstart; kx<kxend; kx++,p0++)
++            {
++                d1      = p0->re;
++                d2      = p0->im;
 +
-             else
++                p0->re  = d1*eterm[kx];
++                p0->im  = d2*eterm[kx];
++
++                struct2 = 2.0*(d1*d1+d2*d2);
++
++                tmp1[kx] = eterm[kx]*struct2;
 +            }
-                 /* We don't need to calculate the energy and the virial.
-                  * In this case the triclinic overhead is small.
-                  */
++
++            for(kx=kxstart; kx<kxend; kx++)
 +            {
-                 /* Two explicit loops to avoid a conditional inside the loop */
++                ets2     = corner_fac*tmp1[kx];
++                vfactor  = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
++                energy  += ets2;
++
++                ets2vf   = ets2*vfactor;
++                virxx   += ets2vf*mhx[kx]*mhx[kx] - ets2;
++                virxy   += ets2vf*mhx[kx]*mhy[kx];
++                virxz   += ets2vf*mhx[kx]*mhz[kx];
++                viryy   += ets2vf*mhy[kx]*mhy[kx] - ets2;
++                viryz   += ets2vf*mhy[kx]*mhz[kx];
++                virzz   += ets2vf*mhz[kx]*mhz[kx] - ets2;
++            }
++        }
++        else
++        {
++            /* We don't need to calculate the energy and the virial.
++             * In this case the triclinic overhead is small.
++             */
 +
-                 for(kx=kxstart; kx<maxkx; kx++)
-                 {
-                     mx = kx;
-                     
-                     mhxk      = mx * rxx;
-                     mhyk      = mx * ryx + my * ryy;
-                     mhzk      = mx * rzx + my * rzy + mz * rzz;
-                     m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
-                     denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
-                     tmp1[kx]  = -factor*m2k;
-                 }
-                 
-                 for(kx=maxkx; kx<kxend; kx++)
-                 {
-                     mx = (kx - nx);
-                     
-                     mhxk      = mx * rxx;
-                     mhyk      = mx * ryx + my * ryy;
-                     mhzk      = mx * rzx + my * rzy + mz * rzz;
-                     m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
-                     denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
-                     tmp1[kx]  = -factor*m2k;
-                 }
-                 
-                 for(kx=kxstart; kx<kxend; kx++)
-                 {
-                     denom[kx] = 1.0/denom[kx];
-                 }
++            /* Two explicit loops to avoid a conditional inside the loop */
 +
-                 CALC_EXPONENTIALS(kxstart,kxend,tmp1);
-                
-                 for(kx=kxstart; kx<kxend; kx++,p0++)
-                 {
-                     d1      = p0->re;
-                     d2      = p0->im;
-                     
-                     eterm    = ONE_4PI_EPS0/pme->epsilon_r*tmp1[kx]*denom[kx];
-                     
-                     p0->re  = d1*eterm;
-                     p0->im  = d2*eterm;
-                 }
++            for(kx=kxstart; kx<maxkx; kx++)
++            {
++                mx = kx;
++
++                mhxk      = mx * rxx;
++                mhyk      = mx * ryx + my * ryy;
++                mhzk      = mx * rzx + my * rzy + mz * rzz;
++                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
++                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
++                tmp1[kx]  = -factor*m2k;
++            }
 +
-     
++            for(kx=maxkx; kx<kxend; kx++)
++            {
++                mx = (kx - nx);
++
++                mhxk      = mx * rxx;
++                mhyk      = mx * ryx + my * ryy;
++                mhzk      = mx * rzx + my * rzy + mz * rzz;
++                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
++                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
++                tmp1[kx]  = -factor*m2k;
++            }
++
++            calc_exponentials(kxstart,kxend,elfac,denom,tmp1,eterm);
++
++            for(kx=kxstart; kx<kxend; kx++,p0++)
++            {
++                d1      = p0->re;
++                d2      = p0->im;
++
++                p0->re  = d1*eterm[kx];
++                p0->im  = d2*eterm[kx];
 +            }
 +        }
 +    }
-         vir[XX][XX] = 0.25*virxx;
-         vir[YY][YY] = 0.25*viryy;
-         vir[ZZ][ZZ] = 0.25*virzz;
-         vir[XX][YY] = vir[YY][XX] = 0.25*virxy;
-         vir[XX][ZZ] = vir[ZZ][XX] = 0.25*virxz;
-         vir[YY][ZZ] = vir[ZZ][YY] = 0.25*viryz;
-         
++
 +    if (bEnerVir)
 +    {
 +        /* Update virial with local values.
 +         * The virial is symmetric by definition.
 +         * this virial seems ok for isotropic scaling, but I'm
 +         * experiencing problems on semiisotropic membranes.
 +         * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
 +         */
-         *mesh_energy = 0.5*energy;
++        work->vir[XX][XX] = 0.25*virxx;
++        work->vir[YY][YY] = 0.25*viryy;
++        work->vir[ZZ][ZZ] = 0.25*virzz;
++        work->vir[XX][YY] = work->vir[YY][XX] = 0.25*virxy;
++        work->vir[XX][ZZ] = work->vir[ZZ][XX] = 0.25*virxz;
++        work->vir[YY][ZZ] = work->vir[ZZ][YY] = 0.25*viryz;
++
 +        /* This energy should be corrected for a charged system */
-     return local_ndata[YY]*local_ndata[ZZ]*local_ndata[XX];
++        work->energy = 0.5*energy;
 +    }
 +
 +    /* Return the loop count */
- {                                                                        \
++    return local_ndata[YY]*local_ndata[XX];
++}
++
++static void get_pme_ener_vir(const gmx_pme_t pme,int nthread,
++                             real *mesh_energy,matrix vir)
++{
++    /* This function sums the per-thread output pme->work[thread].energy
++     * and pme->work[thread].vir over threads 0..nthread-1 and stores the
++     * totals in *mesh_energy and vir. Thread 0 initializes the totals,
++     * the remaining threads are added on top.
++     * Because it reads the output of all threads, it should only be
++     * called after thread synchronization.
++     */
++    int thread;
++
++    *mesh_energy = pme->work[0].energy;
++    copy_mat(pme->work[0].vir,vir);
++
++    for(thread=1; thread<nthread; thread++)
++    {
++        *mesh_energy += pme->work[thread].energy;
++        m_add(vir,pme->work[thread].vir,vir);
++    }
++}
++
++/* Call solve_pme_yzx() once for every thread index 0..pme->nthread-1 from
++ * an OpenMP parallel loop. When bEnerVir is set, the per-thread energy and
++ * virial are summed into *mesh_energy and vir after the loop has finished;
++ * otherwise *mesh_energy and vir are not touched.
++ * Returns the value solve_pme_yzx() returned for thread 0.
++ */
++static int solve_pme_yzx_wrapper(gmx_pme_t pme,t_complex *grid,
++                                 real ewaldcoeff,real vol,
++                                 gmx_bool bEnerVir,real *mesh_energy,matrix vir)
++{
++    int  nthread,thread;
++    int  nelements=0;
++
++    nthread = pme->nthread;
++
++#pragma omp parallel for num_threads(nthread) schedule(static)
++    for(thread=0; thread<nthread; thread++)
++    {
++        int n;
++
++        n = solve_pme_yzx(pme,grid,ewaldcoeff,vol,bEnerVir,nthread,thread);
++        /* Only the thread==0 iteration writes the shared nelements */
++        if (thread == 0)
++        {
++            nelements = n;
++        }
++    }
++
++    if (bEnerVir)
++    {
++        get_pme_ener_vir(pme,nthread,mesh_energy,vir);
++    }
++
++    return nelements;
 +}
 +
 +
 +#define DO_FSPLINE(order)                      \
 +for(ithx=0; (ithx<order); ithx++)              \
-     {                                                                            \
++{                                              \
 +    index_x = (i0+ithx)*pny*pnz;               \
 +    tx      = thx[ithx];                       \
 +    dx      = dthx[ithx];                      \
 +                                               \
 +    for(ithy=0; (ithy<order); ithy++)          \
-         {                                                                        \
++    {                                          \
 +        index_xy = index_x+(j0+ithy)*pnz;      \
 +        ty       = thy[ithy];                  \
 +        dy       = dthy[ithy];                 \
 +        fxy1     = fz1 = 0;                    \
 +                                               \
 +        for(ithz=0; (ithz<order); ithz++)      \
-                        gmx_bool bClearF,pme_atomcomm_t *atc,real scale)
++        {                                      \
 +            gval  = grid[index_xy+(k0+ithz)];  \
 +            fxy1 += thz[ithz]*gval;            \
 +            fz1  += dthz[ithz]*gval;           \
 +        }                                      \
 +        fx += dx*ty*fxy1;                      \
 +        fy += tx*dy*fxy1;                      \
 +        fz += tx*ty*fz1;                       \
 +    }                                          \
 +}
 +
 +
++/* Gather the reciprocal-space force for every particle listed in
++ * spline->ind by interpolating grid with the B-spline weights (theta) and
++ * their derivatives (dtheta) stored in spline, and add it to atc->f.
++ *
++ * bClearF  when set, atc->f[n] is zeroed for each listed particle
++ *          (also for uncharged ones) before the force is added.
++ * scale    factor multiplied into each particle charge atc->q[n].
++ */
 +void gather_f_bsplines(gmx_pme_t pme,real *grid,
-     /* sum forces for local particles */  
++                       gmx_bool bClearF,pme_atomcomm_t *atc,
++                       splinedata_t *spline,
++                       real scale)
 +{
-     
++    /* sum forces for local particles */
 +    int     nn,n,ithx,ithy,ithz,i0,j0,k0;
 +    int     index_x,index_xy;
 +    int     nx,ny,nz,pnx,pny,pnz;
 +    int *   idxptr;
 +    real    tx,ty,dx,dy,qn;
 +    real    fx,fy,fz,gval;
 +    real    fxy1,fz1;
 +    real    *thx,*thy,*thz,*dthx,*dthy,*dthz;
 +    int     norder;
 +    real    rxx,ryx,ryy,rzx,rzy,rzz;
 +    int     order;
-     thx   = atc->theta[XX];
-     thy   = atc->theta[YY];
-     thz   = atc->theta[ZZ];
-     dthx  = atc->dtheta[XX];
-     dthy  = atc->dtheta[YY];
-     dthz  = atc->dtheta[ZZ];
++
++    pme_spline_work_t *work;
++
++    work = &pme->spline_work;
++
 +    order = pme->pme_order;
-     
++    thx   = spline->theta[XX];
++    thy   = spline->theta[YY];
++    thz   = spline->theta[ZZ];
++    dthx  = spline->dtheta[XX];
++    dthy  = spline->dtheta[YY];
++    dthz  = spline->dtheta[ZZ];
 +    nx    = pme->nkx;
 +    ny    = pme->nky;
 +    nz    = pme->nkz;
 +    pnx   = pme->pmegrid_nx;
 +    pny   = pme->pmegrid_ny;
 +    pnz   = pme->pmegrid_nz;
-     for(nn=0; (nn<atc->n); nn++) 
++
 +    rxx   = pme->recipbox[XX][XX];
 +    ryx   = pme->recipbox[YY][XX];
 +    ryy   = pme->recipbox[YY][YY];
 +    rzx   = pme->recipbox[ZZ][XX];
 +    rzy   = pme->recipbox[ZZ][YY];
 +    rzz   = pme->recipbox[ZZ][ZZ];
 +
-         n = nn;
-         qn      = scale*atc->q[n];
-         
-         if (bClearF) 
++    for(nn=0; nn<spline->n; nn++)
 +    {
-         if (qn != 0) 
++        /* nn indexes the spline data, n the particle in atc */
++        n  = spline->ind[nn];
++        /* Apply the caller's scale factor to the charge; without it the
++         * scale argument would be ignored and the force left unscaled.
++         */
++        qn = scale*atc->q[n];
++
++        if (bClearF)
 +        {
 +            atc->f[n][XX] = 0;
 +            atc->f[n][YY] = 0;
 +            atc->f[n][ZZ] = 0;
 +        }
-             norder = n*order;
-             
-             i0   = idxptr[XX]; 
++        if (qn != 0)
 +        {
 +            fx     = 0;
 +            fy     = 0;
 +            fz     = 0;
 +            idxptr = atc->idx[n];
-             
++            norder = nn*order;
++
++            i0   = idxptr[XX];
 +            j0   = idxptr[YY];
 +            k0   = idxptr[ZZ];
-             thx  = atc->theta[XX] + norder;
-             thy  = atc->theta[YY] + norder;
-             thz  = atc->theta[ZZ] + norder;
-             dthx = atc->dtheta[XX] + norder;
-             dthy = atc->dtheta[YY] + norder;
-             dthz = atc->dtheta[ZZ] + norder;
-             
++
 +            /* Pointer arithmetic alert, next six statements */
-             case 4:  DO_FSPLINE(4);     break;
-             case 5:  DO_FSPLINE(5);     break;
-             default: DO_FSPLINE(order); break;
++            thx  = spline->theta[XX] + norder;
++            thy  = spline->theta[YY] + norder;
++            thz  = spline->theta[ZZ] + norder;
++            dthx = spline->dtheta[XX] + norder;
++            dthy = spline->dtheta[YY] + norder;
++            dthz = spline->dtheta[ZZ] + norder;
++
 +            switch (order) {
-     
-     
++            case 4:
++#ifdef PME_SSE
++#ifdef PME_SSE_UNALIGNED
++#define PME_GATHER_F_SSE_ORDER4
++#else
++#define PME_GATHER_F_SSE_ALIGNED
++#define PME_ORDER 4
++#endif
++#include "pme_sse_single.h"
++#else
++                DO_FSPLINE(4);
++#endif
++                break;
++            case 5:
++#ifdef PME_SSE
++#define PME_GATHER_F_SSE_ALIGNED
++#define PME_ORDER 5
++#include "pme_sse_single.h"
++#else
++                DO_FSPLINE(5);
++#endif
++                break;
++            default:
++                DO_FSPLINE(order);
++                break;
 +            }
 +
 +            atc->f[n][XX] += -qn*( fx*nx*rxx );
 +            atc->f[n][YY] += -qn*( fx*nx*ryx + fy*ny*ryy );
 +            atc->f[n][ZZ] += -qn*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
 +        }
 +    }
 +    /* Since the energy and not forces are interpolated
 +     * the net force might not be exactly zero.
 +     * This can be solved by also interpolating F, but
 +     * that comes at a cost.
 +     * A better hack is to remove the net force every
 +     * step, but that must be done at a higher level
 +     * since this routine doesn't see all atoms if running
 +     * in parallel. Don't know how important it is?  EL 990726
 +     */
 +}
 +
++
 +static real gather_energy_bsplines(gmx_pme_t pme,real *grid,
 +                                   pme_atomcomm_t *atc)
 +{
++    splinedata_t *spline;
 +    int     n,ithx,ithy,ithz,i0,j0,k0;
 +    int     index_x,index_xy;
 +    int *   idxptr;
 +    real    energy,pot,tx,ty,qn,gval;
 +    real    *thx,*thy,*thz;
 +    int     norder;
 +    int     order;
-     
++
++    spline = &atc->spline[0];
++
 +    order = pme->pme_order;
-         
++
 +    energy = 0;
 +    for(n=0; (n<atc->n); n++) {
 +        qn      = atc->q[n];
-             
-             i0   = idxptr[XX]; 
++
 +        if (qn != 0) {
 +            idxptr = atc->idx[n];
 +            norder = n*order;
-             
++
++            i0   = idxptr[XX];
 +            j0   = idxptr[YY];
 +            k0   = idxptr[ZZ];
-             thx  = atc->theta[XX] + norder;
-             thy  = atc->theta[YY] + norder;
-             thz  = atc->theta[ZZ] + norder;
++
 +            /* Pointer arithmetic alert, next three statements */
-                    rvec fractx[],int nr,real charge[],
++            thx  = spline->theta[XX] + norder;
++            thy  = spline->theta[YY] + norder;
++            thz  = spline->theta[ZZ] + norder;
 +
 +            pot = 0;
 +            for(ithx=0; (ithx<order); ithx++)
 +            {
 +                index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz;
 +                tx      = thx[ithx];
 +
 +                for(ithy=0; (ithy<order); ithy++)
 +                {
 +                    index_xy = index_x+(j0+ithy)*pme->pmegrid_nz;
 +                    ty       = thy[ithy];
 +
 +                    for(ithz=0; (ithz<order); ithz++)
 +                    {
 +                        gval  = grid[index_xy+(k0+ithz)];
 +                        pot  += tx*ty*thz[ithz]*gval;
 +                    }
 +
 +                }
 +            }
 +
 +            energy += pot*qn;
 +        }
 +    }
 +
 +    return energy;
 +}
 +
++/* Macro to force loop unrolling by fixing order.
++ * This gives a significant performance gain.
++ */
++#define CALC_SPLINE(order)                     \
++{                                              \
++    int j,k,l;                                 \
++    real dr,div;                               \
++    real data[PME_ORDER_MAX];                  \
++    real ddata[PME_ORDER_MAX];                 \
++                                               \
++    for(j=0; (j<DIM); j++)                     \
++    {                                          \
++        dr  = xptr[j];                         \
++                                               \
++        /* dr is relative offset from lower cell limit */ \
++        data[order-1] = 0;                     \
++        data[1] = dr;                          \
++        data[0] = 1 - dr;                      \
++                                               \
++        for(k=3; (k<order); k++)               \
++        {                                      \
++            div = 1.0/(k - 1.0);               \
++            data[k-1] = div*dr*data[k-2];      \
++            for(l=1; (l<(k-1)); l++)           \
++            {                                  \
++                data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
++                                   data[k-l-1]);                \
++            }                                  \
++            data[0] = div*(1-dr)*data[0];      \
++        }                                      \
++        /* differentiate */                    \
++        ddata[0] = -data[0];                   \
++        for(k=1; (k<order); k++)               \
++        {                                      \
++            ddata[k] = data[k-1] - data[k];    \
++        }                                      \
++                                               \
++        div = 1.0/(order - 1);                 \
++        data[order-1] = div*dr*data[order-2];  \
++        for(l=1; (l<(order-1)); l++)           \
++        {                                      \
++            data[order-l-1] = div*((dr+l)*data[order-l-2]+    \
++                               (order-l-dr)*data[order-l-1]); \
++        }                                      \
++        data[0] = div*(1 - dr)*data[0];        \
++                                               \
++        for(k=0; k<order; k++)                 \
++        {                                      \
++            theta[j][i*order+k]  = data[k];    \
++            dtheta[j][i*order+k] = ddata[k];   \
++        }                                      \
++    }                                          \
++}
++
 +void make_bsplines(splinevec theta,splinevec dtheta,int order,
-     int  i,j,k,l;
-     real dr,div;
-     real *data,*ddata,*xptr;
-     
-     for(i=0; (i<nr); i++) {
++                   rvec fractx[],int nr,int ind[],real charge[],
 +                   gmx_bool bFreeEnergy)
 +{
 +    /* construct splines for local atoms */
-         if (bFreeEnergy || charge[i] != 0.0) {
-             xptr = fractx[i];
-             for(j=0; (j<DIM); j++) {
-                 dr  = xptr[j];
-                 
-                 /* dr is relative offset from lower cell limit */
-                 data=&(theta[j][i*order]);
-                 data[order-1]=0;
-                 data[1]=dr;
-                 data[0]=1-dr;
-                 
-                 for(k=3; (k<order); k++) {
-                     div=1.0/(k-1.0);    
-                     data[k-1]=div*dr*data[k-2];
-                     for(l=1; (l<(k-1)); l++) {
-                         data[k-l-1]=div*((dr+l)*data[k-l-2]+(k-l-dr)*
-                                          data[k-l-1]);
-                     }
-                     data[0]=div*(1-dr)*data[0];
-                 }
-                 /* differentiate */
-                 ddata    = &(dtheta[j][i*order]);
-                 ddata[0] = -data[0];
-                 for(k=1; (k<order); k++) {
-                     ddata[k]=data[k-1]-data[k];
-                 }
-                 
-                 div=1.0/(order-1);
-                 data[order-1]=div*dr*data[order-2];
-                 for(l=1; (l<(order-1)); l++) {
-                     data[order-l-1]=div*((dr+l)*data[order-l-2]+
-                                          (order-l-dr)*data[order-l-1]);
-                 }
-                 data[0]=div*(1-dr)*data[0]; 
++    int  i,ii;
++    real *xptr;
++
++    for(i=0; i<nr; i++)
++    {
 +        /* With free energy we do not use the charge check.
 +         * In most cases this will be more efficient than calling make_bsplines
 +         * twice, since usually more than half the particles have charges.
 +         */
-     
++        ii = ind[i];
++        if (bFreeEnergy || charge[ii] != 0.0) {
++            xptr = fractx[ii];
++            switch(order) {
++            case 4:  CALC_SPLINE(4);     break;
++            case 5:  CALC_SPLINE(5);     break;
++            default: CALC_SPLINE(order); break;
 +            }
 +        }
 +    }
 +}
 +
 +
++/* Fill mod[0..ndata-1] with the squared modulus of the discrete Fourier
++ * transform of data[0..ndata-1]: mod[i] = |sum_j data[j]*exp(2*pi*I*i*j/ndata)|^2.
++ * Entries smaller than 1e-7 are afterwards replaced by the average of
++ * their two neighbours.
++ */
 +void make_dft_mod(real *mod,real *data,int ndata)
 +{
 +  int i,j;
 +  real sc,ss,arg;
-     
++
 +  for(i=0;i<ndata;i++) {
 +    sc=ss=0;
 +    for(j=0;j<ndata;j++) {
 +      arg=(2.0*M_PI*i*j)/ndata;
 +      sc+=data[j]*cos(arg);
 +      ss+=data[j]*sin(arg);
 +    }
 +    mod[i]=sc*sc+ss*ss;
 +  }
++  /* The transform is periodic in i with period ndata, so the neighbours
++   * of the first and last element are taken with wrap-around. This keeps
++   * the accesses inside mod[0..ndata-1]; for all other i the indices are
++   * simply i-1 and i+1.
++   */
 +  for(i=0;i<ndata;i++)
++  {
 +    if(mod[i]<1e-7)
++    {
++      mod[i]=(mod[(i-1+ndata)%ndata]+mod[(i+1)%ndata])*0.5;
++    }
++  }
 +}
 +
 +
 +
 +void make_bspline_moduli(splinevec bsp_mod,int nx,int ny,int nz,int order)
 +{
 +  int nmax=max(nx,max(ny,nz));
 +  real *data,*ddata,*bsp_data;
 +  int i,k,l;
 +  real div;
-           
++
 +  snew(data,order);
 +  snew(ddata,order);
 +  snew(bsp_data,nmax);
 +
 +  data[order-1]=0;
 +  data[1]=0;
 +  data[0]=1;
-   data[0]=div*data[0]; 
++
 +  for(k=3;k<order;k++) {
 +    div=1.0/(k-1.0);
 +    data[k-1]=0;
 +    for(l=1;l<(k-1);l++)
 +      data[k-l-1]=div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
 +    data[0]=div*data[0];
 +  }
 +  /* differentiate */
 +  ddata[0]=-data[0];
 +  for(k=1;k<order;k++)
 +    ddata[k]=data[k-1]-data[k];
 +  div=1.0/(order-1);
 +  data[order-1]=0;
 +  for(l=1;l<(order-1);l++)
 +    data[order-l-1]=div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
-     
++  data[0]=div*data[0];
 +
 +  for(i=0;i<nmax;i++)
 +    bsp_data[i]=0;
 +  for(i=1;i<=order;i++)
 +    bsp_data[i]=data[i-1];
-     } 
++
 +  make_dft_mod(bsp_mod[XX],bsp_data,nx);
 +  make_dft_mod(bsp_mod[YY],bsp_data,ny);
 +  make_dft_mod(bsp_mod[ZZ],bsp_data,nz);
 +
 +  sfree(data);
 +  sfree(ddata);
 +  sfree(bsp_data);
 +}
 +
 +static void setup_coordinate_communication(pme_atomcomm_t *atc)
 +{
 +  int nslab,n,i;
 +  int fw,bw;
 +
 +  nslab = atc->nslab;
 +
 +  n = 0;
 +  for(i=1; i<=nslab/2; i++) {
 +    fw = (atc->nodeid + i) % nslab;
 +    bw = (atc->nodeid - i + nslab) % nslab;
 +    if (n < nslab - 1) {
 +      atc->node_dest[n] = fw;
 +      atc->node_src[n]  = bw;
 +      n++;
-       
-     sfree((*pmedata)->pmegridA);
++    }
 +    if (n < nslab - 1) {
 +      atc->node_dest[n] = bw;
 +      atc->node_src[n]  = fw;
 +      n++;
 +    }
 +  }
 +}
 +
 +int gmx_pme_destroy(FILE *log,gmx_pme_t *pmedata)
 +{
++    int thread;
++
 +    if(NULL != log)
 +    {
 +        fprintf(log,"Destroying PME data structures.\n");
 +    }
 +
 +    sfree((*pmedata)->nnx);
 +    sfree((*pmedata)->nny);
 +    sfree((*pmedata)->nnz);
-     
-     if((*pmedata)->pmegridB)
++
++    pmegrids_destroy(&(*pmedata)->pmegridA);
++
 +    sfree((*pmedata)->fftgridA);
 +    sfree((*pmedata)->cfftgridA);
 +    gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupA);
-         sfree((*pmedata)->pmegridB);
++
++    if ((*pmedata)->pmegridB.grid.grid != NULL)
 +    {
-     sfree((*pmedata)->work_mhz);
-     sfree((*pmedata)->work_m2);
-     sfree((*pmedata)->work_denom);
-     sfree((*pmedata)->work_tmp1_alloc);
-     sfree((*pmedata)->work_m2inv);
-       
++        pmegrids_destroy(&(*pmedata)->pmegridB);
 +        sfree((*pmedata)->fftgridB);
 +        sfree((*pmedata)->cfftgridB);
 +        gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB);
 +    }
-   
++    for(thread=0; thread<(*pmedata)->nthread; thread++)
++    {
++        free_work(&(*pmedata)->work[thread]);
++    }
++    sfree((*pmedata)->work);
++
 +    sfree(*pmedata);
 +    *pmedata = NULL;
-     int nk,k,s;
++
 +  return 0;
 +}
 +
++/* Return the smallest multiple of f that is not smaller than n.
++ * NOTE(review): relies on truncating integer division, so this presumably
++ * expects n >= 0 and f > 0 - confirm against the callers.
++ */
 +static int mult_up(int n,int f)
 +{
 +    return ((n + f - 1)/f)*f;
 +}
 +
 +
++/* Return the ratio between the weighted amount of grid work after rounding
++ * the grid dimensions up to multiples of the node counts (nnodes_major,
++ * nnodes_minor) and the amount of work for the unpadded nkx*nky*nkz grid.
++ * The three padded sizes n1, n2 and n3 are weighted 1, 1 and 3 and
++ * normalized by 6 times the real grid size.
++ */
 +static double pme_load_imbalance(gmx_pme_t pme)
 +{
 +    int    nma,nmi;
 +    double n1,n2,n3;
 +
 +    nma = pme->nnodes_major;
 +    nmi = pme->nnodes_minor;
 +
++    /* Multiply in double precision: the product of three grid dimensions
++     * (times 6 below) can exceed the range of int for large grids.
++     */
++    n1 = (double)mult_up(pme->nkx,nma)*mult_up(pme->nky,nmi)*pme->nkz;
++    n2 = (double)mult_up(pme->nkx,nma)*mult_up(pme->nkz,nmi)*pme->nky;
++    n3 = (double)mult_up(pme->nky,nma)*mult_up(pme->nkz,nmi)*pme->nkx;
 +
 +    /* pme_solve is roughly double the cost of an fft */
 +
++    return (n1 + n2 + 3*n3)/(6.0*pme->nkx*pme->nky*pme->nkz);
 +}
 +
 +static void init_atomcomm(gmx_pme_t pme,pme_atomcomm_t *atc, t_commrec *cr,
 +                          int dimind,gmx_bool bSpread)
 +{
-         
-         snew(atc->count,atc->nslab);
++    int nk,k,s,thread;
 +
 +    atc->dimind = dimind;
 +    atc->nslab  = 1;
 +    atc->nodeid = 0;
 +    atc->pd_nalloc = 0;
 +#ifdef GMX_MPI
 +    if (pme->nnodes > 1)
 +    {
 +        atc->mpi_comm = pme->mpi_comm_d[dimind];
 +        MPI_Comm_size(atc->mpi_comm,&atc->nslab);
 +        MPI_Comm_rank(atc->mpi_comm,&atc->nodeid);
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug,"For PME atom communication in dimind %d: nslab %d rank %d\n",atc->dimind,atc->nslab,atc->nodeid);
 +    }
 +#endif
 +
 +    atc->bSpread   = bSpread;
 +    atc->pme_order = pme->pme_order;
 +
 +    if (atc->nslab > 1)
 +    {
 +        /* These three allocations are not required for particle decomp. */
 +        snew(atc->node_dest,atc->nslab);
 +        snew(atc->node_src,atc->nslab);
 +        setup_coordinate_communication(atc);
- static void 
++
++        snew(atc->count_thread,pme->nthread);
++        for(thread=0; thread<pme->nthread; thread++)
++        {
++            snew(atc->count_thread[thread],atc->nslab);
++        }
++        atc->count = atc->count_thread[0];
 +        snew(atc->rcount,atc->nslab);
 +        snew(atc->buf_index,atc->nslab);
 +    }
++
++    atc->nthread = pme->nthread;
++    if (atc->nthread > 1)
++    {
++        snew(atc->thread_plist,atc->nthread);
++    }
++    snew(atc->spline,atc->nthread);
++    for(thread=0; thread<atc->nthread; thread++)
++    {
++        if (atc->nthread > 1)
++        {
++            snew(atc->thread_plist[thread].n,atc->nthread+2*GMX_CACHE_SEP);
++            atc->thread_plist[thread].n += GMX_CACHE_SEP;
++        }
++    }
 +}
 +
-                   MPI_Comm         comm,  
++static void
 +init_overlap_comm(pme_overlap_t *  ol,
 +                  int              norder,
 +#ifdef GMX_MPI
-                   int              nnodes, 
++                  MPI_Comm         comm,
 +#endif
-                   int              ndata)
++                  int              nnodes,
 +                  int              nodeid,
-     
++                  int              ndata,
++                  int              commplainsize)
 +{
 +    int lbnd,rbnd,maxlr,b,i;
 +    int exten;
 +    int nn,nk;
 +    pme_grid_comm_t *pgc;
 +    gmx_bool bCont;
 +    int fft_start,fft_end,send_index1,recv_index1;
-     
++
 +#ifdef GMX_MPI
 +    ol->mpi_comm = comm;
 +#endif
-     for(i=0; i<nnodes; i++) 
++
 +    ol->nnodes = nnodes;
 +    ol->nodeid = nodeid;
 +
 +    /* Linear translation of the PME grid won't affect reciprocal space
 +     * calculations, so to optimize we only interpolate "upwards",
 +     * which also means we only have to consider overlap in one direction.
 +     * I.e., particles on this node might also be spread to grid indices
 +     * that belong to higher nodes (modulo nnodes)
 +     */
 +
 +    snew(ol->s2g0,ol->nnodes+1);
 +    snew(ol->s2g1,ol->nnodes);
 +    if (debug) { fprintf(debug,"PME slab boundaries:"); }
-                     fsh[i] = -1; 
++    for(i=0; i<nnodes; i++)
 +    {
 +        /* s2g0 the local interpolation grid start.
 +         * s2g1 the local interpolation grid end.
 +         * Because grid overlap communication only goes forward,
 +         * the grid slabs for the FFTs should be rounded down.
 +         */
 +        ol->s2g0[i] = ( i   *ndata + 0       )/nnodes;
 +        ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
 +
 +        if (debug)
 +        {
 +            fprintf(debug,"  %3d %3d",ol->s2g0[i],ol->s2g1[i]);
 +        }
 +    }
 +    ol->s2g0[nnodes] = ndata;
 +    if (debug) { fprintf(debug,"\n"); }
 +
 +    /* Determine with how many nodes we need to communicate the grid overlap */
 +    b = 0;
 +    do
 +    {
 +        b++;
 +        bCont = FALSE;
 +        for(i=0; i<nnodes; i++)
 +        {
 +            if ((i+b <  nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
 +                (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
 +            {
 +                bCont = TRUE;
 +            }
 +        }
 +    }
 +    while (bCont && b < nnodes);
 +    ol->noverlap_nodes = b - 1;
 +
 +    snew(ol->send_id,ol->noverlap_nodes);
 +    snew(ol->recv_id,ol->noverlap_nodes);
 +    for(b=0; b<ol->noverlap_nodes; b++)
 +    {
 +        ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
 +        ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
 +    }
 +    snew(ol->comm_data, ol->noverlap_nodes);
 +
 +    for(b=0; b<ol->noverlap_nodes; b++)
 +    {
 +        pgc = &ol->comm_data[b];
 +        /* Send */
 +        fft_start        = ol->s2g0[ol->send_id[b]];
 +        fft_end          = ol->s2g0[ol->send_id[b]+1];
 +        if (ol->send_id[b] < nodeid)
 +        {
 +            fft_start += ndata;
 +            fft_end   += ndata;
 +        }
 +        send_index1      = ol->s2g1[nodeid];
 +        send_index1      = min(send_index1,fft_end);
 +        pgc->send_index0 = fft_start;
 +        pgc->send_nindex = max(0,send_index1 - pgc->send_index0);
 +
 +        /* We always start receiving to the first index of our slab */
 +        fft_start        = ol->s2g0[ol->nodeid];
 +        fft_end          = ol->s2g0[ol->nodeid+1];
 +        recv_index1      = ol->s2g1[ol->recv_id[b]];
 +        if (ol->recv_id[b] > nodeid)
 +        {
 +            recv_index1 -= ndata;
 +        }
 +        recv_index1      = min(recv_index1,fft_end);
 +        pgc->recv_index0 = fft_start;
 +        pgc->recv_nindex = max(0,recv_index1 - pgc->recv_index0);
 +    }
++
++    /* For a non-divisible grid we need pme_order instead of pme_order-1 */
++    snew(ol->sendbuf,norder*commplainsize);
++    snew(ol->recvbuf,norder*commplainsize);
 +}
 +
 +static void
 +make_gridindex5_to_localindex(int n,int local_start,int local_range,
 +                              int **global_to_local,
 +                              real **fraction_shift)
 +{
 +    int i;
 +    int * gtl;
 +    real * fsh;
 +
 +    snew(gtl,5*n);
 +    snew(fsh,5*n);
 +    for(i=0; (i<5*n); i++)
 +    {
 +        /* Determine the global to local grid index */
 +        gtl[i] = (i - local_start + n) % n;
 +        /* For coordinates that fall within the local grid the fraction
 +         * is correct, we don't need to shift it.
 +         */
 +        fsh[i] = 0;
 +        if (local_range < n)
 +        {
 +            /* Due to rounding issues i could be 1 beyond the lower or
 +             * upper boundary of the local grid. Correct the index for this.
 +             * If we shift the index, we need to shift the fraction by
 +             * the same amount in the other direction to not affect
 +             * the weights.
 +             * Note that due to this shifting the weights at the end of
 +             * the spline might change, but that will only involve values
 +             * between zero and values close to the precision of a real,
 +             * which is anyhow the accuracy of the whole mesh calculation.
 +             */
 +            /* With local_range=0 we should not change i=local_start */
 +            if (i % n != local_start)
 +            {
 +                if (gtl[i] == n-1)
 +                {
 +                    gtl[i] = 0;
-         
++                    fsh[i] = -1;
 +                }
 +                else if (gtl[i] == local_range)
 +                {
 +                    gtl[i] = local_range - 1;
 +                    fsh[i] = 1;
 +                }
 +            }
 +        }
 +    }
 +
 +    *global_to_local = gtl;
 +    *fraction_shift  = fsh;
 +}
 +
++/* Fill work->mask_SSE0[] and work->mask_SSE1[] with lane masks for every
++ * offset of = 0..8-order. For a given offset, lanes of..of+order-1 out of
++ * 8 consecutive floats are selected; mask_SSE0 holds the mask for lanes
++ * 0-3 and mask_SSE1 the mask for lanes 4-7.
++ * The body is only compiled when PME_SSE is defined; otherwise this
++ * function does nothing.
++ */
++static void sse_mask_init(pme_spline_work_t *work,int order)
++{
++#ifdef PME_SSE
++    float  tmp[8];
++    __m128 zero_SSE;
++    int    of,i;
++
++    zero_SSE = _mm_setzero_ps();
++
++    for(of=0; of<8-(order-1); of++)
++    {
++        /* 1 for the order lanes starting at of, 0 elsewhere */
++        for(i=0; i<8; i++)
++        {
++            tmp[i] = (i >= of && i < of+order ? 1 : 0);
++        }
++        work->mask_SSE0[of] = _mm_loadu_ps(tmp);
++        work->mask_SSE1[of] = _mm_loadu_ps(tmp+4);
++        /* Turn the 1/0 floats into all-ones/all-zeros bit masks */
++        work->mask_SSE0[of] = _mm_cmpgt_ps(work->mask_SSE0[of],zero_SSE);
++        work->mask_SSE1[of] = _mm_cmpgt_ps(work->mask_SSE1[of],zero_SSE);
++    }
++#endif
++}
++
 +/* Make the PME grid size *nk in dimension dim divisible by nnodes.
 + *
 + * When *nk is not a multiple of nnodes, the next multiple nk_new is
 + * computed. If that would make the grid at least 1.5 times as large
 + * (2*nk_new >= 3*nk) gmx_fatal is called. Otherwise a note is written
 + * to fplog, unless fplog is NULL, and *nk is overwritten with nk_new.
 + * dim is only used as the dimension label in the messages.
 + */
 +static void
 +gmx_pme_check_grid_restrictions(FILE *fplog,char dim,int nnodes,int *nk)
 +{
 +    int nk_new;
 +
 +    if (*nk % nnodes != 0)
 +    {
 +        /* Round up to the next multiple of nnodes */
 +        nk_new = nnodes*(*nk/nnodes + 1);
 +
 +        if (2*nk_new >= 3*(*nk))
 +        {
 +            gmx_fatal(FARGS,"The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). The grid size would have to be increased by more than 50%% to make the grid divisible. Change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).",
 +                      dim,*nk,dim,nnodes,dim);
 +        }
-             
++
 +        if (fplog != NULL)
 +        {
 +            fprintf(fplog,"\nNOTE: The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). Increasing the PME grid size in dim %c to %d. This will increase the accuracy and will not decrease the performance significantly on this number of nodes. For optimal performance change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).\n\n",
 +                    dim,*nk,dim,nnodes,dim,nk_new,dim);
 +        }
-                  gmx_bool                bFreeEnergy,
-                  gmx_bool                bReproducible)
++
 +        *nk = nk_new;
 +    }
 +}
 +
 +int gmx_pme_init(gmx_pme_t *         pmedata,
 +                 t_commrec *         cr,
 +                 int                 nnodes_major,
 +                 int                 nnodes_minor,
 +                 t_inputrec *        ir,
 +                 int                 homenr,
-     
++                 gmx_bool            bFreeEnergy,
++                 gmx_bool            bReproducible,
++                 int                 nthread)
 +{
 +    gmx_pme_t pme=NULL;
-     int bufsizex,bufsizey,bufsize;
++
 +    pme_atomcomm_t *atc;
-     
 +    ivec ndata;
-         
++
 +    if (debug)
 +        fprintf(debug,"Creating PME data structures.\n");
 +    snew(pme,1);
-     
++
 +    pme->redist_init         = FALSE;
 +    pme->sum_qgrid_tmp       = NULL;
 +    pme->sum_qgrid_dd_tmp    = NULL;
 +    pme->buf_nalloc          = 0;
 +    pme->redist_buf_nalloc   = 0;
-     
++
 +    pme->nnodes              = 1;
 +    pme->bPPnode             = TRUE;
-     if (nnodes_major*nnodes_minor > 1 && PAR(cr)) 
++
 +    pme->nnodes_major        = nnodes_major;
 +    pme->nnodes_minor        = nnodes_minor;
 +
 +#ifdef GMX_MPI
-         
++    if (PAR(cr))
 +    {
 +        pme->mpi_comm        = cr->mpi_comm_mygroup;
-         if (pme->nnodes != nnodes_major*nnodes_minor)
-         {
-             gmx_incons("PME node count mismatch");
-         }
++
 +        MPI_Comm_rank(pme->mpi_comm,&pme->nodeid);
 +        MPI_Comm_size(pme->mpi_comm,&pme->nnodes);
-             pme->mpi_comm_d[1] = NULL;
 +    }
 +#endif
 +
 +    if (pme->nnodes == 1)
 +    {
 +        pme->ndecompdim = 0;
 +        pme->nodeid_major = 0;
 +        pme->nodeid_minor = 0;
++#ifdef GMX_MPI
++        pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL;
++#endif
 +    }
 +    else
 +    {
 +        if (nnodes_minor == 1)
 +        {
 +#ifdef GMX_MPI
 +            pme->mpi_comm_d[0] = pme->mpi_comm;
-             
++            pme->mpi_comm_d[1] = MPI_COMM_NULL;
 +#endif
 +            pme->ndecompdim = 1;
 +            pme->nodeid_major = pme->nodeid;
 +            pme->nodeid_minor = 0;
-             pme->mpi_comm_d[0] = NULL;
++
 +        }
 +        else if (nnodes_major == 1)
 +        {
 +#ifdef GMX_MPI
-         else 
++            pme->mpi_comm_d[0] = MPI_COMM_NULL;
 +            pme->mpi_comm_d[1] = pme->mpi_comm;
 +#endif
 +            pme->ndecompdim = 1;
 +            pme->nodeid_major = 0;
 +            pme->nodeid_minor = pme->nodeid;
 +        }
-             
++        else
 +        {
 +            if (pme->nnodes % nnodes_major != 0)
 +            {
 +                gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
 +            }
 +            pme->ndecompdim = 2;
-             
++
 +#ifdef GMX_MPI
 +            MPI_Comm_split(pme->mpi_comm,pme->nodeid % nnodes_minor,
 +                           pme->nodeid,&pme->mpi_comm_d[0]);  /* My communicator along major dimension */
 +            MPI_Comm_split(pme->mpi_comm,pme->nodeid/nnodes_minor,
 +                           pme->nodeid,&pme->mpi_comm_d[1]);  /* My communicator along minor dimension */
-     
++
 +            MPI_Comm_rank(pme->mpi_comm_d[0],&pme->nodeid_major);
 +            MPI_Comm_size(pme->mpi_comm_d[0],&pme->nnodes_major);
 +            MPI_Comm_rank(pme->mpi_comm_d[1],&pme->nodeid_minor);
 +            MPI_Comm_size(pme->mpi_comm_d[1],&pme->nnodes_minor);
 +#endif
 +        }
 +        pme->bPPnode = (cr->duty & DUTY_PP);
 +    }
-     
++
++    pme->nthread = nthread;
++
 +    if (ir->ePBC == epbcSCREW)
 +    {
 +        gmx_fatal(FARGS,"pme does not (yet) work with pbc = screw");
 +    }
-     
++
 +    pme->bFEP        = ((ir->efep != efepNO) && bFreeEnergy);
 +    pme->nkx         = ir->nkx;
 +    pme->nky         = ir->nky;
 +    pme->nkz         = ir->nkz;
 +    pme->pme_order   = ir->pme_order;
 +    pme->epsilon_r   = ir->epsilon_r;
-     if (pme->nkx <= pme->pme_order*(pme->nnodes_major > 1 ? 2 : 1) ||
-         pme->nky <= pme->pme_order*(pme->nnodes_minor > 1 ? 2 : 1) ||
-         pme->nkz <= pme->pme_order)
++
++    if (pme->pme_order > PME_ORDER_MAX)
++    {
++        gmx_fatal(FARGS,"pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
++                  pme->pme_order,PME_ORDER_MAX);
++    }
++
 +    /* Currently pme.c supports only the fft5d FFT code.
 +     * Therefore the grid always needs to be divisible by nnodes.
 +     * When the old 1D code is also supported again, change this check.
 +     *
 +     * This check should be done before calling gmx_pme_init
 +     * and fplog should be passed iso stderr.
 +     *
 +    if (pme->ndecompdim >= 2)
 +    */
 +    if (pme->ndecompdim >= 1)
 +    {
 +        /*
 +        gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
 +                                        'x',nnodes_major,&pme->nkx);
 +        gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
 +                                        'y',nnodes_minor,&pme->nky);
 +        */
 +    }
 +
-         gmx_fatal(FARGS,"The pme grid dimensions need to be larger than pme_order (%d) and in parallel larger than 2*pme_order for x and/or y",pme->pme_order);
-     }
++    if (pme->nkx <= pme->pme_order*(pme->nnodes_major > 1 ? 2 : 1) ||
++        pme->nky <= pme->pme_order*(pme->nnodes_minor > 1 ? 2 : 1) ||
++        pme->nkz <= pme->pme_order)
++    {
++        gmx_fatal(FARGS,"The pme grid dimensions need to be larger than pme_order (%d) and in parallel larger than 2*pme_ordern for x and/or y",pme->pme_order);
++    }
++
++    if (pme->nnodes > 1) {
++        double imbal;
++
++#ifdef GMX_MPI
++        MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
++        MPI_Type_commit(&(pme->rvec_mpi));
++#endif
++
++        /* Note that the charge spreading and force gathering, which usually
++         * takes about the same amount of time as FFT+solve_pme,
++         * is always fully load balanced
++         * (unless the charge distribution is inhomogeneous).
++         */
++
++        imbal = pme_load_imbalance(pme);
++        if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
++        {
++            fprintf(stderr,
++                    "\n"
++                    "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
++                    "      For optimal PME load balancing\n"
++                    "      PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
++                    "      and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
++                    "\n",
++                    (int)((imbal-1)*100 + 0.5),
++                    pme->nkx,pme->nky,pme->nnodes_major,
++                    pme->nky,pme->nkz,pme->nnodes_minor);
++        }
++    }
++
++    /* For non-divisible grid we need pme_order iso pme_order-1 */
++    /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
++     * y is always copied through a buffer: we don't need padding in z,
++     * but we do need the overlap in x because of the communication order.
++     */
++    init_overlap_comm(&pme->overlap[0],pme->pme_order,
++#ifdef GMX_MPI
++                      pme->mpi_comm_d[0],
++#endif
++                      pme->nnodes_major,pme->nodeid_major,
++                      pme->nkx,
++                      (div_round_up(pme->nky,pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
++
++    init_overlap_comm(&pme->overlap[1],pme->pme_order,
++#ifdef GMX_MPI
++                      pme->mpi_comm_d[1],
++#endif
++                      pme->nnodes_minor,pme->nodeid_minor,
++                      pme->nky,
++                      (div_round_up(pme->nkx,pme->nnodes_major)+pme->pme_order)*pme->nkz);
++
++    /* Check for a limitation of the (current) sum_fftgrid_dd code */
++    if (pme->nthread > 1 &&
++        (pme->overlap[0].noverlap_nodes > 1 ||
++         pme->overlap[1].noverlap_nodes > 1))
++    {
++        gmx_fatal(FARGS,"With threads the number of grid lines per node along x and or y should be pme_order (%d) or more or exactly pme_order-1",pme->pme_order);
++    }
++
++    snew(pme->bsp_mod[XX],pme->nkx);
++    snew(pme->bsp_mod[YY],pme->nky);
++    snew(pme->bsp_mod[ZZ],pme->nkz);
++
++    /* The required size of the interpolation grid, including overlap.
++     * The allocated size (pmegrid_n?) might be slightly larger.
++     */
++    pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
++                      pme->overlap[0].s2g0[pme->nodeid_major];
++    pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
++                      pme->overlap[1].s2g0[pme->nodeid_minor];
++    pme->pmegrid_nz_base = pme->nkz;
++    pme->pmegrid_nz = pme->pmegrid_nz_base + pme->pme_order - 1;
++    set_grid_alignment(&pme->pmegrid_nz,pme->pme_order);
++
++    pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
++    pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
++    pme->pmegrid_start_iz = 0;
++
++    make_gridindex5_to_localindex(pme->nkx,
++                                  pme->pmegrid_start_ix,
++                                  pme->pmegrid_nx - (pme->pme_order-1),
++                                  &pme->nnx,&pme->fshx);
++    make_gridindex5_to_localindex(pme->nky,
++                                  pme->pmegrid_start_iy,
++                                  pme->pmegrid_ny - (pme->pme_order-1),
++                                  &pme->nny,&pme->fshy);
++    make_gridindex5_to_localindex(pme->nkz,
++                                  pme->pmegrid_start_iz,
++                                  pme->pmegrid_nz_base,
++                                  &pme->nnz,&pme->fshz);
++
++    pmegrids_init(&pme->pmegridA,
++                  pme->pmegrid_nx,pme->pmegrid_ny,pme->pmegrid_nz,
++                  pme->pmegrid_nz_base,
++                  pme->pme_order,
++                  pme->nthread,
++                  pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
++                  pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
++
++    sse_mask_init(&pme->spline_work,pme->pme_order);
++
++    ndata[0] = pme->nkx;
++    ndata[1] = pme->nky;
++    ndata[2] = pme->nkz;
++
++    /* This routine will allocate the grid data to fit the FFTs */
++    gmx_parallel_3dfft_init(&pme->pfft_setupA,ndata,
++                            &pme->fftgridA,&pme->cfftgridA,
++                            pme->mpi_comm_d,
++                            pme->overlap[0].s2g0,pme->overlap[1].s2g0,
++                            bReproducible,pme->nthread);
++
++    if (bFreeEnergy)
++    {
++        pmegrids_init(&pme->pmegridB,
++                      pme->pmegrid_nx,pme->pmegrid_ny,pme->pmegrid_nz,
++                      pme->pmegrid_nz_base,
++                      pme->pme_order,
++                      pme->nthread,
++                      pme->nkx % pme->nnodes_major != 0,
++                      pme->nky % pme->nnodes_minor != 0);
++
++        gmx_parallel_3dfft_init(&pme->pfft_setupB,ndata,
++                                &pme->fftgridB,&pme->cfftgridB,
++                                pme->mpi_comm_d,
++                                pme->overlap[0].s2g0,pme->overlap[1].s2g0,
++                                bReproducible,pme->nthread);
++    }
++    else
++    {
++        pme->pmegridB.grid.grid = NULL;
++        pme->fftgridB           = NULL;
++        pme->cfftgridB          = NULL;
++    }
++
++    make_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
++
++    /* Use atc[0] for spreading */
++    init_atomcomm(pme,&pme->atc[0],cr,nnodes_major > 1 ? 0 : 1,TRUE);
++    if (pme->ndecompdim >= 2)
++    {
++        init_atomcomm(pme,&pme->atc[1],cr,1,FALSE);
++    }
++
++    if (pme->nnodes == 1) {
++        pme->atc[0].n = homenr;
++        pme_realloc_atomcomm_things(&pme->atc[0]);
++    }
++
++    {
++        int thread;
++
++        /* Use fft5d, order after FFT is y major, z, x minor */
++
++        snew(pme->work,pme->nthread);
++        for(thread=0; thread<pme->nthread; thread++)
++        {
++            realloc_work(&pme->work[thread],pme->nkx);
++        }
++    }
++
++    *pmedata = pme;
++
++    return 0;
++}
++
++
++/* Copy the part of one thread's spreading grid that lies within that
++ * grid's own region (its size minus order-1 points per dimension, clipped
++ * to the local FFT grid) into fftgrid.
++ *
++ * pmegrids->grid_th[thread] is the source. fftgrid is the destination and
++ * is indexed with the sizes returned by gmx_parallel_3dfft_real_limits.
++ * Values are assigned, not added, so this also initializes the copied
++ * region of fftgrid.
++ */
++static void copy_local_grid(gmx_pme_t pme,
++                            pmegrids_t *pmegrids,int thread,real *fftgrid)
++{
++    ivec fft_ndata,fft_offset,fft_size;
++    ivec ncopy;
++    int  dim,ix,iy,iz;
++    int  dst_ny,dst_nz;
++    int  src_ny,src_nz;
++    int  dst0,src0;
++    const pmegrid_t *src;
++    const real *src_grid;
++
++    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
++                                   fft_ndata,
++                                   fft_offset,
++                                   fft_size);
++    dst_ny = fft_size[YY];
++    dst_nz = fft_size[ZZ];
++
++    src      = &pmegrids->grid_th[thread];
++    src_grid = src->grid;
++    src_ny   = src->n[YY];
++    src_nz   = src->n[ZZ];
++
++    /* Number of points to copy per dimension */
++    for(dim=0; dim<DIM; dim++)
++    {
++        ncopy[dim] = min(src->n[dim] - (src->order - 1),
++                         fft_ndata[dim] - src->offset[dim]);
++    }
++
++    for(ix=0; ix<ncopy[XX]; ix++)
++    {
++        /* Start of the first z-row of this x-slab in both grids */
++        dst0 = ((src->offset[XX] + ix)*dst_ny + src->offset[YY])*dst_nz
++               + src->offset[ZZ];
++        src0 = ix*src_ny*src_nz;
++
++        for(iy=0; iy<ncopy[YY]; iy++)
++        {
++            for(iz=0; iz<ncopy[ZZ]; iz++)
++            {
++                fftgrid[dst0+iz] = src_grid[src0+iz];
++            }
++            /* Advance both indices by one y-row */
++            dst0 += dst_nz;
++            src0 += src_nz;
++        }
++    }
++}
++
++/* Debug helper: print, for the first pulse along the major dimension,
++ * how many non-zero values sendbuf holds per (x,y) column.
++ *
++ * The first output line lists the y indices. Each following line is one
++ * x index and shows the count of non-zero z entries for every y.
++ * sendbuf is read consecutively, z running fastest, then y, then x.
++ */
++static void print_sendbuf(gmx_pme_t pme,real *sendbuf)
++{
++    ivec local_fft_ndata,local_fft_offset,local_fft_size;
++    const real *val;
++    int  nrow,ncol,ndepth;
++    int  row,col,k,nnonzero;
++
++    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
++                                   local_fft_ndata,
++                                   local_fft_offset,
++                                   local_fft_size);
++
++    /* Major dimension, first pulse */
++    nrow   = pme->overlap[0].comm_data[0].send_nindex;
++    ncol   = local_fft_ndata[YY];
++    ndepth = local_fft_ndata[ZZ];
++
++    for(col=0; col<ncol; col++)
++    {
++        printf(" %2d",col);
++    }
++    printf("\n");
++
++    val = sendbuf;
++    for(row=0; row<nrow; row++)
++    {
++        for(col=0; col<ncol; col++)
++        {
++            nnonzero = 0;
++            for(k=0; k<ndepth; k++)
++            {
++                if (*val != 0)
++                {
++                    nnonzero++;
++                }
++                val++;
++            }
++            printf(" %2d",nnonzero);
++        }
++        printf("\n");
++    }
++}
++
++/* Sum the spreading-grid contributions of other threads into the grid
++ * region of thread "thread".
++ *
++ * pmegrids   thread-local spreading grids (grid_th) and their layout
++ *            (nc, nthread_comm)
++ * thread     index into pmegrids->grid_th of the calling thread
++ * fftgrid    FFT grid; contributions that need no node communication
++ *            are added here
++ * commbuf_x  receives contributions that wrap around in x when
++ *            pme->nnodes_major > 1 (and y does not need communication)
++ * commbuf_y  receives contributions that wrap around in y when
++ *            pme->nnodes_minor > 1, including the x+y corner volume
++ *
++ * For each of the three buffer regions (x, y and xy) the first block
++ * written by this call is assigned; later blocks are added.
++ */
++static void
++reduce_threadgrid_overlap(gmx_pme_t pme,
++                          const pmegrids_t *pmegrids,int thread,
++                          real *fftgrid,real *commbuf_x,real *commbuf_y)
++{
++    ivec local_fft_ndata,local_fft_offset,local_fft_size;
++    int  fft_nx,fft_ny,fft_nz;
++    int  fft_my,fft_mz;
++    int  buf_my=-1;
++    int  nsx,nsy,nsz;
++    ivec ne;
++    int  offx,offy,offz,x,y,z,i0,i0t;
++    int  sx,sy,sz,fx,fy,fz,tx1,ty1,tz1,ox,oy,oz;
++    gmx_bool bClearBufX,bClearBufY,bClearBufXY,bClearBuf;
++    gmx_bool bCommX,bCommY;
++    int  d;
++    int  thread_f;
++    const pmegrid_t *pmegrid,*pmegrid_g,*pmegrid_f;
++    const real *grid_th;
++    real *commbuf=NULL;
++
++    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
++                                   local_fft_ndata,
++                                   local_fft_offset,
++                                   local_fft_size);
++    fft_nx = local_fft_ndata[XX];
++    fft_ny = local_fft_ndata[YY];
++    fft_nz = local_fft_ndata[ZZ];
++
++    fft_my = local_fft_size[YY];
++    fft_mz = local_fft_size[ZZ];
++
++    /* This routine is called when all threads have finished spreading.
++     * Here each thread sums grid contributions calculated by other threads
++     * to the thread local grid volume.
++     * To minimize the number of grid copying operations,
++     * this routine sums immediately from the pmegrid to the fftgrid.
++     */
++
++    /* Determine which part of the full node grid we should operate on,
++     * this is our thread local part of the full grid.
++     */
++    pmegrid = &pmegrids->grid_th[thread];
++
++    for(d=0; d<DIM; d++)
++    {
++        ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
++                    local_fft_ndata[d]);
++    }
++
++    offx = pmegrid->offset[XX];
++    offy = pmegrid->offset[YY];
++    offz = pmegrid->offset[ZZ];
++
++
++    bClearBufX  = TRUE;
++    bClearBufY  = TRUE;
++    bClearBufXY = TRUE;
++
++    /* Now loop over all the thread data blocks that contribute
++     * to the grid region we (our thread) are operating on.
++     */
++    /* Note that fft_nx/y is equal to the number of grid points
++     * between the first point of our node grid and the one of the next node.
++     */
++    for(sx=0; sx>=-pmegrids->nthread_comm[XX]; sx--)
++    {
++        fx = pmegrid->ci[XX] + sx;
++        ox = 0;
++        bCommX = FALSE;
++        if (fx < 0) {
++            fx += pmegrids->nc[XX];
++            ox -= fft_nx;
++            bCommX = (pme->nnodes_major > 1);
++        }
++        pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
++        ox += pmegrid_g->offset[XX];
++        if (!bCommX)
++        {
++            tx1 = min(ox + pmegrid_g->n[XX],ne[XX]);
++        }
++        else
++        {
++            tx1 = min(ox + pmegrid_g->n[XX],pme->pme_order);
++        }
++
++        for(sy=0; sy>=-pmegrids->nthread_comm[YY]; sy--)
++        {
++            fy = pmegrid->ci[YY] + sy;
++            oy = 0;
++            bCommY = FALSE;
++            if (fy < 0) {
++                fy += pmegrids->nc[YY];
++                oy -= fft_ny;
++                bCommY = (pme->nnodes_minor > 1);
++            }
++            pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
++            oy += pmegrid_g->offset[YY];
++            if (!bCommY)
++            {
++                ty1 = min(oy + pmegrid_g->n[YY],ne[YY]);
++            }
++            else
++            {
++                ty1 = min(oy + pmegrid_g->n[YY],pme->pme_order);
++            }
++
++            for(sz=0; sz>=-pmegrids->nthread_comm[ZZ]; sz--)
++            {
++                fz = pmegrid->ci[ZZ] + sz;
++                oz = 0;
++                if (fz < 0)
++                {
++                    fz += pmegrids->nc[ZZ];
++                    oz -= fft_nz;
++                }
++                pmegrid_g = &pmegrids->grid_th[fz];
++                oz += pmegrid_g->offset[ZZ];
++                tz1 = min(oz + pmegrid_g->n[ZZ],ne[ZZ]);
++
++                if (sx == 0 && sy == 0 && sz == 0)
++                {
++                    /* We have already added our local contribution
++                     * before calling this routine, so skip it here.
++                     */
++                    continue;
++                }
++
++                thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz;
++
++                pmegrid_f = &pmegrids->grid_th[thread_f];
++
++                grid_th = pmegrid_f->grid;
++
++                nsx = pmegrid_f->n[XX];
++                nsy = pmegrid_f->n[YY];
++                nsz = pmegrid_f->n[ZZ];
++
++#ifdef DEBUG_PME_REDUCE
++                printf("n%d t%d add %d  %2d %2d %2d  %2d %2d %2d  %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
++                       pme->nodeid,thread,thread_f,
++                       pme->pmegrid_start_ix,
++                       pme->pmegrid_start_iy,
++                       pme->pmegrid_start_iz,
++                       sx,sy,sz,
++                       offx-ox,tx1-ox,offx,tx1,
++                       offy-oy,ty1-oy,offy,ty1,
++                       offz-oz,tz1-oz,offz,tz1);
++#endif
++
++                if (!(bCommX || bCommY))
++                {
++                    /* Add the thread local grid to the node grid */
++                    for(x=offx; x<tx1; x++)
++                    {
++                        for(y=offy; y<ty1; y++)
++                        {
++                            i0  = (x*fft_my + y)*fft_mz;
++                            i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
++                            for(z=offz; z<tz1; z++)
++                            {
++                                fftgrid[i0+z] += grid_th[i0t+z];
++                            }
++                        }
++                    }
++                }
++                else
++                {
++                    /* The order of this conditional decides
++                     * where the corner volume gets stored with x+y decomp.
++                     */
++                    if (bCommY)
++                    {
++                        commbuf = commbuf_y;
++                        buf_my  = ty1 - offy;
++                        if (bCommX)
++                        {
++                            /* We index commbuf modulo the local grid size */
++                            commbuf += buf_my*fft_nx*fft_nz;
++
++                            bClearBuf  = bClearBufXY;
++                            bClearBufXY = FALSE;
++                        }
++                        else
++                        {
++                            bClearBuf  = bClearBufY;
++                            bClearBufY = FALSE;
++                        }
++                    }
++                    else
++                    {
++                        commbuf = commbuf_x;
++                        buf_my  = fft_ny;
++                        bClearBuf  = bClearBufX;
++                        bClearBufX = FALSE;
++                    }
++
++                    /* Copy to the communication buffer */
++                    for(x=offx; x<tx1; x++)
++                    {
++                        for(y=offy; y<ty1; y++)
++                        {
++                            i0  = (x*buf_my + y)*fft_nz;
++                            i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
++
++                            if (bClearBuf)
++                            {
++                                /* First access of commbuf, initialize it */
++                                for(z=offz; z<tz1; z++)
++                                {
++                                    commbuf[i0+z]  = grid_th[i0t+z];
++                                }
++                            }
++                            else
++                            {
++                                for(z=offz; z<tz1; z++)
++                                {
++                                    commbuf[i0+z] += grid_th[i0t+z];
++                                }
++                            }
++                        }
++                    }
++                }
++            }
++        }
++    }
++}
++
++
++/* Communicate the FFT-grid overlap between PME nodes and add what is
++ * received to fftgrid.
++ *
++ * The minor dimension (overlap[1]) is handled first, then the major
++ * dimension (overlap[0]); each uses a single MPI_Sendrecv pulse (ipulse 0).
++ * With decomposition in both dimensions, the extra size_yx x-rows received
++ * in the minor pulse are added to overlap[0].sendbuf, which is the buffer
++ * sent in the major pulse.
++ */
++static void sum_fftgrid_dd(gmx_pme_t pme,real *fftgrid)
++{
++    ivec local_fft_ndata,local_fft_offset,local_fft_size;
++    pme_overlap_t *overlap;
++    int  send_nindex;
++    int  recv_index0,recv_nindex;
++#ifdef GMX_MPI
++    MPI_Status stat;
++#endif
++    int  ipulse,send_id,recv_id,datasize,gridsize,size_yx;
++    real *sendptr,*recvptr;
++    int  x,y,z,indg,indb;
++
++    /* Note that this routine is only used for forward communication.
++     * Since the force gathering, unlike the charge spreading,
++     * can be trivially parallelized over the particles,
++     * the backwards process is much simpler and can use the "old"
++     * communication setup.
++     */
++
++    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
++                                   local_fft_ndata,
++                                   local_fft_offset,
++                                   local_fft_size);
++
++    /* Currently supports only a single communication pulse */
++
++/* for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++) */
++    if (pme->nnodes_minor > 1)
++    {
++        /* Minor dimension */
++        overlap = &pme->overlap[1];
++
++        if (pme->nnodes_major > 1)
++        {
++             size_yx = pme->overlap[0].comm_data[0].send_nindex;
++        }
++        else
++        {
++            size_yx = 0;
++        }
++        datasize = (local_fft_ndata[XX]+size_yx)*local_fft_ndata[ZZ];
++
++        ipulse = 0;
++
++        send_id = overlap->send_id[ipulse];
++        recv_id = overlap->recv_id[ipulse];
++        send_nindex   = overlap->comm_data[ipulse].send_nindex;
++        /* recv_index0   = overlap->comm_data[ipulse].recv_index0; */
++        recv_index0 = 0;
++        recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
++
++        sendptr = overlap->sendbuf;
++        recvptr = overlap->recvbuf;
++
++        /*
++        printf("node %d comm %2d x %2d x %2d\n",pme->nodeid,
++               local_fft_ndata[XX]+size_yx,send_nindex,local_fft_ndata[ZZ]);
++        printf("node %d send %f, %f\n",pme->nodeid,
++               sendptr[0],sendptr[send_nindex*datasize-1]);
++        */
++
++#ifdef GMX_MPI
++        MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
++                     send_id,ipulse,
++                     recvptr,recv_nindex*datasize,GMX_MPI_REAL,
++                     recv_id,ipulse,
++                     overlap->mpi_comm,&stat);
++#endif
++
++        /* Add the received rows for our own x range to the FFT grid */
++        for(x=0; x<local_fft_ndata[XX]; x++)
++        {
++            for(y=0; y<recv_nindex; y++)
++            {
++                indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
++                indb = (x*recv_nindex        + y)*local_fft_ndata[ZZ];
++                for(z=0; z<local_fft_ndata[ZZ]; z++)
++                {
++                    fftgrid[indg+z] += recvptr[indb+z];
++                }
++            }
++        }
++        if (pme->nnodes_major > 1)
++        {
++            /* Add the remaining size_yx received x-rows to the send
++             * buffer of the major dimension.
++             */
++            sendptr = pme->overlap[0].sendbuf;
++            for(x=0; x<size_yx; x++)
++            {
++                for(y=0; y<recv_nindex; y++)
++                {
++                    indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
++                    indb = ((local_fft_ndata[XX] + x)*recv_nindex +y)*local_fft_ndata[ZZ];
++                    for(z=0; z<local_fft_ndata[ZZ]; z++)
++                    {
++                        sendptr[indg+z] += recvptr[indb+z];
++                    }
++                }
++            }
++        }
++    }
++
++    /* for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++) */
++    if (pme->nnodes_major > 1)
 +    {
-     if (pme->nnodes > 1) {
-         double imbal;
++        /* Major dimension */
++        overlap = &pme->overlap[0];
 +
-         MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
-         MPI_Type_commit(&(pme->rvec_mpi));
++        datasize = local_fft_ndata[YY]*local_fft_ndata[ZZ];
++        gridsize = local_fft_size[YY] *local_fft_size[ZZ];
++
++        ipulse = 0;
++
++        send_id = overlap->send_id[ipulse];
++        recv_id = overlap->recv_id[ipulse];
++        send_nindex   = overlap->comm_data[ipulse].send_nindex;
++        /* recv_index0   = overlap->comm_data[ipulse].recv_index0; */
++        recv_index0 = 0;
++        recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
++
++        sendptr = overlap->sendbuf;
++        recvptr = overlap->recvbuf;
++
++        if (debug != NULL)
++        {
++            fprintf(debug,"PME fftgrid comm %2d x %2d x %2d\n",
++                   send_nindex,local_fft_ndata[YY],local_fft_ndata[ZZ]);
++        }
 +
 +#ifdef GMX_MPI
-         
-         /* Note that the charge spreading and force gathering, which usually
-          * takes about the same amount of time as FFT+solve_pme,
-          * is always fully load balanced
-          * (unless the charge distribution is inhomogeneous).
-          */
-         
-         imbal = pme_load_imbalance(pme);
-         if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
++        MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
++                     send_id,ipulse,
++                     recvptr,recv_nindex*datasize,GMX_MPI_REAL,
++                     recv_id,ipulse,
++                     overlap->mpi_comm,&stat);
 +#endif
-             fprintf(stderr,
-                     "\n"
-                     "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
-                     "      For optimal PME load balancing\n"
-                     "      PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
-                     "      and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
-                     "\n",
-                     (int)((imbal-1)*100 + 0.5),
-                     pme->nkx,pme->nky,pme->nnodes_major,
-                     pme->nky,pme->nkz,pme->nnodes_minor);
++
++        for(x=0; x<recv_nindex; x++)
 +        {
-     init_overlap_comm(&pme->overlap[0],pme->pme_order,
- #ifdef GMX_MPI
-                       pme->mpi_comm_d[0],
++            for(y=0; y<local_fft_ndata[YY]; y++)
++            {
++                indg = (x*local_fft_size[YY]  + y)*local_fft_size[ZZ];
++                indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
++                for(z=0; z<local_fft_ndata[ZZ]; z++)
++                {
++                    fftgrid[indg+z] += recvptr[indb+z];
++                }
++            }
 +        }
 +    }
++}
 +
-                       pme->nnodes_major,pme->nodeid_major,pme->nkx);
-     
-     init_overlap_comm(&pme->overlap[1],pme->pme_order,
- #ifdef GMX_MPI
-                       pme->mpi_comm_d[1],
++/* Compute B-spline data for the atoms in atc and/or spread their charges,
++ * using pme->nthread OpenMP threads.
++ *
++ * bCalcSplines: when set, calc_interpolation_idx is run on a per-thread
++ *               slice of the atoms and make_bsplines is called.
++ * bSpread:      when set, spread_q_bsplines_thread is called.
++ * grids:        with grids->nthread == 1 every thread iteration uses
++ *               atc->spline[0] and grids->grid; otherwise thread t uses
++ *               atc->spline[t] (filled by make_thread_local_ind) and
++ *               grids->grid_th[t].
++ * fftgrid:      only handed on to copy_local_grid,
++ *               reduce_threadgrid_overlap and sum_fftgrid_dd, all of which
++ *               are reached only when grids->nthread > 1.
++ *
++ * The PME_TIME_THREADS and PME_TIME_SPREAD sections only accumulate and
++ * print cycle counts.
++ *
++ * NOTE(review): grids is dereferenced unconditionally (grids->nthread), yet
++ * gmx_pme_calc_energy passes NULL for it - confirm that path cannot crash.
++ */
++static void spread_on_grid(gmx_pme_t pme,
++                           pme_atomcomm_t *atc,pmegrids_t *grids,
++                           gmx_bool bCalcSplines,gmx_bool bSpread,
++                           real *fftgrid)
++{
++    int nthread,thread;
++#ifdef PME_TIME_THREADS
++    gmx_cycles_t c1,c2,c3,ct1a,ct1b,ct1c;
++    static double cs1=0,cs2=0,cs3=0;
++    static double cs1a[6]={0,0,0,0,0,0};
++    static int cnt=0;
 +#endif
-                       pme->nnodes_minor,pme->nodeid_minor,pme->nky);
-     
-     snew(pme->bsp_mod[XX],pme->nkx);
-     snew(pme->bsp_mod[YY],pme->nky);
-     snew(pme->bsp_mod[ZZ],pme->nkz);
-     
-     /* Allocate data for the interpolation grid, including overlap */
-     pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
-                       pme->overlap[0].s2g0[pme->nodeid_major];
-     pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] - 
-                       pme->overlap[1].s2g0[pme->nodeid_minor];
-     pme->pmegrid_nz = pme->nkz + pme->pme_order - 1;
-     
-     pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
-     pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
-     pme->pmegrid_start_iz = 0;
-     
-     make_gridindex5_to_localindex(pme->nkx,
-                                   pme->pmegrid_start_ix,
-                                   pme->pmegrid_nx - (pme->pme_order-1),
-                                   &pme->nnx,&pme->fshx);
-     make_gridindex5_to_localindex(pme->nky,
-                                   pme->pmegrid_start_iy,
-                                   pme->pmegrid_ny - (pme->pme_order-1),
-                                   &pme->nny,&pme->fshy);
-     make_gridindex5_to_localindex(pme->nkz,
-                                   pme->pmegrid_start_iz,
-                                   pme->pmegrid_nz - (pme->pme_order-1),
-                                   &pme->nnz,&pme->fshz);
-     
-     snew(pme->pmegridA,pme->pmegrid_nx*pme->pmegrid_ny*pme->pmegrid_nz);
-     
-     /* For non-divisible grid we need pme_order iso pme_order-1 */
-     /* x overlap is copied in place: take padding into account.
-      * y is always copied through a buffer: we don't need padding in z,
-      * but we do need the overlap in x because of the communication order.
-      */
-     bufsizex = pme->pme_order*pme->pmegrid_ny*pme->pmegrid_nz;
-     bufsizey = pme->pme_order*pme->pmegrid_nx*pme->nkz;
-     bufsize  = (bufsizex>bufsizey) ? bufsizex : bufsizey;
-     
-     snew(pme->pmegrid_sendbuf,bufsize);
-     snew(pme->pmegrid_recvbuf,bufsize);
-     
-     ndata[0] = pme->nkx;
-     ndata[1] = pme->nky;
-     ndata[2] = pme->nkz;
-     
-     /* This routine will allocate the grid data to fit the FFTs */
-     gmx_parallel_3dfft_init(&pme->pfft_setupA,ndata,
-                             &pme->fftgridA,&pme->cfftgridA,
-                             pme->mpi_comm_d,
-                             pme->overlap[0].s2g0,pme->overlap[1].s2g0,
-                             bReproducible);
-     
-     if (bFreeEnergy)
-     {
-         snew(pme->pmegridB,pme->pmegrid_nx*pme->pmegrid_ny*pme->pmegrid_nz);    
-         gmx_parallel_3dfft_init(&pme->pfft_setupB,ndata,
-                                 &pme->fftgridB,&pme->cfftgridB,
-                                 pme->mpi_comm_d,
-                                 pme->overlap[0].s2g0,pme->overlap[1].s2g0,
-                                 bReproducible);
-     } else 
++
++    nthread = pme->nthread;
++
++#ifdef PME_TIME_THREADS
++    c1 = omp_cyc_start();
 +#endif
-         pme->pmegridB    = NULL;
-         pme->fftgridB    = NULL;
-         pme->cfftgridB   = NULL;
++    if (bCalcSplines)
 +    {
-     
-     make_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
-     
-     /* Use atc[0] for spreading */
-     init_atomcomm(pme,&pme->atc[0],cr,nnodes_major > 1 ? 0 : 1,TRUE);
-     if (pme->ndecompdim >= 2)
++#pragma omp parallel for num_threads(nthread) schedule(static)
++        for(thread=0; thread<nthread; thread++)
++        {
++            int start,end;
++
++            start = atc->n* thread   /nthread; /* atoms [start,end) form this thread's slice of atc->n */
++            end   = atc->n*(thread+1)/nthread;
++
++            /* Compute fftgrid index for all atoms,
++             * with help of some extra variables.
++             */
++            calc_interpolation_idx(pme,atc,start,end,thread);
++        }
 +    }
-         init_atomcomm(pme,&pme->atc[1],cr,1,FALSE);
++#ifdef PME_TIME_THREADS
++    c1 = omp_cyc_end(c1);
++    cs1 += (double)c1;
++#endif
++
++#ifdef PME_TIME_THREADS
++    c2 = omp_cyc_start();
++#endif
++#pragma omp parallel for num_threads(nthread) schedule(static)
++    for(thread=0; thread<nthread; thread++)
 +    {
-     
-     if (pme->nnodes == 1) {
-         pme->atc[0].n = homenr;
-         pme_realloc_atomcomm_things(&pme->atc[0]);
++        splinedata_t *spline;
++        pmegrid_t *grid;
++
++        /* make local bsplines  */
++        if (grids->nthread == 1) /* NOTE(review): grids may be NULL when called from gmx_pme_calc_energy - confirm */
++        {
++            spline = &atc->spline[0];
++
++            spline->n = atc->n;
++
++            grid = &grids->grid;
++        }
++        else
++        {
++            spline = &atc->spline[thread];
++
++            make_thread_local_ind(atc,thread,spline);
++
++            grid = &grids->grid_th[thread];
++        }
++
++        if (bCalcSplines)
++        {
++            make_bsplines(spline->theta,spline->dtheta,pme->pme_order,
++                          atc->fractx,spline->n,spline->ind,atc->q,pme->bFEP);
++        }
++
++        if (bSpread)
++        {
++            /* put local atoms on grid. */
++#ifdef PME_TIME_SPREAD /* NOTE(review): ct1a is declared under PME_TIME_THREADS and is shared by all threads of this loop */
++            ct1a = omp_cyc_start();
++#endif
++            spread_q_bsplines_thread(grid,atc,spline,&pme->spline_work);
++
++            if (grids->nthread > 1)
++            {
++                copy_local_grid(pme,grids,thread,fftgrid);
++            }
++#ifdef PME_TIME_SPREAD
++            ct1a = omp_cyc_end(ct1a);
++            cs1a[thread] += (double)ct1a; /* NOTE(review): cs1a has 6 entries; assumes nthread <= 6 - confirm */
++#endif
++        }
 +    }
-     
-     /* Use fft5d, order after FFT is y major, z, x minor */
-     pme->work_nalloc = pme->nkx;
-     snew(pme->work_mhx,pme->work_nalloc);
-     snew(pme->work_mhy,pme->work_nalloc);
-     snew(pme->work_mhz,pme->work_nalloc);
-     snew(pme->work_m2,pme->work_nalloc);
-     snew(pme->work_denom,pme->work_nalloc);
-     /* Allocate an aligned pointer for SSE operations, including 3 extra
-      * elements at the end since SSE operates on 4 elements at a time.
-      */
-     snew(pme->work_tmp1_alloc,pme->work_nalloc+8);
-     pme->work_tmp1 = (real *) (((size_t) pme->work_tmp1_alloc + 16) & (~((size_t) 15)));
-     snew(pme->work_m2inv,pme->work_nalloc);
++#ifdef PME_TIME_THREADS
++    c2 = omp_cyc_end(c2);
++    cs2 += (double)c2;
++#endif
++
++    if (grids->nthread > 1)
++    {
++#ifdef PME_TIME_THREADS
++        c3 = omp_cyc_start();
++#endif
++#pragma omp parallel for num_threads(grids->nthread) schedule(static)
++        for(thread=0; thread<grids->nthread; thread++)
++        {
++            reduce_threadgrid_overlap(pme,grids,thread,
++                                      fftgrid,
++                                      pme->overlap[0].sendbuf,
++                                      pme->overlap[1].sendbuf);
++#ifdef PRINT_PME_SENDBUF
++            print_sendbuf(pme,pme->overlap[0].sendbuf);
++#endif
++        }
++#ifdef PME_TIME_THREADS
++        c3 = omp_cyc_end(c3);
++        cs3 += (double)c3;
++#endif
++
++        if (pme->nnodes > 1)
++        {
++            /* Communicate the overlapping part of the fftgrid */
++            sum_fftgrid_dd(pme,fftgrid);
++        }
 +    }
-     *pmedata = pme;
-     
-     return 0;
 +
- static void spread_on_grid(gmx_pme_t pme,
-                            pme_atomcomm_t *atc,real *grid,
-                            gmx_bool bCalcSplines,gmx_bool bSpread)
- {    
-     if (bCalcSplines)
-     {
-     
-         /* Compute fftgrid index for all atoms,
-          * with help of some extra variables.
-          */
-         calc_interpolation_idx(pme,atc);
-         
-         /* make local bsplines  */
-         make_bsplines(atc->theta,atc->dtheta,pme->pme_order,
-                       atc->fractx,atc->n,atc->q,pme->bFEP);
-     }    
-     
-     if (bSpread)
++#ifdef PME_TIME_THREADS
++    cnt++;
++    if (cnt % 20 == 0) /* print the accumulated timings on every 20th call */
++    {
++        printf("idx %.2f spread %.2f red %.2f",
++               cs1*1e-9,cs2*1e-9,cs3*1e-9);
++#ifdef PME_TIME_SPREAD
++        for(thread=0; thread<nthread; thread++)
++            printf(" %.2f",cs1a[thread]*1e-9);
++#endif
++        printf("\n");
++    }
++#endif
 +}
 +
-         /* put local atoms on grid. */
-         spread_q_bsplines(pme,atc,grid);
++/* Print an nx x ny x nz block of grid g to fp, one line per grid point:
++ * the indices sx+x, sy+y, sz+z followed by the grid value.
++ * my and mz are the y and z dimensions used to index g, i.e. the value
++ * printed for (x,y,z) is g[(x*my + y)*mz + z].
++ */
++static void dump_grid(FILE *fp,
++                      int sx,int sy,int sz,int nx,int ny,int nz,
++                      int my,int mz,const real *g)
++{
++    int x,y,z;
++
++    for(x=0; x<nx; x++)
 +    {
-     real *grid;
++        for(y=0; y<ny; y++)
++        {
++            for(z=0; z<nz; z++)
++            {
++                fprintf(fp,"%2d %2d %2d %6.3f\n",
++                        sx+x,sy+y,sz+z,g[(x*my + y)*mz + z]);
++            }
++        }
 +    }
 +}
 +/* Write fftgrid to stderr with dump_grid.
 + * The printed index offsets are pme->pmegrid_start_i{x,y,z}, the extents
 + * are pme->pmegrid_n{x,y,z} - pme->pme_order + 1 and the y/z dimensions
 + * used for indexing are local_fft_size[YY] and local_fft_size[ZZ] as
 + * reported by gmx_parallel_3dfft_real_limits for pme->pfft_setupA.
 + * local_fft_ndata and local_fft_offset are queried but not used further.
 + *
 + * NOTE(review): appears to be a debugging aid - the only visible call is
 + * commented out in gmx_pme_do; the A setup is used whatever grid is passed.
 + */
++static void dump_local_fftgrid(gmx_pme_t pme,const real *fftgrid)
++{
++    ivec local_fft_ndata,local_fft_offset,local_fft_size;
++
++    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
++                                   local_fft_ndata,
++                                   local_fft_offset,
++                                   local_fft_size);
++
++    dump_grid(stderr,
++              pme->pmegrid_start_ix,
++              pme->pmegrid_start_iy,
++              pme->pmegrid_start_iz,
++              pme->pmegrid_nx-pme->pme_order+1,
++              pme->pmegrid_ny-pme->pme_order+1,
++              pme->pmegrid_nz-pme->pme_order+1,
++              local_fft_size[YY],
++              local_fft_size[ZZ],
++              fftgrid);
++}
++
++/* Store in V the value returned by gather_energy_bsplines for the n
++ * charges q at positions x, evaluated on the A-charge grid.
++ *
++ * pme->atc_energy is set up for the n atoms (one slab, x and q pointing at
++ * the caller's arrays) and spread_on_grid is called with
++ * bCalcSplines = TRUE and bSpread = FALSE, so no charges are spread here.
++ *
++ * Calls gmx_incons when pme->nnodes > 1 or when pme->bFEP > 1.
++ *
++ * NOTE(review): if bFEP only takes the values 0 and 1 the second check can
++ * never trigger - confirm whether a plain test of pme->bFEP was intended.
++ * NOTE(review): NULL is passed as the grids argument of spread_on_grid,
++ * which reads grids->nthread - confirm this cannot be dereferenced.
++ */
 +void gmx_pme_calc_energy(gmx_pme_t pme,int n,rvec *x,real *q,real *V)
 +{
 +    pme_atomcomm_t *atc;
-     
++    pmegrids_t *grid;
 +
 +    if (pme->nnodes > 1)
 +    {
 +        gmx_incons("gmx_pme_calc_energy called in parallel");
 +    }
 +    if (pme->bFEP > 1)
 +    {
 +        gmx_incons("gmx_pme_calc_energy with free energy");
 +    }
 +
 +    atc = &pme->atc_energy;
 +    atc->nslab     = 1;
 +    atc->bSpread   = TRUE;
 +    atc->pme_order = pme->pme_order;
 +    atc->n         = n;
 +    pme_realloc_atomcomm_things(atc);
 +    atc->x         = x;
 +    atc->q         = q;
-     grid = pme->pmegridA;
++
 +    /* We only use the A-charges grid */
-     spread_on_grid(pme,atc,NULL,TRUE,FALSE);
++    grid = &pme->pmegridA;
 +
-     *V = gather_energy_bsplines(pme,grid,atc);
++    spread_on_grid(pme,atc,NULL,TRUE,FALSE,pme->fftgridA);
 +
-     
-     
++    *V = gather_energy_bsplines(pme,grid->grid.grid,atc);
 +}
 +
 +/* Restart the performance accounting: stop the ewcRUN wallcycle counter,
 + * reset all wallcycle counters and nrnb, move ir->init_step forward and
 + * shrink ir->nsteps by step_rel, then start ewcRUN again.
 + * cr is not used in the body.
 + */
 +static void reset_pmeonly_counters(t_commrec *cr,gmx_wallcycle_t wcycle,
 +        t_nrnb *nrnb,t_inputrec *ir, gmx_large_int_t step_rel)
 +{
 +    /* Reset all the counters related to performance over the run */
 +    wallcycle_stop(wcycle,ewcRUN);
 +    wallcycle_reset_all(wcycle);
 +    init_nrnb(nrnb);
 +    ir->init_step += step_rel;
 +    ir->nsteps    -= step_rel;
 +    wallcycle_start(wcycle,ewcRUN);
 +}
 +
 +/* Service loop: repeatedly receive charges and coordinates with
 + * gmx_pme_recv_q_x, run gmx_pme_do on them and hand forces, virial,
 + * energy, dvdlambda and the measured ewcPMEMESH cycle count to
 + * gmx_pme_send_force_vir_ener.  The loop ends when the receive call
 + * returns natoms == -1.  Always returns 0.
 + *
 + * The ewcRUN counter is started on the first iteration, and the
 + * performance counters are reset when the relative step equals
 + * wcycle_get_reset_counters(wcycle).
 + * bGatherOnly is not used in the body.
 + */
 +int gmx_pmeonly(gmx_pme_t pme,
 +                t_commrec *cr,    t_nrnb *nrnb,
 +                gmx_wallcycle_t wcycle,
 +                real ewaldcoeff,  gmx_bool bGatherOnly,
 +                t_inputrec *ir)
 +{
 +    gmx_pme_pp_t pme_pp;
 +    int  natoms;
 +    matrix box;
 +    rvec *x_pp=NULL,*f_pp=NULL;
 +    real *chargeA=NULL,*chargeB=NULL;
 +    real lambda=0;
 +    int  maxshift_x=0,maxshift_y=0;
 +    real energy,dvdlambda;
 +    matrix vir;
 +    float cycles;
 +    int  count;
 +    gmx_bool bEnerVir;
 +    gmx_large_int_t step,step_rel;
-     
++
++
 +    pme_pp = gmx_pme_pp_init(cr);
-     
++
 +    init_nrnb(nrnb);
-         
++
 +    count = 0;
 +    do /****** this is a quasi-loop over time steps! */
 +    {
 +        /* Domain decomposition */
 +        natoms = gmx_pme_recv_q_x(pme_pp,
 +                                  &chargeA,&chargeB,box,&x_pp,&f_pp,
 +                                  &maxshift_x,&maxshift_y,
 +                                  &pme->bFEP,&lambda,
 +                                  &bEnerVir,
 +                                  &step);
-         
++
 +        if (natoms == -1) {
 +            /* We should stop: break out of the loop */
 +            break;
 +        }
-         
++
 +        step_rel = step - ir->init_step;
-         
++
 +        if (count == 0)
 +            wallcycle_start(wcycle,ewcRUN);
-         
++
 +        wallcycle_start(wcycle,ewcPMEMESH);
-         
++
 +        dvdlambda = 0;
 +        clear_mat(vir);
 +        gmx_pme_do(pme,0,natoms,x_pp,f_pp,chargeA,chargeB,box,
 +                   cr,maxshift_x,maxshift_y,nrnb,wcycle,vir,ewaldcoeff,
 +                   &energy,lambda,&dvdlambda,
 +                   GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
-         
++
 +        cycles = wallcycle_stop(wcycle,ewcPMEMESH);
-         
++
 +        gmx_pme_send_force_vir_ener(pme_pp,
 +                                    f_pp,vir,energy,dvdlambda,
 +                                    cycles);
-         
++
 +        count++;
 +
 +        if (step_rel == wcycle_get_reset_counters(wcycle))
 +        {
 +            /* Reset all the counters related to performance over the run */
 +            reset_pmeonly_counters(cr,wcycle,nrnb,ir,step_rel);
 +            wcycle_set_reset_counters(wcycle, 0);
 +        }
-     
++
 +    } /***** end of quasi-loop, we stop with the break above */
 +    while (TRUE);
-                matrix box,    t_commrec *cr,
++
 +    return 0;
 +}
 +/* Run one PME mesh evaluation for homenr atoms.
 + *
 + * The loop over q handles the A charges and, when pme->bFEP is set, the
 + * B charges; for each set the steps selected by flags are executed:
 + *   GMX_PME_SPREAD_Q       spread the charges (spread_on_grid),
 + *   GMX_PME_SOLVE          forward 3D-FFT and solve_pme_yzx,
 + *   GMX_PME_CALC_F         backward 3D-FFT and gather_f_bsplines,
 + *   GMX_PME_CALC_ENER_VIR  fetch energy and virial (get_pme_ener_vir).
 + * With pme->nnodes > 1 coordinates and charges are redistributed before
 + * the mesh work and, when forces were requested, the forces are
 + * redistributed afterwards.
 + *
 + * The FFT / solve / grid copy-back part runs in an OpenMP loop over
 + * pme->nthread threads; wallcycle and nrnb bookkeeping inside that loop is
 + * done by thread 0 only.
 + *
 + * On return *energy is set and vir is updated; with pme->bFEP the A and B
 + * contributions are mixed with lambda and *dvdlambda is incremented.
 + * Always returns 0.
 + *
 + * NOTE(review): energy_AB and vir_AB are filled only when
 + * GMX_PME_CALC_ENER_VIR is set, but they are read unconditionally at the
 + * end of the function - confirm callers ignore the outputs in that case.
 + */
 +int gmx_pme_do(gmx_pme_t pme,
 +               int start,       int homenr,
 +               rvec x[],        rvec f[],
 +               real *chargeA,   real *chargeB,
-                real *energy,    real lambda, 
++               matrix box, t_commrec *cr,
 +               int  maxshift_x, int maxshift_y,
 +               t_nrnb *nrnb,    gmx_wallcycle_t wcycle,
 +               matrix vir,      real ewaldcoeff,
-     int     loop_count;
++               real *energy,    real lambda,
 +               real *dvdlambda, int flags)
 +{
 +    int     q,d,i,j,ntot,npme;
 +    int     nx,ny,nz;
 +    int     n_d,local_ny;
-     real *  grid=NULL;
 +    pme_atomcomm_t *atc=NULL;
-     real    *charge=NULL,*q_d,vol;
++    pmegrids_t *pmegrid=NULL;
++    real    *grid=NULL;
 +    real    *ptr;
 +    rvec    *x_d,*f_d;
-     gmx_bool    bClearF;
++    real    *charge=NULL,*q_d;
 +    real    energy_AB[2];
 +    matrix  vir_AB[2];
-     
++    gmx_bool bClearF;
 +    gmx_parallel_3dfft_t pfft_setup;
 +    real *  fftgrid;
 +    t_complex * cfftgrid;
++    int     thread;
 +
 +    if (pme->nnodes > 1) {
 +        atc = &pme->atc[0];
 +        atc->npd = homenr;
 +        if (atc->npd > atc->pd_nalloc) {
 +            atc->pd_nalloc = over_alloc_dd(atc->npd);
 +            srenew(atc->pd,atc->pd_nalloc);
 +        }
 +        atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
 +    }
 +    else
 +    {
 +        /* This could be necessary for TPI */
 +        pme->atc[0].n = homenr;
 +    }
-             grid = pme->pmegridA;
++
 +    for(q=0; q<(pme->bFEP ? 2 : 1); q++) { /* q == 0 selects the A grids and charges, q == 1 the B ones */
 +        if (q == 0) {
-             grid = pme->pmegridB;
++            pmegrid = &pme->pmegridA;
 +            fftgrid = pme->fftgridA;
 +            cfftgrid = pme->cfftgridA;
 +            pfft_setup = pme->pfft_setupA;
 +            charge = chargeA+start;
 +        } else {
-         
-         m_inv_ur0(box,pme->recipbox); 
++            pmegrid = &pme->pmegridB;
 +            fftgrid = pme->fftgridB;
 +            cfftgrid = pme->cfftgridB;
 +            pfft_setup = pme->pfft_setupB;
 +            charge = chargeB+start;
 +        }
++        grid = pmegrid->grid.grid;
 +        /* Unpack structure */
 +        if (debug) {
 +            fprintf(debug,"PME: nnodes = %d, nodeid = %d\n",
 +                    cr->nnodes,cr->nodeid);
 +            fprintf(debug,"Grid = %p\n",(void*)grid);
 +            if (grid == NULL)
 +                gmx_fatal(FARGS,"No grid!");
 +        }
 +        where();
-                 pme_calc_pidx(n_d,pme->recipbox,x_d,atc);
++
++        m_inv_ur0(box,pme->recipbox);
 +
 +        if (pme->nnodes == 1) {
 +            atc = &pme->atc[0];
 +            if (DOMAINDECOMP(cr)) {
 +                atc->n = homenr;
 +                pme_realloc_atomcomm_things(atc);
 +            }
 +            atc->x = x;
 +            atc->q = charge;
 +            atc->f = f;
 +        } else {
 +            wallcycle_start(wcycle,ewcPME_REDISTXF);
 +            for(d=pme->ndecompdim-1; d>=0; d--)
 +            {
 +                if (d == pme->ndecompdim-1)
 +                {
 +                    n_d = homenr;
 +                    x_d = x + start;
 +                    q_d = charge;
 +                }
 +                else
 +                {
 +                    n_d = pme->atc[d+1].n;
 +                    x_d = atc->x;
 +                    q_d = atc->q;
 +                }
 +                atc = &pme->atc[d];
 +                atc->npd = n_d;
 +                if (atc->npd > atc->pd_nalloc) {
 +                    atc->pd_nalloc = over_alloc_dd(atc->npd);
 +                    srenew(atc->pd,atc->pd_nalloc);
 +                }
 +                atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
-                 
++                pme_calc_pidx_wrapper(n_d,pme->recipbox,x_d,atc);
 +                where();
-         
++
 +                /* Redistribute x (only once) and qA or qB */
 +                if (DOMAINDECOMP(cr)) {
 +                    dd_pmeredist_x_q(pme, n_d, q==0, x_d, q_d, atc);
 +                } else {
 +                    pmeredist_pd(pme, TRUE, n_d, q==0, x_d, q_d, atc);
 +                }
 +            }
 +            where();
 +
 +            wallcycle_stop(wcycle,ewcPME_REDISTXF);
 +        }
-             spread_on_grid(pme,&pme->atc[0],grid,q==0,TRUE);
++
 +        if (debug)
 +            fprintf(debug,"Node= %6d, pme local particles=%6d\n",
 +                    cr->nodeid,atc->n);
 +
 +        if (flags & GMX_PME_SPREAD_Q)
 +        {
 +            wallcycle_start(wcycle,ewcPME_SPREADGATHER);
 +
 +            /* Spread the charges on a grid */
-             wrap_periodic_pmegrid(pme,grid);
++            spread_on_grid(pme,&pme->atc[0],pmegrid,q==0,TRUE,fftgrid);
 +
 +            if (q == 0)
 +            {
 +                inc_nrnb(nrnb,eNR_WEIGHTS,DIM*atc->n);
 +            }
 +            inc_nrnb(nrnb,eNR_SPREADQBSP,
 +                     pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
 +
-             /* sum contributions to local grid from other nodes */
++            if (pme->nthread == 1) /* single-thread path: wrap, sum over nodes and copy to the FFT grid here */
++            {
++                wrap_periodic_pmegrid(pme,grid);
 +
-             if (pme->nnodes > 1) {
-                 gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_FORWARD);
-                 where();
-             }
++                /* sum contributions to local grid from other nodes */
 +#ifdef GMX_MPI
-             where();
++                if (pme->nnodes > 1)
++                {
++                    gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_FORWARD);
++                    where();
++                }
 +#endif
-             copy_pmegrid_to_fftgrid(pme,grid,fftgrid);
 +
-         }
-          
-         if (flags & GMX_PME_SOLVE)
-         {
-             /* do 3d-fft */ 
-             wallcycle_start(wcycle,ewcPME_FFT);
-             gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_REAL_TO_COMPLEX,fftgrid,cfftgrid);
-             wallcycle_stop(wcycle,ewcPME_FFT);
-             where();
-             
-             /* solve in k-space for our local cells */
-             vol = det(box);
-             wallcycle_start(wcycle,ewcPME_SOLVE);
-             loop_count =
-                 solve_pme_yzx(pme,cfftgrid,ewaldcoeff,vol,
-                               flags & GMX_PME_CALC_ENER_VIR,
-                               &energy_AB[q],vir_AB[q]);
-             wallcycle_stop(wcycle,ewcPME_SOLVE);
-             where();
-             inc_nrnb(nrnb,eNR_SOLVEPME,loop_count);
++                copy_pmegrid_to_fftgrid(pme,grid,fftgrid);
++            }
 +
 +            wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
-         if ((flags & GMX_PME_CALC_F) ||
-             (flags & GMX_PME_CALC_POT))
++
++            /*
++            dump_local_fftgrid(pme,fftgrid);
++            exit(0);
++            */
 +        }
 +
-             
-             /* do 3d-invfft */
-             where();
-             wallcycle_start(wcycle,ewcPME_FFT);
-             gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_COMPLEX_TO_REAL,cfftgrid,fftgrid);
-             wallcycle_stop(wcycle,ewcPME_FFT);
++        /* Here we start a large thread parallel region */
++#pragma omp parallel for num_threads(pme->nthread) schedule(static)
++        for(thread=0; thread<pme->nthread; thread++)
 +        {
-             where();
++            if (flags & GMX_PME_SOLVE)
++            {
++                int loop_count;
 +
-             if (pme->nodeid == 0)
-             {
-                 ntot = pme->nkx*pme->nky*pme->nkz;
-                 npme  = ntot*log((real)ntot)/log(2.0);
-                 inc_nrnb(nrnb,eNR_FFT,2*npme);
++                /* do 3d-fft */
++                if (thread == 0)
++                {
++                    wallcycle_start(wcycle,ewcPME_FFT);
++                }
++                gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_REAL_TO_COMPLEX,
++                                           fftgrid,cfftgrid,thread,wcycle);
++                if (thread == 0)
++                {
++                    wallcycle_stop(wcycle,ewcPME_FFT);
++                }
++                where();
 +
-             wallcycle_start(wcycle,ewcPME_SPREADGATHER);
++                /* solve in k-space for our local cells */
++                if (thread == 0)
++                {
++                    wallcycle_start(wcycle,ewcPME_SOLVE);
++                }
++                loop_count =
++                    solve_pme_yzx(pme,cfftgrid,ewaldcoeff,
++                                  box[XX][XX]*box[YY][YY]*box[ZZ][ZZ], /* product of the box diagonal; the pre-merge code passed det(box) here */
++                                  flags & GMX_PME_CALC_ENER_VIR,
++                                  pme->nthread,thread);
++                if (thread == 0)
++                {
++                    wallcycle_stop(wcycle,ewcPME_SOLVE);
++                    where();
++                    inc_nrnb(nrnb,eNR_SOLVEPME,loop_count);
++                }
 +            }
 +
-             copy_fftgrid_to_pmegrid(pme,fftgrid,grid);
++            if (flags & GMX_PME_CALC_F)
++            {
++                /* do 3d-invfft */
++                if (thread == 0)
++                {
++                    where();
++                    wallcycle_start(wcycle,ewcPME_FFT);
++                }
++                gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_COMPLEX_TO_REAL,
++                                           cfftgrid,fftgrid,thread,wcycle);
++                if (thread == 0)
++                {
++                    wallcycle_stop(wcycle,ewcPME_FFT);
++
++                    where();
++
++                    if (pme->nodeid == 0)
++                    {
++                        ntot = pme->nkx*pme->nky*pme->nkz;
++                        npme  = ntot*log((real)ntot)/log(2.0);
++                        inc_nrnb(nrnb,eNR_FFT,2*npme);
++                    }
++
++                    wallcycle_start(wcycle,ewcPME_SPREADGATHER); /* stopped after the force gather further down */
++                }
 +
-         }
++                copy_fftgrid_to_pmegrid(pme,fftgrid,grid,pme->nthread,thread);
++            }
++        }
++        /* End of thread parallel section.
++         * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
++         */
 +
++        if (flags & GMX_PME_CALC_F)
++        {
 +            /* distribute local grid to all nodes */
 +#ifdef GMX_MPI
 +            if (pme->nnodes > 1) {
 +                gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_BACKWARD);
 +            }
 +#endif
 +            where();
 +
 +            unwrap_periodic_pmegrid(pme,grid);
-         if (flags & GMX_PME_CALC_F)
-         {
 +
-             
 +            /* interpolate forces for our local atoms */
 +
 +            where();
-             gather_f_bsplines(pme,grid,bClearF,&pme->atc[0],
-                               pme->bFEP ? (q==0 ? 1.0-lambda : lambda) : 1.0);
++
 +            /* If we are running without parallelization,
 +             * atc->f is the actual force array, not a buffer,
 +             * therefore we should not clear it.
 +             */
 +            bClearF = (q == 0 && PAR(cr));
-        }
++#pragma omp parallel for num_threads(pme->nthread) schedule(static)
++            for(thread=0; thread<pme->nthread; thread++)
++            {
++                gather_f_bsplines(pme,grid,bClearF,atc,
++                                  &atc->spline[thread],
++                                  pme->bFEP ? (q==0 ? 1.0-lambda : lambda) : 1.0);
++            }
++
 +            where();
 +
 +            inc_nrnb(nrnb,eNR_GATHERFBSP,
 +                     pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
 +            wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
-     
++        }
++
++        if (flags & GMX_PME_CALC_ENER_VIR)
++        {
++            /* This should only be called on the master thread
++             * and after the threads have synchronized.
++             */
++            get_pme_ener_vir(pme,pme->nthread,&energy_AB[q],vir_AB[q]);
++        }
 +    } /* of q-loop */
-     
++
 +    if ((flags & GMX_PME_CALC_F) && pme->nnodes > 1) {
 +        wallcycle_start(wcycle,ewcPME_REDISTXF);
 +        for(d=0; d<pme->ndecompdim; d++)
 +        {
 +            atc = &pme->atc[d];
 +            if (d == pme->ndecompdim - 1)
 +            {
 +                n_d = homenr;
 +                f_d = f + start;
 +            }
 +            else
 +            {
 +                n_d = pme->atc[d+1].n;
 +                f_d = pme->atc[d+1].f;
 +            }
 +            if (DOMAINDECOMP(cr)) {
 +                dd_pmeredist_f(pme,atc,n_d,f_d,
 +                               d==pme->ndecompdim-1 && pme->bPPnode);
 +            } else {
 +                pmeredist_pd(pme, FALSE, n_d, TRUE, f_d, NULL, atc);
 +            }
 +        }
 +
 +        wallcycle_stop(wcycle,ewcPME_REDISTXF);
 +    }
 +    where();
-     
++
 +    if (!pme->bFEP) { /* NOTE(review): energy_AB/vir_AB are only written under GMX_PME_CALC_ENER_VIR - confirm */
 +        *energy = energy_AB[0];
 +        m_add(vir,vir_AB[0],vir);
 +    } else {
 +        *energy = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1];
 +        *dvdlambda += energy_AB[1] - energy_AB[0];
 +        for(i=0; i<DIM; i++)
 +            for(j=0; j<DIM; j++)
 +                vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] + lambda*vir_AB[1][i][j];
 +    }
 +
 +    if (debug)
++    {
 +        fprintf(debug,"PME mesh energy: %g\n",*energy);
++    }
++
 +    return 0;
 +}
index 0000000000000000000000000000000000000000,1b0b61760b1170eb58e6c7f8c7b92304ae5fe123..1b0b61760b1170eb58e6c7f8c7b92304ae5fe123
mode 000000,100644..100644
--- /dev/null
index 6897fc6835212d9088e772b0dff1f797220d4300,0000000000000000000000000000000000000000..30d13d9150084d105bbff004bca56d242da193be
mode 100644,000000..100644
--- /dev/null
@@@ -1,1029 -1,0 +1,1031 @@@
-   int         bondtypes[] = { F_BONDS, F_HARMONIC, F_CUBICBONDS, F_POLARIZATION, F_WATER_POL };
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2008, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + 
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <string.h>
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "gmx_fatal.h"
 +#include "vec.h"
 +#include "txtdump.h"
 +#include "mdrun.h"
 +#include "partdec.h"
 +#include "mdatoms.h"
 +#include "vsite.h"
 +#include "network.h"
 +#include "names.h"
 +#include "constr.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "physics.h"
 +#include "copyrite.h"
 +#include "shellfc.h"
 +#include "mtop_util.h"
 +#include "chargegroup.h"
 +#include "macros.h"
 +
 +
 +/* Per-shell bookkeeping: which atom is the shell, which nuclei it is
 + * bonded to, the summed force constant, and the state kept between
 + * iterations of the shell position minimizer.
 + */
 +typedef struct {
 +  int     nnucl;              /* Number of nuclei bonded to the shell (1-3) */
 +  atom_id shell;              /* The shell id                         */
 +  atom_id nucl1,nucl2,nucl3;  /* The nuclei connected to the shell    */
 +  /* gmx_bool    bInterCG; */       /* Coupled to nuclei outside cg?        */
 +  real    k;                  /* force constant                       */
 +  real    k_1;                        /* 1 over force constant                */
 +  rvec    xold;               /* Shell position at the previous minimizer iteration */
 +  rvec    fold;               /* Shell force at the previous minimizer iteration    */
 +  rvec    step;               /* Per-dimension steepest descent step size           */
 +} t_shell;
 +
 +/* State for shell and flexible-constraint relaxation: the global and
 + * local shell lists plus the work arrays that are grown on demand by
 + * the relaxation routines in this file.
 + */
 +typedef struct gmx_shellfc {
 +  int     nshell_gl;       /* The number of shells in the system       */
 +  t_shell *shell_gl;       /* All the shells (for DD only)             */
 +  int     *shell_index_gl; /* Global shell index (for DD only)         */
 +  gmx_bool    bInterCG;        /* Are there inter charge-group shells?     */
 +  int     nshell;          /* The number of local shells               */
 +  t_shell *shell;          /* The local shells                         */
 +  int     shell_nalloc;    /* The allocation size of shell             */
 +  gmx_bool    bPredict;        /* Predict shell positions                  */
 +  gmx_bool    bForceInit;      /* Force initialization of shell positions  */
 +  int     nflexcon;        /* The number of flexible constraints       */
 +  rvec    *x[2];           /* Array for iterative minimization         */
 +  rvec    *f[2];           /* Array for iterative minimization         */
 +  int     x_nalloc;        /* The allocation size of x and f           */
 +  rvec    *acc_dir;        /* Acceleration direction for flexcon       */
 +  rvec    *x_old;          /* Old coordinates for flexcon              */
 +  int     flex_nalloc;     /* The allocation size of acc_dir and x_old */
 +  rvec    *adir_xnold;     /* Work space for init_adir                 */
 +  rvec    *adir_xnew;      /* Work space for init_adir                 */
 +  int     adir_nalloc;     /* Work space for init_adir                 */
 +} t_gmx_shellfc;
 +
 +      
 +static void pr_shell(FILE *fplog,int ns,t_shell s[])
 +{
 +  int i;
 +  
 +  fprintf(fplog,"SHELL DATA\n");
 +  fprintf(fplog,"%5s  %8s  %5s  %5s  %5s\n",
 +        "Shell","Force k","Nucl1","Nucl2","Nucl3");
 +  for(i=0; (i<ns); i++) {
 +    fprintf(fplog,"%5d  %8.3f  %5d",s[i].shell,1.0/s[i].k_1,s[i].nucl1);
 +    if (s[i].nnucl == 2)
 +      fprintf(fplog,"  %5d\n",s[i].nucl2);
 +    else if (s[i].nnucl == 3)
 +      fprintf(fplog,"  %5d  %5d\n",s[i].nucl2,s[i].nucl3);
 +    else
 +      fprintf(fplog,"\n");
 +  }
 +}
 +
 +static void predict_shells(FILE *fplog,rvec x[],rvec v[],real dt,
 +                         int ns,t_shell s[],
 +                         real mass[],gmx_mtop_t *mtop,gmx_bool bInit)
 +{
 +  int  i,m,s1,n1,n2,n3;
 +  real dt_1,dt_2,dt_3,fudge,tm,m1,m2,m3;
 +  rvec *ptr;
 +  t_atom *atom;
 +  
 +  /* We introduce a fudge factor for performance reasons: with this choice
 +   * the initial force on the shells is about a factor of two lower than 
 +   * without
 +   */
 +  fudge = 1.0;
 +    
 +  if (bInit) {
 +    if (fplog)
 +      fprintf(fplog,"RELAX: Using prediction for initial shell placement\n");
 +    ptr  = x;
 +    dt_1 = 1;
 +  }
 +  else {
 +    ptr  = v;
 +    dt_1 = fudge*dt;
 +  }
 +    
 +  for(i=0; (i<ns); i++) {
 +    s1 = s[i].shell;
 +    if (bInit)
 +      clear_rvec(x[s1]);
 +    switch (s[i].nnucl) {
 +    case 1:
 +      n1 = s[i].nucl1;
 +      for(m=0; (m<DIM); m++)
 +      x[s1][m]+=ptr[n1][m]*dt_1;
 +      break;
 +    case 2:
 +      n1 = s[i].nucl1;
 +      n2 = s[i].nucl2;
 +      if (mass) {
 +      m1 = mass[n1];
 +      m2 = mass[n2];
 +      } else {
 +      /* Not the correct masses with FE, but it is just a prediction... */
 +      m1 = atom[n1].m;
 +      m2 = atom[n2].m;
 +      }
 +      tm = dt_1/(m1+m2);
 +      for(m=0; (m<DIM); m++)
 +      x[s1][m]+=(m1*ptr[n1][m]+m2*ptr[n2][m])*tm;
 +      break;
 +    case 3:
 +      n1 = s[i].nucl1;
 +      n2 = s[i].nucl2;
 +      n3 = s[i].nucl3;
 +      if (mass) {
 +      m1 = mass[n1];
 +      m2 = mass[n2];
 +      m3 = mass[n3];
 +      } else {
 +      /* Not the correct masses with FE, but it is just a prediction... */
 +      gmx_mtop_atomnr_to_atom(mtop,n1,&atom);
 +      m1 = atom->m;
 +      gmx_mtop_atomnr_to_atom(mtop,n2,&atom);
 +      m2 = atom->m;
 +      gmx_mtop_atomnr_to_atom(mtop,n3,&atom);
 +      m3 = atom->m;
 +      }
 +      tm = dt_1/(m1+m2+m3);
 +      for(m=0; (m<DIM); m++)
 +      x[s1][m]+=(m1*ptr[n1][m]+m2*ptr[n2][m]+m3*ptr[n3][m])*tm;
 +      break;
 +    default:
 +      gmx_fatal(FARGS,"Shell %d has %d nuclei!",i,s[i].nnucl);
 +    }
 +  }
 +}
 +
 +/* Build the shell / flexible-constraint data structure from the topology.
 + *
 + * Counts the particles of each type (and logs the counts when fplog is
 + * set). Returns NULL when there are neither shells nor flexible
 + * constraints. When there are only flexible constraints, the returned
 + * structure has just nflexcon set. Otherwise the bonded interactions
 + * listed in bondtypes are scanned to find, for every shell, its nuclei
 + * and the summed force constant.
 + *
 + * Shell position prediction is switched off when the environment variable
 + * GMX_NOPREDICT is set, or when a shell is bonded to a nucleus in another
 + * charge group; GMX_FORCEINIT turns on bForceInit. When prediction is on
 + * and x is not NULL, the shell positions in x are initialized from the
 + * nuclei before returning.
 + */
 +gmx_shellfc_t init_shell_flexcon(FILE *fplog,
 +                               gmx_mtop_t *mtop,int nflexcon,
 +                               rvec *x)
 +{
 +  struct gmx_shellfc *shfc;
 +  t_shell     *shell;
 +  int         *shell_index=NULL,*at2cg;
 +  t_atom      *atom;
 +  int         n[eptNR],ns,nshell,nsi;
 +  int         i,j,nmol,type,mb,mt,a_offset,cg,mol,ftype,nra;
 +  real        qS,alpha;
 +  int         aS,aN=0; /* Shell and nucleus */
++  int         bondtypes[] = { F_BONDS, F_HARMONIC, F_CUBICBONDS, F_POLARIZATION, F_ANHARM_POL, F_WATER_POL };
 +#define NBT asize(bondtypes)
 +  t_iatom     *ia;
 +  gmx_mtop_atomloop_block_t aloopb;
 +  gmx_mtop_atomloop_all_t aloop;
 +  gmx_ffparams_t *ffparams;
 +  gmx_molblock_t *molb;
 +  gmx_moltype_t *molt;
 +  t_block     *cgs;
 +
 +  /* Count number of shells, and find their indices */
 +  for(i=0; (i<eptNR); i++) {
 +    n[i] = 0;
 +  }
 +
 +  aloopb = gmx_mtop_atomloop_block_init(mtop);
 +  while (gmx_mtop_atomloop_block_next(aloopb,&atom,&nmol)) {
 +    n[atom->ptype] += nmol;
 +  }
 +
 +  if (fplog) {
 +    /* Print the number of each particle type */  
 +    for(i=0; (i<eptNR); i++) {
 +      if (n[i] != 0) {
 +      fprintf(fplog,"There are: %d %ss\n",n[i],ptype_str[i]);
 +      }
 +    }
 +  }
 +
 +  nshell = n[eptShell];
 +  
 +  if (nshell == 0 && nflexcon == 0) {
 +    return NULL;
 +  }
 +
 +  snew(shfc,1);
 +  shfc->nflexcon = nflexcon;
 +
 +  if (nshell == 0) {
 +    /* Only flexible constraints: no shell data to set up */
 +    return shfc;
 +  }
 +
 +  /* We have shells: fill the shell data structure */
 +
 +  /* Global system sized array, this should be avoided */
 +  snew(shell_index,mtop->natoms);
 +
 +  /* Number the shells in global atom order; nshell is recounted here */
 +  aloop = gmx_mtop_atomloop_all_init(mtop);
 +  nshell = 0;
 +  while (gmx_mtop_atomloop_all_next(aloop,&i,&atom)) {
 +    if (atom->ptype == eptShell) {
 +      shell_index[i] = nshell++;
 +    }
 +  }
 +
 +  snew(shell,nshell);
 +  
 +  /* Initiate the shell structures */    
 +  for(i=0; (i<nshell); i++) {
 +    shell[i].shell = NO_ATID;
 +    shell[i].nnucl = 0;
 +    shell[i].nucl1 = NO_ATID;
 +    shell[i].nucl2 = NO_ATID;
 +    shell[i].nucl3 = NO_ATID;
 +    /* shell[i].bInterCG=FALSE; */
 +    shell[i].k_1   = 0;
 +    shell[i].k     = 0;
 +  }
 +
 +  ffparams = &mtop->ffparams;
 +
 +  /* Now fill the structures */
 +  shfc->bInterCG = FALSE;
 +  ns = 0;
 +  a_offset = 0;
 +  for(mb=0; mb<mtop->nmolblock; mb++) {
 +    molb = &mtop->molblock[mb];
 +    molt = &mtop->moltype[molb->type];
 +
 +    /* Map each atom of this molecule type to its charge group */
 +    cgs = &molt->cgs;
 +    snew(at2cg,molt->atoms.nr);
 +    for(cg=0; cg<cgs->nr; cg++) {
 +      for(i=cgs->index[cg]; i<cgs->index[cg+1]; i++) {
 +      at2cg[i] = cg;
 +      }
 +    }
 +
 +    atom = molt->atoms.atom;
 +    for(mol=0; mol<molb->nmol; mol++) {
 +      for(j=0; (j<NBT); j++) {
 +      ia = molt->ilist[bondtypes[j]].iatoms;
 +      for(i=0; (i<molt->ilist[bondtypes[j]].nr); ) {
 +        type  = ia[0];
 +        ftype = ffparams->functype[type];
 +        nra   = interaction_function[ftype].nratoms;
 +        
 +        /* Check whether we have a bond with a shell */
 +        aS = NO_ATID;
 +        
 +        switch (bondtypes[j]) {
 +        case F_BONDS:
 +        case F_HARMONIC:
 +        case F_CUBICBONDS:
 +        case F_POLARIZATION:
++        case F_ANHARM_POL:
 +          /* Two-atom interactions: either atom may be the shell */
 +          if (atom[ia[1]].ptype == eptShell) {
 +            aS = ia[1];
 +            aN = ia[2];
 +          }
 +          else if (atom[ia[2]].ptype == eptShell) {
 +            aS = ia[2];
 +            aN = ia[1];
 +          }
 +          break;
 +        case F_WATER_POL:
 +          aN    = ia[4];  /* Dummy */
 +          aS    = ia[5];  /* Shell */
 +          break;
 +        default:
 +          gmx_fatal(FARGS,"Death Horror: %s, %d",__FILE__,__LINE__);
 +        }
 +        
 +        if (aS != NO_ATID) {    
 +          qS = atom[aS].q;
 +          
 +          /* Check whether one of the particles is a shell... */
 +          nsi = shell_index[a_offset+aS];
 +          if ((nsi < 0) || (nsi >= nshell))
 +            gmx_fatal(FARGS,"nsi is %d should be within 0 - %d. aS = %d",
 +                      nsi,nshell,aS);
 +          if (shell[nsi].shell == NO_ATID) {
 +            /* First interaction found for this shell */
 +            shell[nsi].shell = a_offset + aS;
 +            ns ++;
 +          }
 +          else if (shell[nsi].shell != a_offset+aS)
 +            gmx_fatal(FARGS,"Weird stuff in %s, %d",__FILE__,__LINE__);
 +          
 +          /* Store the nucleus in the first free slot */
 +          if      (shell[nsi].nucl1 == NO_ATID) {
 +            shell[nsi].nucl1 = a_offset + aN;
 +          } else if (shell[nsi].nucl2 == NO_ATID) {
 +            shell[nsi].nucl2 = a_offset + aN;
 +          } else if (shell[nsi].nucl3 == NO_ATID) {
 +            shell[nsi].nucl3 = a_offset + aN;
 +          } else {
 +            if (fplog)
 +              pr_shell(fplog,ns,shell);
 +            gmx_fatal(FARGS,"Can not handle more than three bonds per shell\n");
 +          }
 +          if (at2cg[aS] != at2cg[aN]) {
 +            /* shell[nsi].bInterCG = TRUE; */
 +            shfc->bInterCG = TRUE;
 +          }
 +          
 +          /* Accumulate the force constant over all bonds of this shell */
 +          switch (bondtypes[j]) {
 +          case F_BONDS:
 +          case F_HARMONIC:
 +            shell[nsi].k    += ffparams->iparams[type].harmonic.krA;
 +            break;
 +          case F_CUBICBONDS:
 +            shell[nsi].k    += ffparams->iparams[type].cubic.kb;
 +            break;
 +          case F_POLARIZATION:
++          case F_ANHARM_POL:
 +            if (qS != atom[aS].qB)
 +              gmx_fatal(FARGS,"polarize can not be used with qA != qB");
 +            shell[nsi].k    += sqr(qS)*ONE_4PI_EPS0/
 +              ffparams->iparams[type].polarize.alpha;
 +            break;
 +          case F_WATER_POL:
 +            if (qS != atom[aS].qB)
 +              gmx_fatal(FARGS,"water_pol can not be used with qA != qB");
 +            /* Use the average of the three polarizability components */
 +            alpha          = (ffparams->iparams[type].wpol.al_x+
 +                              ffparams->iparams[type].wpol.al_y+
 +                              ffparams->iparams[type].wpol.al_z)/3.0;
 +            shell[nsi].k  += sqr(qS)*ONE_4PI_EPS0/alpha;
 +            break;
 +          default:
 +            gmx_fatal(FARGS,"Death Horror: %s, %d",__FILE__,__LINE__);
 +          }
 +          shell[nsi].nnucl++;
 +        }
 +        /* Advance past the type entry and the nra atom entries */
 +        ia += nra+1;
 +        i  += nra+1;
 +      }
 +      }
 +      a_offset += molt->atoms.nr;
 +    }
 +    /* Done with this molecule type */
 +    sfree(at2cg);
 +  }
 +  
 +  /* Verify whether it's all correct */
 +  if (ns != nshell)
 +    gmx_fatal(FARGS,"Something weird with shells. They may not be bonded to something");
 +  
 +  for(i=0; (i<ns); i++)
 +    shell[i].k_1 = 1.0/shell[i].k;
 +  
 +  if (debug)
 +    pr_shell(debug,ns,shell);
 +
 +  
 +  shfc->nshell_gl      = ns;
 +  shfc->shell_gl       = shell;
 +  shfc->shell_index_gl = shell_index;
 +
 +  /* Prediction settings can be overridden through the environment */
 +  shfc->bPredict   = (getenv("GMX_NOPREDICT") == NULL);
 +  shfc->bForceInit = FALSE;
 +  if (!shfc->bPredict) {
 +    if (fplog)
 +      fprintf(fplog,"\nWill never predict shell positions\n");
 +  } else {
 +    shfc->bForceInit = (getenv("GMX_FORCEINIT") != NULL);
 +    if (shfc->bForceInit && fplog)
 +      fprintf(fplog,"\nWill always initiate shell positions\n");
 +  }
 +
 +  if (shfc->bPredict) {
 +    if (x) {
 +      /* No masses available here, so they are taken from mtop */
 +      predict_shells(fplog,x,NULL,0,shfc->nshell_gl,shfc->shell_gl,
 +                   NULL,mtop,TRUE);
 +    }
 +
 +    if (shfc->bInterCG) {
 +      if (fplog)
 +      fprintf(fplog,"\nNOTE: there all shells that are connected to particles outside thier own charge group, will not predict shells positions during the run\n\n");
 +      shfc->bPredict = FALSE;
 +    }
 +  }
 +
 +  return shfc;
 +}
 +
 +void make_local_shells(t_commrec *cr,t_mdatoms *md,
 +                     struct gmx_shellfc *shfc)
 +{
 +  t_shell *shell;
 +  int a0,a1,*ind,nshell,i;
 +  gmx_domdec_t *dd=NULL;
 +
 +  if (PAR(cr)) {
 +    if (DOMAINDECOMP(cr)) {
 +      dd = cr->dd;
 +      a0 = 0;
 +      a1 = dd->nat_home;
 +    } else {
 +      pd_at_range(cr,&a0,&a1);
 +    }
 +  } else {
 +    /* Single node: we need all shells, just copy the pointer */
 +    shfc->nshell = shfc->nshell_gl;
 +    shfc->shell  = shfc->shell_gl;
 +    
 +    return;
 +  }
 +
 +  ind = shfc->shell_index_gl;
 +
 +  nshell = 0;
 +  shell  = shfc->shell; 
 +  for(i=a0; i<a1; i++) {
 +    if (md->ptype[i] == eptShell) {
 +      if (nshell+1 > shfc->shell_nalloc) {
 +      shfc->shell_nalloc = over_alloc_dd(nshell+1);
 +      srenew(shell,shfc->shell_nalloc);
 +      }
 +      if (dd) {
 +      shell[nshell] = shfc->shell_gl[ind[dd->gatindex[i]]];
 +      } else {
 +      shell[nshell] = shfc->shell_gl[ind[i]];
 +      }
 +      /* With inter-cg shells we can no do shell prediction,
 +       * so we do not need the nuclei numbers.
 +       */
 +      if (!shfc->bInterCG) {
 +      shell[nshell].nucl1   = i + shell[nshell].nucl1 - shell[nshell].shell;
 +      if (shell[nshell].nnucl > 1)
 +        shell[nshell].nucl2 = i + shell[nshell].nucl2 - shell[nshell].shell;
 +      if (shell[nshell].nnucl > 2)
 +        shell[nshell].nucl3 = i + shell[nshell].nucl3 - shell[nshell].shell;
 +      }
 +      shell[nshell].shell = i;
 +      nshell++;
 +    }
 +  }
 +
 +  shfc->nshell = nshell;
 +  shfc->shell  = shell;
 +}
 +
 +static void do_1pos(rvec xnew,rvec xold,rvec f,real step)
 +{
 +  real xo,yo,zo;
 +  real dx,dy,dz;
 +  
 +  xo=xold[XX];
 +  yo=xold[YY];
 +  zo=xold[ZZ];
 +
 +  dx=f[XX]*step;
 +  dy=f[YY]*step;
 +  dz=f[ZZ]*step;
 +
 +  xnew[XX]=xo+dx;
 +  xnew[YY]=yo+dy;
 +  xnew[ZZ]=zo+dz;
 +}
 +
 +static void do_1pos3(rvec xnew,rvec xold,rvec f,rvec step)
 +{
 +  real xo,yo,zo;
 +  real dx,dy,dz;
 +  
 +  xo=xold[XX];
 +  yo=xold[YY];
 +  zo=xold[ZZ];
 +
 +  dx=f[XX]*step[XX];
 +  dy=f[YY]*step[YY];
 +  dz=f[ZZ]*step[ZZ];
 +
 +  xnew[XX]=xo+dx;
 +  xnew[YY]=yo+dy;
 +  xnew[ZZ]=zo+dz;
 +}
 +
 +static void directional_sd(FILE *log,rvec xold[],rvec xnew[],rvec acc_dir[],
 +                         int start,int homenr,real step)
 +{
 +  int  i;
 +
 +  for(i=start; i<homenr; i++)
 +    do_1pos(xnew[i],xold[i],acc_dir[i],step);
 +}
 +
 +/* One steepest descent step for all ns shells: xnew = xcur + f*step,
 + * with a step size per shell and per dimension.
 + *
 + * At the first iteration (count == 1) the step is set to the inverse
 + * force constant k_1. At later iterations it is adapted using the change
 + * in position and force since the previous call. The current position
 + * and force are stored in the shell for use by the next call.
 + * The log argument is not used.
 + */
 +static void shell_pos_sd(FILE *log,rvec xcur[],rvec xnew[],rvec f[],
 +                       int ns,t_shell s[],int count)
 +{
 +  int  i,shell,d;
 +  real dx,df,k_est;
 +#ifdef PRINT_STEP  
 +  real step_min,step_max;
 +
 +  step_min = 1e30;
 +  step_max = 0;
 +#endif
 +  for(i=0; (i<ns); i++) {
 +    shell = s[i].shell;
 +    if (count == 1) {
 +      for(d=0; d<DIM; d++) {
 +      s[i].step[d] = s[i].k_1;
 +#ifdef PRINT_STEP
 +      step_min = min(step_min,s[i].step[d]);
 +      step_max = max(step_max,s[i].step[d]);
 +#endif
 +      }
 +    } else {
 +      for(d=0; d<DIM; d++) {
 +      dx = xcur[shell][d] - s[i].xold[d];
 +      df =    f[shell][d] - s[i].fold[d];
 +      if (dx != 0 && df != 0) {
 +        /* Estimate of the step size from the last displacement and force change */
 +        k_est = -dx/df;
 +        if (k_est >= 2*s[i].step[d]) {
 +          /* Estimate is much larger: grow the step, but only by 20% */
 +          s[i].step[d] *= 1.2;
 +        } else if (k_est <= 0) {
 +          /* Non-positive estimate: shrink the step */
 +          s[i].step[d] *= 0.8;
 +        } else {
 +          /* Blend the current step with the new estimate */
 +          s[i].step[d] = 0.8*s[i].step[d] + 0.2*k_est;
 +        }
 +      } else if (dx != 0) {
 +        /* The position changed but the force did not: grow the step */
 +        s[i].step[d] *= 1.2;
 +      }
 +#ifdef PRINT_STEP
 +      step_min = min(step_min,s[i].step[d]);
 +      step_max = max(step_max,s[i].step[d]);
 +#endif
 +      }
 +    }
 +    /* Remember position and force for the next step size update */
 +    copy_rvec(xcur[shell],s[i].xold);
 +    copy_rvec(f[shell],   s[i].fold);
 +
 +    do_1pos3(xnew[shell],xcur[shell],f[shell],s[i].step);
 +
 +    if (gmx_debug_at) {
 +      fprintf(debug,"shell[%d] = %d\n",i,shell);
 +      pr_rvec(debug,0,"fshell",f[shell],DIM,TRUE);
 +      pr_rvec(debug,0,"xold",xcur[shell],DIM,TRUE);
 +      pr_rvec(debug,0,"step",s[i].step,DIM,TRUE);
 +      pr_rvec(debug,0,"xnew",xnew[shell],DIM,TRUE);
 +    }
 +  }
 +#ifdef PRINT_STEP
 +  printf("step %.3e %.3e\n",step_min,step_max);
 +#endif
 +}
 +
 +static void decrease_step_size(int nshell,t_shell s[])
 +{
 +  int i;
 +  
 +  for(i=0; i<nshell; i++)
 +    svmul(0.8,s[i].step,s[i].step);
 +}
 +
 +static void print_epot(FILE *fp,gmx_large_int_t mdstep,int count,real epot,real df,
 +                     int ndir,real sf_dir)
 +{
 +  char buf[22];
 +
 +  fprintf(fp,"MDStep=%5s/%2d EPot: %12.8e, rmsF: %6.2e",
 +        gmx_step_str(mdstep,buf),count,epot,df);
 +  if (ndir)
 +    fprintf(fp,", dir. rmsF: %6.2e\n",sqrt(sf_dir/ndir));
 +  else
 +    fprintf(fp,"\n");
 +}
 +
 +
 +static real rms_force(t_commrec *cr,rvec f[],int ns,t_shell s[],
 +                    int ndir,real *sf_dir,real *Epot)
 +{
 +  int  i,shell,ntot;
 +  double buf[4];
 +
 +  buf[0] = *sf_dir;
 +  for(i=0; i<ns; i++) {
 +    shell = s[i].shell;
 +    buf[0]  += norm2(f[shell]);
 +  }
 +  ntot = ns;
 +
 +  if (PAR(cr)) {
 +    buf[1] = ntot;
 +    buf[2] = *sf_dir;
 +    buf[3] = *Epot;
 +    gmx_sumd(4,buf,cr);
 +    ntot = (int)(buf[1] + 0.5);
 +    *sf_dir = buf[2];
 +    *Epot   = buf[3];
 +  }
 +  ntot += ndir;
 +
 +  return (ntot ? sqrt(buf[0]/ntot) : 0);
 +}
 +
 +static void check_pbc(FILE *fp,rvec x[],int shell)
 +{
 +  int m,now;
 +  
 +  now = shell-4;
 +  for(m=0; (m<DIM); m++)
 +    if (fabs(x[shell][m]-x[now][m]) > 0.3) {
 +      pr_rvecs(fp,0,"SHELL-X",x+now,5);
 +      break;
 +    }
 +}
 +
 +static void dump_shells(FILE *fp,rvec x[],rvec f[],real ftol,int ns,t_shell s[])
 +{
 +  int  i,shell;
 +  real ft2,ff2;
 +  
 +  ft2 = sqr(ftol);
 +  
 +  for(i=0; (i<ns); i++) {
 +    shell = s[i].shell;
 +    ff2   = iprod(f[shell],f[shell]);
 +    if (ff2 > ft2)
 +      fprintf(fp,"SHELL %5d, force %10.5f  %10.5f  %10.5f, |f| %10.5f\n",
 +            shell,f[shell][XX],f[shell][YY],f[shell][ZZ],sqrt(ff2));
 +    check_pbc(fp,x,shell);
 +  }
 +}
 +
 +/* Compute the acceleration directions acc_dir for flexible constraints.
 + *
 + * For atoms start..end-1 two sets of positions are built in the work
 + * arrays of shfc (indexed from 0, i.e. by n-start): xnold, derived from
 + * x, x_init and x_old, and xnew, an extrapolation from x_old, x and the
 + * force f. Both are constrained against x. xnew is then overwritten with
 + * minus the acceleration and projected with econqDeriv_FlexCon, using
 + * x_old as reference, into acc_dir.
 + *
 + * The work arrays are grown as needed; with domain decomposition their
 + * size is taken from dd_ac1 instead of end-start.
 + * NOTE(review): the locals gf, ga, gt, scale, p and dx appear to be
 + * unused in this function.
 + */
 +static void init_adir(FILE *log,gmx_shellfc_t shfc,
 +                    gmx_constr_t constr,t_idef *idef,t_inputrec *ir,
 +                    t_commrec *cr,int dd_ac1,
 +                    gmx_large_int_t step,t_mdatoms *md,int start,int end,
 +                    rvec *x_old,rvec *x_init,rvec *x,
 +                    rvec *f,rvec *acc_dir,matrix box,
 +                    real lambda,real *dvdlambda,t_nrnb *nrnb)
 +{
 +  rvec   *xnold,*xnew;
 +  double w_dt;
 +  int    gf,ga,gt;
 +  real   dt,scale;
 +  int    n,d; 
 +  unsigned short *ptype;
 +  rvec   p,dx;
 +  
 +  /* Make sure the work arrays are large enough */
 +  if (DOMAINDECOMP(cr))
 +    n = dd_ac1;
 +  else
 +    n = end - start;
 +  if (n > shfc->adir_nalloc) {
 +    shfc->adir_nalloc = over_alloc_dd(n);
 +    srenew(shfc->adir_xnold,shfc->adir_nalloc);
 +    srenew(shfc->adir_xnew ,shfc->adir_nalloc);
 +  }
 +  xnold = shfc->adir_xnold;
 +  xnew  = shfc->adir_xnew;
 +    
 +  ptype = md->ptype;
 +
 +  dt = ir->delta_t;
 +
 +  /* Does NOT work with freeze or acceleration groups (yet) */
 +  for (n=start; n<end; n++) {  
 +    w_dt = md->invmass[n]*dt;
 +    
 +    for (d=0; d<DIM; d++) {
 +      if ((ptype[n] != eptVSite) && (ptype[n] != eptShell)) {
 +      xnold[n-start][d] = x[n][d] - (x_init[n][d] - x_old[n][d]);
 +      xnew[n-start][d] = 2*x[n][d] - x_old[n][d] + f[n][d]*w_dt*dt;
 +      } else {
 +      /* Virtual sites and shells are left at their current position */
 +      xnold[n-start][d] = x[n][d];
 +      xnew[n-start][d] = x[n][d];
 +      }
 +    }
 +  }
 +  /* The work arrays start at 0; subtracting start lets them be indexed
 +   * by atom number like x.
 +   */
 +  constrain(log,FALSE,FALSE,constr,idef,ir,NULL,cr,step,0,md,
 +          x,xnold-start,NULL,box,
 +          lambda,dvdlambda,NULL,NULL,nrnb,econqCoord,FALSE,0,0);
 +  constrain(log,FALSE,FALSE,constr,idef,ir,NULL,cr,step,0,md,
 +          x,xnew-start,NULL,box,
 +          lambda,dvdlambda,NULL,NULL,nrnb,econqCoord,FALSE,0,0);
 +
 +  /* Set xnew to minus the acceleration */
 +  for (n=start; n<end; n++) {
 +    for(d=0; d<DIM; d++)
 +      xnew[n-start][d] =
 +      -(2*x[n][d]-xnold[n-start][d]-xnew[n-start][d])/sqr(dt)
 +      - f[n][d]*md->invmass[n];
 +    clear_rvec(acc_dir[n]);
 +  }
 +
 +  /* Project the acceleration on the old bond directions */
 +  constrain(log,FALSE,FALSE,constr,idef,ir,NULL,cr,step,0,md,
 +          x_old,xnew-start,acc_dir,box,
 +          lambda,dvdlambda,NULL,NULL,nrnb,econqDeriv_FlexCon,FALSE,0,0); 
 +}
 +
 +/* Relax the shell positions (and flexible constraints) for one MD step.
 + *
 + * The forces are first computed at the (optionally predicted) shell
 + * positions. While the rms force is not below inputrec->em_tol and the
 + * iteration count is below inputrec->niter, a steepest descent trial step
 + * is made and the forces are recomputed; the trial is accepted when it
 + * lowers the rms force, otherwise the shell step sizes are decreased.
 + *
 + * Two position/force buffers are used, indexed by Min (best so far) and
 + * Try (the other one). On return the best positions are copied into
 + * state->x, the matching forces into f, and *bConverged tells whether the
 + * tolerance was reached. The return value is the final value of the
 + * iteration counter.
 + */
 +int relax_shell_flexcon(FILE *fplog,t_commrec *cr,gmx_bool bVerbose,
 +                      gmx_large_int_t mdstep,t_inputrec *inputrec,
 +                      gmx_bool bDoNS,int force_flags,
 +                      gmx_bool bStopCM,
 +                      gmx_localtop_t *top,
 +                      gmx_mtop_t* mtop,
 +                      gmx_constr_t constr,
 +                      gmx_enerdata_t *enerd,t_fcdata *fcd,
 +                      t_state *state,rvec f[],
 +                      tensor force_vir,
 +                      t_mdatoms *md,
 +                      t_nrnb *nrnb,gmx_wallcycle_t wcycle,
 +                      t_graph *graph,
 +                      gmx_groups_t *groups,
 +                      struct gmx_shellfc *shfc,
 +                      t_forcerec *fr,
 +                      gmx_bool bBornRadii,
 +                      double t,rvec mu_tot,
 +                      int natoms,gmx_bool *bConverged,
 +                      gmx_vsite_t *vsite,
 +                      FILE *fp_field)
 +{
 +  int    nshell;
 +  t_shell *shell;
 +  t_idef *idef;
 +  rvec   *pos[2],*force[2],*acc_dir=NULL,*x_old=NULL;
 +  real   Epot[2],df[2];
 +  rvec   dx;
 +  real   sf_dir,invdt;
 +  real   ftol,xiH,xiS,dum=0;
 +  char   sbuf[22];
 +  gmx_bool   bCont,bInit;
 +  int    nat,dd_ac0,dd_ac1=0,i;
 +  int    start=md->start,homenr=md->homenr,end=start+homenr,cg0,cg1;
 +  int    nflexcon,g,number_steps,d,Min=0,count=0;
 +#define  Try (1-Min)             /* At start Try = 1 */
 +
 +  bCont        = (mdstep == inputrec->init_step) && inputrec->bContinuation;
 +  bInit        = (mdstep == inputrec->init_step) || shfc->bForceInit;
 +  ftol         = inputrec->em_tol;
 +  number_steps = inputrec->niter;
 +  nshell       = shfc->nshell;
 +  shell        = shfc->shell;
 +  nflexcon     = shfc->nflexcon;
 +
 +  idef = &top->idef;
 +
 +  /* Determine the number of atoms the work arrays have to hold */
 +  if (DOMAINDECOMP(cr)) {
 +    nat = dd_natoms_vsite(cr->dd);
 +    if (nflexcon > 0) {
 +      dd_get_constraint_range(cr->dd,&dd_ac0,&dd_ac1);
 +      nat = max(nat,dd_ac1);
 +    }
 +  } else {
 +    nat = state->natoms;
 +  }
 +
 +  if (nat > shfc->x_nalloc) {
 +    /* Allocate local arrays */
 +    shfc->x_nalloc = over_alloc_dd(nat);
 +    for(i=0; (i<2); i++) {
 +      srenew(shfc->x[i],shfc->x_nalloc);
 +      srenew(shfc->f[i],shfc->x_nalloc);
 +    }
 +  }
 +  for(i=0; (i<2); i++) {
 +    pos[i]   = shfc->x[i];
 +    force[i] = shfc->f[i];
 +  }
 +     
 +  /* With particle decomposition this code only works
 +   * when all particles involved with each shell are in the same cg.
 +   */
 +
 +  if (bDoNS && inputrec->ePBC != epbcNONE && !DOMAINDECOMP(cr)) {
 +    /* This is the only time where the coordinates are used
 +     * before do_force is called, which normally puts all
 +     * charge groups in the box.
 +     */
 +    if (PARTDECOMP(cr)) {
 +      pd_cg_range(cr,&cg0,&cg1);
 +    } else {
 +      cg0 = 0;
 +      cg1 = top->cgs.nr;
 +    }
 +    put_charge_groups_in_box(fplog,cg0,cg1,fr->ePBC,state->box,
 +                           &(top->cgs),state->x,fr->cg_cm);
 +    if (graph)
 +      mk_mshift(fplog,graph,fr->ePBC,state->box,state->x);
 +  }
 +
 +  /* After this all coordinate arrays will contain whole molecules */
 +  if (graph)
 +    shift_self(graph,state->box,state->x);
 +
 +  if (nflexcon) {
 +    if (nat > shfc->flex_nalloc) {
 +      shfc->flex_nalloc = over_alloc_dd(nat);
 +      srenew(shfc->acc_dir,shfc->flex_nalloc);
 +      srenew(shfc->x_old,shfc->flex_nalloc);
 +    }
 +    acc_dir = shfc->acc_dir;
 +    x_old   = shfc->x_old;
 +    /* Reconstruct the positions of one time step ago from the velocities */
 +    for(i=0; i<homenr; i++) {
 +      for(d=0; d<DIM; d++)
 +        shfc->x_old[i][d] =
 +        state->x[start+i][d] - state->v[start+i][d]*inputrec->delta_t;
 +    }
 +  }
 +
 +  /* Do a prediction of the shell positions */
 +  if (shfc->bPredict && !bCont) {
 +    predict_shells(fplog,state->x,state->v,inputrec->delta_t,nshell,shell,
 +                 md->massT,NULL,bInit);
 +  }
 +
 +  /* do_force expected the charge groups to be in the box */
 +  if (graph)
 +    unshift_self(graph,state->box,state->x);
 +
 +  /* Calculate the forces first time around */
 +  if (gmx_debug_at) {
 +    pr_rvecs(debug,0,"x b4 do_force",state->x + start,homenr);
 +  }
 +  do_force(fplog,cr,inputrec,mdstep,nrnb,wcycle,top,mtop,groups,
 +         state->box,state->x,&state->hist,
 +         force[Min],force_vir,md,enerd,fcd,
 +         state->lambda,graph,
 +         fr,vsite,mu_tot,t,fp_field,NULL,bBornRadii,
 +         (bDoNS ? GMX_FORCE_NS : 0) | force_flags);
 +
 +  sf_dir = 0;
 +  if (nflexcon) {
 +    /* x_old and acc_dir are indexed from 0, so they are passed shifted
 +     * by start to allow indexing by atom number.
 +     */
 +    init_adir(fplog,shfc,
 +            constr,idef,inputrec,cr,dd_ac1,mdstep,md,start,end,
 +            shfc->x_old-start,state->x,state->x,force[Min],
 +            shfc->acc_dir-start,state->box,state->lambda,&dum,nrnb);
 +
 +    for(i=start; i<end; i++)
 +      sf_dir += md->massT[i]*norm2(shfc->acc_dir[i-start]);
 +  }
 +
 +  Epot[Min] = enerd->term[F_EPOT];
 +
 +  /* shfc->f[Min] is the same array as force[Min] */
 +  df[Min]=rms_force(cr,shfc->f[Min],nshell,shell,nflexcon,&sf_dir,&Epot[Min]);
 +  df[Try]=0;
 +  if (debug) {
 +    fprintf(debug,"df = %g  %g\n",df[Min],df[Try]);
 +  }
 +
 +  if (gmx_debug_at) {
 +    pr_rvecs(debug,0,"force0",force[Min],md->nr);
 +  }
 +
 +  if (nshell+nflexcon > 0) {
 +    /* Copy x to pos[Min] & pos[Try]: during minimization only the
 +     * shell positions are updated, therefore the other particles must
 +     * be set here.
 +     */
 +    memcpy(pos[Min],state->x,nat*sizeof(state->x[0]));
 +    memcpy(pos[Try],state->x,nat*sizeof(state->x[0]));
 +  }
 +  
 +  if (bVerbose && MASTER(cr))
 +    print_epot(stdout,mdstep,0,Epot[Min],df[Min],nflexcon,sf_dir);
 +
 +  if (debug) {
 +    fprintf(debug,"%17s: %14.10e\n",
 +          interaction_function[F_EKIN].longname,enerd->term[F_EKIN]);
 +    fprintf(debug,"%17s: %14.10e\n",
 +          interaction_function[F_EPOT].longname,enerd->term[F_EPOT]);
 +    fprintf(debug,"%17s: %14.10e\n",
 +          interaction_function[F_ETOT].longname,enerd->term[F_ETOT]);
 +    fprintf(debug,"SHELLSTEP %s\n",gmx_step_str(mdstep,sbuf));
 +  }
 +  
 +  /* First check whether we should do shells, or whether the force is 
 +   * low enough even without minimization.
 +   */
 +  *bConverged = (df[Min] < ftol);
 +  
 +  for(count=1; (!(*bConverged) && (count < number_steps)); count++) {
 +    if (vsite)
 +      construct_vsites(fplog,vsite,pos[Min],nrnb,inputrec->delta_t,state->v,
 +                     idef->iparams,idef->il,
 +                     fr->ePBC,fr->bMolPBC,graph,cr,state->box);
 +     
 +    if (nflexcon) {
 +      init_adir(fplog,shfc,
 +              constr,idef,inputrec,cr,dd_ac1,mdstep,md,start,end,
 +              x_old-start,state->x,pos[Min],force[Min],acc_dir-start,
 +              state->box,state->lambda,&dum,nrnb);
 +      
 +      /* Note: the sixth argument is the exclusive end index */
 +      directional_sd(fplog,pos[Min],pos[Try],acc_dir-start,start,end,
 +                   fr->fc_stepsize);
 +    }
 +    
 +    /* New positions, Steepest descent */
 +    shell_pos_sd(fplog,pos[Min],pos[Try],force[Min],nshell,shell,count); 
 +
 +    /* do_force expected the charge groups to be in the box */
 +    if (graph)
 +      unshift_self(graph,state->box,pos[Try]);
 +
 +    if (gmx_debug_at) {
 +      pr_rvecs(debug,0,"RELAX: pos[Min]  ",pos[Min] + start,homenr);
 +      pr_rvecs(debug,0,"RELAX: pos[Try]  ",pos[Try] + start,homenr);
 +    }
 +    /* Try the new positions */
 +    /* NOTE(review): the step argument here is the literal 1, not mdstep
 +     * as in the first do_force call - confirm this is intended.
 +     */
 +    do_force(fplog,cr,inputrec,1,nrnb,wcycle,
 +           top,mtop,groups,state->box,pos[Try],&state->hist,
 +           force[Try],force_vir,
 +           md,enerd,fcd,state->lambda,graph,
 +           fr,vsite,mu_tot,t,fp_field,NULL,bBornRadii,
 +           force_flags);
 +    
 +    if (gmx_debug_at) {
 +      pr_rvecs(debug,0,"RELAX: force[Min]",force[Min] + start,homenr);
 +      pr_rvecs(debug,0,"RELAX: force[Try]",force[Try] + start,homenr);
 +    }
 +    sf_dir = 0;
 +    if (nflexcon) {
 +      init_adir(fplog,shfc,
 +              constr,idef,inputrec,cr,dd_ac1,mdstep,md,start,end,
 +              x_old-start,state->x,pos[Try],force[Try],acc_dir-start,
 +              state->box,state->lambda,&dum,nrnb);
 +
 +      for(i=start; i<end; i++)
 +      sf_dir += md->massT[i]*norm2(acc_dir[i-start]);
 +    }
 +
 +    Epot[Try] = enerd->term[F_EPOT]; 
 +    
 +    df[Try]=rms_force(cr,force[Try],nshell,shell,nflexcon,&sf_dir,&Epot[Try]);
 +
 +    if (debug)
 +      fprintf(debug,"df = %g  %g\n",df[Min],df[Try]);
 +
 +    if (debug) {
 +      if (gmx_debug_at)
 +      pr_rvecs(debug,0,"F na do_force",force[Try] + start,homenr);
 +      if (gmx_debug_at) {
 +      fprintf(debug,"SHELL ITER %d\n",count);
 +      dump_shells(debug,pos[Try],force[Try],ftol,nshell,shell);
 +      }
 +    }
 +
 +    if (bVerbose && MASTER(cr))
 +      print_epot(stdout,mdstep,count,Epot[Try],df[Try],nflexcon,sf_dir);
 +      
 +    *bConverged = (df[Try] < ftol);
 +    
 +    if ((df[Try] < df[Min])) {
 +      /* The trial step lowered the rms force: accept it */
 +      if (debug)
 +      fprintf(debug,"Swapping Min and Try\n");
 +      if (nflexcon) {
 +      /* Correct the velocities for the flexible constraints */
 +      invdt = 1/inputrec->delta_t;
 +      for(i=start; i<end; i++) {
 +        for(d=0; d<DIM; d++)
 +          state->v[i][d] += (pos[Try][i][d] - pos[Min][i][d])*invdt;
 +      }
 +      }
 +      /* Try is defined as 1-Min, so this swaps the two buffers */
 +      Min  = Try;
 +    } else {
 +      /* Rejected: retry from the same positions with smaller steps */
 +      decrease_step_size(nshell,shell);
 +    }
 +  }
 +  if (MASTER(cr) && !(*bConverged)) {
 +    /* Note that the energies and virial are incorrect when not converged */
 +    if (fplog)
 +      fprintf(fplog,
 +            "step %s: EM did not converge in %d iterations, RMS force %.3f\n",
 +            gmx_step_str(mdstep,sbuf),number_steps,df[Min]);
 +    fprintf(stderr,
 +          "step %s: EM did not converge in %d iterations, RMS force %.3f\n",
 +          gmx_step_str(mdstep,sbuf),number_steps,df[Min]);
 +  }
 +
 +  /* Copy back the coordinates and the forces */
 +  memcpy(state->x,pos[Min],nat*sizeof(state->x[0]));
 +  memcpy(f,force[Min],nat*sizeof(f[0]));
 +
 +  return count; 
 +}
 +
Simple merge
Simple merge
Simple merge
index 49f05710640cb2d70757b8f13803e936d81e1f65,0000000000000000000000000000000000000000..4764f8818accc3ebca25b699efc224d010a35582
mode 100644,000000..100644
--- /dev/null
@@@ -1,1866 -1,0 +1,1866 @@@
-                     wcycle,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#if ((defined WIN32 || defined _WIN32 || defined WIN64 || defined _WIN64) && !defined __CYGWIN__ && !defined __CYGWIN32__)
 +/* _isnan() */
 +#include <float.h>
 +#endif
 +
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "sysstuff.h"
 +#include "vec.h"
 +#include "statutil.h"
 +#include "vcm.h"
 +#include "mdebin.h"
 +#include "nrnb.h"
 +#include "calcmu.h"
 +#include "index.h"
 +#include "vsite.h"
 +#include "update.h"
 +#include "ns.h"
 +#include "trnio.h"
 +#include "xtcio.h"
 +#include "mdrun.h"
 +#include "confio.h"
 +#include "network.h"
 +#include "pull.h"
 +#include "xvgr.h"
 +#include "physics.h"
 +#include "names.h"
 +#include "xmdrun.h"
 +#include "ionize.h"
 +#include "disre.h"
 +#include "orires.h"
 +#include "dihre.h"
 +#include "pppm.h"
 +#include "pme.h"
 +#include "mdatoms.h"
 +#include "repl_ex.h"
 +#include "qmmm.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "topsort.h"
 +#include "coulomb.h"
 +#include "constr.h"
 +#include "shellfc.h"
 +#include "compute_io.h"
 +#include "mvdata.h"
 +#include "checkpoint.h"
 +#include "mtop_util.h"
 +#include "sighandler.h"
 +#include "membed.h"
 +#include "string2.h"
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#endif
 +#ifdef GMX_THREADS
 +#include "tmpi.h"
 +#endif
 +
 +#ifdef GMX_FAHCORE
 +#include "corewrap.h"
 +#endif
 +
 +
 +double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[],
 +             const output_env_t oenv, gmx_bool bVerbose,gmx_bool bCompact,
 +             int nstglobalcomm,
 +             gmx_vsite_t *vsite,gmx_constr_t constr,
 +             int stepout,t_inputrec *ir,
 +             gmx_mtop_t *top_global,
 +             t_fcdata *fcd,
 +             t_state *state_global,
 +             t_mdatoms *mdatoms,
 +             t_nrnb *nrnb,gmx_wallcycle_t wcycle,
 +             gmx_edsam_t ed,t_forcerec *fr,
 +             int repl_ex_nst,int repl_ex_seed,gmx_membed_t *membed,
 +             real cpt_period,real max_hours,
 +             const char *deviceOptions,
 +             unsigned long Flags,
 +             gmx_runtime_t *runtime)
 +{
 +    gmx_mdoutf_t *outf;
 +    gmx_large_int_t step,step_rel;
 +    double     run_time;
 +    double     t,t0,lam0;
 +    gmx_bool       bGStatEveryStep,bGStat,bNstEner,bCalcEnerPres;
 +    gmx_bool       bNS,bNStList,bSimAnn,bStopCM,bRerunMD,bNotLastFrame=FALSE,
 +               bFirstStep,bStateFromTPX,bInitStep,bLastStep,
 +               bBornRadii,bStartingFromCpt;
 +    gmx_bool       bDoDHDL=FALSE;
 +    gmx_bool       do_ene,do_log,do_verbose,bRerunWarnNoV=TRUE,
 +               bForceUpdate=FALSE,bCPT;
 +    int        mdof_flags;
 +    gmx_bool       bMasterState;
 +    int        force_flags,cglo_flags;
 +    tensor     force_vir,shake_vir,total_vir,tmp_vir,pres;
 +    int        i,m;
 +    t_trxstatus *status;
 +    rvec       mu_tot;
 +    t_vcm      *vcm;
 +    t_state    *bufstate=NULL;   
 +    matrix     *scale_tot,pcoupl_mu,M,ebox;
 +    gmx_nlheur_t nlh;
 +    t_trxframe rerun_fr;
 +    gmx_repl_ex_t repl_ex=NULL;
 +    int        nchkpt=1;
 +
 +    gmx_localtop_t *top;      
 +    t_mdebin *mdebin=NULL;
 +    t_state    *state=NULL;
 +    rvec       *f_global=NULL;
 +    int        n_xtc=-1;
 +    rvec       *x_xtc=NULL;
 +    gmx_enerdata_t *enerd;
 +    rvec       *f=NULL;
 +    gmx_global_stat_t gstat;
 +    gmx_update_t upd=NULL;
 +    t_graph    *graph=NULL;
 +    globsig_t   gs;
 +
 +    gmx_bool        bFFscan;
 +    gmx_groups_t *groups;
 +    gmx_ekindata_t *ekind, *ekind_save;
 +    gmx_shellfc_t shellfc;
 +    int         count,nconverged=0;
 +    real        timestep=0;
 +    double      tcount=0;
 +    gmx_bool        bIonize=FALSE;
 +    gmx_bool        bTCR=FALSE,bConverged=TRUE,bOK,bSumEkinhOld,bExchanged;
 +    gmx_bool        bAppend;
 +    gmx_bool        bResetCountersHalfMaxH=FALSE;
 +    gmx_bool        bVV,bIterations,bFirstIterate,bTemp,bPres,bTrotter;
 +    real        temp0,mu_aver=0,dvdl;
 +    int         a0,a1,gnx=0,ii;
 +    atom_id     *grpindex=NULL;
 +    char        *grpname;
 +    t_coupl_rec *tcr=NULL;
 +    rvec        *xcopy=NULL,*vcopy=NULL,*cbuf=NULL;
 +    matrix      boxcopy={{0}},lastbox;
 +      tensor      tmpvir;
 +      real        fom,oldfom,veta_save,pcurr,scalevir,tracevir;
 +      real        vetanew = 0;
 +    double      cycles;
 +      real        saved_conserved_quantity = 0;
 +    real        last_ekin = 0;
 +      int         iter_i;
 +      t_extmass   MassQ;
 +    int         **trotter_seq; 
 +    char        sbuf[STEPSTRSIZE],sbuf2[STEPSTRSIZE];
 +    int         handled_stop_condition=gmx_stop_cond_none; /* compare to get_stop_condition*/
 +    gmx_iterate_t iterate;
 +    gmx_large_int_t multisim_nsteps=-1; /* number of steps to do  before first multisim 
 +                                          simulation stops. If equal to zero, don't
 +                                          communicate any more between multisims.*/
 +#ifdef GMX_FAHCORE
 +    /* Temporary addition for FAHCORE checkpointing */
 +    int chkpt_ret;
 +#endif
 +
 +    /* Check for special mdrun options */
 +    bRerunMD = (Flags & MD_RERUN);
 +    bIonize  = (Flags & MD_IONIZE);
 +    bFFscan  = (Flags & MD_FFSCAN);
 +    bAppend  = (Flags & MD_APPENDFILES);
 +    if (Flags & MD_RESETCOUNTERSHALFWAY)
 +    {
 +        if (ir->nsteps > 0)
 +        {
 +            /* Signal to reset the counters half the simulation steps. */
 +            wcycle_set_reset_counters(wcycle,ir->nsteps/2);
 +        }
 +        /* Signal to reset the counters halfway the simulation time. */
 +        bResetCountersHalfMaxH = (max_hours > 0);
 +    }
 +
 +    /* md-vv uses averaged full step velocities for T-control 
 +       md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
 +       md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
 +    bVV = EI_VV(ir->eI);
 +    if (bVV) /* to store the initial velocities while computing virial */
 +    {
 +        snew(cbuf,top_global->natoms);
 +    }
 +    /* all the iterative cases - only if there are constraints */
 +    bIterations = ((IR_NPT_TROTTER(ir)) && (constr) && (!bRerunMD));
 +    bTrotter = (bVV && (IR_NPT_TROTTER(ir) || (IR_NVT_TROTTER(ir))));        
 +    
 +    if (bRerunMD)
 +    {
 +        /* Since we don't know if the frames read are related in any way,
 +         * rebuild the neighborlist at every step.
 +         */
 +        ir->nstlist       = 1;
 +        ir->nstcalcenergy = 1;
 +        nstglobalcomm     = 1;
 +    }
 +
 +    check_ir_old_tpx_versions(cr,fplog,ir,top_global);
 +
 +    nstglobalcomm = check_nstglobalcomm(fplog,cr,nstglobalcomm,ir);
 +    bGStatEveryStep = (nstglobalcomm == 1);
 +
 +    if (!bGStatEveryStep && ir->nstlist == -1 && fplog != NULL)
 +    {
 +        fprintf(fplog,
 +                "To reduce the energy communication with nstlist = -1\n"
 +                "the neighbor list validity should not be checked at every step,\n"
 +                "this means that exact integration is not guaranteed.\n"
 +                "The neighbor list validity is checked after:\n"
 +                "  <n.list life time> - 2*std.dev.(n.list life time)  steps.\n"
 +                "In most cases this will result in exact integration.\n"
 +                "This reduces the energy communication by a factor of 2 to 3.\n"
 +                "If you want less energy communication, set nstlist > 3.\n\n");
 +    }
 +
 +    if (bRerunMD || bFFscan)
 +    {
 +        ir->nstxtcout = 0;
 +    }
 +    groups = &top_global->groups;
 +
 +    /* Initial values */
 +    init_md(fplog,cr,ir,oenv,&t,&t0,&state_global->lambda,&lam0,
 +            nrnb,top_global,&upd,
 +            nfile,fnm,&outf,&mdebin,
 +            force_vir,shake_vir,mu_tot,&bSimAnn,&vcm,state_global,Flags);
 +
 +    clear_mat(total_vir);
 +    clear_mat(pres);
 +    /* Energy terms and groups */
 +    snew(enerd,1);
 +    init_enerdata(top_global->groups.grps[egcENER].nr,ir->n_flambda,enerd);
 +    if (DOMAINDECOMP(cr))
 +    {
 +        f = NULL;
 +    }
 +    else
 +    {
 +        snew(f,top_global->natoms);
 +    }
 +
 +    /* Kinetic energy data */
 +    snew(ekind,1);
 +    init_ekindata(fplog,top_global,&(ir->opts),ekind);
 +    /* needed for iteration of constraints */
 +    snew(ekind_save,1);
 +    init_ekindata(fplog,top_global,&(ir->opts),ekind_save);
 +    /* Copy the cos acceleration to the groups struct */    
 +    ekind->cosacc.cos_accel = ir->cos_accel;
 +
 +    gstat = global_stat_init(ir);
 +    debug_gmx();
 +
 +    /* Check for polarizable models and flexible constraints */
 +    shellfc = init_shell_flexcon(fplog,
 +                                 top_global,n_flexible_constraints(constr),
 +                                 (ir->bContinuation || 
 +                                  (DOMAINDECOMP(cr) && !MASTER(cr))) ?
 +                                 NULL : state_global->x);
 +
 +    if (DEFORM(*ir))
 +    {
 +#ifdef GMX_THREADS
 +        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
 +#endif
 +        set_deform_reference_box(upd,
 +                                 deform_init_init_step_tpx,
 +                                 deform_init_box_tpx);
 +#ifdef GMX_THREADS
 +        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
 +#endif
 +    }
 +
 +    {
 +        double io = compute_io(ir,top_global->natoms,groups,mdebin->ebin->nener,1);
 +        if ((io > 2000) && MASTER(cr))
 +            fprintf(stderr,
 +                    "\nWARNING: This run will generate roughly %.0f Mb of data\n\n",
 +                    io);
 +    }
 +
 +    if (DOMAINDECOMP(cr)) {
 +        top = dd_init_local_top(top_global);
 +
 +        snew(state,1);
 +        dd_init_local_state(cr->dd,state_global,state);
 +
 +        if (DDMASTER(cr->dd) && ir->nstfout) {
 +            snew(f_global,state_global->natoms);
 +        }
 +    } else {
 +        if (PAR(cr)) {
 +            /* Initialize the particle decomposition and split the topology */
 +            top = split_system(fplog,top_global,ir,cr);
 +
 +            pd_cg_range(cr,&fr->cg0,&fr->hcg);
 +            pd_at_range(cr,&a0,&a1);
 +        } else {
 +            top = gmx_mtop_generate_local_top(top_global,ir);
 +
 +            a0 = 0;
 +            a1 = top_global->natoms;
 +        }
 +
 +        state = partdec_init_local_state(cr,state_global);
 +        f_global = f;
 +
 +        atoms2md(top_global,ir,0,NULL,a0,a1-a0,mdatoms);
 +
 +        if (vsite) {
 +            set_vsite_top(vsite,top,mdatoms,cr);
 +        }
 +
 +        if (ir->ePBC != epbcNONE && !ir->bPeriodicMols) {
 +            graph = mk_graph(fplog,&(top->idef),0,top_global->natoms,FALSE,FALSE);
 +        }
 +
 +        if (shellfc) {
 +            make_local_shells(cr,mdatoms,shellfc);
 +        }
 +
 +        if (ir->pull && PAR(cr)) {
 +            dd_make_local_pull_groups(NULL,ir->pull,mdatoms);
 +        }
 +    }
 +
 +    if (DOMAINDECOMP(cr))
 +    {
 +        /* Distribute the charge groups over the nodes from the master node */
 +        dd_partition_system(fplog,ir->init_step,cr,TRUE,1,
 +                            state_global,top_global,ir,
 +                            state,&f,mdatoms,top,fr,
 +                            vsite,shellfc,constr,
 +                            nrnb,wcycle,FALSE);
 +    }
 +
 +    update_mdatoms(mdatoms,state->lambda);
 +
 +    if (MASTER(cr))
 +    {
 +        if (opt2bSet("-cpi",nfile,fnm))
 +        {
 +            /* Update mdebin with energy history if appending to output files */
 +            if ( Flags & MD_APPENDFILES )
 +            {
 +                restore_energyhistory_from_state(mdebin,&state_global->enerhist);
 +            }
 +            else
 +            {
 +                /* We might have read an energy history from checkpoint,
 +                 * free the allocated memory and reset the counts.
 +                 */
 +                done_energyhistory(&state_global->enerhist);
 +                init_energyhistory(&state_global->enerhist);
 +            }
 +        }
 +        /* Set the initial energy history in state by updating once */
 +        update_energyhistory(&state_global->enerhist,mdebin);
 +    } 
 +
 +    if ((state->flags & (1<<estLD_RNG)) && (Flags & MD_READ_RNG)) {
 +        /* Set the random state if we read a checkpoint file */
 +        set_stochd_state(upd,state);
 +    }
 +
 +    /* Initialize constraints */
 +    if (constr) {
 +        if (!DOMAINDECOMP(cr))
 +            set_constraints(constr,top,ir,mdatoms,cr);
 +    }
 +
 +    /* Check whether we have to do GCT stuff */
 +    bTCR = ftp2bSet(efGCT,nfile,fnm);
 +    if (bTCR) {
 +        if (MASTER(cr)) {
 +            fprintf(stderr,"Will do General Coupling Theory!\n");
 +        }
 +        gnx = top_global->mols.nr;
 +        snew(grpindex,gnx);
 +        for(i=0; (i<gnx); i++) {
 +            grpindex[i] = i;
 +        }
 +    }
 +
 +    if (repl_ex_nst > 0)
 +    {
 +        /* We need to be sure replica exchange can only occur
 +         * when the energies are current */
 +        check_nst_param(fplog,cr,"nstcalcenergy",ir->nstcalcenergy,
 +                        "repl_ex_nst",&repl_ex_nst);
 +        /* This check needs to happen before inter-simulation
 +         * signals are initialized, too */
 +    }
 +    if (repl_ex_nst > 0 && MASTER(cr))
 +        repl_ex = init_replica_exchange(fplog,cr->ms,state_global,ir,
 +                                        repl_ex_nst,repl_ex_seed);
 +
 +    if (!ir->bContinuation && !bRerunMD)
 +    {
 +        if (mdatoms->cFREEZE && (state->flags & (1<<estV)))
 +        {
 +            /* Set the velocities of frozen particles to zero */
 +            for(i=mdatoms->start; i<mdatoms->start+mdatoms->homenr; i++)
 +            {
 +                for(m=0; m<DIM; m++)
 +                {
 +                    if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
 +                    {
 +                        state->v[i][m] = 0;
 +                    }
 +                }
 +            }
 +        }
 +
 +        if (constr)
 +        {
 +            /* Constrain the initial coordinates and velocities */
 +            do_constrain_first(fplog,constr,ir,mdatoms,state,f,
 +                               graph,cr,nrnb,fr,top,shake_vir);
 +        }
 +        if (vsite)
 +        {
 +            /* Construct the virtual sites for the initial configuration */
 +            construct_vsites(fplog,vsite,state->x,nrnb,ir->delta_t,NULL,
 +                             top->idef.iparams,top->idef.il,
 +                             fr->ePBC,fr->bMolPBC,graph,cr,state->box);
 +        }
 +    }
 +
 +    debug_gmx();
 +  
 +    /* I'm assuming we need global communication the first time! MRS */
 +    cglo_flags = (CGLO_TEMPERATURE | CGLO_GSTAT
 +                  | (bVV ? CGLO_PRESSURE:0)
 +                  | (bVV ? CGLO_CONSTRAINT:0)
 +                  | (bRerunMD ? CGLO_RERUNMD:0)
 +                  | ((Flags & MD_READ_EKIN) ? CGLO_READEKIN:0));
 +    
 +    bSumEkinhOld = FALSE;
 +    compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
-                         wcycle,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
++                    NULL,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
 +                    constr,NULL,FALSE,state->box,
 +                    top_global,&pcurr,top_global->natoms,&bSumEkinhOld,cglo_flags);
 +    if (ir->eI == eiVVAK) {
 +        /* a second call to get the half step temperature initialized as well */ 
 +        /* we do the same call as above, but turn the pressure off -- internally to 
 +           compute_globals, this is recognized as a velocity verlet half-step 
 +           kinetic energy calculation.  This minimized excess variables, but 
 +           perhaps loses some logic?*/
 +        
 +        compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
++                        NULL,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
 +                        constr,NULL,FALSE,state->box,
 +                        top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
 +                        cglo_flags &~ CGLO_PRESSURE);
 +    }
 +    
 +    /* Calculate the initial half step temperature, and save the ekinh_old */
 +    if (!(Flags & MD_STARTFROMCPT)) 
 +    {
 +        for(i=0; (i<ir->opts.ngtc); i++) 
 +        {
 +            copy_mat(ekind->tcstat[i].ekinh,ekind->tcstat[i].ekinh_old);
 +        } 
 +    }
 +    if (ir->eI != eiVV) 
 +    {
 +        enerd->term[F_TEMP] *= 2; /* result of averages being done over previous and current step,
 +                                     and there is no previous step */
 +    }
 +    temp0 = enerd->term[F_TEMP];
 +    
 +    /* if using an iterative algorithm, we need to create a working directory for the state. */
 +    if (bIterations) 
 +    {
 +            bufstate = init_bufstate(state);
 +    }
 +    if (bFFscan) 
 +    {
 +        snew(xcopy,state->natoms);
 +        snew(vcopy,state->natoms);
 +        copy_rvecn(state->x,xcopy,0,state->natoms);
 +        copy_rvecn(state->v,vcopy,0,state->natoms);
 +        copy_mat(state->box,boxcopy);
 +    } 
 +    
 +    /* need to make an initiation call to get the Trotter variables set, as well as other constants for non-trotter
 +       temperature control */
 +    trotter_seq = init_npt_vars(ir,state,&MassQ,bTrotter);
 +    
 +    if (MASTER(cr))
 +    {
 +        if (constr && !ir->bContinuation && ir->eConstrAlg == econtLINCS)
 +        {
 +            fprintf(fplog,
 +                    "RMS relative constraint deviation after constraining: %.2e\n",
 +                    constr_rmsd(constr,FALSE));
 +        }
 +        fprintf(fplog,"Initial temperature: %g K\n",enerd->term[F_TEMP]);
 +        if (bRerunMD)
 +        {
 +            fprintf(stderr,"starting md rerun '%s', reading coordinates from"
 +                    " input trajectory '%s'\n\n",
 +                    *(top_global->name),opt2fn("-rerun",nfile,fnm));
 +            if (bVerbose)
 +            {
 +                fprintf(stderr,"Calculated time to finish depends on nsteps from "
 +                        "run input file,\nwhich may not correspond to the time "
 +                        "needed to process input trajectory.\n\n");
 +            }
 +        }
 +        else
 +        {
 +            char tbuf[20];
 +            fprintf(stderr,"starting mdrun '%s'\n",
 +                    *(top_global->name));
 +            if (ir->nsteps >= 0)
 +            {
 +                sprintf(tbuf,"%8.1f",(ir->init_step+ir->nsteps)*ir->delta_t);
 +            }
 +            else
 +            {
 +                sprintf(tbuf,"%s","infinite");
 +            }
 +            if (ir->init_step > 0)
 +            {
 +                fprintf(stderr,"%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
 +                        gmx_step_str(ir->init_step+ir->nsteps,sbuf),tbuf,
 +                        gmx_step_str(ir->init_step,sbuf2),
 +                        ir->init_step*ir->delta_t);
 +            }
 +            else
 +            {
 +                fprintf(stderr,"%s steps, %s ps.\n",
 +                        gmx_step_str(ir->nsteps,sbuf),tbuf);
 +            }
 +        }
 +        fprintf(fplog,"\n");
 +    }
 +
 +    /* Set and write start time */
 +    runtime_start(runtime);
 +    print_date_and_time(fplog,cr->nodeid,"Started mdrun",runtime);
 +    wallcycle_start(wcycle,ewcRUN);
 +    if (fplog)
 +        fprintf(fplog,"\n");
 +
 +    /* safest point to do file checkpointing is here.  More general point would be immediately before integrator call */
 +#ifdef GMX_FAHCORE
 +    chkpt_ret=fcCheckPointParallel( cr->nodeid,
 +                                    NULL,0);
 +    if ( chkpt_ret == 0 ) 
 +        gmx_fatal( 3,__FILE__,__LINE__, "Checkpoint error on step %d\n", 0 );
 +#endif
 +
 +    debug_gmx();
 +    /***********************************************************
 +     *
 +     *             Loop over MD steps 
 +     *
 +     ************************************************************/
 +
 +    /* if rerunMD then read coordinates and velocities from input trajectory */
 +    if (bRerunMD)
 +    {
 +        if (getenv("GMX_FORCE_UPDATE"))
 +        {
 +            bForceUpdate = TRUE;
 +        }
 +
 +        rerun_fr.natoms = 0;
 +        if (MASTER(cr))
 +        {
 +            bNotLastFrame = read_first_frame(oenv,&status,
 +                                             opt2fn("-rerun",nfile,fnm),
 +                                             &rerun_fr,TRX_NEED_X | TRX_READ_V);
 +            if (rerun_fr.natoms != top_global->natoms)
 +            {
 +                gmx_fatal(FARGS,
 +                          "Number of atoms in trajectory (%d) does not match the "
 +                          "run input file (%d)\n",
 +                          rerun_fr.natoms,top_global->natoms);
 +            }
 +            if (ir->ePBC != epbcNONE)
 +            {
 +                if (!rerun_fr.bBox)
 +                {
 +                    gmx_fatal(FARGS,"Rerun trajectory frame step %d time %f does not contain a box, while pbc is used",rerun_fr.step,rerun_fr.time);
 +                }
 +                if (max_cutoff2(ir->ePBC,rerun_fr.box) < sqr(fr->rlistlong))
 +                {
 +                    gmx_fatal(FARGS,"Rerun trajectory frame step %d time %f has too small box dimensions",rerun_fr.step,rerun_fr.time);
 +                }
 +            }
 +        }
 +
 +        if (PAR(cr))
 +        {
 +            rerun_parallel_comm(cr,&rerun_fr,&bNotLastFrame);
 +        }
 +
 +        if (ir->ePBC != epbcNONE)
 +        {
 +            /* Set the shift vectors.
 +             * Necessary here when we have a static box different from the tpr box.
 +             */
 +            calc_shifts(rerun_fr.box,fr->shift_vec);
 +        }
 +    }
 +
 +    /* loop over MD steps or if rerunMD to end of input trajectory */
 +    bFirstStep = TRUE;
 +    /* Skip the first Nose-Hoover integration when we get the state from tpx */
 +    bStateFromTPX = !opt2bSet("-cpi",nfile,fnm);
 +    bInitStep = bFirstStep && (bStateFromTPX || bVV);
 +    bStartingFromCpt = (Flags & MD_STARTFROMCPT) && bInitStep;
 +    bLastStep    = FALSE;
 +    bSumEkinhOld = FALSE;
 +    bExchanged   = FALSE;
 +
 +    init_global_signals(&gs,cr,ir,repl_ex_nst);
 +
 +    step = ir->init_step;
 +    step_rel = 0;
 +
 +    if (ir->nstlist == -1)
 +    {
 +        init_nlistheuristics(&nlh,bGStatEveryStep,step);
 +    }
 +
 +    if (MULTISIM(cr) && (repl_ex_nst <=0 ))
 +    {
 +        /* check how many steps are left in other sims */
 +        multisim_nsteps=get_multisim_nsteps(cr, ir->nsteps);
 +    }
 +
 +
 +    /* and stop now if we should */
 +    bLastStep = (bRerunMD || (ir->nsteps >= 0 && step_rel > ir->nsteps) ||
 +                 ((multisim_nsteps >= 0) && (step_rel >= multisim_nsteps )));
 +    while (!bLastStep || (bRerunMD && bNotLastFrame)) {
 +
 +        wallcycle_start(wcycle,ewcSTEP);
 +
 +        if (bRerunMD) {
 +            if (rerun_fr.bStep) {
 +                step = rerun_fr.step;
 +                step_rel = step - ir->init_step;
 +            }
 +            if (rerun_fr.bTime) {
 +                t = rerun_fr.time;
 +            }
 +            else
 +            {
 +                t = step;
 +            }
 +        } 
 +        else 
 +        {
 +            bLastStep = (step_rel == ir->nsteps);
 +            t = t0 + step*ir->delta_t;
 +        }
 +
 +        if (ir->efep != efepNO)
 +        {
 +            if (bRerunMD && rerun_fr.bLambda && (ir->delta_lambda!=0))
 +            {
 +                state_global->lambda = rerun_fr.lambda;
 +            }
 +            else
 +            {
 +                state_global->lambda = lam0 + step*ir->delta_lambda;
 +            }
 +            state->lambda = state_global->lambda;
 +            bDoDHDL = do_per_step(step,ir->nstdhdl);
 +        }
 +
 +        if (bSimAnn) 
 +        {
 +            update_annealing_target_temp(&(ir->opts),t);
 +        }
 +
 +        if (bRerunMD)
 +        {
 +            if (!(DOMAINDECOMP(cr) && !MASTER(cr)))
 +            {
 +                for(i=0; i<state_global->natoms; i++)
 +                {
 +                    copy_rvec(rerun_fr.x[i],state_global->x[i]);
 +                }
 +                if (rerun_fr.bV)
 +                {
 +                    for(i=0; i<state_global->natoms; i++)
 +                    {
 +                        copy_rvec(rerun_fr.v[i],state_global->v[i]);
 +                    }
 +                }
 +                else
 +                {
 +                    for(i=0; i<state_global->natoms; i++)
 +                    {
 +                        clear_rvec(state_global->v[i]);
 +                    }
 +                    if (bRerunWarnNoV)
 +                    {
 +                        fprintf(stderr,"\nWARNING: Some frames do not contain velocities.\n"
 +                                "         Ekin, temperature and pressure are incorrect,\n"
 +                                "         the virial will be incorrect when constraints are present.\n"
 +                                "\n");
 +                        bRerunWarnNoV = FALSE;
 +                    }
 +                }
 +            }
 +            copy_mat(rerun_fr.box,state_global->box);
 +            copy_mat(state_global->box,state->box);
 +
 +            if (vsite && (Flags & MD_RERUN_VSITE))
 +            {
 +                if (DOMAINDECOMP(cr))
 +                {
 +                    gmx_fatal(FARGS,"Vsite recalculation with -rerun is not implemented for domain decomposition, use particle decomposition");
 +                }
 +                if (graph)
 +                {
 +                    /* Following is necessary because the graph may get out of sync
 +                     * with the coordinates if we only have every N'th coordinate set
 +                     */
 +                    mk_mshift(fplog,graph,fr->ePBC,state->box,state->x);
 +                    shift_self(graph,state->box,state->x);
 +                }
 +                construct_vsites(fplog,vsite,state->x,nrnb,ir->delta_t,state->v,
 +                                 top->idef.iparams,top->idef.il,
 +                                 fr->ePBC,fr->bMolPBC,graph,cr,state->box);
 +                if (graph)
 +                {
 +                    unshift_self(graph,state->box,state->x);
 +                }
 +            }
 +        }
 +
 +        /* Stop Center of Mass motion */
 +        bStopCM = (ir->comm_mode != ecmNO && do_per_step(step,ir->nstcomm));
 +
 +        /* Copy back starting coordinates in case we're doing a forcefield scan */
 +        if (bFFscan)
 +        {
 +            for(ii=0; (ii<state->natoms); ii++)
 +            {
 +                copy_rvec(xcopy[ii],state->x[ii]);
 +                copy_rvec(vcopy[ii],state->v[ii]);
 +            }
 +            copy_mat(boxcopy,state->box);
 +        }
 +
 +        if (bRerunMD)
 +        {
 +            /* for rerun MD always do Neighbour Searching */
 +            bNS = (bFirstStep || ir->nstlist != 0);
 +            bNStList = bNS;
 +        }
 +        else
 +        {
 +            /* Determine whether or not to do Neighbour Searching and LR */
 +            bNStList = (ir->nstlist > 0  && step % ir->nstlist == 0);
 +            
 +            bNS = (bFirstStep || bExchanged || bNStList ||
 +                   (ir->nstlist == -1 && nlh.nabnsb > 0));
 +
 +            if (bNS && ir->nstlist == -1)
 +            {
 +                set_nlistheuristics(&nlh,bFirstStep || bExchanged,step);
 +            }
 +        } 
 +
 +        /* check whether we should stop because another simulation has 
 +           stopped. */
 +        if (MULTISIM(cr))
 +        {
 +            if ( (multisim_nsteps >= 0) &&  (step_rel >= multisim_nsteps)  &&  
 +                 (multisim_nsteps != ir->nsteps) )  
 +            {
 +                if (bNS)
 +                {
 +                    if (MASTER(cr))
 +                    {
 +                        fprintf(stderr, 
 +                                "Stopping simulation %d because another one has finished\n",
 +                                cr->ms->sim);
 +                    }
 +                    bLastStep=TRUE;
 +                    gs.sig[eglsCHKPT] = 1;
 +                }
 +            }
 +        }
 +
 +        /* < 0 means stop at next step, > 0 means stop at next NS step */
 +        if ( (gs.set[eglsSTOPCOND] < 0 ) ||
 +             ( (gs.set[eglsSTOPCOND] > 0 ) && ( bNS || ir->nstlist==0)) )
 +        {
 +            bLastStep = TRUE;
 +        }
 +
 +        /* Determine whether or not to update the Born radii if doing GB */
 +        bBornRadii=bFirstStep;
 +        if (ir->implicit_solvent && (step % ir->nstgbradii==0))
 +        {
 +            bBornRadii=TRUE;
 +        }
 +        
 +        do_log = do_per_step(step,ir->nstlog) || bFirstStep || bLastStep;
 +        do_verbose = bVerbose &&
 +                  (step % stepout == 0 || bFirstStep || bLastStep);
 +
 +        if (bNS && !(bFirstStep && ir->bContinuation && !bRerunMD))
 +        {
 +            if (bRerunMD)
 +            {
 +                bMasterState = TRUE;
 +            }
 +            else
 +            {
 +                bMasterState = FALSE;
 +                /* Correct the new box if it is too skewed */
 +                if (DYNAMIC_BOX(*ir))
 +                {
 +                    if (correct_box(fplog,step,state->box,graph))
 +                    {
 +                        bMasterState = TRUE;
 +                    }
 +                }
 +                if (DOMAINDECOMP(cr) && bMasterState)
 +                {
 +                    dd_collect_state(cr->dd,state,state_global);
 +                }
 +            }
 +
 +            if (DOMAINDECOMP(cr))
 +            {
 +                /* Repartition the domain decomposition */
 +                wallcycle_start(wcycle,ewcDOMDEC);
 +                dd_partition_system(fplog,step,cr,
 +                                    bMasterState,nstglobalcomm,
 +                                    state_global,top_global,ir,
 +                                    state,&f,mdatoms,top,fr,
 +                                    vsite,shellfc,constr,
 +                                    nrnb,wcycle,do_verbose);
 +                wallcycle_stop(wcycle,ewcDOMDEC);
 +                /* If using an iterative integrator, reallocate space to match the decomposition */
 +            }
 +        }
 +
 +        if (MASTER(cr) && do_log && !bFFscan)
 +        {
 +            print_ebin_header(fplog,step,t,state->lambda);
 +        }
 +
 +        if (ir->efep != efepNO)
 +        {
 +            update_mdatoms(mdatoms,state->lambda); 
 +        }
 +
 +        if (bRerunMD && rerun_fr.bV)
 +        {
 +            
 +            /* We need the kinetic energy at minus the half step for determining
 +             * the full step kinetic energy and possibly for T-coupling.*/
 +            /* This may not be quite working correctly yet . . . . */
 +            compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
 +                            wcycle,enerd,NULL,NULL,NULL,NULL,mu_tot,
 +                            constr,NULL,FALSE,state->box,
 +                            top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
 +                            CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
 +        }
 +        clear_mat(force_vir);
 +        
 +        /* Ionize the atoms if necessary */
 +        if (bIonize)
 +        {
 +            ionize(fplog,oenv,mdatoms,top_global,t,ir,state->x,state->v,
 +                   mdatoms->start,mdatoms->start+mdatoms->homenr,state->box,cr);
 +        }
 +        
 +        /* Update force field in ffscan program */
 +        if (bFFscan)
 +        {
 +            if (update_forcefield(fplog,
 +                                  nfile,fnm,fr,
 +                                  mdatoms->nr,state->x,state->box)) {
 +                if (gmx_parallel_env_initialized())
 +                {
 +                    gmx_finalize();
 +                }
 +                exit(0);
 +            }
 +        }
 +
 +        /* We write a checkpoint at this MD step when:
 +         * either at an NS step when we signalled through gs,
 +         * or at the last step (but not when we do not want confout),
 +         * but never at the first step or with rerun.
 +         */
 +        bCPT = (((gs.set[eglsCHKPT] && (bNS || ir->nstlist == 0)) ||
 +                 (bLastStep && (Flags & MD_CONFOUT))) &&
 +                step > ir->init_step && !bRerunMD);
 +        if (bCPT)
 +        {
 +            gs.set[eglsCHKPT] = 0;
 +        }
 +
 +        /* Determine the energy and pressure:
 +         * at nstcalcenergy steps and at energy output steps (set below).
 +         */
 +        bNstEner = do_per_step(step,ir->nstcalcenergy);
 +        bCalcEnerPres =
 +            (bNstEner ||
 +             (ir->epc != epcNO && do_per_step(step,ir->nstpcouple)));
 +
 +        /* Do we need global communication ? */
 +        bGStat = (bCalcEnerPres || bStopCM ||
 +                  do_per_step(step,nstglobalcomm) ||
 +                  (ir->nstlist == -1 && !bRerunMD && step >= nlh.step_nscheck));
 +
 +        do_ene = (do_per_step(step,ir->nstenergy) || bLastStep);
 +
 +        if (do_ene || do_log)
 +        {
 +            bCalcEnerPres = TRUE;
 +            bGStat        = TRUE;
 +        }
 +        
 +        /* these CGLO_ options remain the same throughout the iteration */
 +        cglo_flags = ((bRerunMD ? CGLO_RERUNMD : 0) |
 +                      (bStopCM ? CGLO_STOPCM : 0) |
 +                      (bGStat ? CGLO_GSTAT : 0)
 +            );
 +        
 +        force_flags = (GMX_FORCE_STATECHANGED |
 +                       ((DYNAMIC_BOX(*ir) || bRerunMD) ? GMX_FORCE_DYNAMICBOX : 0) |
 +                       GMX_FORCE_ALLFORCES |
 +                       (bNStList ? GMX_FORCE_DOLR : 0) |
 +                       GMX_FORCE_SEPLRF |
 +                       (bCalcEnerPres ? GMX_FORCE_VIRIAL : 0) |
 +                       (bDoDHDL ? GMX_FORCE_DHDL : 0)
 +            );
 +        
 +        if (shellfc)
 +        {
 +            /* Now is the time to relax the shells */
 +            count=relax_shell_flexcon(fplog,cr,bVerbose,bFFscan ? step+1 : step,
 +                                      ir,bNS,force_flags,
 +                                      bStopCM,top,top_global,
 +                                      constr,enerd,fcd,
 +                                      state,f,force_vir,mdatoms,
 +                                      nrnb,wcycle,graph,groups,
 +                                      shellfc,fr,bBornRadii,t,mu_tot,
 +                                      state->natoms,&bConverged,vsite,
 +                                      outf->fp_field);
 +            tcount+=count;
 +
 +            if (bConverged)
 +            {
 +                nconverged++;
 +            }
 +        }
 +        else
 +        {
 +            /* The coordinates (x) are shifted (to get whole molecules)
 +             * in do_force.
 +             * This is parallelized as well, and does communication too.
 +             * Check comments in sim_util.c
 +             */
 +        
 +            do_force(fplog,cr,ir,step,nrnb,wcycle,top,top_global,groups,
 +                     state->box,state->x,&state->hist,
 +                     f,force_vir,mdatoms,enerd,fcd,
 +                     state->lambda,graph,
 +                     fr,vsite,mu_tot,t,outf->fp_field,ed,bBornRadii,
 +                     (bNS ? GMX_FORCE_NS : 0) | force_flags);
 +        }
 +        
 +        if (bTCR)
 +        {
 +            mu_aver = calc_mu_aver(cr,state->x,mdatoms->chargeA,
 +                                   mu_tot,&top_global->mols,mdatoms,gnx,grpindex);
 +        }
 +        
 +        if (bTCR && bFirstStep)
 +        {
 +            tcr=init_coupling(fplog,nfile,fnm,cr,fr,mdatoms,&(top->idef));
 +            fprintf(fplog,"Done init_coupling\n"); 
 +            fflush(fplog);
 +        }
 +        
 +        if (bVV && !bStartingFromCpt && !bRerunMD)
 +        /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
 +        {
 +            if (ir->eI==eiVV && bInitStep) 
 +            {
 +                /* if using velocity verlet with full time step Ekin,
 +                 * take the first half step only to compute the 
 +                 * virial for the first step. From there,
 +                 * revert back to the initial coordinates
 +                 * so that the input is actually the initial step.
 +                 */
 +                copy_rvecn(state->v,cbuf,0,state->natoms); /* should make this better for parallelizing? */
 +            } else {
 +                /* this is for NHC in the Ekin(t+dt/2) version of vv */
 +                trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ1);            
 +            }
 +
 +            update_coords(fplog,step,ir,mdatoms,state,
 +                          f,fr->bTwinRange && bNStList,fr->f_twin,fcd,
 +                          ekind,M,wcycle,upd,bInitStep,etrtVELOCITY1,
 +                          cr,nrnb,constr,&top->idef);
 +            
 +            if (bIterations)
 +            {
 +                gmx_iterate_init(&iterate,bIterations && !bInitStep);
 +            }
 +            /* for iterations, we save these vectors, as we will be self-consistently iterating
 +               the calculations */
 +
 +            /*#### UPDATE EXTENDED VARIABLES IN TROTTER FORMULATION */
 +            
 +            /* save the state */
 +            if (bIterations && iterate.bIterate) { 
 +                copy_coupling_state(state,bufstate,ekind,ekind_save,&(ir->opts));
 +            }
 +            
 +            bFirstIterate = TRUE;
 +            while (bFirstIterate || (bIterations && iterate.bIterate))
 +            {
 +                if (bIterations && iterate.bIterate) 
 +                {
 +                    copy_coupling_state(bufstate,state,ekind_save,ekind,&(ir->opts));
 +                    if (bFirstIterate && bTrotter) 
 +                    {
 +                        /* The first time through, we need a decent first estimate
 +                           of veta(t+dt) to compute the constraints.  Do
 +                           this by computing the box volume part of the
 +                           trotter integration at this time. Nothing else
 +                           should be changed by this routine here.  If
 +                           !(first time), we start with the previous value
 +                           of veta.  */
 +                        
 +                        veta_save = state->veta;
 +                        trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ0);
 +                        vetanew = state->veta;
 +                        state->veta = veta_save;
 +                    } 
 +                } 
 +                
 +                bOK = TRUE;
 +                if ( !bRerunMD || rerun_fr.bV || bForceUpdate) {  /* Why is rerun_fr.bV here?  Unclear. */
 +                    dvdl = 0;
 +                    
 +                    update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,state,graph,f,
 +                                       &top->idef,shake_vir,NULL,
 +                                       cr,nrnb,wcycle,upd,constr,
 +                                       bInitStep,TRUE,bCalcEnerPres,vetanew);
 +                    
 +                    if (!bOK && !bFFscan)
 +                    {
 +                        gmx_fatal(FARGS,"Constraint error: Shake, Lincs or Settle could not solve the constrains");
 +                    }
 +                    
 +                } 
 +                else if (graph)
 +                { /* Need to unshift here if a do_force has been
 +                     called in the previous step */
 +                    unshift_self(graph,state->box,state->x);
 +                }
 +                
 +                
 +                /* if VV, compute the pressure and constraints */
 +                /* For VV2, we strictly only need this if using pressure
 +                 * control, but we really would like to have accurate pressures
 +                 * printed out.
 +                 * Think about ways around this in the future?
 +                 * For now, keep this choice in comments.
 +                 */
 +                /*bPres = (ir->eI==eiVV || IR_NPT_TROTTER(ir)); */
 +                    /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && IR_NPT_TROTTER(ir)));*/
 +                bPres = TRUE;
 +                bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK));
 +                compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
 +                                wcycle,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
 +                                constr,NULL,FALSE,state->box,
 +                                top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
 +                                cglo_flags 
 +                                | CGLO_ENERGY 
 +                                | (bTemp ? CGLO_TEMPERATURE:0) 
 +                                | (bPres ? CGLO_PRESSURE : 0) 
 +                                | (bPres ? CGLO_CONSTRAINT : 0)
 +                                | ((bIterations && iterate.bIterate) ? CGLO_ITERATE : 0)  
 +                                | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
 +                                | CGLO_SCALEEKIN 
 +                    );
 +                /* explanation of above: 
 +                   a) We compute Ekin at the full time step
 +                   if 1) we are using the AveVel Ekin, and it's not the
 +                   initial step, or 2) if we are using AveEkin, but need the full
 +                   time step kinetic energy for the pressure (always true now, since we want accurate statistics).
 +                   b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
 +                   EkinAveVel because it's needed for the pressure */
 +                
 +                /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
 +                if (!bInitStep) 
 +                {
 +                    if (bTrotter)
 +                    {
 +                        trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ2);
 +                    } 
 +                    else 
 +                    {
 +                        update_tcouple(fplog,step,ir,state,ekind,wcycle,upd,&MassQ,mdatoms);
 +                    }
 +                }
 +                
 +                if (bIterations &&
 +                    done_iterating(cr,fplog,step,&iterate,bFirstIterate,
 +                                   state->veta,&vetanew)) 
 +                {
 +                    break;
 +                }
 +                bFirstIterate = FALSE;
 +            }
 +
 +            if (bTrotter && !bInitStep) {
 +                copy_mat(shake_vir,state->svir_prev);
 +                copy_mat(force_vir,state->fvir_prev);
 +                if (IR_NVT_TROTTER(ir) && ir->eI==eiVV) {
 +                    /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
 +                    enerd->term[F_TEMP] = sum_ekin(&(ir->opts),ekind,NULL,(ir->eI==eiVV),FALSE,FALSE);
 +                    enerd->term[F_EKIN] = trace(ekind->ekin);
 +                }
 +            }
 +            /* if it's the initial step, we performed this first step just to get the constraint virial */
 +            if (bInitStep && ir->eI==eiVV) {
 +                copy_rvecn(cbuf,state->v,0,state->natoms);
 +            }
 +            
 +            if (fr->bSepDVDL && fplog && do_log) 
 +            {
 +                fprintf(fplog,sepdvdlformat,"Constraint",0.0,dvdl);
 +            }
 +            enerd->term[F_DHDL_CON] += dvdl;
 +        }
 +    
 +        /* MRS -- now done iterating -- compute the conserved quantity */
 +        if (bVV) {
 +            saved_conserved_quantity = compute_conserved_from_auxiliary(ir,state,&MassQ);
 +            if (ir->eI==eiVV) 
 +            {
 +                last_ekin = enerd->term[F_EKIN]; /* does this get preserved through checkpointing? */
 +            }
 +            if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres)) 
 +            {
 +                saved_conserved_quantity -= enerd->term[F_DISPCORR];
 +            }
 +        }
 +        
 +        /* ########  END FIRST UPDATE STEP  ############## */
 +        /* ########  If doing VV, we now have v(dt) ###### */
 +        
 +        /* ################## START TRAJECTORY OUTPUT ################# */
 +        
 +        /* Now we have the energies and forces corresponding to the 
 +         * coordinates at time t. We must output all of this before
 +         * the update.
 +         * for RerunMD t is read from input trajectory
 +         */
 +        mdof_flags = 0;
 +        if (do_per_step(step,ir->nstxout)) { mdof_flags |= MDOF_X; }
 +        if (do_per_step(step,ir->nstvout)) { mdof_flags |= MDOF_V; }
 +        if (do_per_step(step,ir->nstfout)) { mdof_flags |= MDOF_F; }
 +        if (do_per_step(step,ir->nstxtcout)) { mdof_flags |= MDOF_XTC; }
 +        if (bCPT) { mdof_flags |= MDOF_CPT; };
 +
 +#if defined(GMX_FAHCORE) || defined(GMX_WRITELASTSTEP)
 +        if (bLastStep)
 +        {
 +            /* Enforce writing positions and velocities at end of run */
 +            mdof_flags |= (MDOF_X | MDOF_V);
 +        }
 +#endif
 +#ifdef GMX_FAHCORE
 +        if (MASTER(cr))
 +            fcReportProgress( ir->nsteps, step );
 +
 +        /* sync bCPT and fc record-keeping */
 +        if (bCPT && MASTER(cr))
 +            fcRequestCheckPoint();
 +#endif
 +        
 +        if (mdof_flags != 0)
 +        {
 +            wallcycle_start(wcycle,ewcTRAJ);
 +            if (bCPT)
 +            {
 +                if (state->flags & (1<<estLD_RNG))
 +                {
 +                    get_stochd_state(upd,state);
 +                }
 +                if (MASTER(cr))
 +                {
 +                    if (bSumEkinhOld)
 +                    {
 +                        state_global->ekinstate.bUpToDate = FALSE;
 +                    }
 +                    else
 +                    {
 +                        update_ekinstate(&state_global->ekinstate,ekind);
 +                        state_global->ekinstate.bUpToDate = TRUE;
 +                    }
 +                    update_energyhistory(&state_global->enerhist,mdebin);
 +                }
 +            }
 +            write_traj(fplog,cr,outf,mdof_flags,top_global,
 +                       step,t,state,state_global,f,f_global,&n_xtc,&x_xtc);
 +            if (bCPT)
 +            {
 +                nchkpt++;
 +                bCPT = FALSE;
 +            }
 +            debug_gmx();
 +            if (bLastStep && step_rel == ir->nsteps &&
 +                (Flags & MD_CONFOUT) && MASTER(cr) &&
 +                !bRerunMD && !bFFscan)
 +            {
 +                /* x and v have been collected in write_traj,
 +                 * because a checkpoint file will always be written
 +                 * at the last step.
 +                 */
 +                fprintf(stderr,"\nWriting final coordinates.\n");
 +                if (ir->ePBC != epbcNONE && !ir->bPeriodicMols &&
 +                    DOMAINDECOMP(cr))
 +                {
 +                    /* Make molecules whole only for confout writing */
 +                    do_pbc_mtop(fplog,ir->ePBC,state->box,top_global,state_global->x);
 +                }
 +                write_sto_conf_mtop(ftp2fn(efSTO,nfile,fnm),
 +                                    *top_global->name,top_global,
 +                                    state_global->x,state_global->v,
 +                                    ir->ePBC,state->box);
 +                debug_gmx();
 +            }
 +            wallcycle_stop(wcycle,ewcTRAJ);
 +        }
 +        
 +        /* kludge -- virial is lost with restart for NPT control. Must restart */
 +        if (bStartingFromCpt && bVV) 
 +        {
 +            copy_mat(state->svir_prev,shake_vir);
 +            copy_mat(state->fvir_prev,force_vir);
 +        }
 +        /*  ################## END TRAJECTORY OUTPUT ################ */
 +        
 +        /* Determine the wallclock run time up till now */
 +        run_time = gmx_gettime() - (double)runtime->real;
 +
 +        /* Check whether everything is still all right */
 +        if (((int)gmx_get_stop_condition() > handled_stop_condition)
 +#ifdef GMX_THREADS
 +            && MASTER(cr)
 +#endif
 +            )
 +        {
 +            /* this just makes gs.sig compatible with the hack
 +               of sending signals around by MPI_Reduce together with
 +               other floats */
 +            if ( gmx_get_stop_condition() == gmx_stop_cond_next_ns )
 +                gs.sig[eglsSTOPCOND]=1;
 +            if ( gmx_get_stop_condition() == gmx_stop_cond_next )
 +                gs.sig[eglsSTOPCOND]=-1;
 +            /* < 0 means stop at next step, > 0 means stop at next NS step */
 +            if (fplog)
 +            {
 +                fprintf(fplog,
 +                        "\n\nReceived the %s signal, stopping at the next %sstep\n\n",
 +                        gmx_get_signal_name(),
 +                        gs.sig[eglsSTOPCOND]==1 ? "NS " : "");
 +                fflush(fplog);
 +            }
 +            fprintf(stderr,
 +                    "\n\nReceived the %s signal, stopping at the next %sstep\n\n",
 +                    gmx_get_signal_name(),
 +                    gs.sig[eglsSTOPCOND]==1 ? "NS " : "");
 +            fflush(stderr);
 +            handled_stop_condition=(int)gmx_get_stop_condition();
 +        }
 +        else if (MASTER(cr) && (bNS || ir->nstlist <= 0) &&
 +                 (max_hours > 0 && run_time > max_hours*60.0*60.0*0.99) &&
 +                 gs.sig[eglsSTOPCOND] == 0 && gs.set[eglsSTOPCOND] == 0)
 +        {
 +            /* Signal to terminate the run */
 +            gs.sig[eglsSTOPCOND] = 1;
 +            if (fplog)
 +            {
 +                fprintf(fplog,"\nStep %s: Run time exceeded %.3f hours, will terminate the run\n",gmx_step_str(step,sbuf),max_hours*0.99);
 +            }
 +            fprintf(stderr, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n",gmx_step_str(step,sbuf),max_hours*0.99);
 +        }
 +
 +        if (bResetCountersHalfMaxH && MASTER(cr) &&
 +            run_time > max_hours*60.0*60.0*0.495)
 +        {
 +            gs.sig[eglsRESETCOUNTERS] = 1;
 +        }
 +
 +        if (ir->nstlist == -1 && !bRerunMD)
 +        {
 +            /* When bGStatEveryStep=FALSE, global_stat is only called
 +             * when we check the atom displacements, not at NS steps.
 +             * This means that also the bonded interaction count check is not
 +             * performed immediately after NS. Therefore a few MD steps could
 +             * be performed with missing interactions.
 +             * But wrong energies are never written to file,
 +             * since energies are only written after global_stat
 +             * has been called.
 +             */
 +            if (step >= nlh.step_nscheck)
 +            {
 +                nlh.nabnsb = natoms_beyond_ns_buffer(ir,fr,&top->cgs,
 +                                                     nlh.scale_tot,state->x);
 +            }
 +            else
 +            {
 +                /* This is not necessarily true,
 +                 * but step_nscheck is determined quite conservatively.
 +                 */
 +                nlh.nabnsb = 0;
 +            }
 +        }
 +
 +        /* In parallel we only have to check for checkpointing in steps
 +         * where we do global communication,
 +         *  otherwise the other nodes don't know.
 +         */
 +        if (MASTER(cr) && ((bGStat || !PAR(cr)) &&
 +                           cpt_period >= 0 &&
 +                           (cpt_period == 0 || 
 +                            run_time >= nchkpt*cpt_period*60.0)) &&
 +            gs.set[eglsCHKPT] == 0)
 +        {
 +            gs.sig[eglsCHKPT] = 1;
 +        }
 +  
 +        if (bIterations)
 +        {
 +            gmx_iterate_init(&iterate,bIterations);
 +        }
 +    
 +        /* for iterations, we save these vectors, as we will be redoing the calculations */
 +        if (bIterations && iterate.bIterate) 
 +        {
 +            copy_coupling_state(state,bufstate,ekind,ekind_save,&(ir->opts));
 +        }
 +        bFirstIterate = TRUE;
 +        while (bFirstIterate || (bIterations && iterate.bIterate))
 +        {
 +            /* We now restore these vectors to redo the calculation with improved extended variables */    
 +            if (bIterations) 
 +            { 
 +                copy_coupling_state(bufstate,state,ekind_save,ekind,&(ir->opts));
 +            }
 +
 +            /* We make the decision to break or not -after- the calculation of Ekin and Pressure,
 +               so scroll down for that logic */
 +            
 +            /* #########   START SECOND UPDATE STEP ################# */
 +            /* Box is changed in update() when we do pressure coupling,
 +             * but we should still use the old box for energy corrections and when
 +             * writing it to the energy file, so it matches the trajectory files for
 +             * the same timestep above. Make a copy in a separate array.
 +             */
 +            copy_mat(state->box,lastbox);
 +
 +            bOK = TRUE;
 +            if (!(bRerunMD && !rerun_fr.bV && !bForceUpdate))
 +            {
 +                wallcycle_start(wcycle,ewcUPDATE);
 +                dvdl = 0;
 +                /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
 +                if (bTrotter) 
 +                {
 +                    if (bIterations && iterate.bIterate) 
 +                    {
 +                        if (bFirstIterate) 
 +                        {
 +                            scalevir = 1;
 +                        }
 +                        else 
 +                        {
 +                            /* we use a new value of scalevir to converge the iterations faster */
 +                            scalevir = tracevir/trace(shake_vir);
 +                        }
 +                        msmul(shake_vir,scalevir,shake_vir); 
 +                        m_add(force_vir,shake_vir,total_vir);
 +                        clear_mat(shake_vir);
 +                    }
 +                    trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ3);
 +                /* We can only do Berendsen coupling after we have summed
 +                 * the kinetic energy or virial. Since this happens
 +                 * in global_state after update, we should only do it at
 +                 * step % nstlist = 1 with bGStatEveryStep=FALSE.
 +                 */
 +                }
 +                else 
 +                {
 +                    update_tcouple(fplog,step,ir,state,ekind,wcycle,upd,&MassQ,mdatoms);
 +                    update_pcouple(fplog,step,ir,state,pcoupl_mu,M,wcycle,
 +                                   upd,bInitStep);
 +                }
 +
 +                if (bVV)
 +                {
 +                    /* velocity half-step update */
 +                    update_coords(fplog,step,ir,mdatoms,state,f,
 +                                  fr->bTwinRange && bNStList,fr->f_twin,fcd,
 +                                  ekind,M,wcycle,upd,FALSE,etrtVELOCITY2,
 +                                  cr,nrnb,constr,&top->idef);
 +                }
 +
 +                /* Above, initialize just copies ekinh into ekin,
 +                 * it doesn't copy position (for VV),
 +                 * and entire integrator for MD.
 +                 */
 +                
 +                if (ir->eI==eiVVAK) 
 +                {
 +                    copy_rvecn(state->x,cbuf,0,state->natoms);
 +                }
 +                
 +                update_coords(fplog,step,ir,mdatoms,state,f,fr->bTwinRange && bNStList,fr->f_twin,fcd,
 +                              ekind,M,wcycle,upd,bInitStep,etrtPOSITION,cr,nrnb,constr,&top->idef);
 +                wallcycle_stop(wcycle,ewcUPDATE);
 +
 +                update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,state,graph,f,
 +                                   &top->idef,shake_vir,force_vir,
 +                                   cr,nrnb,wcycle,upd,constr,
 +                                   bInitStep,FALSE,bCalcEnerPres,state->veta);  
 +                
 +                if (ir->eI==eiVVAK) 
 +                {
 +                    /* erase F_EKIN and F_TEMP here? */
 +                    /* just compute the kinetic energy at the half step to perform a trotter step */
 +                    compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
 +                                    wcycle,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
 +                                    constr,NULL,FALSE,lastbox,
 +                                    top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
 +                                    cglo_flags | CGLO_TEMPERATURE    
 +                        );
 +                    wallcycle_start(wcycle,ewcUPDATE);
 +                    trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ4);            
 +                    /* now we know the scaling, we can compute the positions again */
 +                    copy_rvecn(cbuf,state->x,0,state->natoms);
 +
 +                    update_coords(fplog,step,ir,mdatoms,state,f,fr->bTwinRange && bNStList,fr->f_twin,fcd,
 +                                  ekind,M,wcycle,upd,bInitStep,etrtPOSITION,cr,nrnb,constr,&top->idef);
 +                    wallcycle_stop(wcycle,ewcUPDATE);
 +
 +                    /* do we need an extra constraint here? just need to copy out of state->v to upd->xp? */
 +                    /* are the small terms in the shake_vir here due
 +                     * to numerical errors, or are they important
 +                     * physically? I'm thinking they are just errors, but not completely sure. 
 +                     * For now, will call without actually constraining, constr=NULL*/
 +                    update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,state,graph,f,
 +                                       &top->idef,tmp_vir,force_vir,
 +                                       cr,nrnb,wcycle,upd,NULL,
 +                                       bInitStep,FALSE,bCalcEnerPres,
 +                                       state->veta);  
 +                }
 +                if (!bOK && !bFFscan) 
 +                {
 +                    gmx_fatal(FARGS,"Constraint error: Shake, Lincs or Settle could not solve the constrains");
 +                }
 +                
 +                if (fr->bSepDVDL && fplog && do_log) 
 +                {
 +                    fprintf(fplog,sepdvdlformat,"Constraint",0.0,dvdl);
 +                }
 +                enerd->term[F_DHDL_CON] += dvdl;
 +            } 
 +            else if (graph) 
 +            {
 +                /* Need to unshift here */
 +                unshift_self(graph,state->box,state->x);
 +            }
 +
 +            if (vsite != NULL) 
 +            {
 +                wallcycle_start(wcycle,ewcVSITECONSTR);
 +                if (graph != NULL) 
 +                {
 +                    shift_self(graph,state->box,state->x);
 +                }
 +                construct_vsites(fplog,vsite,state->x,nrnb,ir->delta_t,state->v,
 +                                 top->idef.iparams,top->idef.il,
 +                                 fr->ePBC,fr->bMolPBC,graph,cr,state->box);
 +                
 +                if (graph != NULL) 
 +                {
 +                    unshift_self(graph,state->box,state->x);
 +                }
 +                wallcycle_stop(wcycle,ewcVSITECONSTR);
 +            }
 +            
 +            /* ############## IF NOT VV, Calculate globals HERE, also iterate constraints ############ */
 +            if (ir->nstlist == -1 && bFirstIterate)
 +            {
 +                gs.sig[eglsNABNSB] = nlh.nabnsb;
 +            }
 +            compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
 +                            wcycle,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
 +                            constr,
 +                            bFirstIterate ? &gs : NULL, 
 +                            (step_rel % gs.nstms == 0) && 
 +                                (multisim_nsteps<0 || (step_rel<multisim_nsteps)),
 +                            lastbox,
 +                            top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
 +                            cglo_flags 
 +                            | (!EI_VV(ir->eI) ? CGLO_ENERGY : 0) 
 +                            | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0) 
 +                            | (!EI_VV(ir->eI) || bRerunMD ? CGLO_PRESSURE : 0) 
 +                            | (bIterations && iterate.bIterate ? CGLO_ITERATE : 0) 
 +                            | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
 +                            | CGLO_CONSTRAINT 
 +                );
 +            if (ir->nstlist == -1 && bFirstIterate)
 +            {
 +                nlh.nabnsb = gs.set[eglsNABNSB];
 +                gs.set[eglsNABNSB] = 0;
 +            }
 +            /* bIterate is set to keep it from eliminating the old ekin kinetic energy terms */
 +            /* #############  END CALC EKIN AND PRESSURE ################# */
 +        
 +            /* Note: this is OK, but there are some numerical precision issues with using the convergence of
 +               the virial that should probably be addressed eventually. state->veta has better properties,
 +               but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
 +               generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
 +
 +            if (bIterations && 
 +                done_iterating(cr,fplog,step,&iterate,bFirstIterate,
 +                               trace(shake_vir),&tracevir)) 
 +            {
 +                break;
 +            }
 +            bFirstIterate = FALSE;
 +        }
 +
 +        update_box(fplog,step,ir,mdatoms,state,graph,f,
 +                   ir->nstlist==-1 ? &nlh.scale_tot : NULL,pcoupl_mu,nrnb,wcycle,upd,bInitStep,FALSE);
 +        
 +        /* ################# END UPDATE STEP 2 ################# */
 +        /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
 +    
 +        /* The coordinates (x) were unshifted in update */
 +        if (bFFscan && (shellfc==NULL || bConverged))
 +        {
 +            if (print_forcefield(fplog,enerd->term,mdatoms->homenr,
 +                                 f,NULL,xcopy,
 +                                 &(top_global->mols),mdatoms->massT,pres))
 +            {
 +                if (gmx_parallel_env_initialized())
 +                {
 +                    gmx_finalize();
 +                }
 +                fprintf(stderr,"\n");
 +                exit(0);
 +            }
 +        }
 +        if (!bGStat)
 +        {
 +            /* We will not sum ekinh_old,                                                            
 +             * so signal that we still have to do it.                                                
 +             */
 +            bSumEkinhOld = TRUE;
 +        }
 +        
 +        if (bTCR)
 +        {
 +            /* Only do GCT when the relaxation of shells (minimization) has converged,
 +             * otherwise we might be coupling to bogus energies. 
 +             * In parallel we must always do this, because the other sims might
 +             * update the FF.
 +             */
 +
 +            /* Since this is called with the new coordinates state->x, I assume
 +             * we want the new box state->box too. / EL 20040121
 +             */
 +            do_coupling(fplog,oenv,nfile,fnm,tcr,t,step,enerd->term,fr,
 +                        ir,MASTER(cr),
 +                        mdatoms,&(top->idef),mu_aver,
 +                        top_global->mols.nr,cr,
 +                        state->box,total_vir,pres,
 +                        mu_tot,state->x,f,bConverged);
 +            debug_gmx();
 +        }
 +
 +        /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
 +        
 +        /* sum up the foreign energy and dhdl terms */
 +        sum_dhdl(enerd,state->lambda,ir);
 +
 +        /* use the directly determined last velocity, not actually the averaged half steps */
 +        if (bTrotter && ir->eI==eiVV) 
 +        {
 +            enerd->term[F_EKIN] = last_ekin;
 +        }
 +        enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
 +        
 +        if (bVV)
 +        {
 +            enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
 +        }
 +        else 
 +        {
 +            enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + compute_conserved_from_auxiliary(ir,state,&MassQ);
 +        }
 +        /* Check for excessively large energies */
 +        if (bIonize) 
 +        {
 +#ifdef GMX_DOUBLE
 +            real etot_max = 1e200;
 +#else
 +            real etot_max = 1e30;
 +#endif
 +            if (fabs(enerd->term[F_ETOT]) > etot_max) 
 +            {
 +                fprintf(stderr,"Energy too large (%g), giving up\n",
 +                        enerd->term[F_ETOT]);
 +            }
 +        }
 +        /* #########  END PREPARING EDR OUTPUT  ###########  */
 +        
 +        /* Time for performance */
 +        if (((step % stepout) == 0) || bLastStep) 
 +        {
 +            runtime_upd_proc(runtime);
 +        }
 +        
 +        /* Output stuff */
 +        if (MASTER(cr))
 +        {
 +            gmx_bool do_dr,do_or;
 +            
 +            if (!(bStartingFromCpt && (EI_VV(ir->eI)))) 
 +            {
 +                if (bNstEner)
 +                {
 +                    upd_mdebin(mdebin,bDoDHDL, TRUE,
 +                               t,mdatoms->tmass,enerd,state,lastbox,
 +                               shake_vir,force_vir,total_vir,pres,
 +                               ekind,mu_tot,constr);
 +                }
 +                else
 +                {
 +                    upd_mdebin_step(mdebin);
 +                }
 +                
 +                do_dr  = do_per_step(step,ir->nstdisreout);
 +                do_or  = do_per_step(step,ir->nstorireout);
 +                
 +                print_ebin(outf->fp_ene,do_ene,do_dr,do_or,do_log?fplog:NULL,
 +                           step,t,
 +                           eprNORMAL,bCompact,mdebin,fcd,groups,&(ir->opts));
 +            }
 +            if (ir->ePull != epullNO)
 +            {
 +                pull_print_output(ir->pull,step,t);
 +            }
 +            
 +            if (do_per_step(step,ir->nstlog))
 +            {
 +                if(fflush(fplog) != 0)
 +                {
 +                    gmx_fatal(FARGS,"Cannot flush logfile - maybe you are out of quota?");
 +                }
 +            }
 +        }
 +
 +
 +        /* Remaining runtime */
 +        if (MULTIMASTER(cr) && (do_verbose || gmx_got_usr_signal() ))
 +        {
 +            if (shellfc) 
 +            {
 +                fprintf(stderr,"\n");
 +            }
 +            print_time(stderr,runtime,step,ir,cr);
 +        }
 +
 +        /* Replica exchange */
 +        bExchanged = FALSE;
 +        if ((repl_ex_nst > 0) && (step > 0) && !bLastStep &&
 +            do_per_step(step,repl_ex_nst)) 
 +        {
 +            bExchanged = replica_exchange(fplog,cr,repl_ex,
 +                                          state_global,enerd->term,
 +                                          state,step,t);
 +
 +            if (bExchanged && DOMAINDECOMP(cr)) 
 +            {
 +                dd_partition_system(fplog,step,cr,TRUE,1,
 +                                    state_global,top_global,ir,
 +                                    state,&f,mdatoms,top,fr,
 +                                    vsite,shellfc,constr,
 +                                    nrnb,wcycle,FALSE);
 +            }
 +        }
 +        
 +        bFirstStep = FALSE;
 +        bInitStep = FALSE;
 +        bStartingFromCpt = FALSE;
 +
 +        /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
 +        /* With all integrators, except VV, we need to retain the pressure
 +         * at the current step for coupling at the next step.
 +         */
 +        if ((state->flags & (1<<estPRES_PREV)) &&
 +            (bGStatEveryStep ||
 +             (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
 +        {
 +            /* Store the pressure in t_state for pressure coupling
 +             * at the next MD step.
 +             */
 +            copy_mat(pres,state->pres_prev);
 +        }
 +        
 +        /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
 +
 +        if ( (membed!=NULL) && (!bLastStep) )
 +            rescale_membed(step_rel,membed,state_global->x);
 +        
 +        if (bRerunMD) 
 +        {
 +            if (MASTER(cr))
 +            {
 +                /* read next frame from input trajectory */
 +                bNotLastFrame = read_next_frame(oenv,status,&rerun_fr);
 +            }
 +
 +            if (PAR(cr))
 +            {
 +                rerun_parallel_comm(cr,&rerun_fr,&bNotLastFrame);
 +            }
 +        }
 +        
 +        if (!bRerunMD || !rerun_fr.bStep)
 +        {
 +            /* increase the MD step number */
 +            step++;
 +            step_rel++;
 +        }
 +        
 +        cycles = wallcycle_stop(wcycle,ewcSTEP);
 +        if (DOMAINDECOMP(cr) && wcycle)
 +        {
 +            dd_cycles_add(cr->dd,cycles,ddCyclStep);
 +        }
 +        
 +        if (step_rel == wcycle_get_reset_counters(wcycle) ||
 +            gs.set[eglsRESETCOUNTERS] != 0)
 +        {
 +            /* Reset all the counters related to performance over the run */
 +            reset_all_counters(fplog,cr,step,&step_rel,ir,wcycle,nrnb,runtime);
 +            wcycle_set_reset_counters(wcycle,-1);
 +            /* Correct max_hours for the elapsed time */
 +            max_hours -= run_time/(60.0*60.0);
 +            bResetCountersHalfMaxH = FALSE;
 +            gs.set[eglsRESETCOUNTERS] = 0;
 +        }
 +
 +    }
 +    /* End of main MD loop */
 +    debug_gmx();
 +    
 +    /* Stop the time */
 +    runtime_end(runtime);
 +    
 +    if (bRerunMD && MASTER(cr))
 +    {
 +        close_trj(status);
 +    }
 +    
 +    if (!(cr->duty & DUTY_PME))
 +    {
 +        /* Tell the PME only node to finish */
 +        gmx_pme_finish(cr);
 +    }
 +    
 +    if (MASTER(cr))
 +    {
 +        if (ir->nstcalcenergy > 0 && !bRerunMD) 
 +        {
 +            print_ebin(outf->fp_ene,FALSE,FALSE,FALSE,fplog,step,t,
 +                       eprAVER,FALSE,mdebin,fcd,groups,&(ir->opts));
 +        }
 +    }
 +
 +    done_mdoutf(outf);
 +
 +    debug_gmx();
 +
 +    if (ir->nstlist == -1 && nlh.nns > 0 && fplog)
 +    {
 +        fprintf(fplog,"Average neighborlist lifetime: %.1f steps, std.dev.: %.1f steps\n",nlh.s1/nlh.nns,sqrt(nlh.s2/nlh.nns - sqr(nlh.s1/nlh.nns)));
 +        fprintf(fplog,"Average number of atoms that crossed the half buffer length: %.1f\n\n",nlh.ab/nlh.nns);
 +    }
 +    
 +    if (shellfc && fplog)
 +    {
 +        fprintf(fplog,"Fraction of iterations that converged:           %.2f %%\n",
 +                (nconverged*100.0)/step_rel);
 +        fprintf(fplog,"Average number of force evaluations per MD step: %.2f\n\n",
 +                tcount/step_rel);
 +    }
 +    
 +    if (repl_ex_nst > 0 && MASTER(cr))
 +    {
 +        print_replica_exchange_statistics(fplog,repl_ex);
 +    }
 +    
 +    runtime->nsteps_done = step_rel;
 +    
 +    return 0;
 +}
index 11371e885c6b82bc22b5c600659cfa690306dc9e,0000000000000000000000000000000000000000..faac54e39c246df8ff113277dcb720dda0b8bf6f
mode 100644,000000..100644
--- /dev/null
@@@ -1,914 -1,0 +1,984 @@@
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
- /* get the number of threads based on how many there were requested, 
-    which algorithms we're using, and how many particles there are. */
- static int get_nthreads(int nthreads_requested, t_inputrec *inputrec,
-                         gmx_mtop_t *mtop)
++#ifdef __linux
++#define _GNU_SOURCE
++#include <sched.h>
++#include <sys/syscall.h>
++#endif
 +#include <signal.h>
 +#include <stdlib.h>
 +
 +#if ((defined WIN32 || defined _WIN32 || defined WIN64 || defined _WIN64) && !defined __CYGWIN__ && !defined __CYGWIN32__)
 +/* _isnan() */
 +#include <float.h>
 +#endif
 +
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "sysstuff.h"
 +#include "statutil.h"
 +#include "mdrun.h"
 +#include "network.h"
 +#include "pull.h"
 +#include "pull_rotation.h"
 +#include "names.h"
 +#include "disre.h"
 +#include "orires.h"
 +#include "dihre.h"
 +#include "pppm.h"
 +#include "pme.h"
 +#include "mdatoms.h"
 +#include "repl_ex.h"
 +#include "qmmm.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "coulomb.h"
 +#include "constr.h"
 +#include "mvdata.h"
 +#include "checkpoint.h"
 +#include "mtop_util.h"
 +#include "sighandler.h"
 +#include "tpxio.h"
 +#include "txtdump.h"
 +#include "pull_rotation.h"
 +#include "membed.h"
 +#include "macros.h"
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#endif
 +#ifdef GMX_THREADS
 +#include "tmpi.h"
 +#endif
 +
 +#ifdef GMX_FAHCORE
 +#include "corewrap.h"
 +#endif
 +
 +#ifdef GMX_OPENMM
 +#include "md_openmm.h"
 +#endif
 +
++#ifdef GMX_OPENMP
++#include <omp.h>
++#endif
++
 +
 +typedef struct { 
 +    gmx_integrator_t *func; /* routine that is run for one integrator type */
 +} gmx_intp_t;
 +
 +/* The array should match the eI array in include/types/enums.h
 +   Dispatch table with eiNR entries, one gmx_intp_t per integrator type.
 +   Presumably indexed with the integrator enum value (inputrec->eI);
 +   the lookup itself is outside this part of the file - TODO confirm.
 +   When GMX_OPENMM is defined every entry points to do_md_openmm. */
 +#ifdef GMX_OPENMM  /* FIXME do_md_openmm needs fixing */
 +const gmx_intp_t integrator[eiNR] = { {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm},{do_md_openmm}};
 +#else
 +const gmx_intp_t integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md},{do_md}};
 +#endif
 +
 +gmx_large_int_t     deform_init_init_step_tpx; /* temporary store for inputrec->init_step;
 +   mdrunner() fills it when DEFORM(*inputrec) holds, because the update
 +   struct that should hold the reference values is not available yet */
 +matrix              deform_init_box_tpx; /* deform reference box, copied by mdrunner()
 +   before the checkpoint is read */
 +#ifdef GMX_THREADS
 +tMPI_Thread_mutex_t deform_init_box_mutex=TMPI_THREAD_MUTEX_INITIALIZER; /* locked by
 +   mdrunner() around the writes to the two variables above */
 +#endif
 +
 +
 +#ifdef GMX_THREADS
 +struct mdrunner_arglist /* Bundle of mdrunner() arguments for thread start-up.
 +   mdrunner_start_threads() fills one instance and passes it as the
 +   void pointer argument of the thread start function;
 +   mdrunner_start_fn() copies it and forwards the fields to mdrunner().
 +   The fields mirror the mdrunner() parameters of the same name. */
 +{
 +    FILE *fplog;
 +    t_commrec *cr;
 +    int nfile;
 +    const t_filenm *fnm;
 +    output_env_t oenv;
 +    gmx_bool bVerbose;
 +    gmx_bool bCompact;
 +    int nstglobalcomm;
 +    ivec ddxyz;
 +    int dd_node_order;
 +    real rdd;
 +    real rconstr;
 +    const char *dddlb_opt;
 +    real dlb_scale;
 +    const char *ddcsx;
 +    const char *ddcsy;
 +    const char *ddcsz;
 +    int nstepout;
 +    int resetstep;
 +    int nmultisim;
 +    int repl_ex_nst;
 +    int repl_ex_seed;
 +    real pforce;
 +    real cpt_period;
 +    real max_hours;
 +    const char *deviceOptions;
 +    unsigned long Flags;
 +    int ret; /* return value: mdrunner_start_fn() stores the result of
 +                its mdrunner() call here */
 +};
 +
 +
 +/* The function used for spawning threads. Extracts the mdrunner() 
 +   arguments from its one argument and calls mdrunner(), after making
 +   a commrec. */
 +static void mdrunner_start_fn(void *arg)
 +{
 +    struct mdrunner_arglist *mda=(struct mdrunner_arglist*)arg;
 +    struct mdrunner_arglist mc=*mda; /* copy the arg list to make sure 
 +                                        that it's thread-local. This doesn't
 +                                        copy pointed-to items, of course,
 +                                        but those are all const. */
 +    t_commrec *cr;  /* we need a local version of this */
 +    FILE *fplog=NULL;
 +    t_filenm *fnm;
 +
 +    fnm = dup_tfn(mc.nfile, mc.fnm);
 +
 +    cr = init_par_threads(mc.cr);
 +
 +    if (MASTER(cr))
 +    {
 +        fplog=mc.fplog;
 +    }
 +
 +    mda->ret=mdrunner(cr->nnodes, fplog, cr, mc.nfile, fnm, mc.oenv, 
 +                      mc.bVerbose, mc.bCompact, mc.nstglobalcomm, 
 +                      mc.ddxyz, mc.dd_node_order, mc.rdd,
 +                      mc.rconstr, mc.dddlb_opt, mc.dlb_scale, 
 +                      mc.ddcsx, mc.ddcsy, mc.ddcsz, mc.nstepout, mc.resetstep, 
 +                      mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_seed, mc.pforce, 
 +                      mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.Flags);
 +}
 +
 +/* called by mdrunner() to start a specific number of threads (including 
 +   the main thread) for thread-parallel runs. This in turn calls mdrunner()
 +   for each thread. 
 +   All options besides nthreads are the same as for mdrunner(). */
 +static t_commrec *mdrunner_start_threads(int nthreads, 
 +              FILE *fplog,t_commrec *cr,int nfile, 
 +              const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
 +              gmx_bool bCompact, int nstglobalcomm,
 +              ivec ddxyz,int dd_node_order,real rdd,real rconstr,
 +              const char *dddlb_opt,real dlb_scale,
 +              const char *ddcsx,const char *ddcsy,const char *ddcsz,
 +              int nstepout,int resetstep,int nmultisim,int repl_ex_nst,
 +              int repl_ex_seed, real pforce,real cpt_period, real max_hours, 
 +              const char *deviceOptions, unsigned long Flags)
 +{
 +    int ret;
 +    struct mdrunner_arglist *mda;
 +    t_commrec *crn; /* the new commrec */
 +    t_filenm *fnmn;
 +
 +    /* first check whether we even need to start tMPI */
 +    if (nthreads<2)
 +        return cr;
 +
 +    /* a few small, one-time, almost unavoidable memory leaks: */
 +    snew(mda,1);
 +    fnmn=dup_tfn(nfile, fnm);
 +
 +    /* fill the data structure to pass as void pointer to thread start fn */
 +    mda->fplog=fplog;
 +    mda->cr=cr;
 +    mda->nfile=nfile;
 +    mda->fnm=fnmn;
 +    mda->oenv=oenv;
 +    mda->bVerbose=bVerbose;
 +    mda->bCompact=bCompact;
 +    mda->nstglobalcomm=nstglobalcomm;
 +    mda->ddxyz[XX]=ddxyz[XX];
 +    mda->ddxyz[YY]=ddxyz[YY];
 +    mda->ddxyz[ZZ]=ddxyz[ZZ];
 +    mda->dd_node_order=dd_node_order;
 +    mda->rdd=rdd;
 +    mda->rconstr=rconstr;
 +    mda->dddlb_opt=dddlb_opt;
 +    mda->dlb_scale=dlb_scale;
 +    mda->ddcsx=ddcsx;
 +    mda->ddcsy=ddcsy;
 +    mda->ddcsz=ddcsz;
 +    mda->nstepout=nstepout;
 +    mda->resetstep=resetstep;
 +    mda->nmultisim=nmultisim;
 +    mda->repl_ex_nst=repl_ex_nst;
 +    mda->repl_ex_seed=repl_ex_seed;
 +    mda->pforce=pforce;
 +    mda->cpt_period=cpt_period;
 +    mda->max_hours=max_hours;
 +    mda->deviceOptions=deviceOptions;
 +    mda->Flags=Flags;
 +
 +    fprintf(stderr, "Starting %d threads\n",nthreads);
 +    fflush(stderr);
 +    /* now spawn new threads that start mdrunner_start_fn(), while 
 +       the main thread returns */
 +    ret=tMPI_Init_fn(TRUE, nthreads, mdrunner_start_fn, (void*)(mda) );
 +    if (ret!=TMPI_SUCCESS)
 +        return NULL;
 +
 +    /* make a new comm_rec to reflect the new situation */
 +    crn=init_par_threads(cr);
 +    return crn;
 +}
 +
 +
-     int         nthreads=1;
++/* Get the number of threads to use for thread-MPI based on how many
++ * were requested, which algorithms we're using,
++ * and how many particles there are.
++ *
++ * nthreads_requested  thread count asked for by the caller; a value
++ *                     below 1 requests automatic selection
++ * inputrec            read for the integrator (eI) and coulombtype
++ * mtop                read for the total number of atoms (natoms)
++ *
++ * With automatic selection the count comes from the environment
++ * variable GMX_MAX_THREADS when it is set (fatal error unless it
++ * parses to a value larger than 0), otherwise from
++ * tMPI_Thread_get_hw_number(); when that call cannot report a count
++ * (result below 1) a single thread is used.
++ * The count is forced to 1 for eiLBFGS and eelEWALD. An automatically
++ * chosen count is reduced when it would leave fewer than
++ * MIN_ATOMS_PER_THREAD atoms per thread (no such limit for eiNM/TPI).
++ *
++ * Returns the number of threads to start.
++ */
++static int get_nthreads_mpi(int nthreads_requested, t_inputrec *inputrec,
++                            gmx_mtop_t *mtop)
 +{
 +    int nthreads,nthreads_new;
 +    int min_atoms_per_thread;
 +    char *env;
 +
 +    nthreads = nthreads_requested;
 +
 +    /* determine # of hardware threads. */
 +    if (nthreads_requested < 1)
 +    {
 +        if ((env = getenv("GMX_MAX_THREADS")) != NULL)
 +        {
 +            /* stays 0, and thus fatal, when the value does not parse */
 +            nthreads = 0;
 +            sscanf(env,"%d",&nthreads);
 +            if (nthreads < 1)
 +            {
 +                gmx_fatal(FARGS,"GMX_MAX_THREADS (%d) should be larger than 0",
 +                          nthreads);
 +            }
 +        }
 +        else
 +        {
 +            nthreads = tMPI_Thread_get_hw_number();
++            if (nthreads < 1)
++            {
++                /* The hardware thread count could not be determined.
++                 * Fall back to one thread; this also keeps the
++                 * atoms-per-thread division below from dividing by zero.
++                 */
++                nthreads = 1;
++            }
 +        }
 +    }
 +
 +    if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
 +    {
++        /* Steps are divided over the nodes instead of splitting the atoms */
 +        min_atoms_per_thread = 0;
 +    }
 +    else
 +    {
 +        min_atoms_per_thread = MIN_ATOMS_PER_THREAD;
 +    }
 +
 +    /* Check if an algorithm does not support parallel simulation.  */
 +    if (nthreads != 1 && 
 +        ( inputrec->eI == eiLBFGS ||
 +          inputrec->coulombtype == eelEWALD ) )
 +    {
 +        fprintf(stderr,"\nThe integration or electrostatics algorithm doesn't support parallel runs. Not starting any threads.\n");
 +        nthreads = 1;
 +    }
 +    else if (nthreads_requested < 1 &&
 +             mtop->natoms/nthreads < min_atoms_per_thread)
 +    {
 +        /* the thread number was chosen automatically, but there are too many
 +           threads (too few atoms per thread) */
 +        nthreads_new = max(1,mtop->natoms/min_atoms_per_thread);
 +
 +        if (nthreads_new > 8 || (nthreads == 8 && nthreads_new > 4))
 +        {
 +            /* Use only multiples of 4 above 8 threads
 +             * or with an 8-core processor
 +             * (to avoid 6 threads on 8 core processors with 4 real cores).
 +             */
 +            nthreads_new = (nthreads_new/4)*4;
 +        }
 +        else if (nthreads_new > 4)
 +        {
 +            /* Avoid 5 or 7 threads */
 +            nthreads_new = (nthreads_new/2)*2;
 +        }
 +
 +        nthreads = nthreads_new;
 +
 +        fprintf(stderr,"\n");
 +        fprintf(stderr,"NOTE: Parallelization is limited by the small number of atoms,\n");
 +        fprintf(stderr,"      only starting %d threads.\n",nthreads);
 +        fprintf(stderr,"      You can use the -nt option to optimize the number of threads.\n\n");
 +    }
 +    return nthreads;
 +}
 +#endif
 +
 +
 +int mdrunner(int nthreads_requested, FILE *fplog,t_commrec *cr,int nfile,
 +             const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
 +             gmx_bool bCompact, int nstglobalcomm,
 +             ivec ddxyz,int dd_node_order,real rdd,real rconstr,
 +             const char *dddlb_opt,real dlb_scale,
 +             const char *ddcsx,const char *ddcsy,const char *ddcsz,
 +             int nstepout,int resetstep,int nmultisim,int repl_ex_nst,
 +             int repl_ex_seed, real pforce,real cpt_period,real max_hours,
 +             const char *deviceOptions, unsigned long Flags)
 +{
 +    double     nodetime=0,realtime;
 +    t_inputrec *inputrec;
 +    t_state    *state=NULL;
 +    matrix     box;
 +    gmx_ddbox_t ddbox={0};
 +    int        npme_major,npme_minor;
 +    real       tmpr1,tmpr2;
 +    t_nrnb     *nrnb;
 +    gmx_mtop_t *mtop=NULL;
 +    t_mdatoms  *mdatoms=NULL;
 +    t_forcerec *fr=NULL;
 +    t_fcdata   *fcd=NULL;
 +    real       ewaldcoeff=0;
 +    gmx_pme_t  *pmedata=NULL;
 +    gmx_vsite_t *vsite=NULL;
 +    gmx_constr_t constr;
 +    int        i,m,nChargePerturbed=-1,status,nalloc;
 +    char       *gro;
 +    gmx_wallcycle_t wcycle;
 +    gmx_bool       bReadRNG,bReadEkin;
 +    int        list;
 +    gmx_runtime_t runtime;
 +    int        rc;
 +    gmx_large_int_t reset_counters;
 +    gmx_edsam_t ed=NULL;
 +    t_commrec   *cr_old=cr; 
-         nthreads = get_nthreads(nthreads_requested, inputrec, mtop);
++    int         nthreads_mpi=1;
++    int         nthreads_pme=1;
 +    gmx_membed_t *membed=NULL;
 +
 +    /* CAUTION: threads may be started later on in this function, so
 +       cr doesn't reflect the final parallel state right now */
 +    snew(inputrec,1);
 +    snew(mtop,1);
 +
 +    if (bVerbose && SIMMASTER(cr))
 +    {
 +        fprintf(stderr,"Getting Loaded...\n");
 +    }
 +    
 +    if (Flags & MD_APPENDFILES) 
 +    {
 +        fplog = NULL;
 +    }
 +
 +    snew(state,1);
 +    if (MASTER(cr)) 
 +    {
 +        /* Read (nearly) all data required for the simulation */
 +        read_tpx_state(ftp2fn(efTPX,nfile,fnm),inputrec,state,NULL,mtop);
 +
 +        /* NOW the threads will be started: */
 +#ifdef GMX_THREADS
-         if (nthreads > 1)
++        nthreads_mpi = get_nthreads_mpi(nthreads_requested, inputrec, mtop);
 +
-             cr=mdrunner_start_threads(nthreads, fplog, cr_old, nfile, fnm, 
++        if (nthreads_mpi > 1)
 +        {
 +            /* now start the threads. */
-     wcycle = wallcycle_init(fplog,resetstep,cr);
++            cr=mdrunner_start_threads(nthreads_mpi, fplog, cr_old, nfile, fnm,
 +                                      oenv, bVerbose, bCompact, nstglobalcomm, 
 +                                      ddxyz, dd_node_order, rdd, rconstr, 
 +                                      dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
 +                                      nstepout, resetstep, nmultisim, 
 +                                      repl_ex_nst, repl_ex_seed, pforce, 
 +                                      cpt_period, max_hours, deviceOptions, 
 +                                      Flags);
 +            /* the main thread continues here with a new cr. We don't deallocate
 +               the old cr because other threads may still be reading it. */
 +            if (cr == NULL)
 +            {
 +                gmx_comm("Failed to spawn threads");
 +            }
 +        }
 +#endif
 +    }
 +    /* END OF CAUTION: cr is now reliable */
 +
 +    /* g_membed initialisation *
 +     * Because we change the mtop, init_membed is called before the init_parallel *
 +     * (in case we ever want to make it run in parallel) */
 +    if (opt2bSet("-membed",nfile,fnm))
 +    {
 +      fprintf(stderr,"Entering membed code");
 +        snew(membed,1);
 +        init_membed(fplog,membed,nfile,fnm,mtop,inputrec,state,cr,&cpt_period);
 +    }
 +
 +    if (PAR(cr))
 +    {
 +        /* now broadcast everything to the non-master nodes/threads: */
 +        init_parallel(fplog, cr, inputrec, mtop);
 +    }
 +    if (fplog != NULL)
 +    {
 +        pr_inputrec(fplog,0,"Input Parameters",inputrec,FALSE);
 +    }
 +
 +    /* now make sure the state is initialized and propagated */
 +    set_state_entries(state,inputrec,cr->nnodes);
 +
 +    /* A parallel command line option consistency check that we can
 +       only do after any threads have started. */
 +    if (!PAR(cr) &&
 +        (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
 +    {
 +        gmx_fatal(FARGS,
 +                  "The -dd or -npme option request a parallel simulation, "
 +#ifndef GMX_MPI
 +                  "but mdrun was compiled without threads or MPI enabled"
 +#else
 +#ifdef GMX_THREADS
 +                  "but the number of threads (option -nt) is 1"
 +#else
 +                  "but mdrun was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec" 
 +#endif
 +#endif
 +            );
 +    }
 +
 +    if ((Flags & MD_RERUN) &&
 +        (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
 +    {
 +        gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
 +    }
 +
 +    if (can_use_allvsall(inputrec,mtop,TRUE,cr,fplog))
 +    {
 +        /* All-vs-all loops do not work with domain decomposition */
 +        Flags |= MD_PARTDEC;
 +    }
 +
 +    if (!EEL_PME(inputrec->coulombtype) || (Flags & MD_PARTDEC))
 +    {
 +        cr->npmenodes = 0;
 +    }
 +
 +#ifdef GMX_FAHCORE
 +    fcRegisterSteps(inputrec->nsteps,inputrec->init_step);
 +#endif
 +
 +    /* NMR restraints must be initialized before load_checkpoint,
 +     * since with time averaging the history is added to t_state.
 +     * For proper consistency check we therefore need to extend
 +     * t_state here.
 +     * So the PME-only nodes (if present) will also initialize
 +     * the distance restraints.
 +     */
 +    snew(fcd,1);
 +
 +    /* This needs to be called before read_checkpoint to extend the state */
 +    init_disres(fplog,mtop,inputrec,cr,Flags & MD_PARTDEC,fcd,state);
 +
 +    if (gmx_mtop_ftype_count(mtop,F_ORIRES) > 0)
 +    {
 +        if (PAR(cr) && !(Flags & MD_PARTDEC))
 +        {
 +            gmx_fatal(FARGS,"Orientation restraints do not work (yet) with domain decomposition, use particle decomposition (mdrun option -pd)");
 +        }
 +        /* Orientation restraints */
 +        if (MASTER(cr))
 +        {
 +            init_orires(fplog,mtop,state->x,inputrec,cr->ms,&(fcd->orires),
 +                        state);
 +        }
 +    }
 +
 +    if (DEFORM(*inputrec))
 +    {
 +        /* Store the deform reference box before reading the checkpoint */
 +        if (SIMMASTER(cr))
 +        {
 +            copy_mat(state->box,box);
 +        }
 +        if (PAR(cr))
 +        {
 +            gmx_bcast(sizeof(box),box,cr);
 +        }
 +        /* Because we do not have the update struct available yet
 +         * in which the reference values should be stored,
 +         * we store them temporarily in static variables.
 +         * This should be thread safe, since they are only written once
 +         * and with identical values.
 +         */
 +#ifdef GMX_THREADS
 +        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
 +#endif
 +        deform_init_init_step_tpx = inputrec->init_step;
 +        copy_mat(box,deform_init_box_tpx);
 +#ifdef GMX_THREADS
 +        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
 +#endif
 +    }
 +
 +    if (opt2bSet("-cpi",nfile,fnm)) 
 +    {
 +        /* Check if checkpoint file exists before doing continuation.
 +         * This way we can use identical input options for the first and subsequent runs...
 +         */
 +        if( gmx_fexist_master(opt2fn_master("-cpi",nfile,fnm,cr),cr) )
 +        {
 +            load_checkpoint(opt2fn_master("-cpi",nfile,fnm,cr),&fplog,
 +                            cr,Flags & MD_PARTDEC,ddxyz,
 +                            inputrec,state,&bReadRNG,&bReadEkin,
 +                            (Flags & MD_APPENDFILES));
 +            
 +            if (bReadRNG)
 +            {
 +                Flags |= MD_READ_RNG;
 +            }
 +            if (bReadEkin)
 +            {
 +                Flags |= MD_READ_EKIN;
 +            }
 +        }
 +    }
 +
 +    if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES))
 +#ifdef GMX_THREADS
 +        /* With thread MPI only the master node/thread exists in mdrun.c,
 +         * therefore non-master nodes need to open the "seppot" log file here.
 +         */
 +        || (!MASTER(cr) && (Flags & MD_SEPPOT))
 +#endif
 +        )
 +    {
 +        gmx_log_open(ftp2fn(efLOG,nfile,fnm),cr,!(Flags & MD_SEPPOT),
 +                             Flags,&fplog);
 +    }
 +
 +    if (SIMMASTER(cr)) 
 +    {
 +        copy_mat(state->box,box);
 +    }
 +
 +    if (PAR(cr)) 
 +    {
 +        gmx_bcast(sizeof(box),box,cr);
 +    }
 +
 +    /* Essential dynamics */
 +    if (opt2bSet("-ei",nfile,fnm))
 +    {
 +        /* Open input and output files, allocate space for ED data structure */
 +        ed = ed_open(nfile,fnm,Flags,cr);
 +    }
 +
 +    if (bVerbose && SIMMASTER(cr))
 +    {
 +        fprintf(stderr,"Loaded with Money\n\n");
 +    }
 +
 +    if (PAR(cr) && !((Flags & MD_PARTDEC) ||
 +                     EI_TPI(inputrec->eI) ||
 +                     inputrec->eI == eiNM))
 +    {
 +        cr->dd = init_domain_decomposition(fplog,cr,Flags,ddxyz,rdd,rconstr,
 +                                           dddlb_opt,dlb_scale,
 +                                           ddcsx,ddcsy,ddcsz,
 +                                           mtop,inputrec,
 +                                           box,state->x,
 +                                           &ddbox,&npme_major,&npme_minor);
 +
 +        make_dd_communicators(fplog,cr,dd_node_order);
 +
 +        /* Set overallocation to avoid frequent reallocation of arrays */
 +        set_over_alloc_dd(TRUE);
 +    }
 +    else
 +    {
 +        /* PME, if used, is done on all nodes with 1D decomposition */
 +        cr->npmenodes = 0;
 +        cr->duty = (DUTY_PP | DUTY_PME);
 +        npme_major = 1;
 +        npme_minor = 1;
 +        if (!EI_TPI(inputrec->eI))
 +        {
 +            npme_major = cr->nnodes;
 +        }
 +        
 +        if (inputrec->ePBC == epbcSCREW)
 +        {
 +            gmx_fatal(FARGS,
 +                      "pbc=%s is only implemented with domain decomposition",
 +                      epbc_names[inputrec->ePBC]);
 +        }
 +    }
 +
 +    if (PAR(cr))
 +    {
 +        /* After possible communicator splitting in make_dd_communicators.
 +         * we can set up the intra/inter node communication.
 +         */
 +        gmx_setup_nodecomm(fplog,cr);
 +    }
 +
-                                   (Flags & MD_REPRODUCIBLE));
++    /* get number of OpenMP/PME threads
++     * env variable should be read only on one node to make sure it is identical everywhere */
++#ifdef GMX_OPENMP
++    if (EEL_PME(inputrec->coulombtype))
++    {
++        if (MASTER(cr))
++        {
++            char *ptr;
++            if ((ptr=getenv("GMX_PME_NTHREADS")) != NULL)
++            {
++                sscanf(ptr,"%d",&nthreads_pme);
++            }
++            if (fplog != NULL && nthreads_pme > 1)
++            {
++                fprintf(fplog,"Using %d threads for PME\n",nthreads_pme);
++            }
++        }
++        if (PAR(cr))
++        {
++            gmx_bcast_sim(sizeof(nthreads_pme),&nthreads_pme,cr);
++        }
++    }
++#endif
++
++    wcycle = wallcycle_init(fplog,resetstep,cr,nthreads_pme);
 +    if (PAR(cr))
 +    {
 +        /* Master synchronizes its value of reset_counters with all nodes 
 +         * including PME only nodes */
 +        reset_counters = wcycle_get_reset_counters(wcycle);
 +        gmx_bcast_sim(sizeof(reset_counters),&reset_counters,cr);
 +        wcycle_set_reset_counters(wcycle, reset_counters);
 +    }
 +
 +
 +    snew(nrnb,1);
 +    if (cr->duty & DUTY_PP)
 +    {
 +        /* For domain decomposition we allocate dynamically
 +         * in dd_partition_system.
 +         */
 +        if (DOMAINDECOMP(cr))
 +        {
 +            bcast_state_setup(cr,state);
 +        }
 +        else
 +        {
 +            if (PAR(cr))
 +            {
 +                bcast_state(cr,state,TRUE);
 +            }
 +        }
 +
 +        /* Dihedral Restraints */
 +        if (gmx_mtop_ftype_count(mtop,F_DIHRES) > 0)
 +        {
 +            init_dihres(fplog,mtop,inputrec,fcd);
 +        }
 +
 +        /* Initiate forcerecord */
 +        fr = mk_forcerec();
 +        init_forcerec(fplog,oenv,fr,fcd,inputrec,mtop,cr,box,FALSE,
 +                      opt2fn("-table",nfile,fnm),
 +                      opt2fn("-tabletf",nfile,fnm),
 +                      opt2fn("-tablep",nfile,fnm),
 +                      opt2fn("-tableb",nfile,fnm),FALSE,pforce);
 +
 +        /* version for PCA_NOT_READ_NODE (see md.c) */
 +        /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
 +          "nofile","nofile","nofile","nofile",FALSE,pforce);
 +          */        
 +        fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);
 +
 +        /* Initialize QM-MM */
 +        if(fr->bQMMM)
 +        {
 +            init_QMMMrec(cr,box,mtop,inputrec,fr);
 +        }
 +
 +        /* Initialize the mdatoms structure.
 +         * mdatoms is not filled with atom data,
 +         * as this can not be done now with domain decomposition.
 +         */
 +        mdatoms = init_mdatoms(fplog,mtop,inputrec->efep!=efepNO);
 +
 +        /* Initialize the virtual site communication */
 +        vsite = init_vsite(mtop,cr);
 +
 +        calc_shifts(box,fr->shift_vec);
 +
 +        /* With periodic molecules the charge groups should be whole at start up
 +         * and the virtual sites should not be far from their proper positions.
 +         */
 +        if (!inputrec->bContinuation && MASTER(cr) &&
 +            !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
 +        {
 +            /* Make molecules whole at start of run */
 +            if (fr->ePBC != epbcNONE)
 +            {
 +                do_pbc_first_mtop(fplog,inputrec->ePBC,box,mtop,state->x);
 +            }
 +            if (vsite)
 +            {
 +                /* Correct initial vsite positions are required
 +                 * for the initial distribution in the domain decomposition
 +                 * and for the initial shell prediction.
 +                 */
 +                construct_vsites_mtop(fplog,vsite,mtop,state->x);
 +            }
 +        }
 +
 +        /* Initiate PPPM if necessary */
 +        if (fr->eeltype == eelPPPM)
 +        {
 +            if (mdatoms->nChargePerturbed)
 +            {
 +                gmx_fatal(FARGS,"Free energy with %s is not implemented",
 +                          eel_names[fr->eeltype]);
 +            }
 +            status = gmx_pppm_init(fplog,cr,oenv,FALSE,TRUE,box,
 +                                   getenv("GMXGHAT"),inputrec, (Flags & MD_REPRODUCIBLE));
 +            if (status != 0)
 +            {
 +                gmx_fatal(FARGS,"Error %d initializing PPPM",status);
 +            }
 +        }
 +
 +        if (EEL_PME(fr->eeltype))
 +        {
 +            ewaldcoeff = fr->ewaldcoeff;
 +            pmedata = &fr->pmedata;
 +        }
 +        else
 +        {
 +            pmedata = NULL;
 +        }
 +    }
 +    else
 +    {
 +        /* This is a PME only node */
 +
 +        /* We don't need the state */
 +        done_state(state);
 +
 +        ewaldcoeff = calc_ewaldcoeff(inputrec->rcoulomb, inputrec->ewald_rtol);
 +        snew(pmedata,1);
 +    }
 +
 +    /* Initiate PME if necessary,
 +     * either on all nodes or on dedicated PME nodes only. */
 +    if (EEL_PME(inputrec->coulombtype))
 +    {
 +        if (mdatoms)
 +        {
 +            nChargePerturbed = mdatoms->nChargePerturbed;
 +        }
 +        if (cr->npmenodes > 0)
 +        {
 +            /* The PME only nodes need to know nChargePerturbed */
 +            gmx_bcast_sim(sizeof(nChargePerturbed),&nChargePerturbed,cr);
 +        }
++
++
++        /* Set CPU affinity. Can be important for performance.
++           On some systems (e.g. Cray) CPU Affinity is set by default.
++           But default assigning doesn't work (well) with only some ranks
++           having threads. This causes very low performance.
++           External tools have cumbersome syntax for setting affinity
++           in the case that only some ranks have threads.
++           Thus it is important that GROMACS sets the affinity internally
++           if only PME is using threads.
++        */
++
++#ifdef GMX_OPENMP
++#ifdef __linux
++#ifdef GMX_LIB_MPI
++        {
++            int core;
++            MPI_Comm comm_intra; /* intra communicator (but different to nc.comm_intra includes PME nodes) */
++            MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),gmx_node_rank(),&comm_intra);
++            int local_omp_nthreads = (cr->duty & DUTY_PME) ? nthreads_pme : 1; /* threads on this node */
++            MPI_Scan(&local_omp_nthreads,&core, 1, MPI_INT, MPI_SUM, comm_intra);
++            core-=local_omp_nthreads; /* make exclusive scan */
++#pragma omp parallel firstprivate(core) num_threads(local_omp_nthreads)
++            {
++                cpu_set_t mask;
++                CPU_ZERO(&mask);
++                core+=omp_get_thread_num();
++                CPU_SET(core,&mask);
++                sched_setaffinity((pid_t) syscall (SYS_gettid),sizeof(cpu_set_t),&mask);
++            }
++        }
++#endif /*GMX_MPI*/
++#endif /*__linux*/
++#endif /*GMX_OPENMP*/
++
 +        if (cr->duty & DUTY_PME)
 +        {
 +            status = gmx_pme_init(pmedata,cr,npme_major,npme_minor,inputrec,
 +                                  mtop ? mtop->natoms : 0,nChargePerturbed,
++                                  (Flags & MD_REPRODUCIBLE),nthreads_pme);
 +            if (status != 0) 
 +            {
 +                gmx_fatal(FARGS,"Error %d initializing PME",status);
 +            }
 +        }
 +    }
 +
 +
 +    if (integrator[inputrec->eI].func == do_md
 +#ifdef GMX_OPENMM
 +        ||
 +        integrator[inputrec->eI].func == do_md_openmm
 +#endif
 +        )
 +    {
 +        /* Turn on signal handling on all nodes */
 +        /*
 +         * (A user signal from the PME nodes (if any)
 +         * is communicated to the PP nodes.
 +         */
 +        signal_handler_install();
 +    }
 +
 +    if (cr->duty & DUTY_PP)
 +    {
 +        if (inputrec->ePull != epullNO)
 +        {
 +            /* Initialize pull code */
 +            init_pull(fplog,inputrec,nfile,fnm,mtop,cr,oenv,
 +                      EI_DYNAMICS(inputrec->eI) && MASTER(cr),Flags);
 +        }
 +        
 +        if (inputrec->bRot)
 +        {
 +           /* Initialize enforced rotation code */
 +           init_rot(fplog,inputrec,nfile,fnm,cr,state->x,state->box,mtop,oenv,
 +                    bVerbose,Flags);
 +        }
 +
 +        constr = init_constraints(fplog,mtop,inputrec,ed,state,cr);
 +
 +        if (DOMAINDECOMP(cr))
 +        {
 +            dd_init_bondeds(fplog,cr->dd,mtop,vsite,constr,inputrec,
 +                            Flags & MD_DDBONDCHECK,fr->cginfo_mb);
 +
 +            set_dd_parameters(fplog,cr->dd,dlb_scale,inputrec,fr,&ddbox);
 +
 +            setup_dd_grid(fplog,cr->dd);
 +        }
 +
 +        /* Now do whatever the user wants us to do (how flexible...) */
 +        integrator[inputrec->eI].func(fplog,cr,nfile,fnm,
 +                                      oenv,bVerbose,bCompact,
 +                                      nstglobalcomm,
 +                                      vsite,constr,
 +                                      nstepout,inputrec,mtop,
 +                                      fcd,state,
 +                                      mdatoms,nrnb,wcycle,ed,fr,
 +                                      repl_ex_nst,repl_ex_seed,
 +                                      membed,
 +                                      cpt_period,max_hours,
 +                                      deviceOptions,
 +                                      Flags,
 +                                      &runtime);
 +
 +        if (inputrec->ePull != epullNO)
 +        {
 +            finish_pull(fplog,inputrec->pull);
 +        }
 +        
 +        if (inputrec->bRot)
 +        {
 +            finish_rot(fplog,inputrec->rot);
 +        }
 +
 +    } 
 +    else 
 +    {
 +        /* do PME only */
 +        gmx_pmeonly(*pmedata,cr,nrnb,wcycle,ewaldcoeff,FALSE,inputrec);
 +    }
 +
 +    if (EI_DYNAMICS(inputrec->eI) || EI_TPI(inputrec->eI))
 +    {
 +        /* Some timing stats */  
 +        if (SIMMASTER(cr))
 +        {
 +            if (runtime.proc == 0)
 +            {
 +                runtime.proc = runtime.real;
 +            }
 +        }
 +        else
 +        {
 +            runtime.real = 0;
 +        }
 +    }
 +
 +    wallcycle_stop(wcycle,ewcRUN);
 +
 +    /* Finish up, write some stuff
 +     * if rerunMD, don't write last frame again 
 +     */
 +    finish_run(fplog,cr,ftp2fn(efSTO,nfile,fnm),
 +               inputrec,nrnb,wcycle,&runtime,
 +               EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
 +    
 +    if (opt2bSet("-membed",nfile,fnm))
 +    {
 +        sfree(membed);
 +    }
 +
 +    /* Does what it says */  
 +    print_date_and_time(fplog,cr->nodeid,"Finished mdrun",&runtime);
 +
 +    /* Close logfile already here if we were appending to it */
 +    if (MASTER(cr) && (Flags & MD_APPENDFILES))
 +    {
 +        gmx_log_close(fplog);
 +    } 
 +
 +    rc=(int)gmx_get_stop_condition();
 +
 +#ifdef GMX_THREADS
 +    /* we need to join all threads. The sub-threads join when they
 +       exit this function, but the master thread needs to be told to 
 +       wait for that. */
 +    if (PAR(cr) && MASTER(cr))
 +    {
 +        tMPI_Finalize();
 +    }
 +#endif
 +
 +    return rc;
 +}
index 99a46d5a9b49dfcec42cf572448bd9c76d58afea,91e2ac7c9cfc8a7224cf5116c292291ba5eeca4e..3002230e8f4e4c86dfcc6366973152ed77787ad8
@@@ -18,8 -19,8 +18,8 @@@ add_library(gmxan
              gmx_nmens.c     gmx_order.c     gmx_principal.c 
              gmx_polystat.c  gmx_potential.c gmx_rama.c      
              gmx_rdf.c       gmx_rms.c       gmx_rmsf.c      
-             gmx_rotacf.c    gmx_saltbr.c    gmx_sas.c              
+             gmx_rotacf.c    gmx_saltbr.c    gmx_sas.c       gmx_sans.c
 -            gmx_select.c    gmx_rmsdist.c   gmx_rotmat.c
 +            gmx_rmsdist.c   gmx_rotmat.c
              gmx_sgangle.c   gmx_sorient.c   gmx_spol.c      gmx_tcaf.c      
              gmx_traj.c      gmx_velacc.c    gmx_helixorient.c 
              gmx_clustsize.c gmx_mdmat.c     gmx_wham.c      
@@@ -28,9 -29,9 +28,9 @@@
              gmx_editconf.c  gmx_genbox.c    gmx_genion.c    gmx_genconf.c   
              gmx_genpr.c     gmx_eneconv.c   gmx_vanhove.c   gmx_wheel.c     
              addconf.c       calcpot.c       edittop.c       gmx_bar.c
 -            gmx_membed.c    gmx_pme_error.c gmx_options.c   gmx_dos.c
 +            gmx_pme_error.c gmx_options.c   gmx_dos.c
              gmx_hydorder.c  gmx_densorder.c powerspect.c    dens_filter.c
-             binsearch.c
+             binsearch.c     gmx_dyecoupl.c
              )
  
  
@@@ -50,11 -51,11 +50,11 @@@ set(GMX_TOOLS_PROGRAM
      g_dyndom g_enemat g_energy g_lie g_filter g_gyrate
      g_h2order g_hbond g_helix g_mindist g_msd g_morph g_nmeig
      g_nmens g_order g_kinetics g_polystat g_potential g_rama g_rdf g_rms
 -    g_rmsf g_rotacf g_saltbr g_sas g_select g_sgangle g_sham g_sorient
 +    g_rmsf g_rotacf g_saltbr g_sas g_sgangle g_sham g_sorient
      g_spol g_spatial g_tcaf g_traj g_tune_pme g_vanhove
      g_velacc g_clustsize g_mdmat g_wham g_sigeps g_bar
 -    g_membed g_pme_error g_rmsdist g_rotmat g_options
 +    g_pme_error g_rmsdist g_rotmat g_options
-     g_dos    g_hydorder  g_densorder
+     g_dos    g_hydorder  g_densorder g_dyecoupl g_sans
      )
  
  set(GMX_TOOLS_PROGRAMS_NOT_FOR_INSTALLATION
index 0000000000000000000000000000000000000000,1817a5e5fd0c3e920def29fe87ea18d84408a2d5..2fc5b30cacd8fc9312fdbaad48243b99eb8532fb
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,431 +1,428 @@@
 -#include <nbsearch.h>
 -#include <trajana.h>
 -#include <math.h>
+ /*
+  *
+  *                This source code is part of
+  *
+  *                 G   R   O   M   A   C   S
+  *
+  *          GROningen MAchine for Chemical Simulations
+  *
+  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+  * Copyright (c) 2001-2009, The GROMACS development team,
+  * check out http://www.gromacs.org for more information.
+  * This program is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU General Public License
+  * as published by the Free Software Foundation; either version 2
+  * of the License, or (at your option) any later version.
+  *
+  * If you want to redistribute modifications, please consider that
+  * scientific software is very special. Version control is crucial -
+  * bugs must be traceable. We will be happy to consider code for
+  * inclusion in the official distribution, but derived work must not
+  * be called official GROMACS. Details are found in the README & COPYING
+  * files - if they are missing, get the official version at www.gromacs.org.
+  *
+  * To help us fund GROMACS development, we humbly ask that you cite
+  * the papers on the package - you can find them in the top README file.
+  *
+  * For more info, check our website at http://www.gromacs.org
+  */
+ #include <copyrite.h>
+ #include <filenm.h>
+ #include <macros.h>
+ #include <pbc.h>
+ #include <smalloc.h>
+ #include <statutil.h>
+ #include <vec.h>
+ #include <xvgr.h>
+ int gmx_dyecoupl(int argc, char *argv[])
+ {
+     const char *desc[] =
+     {
+             "This tool extracts dye dynamics from trajectory files.",
+             "Currently, R and kappa^2 between dyes is extracted for (F)RET",
+             "simulations with assumed dipolar coupling as in the Foerster equation.",
+             "It further allows the calculation of R(t) and kappa^2(t), R and",
+             "kappa^2 histograms and averages, as well as the instantaneous FRET",
+             "efficiency E(t) for a specified Foerster radius R_0 (switch [TT]-R0[tt]).",
+             "The input dyes have to be whole (see res and mol pbc options",
+             "in [TT]trjconv[tt]).",
+             "The dye transition dipole moment has to be defined by at least",
+             "a single atom pair, however multiple atom pairs can be provided ",
+             "in the index file. The distance R is calculated on the basis of",
+             "the COMs of the given atom pairs.",
+             "The [TT]-pbcdist[tt] option calculates distances to the nearest periodic",
+             "image instead to the distance in the box. This works however only,"
+             "for periodic boundaries in all 3 dimensions.",
+             "The [TT]-norm[tt] option (area-) normalizes the histograms."
+     };
+     
+       static gmx_bool bPBCdist = FALSE, bNormHist = FALSE;
+     int histbins = 50;
+     output_env_t oenv;
+     real R0=-1;
+     t_pargs pa[] =
+     {
+             { "-pbcdist", FALSE, etBOOL, { &bPBCdist },"Distance R based on PBC" },
+             { "-norm", FALSE, etBOOL, { &bNormHist },"Normalize histograms" },
+             { "-bins", FALSE, etINT, {&histbins},"# of histogram bins" },
+             { "-R0", FALSE, etREAL, {&R0},"Foerster radius including kappa^2=2/3 in nm" }
+     };
+ #define NPA asize(pa)
+     t_filenm fnm[] =
+     {
+             { efTRX, "-f", NULL, ffREAD },
+             { efNDX, NULL, NULL, ffREAD },
+             { efXVG, "-ot", "rkappa",ffOPTWR },
+             { efXVG, "-oe", "insteff",ffOPTWR },
+             { efDAT, "-o", "rkappa",ffOPTWR },
+             { efXVG, "-rhist","rhist", ffOPTWR },
+             { efXVG, "-khist", "khist", ffOPTWR }
+     };
+ #define NFILE asize(fnm)
+     const char *in_trajfile, *in_ndxfile, *out_xvgrkfile = NULL, *out_xvginstefffile = NULL, *out_xvgrhistfile = NULL, *out_xvgkhistfile = NULL,*out_datfile=NULL;
+     gmx_bool bHaveFirstFrame, bHaveNextFrame, indexOK = TRUE;
+     int ndon, nacc;
+     atom_id *donindex, *accindex;
+     char *grpnm;
+     t_atoms *atoms = NULL;
+     t_trxstatus *status;
+     t_trxframe fr;
+     int flags;
+     int allocblock = 1000;
+     real histexpand = 1e-6;
+     rvec donvec, accvec, donpos, accpos, dist, distnorm;
+     int natoms;
+     /*we rely on PBC autodetection (...currently)*/
+     int ePBC = -1;
+     real *rvalues=NULL, *kappa2values=NULL, *rhist=NULL, *khist=NULL;
+     t_pbc *pbc=NULL;
+     int i, bin;
+     FILE *rkfp = NULL, *rhfp = NULL, *khfp = NULL,*datfp=NULL,*iefp=NULL;
+     gmx_bool bRKout, bRhistout, bKhistout,bDatout,bInstEffout;
+     const char *rkleg[2] = { "R", "\\f{Symbol}k\\f{}\\S2\\N" };
+     const char *rhleg[1] = { "p(R)" };
+     const char *khleg[1] = { "p(\\f{Symbol}k\\f{}\\S2\\N)" };
+     const char *ieleg[1] = { "E\\sRET\\N(t)" };
+     real R, kappa2, insteff, Rs = 0., kappa2s = 0., insteffs=0., rmax, rmin, kmin = 0., kmax = 4.,
+             rrange, krange, rincr, kincr,Rfrac;
+     int rkcount = 0, rblocksallocated = 0, kblocksallocated = 0;
+     CopyRight(stderr, argv[0]);
+     parse_common_args(&argc,argv,PCA_CAN_BEGIN | PCA_CAN_END | PCA_CAN_VIEW | PCA_TIME_UNIT | PCA_BE_NICE, NFILE,fnm,NPA,pa,asize(desc),desc, 0,NULL,&oenv);
+     /* Check command line options for filenames and set bool flags when switch used*/
+     in_trajfile = opt2fn("-f", NFILE, fnm);
+     in_ndxfile = opt2fn("-n", NFILE, fnm);
+     out_xvgrkfile = opt2fn("-ot", NFILE, fnm);
+     out_xvgrhistfile = opt2fn("-rhist", NFILE, fnm);
+     out_xvgkhistfile = opt2fn("-khist", NFILE, fnm);
+     out_xvginstefffile = opt2fn("-oe", NFILE, fnm);
+     out_datfile = opt2fn("-o",NFILE,fnm);
+     bRKout = opt2bSet("-ot", NFILE, fnm);
+     bRhistout = opt2bSet("-rhist", NFILE, fnm);
+     bKhistout = opt2bSet("-khist", NFILE, fnm);
+     bDatout = opt2bSet("-o", NFILE, fnm);
+     bInstEffout = opt2bSet("-oe", NFILE, fnm);
+     /* PBC warning. */
+     if (bPBCdist)
+     {
+         printf("Calculating distances to periodic image.\n");
+         printf("Be careful! This produces only valid results for PBC in all three dimensions\n");
+     }
+     if (bInstEffout && R0<=0.)
+     {
+         gmx_fatal(FARGS,"You have to specify R0 and R0 has to be larger than 0 nm.\n\n");
+     }
+     printf("Select group with donor atom pairs defining the transition moment\n");
+     get_index(atoms, ftp2fn_null(efNDX, NFILE, fnm), 1, &ndon, &donindex,&grpnm);
+     printf("Select group with acceptor atom pairs defining the transition moment\n");
+     get_index(atoms, ftp2fn_null(efNDX, NFILE, fnm), 1, &nacc, &accindex,&grpnm);
+     printf("Reading first frame\n");
+     /* open trx file for reading */
+     flags=0;
+     flags = flags | TRX_READ_X;
+     bHaveFirstFrame = read_first_frame(oenv, &status, in_trajfile, &fr, flags);
+     if (bHaveFirstFrame)
+     {
+         printf("First frame is OK\n");
+         natoms = fr.natoms;
+         if ((ndon % 2 != 0) || (nacc % 2 != 0))
+         {
+             indexOK = FALSE;
+         }
+         else
+         {
+             for (i = 0; i < ndon;i++)
+             {
+                 if (donindex[i] >= natoms)
+                     indexOK = FALSE;
+             }
+             for (i = 0; i < nacc;i++)
+             {
+                 if (accindex[i] >= natoms)
+                     indexOK = FALSE;
+             }
+         }
+         if (indexOK)
+         {
+             if (bDatout)
+             {
+                 datfp = fopen(out_datfile,"w");
+             }
+             if (bRKout)
+             {
+                 rkfp = xvgropen(out_xvgrkfile,
+                         "Distance and \\f{Symbol}k\\f{}\\S2\\N trajectory",
+                         "Time (ps)", "Distance (nm) / \\f{Symbol}k\\f{}\\S2\\N",
+                         oenv);
+                 xvgr_legend(rkfp, 2, rkleg, oenv);
+             }
+             if (bInstEffout)
+             {
+                 iefp = xvgropen(out_xvginstefffile,
+                         "Instantaneous RET Efficiency",
+                         "Time (ps)", "RET Efficiency",
+                         oenv);
+                 xvgr_legend(iefp, 1, ieleg, oenv);
+             }
+             if (bRhistout)
+             {
+                 snew(rvalues, allocblock);
+                 rblocksallocated += 1;
+                 snew(rhist, histbins);
+             }
+             if (bKhistout)
+             {
+                 snew(kappa2values, allocblock);
+                 kblocksallocated += 1;
+                 snew(khist, histbins);
+             }
+             do
+             {
+                 clear_rvec(donvec);
+                 clear_rvec(accvec);
+                 clear_rvec(donpos);
+                 clear_rvec(accpos);
+                 for (i = 0; i < ndon / 2; i++)
+                 {
+                     rvec_sub(donvec, fr.x[donindex[2 * i]], donvec);
+                     rvec_add(donvec, fr.x[donindex[2 * i + 1]], donvec);
+                     rvec_add(donpos, fr.x[donindex[2 * i]], donpos);
+                     rvec_add(donpos, fr.x[donindex[2 * i + 1]], donpos);
+                 }
+                 for (i = 0; i < nacc / 2; i++)
+                 {
+                     rvec_sub(accvec, fr.x[accindex[2 * i]], accvec);
+                     rvec_add(accvec, fr.x[accindex[2 * i + 1]], accvec);
+                     rvec_add(accpos, fr.x[accindex[2 * i]], accpos);
+                     rvec_add(accpos, fr.x[accindex[2 * i + 1]], accpos);
+                 }
+                 unitv(donvec, donvec);
+                 unitv(accvec, accvec);
+                 svmul((real) 1. / ndon, donpos, donpos);
+                 svmul((real) 1. / nacc, accpos, accpos);
+                 if (bPBCdist)
+                 {
+                     set_pbc(pbc, ePBC, fr.box);
+                     pbc_dx(pbc, donpos, accpos, dist);
+                 }
+                 else
+                 {
+                     rvec_sub(donpos, accpos, dist);
+                 }
+                 unitv(dist, distnorm);
+                 R = norm(dist);
+                 kappa2 = iprod(donvec, accvec)- 3.* (iprod(donvec, distnorm) * iprod(distnorm, accvec));
+                 kappa2 *= kappa2;
+                 if (R0>0)
+                 {
+                     Rfrac=R/R0;
+                     insteff=1/(1+(Rfrac*Rfrac*Rfrac*Rfrac*Rfrac*Rfrac)*2/3/kappa2);
+                     insteffs+=insteff;
+                     if (bInstEffout)
+                     {
+                         fprintf(iefp, "%12.7f %12.7f\n", fr.time, insteff);
+                     }
+                 }
+                 Rs += R;
+                 kappa2s += kappa2;
+                 rkcount++;
+                 if (bRKout)
+                     fprintf(rkfp, "%12.7f %12.7f %12.7f\n", fr.time, R, kappa2);
+                 if (bDatout)
+                     fprintf(datfp, "%12.7f %12.7f %12.7f\n", fr.time, R, kappa2);
+                 if (bRhistout)
+                 {
+                     rvalues[rkcount-1] = R;
+                     if (rkcount % allocblock == 0)
+                     {
+                         srenew(rvalues, allocblock*(rblocksallocated+1));
+                         rblocksallocated += 1;
+                     }
+                 }
+                 if (bKhistout)
+                 {
+                     kappa2values[rkcount-1] = kappa2;
+                     if (rkcount % allocblock == 0)
+                     {
+                         srenew(kappa2values, allocblock*(kblocksallocated+1));
+                         kblocksallocated += 1;
+                     }
+                 }
+                 bHaveNextFrame = read_next_frame(oenv, status, &fr);
+             } while (bHaveNextFrame);
+             if (bRKout)
+                 ffclose(rkfp);
+             if (bDatout)
+                 ffclose(datfp);
+             if (bInstEffout)
+                 ffclose(iefp);
+             if (bRhistout)
+             {
+                 printf("Writing R-Histogram\n");
+                 rmin = rvalues[0];
+                 rmax = rvalues[0];
+                 for (i = 1; i < rkcount; i++)
+                 {
+                     if (rvalues[i] < rmin)
+                         rmin = rvalues[i];
+                     else if (rvalues[i] > rmax)
+                         rmax = rvalues[i];
+                 }
+                 rmin -= histexpand;
+                 rmax += histexpand;
+                 rrange = rmax - rmin;
+                 rincr = rrange / histbins;
+                 for (i = 1; i < rkcount; i++)
+                 {
+                     bin = (int) ((rvalues[i] - rmin) / rincr);
+                     rhist[bin] += 1;
+                 }
+                 if (bNormHist)
+                 {
+                     for (i = 0; i < histbins; i++)
+                         rhist[i] /= rkcount * rrange/histbins;
+                     rhfp = xvgropen(out_xvgrhistfile, "Distance Distribution",
+                             "R (nm)", "Normalized Probability", oenv);
+                 } else
+                 {
+                     rhfp = xvgropen(out_xvgrhistfile, "Distance Distribution",
+                             "R (nm)", "Probability", oenv);
+                 }
+                 xvgr_legend(rhfp, 1, rhleg, oenv);
+                 for (i = 0; i < histbins; i++)
+                 {
+                     fprintf(rhfp, "%12.7f %12.7f\n", (i + 0.5) * rincr + rmin,
+                             rhist[i]);
+                 }
+                 ffclose(rhfp);
+             }
+             if (bKhistout)
+             {
+                 printf("Writing kappa^2-Histogram\n");
+                 krange = kmax - kmin;
+                 kincr = krange / histbins;
+                 for (i = 1; i < rkcount; i++)
+                 {
+                     bin = (int) ((kappa2values[i] - kmin) / kincr);
+                     khist[bin] += 1;
+                 }
+                 if (bNormHist)
+                 {
+                     for (i = 0; i < histbins; i++)
+                         khist[i] /= rkcount * krange/histbins;
+                     khfp = xvgropen(out_xvgkhistfile,
+                             "\\f{Symbol}k\\f{}\\S2\\N Distribution",
+                             "\\f{Symbol}k\\f{}\\S2\\N",
+                             "Normalized Probability", oenv);
+                 } else
+                 {
+                     khfp = xvgropen(out_xvgkhistfile,
+                             "\\f{Symbol}k\\f{}\\S2\\N Distribution",
+                             "\\f{Symbol}k\\f{}\\S2\\N", "Probability", oenv);
+                 }
+                 xvgr_legend(khfp, 1, khleg, oenv);
+                 for (i = 0; i < histbins; i++)
+                 {
+                     fprintf(khfp, "%12.7f %12.7f\n", (i + 0.5) * kincr + kmin,
+                             khist[i]);
+                 }
+                 ffclose(khfp);
+             }
+             printf("\nAverages:\n");
+             printf("R_avg   = %8.4f nm\nKappa^2 = %8.4f\n", Rs / rkcount,
+                     kappa2s / rkcount);
+             if (R0>0)
+             {
+                 printf("E_RETavg   = %8.4f\n", insteffs / rkcount);
+             }
+             please_cite(stdout,"Hoefling2011");
+         }
+         else
+         {
+             gmx_fatal(FARGS,"Index file invalid, check your index file for correct pairs.\n");
+         }
+     }
+     else
+     {
+         gmx_fatal(FARGS,"Could not read first frame of the trajectory.\n");
+     }
+     thanx(stderr);
+     return 0;
+ }
index 3db3fad60d5c715f3d8fb6dffb141385d1c72949,8c2c89cd9448b0355493a9e72fe45e221f5d1fde..cceb219fb276f8e8239019c0aafcdd047eb426d0
  #include <config.h>
  #endif
  
+ #ifdef __linux
+ #define _GNU_SOURCE
+ #include <sched.h>
+ #include <sys/syscall.h>
+ #endif
  #include <signal.h>
  #include <stdlib.h>
  #include "typedefs.h"
 -#include "smalloc.h"
  #include "sysstuff.h"
 -#include "vec.h"
  #include "statutil.h"
  #include "macros.h"
  #include "copyrite.h"