Merge release-4-6 into master
authorRoland Schulz <roland@utk.edu>
Sun, 29 Apr 2012 01:52:57 +0000 (21:52 -0400)
committerRoland Schulz <roland@utk.edu>
Sun, 29 Apr 2012 01:55:58 +0000 (21:55 -0400)
Conflicts:
CMakeLists.txt (modified)
cmake/gmxCFlags.cmake
src/gromacs/legacyheaders/CMakeLists.txt (modified)
src/tools/gmx_membed.c
src/tools/gmx_select.c -> change applied to
            src/gromacs/trajectoryanalysis/modules/select.cpp

Conflicts (Deleted):
src/gromacs/legacyheaders/pppm.h
src/gromacs/mdlib/ghat.c
src/gromacs/mdlib/pppm.c

Moved:
include/gmx_header_config.h.cmakein -> src/gromacs/legacyheaders/gmx_header_config.h.cmakein

Change-Id: Iec19966d1d7d3dfe817fd3f28197ea620bf55b3e

75 files changed:
1  2 
CMakeLists.txt
cmake/gmxCFlags.cmake
src/config.h.cmakein
src/gromacs/genversion.sh
src/gromacs/gmxlib/atomprop.c
src/gromacs/gmxlib/checkpoint.c
src/gromacs/gmxlib/copyrite.c
src/gromacs/gmxlib/futil.c
src/gromacs/gmxlib/gmx_random.c
src/gromacs/gmxlib/gmxcpp.c
src/gromacs/gmxlib/index.c
src/gromacs/gmxlib/main.c
src/gromacs/gmxlib/matio.c
src/gromacs/gmxlib/mshift.c
src/gromacs/gmxlib/names.c
src/gromacs/gmxlib/oenv.c
src/gromacs/gmxlib/rando.c
src/gromacs/gmxlib/rmpbc.c
src/gromacs/gmxlib/shift_util.c
src/gromacs/gmxlib/sighandler.c
src/gromacs/gmxlib/statutil.c
src/gromacs/gmxlib/string2.c
src/gromacs/gmxlib/tpxio.c
src/gromacs/gmxlib/vmdio.c
src/gromacs/gmxlib/wman.c
src/gromacs/gmxpreprocess/fflibutil.c
src/gromacs/gmxpreprocess/gen_ad.c
src/gromacs/gmxpreprocess/genhydro.c
src/gromacs/gmxpreprocess/pdb2top.c
src/gromacs/gmxpreprocess/pgutil.c
src/gromacs/gmxpreprocess/pgutil.h
src/gromacs/gmxpreprocess/readir.c
src/gromacs/gmxpreprocess/topio.c
src/gromacs/legacyheaders/CMakeLists.txt
src/gromacs/legacyheaders/checkpoint.h
src/gromacs/legacyheaders/coulomb.h
src/gromacs/legacyheaders/futil.h
src/gromacs/legacyheaders/gmx_fatal.h
src/gromacs/legacyheaders/gmx_header_config.h.cmakein
src/gromacs/legacyheaders/mdrun.h
src/gromacs/legacyheaders/string2.h
src/gromacs/legacyheaders/sysstuff.h
src/gromacs/legacyheaders/thread_mpi/numa_malloc.h
src/gromacs/legacyheaders/types/commrec.h
src/gromacs/legacyheaders/types/enums.h
src/gromacs/legacyheaders/update.h
src/gromacs/legacyheaders/xdrf.h
src/gromacs/mdlib/coupling.c
src/gromacs/mdlib/domdec.c
src/gromacs/mdlib/domdec_setup.c
src/gromacs/mdlib/fft5d.c
src/gromacs/mdlib/force.c
src/gromacs/mdlib/forcerec.c
src/gromacs/mdlib/iteratedconstraints.c
src/gromacs/mdlib/md_support.c
src/gromacs/mdlib/mdatom.c
src/gromacs/mdlib/minimize.c
src/gromacs/mdlib/nlistheuristics.c
src/gromacs/mdlib/pme.c
src/gromacs/mdlib/shellfc.c
src/gromacs/mdlib/sim_util.c
src/gromacs/mdlib/stat.c
src/gromacs/mdlib/tables.c
src/gromacs/mdlib/update.c
src/gromacs/trajectoryanalysis/modules/select.cpp
src/programs/grompp/grompp.c
src/programs/mdrun/md.c
src/programs/mdrun/md_openmm.c
src/programs/mdrun/mdrun.c
src/programs/mdrun/runner.c
src/programs/mdrun/xutils.c
src/tools/gmx_editconf.c
src/tools/gmx_membed.c
src/tools/gmx_msd.c
src/tools/gmx_trjconv.c

diff --cc CMakeLists.txt
index 8348739ae87f5e3956496a868b6ac009e2388662,e5ebf9a10d9c334dbc4fea1fcd1a4991f2d9aca4..fd8f6ad481432daaa68ae0616d364665d7538468
@@@ -506,11 -483,9 +518,12 @@@ find_package(Doxygen
  ########################################################################
  
  add_definitions( -DHAVE_CONFIG_H )
 +include_directories(${CMAKE_SOURCE_DIR}/src)
 +# Required for config.h, maybe should only be set in src/CMakeLists.txt
  include_directories(${CMAKE_BINARY_DIR}/src)
 -include_directories(${CMAKE_BINARY_DIR}/include)
 -include_directories(${CMAKE_SOURCE_DIR}/include)
 +# Required for now to make old code compile
++include_directories(${CMAKE_BINARY_DIR}/src/gromacs/legacyheaders)
 +include_directories(${CMAKE_SOURCE_DIR}/src/gromacs/legacyheaders)
  
  include(gmxCheckBuildUserTime)
  gmx_check_build_user_time(BUILD_TIME BUILD_USER BUILD_MACHINE)
index 4e9762d99f563743f3cf645346e2fca2dd9c009b,cc6a5d64354d5096ce838d402a0faa5cc954bc3d..44d91f5907a451cabe9524a44049aaabdc86748d
@@@ -47,13 -47,11 +47,12 @@@ MACRO(gmx_c_flags
          if(NOT GMX_OPENMP)
              GMX_TEST_CFLAG(CXXFLAGS_PRAGMA "-Wno-unknown-pragmas" GMXC_CXXFLAGS)
          endif()
 -        GMX_TEST_CXXFLAG(CXXFLAGS_WARN "-Wall -Wno-unused -Wunused-value" GMXC_CXXFLAGS)
 -        GMX_TEST_CXXFLAG(CXXFLAGS_WARN "-Wextra -Wno-missing-field-initializers -Wno-sign-compare" GMXC_CXXFLAGS)
 +        GMX_TEST_CXXFLAG(CXXFLAGS_WARN "-Wall -Wno-unused-function -Wno-unused-parameter" GMXC_CXXFLAGS)
 +        GMX_TEST_CXXFLAG(CXXFLAGS_WARN "-Wnon-virtual-dtor" GMXC_CXXFLAGS)
 +        GMX_TEST_CXXFLAG(CXXFLAGS_WARN "-Wextra -Wno-missing-field-initializers" GMXC_CXXFLAGS)
        # new in gcc 4.5
-         GMX_TEST_CXXFLAG(CXXFLAGS_EXCESS_PREC "-fexcess-precision=fast" 
-                           GMXC_CXXFLAGS)
-         GMX_TEST_CXXFLAG(CXXFLAGS_COPT "-fomit-frame-pointer -finline-functions -funroll-all-loops" 
+         GMX_TEST_CXXFLAG(CXXFLAGS_EXCESS_PREC "-fexcess-precision=fast" GMXC_CXXFLAGS_RELEASE)
+         GMX_TEST_CXXFLAG(CXXFLAGS_COPT "-fomit-frame-pointer -finline-functions -funroll-all-loops"
                           GMXC_CXXFLAGS_RELEASE)
          GMX_TEST_CXXFLAG(CXXFLAGS_NOINLINE "-fno-inline" GMXC_CXXFLAGS_DEBUG)
      endif()
Simple merge
Simple merge
Simple merge
Simple merge
index 828aa3d621fb20e944044a984fd9d7be63f2f905,0000000000000000000000000000000000000000..9eb5470759aeda8ca3afd4f47bba1adf95071f77
mode 100644,000000..100644
--- /dev/null
@@@ -1,663 -1,0 +1,677 @@@
-       6, 2011, "e19791"
-     }
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROningen Mixture of Alchemy and Childrens' Stories
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#ifdef GMX_THREAD_MPI
 +#include <thread_mpi.h>
 +#endif
 +
 +/* This file is completely threadsafe - keep it that way! */
 +
 +#include <string.h>
 +#include <ctype.h>
 +#include "sysstuff.h"
 +#include "smalloc.h"
 +#include "string2.h"
 +#include "macros.h"
 +#include <time.h>
 +#include "random.h"
 +#include "statutil.h"
 +#include "copyrite.h"
 +#include "strdb.h"
 +#include "futil.h"
 +
 +static void pr_two(FILE *out,int c,int i)
 +{
 +  if (i < 10)
 +    fprintf(out,"%c0%1d",c,i);
 +  else
 +    fprintf(out,"%c%2d",c,i);
 +}
 +
 +void pr_difftime(FILE *out,double dt)
 +{
 +  int    ndays,nhours,nmins,nsecs;
 +  gmx_bool   bPrint,bPrinted;
 +
 +  ndays = dt/(24*3600);
 +  dt    = dt-24*3600*ndays;
 +  nhours= dt/3600;
 +  dt    = dt-3600*nhours;
 +  nmins = dt/60;
 +  dt    = dt-nmins*60;
 +  nsecs = dt;
 +  bPrint= (ndays > 0);
 +  bPrinted=bPrint;
 +  if (bPrint) 
 +    fprintf(out,"%d",ndays);
 +  bPrint=bPrint || (nhours > 0);
 +  if (bPrint) {
 +    if (bPrinted)
 +      pr_two(out,'d',nhours);
 +    else 
 +      fprintf(out,"%d",nhours);
 +  }
 +  bPrinted=bPrinted || bPrint;
 +  bPrint=bPrint || (nmins > 0);
 +  if (bPrint) {
 +    if (bPrinted)
 +      pr_two(out,'h',nmins);
 +    else 
 +      fprintf(out,"%d",nmins);
 +  }
 +  bPrinted=bPrinted || bPrint;
 +  if (bPrinted)
 +    pr_two(out,':',nsecs);
 +  else
 +    fprintf(out,"%ds",nsecs);
 +  fprintf(out,"\n");
 +}
 +
 +
 +gmx_bool be_cool(void)
 +{
 +  /* Yes, it is bad to check the environment variable every call,
 +   * but we dont call this routine often, and it avoids using 
 +   * a mutex for locking the variable...
 +   */
 +#ifdef GMX_FAHCORE
 +  /*be uncool*/
 +  return FALSE;
 +#else
 +  return (getenv("GMX_NO_QUOTES") == NULL);
 +#endif
 +}
 +
 +void space(FILE *out, int n)
 +{
 +  fprintf(out,"%*s",n,"");
 +}
 +
 +void f(char *a)
 +{
 +    int i;
 +    int len=strlen(a);
 +    
 +    for(i=0;i<len;i++)
 +        a[i]=~a[i]; 
 +}
 +
 +static void sp_print(FILE *out,const char *s)
 +{
 +  int slen;
 +  
 +  slen=strlen(s);
 +  space(out,(80-slen)/2);
 +  fprintf(out,"%s\n",s);
 +}
 +
 +static void ster_print(FILE *out,const char *s)
 +{
 +  int  slen;
 +  char buf[128];
 +  
 +  snprintf(buf,128,":-)  %s  (-:",s);
 +  slen=strlen(buf);
 +  space(out,(80-slen)/2);
 +  fprintf(out,"%s\n",buf);
 +}
 +
 +
 +static void pukeit(const char *db,const char *defstring, char *retstring, 
 +                 int retsize, int *cqnum)
 +{
 +  FILE *fp;
 +  char **help;
 +  int  i,nhlp;
 +  int  seed;
 + 
 +  if (be_cool() && ((fp = low_libopen(db,FALSE)) != NULL)) {
 +    nhlp=fget_lines(fp,&help);
 +    /* for libraries we can use the low-level close routines */
 +    ffclose(fp);
 +    seed=time(NULL);
 +    *cqnum=nhlp*rando(&seed);
 +    if (strlen(help[*cqnum]) >= STRLEN)
 +      help[*cqnum][STRLEN-1] = '\0';
 +    strncpy(retstring,help[*cqnum],retsize);
 +    f(retstring);
 +    for(i=0; (i<nhlp); i++)
 +      sfree(help[i]);
 +    sfree(help);
 +  }
 +  else 
 +    strncpy(retstring,defstring,retsize);
 +}
 +
 +void bromacs(char *retstring, int retsize)
 +{
 +  int dum;
 +
 +  pukeit("bromacs.dat",
 +       "Groningen Machine for Chemical Simulation",
 +       retstring,retsize,&dum);
 +}
 +
 +void cool_quote(char *retstring, int retsize, int *cqnum)
 +{
 +  char *tmpstr;
 +  char *s,*ptr;
 +  int tmpcq,*p;
 +  
 +  if (cqnum!=NULL)
 +    p = cqnum;
 +  else
 +    p = &tmpcq;
 +  
 +  /* protect audience from explicit lyrics */
 +  snew(tmpstr,retsize+1);
 +  pukeit("gurgle.dat","Thanx for Using GROMACS - Have a Nice Day",
 +       tmpstr,retsize-2,p);
 +
 +  if ((ptr = strchr(tmpstr,'_')) != NULL) {
 +    *ptr='\0';
 +    ptr++;
 +    sprintf(retstring,"\"%s\" %s",tmpstr,ptr);
 +  }
 +  else {
 +    strcpy(retstring,tmpstr);
 +  }
 +  sfree(tmpstr);
 +}
 +
 +void CopyRight(FILE *out,const char *szProgram)
 +{
 +  static const char * CopyrightText[] = {
 +             "Written by Emile Apol, Rossen Apostolov, Herman J.C. Berendsen,",
 +             "Aldert van Buuren, Pär Bjelkmar, Rudi van Drunen, Anton Feenstra, ",
 +             "Gerrit Groenhof, Peter Kasson, Per Larsson, Pieter Meulenhoff, ",
 +             "Teemu Murtola, Szilard Pall, Sander Pronk, Roland Schulz, ",
 +             "Michael Shirts, Alfons Sijbers, Peter Tieleman,\n",
 +             "Berk Hess, David van der Spoel, and Erik Lindahl.\n",
 +             "Copyright (c) 1991-2000, University of Groningen, The Netherlands.",
 +             "Copyright (c) 2001-2010, The GROMACS development team at",
 +             "Uppsala University & The Royal Institute of Technology, Sweden.",
 +             "check out http://www.gromacs.org for more information.\n"
 +  };
 +
 +  static const char * GPLText[] = {
 +              "This program is free software; you can redistribute it and/or",
 +              "modify it under the terms of the GNU General Public License",
 +              "as published by the Free Software Foundation; either version 2",
 +              "of the License, or (at your option) any later version."
 +  };
 +
 +  /* Dont change szProgram arbitrarily - it must be argv[0], i.e. the 
 +   * name of a file. Otherwise, we won't be able to find the library dir.
 +   */
 +#define NCR (int)asize(CopyrightText)
 +#ifdef GMX_FAHCORE
 +#define NGPL 0 /*FAH has an exception permission from GPL to allow digital signatures in Gromacs*/
 +#else
 +#define NGPL (int)asize(GPLText)
 +#endif
 +
 +  char buf[256],tmpstr[1024];
 +  int i;
 +
 +#ifdef GMX_FAHCORE
 +  set_program_name("Gromacs");
 +#else
 +  set_program_name(szProgram);
 +#endif
 +
 +  ster_print(out,"G  R  O  M  A  C  S");
 +  fprintf(out,"\n");
 +  
 +  bromacs(tmpstr,1023);
 +  sp_print(out,tmpstr); 
 +  fprintf(out,"\n");
 +
 +  ster_print(out,GromacsVersion());
 +  fprintf(out,"\n");
 +
 +  /* fprintf(out,"\n");*/
 +
 +  /* sp_print(out,"PLEASE NOTE: THIS IS A BETA VERSION\n");
 +  
 +  fprintf(out,"\n"); */
 +
 +  for(i=0; (i<NCR); i++) 
 +    sp_print(out,CopyrightText[i]);
 +  for(i=0; (i<NGPL); i++)
 +    sp_print(out,GPLText[i]);
 +
 +  fprintf(out,"\n");
 +
 +  snprintf(buf,256,"%s",Program());
 +#ifdef GMX_DOUBLE
 +  strcat(buf," (double precision)");
 +#endif
 +  ster_print(out,buf);
 +  fprintf(out,"\n");
 +}
 +
 +
 +void thanx(FILE *fp)
 +{
 +  char cq[1024];
 +  int  cqnum;
 +
 +  /* protect the audience from suggestive discussions */
 +  cool_quote(cq,1023,&cqnum);
 +  
 +  if (be_cool()) 
 +    fprintf(fp,"\ngcq#%d: %s\n\n",cqnum,cq);
 +  else
 +    fprintf(fp,"\n%s\n\n",cq);
 +}
 +
 +typedef struct {
 +  const char *key;
 +  const char *author;
 +  const char *title;
 +  const char *journal;
 +  int volume,year;
 +  const char *pages;
 +} t_citerec;
 +
 +void please_cite(FILE *fp,const char *key)
 +{
 +  static const t_citerec citedb[] = {
 +    { "Allen1987a",
 +      "M. P. Allen and D. J. Tildesley",
 +      "Computer simulation of liquids",
 +      "Oxford Science Publications",
 +      1, 1987, "1" },
 +    { "Berendsen95a",
 +      "H. J. C. Berendsen, D. van der Spoel and R. van Drunen",
 +      "GROMACS: A message-passing parallel molecular dynamics implementation",
 +      "Comp. Phys. Comm.",
 +      91, 1995, "43-56" },
 +    { "Berendsen84a",
 +      "H. J. C. Berendsen, J. P. M. Postma, A. DiNola and J. R. Haak",
 +      "Molecular dynamics with coupling to an external bath",
 +      "J. Chem. Phys.",
 +      81, 1984, "3684-3690" },
 +    { "Ryckaert77a",
 +      "J. P. Ryckaert and G. Ciccotti and H. J. C. Berendsen",
 +      "Numerical Integration of the Cartesian Equations of Motion of a System with Constraints; Molecular Dynamics of n-Alkanes",
 +      "J. Comp. Phys.",
 +      23, 1977, "327-341" },
 +    { "Miyamoto92a",
 +      "S. Miyamoto and P. A. Kollman",
 +      "SETTLE: An Analytical Version of the SHAKE and RATTLE Algorithms for Rigid Water Models",
 +      "J. Comp. Chem.",
 +      13, 1992, "952-962" },
 +    { "Cromer1968a",
 +      "D. T. Cromer & J. B. Mann",
 +      "X-ray scattering factors computed from numerical Hartree-Fock wave functions",
 +      "Acta Cryst. A",
 +      24, 1968, "321" },
 +    { "Barth95a",
 +      "E. Barth and K. Kuczera and B. Leimkuhler and R. D. Skeel",
 +      "Algorithms for Constrained Molecular Dynamics",
 +      "J. Comp. Chem.",
 +      16, 1995, "1192-1209" },
 +    { "Essmann95a",
 +      "U. Essmann, L. Perera, M. L. Berkowitz, T. Darden, H. Lee and L. G. Pedersen ",
 +      "A smooth particle mesh Ewald method",
 +      "J. Chem. Phys.",
 +      103, 1995, "8577-8592" },
 +    { "Torda89a",
 +      "A. E. Torda and R. M. Scheek and W. F. van Gunsteren",
 +      "Time-dependent distance restraints in molecular dynamics simulations",
 +      "Chem. Phys. Lett.",
 +      157, 1989, "289-294" },
 +    { "Tironi95a",
 +      "I. G. Tironi and R. Sperb and P. E. Smith and W. F. van Gunsteren",
 +      "Generalized reaction field method for molecular dynamics simulations",
 +      "J. Chem. Phys",
 +      102, 1995, "5451-5459" },
 +    { "Hess97a",
 +      "B. Hess and H. Bekker and H. J. C. Berendsen and J. G. E. M. Fraaije",
 +      "LINCS: A Linear Constraint Solver for molecular simulations",
 +      "J. Comp. Chem.",
 +      18, 1997, "1463-1472" },
 +    { "Hess2008a",
 +      "B. Hess",
 +      "P-LINCS: A Parallel Linear Constraint Solver for molecular simulation",
 +      "J. Chem. Theory Comput.",
 +      4, 2008, "116-122" },
 +    { "Hess2008b",
 +      "B. Hess and C. Kutzner and D. van der Spoel and E. Lindahl",
 +      "GROMACS 4: Algorithms for highly efficient, load-balanced, and scalable molecular simulation",
 +      "J. Chem. Theory Comput.",
 +      4, 2008, "435-447" },
 +    { "Hub2010",
 +      "J. S. Hub, B. L. de Groot and D. van der Spoel",
 +      "g_wham - A free weighted histogram analysis implementation including robust error and autocorrelation estimates",
 +      "J. Chem. Theory Comput.",
 +      6, 2010, "3713-3720"}, 
 +    { "In-Chul99a",
 +      "Y. In-Chul and M. L. Berkowitz",
 +      "Ewald summation for systems with slab geometry",
 +      "J. Chem. Phys.",
 +      111, 1999, "3155-3162" },
 +    { "DeGroot97a",
 +      "B. L. de Groot and D. M. F. van Aalten and R. M. Scheek and A. Amadei and G. Vriend and H. J. C. Berendsen",
 +      "Prediction of Protein Conformational Freedom From Distance Constrains",
 +      "Proteins",
 +      29, 1997, "240-251" },
 +    { "Spoel98a",
 +      "D. van der Spoel and P. J. van Maaren and H. J. C. Berendsen",
 +      "A systematic study of water models for molecular simulation. Derivation of models optimized for use with a reaction-field.",
 +      "J. Chem. Phys.",
 +      108, 1998, "10220-10230" },
 +    { "Wishart98a",
 +      "D. S. Wishart and A. M. Nip",
 +      "Protein Chemical Shift Analysis: A Practical Guide",
 +      "Biochem. Cell Biol.",
 +      76, 1998, "153-163" },
 +    { "Maiorov95",
 +      "V. N. Maiorov and G. M. Crippen",
 +      "Size-Independent Comparison of Protein Three-Dimensional Structures",
 +      "PROTEINS: Struct. Funct. Gen.",
 +      22, 1995, "273-283" },
 +    { "Feenstra99",
 +      "K. A. Feenstra and B. Hess and H. J. C. Berendsen",
 +      "Improving Efficiency of Large Time-scale Molecular Dynamics Simulations of Hydrogen-rich Systems",
 +      "J. Comput. Chem.",
 +      20, 1999, "786-798" },
 +    { "Timneanu2004a",
 +      "N. Timneanu and C. Caleman and J. Hajdu and D. van der Spoel",
 +      "Auger Electron Cascades in Water and Ice",
 +      "Chem. Phys.",
 +      299, 2004, "277-283" },
 +    { "Pascal2011a",
 +      "T. A. Pascal and S. T. Lin and W. A. Goddard III",
 +      "Thermodynamics of liquids: standard molar entropies and heat capacities of common solvents from 2PT molecular dynamics",
 +      "Phys. Chem. Chem. Phys.",
 +      13, 2011, "169-181" },
 +    { "Caleman2011b",
 +      "C. Caleman and M. Hong and J. S. Hub and L. T. da Costa and P. J. van Maaren and D. van der Spoel",
 +      "Force Field Benchmark 1: Density, Heat of Vaporization, Heat Capacity, Surface Tension and Dielectric Constant of 152 Organic Liquids",
 +      "Submitted",
 +      0, 2011, "" },
 +    { "Lindahl2001a",
 +      "E. Lindahl and B. Hess and D. van der Spoel",
 +      "GROMACS 3.0: A package for molecular simulation and trajectory analysis",
 +      "J. Mol. Mod.",
 +      7, 2001, "306-317" },
 +    { "Wang2001a",
 +      "J. Wang and W. Wang and S. Huo and M. Lee and P. A. Kollman",
 +      "Solvation model based on weighted solvent accessible surface area",
 +      "J. Phys. Chem. B",
 +      105, 2001, "5055-5067" },
 +    { "Eisenberg86a",
 +      "D. Eisenberg and A. D. McLachlan",
 +      "Solvation energy in protein folding and binding",
 +      "Nature",
 +      319, 1986, "199-203" },
 +    { "Eisenhaber95",
 +      "Frank Eisenhaber and Philip Lijnzaad and Patrick Argos and Chris Sander and Michael Scharf",
 +      "The Double Cube Lattice Method: Efficient Approaches to Numerical Integration of Surface Area and Volume and to Dot Surface Contouring of Molecular Assemblies",
 +      "J. Comp. Chem.",
 +      16, 1995, "273-284" },
 +    { "Hess2002",
 +      "B. Hess, H. Saint-Martin and H.J.C. Berendsen",
 +      "Flexible constraints: an adiabatic treatment of quantum degrees of freedom, with application to the flexible and polarizable MCDHO model for water",
 +      "J. Chem. Phys.",
 +      116, 2002, "9602-9610" },
 +    { "Hetenyi2002b",
 +      "Csaba Hetenyi and David van der Spoel",
 +      "Efficient docking of peptides to proteins without prior knowledge of the binding site.",
 +      "Prot. Sci.",
 +      11, 2002, "1729-1737" },
 +    { "Hess2003",
 +      "B. Hess and R.M. Scheek",
 +      "Orientation restraints in molecular dynamics simulations using time and ensemble averaging",
 +      "J. Magn. Res.",
 +      164, 2003, "19-27" },
 +    { "Rappe1991a",
 +      "A. K. Rappe and W. A. Goddard III",
 +      "Charge Equillibration for Molecular Dynamics Simulations",
 +      "J. Phys. Chem.",
 +      95, 1991, "3358-3363" },
 +    { "Mu2005a",
 +      "Y. Mu, P. H. Nguyen and G. Stock",
 +      "Energy landscape of a small peptide revelaed by dihedral angle principal component analysis",
 +      "Prot. Struct. Funct. Bioinf.",
 +      58, 2005, "45-52" },
 +    { "Okabe2001a",
 +      "T. Okabe and M. Kawata and Y. Okamoto and M. Mikami",
 +      "Replica-exchange {M}onte {C}arlo method for the isobaric-isothermal ensemble",
 +      "Chem. Phys. Lett.",
 +      335, 2001, "435-439" },
 +    { "Hukushima96a",
 +      "K. Hukushima and K. Nemoto",
 +      "Exchange Monte Carlo Method and Application to Spin Glass Simulations",
 +      "J. Phys. Soc. Jpn.",
 +      65, 1996, "1604-1608" },
 +    { "Tropp80a",
 +      "J. Tropp",
 +      "Dipolar Relaxation and Nuclear Overhauser effects in nonrigid molecules: The effect of fluctuating internuclear distances",
 +      "J. Chem. Phys.",
 +      72, 1980, "6035-6043" },
 +    { "Bultinck2002a",
 +       "P. Bultinck and W. Langenaeker and P. Lahorte and F. De Proft and P. Geerlings and M. Waroquier and J. P. Tollenaere",
 +      "The electronegativity equalization method I: Parametrization and validation for atomic charge calculations",
 +      "J. Phys. Chem. A",
 +      106, 2002, "7887-7894" },
 +    { "Yang2006b",
 +      "Q. Y. Yang and K. A. Sharp",
 +      "Atomic charge parameters for the finite difference Poisson-Boltzmann method using electronegativity neutralization",
 +      "J. Chem. Theory Comput.",
 +      2, 2006, "1152-1167" },
 +    { "Spoel2005a",
 +      "D. van der Spoel, E. Lindahl, B. Hess, G. Groenhof, A. E. Mark and H. J. C. Berendsen",
 +      "GROMACS: Fast, Flexible and Free",
 +      "J. Comp. Chem.",
 +      26, 2005, "1701-1719" },
 +    { "Spoel2006b",
 +      "D. van der Spoel, P. J. van Maaren, P. Larsson and N. Timneanu",
 +      "Thermodynamics of hydrogen bonding in hydrophilic and hydrophobic media",
 +      "J. Phys. Chem. B",
 +      110, 2006, "4393-4398" },
 +    { "Spoel2006d",
 +      "D. van der Spoel and M. M. Seibert",
 +      "Protein folding kinetics and thermodynamics from atomistic simulations",
 +      "Phys. Rev. Letters",
 +      96, 2006, "238102" },
 +    { "Palmer94a",
 +      "B. J. Palmer",
 +      "Transverse-current autocorrelation-function calculations of the shear viscosity for molecular liquids",
 +      "Phys. Rev. E",
 +      49, 1994, "359-366" },
 +    { "Bussi2007a",
 +      "G. Bussi, D. Donadio and M. Parrinello",
 +      "Canonical sampling through velocity rescaling",
 +      "J. Chem. Phys.",
 +      126, 2007, "014101" },
 +    { "Hub2006",
 +      "J. S. Hub and B. L. de Groot",
 +      "Does CO2 permeate through Aquaporin-1?",
 +      "Biophys. J.",
 +      91, 2006, "842-848" },
 +    { "Hub2008",
 +      "J. S. Hub and B. L. de Groot",
 +      "Mechanism of selectivity in aquaporins and aquaglyceroporins",
 +      "PNAS",
 +      105, 2008, "1198-1203" },
 +    { "Friedrich2009",
 +      "M. S. Friedrichs, P. Eastman, V. Vaidyanathan, M. Houston, S. LeGrand, A. L. Beberg, D. L. Ensign, C. M. Bruns, and V. S. Pande",
 +      "Accelerating Molecular Dynamic Simulation on Graphics Processing Units",
 +      "J. Comp. Chem.",
 +      30, 2009, "864-872" },
 +    { "Engin2010",
 +      "O. Engin, A. Villa, M. Sayar and B. Hess",
 +      "Driving Forces for Adsorption of Amphiphilic Peptides to Air-Water Interface",
 +      "J. Phys. Chem. B",
 +      0, 2010, "???" },
 +    { "Fritsch12",
 +      "S. Fritsch, C. Junghans and K. Kremer",
 +      "Adaptive molecular simulation study on structure formation of toluene around C60 using Gromacs",
 +      "J. Chem. Theo. Comp.",
 +      0, 2012, "doi:10.1021/ct200706f" },
 +    { "Junghans10",
 +      "C. Junghans and S. Poblete",
 +      "A reference implementation of the adaptive resolution scheme in ESPResSo",
 +      "Comp. Phys. Comm.",
 +      181, 2010, "1449" },
 +    { "Wang2010",
 +      "H. Wang, F. Dommert, C.Holm",
 +      "Optimizing working parameters of the smooth particle mesh Ewald algorithm in terms of accuracy and efficiency",
 +      "J. Chem. Phys. B",
 +      133, 2010, "034117" },
 +    { "Kutzner2011",
 +      "C. Kutzner and J. Czub and H. Grubmuller",
 +      "Keep it Flexible: Driving Macromolecular Rotary Motions in Atomistic Simulations with GROMACS",
 +      "J. Chem. Theory Comput.",
 +      7, 2011, "1381-1393" },
 +    { "Hoefling2011",
 +      "M. Hoefling, N. Lima, D. Haenni, C.A.M. Seidel, B. Schuler, H. Grubmuller",
 +      "Structural Heterogeneity and Quantitative FRET Efficiency Distributions of Polyprolines through a Hybrid Atomistic Simulation and Monte Carlo Approach",
 +      "PLoS ONE",
++      6, 2011, "e19791" },
++    { "Hockney1988",
++      "R. W. Hockney and J. W. Eastwood",
++      "Computer simulation using particles",
++      "IOP, Bristol",
++      1, 1988, "1" },
++    { "Ballenegger2012",
++      "V. Ballenegger, J.J. Cerda, and C. Holm",
++      "How to Convert SPME to P3M: Influence Functions and Error Estimates",
++      "J. Chem. Theory Comput.",
++      8, 2012, "936-947" },
++    { "Garmay2012",
++      "Garmay Yu, Shvetsov A, Karelov D, Lebedev D, Radulescu A, Petukhov M, Isaev-Ivanov V",
++      "Correlated motion of protein subdomains and large-scale conformational flexibility of RecA protein filament",
++      "Journal of Physics: Conference Series",
++      340, 2012, "012094" }
 +  };
 +#define NSTR (int)asize(citedb)
 +  
 +  int  j,index;
 +  char *author;
 +  char *title;
 +#define LINE_WIDTH 79
 +  
 +  if (fp == NULL)
 +    return;
 +
 +  for(index=0; (index<NSTR) && (strcmp(citedb[index].key,key) != 0); index++)
 +    ;
 +  
 +  fprintf(fp,"\n++++ PLEASE READ AND CITE THE FOLLOWING REFERENCE ++++\n");
 +  if (index < NSTR) {
 +    /* Insert newlines */
 +    author = wrap_lines(citedb[index].author,LINE_WIDTH,0,FALSE);
 +    title  = wrap_lines(citedb[index].title,LINE_WIDTH,0,FALSE);
 +    fprintf(fp,"%s\n%s\n%s %d (%d) pp. %s\n",
 +          author,title,citedb[index].journal,
 +          citedb[index].volume,citedb[index].year,
 +          citedb[index].pages);
 +    sfree(author);
 +    sfree(title);
 +  }
 +  else {
 +    fprintf(fp,"Entry %s not found in citation database\n",key);
 +  }
 +  fprintf(fp,"-------- -------- --- Thank You --- -------- --------\n\n");
 +  fflush(fp);
 +}
 +
 +#ifdef USE_VERSION_H
 +/* Version information generated at compile time. */
 +#include "version.h"
 +#else
 +/* Fall back to statically defined version. */
 +static const char _gmx_ver_string[]="VERSION " VERSION;
 +#endif
 +
 +/* This routine only returns a static (constant) string, so we use a 
 + * mutex to initialize it. Since the string is only written to the
 + * first time, there is no risk with multiple calls overwriting the
 + * output for each other.
 + */
 +const char *GromacsVersion()
 +{
 +  return _gmx_ver_string;
 +}
 +
 +
 +void gmx_print_version_info(FILE *fp)
 +{
 +    fprintf(fp, "Version:          %s\n", _gmx_ver_string);
 +#ifdef USE_VERSION_H
 +    fprintf(fp, "GIT SHA1 hash:    %s\n", _gmx_full_git_hash);
 +    /* Only print out the branch information if present.
 +     * The generating script checks whether the branch point actually
 +     * coincides with the hash reported above, and produces an empty string
 +     * in such cases. */
 +    if (_gmx_central_base_hash[0] != 0)
 +    {
 +        fprintf(fp, "Branched from:    %s\n", _gmx_central_base_hash);
 +    }
 +#endif
 +
 +#ifdef GMX_DOUBLE
 +    fprintf(fp, "Precision:        double\n");
 +#else
 +    fprintf(fp, "Precision:        single\n");
 +#endif
 +
 +#ifdef GMX_THREAD_MPI
 +    fprintf(fp, "Parallellization: thread_mpi\n");
 +#elif defined(GMX_MPI)
 +    fprintf(fp, "Parallellization: MPI\n");
 +#else
 +    fprintf(fp, "Parallellization: none\n");
 +#endif
 +
 +#ifdef GMX_FFT_FFTPACK
 +    fprintf(fp, "FFT Library:      fftpack\n");
 +#elif defined(GMX_FFT_FFTW3)
 +    fprintf(fp, "FFT Library:      fftw3\n");
 +#elif defined(GMX_FFT_MKL)
 +    fprintf(fp, "FFT Library:      MKL\n");
 +#else
 +    fprintf(fp, "FFT Library:      unknown\n");
 +#endif
 +
 +}
Simple merge
Simple merge
Simple merge
index 4d899379123c5d7f16f7ce58012cda555c9c76ca,0000000000000000000000000000000000000000..cfee70125aadaa1f518b3b353978d06273842dd1
mode 100644,000000..100644
--- /dev/null
@@@ -1,1154 -1,0 +1,1154 @@@
-       while ((i=sscanf(pt,"%s",str)) == 1) {
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROningen Mixture of Alchemy and Childrens' Stories
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <ctype.h>
 +#include <string.h>
 +#include "sysstuff.h"
 +#include "strdb.h"
 +#include "futil.h"
 +#include "macros.h"
 +#include "names.h"
 +#include "string2.h"
 +#include "statutil.h"
 +#include "confio.h"
 +#include "main.h"
 +#include "copyrite.h"
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "invblock.h"
 +#include "macros.h"
 +#include "index.h"
 +#include "txtdump.h"
 +#include "gmxfio.h"
 +
 +
 +
 +const char gmx_residuetype_undefined[]="Other";
 +
 +struct gmx_residuetype
 +{
 +    int      n; 
 +    char **  resname;
 +    char **  restype;
 +    
 +};
 +
 +
 +static gmx_bool gmx_ask_yesno(gmx_bool bASK)
 +{
 +  char c;
 +
 +  if (bASK) {
 +    do {
 +      c=toupper(fgetc(stdin));
 +    } while ((c != 'Y') && (c != 'N'));
 +
 +    return (c == 'Y');
 +  }
 +  else
 +    return FALSE;
 +}
 +
 +t_blocka *new_blocka(void)
 +{
 +  t_blocka *block;
 +
 +  snew(block,1);
 +  snew(block->index,1);
 +
 +  return block;
 +}
 +
 +void write_index(const char *outf, t_blocka *b,char **gnames)
 +{
 +  FILE *out;
 +  int  i,j,k;
 +
 +  out=gmx_fio_fopen(outf,"w");
 +  /* fprintf(out,"%5d  %5d\n",b->nr,b->nra); */
 +  for(i=0; (i<b->nr); i++) {
 +    fprintf(out,"[ %s ]\n",gnames[i]);
 +    for(k=0,j=b->index[i]; j<b->index[i+1]; j++,k++) {
 +      fprintf(out,"%4d ",b->a[j]+1);
 +      if ((k % 15) == 14)
 +      fprintf(out,"\n");
 +    }
 +    fprintf(out,"\n");
 +  }
 +  gmx_fio_fclose(out);
 +}
 +
 +void add_grp(t_blocka *b,char ***gnames,int nra,atom_id a[],const char *name)
 +{
 +  int i;
 +
 +  srenew(b->index,b->nr+2);
 +  srenew(*gnames,b->nr+1);
 +  (*gnames)[b->nr]=strdup(name);
 +
 +  srenew(b->a,b->nra+nra);
 +  for(i=0; (i<nra); i++)
 +    b->a[b->nra++]=a[i];
 +  b->nr++;
 +  b->index[b->nr]=b->nra;
 +}
 +
 +/* compare index in `a' with group in `b' at `index', 
 +   when `index'<0 it is relative to end of `b' */
 +static gmx_bool grp_cmp(t_blocka *b, int nra, atom_id a[], int index)
 +{
 +  int i;
 +  
 +  if (index < 0)
 +    index = b->nr-1+index;
 +  if (index >= b->nr)
 +    gmx_fatal(FARGS,"no such index group %d in t_blocka (nr=%d)",index,b->nr);
 +  /* compare sizes */
 +  if ( nra != b->index[index+1] - b->index[index] )
 +    return FALSE;
 +  for(i=0; i<nra; i++)
 +    if ( a[i] != b->a[b->index[index]+i] )
 +      return FALSE;
 +  return TRUE;
 +}
 +
 +static void 
 +p_status(const char **restype, int nres, const char **typenames, int ntypes)
 +{
 +    int i,j;
 +    int found;
 +    
 +    int * counter;
 +    
 +    snew(counter,ntypes);
 +    for(i=0;i<ntypes;i++)
 +    {
 +        counter[i]=0;
 +    }
 +    for(i=0;i<nres;i++)
 +    {
 +        found=0;
 +        for(j=0;j<ntypes;j++)
 +        {
 +            if(!gmx_strcasecmp(restype[i],typenames[j]))
 +            {
 +                counter[j]++;
 +            }
 +        }
 +    }
 +    
 +    for(i=0; (i<ntypes); i++) 
 +    {
 +        if (counter[i] > 0)
 +        {
 +            printf("There are: %5d %10s residues\n",counter[i],typenames[i]);
 +        }
 +    }
 +
 +    sfree(counter);
 +}
 +
 +
 +atom_id *
 +mk_aid(t_atoms *atoms,const char ** restype,const char * typestring,int *nra,gmx_bool bMatch)
 +/* Make an array of atom_ids for all atoms with residuetypes matching typestring, or the opposite if bMatch is false */
 +{
 +    atom_id *a;
 +    int     i;
 +    int     res;
 +    
 +    snew(a,atoms->nr);
 +    *nra=0;
 +    for(i=0; (i<atoms->nr); i++) 
 +    {
 +        res=!gmx_strcasecmp(restype[atoms->atom[i].resind],typestring);
 +        if(bMatch==FALSE)
 +        {
 +            res=!res;
 +        }
 +        if(res)
 +        {
 +            a[(*nra)++]=i;
 +        }
 +    }
 +  
 +    return a;
 +}
 +
 +typedef struct {
 +  char *rname;
 +  gmx_bool bNeg;
 +  char *gname;
 +} restp_t;
 +
 +static void analyse_other(const char ** restype,t_atoms *atoms,
 +                        t_blocka *gb,char ***gn,gmx_bool bASK,gmx_bool bVerb)
 +{
 +  restp_t *restp=NULL;
 +  char **attp=NULL;
 +  char *rname,*aname;
 +  atom_id *other_ndx,*aid,*aaid;
 +  int  i,j,k,l,resind,naid,naaid,natp,nrestp=0;
 +  
 +    for(i=0; (i<atoms->nres); i++)
 +    {
 +        if (gmx_strcasecmp(restype[i],"Protein") && gmx_strcasecmp(restype[i],"DNA") && gmx_strcasecmp(restype[i],"RNA") && gmx_strcasecmp(restype[i],"Water"))
 +        {
 +            break;
 +        }
 +    }
 +  if (i < atoms->nres) {
 +    /* we have others */
 +    if (bVerb)
 +      printf("Analysing residues not classified as Protein/DNA/RNA/Water and splitting into groups...\n");
 +    snew(other_ndx,atoms->nr);
 +    for(k=0; (k<atoms->nr); k++) {
 +      resind = atoms->atom[k].resind;
 +      rname = *atoms->resinfo[resind].name;
 +        if (gmx_strcasecmp(restype[resind],"Protein") && gmx_strcasecmp(restype[resind],"DNA") && 
 +            gmx_strcasecmp(restype[resind],"RNA") && gmx_strcasecmp(restype[resind],"Water")) 
 +        {
 +
 +      for(l=0; (l<nrestp); l++)
 +        if (strcmp(restp[l].rname,rname) == 0)
 +          break;
 +      if (l==nrestp) {
 +        srenew(restp,nrestp+1);
 +        restp[nrestp].rname = strdup(rname);
 +        restp[nrestp].bNeg  = FALSE;
 +        restp[nrestp].gname = strdup(rname);
 +        nrestp++;
 +        
 +    }
 +      }
 +    }
 +    for(i=0; (i<nrestp); i++) {
 +      snew(aid,atoms->nr);
 +      naid=0;
 +      for(j=0; (j<atoms->nr); j++) {
 +      rname = *atoms->resinfo[atoms->atom[j].resind].name;
 +      if ((strcmp(restp[i].rname,rname) == 0 && !restp[i].bNeg) ||
 +          (strcmp(restp[i].rname,rname) != 0 &&  restp[i].bNeg)) {
 +        aid[naid++] = j;
 +      }
 +      }
 +      add_grp(gb,gn,naid,aid,restp[i].gname);
 +      if (bASK) {
 +      printf("split %s into atoms (y/n) ? ",restp[i].gname);
 +      fflush(stdout);
 +      if (gmx_ask_yesno(bASK)) {
 +        natp=0;
 +        for(k=0; (k<naid); k++) {
 +          aname=*atoms->atomname[aid[k]];
 +          for(l=0; (l<natp); l++)
 +            if (strcmp(aname,attp[l]) == 0)
 +              break;
 +          if (l == natp) {
 +            srenew(attp,++natp);
 +            attp[natp-1]=aname;
 +          }
 +        }
 +        if (natp > 1) {
 +          for(l=0; (l<natp); l++) {
 +            snew(aaid,naid);
 +            naaid=0;
 +            for(k=0; (k<naid); k++) {
 +              aname=*atoms->atomname[aid[k]];
 +              if (strcmp(aname,attp[l])==0) 
 +                aaid[naaid++]=aid[k];
 +            }
 +            add_grp(gb,gn,naaid,aaid,attp[l]);
 +            sfree(aaid);
 +          }
 +        }
 +        sfree(attp);
 +        attp=NULL;
 +      }
 +      sfree(aid);
 +      }
 +    }
 +    sfree(other_ndx);
 +  }
 +}
 +
 +/*! /brief Instances of this struct contain the data necessary to
 + *         construct a single (protein) index group in
 + *         analyse_prot(). */
 +typedef struct gmx_help_make_index_group
 +{
 +  /** The set of atom names that will be used to form this index group */
 +  const char **defining_atomnames;
 +  /** Size of the defining_atomnames array */
 +  const int num_defining_atomnames;
 +  /** Name of this index group */
 +  const char *group_name;
 +  /** Whether the above atom names name the atoms in the group, or
 +      those not in the group */
 +  gmx_bool bTakeComplement;
 +  /** The index in wholename gives the first item in the arrays of
 +     atomnames that should be tested with 'gmx_strncasecmp' in stead of
 +     gmx_strcasecmp, or -1 if all items should be tested with strcasecmp
 +     This is comparable to using a '*' wildcard at the end of specific
 +     atom names, but that is more involved to implement...
 +   */
 +  int wholename;
 +  /** Only create this index group if it differs from the one specified in compareto,
 +     where -1 means to always create this group. */
 +  int compareto;
 +} t_gmx_help_make_index_group;
 +
 +static void analyse_prot(const char ** restype,t_atoms *atoms,
 +                       t_blocka *gb,char ***gn,gmx_bool bASK,gmx_bool bVerb)
 +{
 +  /* lists of atomnames to be used in constructing index groups: */
 +  static const char *pnoh[]    = { "H", "HN" };
 +  static const char *pnodum[]  = { "MN1",  "MN2",  "MCB1", "MCB2", "MCG1", "MCG2", 
 +                           "MCD1", "MCD2", "MCE1", "MCE2", "MNZ1", "MNZ2" };
 +  static const char *calpha[]  = { "CA" };
 +  static const char *bb[]      = { "N","CA","C" };
 +  static const char *mc[]      = { "N","CA","C","O","O1","O2","OC1","OC2","OT","OXT" };
 +  static const char *mcb[]     = { "N","CA","CB","C","O","O1","O2","OC1","OC2","OT","OXT" };
 +  static const char *mch[]     = { "N","CA","C","O","O1","O2","OC1","OC2","OT","OXT",
 +                                 "H1","H2","H3","H","HN" };
 +
 +  static const t_gmx_help_make_index_group constructing_data[] =
 +    {{ NULL,   0, "Protein",      TRUE,  -1, -1},
 +     { pnoh,   asize(pnoh),   "Protein-H",    TRUE,  0,  -1},
 +     { calpha, asize(calpha), "C-alpha",      FALSE, -1, -1},
 +     { bb,     asize(bb),     "Backbone",     FALSE, -1, -1},
 +     { mc,     asize(mc),     "MainChain",    FALSE, -1, -1},
 +     { mcb,    asize(mcb),    "MainChain+Cb", FALSE, -1, -1},
 +     { mch,    asize(mch),    "MainChain+H",  FALSE, -1, -1},
 +     { mch,    asize(mch),    "SideChain",    TRUE,  -1, -1},
 +     { mch,    asize(mch),    "SideChain-H",  TRUE,  11, -1},
 +     { pnodum, asize(pnodum), "Prot-Masses",  TRUE,  -1, 0},
 +    };
 +  const int num_index_groups = asize(constructing_data);
 +
 +  int     n,j;
 +  atom_id *aid;
 +  int     nra,nnpres,npres;
 +  gmx_bool    match;
 +  char    ndx_name[STRLEN],*atnm;
 +  int i;
 +
 +  if (bVerb)
 +  {
 +    printf("Analysing Protein...\n");
 +  }
 +  snew(aid,atoms->nr);
 +
 +  /* calculate the number of protein residues */
 +  npres=0;
 +  for(i=0; (i<atoms->nres); i++) {
 +    if (0 == gmx_strcasecmp(restype[i],"Protein")) {
 +      npres++;
 +    }
 +  }
 +  /* find matching or complement atoms */
 +  for(i=0; (i<(int)num_index_groups); i++) {
 +    nra=0;
 +    for(n=0; (n<atoms->nr); n++) {
 +      if (0 == gmx_strcasecmp(restype[atoms->atom[n].resind],"Protein")) {
 +      match=FALSE;
 +      for(j=0; (j<constructing_data[i].num_defining_atomnames); j++) {
 +        /* skip digits at beginning of atomname, e.g. 1H */
 +        atnm=*atoms->atomname[n];
 +        while (isdigit(atnm[0])) {
 +          atnm++;
 +        }
 +        if ( (constructing_data[i].wholename==-1) || (j<constructing_data[i].wholename) ) {
 +          if (0 == gmx_strcasecmp(constructing_data[i].defining_atomnames[j],atnm)) {
 +            match=TRUE;
 +          }
 +        } else {
 +          if (0 == gmx_strncasecmp(constructing_data[i].defining_atomnames[j],atnm,strlen(constructing_data[i].defining_atomnames[j]))) {
 +            match=TRUE;
 +          }
 +        }
 +      }
 +      if (constructing_data[i].bTakeComplement != match) {
 +        aid[nra++]=n;
 +      }
 +      }
 +    }
 +    /* if we want to add this group always or it differs from previous 
 +       group, add it: */
 +    if ( -1 == constructing_data[i].compareto || !grp_cmp(gb,nra,aid,constructing_data[i].compareto-i) ) {
 +      add_grp(gb,gn,nra,aid,constructing_data[i].group_name);
 +    }
 +  }
 +  
 +  if (bASK) {
 +    for(i=0; (i<(int)num_index_groups); i++) {
 +      printf("Split %12s into %5d residues (y/n) ? ",constructing_data[i].group_name,npres);
 +      if (gmx_ask_yesno(bASK)) {
 +      int resind;
 +      nra = 0;
 +      for(n=0;((atoms->atom[n].resind < npres) && (n<atoms->nr));) {
 +        resind = atoms->atom[n].resind;
 +        for(;((atoms->atom[n].resind==resind) && (n<atoms->nr));n++) {
 +          match=FALSE;
 +          for(j=0;(j<constructing_data[i].num_defining_atomnames); j++) {
 +            if (0 == gmx_strcasecmp(constructing_data[i].defining_atomnames[j],*atoms->atomname[n])) {
 +              match=TRUE;
 +            }
 +          }
 +          if (constructing_data[i].bTakeComplement != match) {
 +            aid[nra++]=n;
 +          }
 +        }
 +        /* copy the residuename to the tail of the groupname */
 +        if (nra > 0) {
 +          t_resinfo *ri;
 +          ri = &atoms->resinfo[resind];
 +          sprintf(ndx_name,"%s_%s%d%c",
 +                  constructing_data[i].group_name,*ri->name,ri->nr,ri->ic==' ' ? '\0' : ri->ic);
 +          add_grp(gb,gn,nra,aid,ndx_name);
 +          nra = 0;
 +        }
 +      }
 +      } 
 +    }
 +    printf("Make group with sidechain and C=O swapped (y/n) ? ");
 +    if (gmx_ask_yesno(bASK)) {
 +      /* Make swap sidechain C=O index */
 +      int resind,hold;
 +      nra = 0;
 +      for(n=0;((atoms->atom[n].resind < npres) && (n<atoms->nr));) {
 +      resind = atoms->atom[n].resind;
 +      hold  = -1;
 +      for(;((atoms->atom[n].resind==resind) && (n<atoms->nr));n++)
 +        if (strcmp("CA",*atoms->atomname[n]) == 0) {
 +          aid[nra++]=n;
 +          hold=nra;
 +          nra+=2;
 +        } else if (strcmp("C",*atoms->atomname[n]) == 0) {
 +          if (hold == -1) {
 +            gmx_incons("Atom naming problem");
 +          }
 +          aid[hold]=n;
 +        } else if (strcmp("O",*atoms->atomname[n]) == 0) {
 +          if (hold == -1) {
 +            gmx_incons("Atom naming problem");
 +          }
 +          aid[hold+1]=n;
 +        } else if (strcmp("O1",*atoms->atomname[n]) == 0) {
 +          if (hold == -1) {
 +            gmx_incons("Atom naming problem");
 +          }
 +          aid[hold+1]=n;
 +        } else 
 +          aid[nra++]=n;
 +      }
 +      /* copy the residuename to the tail of the groupname */
 +      if (nra > 0) {
 +      add_grp(gb,gn,nra,aid,"SwapSC-CO");
 +      nra = 0;
 +      } 
 +    }
 +  }
 +  sfree(aid);
 +}
 +
 +
 +
 +
 +/* Return 0 if the name was found, otherwise -1.
 + * p_restype is set to a pointer to the type name, or 'Other' if we did not find it.
 + */
 +int
 +gmx_residuetype_get_type(gmx_residuetype_t rt,const char * resname, const char ** p_restype)
 +{
 +    int    i,rc;
 +    
 +    rc=-1;
 +    for(i=0;i<rt->n && rc;i++)
 +    {
 +        rc=gmx_strcasecmp(rt->resname[i],resname);
 +    }
 +    
 +    *p_restype = (rc==0) ? rt->restype[i-1] : gmx_residuetype_undefined;
 +    
 +    return rc;
 +}
 +
 +int
 +gmx_residuetype_add(gmx_residuetype_t rt,const char *newresname, const char *newrestype)
 +{
 +    int     i;
 +    int     found;
 +    const char *  p_oldtype;
 +    
 +    found = !gmx_residuetype_get_type(rt,newresname,&p_oldtype);
 +    
 +    if(found && gmx_strcasecmp(p_oldtype,newrestype))
 +    {
 +        fprintf(stderr,"Warning: Residue '%s' already present with type '%s' in database, ignoring new type '%s'.",
 +                newresname,p_oldtype,newrestype);
 +    }
 +    
 +    if(found==0)
 +    {
 +        srenew(rt->resname,rt->n+1);
 +        srenew(rt->restype,rt->n+1);
 +        rt->resname[rt->n]=strdup(newresname);
 +        rt->restype[rt->n]=strdup(newrestype);
 +        rt->n++;
 +    }
 +  
 +    return 0;
 +}
 +
 +
 +int
 +gmx_residuetype_init(gmx_residuetype_t *prt)
 +{
 +    FILE *  db;
 +    char    line[STRLEN];
 +    char    resname[STRLEN],restype[STRLEN],dum[STRLEN];
 +    char *  p;
 +    int     i;
 +    struct gmx_residuetype *rt;
 +    
 +    snew(rt,1);
 +    *prt=rt;
 +    
 +    rt->n        = 0;
 +    rt->resname  = NULL;
 +    rt->restype = NULL;
 +    
 +    db=libopen("residuetypes.dat");
 +    
 +    while(get_a_line(db,line,STRLEN)) 
 +    {
 +        strip_comment(line);
 +        trim(line);
 +        if(line[0]!='\0')
 +        {
 +            if(sscanf(line,"%s %s %s",resname,restype,dum)!=2)
 +            {
 +                gmx_fatal(FARGS,"Incorrect number of columns (2 expected) for line in residuetypes.dat");
 +            }
 +            gmx_residuetype_add(rt,resname,restype);
 +        }
 +    }
 +    
 +    fclose(db);
 +    
 +    return 0;
 +}
 +
 +
 +
 +int
 +gmx_residuetype_destroy(gmx_residuetype_t rt)
 +{
 +    int i;
 +    
 +    for(i=0;i<rt->n;i++)
 +    {
 +        free(rt->resname[i]);
 +        free(rt->restype[i]);
 +    }
 +    free(rt);
 +    
 +    return 0;
 +}
 +
 +int
 +gmx_residuetype_get_alltypes(gmx_residuetype_t    rt,
 +                             const char ***       p_typenames,
 +                             int *                ntypes)
 +{
 +    int      i,j,n;
 +    int      found;
 +    const char **  my_typename;
 +    char *   p;
 +    
 +    n=0;
 +    
 +    my_typename=NULL;
 +    for(i=0;i<rt->n;i++)
 +    {
 +        p=rt->restype[i];
 +        found=0;
 +        for(j=0;j<n && !found;j++)
 +        {
 +            found=!gmx_strcasecmp(p,my_typename[j]);
 +        }
 +        
 +        if(!found)
 +        {
 +            srenew(my_typename,n+1);
 +            my_typename[n]=p;
 +            n++;
 +        }
 +    }
 +    *ntypes=n;
 +    *p_typenames=my_typename; 
 +    
 +    return 0;
 +}
 +    
 +
 +
 +gmx_bool 
 +gmx_residuetype_is_protein(gmx_residuetype_t rt, const char *resnm)
 +{
 +    gmx_bool rc;
 +    const char *p_type;
 +    
 +    if(gmx_residuetype_get_type(rt,resnm,&p_type)==0 &&
 +       gmx_strcasecmp(p_type,"Protein")==0)
 +    {
 +        rc=TRUE;
 +    }
 +    else
 +    {
 +        rc=FALSE;
 +    }
 +    return rc;
 +}
 +
 +gmx_bool 
 +gmx_residuetype_is_dna(gmx_residuetype_t rt, const char *resnm)
 +{
 +    gmx_bool rc;
 +    const char *p_type;
 +
 +    if(gmx_residuetype_get_type(rt,resnm,&p_type)==0 &&
 +       gmx_strcasecmp(p_type,"DNA")==0)
 +    {
 +        rc=TRUE;
 +    }
 +    else
 +    {
 +        rc=FALSE;
 +    }
 +    return rc;
 +}
 +
 +gmx_bool 
 +gmx_residuetype_is_rna(gmx_residuetype_t rt, const char *resnm)
 +{
 +    gmx_bool rc;
 +    const char *p_type;
 +
 +    if(gmx_residuetype_get_type(rt,resnm,&p_type)==0 &&
 +       gmx_strcasecmp(p_type,"RNA")==0)
 +    {
 +        rc=TRUE;
 +    }
 +    else
 +    {
 +        rc=FALSE;
 +    }
 +    return rc;
 +}
 +
 +/* Return the size of the arrays */
 +int
 +gmx_residuetype_get_size(gmx_residuetype_t rt)
 +{
 +    return rt->n;
 +}
 +
 +/* Search for a residuetype with name resnm within the
 + * gmx_residuetype database. Return the index if found,
 + * otherwise -1.
 + */
 +int
 +gmx_residuetype_get_index(gmx_residuetype_t rt, const char *resnm)
 +{
 +    int i,rc;
 +
 +    rc=-1;
 +    for(i=0;i<rt->n && rc;i++)
 +    {
 +        rc=gmx_strcasecmp(rt->resname[i],resnm);
 +    }
 +
 +    return (0 == rc) ? i-1 : -1;
 +}
 +
 +/* Return the name of the residuetype with the given index, or
 + * NULL if not found. */
 +const char *
 +gmx_residuetype_get_name(gmx_residuetype_t rt, int index)
 +{
 +  if(index >= 0 && index < rt->n) {
 +    return rt->resname[index];
 +  } else {
 +    return NULL;
 +  }
 +}
 +
 +
 +
 +void analyse(t_atoms *atoms,t_blocka *gb,char ***gn,gmx_bool bASK,gmx_bool bVerb)
 +{
 +    gmx_residuetype_t rt;
 +    char    *resnm;
 +    atom_id *aid;
 +    const char **  restype;
 +    int     nra;
 +    int     i,k;
 +    size_t  j;
 +    int     ntypes;
 +    char *  p;
 +    const char ** p_typename;
 +    int     iwater,iion;
 +    int     nwater,nion;
 +    int     found;
 +    
 +    if (bVerb)
 +    {
 +        printf("Analysing residue names:\n");
 +    }
 +    /* Create system group, every single atom */
 +    snew(aid,atoms->nr);
 +    for(i=0;i<atoms->nr;i++)
 +    {
 +        aid[i]=i;
 +    }
 +    add_grp(gb,gn,atoms->nr,aid,"System"); 
 +    sfree(aid);
 +
 +    /* For every residue, get a pointer to the residue type name */
 +    gmx_residuetype_init(&rt);
 +
 +    snew(restype,atoms->nres);
 +    ntypes = 0;
 +    p_typename = NULL;
 +    for(i=0;i<atoms->nres;i++)
 +    {
 +        resnm = *atoms->resinfo[i].name;
 +        gmx_residuetype_get_type(rt,resnm,&(restype[i]));
 +
 +        /* Note that this does not lead to a N*N loop, but N*K, where
 +         * K is the number of residue _types_, which is small and independent of N.
 +         */
 +        found = 0;
 +        for(k=0;k<ntypes && !found;k++)
 +        {
 +            found = !strcmp(restype[i],p_typename[k]);
 +        }
 +        if(!found)
 +        {
 +            srenew(p_typename,ntypes+1);
 +            p_typename[ntypes++] = strdup(restype[i]);
 +        }
 +    }    
 +    
 +    if (bVerb)
 +    {
 +        p_status(restype,atoms->nres,p_typename,ntypes);
 +    }
 +
 +    for(k=0;k<ntypes;k++)
 +    {              
 +        aid=mk_aid(atoms,restype,p_typename[k],&nra,TRUE);
 +
 +        /* Check for special types to do fancy stuff with */
 +        
 +        if(!gmx_strcasecmp(p_typename[k],"Protein") && nra>0)
 +        {
 +            sfree(aid);
 +            /* PROTEIN */
 +            analyse_prot(restype,atoms,gb,gn,bASK,bVerb);
 +            
 +            /* Create a Non-Protein group */
 +            aid=mk_aid(atoms,restype,"Protein",&nra,FALSE);
 +            if ((nra > 0) && (nra < atoms->nr))
 +            {
 +                add_grp(gb,gn,nra,aid,"non-Protein"); 
 +            }
 +            sfree(aid);
 +        }
 +        else if(!gmx_strcasecmp(p_typename[k],"Water") && nra>0)
 +        {
 +            add_grp(gb,gn,nra,aid,p_typename[k]); 
 +            /* Add this group as 'SOL' too, for backward compatibility with older gromacs versions */
 +            add_grp(gb,gn,nra,aid,"SOL"); 
 +
 +            sfree(aid);
 +
 +            /* Solvent, create a negated group too */
 +            aid=mk_aid(atoms,restype,"Water",&nra,FALSE);
 +            if ((nra > 0) && (nra < atoms->nr))
 +            {
 +                add_grp(gb,gn,nra,aid,"non-Water"); 
 +            }
 +            sfree(aid);
 +        }
 +        else if(nra>0)
 +        {
 +            /* Other groups */
 +            add_grp(gb,gn,nra,aid,p_typename[k]); 
 +            sfree(aid);
 +            analyse_other(restype,atoms,gb,gn,bASK,bVerb);
 +        }
 +    }
 +    
 +    sfree(p_typename);
 +    sfree(restype);
 +    gmx_residuetype_destroy(rt);      
 +    
 +    /* Create a merged water_and_ions group */
 +    iwater = -1;
 +    iion   = -1;
 +    nwater = 0;
 +    nion   = 0;
 +        
 +    for(i=0;i<gb->nr;i++)
 +    {        
 +        if(!gmx_strcasecmp((*gn)[i],"Water"))
 +        {
 +            iwater = i;
 +            nwater = gb->index[i+1]-gb->index[i];
 +        }
 +        else if(!gmx_strcasecmp((*gn)[i],"Ion"))
 +        {
 +            iion = i;
 +            nion = gb->index[i+1]-gb->index[i];
 +        }
 +    }
 +    
 +    if(nwater>0 && nion>0)
 +    {
 +        srenew(gb->index,gb->nr+2);
 +        srenew(*gn,gb->nr+1);
 +        (*gn)[gb->nr] = strdup("Water_and_ions");
 +        srenew(gb->a,gb->nra+nwater+nion);
 +        if(nwater>0)
 +        {
 +            for(i=gb->index[iwater];i<gb->index[iwater+1];i++)
 +            {
 +                gb->a[gb->nra++] = gb->a[i];
 +            }
 +        }
 +        if(nion>0)
 +        {
 +            for(i=gb->index[iion];i<gb->index[iion+1];i++)
 +            {
 +                gb->a[gb->nra++] = gb->a[i];
 +            }
 +        }
 +        gb->nr++;
 +        gb->index[gb->nr]=gb->nra;
 +    }
 +}
 +
 +
 +void check_index(char *gname,int n,atom_id index[],char *traj,int natoms)
 +{
 +  int i;
 +  
 +  for(i=0; i<n; i++)
 +    if (index[i] >= natoms)
 +      gmx_fatal(FARGS,"%s atom number (index[%d]=%d) is larger than the number of atoms in %s (%d)",
 +                gname ? gname : "Index",i+1, index[i]+1,
 +                traj ? traj : "the trajectory",natoms);
 +    else if (index[i] < 0)
 +      gmx_fatal(FARGS,"%s atom number (index[%d]=%d) is less than zero",
 +              gname ? gname : "Index",i+1, index[i]+1);
 +}
 +
 +t_blocka *init_index(const char *gfile, char ***grpname)
 +{
 +  FILE     *in;
 +  t_blocka  *b;
 +  int      a,maxentries;
 +  int      i,j,ng,nread;
 +  char     line[STRLEN],*pt,str[STRLEN];
 +
 +  in=gmx_fio_fopen(gfile,"r");
 +  snew(b,1);
 +  get_a_line(in,line,STRLEN);
 +  if ( line[0]=='[' ) {
 +    /* new format */
 +    b->nr=0;
 +    b->index=NULL;
 +    b->nra=0;
 +    b->a=NULL;
 +    *grpname=NULL;
 +    maxentries=0;
 +    do {
 +      if (get_header(line,str)) {
 +      b->nr++;
 +      srenew(b->index,b->nr+1);
 +      srenew(*grpname,b->nr);
 +      if (b->nr==1)
 +        b->index[0]=0;
 +      b->index[b->nr]=b->index[b->nr-1];
 +      (*grpname)[b->nr-1]=strdup(str);
 +      } else {
 +      pt=line;
++      while (sscanf(pt,"%s",str) == 1) {
 +        i=b->index[b->nr];
 +        if (i>=maxentries) {
 +          maxentries+=1024;
 +          srenew(b->a,maxentries);
 +        }
 +        b->a[i]=strtol(str, NULL, 10)-1;
 +        b->index[b->nr]++;
 +        (b->nra)++;
 +        pt=strstr(pt,str)+strlen(str);
 +      }
 +      }
 +    } while (get_a_line(in,line,STRLEN));
 +  } 
 +  else {
 +    /* old format */
 +    sscanf(line,"%d%d",&b->nr,&b->nra);
 +    snew(b->index,b->nr+1);
 +    snew(*grpname,b->nr);
 +    b->index[0]=0;
 +    snew(b->a,b->nra);
 +    for (i=0; (i<b->nr); i++) {
 +      nread=fscanf(in,"%s%d",str,&ng);
 +      (*grpname)[i]=strdup(str);
 +      b->index[i+1]=b->index[i]+ng;
 +      if (b->index[i+1] > b->nra)
 +      gmx_fatal(FARGS,"Something wrong in your indexfile at group %s",str);
 +      for(j=0; (j<ng); j++) {
 +      nread=fscanf(in,"%d",&a);
 +      b->a[b->index[i]+j]=a;
 +      }
 +    }
 +  }
 +  gmx_fio_fclose(in);
 +
 +  for(i=0; (i<b->nr); i++) {
 +    for(j=b->index[i]; (j<b->index[i+1]); j++) {
 +      if (b->a[j] < 0) 
 +      fprintf(stderr,"\nWARNING: negative index %d in group %s\n\n",
 +              b->a[j],(*grpname)[i]);
 +    }
 +  }
 +  
 +  return b;
 +}
 +
 +static void minstring(char *str)
 +{
 +  int i;
 +
 +  for (i=0; (i < (int)strlen(str)); i++) 
 +    if (str[i]=='-')
 +      str[i]='_';
 +}
 +
 +int find_group(char s[], int ngrps, char **grpname)
 +{
 +  int aa, i, n;
 +  char string[STRLEN];
 +  gmx_bool bMultiple;
 +  
 +  bMultiple = FALSE;
 +  n = strlen(s);
 +  aa=NOTSET;
 +  /* first look for whole name match */
 +  if (aa==NOTSET)
 +    for(i=0; i<ngrps; i++)
 +      if (gmx_strcasecmp_min(s,grpname[i])==0) {
 +      if(aa!=NOTSET)
 +        bMultiple = TRUE;
 +      aa=i;
 +      }
 +  /* second look for first string match */
 +  if (aa==NOTSET)
 +    for(i=0; i<ngrps; i++)
 +      if (gmx_strncasecmp_min(s,grpname[i],n)==0) {
 +      if(aa!=NOTSET)
 +        bMultiple = TRUE;
 +      aa=i;
 +      }
 +  /* last look for arbitrary substring match */
 +  if (aa==NOTSET) {
 +    upstring(s);
 +    minstring(s);
 +    for(i=0; i<ngrps; i++) {
 +      strcpy(string, grpname[i]);
 +      upstring(string);
 +      minstring(string);
 +      if (strstr(string,s)!=NULL) {
 +      if(aa!=NOTSET)
 +        bMultiple = TRUE;
 +      aa=i;
 +      }
 +    }
 +  }
 +  if (bMultiple) {
 +    printf("Error: Multiple groups '%s' selected\n", s);
 +    aa=NOTSET;
 +  }
 +  return aa;
 +}
 +
 +static int qgroup(int *a, int ngrps, char **grpname)
 +{
 +    char s[STRLEN];
 +    int  aa;
 +    gmx_bool bInRange;
 +    char *end;
 +
 +    do {
 +        fprintf(stderr,"Select a group: ");
 +        do {
 +            if ( scanf("%s",s)!=1 ) 
 +                gmx_fatal(FARGS,"Cannot read from input");
 +            trim(s); /* remove spaces */
 +        } while (strlen(s)==0);
 +        aa = strtol(s, &end, 10);
 +        if (aa==0 && end[0] != '\0') /* string entered */
 +            aa = find_group(s, ngrps, grpname);
 +        bInRange = (aa >= 0 && aa < ngrps);
 +        if (!bInRange)
 +            printf("Error: No such group '%s'\n", s);
 +    } while (!bInRange);
 +    printf("Selected %d: '%s'\n", aa, grpname[aa]);
 +    *a = aa;
 +    return aa;
 +}
 +
 +static void rd_groups(t_blocka *grps,char **grpname,char *gnames[],
 +                    int ngrps,int isize[],atom_id *index[],int grpnr[])
 +{
 +  int i,j,gnr1;
 +
 +  if (grps->nr==0)
 +    gmx_fatal(FARGS,"Error: no groups in indexfile");
 +  for(i=0; (i<grps->nr); i++)
 +    fprintf(stderr,"Group %5d (%15s) has %5d elements\n",i,grpname[i],
 +         grps->index[i+1]-grps->index[i]);
 +  for(i=0; (i<ngrps); i++) {
 +    if (grps->nr > 1)
 +      do {
 +      gnr1=qgroup(&grpnr[i], grps->nr, grpname);
 +      if ((gnr1<0) || (gnr1>=grps->nr))
 +        fprintf(stderr,"Select between %d and %d.\n",0,grps->nr-1);
 +      }       while ((gnr1<0) || (gnr1>=grps->nr));
 +    else {
 +      fprintf(stderr,"There is one group in the index\n");
 +      gnr1=0;
 +    }
 +    gnames[i]=strdup(grpname[gnr1]);
 +    isize[i]=grps->index[gnr1+1]-grps->index[gnr1];
 +    snew(index[i],isize[i]);
 +    for(j=0; (j<isize[i]); j++)
 +      index[i][j]=grps->a[grps->index[gnr1]+j];
 +  }
 +}
 +
 +void rd_index(const char *statfile,int ngrps,int isize[],
 +            atom_id *index[],char *grpnames[])
 +{
 +  char    **gnames;
 +  t_blocka *grps;
 +  int     *grpnr;
 +  
 +  snew(grpnr,ngrps);
 +  if (!statfile)
 +    gmx_fatal(FARGS,"No index file specified");
 +  grps=init_index(statfile,&gnames);
 +  rd_groups(grps,gnames,grpnames,ngrps,isize,index,grpnr);
 +}
 +
 +void rd_index_nrs(char *statfile,int ngrps,int isize[],
 +                atom_id *index[],char *grpnames[],int grpnr[])
 +{
 +  char    **gnames;
 +  t_blocka *grps;
 +  
 +  if (!statfile)
 +    gmx_fatal(FARGS,"No index file specified");
 +  grps=init_index(statfile,&gnames);
 +  
 +  rd_groups(grps,gnames,grpnames,ngrps,isize,index,grpnr);
 +}
 +
 +void get_index(t_atoms *atoms, const char *fnm, int ngrps,
 +             int isize[], atom_id *index[],char *grpnames[])
 +{
 +  char    ***gnames;
 +  t_blocka *grps = NULL; 
 +  int     *grpnr;
 +  
 +  snew(grpnr,ngrps);
 +  snew(gnames,1);
 +  if (fnm != NULL) {
 +    grps=init_index(fnm,gnames);
 +  }
 +  else if (atoms) {
 +    snew(grps,1);
 +    snew(grps->index,1);
 +    analyse(atoms,grps,gnames,FALSE,FALSE);
 +  }
 +  else 
 +    gmx_incons("You need to supply a valid atoms structure or a valid index file name");
 +  
 +  rd_groups(grps,*gnames,grpnames,ngrps,isize,index,grpnr);
 +}
 +
 +t_cluster_ndx *cluster_index(FILE *fplog,const char *ndx)
 +{
 +  t_cluster_ndx *c;
 +  int i;
 +  
 +  snew(c,1);
 +  c->clust     = init_index(ndx,&c->grpname);
 +  c->maxframe = -1;
 +  for(i=0; (i<c->clust->nra); i++)
 +    c->maxframe = max(c->maxframe,c->clust->a[i]);
 +  fprintf(fplog ? fplog : stdout,
 +        "There are %d clusters containing %d structures, highest framenr is %d\n",
 +        c->clust->nr,c->clust->nra,c->maxframe);
 +  if (debug) {
 +    pr_blocka(debug,0,"clust",c->clust,TRUE);
 +    for(i=0; (i<c->clust->nra); i++)
 +      if ((c->clust->a[i] < 0) || (c->clust->a[i] > c->maxframe))
 +      gmx_fatal(FARGS,"Range check error for c->clust->a[%d] = %d\n"
 +                "should be within 0 and %d",i,c->clust->a[i],c->maxframe+1);
 +  }
 +  c->inv_clust=make_invblocka(c->clust,c->maxframe);
 +        
 +  return c;
 +}
 +
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
index f12fae5c506c8cb7f9af4dfbbf74fbda4d14f974,0000000000000000000000000000000000000000..e1e66b8c2670b64114aca4e0759789188bab8c02
mode 100644,000000..100644
--- /dev/null
@@@ -1,605 -1,0 +1,606 @@@
- #if ((defined WIN32 || defined _WIN32 || defined WIN64 || defined _WIN64) && !defined __CYGWIN__ && !defined __CYGWIN32__)
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROningen Mixture of Alchemy and Childrens' Stories
 + */
 +/* This file is completely threadsafe - keep it that way! */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
++#include "gmx_header_config.h"
 +
 +#ifdef GMX_CRAY_XT3
 +#undef HAVE_PWD_H
 +#endif
 +
 +#include <stdio.h>
 +#include <ctype.h>
 +#include <stdlib.h>
 +#include <errno.h>
 +#include <sys/types.h>
 +#include <time.h>
 +
 +#ifdef HAVE_SYS_TIME_H
 +#include <sys/time.h>
 +#endif
 +
 +
 +#ifdef HAVE_PWD_H
 +#include <pwd.h>
 +#endif
 +#include <time.h>
 +
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "gmx_fatal.h"
 +#include "macros.h"
 +#include "string2.h"
 +#include "futil.h"
 +
 +int continuing(char *s)
 +/* strip trailing spaces and if s ends with a CONTINUE remove that too.
 + * returns TRUE if s ends with a CONTINUE, FALSE otherwise.
 + */
 +{
 +  int sl;
 +
 +  rtrim(s);
 +  sl = strlen(s);
 +  if ((sl > 0) && (s[sl-1] == CONTINUE)) {
 +    s[sl-1] = 0;
 +    return TRUE;
 +  }
 +  else
 +    return FALSE;
 +}
 +
 +
 +
 +char *fgets2(char *line, int n, FILE *stream)
 +/* This routine reads a string from stream of max length n
 + * and zero terminated, without newlines
 + * line should be long enough (>= n)
 + */
 +{
 +  char *c;
 +  if (fgets(line,n,stream) == NULL) {
 +    return NULL;
 +  }
 +  if ((c=strchr(line,'\n')) != NULL) {
 +    *c = '\0';
 +  } else {
 +    /* A line not ending in a newline can only occur at the end of a file,
 +     * or because of n being too small.
 +     * Since both cases occur very infrequently, we can check for EOF.
 +     */
 +    if (!gmx_eof(stream)) {
 +      gmx_fatal(FARGS,"An input file contains a line longer than %d characters, while the buffer passed to fgets2 has size %d. The line starts with: '%20.20s'",n,n,line);
 +    }
 +  }
 +  if ((c=strchr(line,'\r')) != NULL) {
 +    *c = '\0';
 +  }
 +
 +  return line;
 +}
 +
 +void strip_comment (char *line)
 +{
 +  char *c;
 +
 +  if (!line)
 +    return;
 +
 +  /* search for a comment mark and replace it by a zero */
 +  if ((c = strchr(line,COMMENTSIGN)) != NULL) 
 +    (*c) = 0;
 +}
 +
 +void upstring (char *str)
 +{
 +  int i;
 +
 +  for (i=0; (i < (int)strlen(str)); i++) 
 +    str[i] = toupper(str[i]);
 +}
 +
 +void ltrim (char *str)
 +{
 +  char *tr;
 +  int i,c;
 +
 +  if (NULL == str)
 +    return;
 +
 +  c = 0;
 +  while (('\0' != str[c]) && isspace(str[c]))
 +    c++;
 +  if (c > 0) 
 +    {
 +      for(i=c; ('\0' != str[i]); i++)
 +      str[i-c] = str[i];
 +      str[i-c] = '\0';
 +    }
 +}
 +
 +void rtrim (char *str)
 +{
 +  int nul;
 +
 +  if (NULL == str)
 +    return;
 +
 +  nul = strlen(str)-1;
 +  while ((nul > 0) && ((str[nul] == ' ') || (str[nul] == '\t')) ) {
 +    str[nul] = '\0';
 +    nul--;
 +  }
 +}
 +
 +void trim (char *str)
 +{
 +  ltrim (str);
 +  rtrim (str);
 +}
 +
 +char *
 +gmx_ctime_r(const time_t *clock,char *buf, int n)
 +{
 +    char tmpbuf[STRLEN];
 +  
++#ifdef GMX_NATIVE_WINDOWS
 +    /* Windows */
 +    ctime_s( tmpbuf, STRLEN, clock );
 +#elif (defined(__sun))
 +    /*Solaris*/
 +    ctime_r(clock, tmpbuf, n);
 +#else
 +    ctime_r(clock,tmpbuf);
 +#endif
 +    strncpy(buf,tmpbuf,n-1);
 +    buf[n-1]='\0';
 +    
 +    return buf;
 +}
 +          
 +void nice_header (FILE *out,const char *fn)
 +{
 +  const char *unk = "onbekend";
 +  time_t clock;
 +  const char *user=unk;
 +  int    gh;
 +  uid_t  uid;
 +  char   buf[256]="";
 +  char   timebuf[STRLEN];
 +#ifdef HAVE_PWD_H
 +  struct passwd *pw;
 +#endif
 +
 +  /* Print a nice header above the file */
 +  time(&clock);
 +  fprintf (out,"%c\n",COMMENTSIGN);
 +  fprintf (out,"%c\tFile '%s' was generated\n",COMMENTSIGN,fn ? fn : unk);
 +  
 +#ifdef HAVE_PWD_H
 +  uid = getuid();
 +  pw  = getpwuid(uid);
 +  gh  = gethostname(buf,255);
 +  user= pw->pw_name;
 +#else
 +  uid = 0;
 +  gh  = -1;
 +#endif
 +  
 +  gmx_ctime_r(&clock,timebuf,STRLEN);
 +  fprintf (out,"%c\tBy user: %s (%d)\n",COMMENTSIGN,
 +         user ? user : unk,(int) uid);
 +  fprintf(out,"%c\tOn host: %s\n",COMMENTSIGN,(gh == 0) ? buf : unk);
 +
 +  fprintf (out,"%c\tAt date: %s",COMMENTSIGN,timebuf);
 +  fprintf (out,"%c\n",COMMENTSIGN);
 +}
 +
 +int gmx_strcasecmp_min(const char *str1, const char *str2)
 +{
 +  char ch1,ch2;
 +  
 +  do
 +    {
 +      do
 +      ch1=toupper(*(str1++));
 +      while ((ch1=='-') || (ch1=='_'));
 +      do 
 +      ch2=toupper(*(str2++));
 +      while ((ch2=='-') || (ch2=='_'));
 +      if (ch1!=ch2) return (ch1-ch2);
 +    }
 +  while (ch1);
 +  return 0; 
 +}
 +
 +int gmx_strncasecmp_min(const char *str1, const char *str2, int n)
 +{
 +  char ch1,ch2;
 +  char *stri1, *stri2;
 +
 +  stri1=(char *)str1;
 +  stri2=(char *)str2;  
 +  do
 +    {
 +      do
 +      ch1=toupper(*(str1++));
 +      while ((ch1=='-') || (ch1=='_'));
 +      do 
 +      ch2=toupper(*(str2++));
 +      while ((ch2=='-') || (ch2=='_'));
 +      if (ch1!=ch2) return (ch1-ch2);
 +    }
 +  while (ch1 && (str1-stri1<n) && (str2-stri2<n));
 +  return 0; 
 +}
 +
 +int gmx_strcasecmp(const char *str1, const char *str2)
 +{
 +  char ch1,ch2;
 +  
 +  do
 +    {
 +      ch1=toupper(*(str1++));
 +      ch2=toupper(*(str2++));
 +      if (ch1!=ch2) return (ch1-ch2);
 +    }
 +  while (ch1);
 +  return 0; 
 +}
 +
 +int gmx_strncasecmp(const char *str1, const char *str2, int n)
 +{
 +  char ch1,ch2;
 + 
 +  if(n==0) 
 +    return 0;
 +
 +  do
 +    {
 +      ch1=toupper(*(str1++));
 +      ch2=toupper(*(str2++));
 +      if (ch1!=ch2) return (ch1-ch2);
 +      n--;
 +    }
 +  while (ch1 && n);
 +  return 0; 
 +}
 +
 +char *gmx_strdup(const char *src)
 +{
 +  char *dest;
 +
 +  snew(dest,strlen(src)+1);
 +  strcpy(dest,src);
 +  
 +  return dest;
 +}
 +
 +char *
 +gmx_strndup(const char *src, int n)
 +{
 +    int   len;
 +    char *dest;
 +
 +    len = strlen(src);
 +    if (len > n) 
 +    {
 +        len = n;
 +    }
 +    snew(dest, len+1);
 +    strncpy(dest, src, len);
 +    dest[len] = 0;
 +    return dest;
 +}
 +
 +/*!
 + * \param[in] pattern  Pattern to match against.
 + * \param[in] str      String to match.
 + * \returns   0 on match, GMX_NO_WCMATCH if there is no match.
 + *
 + * Matches \p str against \p pattern, which may contain * and ? wildcards.
 + * All other characters are matched literally.
 + * Currently, it is not possible to match literal * or ?.
 + */
 +int
 +gmx_wcmatch(const char *pattern, const char *str)
 +{
 +    while (*pattern)
 +    {
 +        if (*pattern == '*')
 +        {
 +            /* Skip multiple wildcards in a sequence */
 +            while (*pattern == '*' || *pattern == '?')
 +            {
 +                ++pattern;
 +                /* For ?, we need to check that there are characters left
 +                 * in str. */
 +                if (*pattern == '?')
 +                {
 +                    if (*str == 0)
 +                    {
 +                        return GMX_NO_WCMATCH;
 +                    }
 +                    else
 +                    {
 +                        ++str;
 +                    }
 +                }
 +            }
 +            /* If the pattern ends after the star, we have a match */
 +            if (*pattern == 0)
 +            {
 +                return 0;
 +            }
 +            /* Match the rest against each possible suffix of str */
 +            while (*str)
 +            {
 +                /* Only do the recursive call if the first character
 +                 * matches. We don't have to worry about wildcards here,
 +                 * since we have processed them above. */
 +                if (*pattern == *str)
 +                {
 +                    int rc;
 +                    /* Match the suffix, and return if a match or an error */
 +                    rc = gmx_wcmatch(pattern, str);
 +                    if (rc != GMX_NO_WCMATCH)
 +                    {
 +                        return rc;
 +                    }
 +                }
 +                ++str;
 +            }
 +            /* If no suffix of str matches, we don't have a match */
 +            return GMX_NO_WCMATCH;
 +        }
 +        else if ((*pattern == '?' && *str != 0) || *pattern == *str)
 +        {
 +            ++str;
 +        }
 +        else
 +        {
 +            return GMX_NO_WCMATCH;
 +        }
 +        ++pattern;
 +    }
 +    /* When the pattern runs out, we have a match if the string has ended. */
 +    return (*str == 0) ? 0 : GMX_NO_WCMATCH;
 +}
 +
 +char *wrap_lines(const char *buf,int line_width, int indent,gmx_bool bIndentFirst)
 +{
 +  char *b2;
 +  int i,i0,i2,j,b2len,lspace=0,l2space=0;
 +  gmx_bool bFirst,bFitsOnLine;
 +
 +  /* characters are copied from buf to b2 with possible spaces changed
 +   * into newlines and extra space added for indentation.
 +   * i indexes buf (source buffer) and i2 indexes b2 (destination buffer)
 +   * i0 points to the beginning of the current line (in buf, source)
 +   * lspace and l2space point to the last space on the current line
 +   * bFirst is set to prevent indentation of first line
 +   * bFitsOnLine says if the first space occurred before line_width, if 
 +   * that is not the case, we have a word longer than line_width which 
 +   * will also not fit on the next line, so we might as well keep it on 
 +   * the current line (where it also won't fit, but looks better)
 +   */
 +  
 +  b2=NULL;
 +  b2len=strlen(buf)+1+indent;
 +  snew(b2,b2len);
 +  i0=i2=0;
 +  if (bIndentFirst)
 +    for(i2=0; (i2<indent); i2++)
 +      b2[i2] = ' ';
 +  bFirst=TRUE;
 +  do {
 +    l2space = -1;
 +    /* find the last space before end of line */
 +    for(i=i0; ((i-i0 < line_width) || (l2space==-1)) && (buf[i]); i++) {
 +      b2[i2++] = buf[i];
 +      /* remember the position of a space */
 +      if (buf[i] == ' ') {
 +        lspace = i;
 +      l2space = i2-1;
 +      }
 +      /* if we have a newline before the line is full, reset counters */
 +      if (buf[i]=='\n' && buf[i+1]) { 
 +      i0=i+1;
 +      b2len+=indent;
 +      srenew(b2, b2len);
 +      /* add indentation after the newline */
 +      for(j=0; (j<indent); j++)
 +        b2[i2++]=' ';
 +      }
 +    }
 +    /* If we are at the last newline, copy it */
 +    if (buf[i]=='\n' && !buf[i+1]) {
 +      b2[i2++] = buf[i++];
 +    }
 +    /* if we're not at the end of the string */
 +    if (buf[i]) {
 +      /* check if one word does not fit on the line */
 +      bFitsOnLine = (i-i0 <= line_width);
 +      /* reset line counters to just after the space */
 +      i0 = lspace+1;
 +      i2 = l2space+1;
 +      /* if the words fit on the line, and we're beyond the indentation part */
 +      if ( (bFitsOnLine) && (l2space >= indent) ) {
 +      /* start a new line */
 +      b2[l2space] = '\n';
 +      /* and add indentation */
 +      if (indent) {
 +        if (bFirst) {
 +          line_width-=indent;
 +          bFirst=FALSE;
 +        }
 +        b2len+=indent;
 +        srenew(b2, b2len);
 +        for(j=0; (j<indent); j++)
 +          b2[i2++]=' ';
 +        /* no extra spaces after indent; */
 +        while(buf[i0]==' ')
 +          i0++;
 +      }
 +      }
 +    }
 +  } while (buf[i]);
 +  b2[i2] = '\0';
 +  
 +  return b2;
 +}
 +
 +char **split(char sep,const char *str)
 +{
 +  char **ptr = NULL;
 +  int  n,nn,nptr = 0;
 +  
 +  if (str == NULL)
 +    return NULL;
 +  nn = strlen(str);
 +  for(n=0; (n<nn); n++)
 +    if (str[n] == sep)
 +      nptr++;
 +  snew(ptr,nptr+2);
 +  nptr = 0;
 +  while (*str != '\0') {
 +    while ((*str != '\0') && (*str == sep))
 +      str++;
 +    if (*str != '\0') {
 +      snew(ptr[nptr],1+strlen(str));
 +      n = 0;
 +      while ((*str != '\0') && (*str != sep)) {
 +      ptr[nptr][n] = *str;
 +      str++;
 +      n++;
 +      }
 +      ptr[nptr][n] = '\0';
 +      nptr++;
 +    }
 +  }
 +  ptr[nptr] = NULL;
 +  
 +  return ptr;
 +}
 +
 +
 +gmx_large_int_t
 +str_to_large_int_t(const char *str, char **endptr)
 +{
 +      int         sign = 1;
 +      gmx_large_int_t  val  = 0;
 +      char        ch;
 +      const char  *p;
 +      
 +      p = str;
 +      if(p==NULL)
 +      {
 +              *endptr=NULL;
 +              return 0;
 +      }
 +      
 +      /* Strip off initial white space */
 +      while(isspace(*p))
 +      {
 +              p++;
 +      }
 +      /* Conform to ISO C99 - return original pointer if string does not contain a number */
 +      if(*str=='\0')
 +      {
 +              *endptr=(char *)str;
 +      }
 +      
 +      if(*p=='-')
 +      {
 +              p++;
 +              sign *= -1;
 +      }
 +      
 +      while( ((ch=*p) != '\0') && isdigit(ch) )
 +      {
 +              /* Important to add sign here, so we dont overflow in final multiplication */
 +              ch = (ch-'0')*sign; 
 +              val = val*10 + ch;
 +              if(ch != val%10) 
 +              {
 +                      /* Some sort of overflow has occured, set endptr to original string */
 +                      *endptr=(char *)str;
 +                      errno = ERANGE;
 +                      return(0);
 +              }
 +              p++;
 +      }
 +      
 +      *endptr=(char *)p;
 +      
 +      return val;
 +}
 +
 +char *gmx_strsep(char **stringp, const char *delim)
 +{
 +    char *ret;
 +    int len=strlen(delim);
 +    int i,j=0;
 +    int found=0;
 +
 +    if (! *stringp)
 +        return NULL;
 +    ret=*stringp;
 +    do
 +    {
 +        if ( (*stringp)[j] == '\0')
 +        {
 +            found=1;
 +            *stringp=NULL;
 +            break;
 +        }
 +        for (i=0;i<len;i++)
 +        {
 +            if ( (*stringp)[j]==delim[i])
 +            {
 +                (*stringp)[j]='\0';
 +                *stringp=*stringp+j+1;
 +                found=1;
 +                break;
 +            }
 +        }
 +        j++;
 +    } while (!found);
 +
 +    return ret;
 +}
 +
Simple merge
Simple merge
index e26654ca73a13530df4e1e87e31f2455dce22939,0000000000000000000000000000000000000000..9d9bd887eb2039c1e050ba39076a5d3db96c0205
mode 100644,000000..100644
--- /dev/null
@@@ -1,1332 -1,0 +1,1333 @@@
- #if ((defined WIN32 || defined _WIN32 || defined WIN64 || defined _WIN64) && !defined __CYGWIN__ && !defined __CYGWIN32__)
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROningen Mixture of Alchemy and Childrens' Stories
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
++#include "gmx_header_config.h"
 +
 +#include "string2.h"
 +#include "smalloc.h"
 +#include "sysstuff.h"
 +#include "filenm.h"
 +#include "macros.h"
 +#include "replace.h"
 +#include "wman.h"
 +#include "statutil.h"
 +#include "copyrite.h"
 +#include "strdb.h"
 +#include <time.h>
 +#include "readinp.h"
 +
 +/* The source code in this file should be thread-safe. 
 +         Please keep it that way. */
 +
 +
 +typedef struct {
 +  const char *search,*replace;
 +} t_sandr_const;
 +
 +typedef struct {
 +  char *search,*replace;
 +} t_sandr;
 +
 +/* The order of these arrays is significant. Text search and replace
 + * for each element occurs in order, so earlier changes can induce
 + * subsequent changes even though the original text might not appear
 + * to invoke the latter changes. */
 +
 +const t_sandr_const sandrTeX[] = {
 +  { "[TT]", "{\\tt " },
 +  { "[tt]", "}"      },
 +  { "[BB]", "{\\bf " },
 +  { "[bb]", "}"      },
 +  { "[IT]", "{\\em " },
 +  { "[it]", "}"      },
 +  { "[PAR]","\n\n"   },
 +  /* Escaping underscore for LaTeX is no longer necessary, and it breaks
 +   * text searching and the index if you do. */
 +  /*
 +  { "_",    "\\_"    },
 +  */
 +  { "$",    "\\$"    },
 +  { "<=",   "\\ensuremath{\\leq{}}"},
 +  { ">=",   "\\ensuremath{\\geq{}}"},
 +  { "<",    "\\textless{}" },
 +  { ">",    "\\textgreater{}" },
 +  { "^",    "\\^{}"    },
 +  { "\\^{}t", "\\ensuremath{^t}" },
 +  { "\\^{}a", "\\ensuremath{^a}" },
 +  { "\\^{}b", "\\ensuremath{^b}" },
 +  { "\\^{}2", "\\ensuremath{^2}" },
 +  { "\\^{}3", "\\ensuremath{^3}" },
 +  { "\\^{}6", "\\ensuremath{^6}" },
 +  { "#",    "\\#"    },
 +  { "[BR]", "\\\\"   },
 +  { "%",    "\\%"    },
 +  { "&",    "\\&"    },
 +  /* The next couple of lines allow true Greek symbols to be written to the 
 +     manual, which makes it look pretty */
 +  { "[GRK]", "\\ensuremath{\\" },
 +  { "[grk]", "}" },
 +  { "[MATH]","\\ensuremath{" },
 +  { "[math]","}" },
 +  { "[CHEVRON]", "\\ensuremath{<}" },
 +  { "[chevron]", "\\ensuremath{>}" },
 +  { "[MAG]", "\\ensuremath{|}" },
 +  { "[mag]", "\\ensuremath{|}" },
 +  { "[INT]","\\ensuremath{\\int" },
 +  { "[FROM]","_" },
 +  { "[from]","" },
 +  { "[TO]", "^" },
 +  { "[to]", "" },
 +  { "[int]","}" },
 +  { "[SUM]","\\ensuremath{\\sum" },
 +  { "[sum]","}" },
 +  { "[SUB]","\\ensuremath{_{" },
 +  { "[sub]","}}" },
 +  { "[SQRT]","\\ensuremath{\\sqrt{" },
 +  { "[sqrt]","}}" },
 +  { "[EXP]","\\ensuremath{\\exp{(" },
 +  { "[exp]",")}}" },
 +  { "[LN]","\\ensuremath{\\ln{(" },
 +  { "[ln]",")}}" },
 +  { "[LOG]","\\ensuremath{\\log{(" },
 +  { "[log]",")}}" },
 +  { "[COS]","\\ensuremath{\\cos{(" },
 +  { "[cos]",")}}" },
 +  { "[SIN]","\\ensuremath{\\sin{(" },
 +  { "[sin]",")}}" },
 +  { "[TAN]","\\ensuremath{\\tan{(" },
 +  { "[tan]",")}}" },
 +  { "[COSH]","\\ensuremath{\\cosh{(" },
 +  { "[cosh]",")}}" },
 +  { "[SINH]","\\ensuremath{\\sinh{(" },
 +  { "[sinh]",")}}" },
 +  { "[TANH]","\\ensuremath{\\tanh{(" },
 +  { "[tanh]",")}}" }
 +};
 +#define NSRTEX asize(sandrTeX)
 +
 +const t_sandr_const sandrTty[] = {
 +  { "[TT]", "" },
 +  { "[tt]", "" },
 +  { "[BB]", "" },
 +  { "[bb]", "" },
 +  { "[IT]", "" },
 +  { "[it]", "" },
 +  { "[MATH]","" },
 +  { "[math]","" },
 +  { "[CHEVRON]","<" },
 +  { "[chevron]",">" },
 +  { "[MAG]", "|" },
 +  { "[mag]", "|" },
 +  { "[INT]","integral" },
 +  { "[FROM]"," from " },
 +  { "[from]","" },
 +  { "[TO]", " to " },
 +  { "[to]", " of" },
 +  { "[int]","" },
 +  { "[SUM]","sum" },
 +  { "[sum]","" },
 +  { "[SUB]","_" },
 +  { "[sub]","" },
 +  { "[SQRT]","sqrt(" },
 +  { "[sqrt]",")" },
 +  { "[EXP]","exp(" },
 +  { "[exp]",")" },
 +  { "[LN]","ln(" },
 +  { "[ln]",")" },
 +  { "[LOG]","log(" },
 +  { "[log]",")" },
 +  { "[COS]","cos(" },
 +  { "[cos]",")" },
 +  { "[SIN]","sin(" },
 +  { "[sin]",")" },
 +  { "[TAN]","tan(" },
 +  { "[tan]",")" },
 +  { "[COSH]","cosh(" },
 +  { "[cosh]",")" },
 +  { "[SINH]","sinh(" },
 +  { "[sinh]",")" },
 +  { "[TANH]","tanh(" },
 +  { "[tanh]",")" },
 +  { "[PAR]","\n\n" },
 +  { "[BR]", "\n"},
 +  { "[GRK]", "" },
 +  { "[grk]", "" }
 +};
 +#define NSRTTY asize(sandrTty)
 +
 +const t_sandr_const sandrWiki[] = {
 +  { "&",    "&amp;" },
 +  { "<",    "&lt;" },
 +  { ">",    "&gt;" },
 +  { "[TT]", "&lt;code&gt;" },
 +  { "[tt]", "&lt;/code&gt;" },
 +  { "[BB]", "'''" },
 +  { "[bb]", "'''" },
 +  { "[IT]", "''" },
 +  { "[it]", "''" },
 +  { "[MATH]","" },
 +  { "[math]","" },
 +  { "[CHEVRON]","<" },
 +  { "[chevron]",">" },
 +  { "[MAG]", "|" },
 +  { "[mag]", "|" },
 +  { "[INT]","integral" },
 +  { "[FROM]"," from " },
 +  { "[from]","" },
 +  { "[TO]", " to " },
 +  { "[to]", " of" },
 +  { "[int]","" },
 +  { "[SUM]","sum" },
 +  { "[sum]","" },
 +  { "[SUB]","_" },
 +  { "[sub]","" },
 +  { "[SQRT]","sqrt(" },
 +  { "[sqrt]",")", },
 +  { "[EXP]","exp(" },
 +  { "[exp]",")" },
 +  { "[LN]","ln(" },
 +  { "[ln]",")" },
 +  { "[LOG]","log(" },
 +  { "[log]",")" },
 +  { "[COS]","cos(" },
 +  { "[cos]",")" },
 +  { "[SIN]","sin(" },
 +  { "[sin]",")" },
 +  { "[TAN]","tan(" },
 +  { "[tan]",")" },
 +  { "[COSH]","cosh(" },
 +  { "[cosh]",")" },
 +  { "[SINH]","sinh(" },
 +  { "[sinh]",")" },
 +  { "[TANH]","tanh(" },
 +  { "[tanh]",")" },
 +  { "[PAR]","\n\n" },
 +  { "[BR]", "\n" },
 +  { "[GRK]", "&" },
 +  { "[grk]", ";" }
 +};
 +#define NSRWIKI asize(sandrWiki)
 +
 +const t_sandr_const sandrNROFF[] = {
 +  { "[TT]", "\\fB " },
 +  { "[tt]", "\\fR" },
 +  { "[BB]", "\\fB " },
 +  { "[bb]", "\\fR" },
 +  { "[IT]", "\\fI " },
 +  { "[it]", "\\fR" },
 +  { "[MATH]","" },
 +  { "[math]","" },
 +  { "[CHEVRON]","<" },
 +  { "[chevron]",">" },
 +  { "[MAG]", "|" },
 +  { "[mag]", "|" },
 +  { "[INT]","integral" },
 +  { "[FROM]"," from " },
 +  { "[from]","" },
 +  { "[TO]", " to " },
 +  { "[to]", " of" },
 +  { "[int]","" },
 +  { "[SUM]","sum" },
 +  { "[sum]","" },
 +  { "[SUB]","_" },
 +  { "[sub]","" },
 +  { "[SQRT]","sqrt(" },
 +  { "[sqrt]",")", },
 +  { "[EXP]","exp(" },
 +  { "[exp]",")" },
 +  { "[LN]","ln(" },
 +  { "[ln]",")" },
 +  { "[LOG]","log(" },
 +  { "[log]",")" },
 +  { "[COS]","cos(" },
 +  { "[cos]",")" },
 +  { "[SIN]","sin(" },
 +  { "[sin]",")" },
 +  { "[TAN]","tan(" },
 +  { "[tan]",")" },
 +  { "[COSH]","cosh(" },
 +  { "[cosh]",")" },
 +  { "[SINH]","sinh(" },
 +  { "[sinh]",")" },
 +  { "[TANH]","tanh(" },
 +  { "[tanh]",")" },
 +  { "[PAR]","\n\n" },
 +  { "\n ",    "\n" },
 +  { "<",    "" },
 +  { ">",    "" },
 +  { "^",    "" },
 +  { "#",    "" },
 +  { "[BR]", "\n"},
 +  { "-",    "\\-"},
 +  { "[GRK]", "" },
 +  { "[grk]", "" }
 +};
 +#define NSRNROFF asize(sandrNROFF)
 +
 +const t_sandr_const sandrHTML[] = {
 +  { "<",    "&lt;" },
 +  { ">",    "&gt;" },
 +  { "[TT]", "<tt>" },
 +  { "[tt]", "</tt>" },
 +  { "[BB]", "<b>" },
 +  { "[bb]", "</b>" },
 +  { "[IT]", "<it>" },
 +  { "[it]", "</it>" },
 +  { "[MATH]","" },
 +  { "[math]","" },
 +  { "[CHEVRON]","<" },
 +  { "[chevron]",">" },
 +  { "[MAG]", "|" },
 +  { "[mag]", "|" },
 +  { "[INT]","integral" },
 +  { "[FROM]"," from " },
 +  { "[from]","" },
 +  { "[TO]", " to " },
 +  { "[to]", " of" },
 +  { "[int]","" },
 +  { "[SUM]","sum" },
 +  { "[sum]","" },
 +  { "[SUB]","_" },
 +  { "[sub]","" },
 +  { "[SQRT]","sqrt(" },
 +  { "[sqrt]",")", },
 +  { "[EXP]","exp(" },
 +  { "[exp]",")" },
 +  { "[LN]","ln(" },
 +  { "[ln]",")" },
 +  { "[LOG]","log(" },
 +  { "[log]",")" },
 +  { "[COS]","cos(" },
 +  { "[cos]",")" },
 +  { "[SIN]","sin(" },
 +  { "[sin]",")" },
 +  { "[TAN]","tan(" },
 +  { "[tan]",")" },
 +  { "[COSH]","cosh(" },
 +  { "[cosh]",")" },
 +  { "[SINH]","sinh(" },
 +  { "[sinh]",")" },
 +  { "[TANH]","tanh(" },
 +  { "[tanh]",")" },
 +  { "[PAR]","<p>" },
 +  { "[BR]", "<br>" },
 +  { "[GRK]", "&"  },
 +  { "[grk]", ";"  }
 +};
 +#define NSRHTML asize(sandrHTML)
 +
 +const t_sandr_const sandrXML[] = {
 +  { "<",    "&lt;" },
 +  { ">",    "&gt;" },
 +  { "[TT]", "<arg>" },
 +  { "[tt]", "</arg>" },
 +  { "[BB]", "<emp>" },
 +  { "[bb]", "</emp>" },
 +  { "[IT]", "<it>" },
 +  { "[it]", "</it>" },
 +  { "[MATH]","" },
 +  { "[math]","" },
 +  { "[CHEVRON]","<" },
 +  { "[chevron]",">" },
 +  { "[MAG]", "|" },
 +  { "[mag]", "|" },
 +  { "[INT]","integral" },
 +  { "[FROM]"," from " },
 +  { "[from]","" },
 +  { "[TO]", " to " },
 +  { "[to]", " of" },
 +  { "[int]","" },
 +  { "[SUM]","sum" },
 +  { "[sum]","" },
 +  { "[SUB]","_" },
 +  { "[sub]","" },
 +  { "[SQRT]","sqrt(" },
 +  { "[sqrt]",")", },
 +  { "[EXP]","exp(" },
 +  { "[exp]",")" },
 +  { "[LN]","ln(" },
 +  { "[ln]",")" },
 +  { "[LOG]","log(" },
 +  { "[log]",")" },
 +  { "[COS]","cos(" },
 +  { "[cos]",")" },
 +  { "[SIN]","sin(" },
 +  { "[sin]",")" },
 +  { "[TAN]","tan(" },
 +  { "[tan]",")" },
 +  { "[COSH]","cosh(" },
 +  { "[cosh]",")" },
 +  { "[SINH]","sinh(" },
 +  { "[sinh]",")" },
 +  { "[TANH]","tanh(" },
 +  { "[tanh]",")" },
 +  { "[PAR]","</par>\n<par>" },
 +  { "[BR]", "<br />" },
 +  { "[GRK]", "" },
 +  { "[grk]", "" }
 +};
 +#define NSRXML asize(sandrXML)
 +
 +static void mynum(char *buf,int n)
 +{
 +  if (n >= 10)
 +    sprintf(buf,"%2d",n);
 +  else
 +    sprintf(buf,"0%1d",n);
 +}
 +
 +static char *mydate(char buf[], int maxsize,gmx_bool bWiki)
 +{
 +  const char *mon[] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun", 
 +                       "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" };
 +  const char *day[] = { "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat" };
 +  const char *num[] = { "01", "02", "03", "04", "05", "06","07", "08", "09" }; 
 +  time_t now;
 +  struct tm tm;
 +  
 +  time(&now);
++#ifdef GMX_NATIVE_WINDOWS
 +  /* Native windows */
 +  localtime_s(&tm,&now);
 +#else
 +  localtime_r(&now,&tm);
 +#endif
 +
 +  /* subtract one from maxsize, so we have room for \0. */
 +  if (bWiki) {
 +    char dd[8],mm[8],ss[8],hh[8],mn[8];
 +    
 +    mynum(dd,tm.tm_mday);
 +    mynum(mm,tm.tm_mon);
 +    mynum(ss,tm.tm_sec);
 +    mynum(hh,tm.tm_hour);
 +    mynum(mn,tm.tm_min);
 +    sprintf(buf,"%4d-%2s-%2sT%2s:%2s:%2sZ",
 +           tm.tm_year+1900,mm,dd,hh,mn,ss);
 +  }
 +  else
 +    sprintf(buf,"%s %d %s %d",day[tm.tm_wday],tm.tm_mday,
 +           mon[tm.tm_mon],tm.tm_year+1900);
 +  
 +  return buf;
 +}
 +
 +/* Data structure for saved HTML links */
 +typedef struct t_linkdata {
 +  int     nsr;
 +  t_sandr *sr;
 +} t_linkdata;
 +
 +static t_linkdata *init_linkdata()
 +{
 +  t_linkdata *p;
 +  snew(p,1);
 +  p->sr=NULL;
 +  p->nsr=0;
 +
 +  return p;
 +}
 +
 +static void finish_linkdata(t_linkdata *p)
 +{
 +  int i;
 +  
 +  for(i=0;i<p->nsr;i++) {
 +    sfree(p->sr[i].search);    
 +    sfree(p->sr[i].replace);
 +  }
 +  sfree(p->sr);
 +  sfree(p);
 +}
 +
 +static char *repall(const char *s,int nsr,const t_sandr_const sa[])
 +{
 +  int  i;
 +  char *buf1,*buf2;
 +  
 +  /* Copy input to a non-constant char buffer.
 +   * buf1 is allocated here 
 +   */
 +  buf1=gmx_strdup(s); 
 +  
 +  for(i=0; (i<nsr); i++) {
 +    /* Replace in buffer1, put result in buffer2.
 +     * buf2 is allocated here.
 +     */
 +    buf2=replace(buf1,sa[i].search,sa[i].replace);
 +    sfree(buf1);
 +    buf1=buf2;
 +  }
 +  
 +  return buf1;
 +} 
 +
 +static char *repallww(const char *s,int nsr,const t_sandr sa[])
 +{
 +  int  i;
 +  char *buf1,*buf2;
 +
 +  /* Copy input to a non-constant char buffer.
 +   * buf1 is allocated here 
 +   */
 +  buf1=gmx_strdup(s); 
 +  
 +  for(i=0; (i<nsr); i++) {
 +    /* Replace in buffer1, put result in buffer2.
 +     * buf2 is allocated here.
 +     */
 +    buf2=replaceww(buf1,sa[i].search,sa[i].replace);
 +    sfree(buf1);
 +    buf1=buf2;
 +  }
 +  return buf1;
 +}
 +
 +static char *html_xref(char *s,const char *program, t_linkdata *links,gmx_bool bWiki)
 +{
 +  char   buf[256],**filestr;
 +  int    i,j,n;
 +  
 +  if (links->sr == NULL) {
 +    n=get_file("links.dat",&(filestr));
 +    links->nsr=n;
 +    snew(links->sr,n);
 +    for(i=0,j=0; (i<n); i++) {
 +      if (!program || (gmx_strcasecmp(program,filestr[i])  != 0)) {
 +      links->sr[j].search=gmx_strdup(filestr[i]);
 +      if (bWiki)
 +        sprintf(buf,"[[%s]]",filestr[i]);
 +      else
 +        sprintf(buf,"<a href=\"%s.html\">%s</a>",filestr[i],filestr[i]);
 +      links->sr[j].replace=gmx_strdup(buf);
 +      j++;
 +      }
 +    }
 +    links->nsr=j;
 +    for(i=0;i<n;i++)
 +      sfree(filestr[i]);
 +    sfree(filestr);
 +  }
 +  return repallww(s,links->nsr,links->sr);
 +}
 +
 +char *check_tex(const char *s)
 +{
 +  return repall(s,NSRTEX,sandrTeX);
 +}
 +
 +static char *check_nroff(const char *s)
 +{
 +  return repall(s,NSRNROFF,sandrNROFF);
 +}
 +
 +static char *check_wiki(const char *s,const char *program, t_linkdata *links)
 +{
 +  char *buf;
 +  
 +  buf = repall(s,NSRWIKI,sandrWiki);
 +  buf = html_xref(buf,program,links,TRUE);
 +  
 +  return buf;
 +}
 +
 +static char *check_html(const char *s,const char *program, t_linkdata *links)
 +{
 +  char *buf;
 +  
 +  buf = repall(s,NSRHTML,sandrHTML);
 +  buf = html_xref(buf,program,links,FALSE);
 +  
 +  return buf;
 +}
 +
 +#define NWR(s) check_wiki(s,program,links)
 +#define NSR(s) check_html(s,program,links)
 +  
 +#define FLAG_SET(flag, mask) ((flag & mask) == mask)
 +char *fileopt(unsigned long flag,char buf[],int maxsize)
 +{
 +  char tmp[256];
 +  
 +  if (FLAG_SET(flag, ffRW))
 +    sprintf(tmp,"In/Out");
 +  else if (FLAG_SET(flag, ffREAD))
 +    sprintf(tmp,"Input");
 +  else if (FLAG_SET(flag, ffWRITE))
 +    sprintf(tmp,"Output");
 +  else
 +    sprintf(tmp,"Dunno");
 +
 +  if (FLAG_SET(flag, ffOPT)) {
 +    strcat(tmp,", Opt");
 +    if (FLAG_SET(flag, ffSET)) 
 +      strcat(tmp,"!");
 +    else
 +      strcat(tmp,".");
 +  }
 +  if (FLAG_SET(flag, ffLIB))
 +    strcat(tmp,", Lib.");
 +  if (FLAG_SET(flag, ffMULT))
 +    strcat(tmp,", Mult.");
 +
 +  sprintf(buf,"%s",tmp);
 +  
 +  return buf;
 +}
 +
 +static void write_texman(FILE *out,const char *program,
 +                       int nldesc,const char **desc,
 +                       int nfile,t_filenm *fnm,
 +                       int npargs,t_pargs *pa,
 +                       int nbug,const char **bugs,
 +                       t_linkdata *links)
 +{
 +  int i;
 +  char tmp[256];
 +  
 +  fprintf(out,"\\section{\\normindex{%s}}\\label{%s}\n\n",check_tex(program),check_tex(program));
 +  
 +  if (nldesc > 0)
 +    for(i=0; (i<nldesc); i++) 
 +      fprintf(out,"%s\n",check_tex(desc[i]));
 +
 +  if (nfile > 0) {
 +    fprintf(out,"\\vspace{-2ex}\\begin{tabbing}\n");
 +    fprintf(out,"\n{\\normalsize \\bf Files}\\nopagebreak\\\\\n");
 +    fprintf(out,"{\\tt ~~~~~~~} \\= {\\tt ~~~~~~~~~~~~~~} \\= "
 +          "~~~~~~~~~~~~~~~~~~~~~~ \\= \\nopagebreak\\kill\n");
 +    for(i=0; (i<nfile); i++)
 +      fprintf(out,"\\>{\\tt %s} \\'\\> {\\tt %s} \\' %s \\> "
 +            "\\parbox[t]{0.55\\linewidth}{%s} \\\\\n",
 +            check_tex(fnm[i].opt),check_tex(fnm[i].fns[0]),
 +            check_tex(fileopt(fnm[i].flag,tmp,255)),
 +            check_tex(ftp2desc(fnm[i].ftp)));
 +    fprintf(out,"\\end{tabbing}\\vspace{-4ex}\n");
 +  }
 +  if (npargs > 0) {
 +    fprintf(out,"\\vspace{-2ex}\\begin{tabbing}\n");
 +    fprintf(out,"\n{\\normalsize \\bf Other options}\\nopagebreak\\\\\n");
 +    fprintf(out,"{\\tt ~~~~~~~~~~} \\= vector \\= "
 +          "{\\tt ~~~~~~~} \\= \\nopagebreak\\kill\n");
 +    for(i=0; (i<npargs); i++) {
 +      if (strlen(check_tex(pa_val(&(pa[i]),tmp,255))) <= 8)
 +      fprintf(out,"\\> {\\tt %s} \\'\\> %s \\'\\> {\\tt %s} \\' "
 +              "\\parbox[t]{0.68\\linewidth}{%s}\\\\\n",
 +              check_tex(pa[i].option),get_arg_desc(pa[i].type),
 +              check_tex(pa_val(&(pa[i]),tmp,255)),
 +              check_tex(pa[i].desc));
 +      else
 +              fprintf(out,"\\> {\\tt %s} \\'\\> %s \\'\\>\\\\\n"
 +              "\\> \\'\\> \\'\\> {\\tt %s} \\' "
 +              "\\parbox[t]{0.7\\linewidth}{%s}\\\\\n",
 +              check_tex(pa[i].option),get_arg_desc(pa[i].type),
 +              check_tex(pa_val(&(pa[i]),tmp,255)),
 +              check_tex(pa[i].desc));
 +    }
 +    fprintf(out,"\\end{tabbing}\\vspace{-4ex}\n");
 +  }
 +  if (nbug > 0) {
 +    fprintf(out,"\n");
 +    fprintf(out,"\\begin{itemize}\n");
 +    for(i=0; (i<nbug); i++)
 +      fprintf(out,"\\item %s\n",check_tex(bugs[i]));
 +    fprintf(out,"\\end{itemize}\n");
 +  }
 +/*   fprintf(out,"\n\\newpage\n"); */
 +}
 +
 +static void write_nroffman(FILE *out,
 +                         const char *program,
 +                         int nldesc,const char **desc,
 +                         int nfile,t_filenm *fnm,
 +                         int npargs,t_pargs *pa,
 +                         int nbug,const char **bugs,
 +                         t_linkdata *links)
 +
 +{
 +  int i;
 +  char tmp[256];
 +  
 +  
 +  fprintf(out,".TH %s 1 \"%s\" \"\" \"GROMACS suite, %s\"\n",program,mydate(tmp,255,FALSE),GromacsVersion());
 +  fprintf(out,".SH NAME\n");
 +  fprintf(out,"%s\n",program);
 +  fprintf(out,".B %s\n",GromacsVersion());
 +  
 +  fprintf(out,".SH SYNOPSIS\n");
 +  fprintf(out,"\\f3%s\\fP\n",program);
 +
 +  /* command line arguments */
 +  if (nfile > 0) {
 +    for(i=0; (i<nfile); i++)
 +      fprintf(out,".BI \"%s\" \" %s \"\n",check_nroff(fnm[i].opt),
 +            check_nroff(fnm[i].fns[0]));
 +  }
 +  if (npargs > 0) {
 +    for(i=0; (i<npargs); i++)
 +      if (pa[i].type == etBOOL)
 +      fprintf(out,".BI \"\\-[no]%s\" \"\"\n",check_nroff(pa[i].option+1));
 +      else
 +      fprintf(out,".BI \"%s\" \" %s \"\n",check_nroff(pa[i].option),
 +              check_nroff(get_arg_desc(pa[i].type)));
 +  }
 +  
 +  /* description */
 +  if (nldesc > 0) {
 +    fprintf(out,".SH DESCRIPTION\n");
 +    for(i=0; (i<nldesc); i++) 
 +      fprintf(out,"\\&%s\n",check_nroff(desc[i]));
 +  }
 +
 +  /* FILES */
 +  if (nfile > 0) {
 +    fprintf(out,".SH FILES\n");
 +    for(i=0; (i<nfile); i++)
 +      fprintf(out,".BI \"%s\" \" %s\" \n.B %s\n %s \n\n",
 +            check_nroff(fnm[i].opt),
 +              check_nroff(fnm[i].fns[0]),
 +              check_nroff(fileopt(fnm[i].flag,tmp,255)),
 +            check_nroff(ftp2desc(fnm[i].ftp)));
 +  }
 +  
 +  /* other options */
 +  fprintf(out,".SH OTHER OPTIONS\n");
 +  if ( npargs > 0 ) {
 +    for(i=0; (i<npargs); i++) {
 +      if (pa[i].type == etBOOL)
 +      fprintf(out,".BI \"\\-[no]%s\"  \"%s\"\n %s\n\n",
 +              check_nroff(pa[i].option+1),
 +              check_nroff(pa_val(&(pa[i]),tmp,255)),
 +                check_nroff(pa[i].desc));
 +      else
 +      fprintf(out,".BI \"%s\"  \" %s\" \" %s\" \n %s\n\n",
 +              check_nroff(pa[i].option),
 +                check_nroff(get_arg_desc(pa[i].type)),
 +              check_nroff(pa_val(&(pa[i]),tmp,255)),
 +                check_nroff(pa[i].desc));
 +    }
 +  }
 +
 +  if (nbug > 0) {
 +    fprintf(out,".SH KNOWN PROBLEMS\n");
 +    for(i=0; (i<nbug); i++)
 +      fprintf(out,"\\- %s\n\n",check_nroff(bugs[i]));
 +  }
 +
 +  fprintf(out,".SH SEE ALSO\n.BR gromacs(7)\n\n");
 +  fprintf(out,"More information about \\fBGROMACS\\fR is available at <\\fIhttp://www.gromacs.org/\\fR>.\n");
 +
 +}
 +
 +char *check_tty(const char *s)
 +{
 +  return repall(s,NSRTTY,sandrTty);
 +}
 +
 +void
 +print_tty_formatted(FILE *out, int nldesc, const char **desc,int indent,
 +                    t_linkdata *links,const char *program,gmx_bool bWiki)
 +{
 +  char *buf;
 +  char *temp;
 +  int buflen,i,j;
 +
 +  buflen = 80*nldesc;
 +  snew(buf,buflen);
 +  for(i=0; (i<nldesc); i++) {
 +    if ((strlen(buf)>0) && 
 +      (buf[strlen(buf)-1] !=' ') && (buf[strlen(buf)-1] !='\n'))
 +      strcat(buf," ");
 +    if (bWiki)
 +      temp=NWR(desc[i]);
 +    else
 +      temp=check_tty(desc[i]);
 +    if (strlen(buf) + strlen(temp) >= (size_t)(buflen-2)) {
 +      buflen += strlen(temp);
 +      srenew(buf,buflen);
 +    }
 +    strcat(buf,temp);
 +    sfree(temp);
 +  }
 +  /* Make lines of at most 79 characters */
 +  temp = wrap_lines(buf,78,indent,FALSE);
 +  fprintf(out,"%s\n",temp);
 +  sfree(temp);
 +  sfree(buf);
 +}
 +
 +static void write_ttyman(FILE *out,
 +                       const char *program,
 +                       int nldesc,const char **desc,
 +                       int nfile,t_filenm *fnm,
 +                       int npargs,t_pargs *pa,
 +                       int nbug,const char **bugs,gmx_bool bHeader,
 +                       t_linkdata *links)
 +{
 +  int i;
 +  char buf[256];
 +  char *tmp;
 +  
 +  if (bHeader) {
 +    fprintf(out,"%s\n\n",check_tty(program));
 +    fprintf(out,"%s\n%s\n",GromacsVersion(),mydate(buf,255,FALSE));
 +  }
 +  if (nldesc > 0) {
 +    fprintf(out,"DESCRIPTION\n-----------\n");
 +    print_tty_formatted(out,nldesc,desc,0,links,program,FALSE);
 +  }
 +  if (nbug > 0) {
 +    fprintf(out,"\n");
 +    fprintf(out,"KNOWN PROBLEMS\n----------\n");
 +    for(i=0; i<nbug; i++) {
 +      snew(tmp,strlen(bugs[i])+3);
 +      strcpy(tmp,"* ");
 +      strcpy(tmp+2,check_tty(bugs[i]));
 +      fprintf(out,"%s\n",wrap_lines(tmp,78,2,FALSE));
 +      sfree(tmp);
 +    }
 +  }
 +  if (nfile > 0) {
 +    fprintf(out,"\n");
 +    pr_fns(out,nfile,fnm);
 +  }
 +  if (npargs > 0) {
 +    print_pargs(out,npargs,pa,FALSE);
 +  }
 +}
 +
 +static void pr_html_files(FILE *out,int nfile,t_filenm fnm[],
 +                        const char *program,t_linkdata *links,gmx_bool bWiki)
 +{ 
 +  int  i;
 +  char link[10],tmp[255];
 +  
 +  if (bWiki)
 +    fprintf(out," %-10s %-12s %-12s %-s\n"
 +          " -----------------------------------------------------\n",
 +          "Option","Filename","Type","Description");
 +  else
 +    fprintf(out,
 +          "<TABLE BORDER=1 CELLSPACING=0 CELLPADDING=2>\n"
 +          "<TR>"
 +          "<TH>option</TH>"
 +          "<TH>filename</TH>"
 +          "<TH>type</TH>"
 +          "<TH>description</TH>"
 +          "</TR>\n");
 +  
 +  for(i=0; (i<nfile); i++) {
 +    strcpy(link,ftp2ext(fnm[i].ftp));
 +    if (strcmp(link,"???")==0)
 +      strcpy(link,"files");
 +    if (bWiki)
 +      fprintf(out," %-10s %-16s %-12s %-s\n",
 +            fnm[i].opt,
 +            NWR(fnm[i].fns[0]),
 +            fileopt(fnm[i].flag,tmp,255),
 +            NWR(ftp2desc(fnm[i].ftp)));
 +    else
 +      fprintf(out,
 +            "<TR>"
 +            "<TD ALIGN=RIGHT> <b><tt>%s</tt></b> </TD>"
 +            "<TD ALIGN=RIGHT> <tt><a href=\"%s.html\">%12s</a></tt> </TD>"
 +            "<TD> %s </TD>"
 +            "<TD> %s </TD>"
 +            "</TR>\n",
 +            fnm[i].opt,link,fnm[i].fns[0],fileopt(fnm[i].flag,tmp,255),
 +            NSR(ftp2desc(fnm[i].ftp)));
 +  }
 +  if (!bWiki)
 +    fprintf(out,"</TABLE>\n");
 +}
 +
 +static void write_wikiman(FILE *out,
 +                        const char *program,
 +                        int nldesc,const char **desc,
 +                        int nfile,t_filenm *fnm,
 +                        int npargs,t_pargs *pa,
 +                        int nbug,const char **bugs,gmx_bool bHeader,
 +                        t_linkdata *links)
 +{
 +  int i;
 +  char buf[256],link[10];
 +  char *tmp,*tmp2;
 +  fprintf(out,"<page>\n<title>Manual:%s_%s</title>\n",program,
 +        VERSION);
 +  fprintf(out,"<revision>\n");
 +  fprintf(out,"<timestamp>%s</timestamp>\n",mydate(buf,255,TRUE));
 +  fprintf(out,"<text xml:space=\"preserve\">\n");
 +  if (nldesc > 0) {
 +    fprintf(out,"== Description ==\n");
 +    print_tty_formatted(out,nldesc,desc,0,links,program,TRUE);
 +    fprintf(out,"\n");
 +  }
 +  if (nbug > 0) {
 +    fprintf(out,"== Known Problems ==\n");
 +    for(i=0; i<nbug; i++) {
 +      snew(tmp,strlen(bugs[i])+3);
 +      strcpy(tmp,"* ");
 +      strcpy(tmp+2,bugs[i]);
 +      fprintf(out,"%s\n",NWR(tmp));
 +      sfree(tmp);
 +    }
 +  }
 +  if (nfile > 0) {
 +    fprintf(out,"\n== Files ==\n");
 +    pr_html_files(out,nfile,fnm,program,links,TRUE);
 +  }
 +  if (npargs > 0) {
 +    fprintf(out,"\n== Options ==\n");
 +    fprintf(out," %-12s %-6s %-6s  %-s\n",
 +          "Option","Type","Value","Description");
 +    fprintf(out," ------------------------------------------------------\n");
 +    for(i=0; (i<npargs); i++) {
 +      tmp = NWR(pargs_print_line(&pa[i],TRUE));
 +      fprintf(out,"%s",tmp);
 +      sfree(tmp);
 +    }
 +  }
 +  fprintf(out,"[[category:Manual_Pages_%s|%s]]\n",VERSION,program);
 +  fprintf(out,"</text>\n");
 +  fprintf(out,"</revision>\n");
 +  fprintf(out,"</page>\n\n");
 +}
 +
 +static void write_htmlman(FILE *out,
 +                        const char *program,
 +                        int nldesc,const char **desc,
 +                        int nfile,t_filenm *fnm,
 +                        int npargs,t_pargs *pa,
 +                        int nbug,const char **bugs,
 +                        t_linkdata *links)
 +{
 +  int i;
 +  char link[10],tmp[255];
 +  
 +  fprintf(out,"<HTML>\n<HEAD>\n<TITLE>%s</TITLE>\n",program);
 +  fprintf(out,"<LINK rel=stylesheet href=\"style.css\" type=\"text/css\">\n");
 +  fprintf(out,"<BODY text=\"#000000\" bgcolor=\"#FFFFFF\" link=\"#0000FF\" vlink=\"#990000\" alink=\"#FF0000\">\n");
 +  fprintf(out,"<TABLE WIDTH=\"98%%\" NOBORDER >\n<TR><TD WIDTH=400>\n");
 +  fprintf(out,"<TABLE WIDTH=400 NOBORDER>\n<TD WIDTH=116>\n");
 +  fprintf(out,"<a href=\"http://www.gromacs.org/\">"
 +        "<img SRC=\"../images/gmxlogo_small.png\""
 +        "BORDER=0 </a></td>\n");
 +  fprintf(out,"<td ALIGN=LEFT VALIGN=TOP WIDTH=280>"
 +        "<br><h2>%s</h2>",program);
 +  fprintf(out,"<font size=-1><A HREF=\"../online.html\">Main Table of Contents</A></font><br>");
 +  fprintf(out,"<br></td>\n</TABLE></TD><TD WIDTH=\"*\" ALIGN=RIGHT VALIGN=BOTTOM><p><B>%s<br>\n",GromacsVersion());
 +  fprintf(out,"%s</B></td></tr></TABLE>\n<HR>\n",mydate(tmp,255,FALSE));
 +  
 +  if (nldesc > 0) {
 +    fprintf(out,"<H3>Description</H3>\n<p>\n");
 +    for(i=0; (i<nldesc); i++) 
 +      fprintf(out,"%s\n",NSR(desc[i]));
 +  }
 +  if (nfile > 0) {
 +    fprintf(out,"<P>\n");
 +    fprintf(out,"<H3>Files</H3>\n");
 +    pr_html_files(out,nfile,fnm,program,links,FALSE);
 +  }
 +  if (npargs > 0) {
 +    fprintf(out,"<P>\n");
 +    fprintf(out,"<H3>Other options</H3>\n");
 +    fprintf(out,
 +          "<TABLE BORDER=1 CELLSPACING=0 CELLPADDING=2>\n"
 +          "<TR>"
 +          "<TH>option</TH>"
 +          "<TH>type</TH>"
 +          "<TH>default</TH>"
 +          "<TH>description</TH>"
 +          "</TR>\n");
 +    for(i=0; (i<npargs); i++)
 +      fprintf(out,
 +            "<TR>"
 +            "<TD ALIGN=RIGHT> <b><tt>%s%s</tt></b> </TD>"
 +            "<TD ALIGN=RIGHT> %s </TD>"
 +            "<TD ALIGN=RIGHT> <tt>%s</tt> </TD>"
 +            "<TD> %s </TD>"
 +            "</TD>\n",
 +            (pa[i].type == etBOOL)?"-[no]":"-",pa[i].option+1,
 +            get_arg_desc(pa[i].type),pa_val(&(pa[i]),tmp,255),NSR(pa[i].desc));
 +    fprintf(out,"</TABLE>\n");
 +  }
 +  if (nbug > 0) {
 +    fprintf(out,"<P>\n");
 +    fprintf(out,"<H3>Known problems</H3>\n");
 +    fprintf(out,"<UL>\n");
 +    for(i=0; (i<nbug); i++)
 +      fprintf(out,"<LI>%s\n",NSR(bugs[i]));
 +    fprintf(out,"</UL>\n");
 +  }
 +  fprintf(out,"<P>\n");
 +  fprintf(out,"<hr>\n<div ALIGN=RIGHT>\n");
 +  fprintf(out,"<font size=\"-1\"><a href=\"http://www.gromacs.org\">"
 +        "http://www.gromacs.org</a></font><br>\n");
 +  fprintf(out,"<font size=\"-1\"><a href=\"mailto:gromacs@gromacs.org\">"
 +        "gromacs@gromacs.org</a></font><br>\n");
 +  fprintf(out,"</div>\n");
 +  fprintf(out,"</BODY>\n");
 +}
 +
 +char *check_xml(const char *s,const char *program,t_linkdata *links)
 +{
 +  char *buf;
 +  
 +  buf=repall(s,NSRXML,sandrXML);
 +  buf=html_xref(buf,program,links,FALSE);     /* the same in html and xml */
 +  
 +  return buf;
 +}
 +
 +static void write_xmlman(FILE *out,
 +                       const char *program,
 +                       int nldesc,const char **desc,
 +                       int nfile,t_filenm *fnm,
 +                       int npargs,t_pargs *pa,
 +                       int nbug,const char **bugs,
 +                       t_linkdata *links)
 +{
 +  int i;
 +  char link[10],buf[256],opt[10];
 +
 +#define NSR2(s) check_xml(s,program,links)
 +#define FLAG(w,f) (((w) & (f))==(f)) 
 +
 +  fprintf(out,"<gromacs-manual version=\"%s\" date=\"%s\" www=\"http://www.gromacs.org\">\n",GromacsVersion(),mydate(buf,255,FALSE));
 +  /* fprintf(out,"<LINK rel=stylesheet href=\"style.css\" type=\"text/css\">\n"); */
 +
 +  fprintf(out,"<program name=\"%s\">",program);  
 +  if (nldesc > 0) {
 +    fprintf(out,"\n<description>\n<par>\n");
 +    for(i=0; (i<nldesc); i++) 
 +      fprintf(out,"%s\n",NSR2(desc[i]));
 +  }
 +  fprintf(out,"</par>\n</description>\n");
 +
 +  if (nfile > 0) {
 +    fprintf(out,"\n<files>\n");
 +    for(i=0; (i<nfile); i++) {
 +      strcpy(link,ftp2ext(fnm[i].ftp));
 +      if (strcmp(link,"???")==0)
 +      strcpy(link,"files");
 +        if (fnm[i].opt[0]=='-') strcpy(opt,fnm[i].opt+1);
 +      else strcpy(opt,fnm[i].opt);
 +      fprintf(out,
 +            "<file type=\"%s\" typeid=\"%d\">\n"
 +              "\t<flags read=\"%d\" write=\"%d\" optional=\"%d\"/>\n"
 +            "\t<option>%s</option>\n"
 +            "\t<default-name link=\"%s.html\">%s</default-name>\n"
 +            "\t<description>%s</description>\n"
 +            "</file>\n",
 +            ftp2defnm(fnm[i].ftp),    /* from gmxlib/filenm.c */
 +            fnm[i].ftp,
 +            FLAG(fnm[i].flag,ffREAD), FLAG(fnm[i].flag,ffWRITE), FLAG(fnm[i].flag,ffOPT), 
 +            opt,link,fnm[i].fn,/*fileopt(fnm[i].flag),*/
 +            NSR(ftp2desc(fnm[i].ftp)));
 +    }
 +    fprintf(out,"</files>\n");
 +  }
 +
 +  if (npargs > 0) {
 +    fprintf(out,"\n<options>\n");
 +    for(i=0; (i<npargs); i++)
 +      fprintf(out,
 +            "<option type=\"%s\" hidden=\"%d\">\n"
 +            "\t<name >%s</name>\n"
 +            "\t<default-value>%s</default-value>\n"
 +            "\t<description>%s</description>\n"
 +            "</option>\n",
 +            get_arg_desc(pa[i].type), is_hidden(&pa[i]),
 +            pa[i].option+1,                  /* +1 - with no trailing '-' */
 +            pa_val(&(pa[i]),buf,255),pa[i].desc); /*get_argtp()[pa[i].type],*/
 +    fprintf(out,"</options>\n");
 +  }
 +
 +  if (nbug > 0) {
 +    fprintf(out,"\n<bugs>\n");
 +    for(i=0; (i<nbug); i++)
 +      fprintf(out,"\t<bug>%s</bug>\n",NSR(bugs[i]));
 +    fprintf(out,"</bugs>\n");
 +  }
 +  fprintf(out,"\n</program>\n</gromacs-manual>\n");
 +#undef FLAG  
 +}
 +
 +static void pr_opts(FILE *fp, 
 +                  int nfile,  t_filenm *fnm, 
 +                  int npargs, t_pargs pa[], int shell)
 +{
 +  int i;
 +  
 +  switch (shell) {
 +  case eshellCSH:
 +    fprintf(fp," \"c/-/(");
 +    for (i=0; i<nfile; i++)
 +      fprintf(fp," %s",fnm[i].opt+1);
 +    for (i=0; i<npargs; i++)
 +      if ( (pa[i].type==etBOOL) && *(pa[i].u.b) )
 +      fprintf(fp," no%s",pa[i].option+1);
 +      else
 +      fprintf(fp," %s",pa[i].option+1);
 +    fprintf(fp,")/\"");
 +    break;
 +  case eshellBASH:
 +    fprintf(fp,"if (( $COMP_CWORD <= 1 )) || [[ $c == -* ]]; then COMPREPLY=( $(compgen  -W '");
 +    for (i=0; i<nfile; i++)
 +      fprintf(fp," -%s",fnm[i].opt+1);
 +    for (i=0; i<npargs; i++)
 +      if ( (pa[i].type==etBOOL) && *(pa[i].u.b) )
 +      fprintf(fp," -no%s",pa[i].option+1);
 +      else
 +      fprintf(fp," -%s",pa[i].option+1);
 +    fprintf(fp,"' -- $c)); return 0; fi\n");
 +    break;
 +  case eshellZSH:
 +    fprintf(fp," -x 's[-]' -s \"");
 +    for (i=0; i<nfile; i++)
 +      fprintf(fp," %s",fnm[i].opt+1);
 +    for (i=0; i<npargs; i++)
 +      if ( (pa[i].type==etBOOL) && *(pa[i].u.b) )
 +      fprintf(fp," no%s",pa[i].option+1);
 +      else
 +      fprintf(fp," %s",pa[i].option+1);
 +    fprintf(fp,"\" ");
 +    break;
 +  }
 +}
 +
 +static void write_cshcompl(FILE *out,
 +                         int nfile,  t_filenm *fnm,
 +                         int npargs, t_pargs *pa)
 +{
 +  fprintf(out,"complete %s",ShortProgram());
 +  pr_enums(out,npargs,pa,eshellCSH);
 +  pr_fopts(out,nfile,fnm,eshellCSH);
 +  pr_opts(out,nfile,fnm,npargs,pa,eshellCSH);
 +  fprintf(out,"\n");
 +}
 +
 +static void write_zshcompl(FILE *out,
 +                         int nfile,  t_filenm *fnm,
 +                         int npargs, t_pargs *pa)
 +{
 +  fprintf(out,"compctl ");
 +
 +  /* start with options, since they are always present */
 +  pr_opts(out,nfile,fnm,npargs,pa,eshellZSH);
 +  pr_enums(out,npargs,pa,eshellZSH);
 +  pr_fopts(out,nfile,fnm,eshellZSH);
 +  fprintf(out,"-- %s\n",ShortProgram());
 +}
 +
 +static void write_bashcompl(FILE *out,
 +                          int nfile,  t_filenm *fnm,
 +                          int npargs, t_pargs *pa)
 +{
 +  /* Advanced bash completions are handled by shell functions.
 +   * p and c hold the previous and current word on the command line.
 +   * We need to use extended globbing, so write it in each completion file */
 +  fprintf(out,"shopt -s extglob\n");
 +  fprintf(out,"_%s_compl() {\nlocal p c\n",ShortProgram());
 +  fprintf(out,"COMPREPLY=() c=${COMP_WORDS[COMP_CWORD]} p=${COMP_WORDS[COMP_CWORD-1]}\n");
 +  pr_opts(out,nfile,fnm,npargs,pa,eshellBASH);
 +  fprintf(out,"case \"$p\" in\n");
 +  
 +  pr_enums(out,npargs,pa,eshellBASH);
 +  pr_fopts(out,nfile,fnm,eshellBASH);
 +  fprintf(out,"esac }\ncomplete -F _%s_compl %s\n",ShortProgram(),ShortProgram());
 +}
 +
 +static void write_py(FILE *out,const char *program,
 +                   int nldesc,const char **desc,
 +                   int nfile,t_filenm *fnm,
 +                   int npargs,t_pargs *pa,
 +                   int nbug,const char **bugs,
 +                   t_linkdata *links)
 +{
 +  gmx_bool bHidden;
 +  const char *cls = program;
 +  char *tmp;
 +  int  i,j;
 +
 +  /* Header stuff */  
 +  fprintf(out,"#!/usr/bin/python\n\nfrom GmxDialog import *\n\n");
 +  
 +  /* Class definition */
 +  fprintf(out,"class %s:\n",cls);
 +  fprintf(out,"    def __init__(self,tk):\n");
 +  
 +  /* Help text */
 +  fprintf(out,"        %s_help = \"\"\"\n",cls);
 +  fprintf(out,"        DESCRIPTION\n");
 +  print_tty_formatted(out,nldesc,desc,8,links,program,FALSE);
 +  if (nbug > 0) {
 +    fprintf(out,"\n        BUGS and PROBLEMS\n");
 +    for(i=0; i<nbug; i++) {
 +      snew(tmp,strlen(bugs[i])+3);
 +      strcpy(tmp,"* ");
 +      strcpy(tmp+2,check_tty(bugs[i]));
 +      fprintf(out,"%s\n",wrap_lines(tmp,78,10,TRUE));
 +      sfree(tmp);
 +    }
 +  }
 +  fprintf(out,"        \"\"\"\n\n        # Command line options\n");
 +  /* File options */
 +  fprintf(out,"        flags = []\n");
 +  for(i=0; (i<nfile); i++) 
 +    fprintf(out,"        flags.append(pca_file('%s',\"%s\",0,%d))\n",
 +          ftp2ext_generic(fnm[i].ftp),fnm[i].opt ? fnm[i].opt : "k",
 +          is_optional(&(fnm[i])));
 +          
 +          
 +  /* Other options */
 +  for(i=0; (i<npargs); i++) {
 +    switch(pa[i].type) {
 +    case etINT:
 +      fprintf(out,"        flags.append(pca_int(\"%s\",\"%s\",%d,%d))\n",
 +            pa[i].option,pa[i].desc,*pa[i].u.i,is_hidden(&(pa[i])));
 +      break;
 +    case etREAL:
 +    case etTIME:
 +      fprintf(out,"        flags.append(pca_float(\"%s\",\"%s\",%f,%d))\n",
 +            pa[i].option,pa[i].desc,*pa[i].u.r,is_hidden(&(pa[i])));
 +      break;
 +    case etSTR:
 +    case etBOOL:
 +      fprintf(out,"        flags.append(pca_gmx_bool(\"%s\",\"%s\",%d,%d))\n",
 +            pa[i].option,pa[i].desc,*pa[i].u.b,is_hidden(&(pa[i])));
 +      break;
 +    case etRVEC:
 +      fprintf(stderr,"Sorry, no rvecs yet...\n");
 +      break;
 +    case etENUM:
 +      fprintf(out,"        flags.append(pca_enum(\"%s\",\"%s\",\n",
 +            pa[i].option,pa[i].desc);
 +      fprintf(out,"        ['%s'",pa[i].u.c[1]);
 +      for(j=2; (pa[i].u.c[j] != NULL); j++)
 +      fprintf(out,",'%s'",pa[i].u.c[j]);
 +      fprintf(out,"],%d))\n",is_hidden(&(pa[i])));
 +      break;
 +    default:
 +      break;
 +    }
 +  }
 +    
 +  /* Make the dialog box */
 +  fprintf(out,"        gmxd = gmx_dialog(tk,\"%s\",flags,%s_help)\n\n",
 +        cls,cls);
 +        
 +  /* Main loop */
 +  fprintf(out,"#####################################################\n");
 +  fprintf(out,"tk     = Tk()\n");
 +  fprintf(out,"my%s = %s(tk)\n",cls,cls);
 +  fprintf(out,"tk.mainloop()\n");
 +}
 +
 +void write_man(FILE *out,const char *mantp,
 +             const char *program,
 +             int nldesc,const char **desc,
 +             int nfile,t_filenm *fnm,
 +             int npargs,t_pargs *pa,
 +             int nbug,const char **bugs,
 +             gmx_bool bHidden)
 +{
 +  const char *pr;
 +  int     i,npar;
 +  t_pargs *par;
 + 
 +  t_linkdata *links;
 +  
 +  links=init_linkdata();
 +  
 +  /* Don't write hidden options to completions, it just
 +   * makes the options more complicated for normal users
 +   */
 +
 +  if (bHidden) {
 +    npar=npargs;
 +    par=pa;
 +  }
 +  else {
 +    snew(par,npargs);
 +    npar=0;
 +    for(i=0;i<npargs;i++)
 +      if (!is_hidden(&pa[i])) {
 +      par[npar]=pa[i];
 +      npar++;
 +      }
 +  }
 +  
 +  if ((pr=strrchr(program,DIR_SEPARATOR)) == NULL)
 +    pr=program;
 +  else
 +    pr+=1;
 +  if (strcmp(mantp,"tex")==0)
 +    write_texman(out,pr,nldesc,desc,nfile,fnm,npar,par,nbug,bugs,links);
 +  if (strcmp(mantp,"nroff")==0)
 +    write_nroffman(out,pr,nldesc,desc,nfile,fnm,npar,par,nbug,bugs,links);
 +  if (strcmp(mantp,"ascii")==0)
 +    write_ttyman(out,pr,nldesc,desc,nfile,fnm,npar,par,nbug,bugs,TRUE,links);
 +  if (strcmp(mantp,"wiki")==0)
 +    write_wikiman(out,pr,nldesc,desc,nfile,fnm,npar,par,nbug,bugs,TRUE,links);
 +  if (strcmp(mantp,"help")==0)
 +    write_ttyman(out,pr,nldesc,desc,nfile,fnm,npar,par,nbug,bugs,FALSE,links);
 +  if (strcmp(mantp,"html")==0)
 +    write_htmlman(out,pr,nldesc,desc,nfile,fnm,npar,par,nbug,bugs,links);
 +  if (strcmp(mantp,"py")==0)
 +    write_py(out,pr,nldesc,desc,nfile,fnm,npar,par,nbug,bugs,links);
 +  if (strcmp(mantp,"xml")==0)
 +    write_xmlman(out,pr,nldesc,desc,nfile,fnm,npargs,pa,nbug,bugs,links);     
 +  if (strcmp(mantp,"completion-zsh")==0)
 +    write_zshcompl(out,nfile,fnm,npar,par);
 +  if (strcmp(mantp,"completion-bash")==0)
 +    write_bashcompl(out,nfile,fnm,npar,par);
 +  if (strcmp(mantp,"completion-csh")==0)
 +    write_cshcompl(out,nfile,fnm,npar,par);
 +
 +  if (!bHidden)
 +    sfree(par);
 +
 +  finish_linkdata(links);
 +}
 +
 +const char *get_arg_desc(int type) {
 +   static const char *argtp[etNR] = {
 +     "int", "step", "real", "time", "string", "bool", "vector", "enum"
 +   };
 +   return argtp[type];
 +}
Simple merge
Simple merge
index a92e545a4cd53aebc410591c39860c124de37876,0000000000000000000000000000000000000000..c460cd6652c7e964c08ab261f4edf643c408f6c4
mode 100644,000000..100644
--- /dev/null
@@@ -1,737 -1,0 +1,737 @@@
-   return search_atom(name,i,pdba->nr,pdba->atom,pdba->atomname,
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +
 +#include <time.h>
 +#include <ctype.h>
 +#include "sysstuff.h"
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "copyrite.h"
 +#include "string2.h"
 +#include "confio.h"
 +#include "symtab.h"
 +#include "vec.h"
 +#include "statutil.h"
 +#include "futil.h"
 +#include "gmx_fatal.h"
 +#include "physics.h"
 +#include "calch.h"
 +#include "genhydro.h"
 +#include "h_db.h"
 +#include "ter_db.h"
 +#include "resall.h"
 +#include "pgutil.h"
 +#include "network.h"
 +#include "macros.h"
 +
 +static void copy_atom(t_atoms *atoms1,int a1,t_atoms *atoms2,int a2)
 +{
 +  atoms2->atom[a2] = atoms1->atom[a1];
 +  snew(atoms2->atomname[a2],1);
 +  *atoms2->atomname[a2]=strdup(*atoms1->atomname[a1]);
 +}
 +
 +static atom_id pdbasearch_atom(const char *name,int resind,t_atoms *pdba,
 +                             const char *searchtype,gmx_bool bAllowMissing)
 +{
 +  int  i;
 +  
 +  for(i=0; (i<pdba->nr) && (pdba->atom[i].resind != resind); i++)
 +    ;
 +    
++  return search_atom(name,i,pdba,
 +                   searchtype,bAllowMissing);
 +}
 +
 +static void hacksearch_atom(int *ii, int *jj, char *name,
 +                          int nab[], t_hack *ab[], 
 +                          int resind, t_atoms *pdba)
 +{
 +  int  i,j;
 +  
 +  *ii=-1;
 +  if (name[0] == '-') {
 +    name++;
 +    resind--;
 +  }
 +  for(i=0; (i<pdba->nr) && (pdba->atom[i].resind != resind); i++)
 +    ;
 +  for(   ; (i<pdba->nr) && (pdba->atom[i].resind == resind) && (*ii<0); i++)
 +    for(j=0; (j < nab[i]) && (*ii<0); j++)
 +      if (ab[i][j].nname && strcmp(name,ab[i][j].nname) == 0) {
 +      *ii=i;
 +      *jj=j;
 +      }
 +  
 +  return;
 +}
 +
 +void dump_ab(FILE *out,int natom,int nab[], t_hack *ab[], gmx_bool bHeader)
 +{
 +  int i,j;
 +  
 +#define SS(s) (s)?(s):"-"
 +  /* dump ab */
 +  if (bHeader)
 +    fprintf(out,"ADDBLOCK (t_hack) natom=%d\n"
 +          "%4s %2s %-4s %-4s %2s %-4s %-4s %-4s %-4s %1s %s\n",
 +          natom,"atom","nr","old","new","tp","ai","aj","ak","al","a","x");
 +  for(i=0; i<natom; i++)
 +    for(j=0; j<nab[i]; j++)
 +      fprintf(out,"%4d %2d %-4s %-4s %2d %-4s %-4s %-4s %-4s %s %g %g %g\n",
 +            i+1, ab[i][j].nr, SS(ab[i][j].oname), SS(ab[i][j].nname),
 +            ab[i][j].tp, 
 +            SS(ab[i][j].AI), SS(ab[i][j].AJ),
 +            SS(ab[i][j].AK), SS(ab[i][j].AL),
 +            ab[i][j].atom?"+":"", 
 +            ab[i][j].newx[XX], ab[i][j].newx[YY], ab[i][j].newx[ZZ]);
 +#undef SS
 +}
 +
 +static t_hackblock *get_hackblocks(t_atoms *pdba, int nah, t_hackblock ah[],
 +                                 int nterpairs,
 +                                 t_hackblock **ntdb, t_hackblock **ctdb, 
 +                                 int *rN, int *rC)
 +{
 +  int i,rnr;
 +  t_hackblock *hb,*ahptr;
 +
 +  /* make space */
 +  snew(hb,pdba->nres);
 +  /* first the termini */
 +  for(i=0; i<nterpairs; i++) {
 +    if (ntdb[i] != NULL) {
 +      copy_t_hackblock(ntdb[i], &hb[rN[i]]);
 +    }
 +    if (ctdb[i] != NULL) {
 +      merge_t_hackblock(ctdb[i], &hb[rC[i]]);
 +    }
 +  }
 +  /* then the whole hdb */
 +  for(rnr=0; rnr < pdba->nres; rnr++) {
 +    ahptr=search_h_db(nah,ah,*pdba->resinfo[rnr].rtp);
 +    if ( ahptr ) {
 +      if (hb[rnr].name==NULL) {
 +          hb[rnr].name=strdup(ahptr->name);
 +      }
 +      merge_hacks(ahptr, &hb[rnr]);
 +    }
 +  }
 +  return hb;
 +}
 +
 +static void expand_hackblocks_one(t_hackblock *hbr, char *atomname, 
 +                                int *nabi, t_hack **abi, gmx_bool bN, gmx_bool bC)
 +{
 +  int j, k, l, d;
 +  gmx_bool bIgnore;
 +  
 +  /* we'll recursively add atoms to atoms */
 +  for(j=0; j < hbr->nhack; j++) {
 +    /* first check if we're in the N- or C-terminus, then we should ignore 
 +       all hacks involving atoms from resp. previous or next residue
 +       (i.e. which name begins with '-' (N) or '+' (C) */
 +    bIgnore = FALSE;
 +    if ( bN ) { /* N-terminus: ignore '-' */
 +      for(k=0; k<4 && hbr->hack[j].a[k] && !bIgnore; k++) {
 +          bIgnore = hbr->hack[j].a[k][0]=='-';
 +      }
 +    }
 +    if ( bC ) { /* C-terminus: ignore '+' */
 +      for(k=0; k<4 && hbr->hack[j].a[k] && !bIgnore; k++) {
 +          bIgnore = hbr->hack[j].a[k][0]=='+';
 +      }
 +    }
 +    /* must be either hdb entry (tp>0) or add from tdb (oname==NULL)
 +       and first control aton (AI) matches this atom or
 +       delete/replace from tdb (oname!=NULL) and oname matches this atom */
 +    if (debug) {
 +        fprintf(debug," %s",hbr->hack[j].oname?hbr->hack[j].oname:hbr->hack[j].AI);
 +    }
 +
 +    if ( !bIgnore &&
 +         ( ( ( hbr->hack[j].tp > 0 || hbr->hack[j].oname==NULL ) &&
 +             strcmp(atomname, hbr->hack[j].AI) == 0 ) ||
 +           ( hbr->hack[j].oname!=NULL &&
 +             strcmp(atomname, hbr->hack[j].oname) == 0) ) ) {
 +      /* now expand all hacks for this atom */
 +      if (debug) {
 +         fprintf(debug," +%dh",hbr->hack[j].nr);
 +      }
 +      srenew(*abi,*nabi + hbr->hack[j].nr);
 +      for(k=0; k < hbr->hack[j].nr; k++) {
 +          copy_t_hack(&hbr->hack[j], &(*abi)[*nabi + k]);
 +          (*abi)[*nabi + k].bXSet = FALSE;
 +          /* if we're adding (oname==NULL) and don't have a new name (nname) 
 +          yet, build it from atomname */
 +          if ( (*abi)[*nabi + k].nname==NULL ) {
 +              if ( (*abi)[*nabi + k].oname==NULL ) {
 +                  (*abi)[*nabi + k].nname=strdup(atomname);
 +                  (*abi)[*nabi + k].nname[0]='H';
 +              }
 +          } else {
 +              if (gmx_debug_at) {
 +                  fprintf(debug,"Hack '%s' %d, replacing nname '%s' with '%s' (old name '%s')\n",
 +                          atomname,j,
 +                          (*abi)[*nabi + k].nname,hbr->hack[j].nname,
 +                          (*abi)[*nabi + k].oname ? (*abi)[*nabi + k].oname : "");
 +              }
 +          sfree((*abi)[*nabi + k].nname);
 +          (*abi)[*nabi + k].nname=strdup(hbr->hack[j].nname);
 +          }
 +
 +          if (hbr->hack[j].tp == 10 && k == 2) {
 +              /* This is a water virtual site, not a hydrogen */
 +              /* Ugly hardcoded name hack */
 +              (*abi)[*nabi + k].nname[0] = 'M';
 +          } else if (hbr->hack[j].tp == 11 && k >= 2) {
 +              /* This is a water lone pair, not a hydrogen */
 +              /* Ugly hardcoded name hack */
 +              srenew((*abi)[*nabi + k].nname,4);
 +              (*abi)[*nabi + k].nname[0] = 'L';
 +              (*abi)[*nabi + k].nname[1] = 'P';
 +              (*abi)[*nabi + k].nname[2] = '1' + k - 2;
 +              (*abi)[*nabi + k].nname[3] = '\0';
 +          } else if ( hbr->hack[j].nr > 1 ) {
 +              /* adding more than one atom, number them */
 +              l = strlen((*abi)[*nabi + k].nname);
 +              srenew((*abi)[*nabi + k].nname, l+2);
 +              (*abi)[*nabi + k].nname[l]   = '1' + k;
 +              (*abi)[*nabi + k].nname[l+1] = '\0';
 +          }
 +      }
 +      (*nabi) += hbr->hack[j].nr;
 +      
 +      /* add hacks to atoms we've just added */
 +      if ( hbr->hack[j].tp > 0 || hbr->hack[j].oname==NULL ) {
 +            for(k=0; k < hbr->hack[j].nr; k++) {
 +              expand_hackblocks_one(hbr, (*abi)[*nabi-hbr->hack[j].nr+k].nname, 
 +                              nabi, abi, bN, bC);
 +          }
 +      }
 +    }
 +  }
 +}
 +
 +static void expand_hackblocks(t_atoms *pdba, t_hackblock hb[], 
 +                            int nab[], t_hack *ab[], 
 +                            int nterpairs, int *rN, int *rC)
 +{
 +  int i,j;
 +  gmx_bool bN,bC;
 +  
 +  for(i=0; i < pdba->nr; i++) {
 +    bN = FALSE;
 +    for(j=0; j<nterpairs && !bN; j++)
 +      bN = pdba->atom[i].resind==rN[j];
 +    bC = FALSE;
 +    for(j=0; j<nterpairs && !bC; j++)
 +      bC = pdba->atom[i].resind==rC[j];
 +
 +    /* add hacks to this atom */
 +    expand_hackblocks_one(&hb[pdba->atom[i].resind], *pdba->atomname[i], 
 +                        &nab[i], &ab[i], bN, bC);
 +  }
 +  if (debug) fprintf(debug,"\n");
 +}
 +
 +static int check_atoms_present(t_atoms *pdba, int nab[], t_hack *ab[])
 +{
 +  int i, j, k, d, rnr, nadd;
 +  
 +  nadd=0;
 +  for(i=0; i < pdba->nr; i++) {
 +    rnr = pdba->atom[i].resind;
 +    for(j=0; j<nab[i]; j++) {
 +      if ( ab[i][j].oname==NULL ) { 
 +      /* we're adding */
 +      if (ab[i][j].nname == NULL)
 +        gmx_incons("ab[i][j].nname not allocated");
 +      /* check if the atom is already present */
 +      k = pdbasearch_atom(ab[i][j].nname, rnr, pdba, "check", TRUE);
 +      if ( k != -1 ) {
 +        /* We found the added atom. */
 +        ab[i][j].bAlreadyPresent = TRUE;
 +        if (debug) {
 +          fprintf(debug,"Atom '%s' in residue '%s' %d is already present\n",
 +                  ab[i][j].nname,
 +                  *pdba->resinfo[rnr].name,pdba->resinfo[rnr].nr);
 +        }
 +      } else {
 +        ab[i][j].bAlreadyPresent = FALSE;
 +        /* count how many atoms we'll add */
 +        nadd++;
 +      }
 +      } else if ( ab[i][j].nname==NULL ) {
 +      /* we're deleting */
 +      nadd--;
 +      }
 +    }
 +  }
 +  
 +  return nadd;
 +}
 +
 +static void calc_all_pos(t_atoms *pdba, rvec x[], int nab[], t_hack *ab[],
 +                       gmx_bool bCheckMissing)
 +{
 +  int i, j, ii, jj, m, ia, d, rnr,l=0;
 +#define MAXH 4
 +  rvec xa[4];     /* control atoms for calc_h_pos */
 +  rvec xh[MAXH]; /* hydrogen positions from calc_h_pos */
 +  gmx_bool bFoundAll;
 +
 +  jj = 0;
 +  
 +  for(i=0; i < pdba->nr; i++) {
 +    rnr   = pdba->atom[i].resind;
 +    for(j=0; j < nab[i]; j+=ab[i][j].nr) {
 +      /* check if we're adding: */
 +      if (ab[i][j].oname==NULL && ab[i][j].tp > 0) {
 +          bFoundAll = TRUE;
 +          for(m=0; (m<ab[i][j].nctl && bFoundAll); m++) {
 +              ia = pdbasearch_atom(ab[i][j].a[m], rnr, pdba,
 +                             bCheckMissing ? "atom" : "check",
 +                             !bCheckMissing);
 +          if (ia < 0) {
 +              /* not found in original atoms, might still be in t_hack (ab) */
 +              hacksearch_atom(&ii, &jj, ab[i][j].a[m], nab, ab, rnr, pdba);
 +              if (ii >= 0) {
 +                  copy_rvec(ab[ii][jj].newx, xa[m]);
 +              } else {
 +                  bFoundAll = FALSE;
 +                  if (bCheckMissing) {
 +                          gmx_fatal(FARGS,"Atom %s not found in residue %s %d"
 +                                  ", rtp entry %s"
 +                                  " while adding hydrogens",
 +                                  ab[i][j].a[m],
 +                                  *pdba->resinfo[rnr].name,
 +                                  pdba->resinfo[rnr].nr,
 +                                  *pdba->resinfo[rnr].rtp);
 +                  }
 +              }
 +        } else {
 +          copy_rvec(x[ia], xa[m]);
 +      }
 +      }
 +      if (bFoundAll) {
 +        for(m=0; (m<MAXH); m++)
 +          for(d=0; d<DIM; d++)
 +            if (m<ab[i][j].nr)
 +              xh[m][d] = 0;
 +            else
 +              xh[m][d] = NOTSET;
 +        calc_h_pos(ab[i][j].tp, xa, xh, &l);
 +        for(m=0; m<ab[i][j].nr; m++) {
 +          copy_rvec(xh[m],ab[i][j+m].newx);
 +          ab[i][j+m].bXSet = TRUE;
 +        }
 +      }
 +      }
 +    }
 +  }
 +}
 +
 +static void free_ab(int natoms,int *nab,t_hack **ab)
 +{
 +  int i;
 +
 +  for(i=0; i<natoms; i++) {
 +    free_t_hack(nab[i], &ab[i]);
 +  }
 +  sfree(nab);
 +  sfree(ab);
 +}
 +
 +static int add_h_low(t_atoms **pdbaptr, rvec *xptr[], 
 +                   int nah, t_hackblock ah[],
 +                   int nterpairs, t_hackblock **ntdb, t_hackblock **ctdb, 
 +                   int *rN, int *rC, gmx_bool bCheckMissing,
 +                   int **nabptr, t_hack ***abptr,
 +                   gmx_bool bUpdate_pdba, gmx_bool bKeep_old_pdba)
 +{
 +  t_atoms     *newpdba=NULL,*pdba=NULL;
 +  int         nadd;
 +  int         i,newi,j,d,natoms,nalreadypresent;
 +  int         *nab=NULL;
 +  t_hack      **ab=NULL;
 +  t_hackblock *hb;
 +  rvec        *xn;
 +  gmx_bool        bKeep_ab;
 +  
 +  /* set flags for adding hydrogens (according to hdb) */
 +  pdba=*pdbaptr;
 +  natoms=pdba->nr;
 +  
 +  if (nabptr && abptr) {
 +    /* the first time these will be pointers to NULL, but we will
 +       return in them the completed arrays, which we will get back
 +       the second time */
 +    nab = *nabptr;
 +    ab  = *abptr;
 +    bKeep_ab=TRUE;
 +    if (debug) fprintf(debug,"pointer to ab found\n");
 +  } else {
 +    bKeep_ab=FALSE;
 +  }
 +  
 +  if (nab && ab) {
 +    /* WOW, everything was already figured out */
 +    bUpdate_pdba = FALSE;
 +    if (debug) fprintf(debug,"pointer to non-null ab found\n");
 +  } else {
 +    /* We'll have to do all the hard work */
 +    bUpdate_pdba = TRUE;
 +    /* first get all the hackblocks for each residue: */
 +    hb = get_hackblocks(pdba, nah, ah, nterpairs, ntdb, ctdb, rN, rC);
 +    if (debug) dump_hb(debug, pdba->nres, hb);
 +    
 +    /* expand the hackblocks to atom level */
 +    snew(nab,natoms);
 +    snew(ab,natoms);
 +    expand_hackblocks(pdba, hb, nab, ab, nterpairs, rN, rC);
 +    free_t_hackblock(pdba->nres, &hb);
 +  }
 +  
 +  if (debug) { 
 +    fprintf(debug,"before calc_all_pos\n");
 +    dump_ab(debug, natoms, nab, ab, TRUE);
 +  }
 +  
 +  /* Now calc the positions */
 +  calc_all_pos(pdba, *xptr, nab, ab, bCheckMissing);
 +
 +  if (debug) { 
 +    fprintf(debug,"after calc_all_pos\n");
 +    dump_ab(debug, natoms, nab, ab, TRUE);
 +  }
 +  
 +  if (bUpdate_pdba) {
 +    /* we don't have to add atoms that are already present in pdba,
 +       so we will remove them from the ab (t_hack) */
 +    nadd = check_atoms_present(pdba, nab, ab);
 +    if (debug) {
 +      fprintf(debug, "removed add hacks that were already in pdba:\n");
 +      dump_ab(debug, natoms, nab, ab, TRUE);
 +      fprintf(debug, "will be adding %d atoms\n",nadd);
 +    }
 +    
 +    /* Copy old atoms, making space for new ones */
 +    snew(newpdba,1);
 +    init_t_atoms(newpdba,natoms+nadd,FALSE);
 +    newpdba->nres    = pdba->nres;   
 +    sfree(newpdba->resinfo);
 +    newpdba->resinfo = pdba->resinfo;
 +  } else {
 +    nadd = 0;
 +  }
 +  if (debug) fprintf(debug,"snew xn for %d old + %d new atoms %d total)\n",
 +                   natoms, nadd, natoms+nadd);
 +
 +  if (nadd == 0) {
 +    /* There is nothing to do: return now */
 +    if (!bKeep_ab) {
 +      free_ab(natoms,nab,ab);
 +    }
 +
 +    return natoms;
 +  }
 +
 +  snew(xn,natoms+nadd);
 +  newi=0;
 +  for(i=0; (i<natoms); i++) {
 +    /* check if this atom wasn't scheduled for deletion */
 +    if ( nab[i]==0 || (ab[i][0].nname != NULL) ) {
 +      if (newi >= natoms+nadd) {
 +      /*gmx_fatal(FARGS,"Not enough space for adding atoms");*/
 +      nadd+=10;
 +      srenew(xn,natoms+nadd);
 +      if (bUpdate_pdba) {
 +        srenew(newpdba->atom,natoms+nadd);
 +        srenew(newpdba->atomname,natoms+nadd);
 +      }
 +      debug_gmx();
 +      }
 +      if (debug) fprintf(debug,"(%3d) %3d %4s %4s%3d %3d",
 +                       i+1, newi+1, *pdba->atomname[i],
 +                       *pdba->resinfo[pdba->atom[i].resind].name,
 +                       pdba->resinfo[pdba->atom[i].resind].nr, nab[i]);
 +      if (bUpdate_pdba)
 +      copy_atom(pdba,i, newpdba,newi);
 +      copy_rvec((*xptr)[i],xn[newi]);
 +      /* process the hacks for this atom */
 +      nalreadypresent = 0;
 +      for(j=0; j<nab[i]; j++) {
 +      if ( ab[i][j].oname==NULL ) { /* add */
 +        newi++;
 +        if (newi >= natoms+nadd) {
 +          /* gmx_fatal(FARGS,"Not enough space for adding atoms");*/
 +          nadd+=10;
 +          srenew(xn,natoms+nadd);
 +          if (bUpdate_pdba) {
 +            srenew(newpdba->atom,natoms+nadd);
 +            srenew(newpdba->atomname,natoms+nadd);
 +          }
 +          debug_gmx();
 +        }
 +        if (bUpdate_pdba) {
 +          newpdba->atom[newi].resind=pdba->atom[i].resind;
 +        }
 +        if (debug) fprintf(debug," + %d",newi+1);
 +      }
 +      if (ab[i][j].nname != NULL &&
 +          (ab[i][j].oname == NULL ||
 +           strcmp(ab[i][j].oname,*newpdba->atomname[newi]) == 0)) {
 +        /* add or replace */
 +        if (ab[i][j].oname == NULL && ab[i][j].bAlreadyPresent) {
 +          /* This atom is already present, copy it from the input. */
 +          nalreadypresent++;
 +          if (bUpdate_pdba) {
 +            copy_atom(pdba,i+nalreadypresent, newpdba,newi);
 +          }
 +          copy_rvec((*xptr)[i+nalreadypresent],xn[newi]);
 +        } else {
 +        if (bUpdate_pdba) {
 +          if (gmx_debug_at) {
 +            fprintf(debug,"Replacing %d '%s' with (old name '%s') %s\n",
 +                    newi,
 +                    (newpdba->atomname[newi] && *newpdba->atomname[newi]) ? *newpdba->atomname[newi] : "",
 +                    ab[i][j].oname ? ab[i][j].oname : "",
 +                    ab[i][j].nname);
 +          }
 +          snew(newpdba->atomname[newi],1);
 +          *newpdba->atomname[newi]=strdup(ab[i][j].nname);
 +          if ( ab[i][j].oname!=NULL && ab[i][j].atom ) { /* replace */
 +/*          newpdba->atom[newi].m    = ab[i][j].atom->m; */
 +/*          newpdba->atom[newi].q    = ab[i][j].atom->q; */
 +/*          newpdba->atom[newi].type = ab[i][j].atom->type; */
 +          }
 +        }
 +        if (ab[i][j].bXSet)
 +          copy_rvec(ab[i][j].newx, xn[newi]);
 +        }
 +        if (bUpdate_pdba && debug) 
 +          fprintf(debug," %s %g %g",*newpdba->atomname[newi],
 +                  newpdba->atom[newi].m,newpdba->atom[newi].q);
 +      }
 +      }
 +      newi++;
 +      i += nalreadypresent;
 +      if (debug) fprintf(debug,"\n");
 +    }
 +  }
 +  if (bUpdate_pdba)
 +    newpdba->nr = newi;
 +  
 +  if ( bKeep_ab ) {
 +    *nabptr=nab;
 +    *abptr=ab;
 +  } else {
 +    /* Clean up */
 +    free_ab(natoms,nab,ab);
 +  }
 +    
 +  if ( bUpdate_pdba ) {
 +    if ( !bKeep_old_pdba ) {
 +      for(i=0; i < natoms; i++) {
 +      /* Do not free the atomname string itself, it might be in symtab */
 +      /* sfree(*(pdba->atomname[i])); */
 +      /* sfree(pdba->atomname[i]); */
 +      }
 +      sfree(pdba->atomname);
 +      sfree(pdba->atom);
 +      sfree(pdba->pdbinfo);
 +      sfree(pdba);
 +    }
 +    *pdbaptr=newpdba;
 +  } else
 +    nadd = newi-natoms;
 +  
 +  sfree(*xptr);
 +  *xptr=xn;
 +  
 +  return newi;
 +}
 +
 +void deprotonate(t_atoms *atoms,rvec *x)
 +{
 +  int  i,j;
 +  
 +  j=0;
 +  for(i=0; i<atoms->nr; i++) {
 +    if ( (*atoms->atomname[i])[0] != 'H' ) {
 +      atoms->atomname[j]=atoms->atomname[i];
 +      atoms->atom[j]=atoms->atom[i];
 +      copy_rvec(x[i],x[j]);
 +      j++;
 +    }
 +  }
 +  atoms->nr=j;
 +}
 +
 +int add_h(t_atoms **pdbaptr, rvec *xptr[], 
 +        int nah, t_hackblock ah[],
 +        int nterpairs, t_hackblock **ntdb, t_hackblock **ctdb, 
 +        int *rN, int *rC, gmx_bool bAllowMissing,
 +        int **nabptr, t_hack ***abptr,
 +        gmx_bool bUpdate_pdba, gmx_bool bKeep_old_pdba)
 +{
 +  int nold,nnew,niter;
 +
 +  /* Here we loop to be able to add atoms to added atoms.
 +   * We should not check for missing atoms here.
 +   */
 +  niter = 0;
 +  nnew = 0;
 +  do {
 +    nold = nnew;
 +    nnew = add_h_low(pdbaptr,xptr,nah,ah,nterpairs,ntdb,ctdb,rN,rC,FALSE,
 +                   nabptr,abptr,bUpdate_pdba,bKeep_old_pdba);
 +    niter++;
 +    if (niter > 100) {
 +      gmx_fatal(FARGS,"More than 100 iterations of add_h. Maybe you are trying to replace an added atom (this is not supported)?");
 +    }
 +  } while (nnew > nold);
 +  
 +  if (!bAllowMissing) {
 +    /* Call add_h_low once more, now only for the missing atoms check */
 +    add_h_low(pdbaptr,xptr,nah,ah,nterpairs,ntdb,ctdb,rN,rC,TRUE,
 +            nabptr,abptr,bUpdate_pdba,bKeep_old_pdba);
 +  }
 +
 +  return nnew;
 +}
 +
 +int protonate(t_atoms **atomsptr,rvec **xptr,t_protonate *protdata)
 +{
 +#define NTERPAIRS 1
 +  t_atoms *atoms;
 +  gmx_bool    bUpdate_pdba,bKeep_old_pdba;
 +  int     nntdb,nctdb,nt,ct;
 +  int     nadd;
 +  
 +  atoms=NULL;
 +  if ( !protdata->bInit ) {
 +    if (debug) {
 +         fprintf(debug,"protonate: Initializing protdata\n");
 +    }
 +
 +    /* set forcefield to use: */
 +    strcpy(protdata->FF,"gmx2.ff");
 +
 +    /* get the databases: */
 +    protdata->nah = read_h_db(protdata->FF,&protdata->ah);
 +    open_symtab(&protdata->tab); 
 +    protdata->atype = read_atype(protdata->FF,&protdata->tab);
 +    nntdb = read_ter_db(protdata->FF,'n',&protdata->ntdb,protdata->atype);
 +    if (nntdb < 1) {
 +      gmx_fatal(FARGS,"no N-terminus database");
 +    }
 +    nctdb = read_ter_db(protdata->FF,'c',&protdata->ctdb,protdata->atype);
 +    if (nctdb < 1) {
 +      gmx_fatal(FARGS,"no C-terminus database");
 +    }
 +    
 +    /* set terminus types: -NH3+ (different for Proline) and -COO- */
 +    atoms=*atomsptr;
 +    snew(protdata->sel_ntdb, NTERPAIRS);
 +    snew(protdata->sel_ctdb, NTERPAIRS);
 +
 +    if (nntdb>=4 && nctdb>=2) {
 +        /* Yuk, yuk, hardcoded default termini selections !!! */
 +        if (strncmp(*atoms->resinfo[atoms->atom[atoms->nr-1].resind].name,"PRO",3)==0) {
 +              nt = 3;
 +        } else {
 +          nt = 1;
 +        }
 +        ct = 1;
 +    } else {
 +        nt = 0;
 +        ct = 0;
 +    }
 +    protdata->sel_ntdb[0]=&(protdata->ntdb[nt]);
 +    protdata->sel_ctdb[0]=&(protdata->ctdb[ct]);
 +  
 +    /* set terminal residue numbers: */
 +    snew(protdata->rN, NTERPAIRS);
 +    snew(protdata->rC, NTERPAIRS);
 +    protdata->rN[0]=0;
 +    protdata->rC[0]=atoms->atom[atoms->nr-1].resind;
 +    
 +    /* keep unprotonated topology: */
 +    protdata->upatoms = atoms;
 +    /* we don't have these yet: */
 +    protdata->patoms = NULL;
 +    bUpdate_pdba=TRUE;
 +    bKeep_old_pdba=TRUE;
 +    
 +    /* clear hackblocks: */
 +    protdata->nab=NULL;
 +    protdata->ab=NULL;
 +    
 +    /* set flag to show we're initialized: */
 +    protdata->bInit=TRUE;
 +  } else {
 +    if (debug) {
 +        fprintf(debug,"protonate: using available protdata\n");
 +    }
 +    /* add_h will need the unprotonated topology again: */
 +    atoms = protdata->upatoms;
 +    bUpdate_pdba=FALSE;
 +    bKeep_old_pdba=FALSE;
 +  }
 +  
 +  /* now protonate */
 +  nadd = add_h(&atoms, xptr, protdata->nah, protdata->ah,
 +             NTERPAIRS, protdata->sel_ntdb, protdata->sel_ctdb,
 +             protdata->rN, protdata->rC, TRUE,
 +             &protdata->nab, &protdata->ab, bUpdate_pdba, bKeep_old_pdba);
 +  if ( ! protdata->patoms ) {
 +    /* store protonated topology */
 +    protdata->patoms = atoms;
 +  }
 +  *atomsptr = protdata->patoms;
 +  if (debug) {
 +    fprintf(debug,"natoms: %d -> %d (nadd=%d)\n",
 +                  protdata->upatoms->nr, protdata->patoms->nr, nadd);
 +  }
 +  return nadd;
 +#undef NTERPAIRS  
 +}
 +
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
index 593fc531081b727fb6268175724af2867bd48d02,0000000000000000000000000000000000000000..397bff235ce6d0343c7d8b9415e7f495f502819a
mode 100644,000000..100644
--- /dev/null
@@@ -1,7 -1,0 +1,14 @@@
- # includes: Nothing to build, just installation
++# includes: Nothing to build, just configuration and installation
++configure_file(${CMAKE_CURRENT_SOURCE_DIR}/gmx_header_config.h.cmakein ${CMAKE_CURRENT_BINARY_DIR}/gmx_header_config.h)
++install(FILES ${CMAKE_CURRENT_BINARY_DIR}/gmx_header_config.h
++  DESTINATION ${INCL_INSTALL_DIR}/gromacs/legacyheaders
++  COMPONENT development
++)
++
 +install(DIRECTORY . DESTINATION ${INCL_INSTALL_DIR}/gromacs/legacyheaders
 +  COMPONENT development
 +  PATTERN "Makefile*" EXCLUDE
 +  PATTERN "CMake*" EXCLUDE
 +  PATTERN "cmake*" EXCLUDE
++  PATTERN "*.cmakein" EXCLUDE
 +)
Simple merge
Simple merge
Simple merge
index 0000000000000000000000000000000000000000,a2f98bf13ae657f443db07dc30427ab40dbcb91b..a2f98bf13ae657f443db07dc30427ab40dbcb91b
mode 000000,100644..100644
--- /dev/null
index 6b4911cb28cf3f68b05e548f4e123ba981c4876a,0000000000000000000000000000000000000000..f04de55ffa6510896d9dd437f56c9b7bc4d0fc6d
mode 100644,000000..100644
--- /dev/null
@@@ -1,434 -1,0 +1,435 @@@
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gromacs Runs On Most of All Computer Systems
 + */
 +
 +#ifndef _mdrun_h
 +#define _mdrun_h
 +
 +#include <stdio.h>
 +#include <time.h>
 +#include "typedefs.h"
 +#include "network.h"
 +#include "tgroup.h"
 +#include "filenm.h"
 +#include "mshift.h"
 +#include "force.h"
 +#include "edsam.h"
 +#include "mdebin.h"
 +#include "vcm.h"
 +#include "vsite.h"
 +#include "pull.h"
 +#include "update.h"
 +#include "membed.h"
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +
 +#define MD_POLARISE       (1<<2)
 +#define MD_IONIZE         (1<<3)
 +#define MD_RERUN          (1<<4)
 +#define MD_RERUN_VSITE    (1<<5)
 +#define MD_FFSCAN         (1<<6)
 +#define MD_SEPPOT         (1<<7)
 +#define MD_PARTDEC        (1<<9)
 +#define MD_DDBONDCHECK    (1<<10)
 +#define MD_DDBONDCOMM     (1<<11)
 +#define MD_CONFOUT        (1<<12)
 +#define MD_REPRODUCIBLE   (1<<13)
 +#define MD_READ_RNG       (1<<14)
 +#define MD_APPENDFILES    (1<<15)
++#define MD_APPENDFILESSET (1<<21)
 +#define MD_KEEPANDNUMCPT  (1<<16)
 +#define MD_READ_EKIN      (1<<17)
 +#define MD_STARTFROMCPT   (1<<18)
 +#define MD_RESETCOUNTERSHALFWAY (1<<19)
 +
 +/* Define a number of flags to better control the information
 + * passed to compute_globals in md.c and global_stat.
 + */
 +
 +/* We are rerunning the simulation */
 +#define CGLO_RERUNMD        (1<<1)
 +/* we are computing the kinetic energy from average velocities */
 +#define CGLO_EKINAVEVEL     (1<<2)
 +/* we are removing the center of mass momenta */
 +#define CGLO_STOPCM         (1<<3)
 +/* bGStat is defined in do_md */
 +#define CGLO_GSTAT          (1<<4)
 +/* Sum the energy terms in global computation */
 +#define CGLO_ENERGY         (1<<6)
 +/* Sum the kinetic energy terms in global computation */
 +#define CGLO_TEMPERATURE    (1<<7)
 +/* Sum the kinetic energy terms in global computation */
 +#define CGLO_PRESSURE       (1<<8)
 +/* Sum the constraint term in global computation */
 +#define CGLO_CONSTRAINT     (1<<9)
 +/* we are using an integrator that requires iteration over some steps - currently not used*/
 +#define CGLO_ITERATE        (1<<10)
 +/* it is the first time we are iterating (or, only once through is required */
 +#define CGLO_FIRSTITERATE   (1<<11)
 +/* Reading ekin from the trajectory */
 +#define CGLO_READEKIN       (1<<12)
 +/* we need to reset the ekin rescaling factor here */
 +#define CGLO_SCALEEKIN      (1<<13)
 +  
 +enum {
 +  ddnoSEL, ddnoINTERLEAVE, ddnoPP_PME, ddnoCARTESIAN, ddnoNR
 +};
 +
 +typedef struct {
 +  double real;
 +#ifdef GMX_CRAY_XT3
 +  double proc;
 +#else
 +  clock_t proc;
 +#endif
 +  double realtime;
 +  double proctime;
 +  double time_per_step;
 +  double last;
 +  gmx_large_int_t nsteps_done;
 +} gmx_runtime_t;
 +
 +typedef struct {
 +  t_fileio *fp_trn;
 +  t_fileio *fp_xtc;
 +  int  xtc_prec;
 +  ener_file_t fp_ene;
 +  const char *fn_cpt;
 +  gmx_bool bKeepAndNumCPT;
 +  int  eIntegrator;
 +  int  simulation_part;
 +  FILE *fp_dhdl;
 +  FILE *fp_field;
 +} gmx_mdoutf_t;
 +
 +/* Variables for temporary use with the deform option,
 + * used in runner.c and md.c.
 + * (These variables should be stored in the tpx file.)
 + */
 +extern gmx_large_int_t     deform_init_init_step_tpx;
 +extern matrix              deform_init_box_tpx;
 +#ifdef GMX_THREAD_MPI
 +extern tMPI_Thread_mutex_t deform_init_box_mutex;
 +
 +/* The minimum number of atoms per thread. With fewer atoms than this,
 + * the number of threads will get lowered.
 + */
 +#define MIN_ATOMS_PER_THREAD    90
 +#endif
 +
 +
 +typedef double gmx_integrator_t(FILE *log,t_commrec *cr,
 +                              int nfile,const t_filenm fnm[],
 +                              const output_env_t oenv, gmx_bool bVerbose,
 +                                gmx_bool bCompact, int nstglobalcomm,
 +                              gmx_vsite_t *vsite,gmx_constr_t constr,
 +                              int stepout,
 +                              t_inputrec *inputrec,
 +                              gmx_mtop_t *mtop,t_fcdata *fcd,
 +                              t_state *state,
 +                              t_mdatoms *mdatoms,
 +                              t_nrnb *nrnb,gmx_wallcycle_t wcycle,
 +                              gmx_edsam_t ed, 
 +                              t_forcerec *fr,
 +                              int repl_ex_nst,int repl_ex_seed,
 +                                gmx_membed_t *membed,
 +                              real cpt_period,real max_hours,
 +                              const char *deviceOptions,
 +                              unsigned long Flags,
 +                              gmx_runtime_t *runtime);
 +
 +typedef struct gmx_global_stat *gmx_global_stat_t;
 +
 +/* ROUTINES from md.c */
 +
 +gmx_integrator_t do_md;
 +
 +gmx_integrator_t do_md_openmm;
 +
 +
 +
 +/* ROUTINES from minimize.c */
 +
 +gmx_integrator_t do_steep;
 +/* Do steepest descents EM */
 +
 +gmx_integrator_t do_cg;
 +/* Do conjugate gradient EM */
 +
 +gmx_integrator_t do_lbfgs;
 +/* Do conjugate gradient L-BFGS */
 +
 +gmx_integrator_t do_nm;
 +/* Do normal mode analysis */
 +
 +/* ROUTINES from tpi.c */
 +
 +gmx_integrator_t do_tpi;
 +/* Do test particle insertion */
 +
 +
 +/* ROUTINES from md_support.c */
 +
 +/* return the number of steps between global communcations */
 +int check_nstglobalcomm(FILE *fplog,t_commrec *cr,
 +                        int nstglobalcomm,t_inputrec *ir);
 +
 +/* check whether an 'nst'-style parameter p is a multiple of nst, and
 +   set it to be one if not, with a warning. */
 +void check_nst_param(FILE *fplog,t_commrec *cr,
 +                     const char *desc_nst,int nst,
 +                     const char *desc_p,int *p);
 +
 +/* check which of the multisim simulations has the shortest number of
 +   steps and return that number of nsteps */
 +gmx_large_int_t get_multisim_nsteps(const t_commrec *cr,
 +                                    gmx_large_int_t nsteps);
 +
 +void rerun_parallel_comm(t_commrec *cr,t_trxframe *fr,
 +                         gmx_bool *bNotLastFrame);
 +
 +/* get the conserved energy associated with the ensemble type*/
 +real compute_conserved_from_auxiliary(t_inputrec *ir, t_state *state,           
 +                                      t_extmass *MassQ);
 +
 +/* reset all cycle and time counters. */
 +void reset_all_counters(FILE *fplog,t_commrec *cr,
 +                        gmx_large_int_t step,
 +                        gmx_large_int_t *step_rel,t_inputrec *ir,
 +                        gmx_wallcycle_t wcycle,t_nrnb *nrnb,
 +                        gmx_runtime_t *runtime);
 +
 +
 +
 +/* ROUTINES from sim_util.c */
 +void do_pbc_first(FILE *log,matrix box,t_forcerec *fr,
 +                       t_graph *graph,rvec x[]);
 +
 +void do_pbc_first_mtop(FILE *fplog,int ePBC,matrix box,
 +                            gmx_mtop_t *mtop,rvec x[]);
 +
 +void do_pbc_mtop(FILE *fplog,int ePBC,matrix box,
 +                      gmx_mtop_t *mtop,rvec x[]);
 +
 +
 +                   
 +/* ROUTINES from stat.c */
 +gmx_global_stat_t global_stat_init(t_inputrec *ir);
 +
 +void global_stat_destroy(gmx_global_stat_t gs);
 +
 +void global_stat(FILE *log,gmx_global_stat_t gs,
 +                      t_commrec *cr,gmx_enerdata_t *enerd,
 +                      tensor fvir,tensor svir,rvec mu_tot,
 +                      t_inputrec *inputrec,
 +                      gmx_ekindata_t *ekind,
 +                      gmx_constr_t constr,t_vcm *vcm,
 +                      int nsig,real *sig,
 +                      gmx_mtop_t *top_global, t_state *state_local, 
 +                      gmx_bool bSumEkinhOld, int flags);
 +/* Communicate statistics over cr->mpi_comm_mysim */
 +
 +gmx_mdoutf_t *init_mdoutf(int nfile,const t_filenm fnm[],
 +                               int mdrun_flags,
 +                               const t_commrec *cr,const t_inputrec *ir,
 +                               const output_env_t oenv);
 +/* Returns a pointer to a data structure with all output file pointers
 + * and names required by mdrun.
 + */
 +
 +void done_mdoutf(gmx_mdoutf_t *of);
 +/* Close all open output files and free the of pointer */
 +
 +#define MDOF_X   (1<<0)
 +#define MDOF_V   (1<<1)
 +#define MDOF_F   (1<<2)
 +#define MDOF_XTC (1<<3)
 +#define MDOF_CPT (1<<4)
 +
 +void write_traj(FILE *fplog,t_commrec *cr,
 +                     gmx_mdoutf_t *of,
 +                     int mdof_flags,
 +                     gmx_mtop_t *top_global,
 +                     gmx_large_int_t step,double t,
 +                     t_state *state_local,t_state *state_global,
 +                     rvec *f_local,rvec *f_global,
 +                     int *n_xtc,rvec **x_xtc);
 +/* Routine that writes frames to trn, xtc and/or checkpoint.
 + * What is written is determined by the mdof_flags defined above.
 + * Data is collected to the master node only when necessary.
 + */
 +
 +int do_per_step(gmx_large_int_t step,gmx_large_int_t nstep);
 +/* Return TRUE if io should be done */
 +
 +int do_any_io(int step, t_inputrec *ir);
 +
 +/* ROUTINES from sim_util.c */
 +
 +double gmx_gettime();
 +
 +void print_time(FILE *out, gmx_runtime_t *runtime,
 +                       gmx_large_int_t step,t_inputrec *ir, t_commrec *cr);
 +
 +void runtime_start(gmx_runtime_t *runtime);
 +
 +void runtime_end(gmx_runtime_t *runtime);
 +
 +void runtime_upd_proc(gmx_runtime_t *runtime);
 +/* The processor time should be updated every once in a while,
 + * since on 32-bit manchines it loops after 72 minutes.
 + */
 +  
 +void print_date_and_time(FILE *log,int pid,const char *title,
 +                              const gmx_runtime_t *runtime);
 +  
 +void nstop_cm(FILE *log,t_commrec *cr,
 +                   int start,int nr_atoms,real mass[],rvec x[],rvec v[]);
 +
 +void finish_run(FILE *log,t_commrec *cr,const char *confout,
 +                     t_inputrec *inputrec,
 +                     t_nrnb nrnb[],gmx_wallcycle_t wcycle,
 +                     gmx_runtime_t *runtime,
 +                     gmx_bool bWriteStat);
 +
 +void calc_enervirdiff(FILE *fplog,int eDispCorr,t_forcerec *fr);
 +
 +void calc_dispcorr(FILE *fplog,t_inputrec *ir,t_forcerec *fr,
 +                        gmx_large_int_t step, int natoms, 
 +                        matrix box,real lambda,tensor pres,tensor virial,
 +                        real *prescorr, real *enercorr, real *dvdlcorr);
 +
 +typedef enum
 +{
 +  LIST_SCALARS        =0001,
 +  LIST_INPUTREC       =0002,
 +  LIST_TOP    =0004,
 +  LIST_X      =0010,
 +  LIST_V      =0020,
 +  LIST_F      =0040,
 +  LIST_LOAD   =0100
 +} t_listitem;
 +
 +void check_nnodes_top(char *fn,t_topology *top);
 +/* Reset the tpr file to work with one node if necessary */
 +
 +
 +/* check the version */
 +void check_ir_old_tpx_versions(t_commrec *cr,FILE *fplog,
 +                               t_inputrec *ir,gmx_mtop_t *mtop);
 +
 +/* Allocate and initialize node-local state entries. */
 +void set_state_entries(t_state *state,const t_inputrec *ir,int nnodes);
 +
 +/* Broadcast the data for a simulation, and allocate node-specific settings
 +   such as rng generators. */
 +void init_parallel(FILE *log, t_commrec *cr, t_inputrec *inputrec,
 +                          gmx_mtop_t *mtop);
 +
 +
 +void do_constrain_first(FILE *log,gmx_constr_t constr,
 +                             t_inputrec *inputrec,t_mdatoms *md,
 +                             t_state *state,rvec *f,
 +                             t_graph *graph,t_commrec *cr,t_nrnb *nrnb,
 +                             t_forcerec *fr, gmx_localtop_t *top, tensor shake_vir); 
 +                        
 +void dynamic_load_balancing(gmx_bool bVerbose,t_commrec *cr,real capacity[],
 +                                 int dimension,t_mdatoms *md,t_topology *top,
 +                                 rvec x[],rvec v[],matrix box);
 +/* Perform load balancing, i.e. split the particles over processors
 + * based on their coordinates in the "dimension" direction.
 + */
 +
 +int multisim_min(const gmx_multisim_t *ms,int nmin,int n);
 +/* Set an appropriate value for n across the whole multi-simulation */
 +
 +int multisim_nstsimsync(const t_commrec *cr,
 +                      const t_inputrec *ir,int repl_ex_nst);
 +/* Determine the interval for inter-simulation communication */
 +                                 
 +void init_global_signals(globsig_t *gs,const t_commrec *cr,
 +                       const t_inputrec *ir,int repl_ex_nst);
 +/* Constructor for globsig_t */
 +
 +void copy_coupling_state(t_state *statea,t_state *stateb,
 +                       gmx_ekindata_t *ekinda,gmx_ekindata_t *ekindb, t_grpopts* opts);
 +/* Copy stuff from state A to state B */
 +
 +void compute_globals(FILE *fplog, gmx_global_stat_t gstat, t_commrec *cr, t_inputrec *ir,
 +                   t_forcerec *fr, gmx_ekindata_t *ekind,
 +                   t_state *state, t_state *state_global, t_mdatoms *mdatoms,
 +                   t_nrnb *nrnb, t_vcm *vcm, gmx_wallcycle_t wcycle,
 +                   gmx_enerdata_t *enerd,tensor force_vir, tensor shake_vir, tensor total_vir,
 +                   tensor pres, rvec mu_tot, gmx_constr_t constr,
 +                   globsig_t *gs,gmx_bool bInterSimGS,
 +                   matrix box, gmx_mtop_t *top_global, real *pcurr,
 +                   int natoms, gmx_bool *bSumEkinhOld, int flags);
 +/* Compute global variables during integration */
 +
 +int mdrunner(int nthreads_requested, FILE *fplog,t_commrec *cr,int nfile,
 +             const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
 +             gmx_bool bCompact, int nstglobalcomm, ivec ddxyz,int dd_node_order,
 +             real rdd, real rconstr, const char *dddlb_opt,real dlb_scale,
 +           const char *ddcsx,const char *ddcsy,const char *ddcsz,
 +           int nstepout, int resetstep, int nmultisim, int repl_ex_nst,
 +             int repl_ex_seed, real pforce,real cpt_period,real max_hours,
 +           const char *deviceOptions, unsigned long Flags);
 +/* Driver routine, that calls the different methods */
 +
 +void md_print_warning(const t_commrec *cr,FILE *fplog,const char *buf);
 +/* Print a warning message to stderr on the master node
 + * and to fplog if fplog!=NULL.
 + */
 +
 +void init_md(FILE *fplog,
 +                  t_commrec *cr,t_inputrec *ir, const output_env_t oenv, 
 +                  double *t,double *t0,
 +                  real *lambda,double *lam0,
 +                  t_nrnb *nrnb,gmx_mtop_t *mtop,
 +                  gmx_update_t *upd,
 +                  int nfile,const t_filenm fnm[],
 +                  gmx_mdoutf_t **outf,t_mdebin **mdebin,
 +                  tensor force_vir,tensor shake_vir,
 +                  rvec mu_tot,
 +                  gmx_bool *bSimAnn,t_vcm **vcm, 
 +                  t_state *state, unsigned long Flags);
 +  /* Routine in sim_util.c */
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +#endif        /* _mdrun_h */
index 3d46655865fd93a424a872dfe6d83a3d698ce488,0000000000000000000000000000000000000000..d0daa32882da0e90fe7974e7aaa0cc78cb6dad4a
mode 100644,000000..100644
--- /dev/null
@@@ -1,149 -1,0 +1,150 @@@
- #if ((defined WIN32 || defined _WIN32 || defined WIN64 || defined _WIN64) && !defined __CYGWIN__ && !defined __CYGWIN32__)
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gromacs Runs On Most of All Computer Systems
 + */
 +/*! \file
 + * \brief Generic string handling functions.
 + */
 +#ifndef _string2_h
 +#define _string2_h
 +
 +/*
 + *
 + * string2.h
 + * David van der Spoel
 + *
 + */
 +
 +
 +#include <string.h>
 +#include <stdio.h>
 +#include <stdlib.h>
 +#include <ctype.h>
 +#include <time.h>
 +#include <errno.h>
++#include "gmx_header_config.h"
 +
 +/*#include "typedefs.h"*/
 +#include "types/simple.h"
 +
 +/* Suppress Cygwin compiler warnings from using newlib version of
 + * ctype.h */
 +#ifdef GMX_CYGWIN
 +#undef isdigit
 +#undef isstring
 +#undef isspace
 +#undef isalnum
 +#undef isalpha
 +#undef ispunct
 +#undef isxdigit
 +#undef isupper
 +#undef islower
 +#undef toupper
 +#undef tolower
 +#endif
 +
 +#ifdef __cplusplus
 +extern "C" {
 +#endif
 +
 +#define CONTINUE    '\\'
 +#define COMMENTSIGN ';'
 +
 +int continuing(char *s);
 +
 +char *fgets2(char *s, int n, FILE *stream);
 +
 +void strip_comment (char *line);
 +
 +int break_line (char *line,
 +                     char *variable,
 +                     char *value);
 +
 +void upstring (char *str);
 +
 +void ltrim (char *str);
 +
 +void rtrim (char *str);
 +
 +void trim (char *str);
 +
 +void nice_header (FILE *out,const char *fn);
 +
 +int gmx_strcasecmp_min(const char *str1, const char *str2);
 +int gmx_strncasecmp_min(const char *str1, const char *str2, int n);
 +/* This funny version of strcasecmp, is not only case-insensitive,
 + * but also ignores '-' and '_'.
 + */
 +
 +int gmx_strcasecmp(const char *str1, const char *str2);
 +int gmx_strncasecmp(const char *str1, const char *str2, int n);
 +
 +char *gmx_strdup(const char *src);
 +char *gmx_strndup(const char *src, int n);
 +    
 +/** Pattern matcing with wildcards. */
 +int gmx_wcmatch(const char *pattern, const char *src);
 +
 +/** Return value for gmx_wcmatch() when there is no match. */
 +#define GMX_NO_WCMATCH 1
 +
 +
 +/* this is our implementation of strsep, the thread-safe replacement for
 +   strtok */
 +char *gmx_strsep(char **stringp, const char *delim);
 +
 +
 +char *wrap_lines(const char *buf,int line_width, int indent,
 +                      gmx_bool bIndentFirst);
 +/* wraps lines at 'linewidth', indenting all following
 + * lines by 'indent' spaces. A temp buffer is allocated and returned,
 + * which can be disposed of if no longer needed.
 + * If !bIndentFirst, then the first line will not be indented, only 
 + * the lines that are created due to wapping.
 + */
 +
 +
 +char **split(char sep,const char *str);
 +/* Implementation of the well-known Perl function split */
 +
 +gmx_large_int_t str_to_large_int_t(const char *str, char **endptr);
 +
++#ifdef GMX_NATIVE_WINDOWS
 +#define snprintf _snprintf
 +#endif
 +
 +#ifdef __cplusplus
 +}
 +#endif
 +
 +#endif        /* _string2_h */
Simple merge
Simple merge
Simple merge
index cad9c02266588702ed31174d2c9492e3fc2b8c94,0000000000000000000000000000000000000000..6217bc0bfa512819682c9566b782106ce31c47e7
mode 100644,000000..100644
--- /dev/null
@@@ -1,1415 -1,0 +1,1401 @@@
-     /* for now, we use Elr = 0, because if you want to get it right, you
-        really should be using PME. Maybe print a warning? */
-     
-     pscal   = calc_pres(ir->ePBC,nwall,box,ekinmod,vir,localpres,0.0) + pcorr;
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "update.h"
 +#include "vec.h"
 +#include "macros.h"
 +#include "physics.h"
 +#include "names.h"
 +#include "gmx_fatal.h"
 +#include "txtdump.h"
 +#include "nrnb.h"
 +#include "gmx_random.h"
 +#include "update.h"
 +#include "mdrun.h"
 +
 +#define NTROTTERPARTS 3
 +
 +/* Suzuki-Yoshida Constants, for n=3 and n=5, for symplectic integration  */
 +/* for n=1, w0 = 1 */
 +/* for n=3, w0 = w2 = 1/(2-2^-(1/3)), w1 = 1-2*w0 */
 +/* for n=5, w0 = w1 = w3 = w4 = 1/(4-4^-(1/3)), w1 = 1-4*w0 */
 +
 +#define MAX_SUZUKI_YOSHIDA_NUM 5
 +#define SUZUKI_YOSHIDA_NUM  5
 +
 +static const double sy_const_1[] = { 1. };
 +static const double sy_const_3[] = { 0.828981543588751,-0.657963087177502,0.828981543588751 };
 +static const double sy_const_5[] = { 0.2967324292201065,0.2967324292201065,-0.186929716880426,0.2967324292201065,0.2967324292201065 };
 +
 +static const double* sy_const[] = {
 +    NULL,
 +    sy_const_1,
 +    NULL,
 +    sy_const_3,
 +    NULL,
 +    sy_const_5
 +};
 +
 +/*
 +static const double sy_const[MAX_SUZUKI_YOSHIDA_NUM+1][MAX_SUZUKI_YOSHIDA_NUM+1] = {
 +    {},
 +    {1},
 +    {},
 +    {0.828981543588751,-0.657963087177502,0.828981543588751},
 +    {},
 +    {0.2967324292201065,0.2967324292201065,-0.186929716880426,0.2967324292201065,0.2967324292201065}
 +};*/
 +
 +/* these integration routines are only referenced inside this file */
 +static void NHC_trotter(t_grpopts *opts,int nvar, gmx_ekindata_t *ekind,real dtfull,
 +                        double xi[],double vxi[], double scalefac[], real *veta, t_extmass *MassQ, gmx_bool bEkinAveVel)
 +
 +{
 +    /* general routine for both barostat and thermostat nose hoover chains */
 +
 +    int   i,j,mi,mj,jmax;
 +    double Ekin,Efac,reft,kT,nd;
 +    double dt;
 +    t_grp_tcstat *tcstat;
 +    double *ivxi,*ixi;
 +    double *iQinv;
 +    double *GQ;
 +    gmx_bool bBarostat;
 +    int mstepsi, mstepsj;
 +    int ns = SUZUKI_YOSHIDA_NUM;  /* set the degree of integration in the types/state.h file */
 +    int nh = opts->nhchainlength;
 +    
 +    snew(GQ,nh);
 +    mstepsi = mstepsj = ns;
 +
 +/* if scalefac is NULL, we are doing the NHC of the barostat */
 +    
 +    bBarostat = FALSE;
 +    if (scalefac == NULL) {
 +        bBarostat = TRUE;
 +    }
 +
 +    for (i=0; i<nvar; i++) 
 +    {
 +    
 +        /* make it easier to iterate by selecting 
 +           out the sub-array that corresponds to this T group */
 +        
 +        ivxi = &vxi[i*nh];
 +        ixi = &xi[i*nh];
 +        if (bBarostat) {
 +            iQinv = &(MassQ->QPinv[i*nh]); 
 +            nd = 1; /* THIS WILL CHANGE IF NOT ISOTROPIC */
 +            reft = max(0.0,opts->ref_t[0]);
 +            Ekin = sqr(*veta)/MassQ->Winv;
 +        } else {
 +            iQinv = &(MassQ->Qinv[i*nh]);  
 +            tcstat = &ekind->tcstat[i];
 +            nd = opts->nrdf[i];
 +            reft = max(0.0,opts->ref_t[i]);
 +            if (bEkinAveVel) 
 +            {
 +                Ekin = 2*trace(tcstat->ekinf)*tcstat->ekinscalef_nhc;
 +            } else {
 +                Ekin = 2*trace(tcstat->ekinh)*tcstat->ekinscaleh_nhc;
 +            }
 +        }
 +        kT = BOLTZ*reft;
 +
 +        for(mi=0;mi<mstepsi;mi++) 
 +        {
 +            for(mj=0;mj<mstepsj;mj++)
 +            { 
 +                /* weighting for this step using Suzuki-Yoshida integration - fixed at 5 */
 +                dt = sy_const[ns][mj] * dtfull / mstepsi;
 +                
 +                /* compute the thermal forces */
 +                GQ[0] = iQinv[0]*(Ekin - nd*kT);
 +                
 +                for (j=0;j<nh-1;j++) 
 +                {     
 +                    if (iQinv[j+1] > 0) {
 +                        /* we actually don't need to update here if we save the 
 +                           state of the GQ, but it's easier to just recompute*/
 +                        GQ[j+1] = iQinv[j+1]*((sqr(ivxi[j])/iQinv[j])-kT);      
 +                    } else {
 +                        GQ[j+1] = 0;
 +                    }
 +                }
 +                
 +                ivxi[nh-1] += 0.25*dt*GQ[nh-1];
 +                for (j=nh-1;j>0;j--) 
 +                { 
 +                    Efac = exp(-0.125*dt*ivxi[j]);
 +                    ivxi[j-1] = Efac*(ivxi[j-1]*Efac + 0.25*dt*GQ[j-1]);
 +                }
 +                
 +                Efac = exp(-0.5*dt*ivxi[0]);
 +                if (bBarostat) {
 +                    *veta *= Efac;                
 +                } else {
 +                    scalefac[i] *= Efac;
 +                }
 +                Ekin *= (Efac*Efac);
 +                
 +                /* Issue - if the KE is an average of the last and the current temperatures, then we might not be
 +                   able to scale the kinetic energy directly with this factor.  Might take more bookkeeping -- have to
 +                   think about this a bit more . . . */
 +
 +                GQ[0] = iQinv[0]*(Ekin - nd*kT);
 +                
 +                /* update thermostat positions */
 +                for (j=0;j<nh;j++) 
 +                { 
 +                    ixi[j] += 0.5*dt*ivxi[j];
 +                }
 +                
 +                for (j=0;j<nh-1;j++) 
 +                { 
 +                    Efac = exp(-0.125*dt*ivxi[j+1]);
 +                    ivxi[j] = Efac*(ivxi[j]*Efac + 0.25*dt*GQ[j]);
 +                    if (iQinv[j+1] > 0) {
 +                        GQ[j+1] = iQinv[j+1]*((sqr(ivxi[j])/iQinv[j])-kT);  
 +                    } else {
 +                        GQ[j+1] = 0;
 +                    }
 +                }
 +                ivxi[nh-1] += 0.25*dt*GQ[nh-1];
 +            }
 +        }
 +    }
 +    sfree(GQ);
 +}
 +
 +static void boxv_trotter(t_inputrec *ir, real *veta, real dt, tensor box, 
 +                         gmx_ekindata_t *ekind, tensor vir, real pcorr, real ecorr, t_extmass *MassQ)
 +{
 +
 +    real  pscal;
 +    double alpha;
 +    int   i,j,d,n,nwall;
 +    real  T,GW,vol;
 +    tensor Winvm,ekinmod,localpres;
 +    
 +    /* The heat bath is coupled to a separate barostat, the last temperature group.  In the 
 +       2006 Tuckerman et al paper., the order is iL_{T_baro} iL {T_part}
 +    */
 +    
 +    if (ir->epct==epctSEMIISOTROPIC) 
 +    {
 +        nwall = 2;
 +    } 
 +    else 
 +    {
 +        nwall = 3;
 +    }
 +
 +    /* eta is in pure units.  veta is in units of ps^-1. GW is in 
 +       units of ps^-2.  However, eta has a reference of 1 nm^3, so care must be 
 +       taken to use only RATIOS of eta in updating the volume. */
 +    
 +    /* we take the partial pressure tensors, modify the 
 +       kinetic energy tensor, and recovert to pressure */
 +    
 +    if (ir->opts.nrdf[0]==0) 
 +    { 
 +        gmx_fatal(FARGS,"Barostat is coupled to a T-group with no degrees of freedom\n");    
 +    } 
 +    /* alpha factor for phase space volume, then multiply by the ekin scaling factor.  */
 +    alpha = 1.0 + DIM/((double)ir->opts.nrdf[0]);
 +    alpha *= ekind->tcstat[0].ekinscalef_nhc;
 +    msmul(ekind->ekin,alpha,ekinmod);  
 +    
-                tensor pres,real Elr)
++    pscal   = calc_pres(ir->ePBC,nwall,box,ekinmod,vir,localpres) + pcorr;
 +    
 +    vol = det(box);
 +    GW = (vol*(MassQ->Winv/PRESFAC))*(DIM*pscal - trace(ir->ref_p));   /* W is in ps^2 * bar * nm^3 */
 +    
 +    *veta += 0.5*dt*GW;   
 +}
 +
 +/* 
 + * This file implements temperature and pressure coupling algorithms:
 + * For now only the Weak coupling and the modified weak coupling.
 + *
 + * Furthermore computation of pressure and temperature is done here
 + *
 + */
 +
 +real calc_pres(int ePBC,int nwall,matrix box,tensor ekin,tensor vir,
-     real fac,Plr;
++               tensor pres)
 +{
 +    int  n,m;
-         /* Long range correction for periodic systems, see
-          * Neumann et al. JCP
-          * divide by 6 because it is multiplied by fac later on.
-          * If Elr = 0, no correction is made.
-          */
-         
-         /* This formula should not be used with Ewald or PME, 
-          * where the full long-range virial is calculated. EL 990823
-          */
-         Plr = Elr/6.0;
-         
++    real fac;
 +    
 +    if (ePBC==epbcNONE || (ePBC==epbcXY && nwall!=2))
 +        clear_mat(pres);
 +    else {
 +        /* Uitzoeken welke ekin hier van toepassing is, zie Evans & Morris - E. 
 +         * Wrs. moet de druktensor gecorrigeerd worden voor de netto stroom in  
 +         * het systeem...       
 +         */
 +        
-                 pres[n][m]=(ekin[n][m]-vir[n][m]+Plr)*fac;
 +        fac=PRESFAC*2.0/det(box);
 +        for(n=0; (n<DIM); n++)
 +            for(m=0; (m<DIM); m++)
-     rvec sumv,consk;
++                pres[n][m] = (ekin[n][m] - vir[n][m])*fac;
 +        
 +        if (debug) {
 +            pr_rvecs(debug,0,"PC: pres",pres,DIM);
 +            pr_rvecs(debug,0,"PC: ekin",ekin,DIM);
 +            pr_rvecs(debug,0,"PC: vir ",vir, DIM);
 +            pr_rvecs(debug,0,"PC: box ",box, DIM);
 +        }
 +    }
 +    return trace(pres)/DIM;
 +}
 +
 +real calc_temp(real ekin,real nrdf)
 +{
 +    if (nrdf > 0)
 +        return (2.0*ekin)/(nrdf*BOLTZ);
 +    else
 +        return 0;
 +}
 +
 +void parrinellorahman_pcoupl(FILE *fplog,gmx_large_int_t step,
 +                           t_inputrec *ir,real dt,tensor pres,
 +                           tensor box,tensor box_rel,tensor boxv,
 +                           tensor M,matrix mu,gmx_bool bFirstStep)
 +{
 +  /* This doesn't do any coordinate updating. It just
 +   * integrates the box vector equations from the calculated
 +   * acceleration due to pressure difference. We also compute
 +   * the tensor M which is used in update to couple the particle
 +   * coordinates to the box vectors.
 +   *
 +   * In Nose and Klein (Mol.Phys 50 (1983) no 5., p 1055) this is
 +   * given as
 +   *            -1    .           .     -1
 +   * M_nk = (h')   * (h' * h + h' h) * h
 +   *
 +   * with the dots denoting time derivatives and h is the transformation from
 +   * the scaled frame to the real frame, i.e. the TRANSPOSE of the box. 
 +   * This also goes for the pressure and M tensors - they are transposed relative
 +   * to ours. Our equation thus becomes:
 +   *
 +   *                  -1       .    .           -1
 +   * M_gmx = M_nk' = b  * (b * b' + b * b') * b'
 +   * 
 +   * where b is the gromacs box matrix.                       
 +   * Our box accelerations are given by
 +   *   ..                                    ..
 +   *   b = vol/W inv(box') * (P-ref_P)     (=h')
 +   */
 +  
 +  int    d,n;
 +  tensor winv;
 +  real   vol=box[XX][XX]*box[YY][YY]*box[ZZ][ZZ];
 +  real   atot,arel,change,maxchange,xy_pressure;
 +  tensor invbox,pdiff,t1,t2;
 +
 +  real maxl;
 +
 +  m_inv_ur0(box,invbox);
 +
 +  if (!bFirstStep) {
 +    /* Note that PRESFAC does not occur here.
 +     * The pressure and compressibility always occur as a product,
 +     * therefore the pressure unit drops out.
 +     */
 +    maxl=max(box[XX][XX],box[YY][YY]);
 +    maxl=max(maxl,box[ZZ][ZZ]);
 +    for(d=0;d<DIM;d++)
 +      for(n=0;n<DIM;n++)
 +      winv[d][n]=
 +        (4*M_PI*M_PI*ir->compress[d][n])/(3*ir->tau_p*ir->tau_p*maxl);
 +    
 +    m_sub(pres,ir->ref_p,pdiff);
 +    
 +    if(ir->epct==epctSURFACETENSION) {
 +      /* Unlike Berendsen coupling it might not be trivial to include a z
 +       * pressure correction here? On the other hand we don't scale the
 +       * box momentarily, but change accelerations, so it might not be crucial.
 +       */
 +      xy_pressure=0.5*(pres[XX][XX]+pres[YY][YY]);
 +      for(d=0;d<ZZ;d++)
 +      pdiff[d][d]=(xy_pressure-(pres[ZZ][ZZ]-ir->ref_p[d][d]/box[d][d]));
 +    }
 +    
 +    tmmul(invbox,pdiff,t1);
 +    /* Move the off-diagonal elements of the 'force' to one side to ensure
 +     * that we obey the box constraints.
 +     */
 +    for(d=0;d<DIM;d++) {
 +      for(n=0;n<d;n++) {
 +      t1[d][n] += t1[n][d];
 +      t1[n][d] = 0;
 +      }
 +    }
 +    
 +    switch (ir->epct) {
 +    case epctANISOTROPIC:
 +      for(d=0;d<DIM;d++) 
 +      for(n=0;n<=d;n++)
 +        t1[d][n] *= winv[d][n]*vol;
 +      break;
 +    case epctISOTROPIC:
 +      /* calculate total volume acceleration */
 +      atot=box[XX][XX]*box[YY][YY]*t1[ZZ][ZZ]+
 +      box[XX][XX]*t1[YY][YY]*box[ZZ][ZZ]+
 +      t1[XX][XX]*box[YY][YY]*box[ZZ][ZZ];
 +      arel=atot/(3*vol);
 +      /* set all RELATIVE box accelerations equal, and maintain total V
 +       * change speed */
 +      for(d=0;d<DIM;d++)
 +      for(n=0;n<=d;n++)
 +        t1[d][n] = winv[0][0]*vol*arel*box[d][n];    
 +      break;
 +    case epctSEMIISOTROPIC:
 +    case epctSURFACETENSION:
 +      /* Note the correction to pdiff above for surftens. coupling  */
 +      
 +      /* calculate total XY volume acceleration */
 +      atot=box[XX][XX]*t1[YY][YY]+t1[XX][XX]*box[YY][YY];
 +      arel=atot/(2*box[XX][XX]*box[YY][YY]);
 +      /* set RELATIVE XY box accelerations equal, and maintain total V
 +       * change speed. Dont change the third box vector accelerations */
 +      for(d=0;d<ZZ;d++)
 +      for(n=0;n<=d;n++)
 +        t1[d][n] = winv[d][n]*vol*arel*box[d][n];
 +      for(n=0;n<DIM;n++)
 +      t1[ZZ][n] *= winv[d][n]*vol;
 +      break;
 +    default:
 +      gmx_fatal(FARGS,"Parrinello-Rahman pressure coupling type %s "
 +                "not supported yet\n",EPCOUPLTYPETYPE(ir->epct));
 +      break;
 +    }
 +    
 +    maxchange=0;
 +    for(d=0;d<DIM;d++)
 +      for(n=0;n<=d;n++) {
 +      boxv[d][n] += dt*t1[d][n];
 +      
 +      /* We do NOT update the box vectors themselves here, since
 +       * we need them for shifting later. It is instead done last
 +       * in the update() routine.
 +       */
 +      
 +      /* Calculate the change relative to diagonal elements-
 +         since it's perfectly ok for the off-diagonal ones to
 +         be zero it doesn't make sense to check the change relative
 +         to its current size.
 +      */
 +      
 +      change=fabs(dt*boxv[d][n]/box[d][d]);
 +      
 +      if (change>maxchange)
 +        maxchange=change;
 +      }
 +    
 +    if (maxchange > 0.01 && fplog) {
 +      char buf[22];
 +      fprintf(fplog,
 +              "\nStep %s  Warning: Pressure scaling more than 1%%. "
 +              "This may mean your system\n is not yet equilibrated. "
 +              "Use of Parrinello-Rahman pressure coupling during\n"
 +              "equilibration can lead to simulation instability, "
 +              "and is discouraged.\n",
 +            gmx_step_str(step,buf));
 +    }
 +  }
 +  
 +  preserve_box_shape(ir,box_rel,boxv);
 +
 +  mtmul(boxv,box,t1);       /* t1=boxv * b' */
 +  mmul(invbox,t1,t2);
 +  mtmul(t2,invbox,M);
 +
 +  /* Determine the scaling matrix mu for the coordinates */
 +  for(d=0;d<DIM;d++)
 +    for(n=0;n<=d;n++)
 +      t1[d][n] = box[d][n] + dt*boxv[d][n];
 +  preserve_box_shape(ir,box_rel,t1);
 +  /* t1 is the box at t+dt, determine mu as the relative change */
 +  mmul_ur0(invbox,t1,mu);
 +}
 +
 +void berendsen_pcoupl(FILE *fplog,gmx_large_int_t step, 
 +                    t_inputrec *ir,real dt, tensor pres,matrix box,
 +                    matrix mu)
 +{
 +  int    d,n;
 +  real   scalar_pressure, xy_pressure, p_corr_z;
 +  char   *ptr,buf[STRLEN];
 +
 +  /*
 +   *  Calculate the scaling matrix mu
 +   */
 +  scalar_pressure=0;
 +  xy_pressure=0;
 +  for(d=0; d<DIM; d++) {
 +    scalar_pressure += pres[d][d]/DIM;
 +    if (d != ZZ)
 +      xy_pressure += pres[d][d]/(DIM-1);
 +  }
 +  /* Pressure is now in bar, everywhere. */
 +#define factor(d,m) (ir->compress[d][m]*dt/ir->tau_p)
 +  
 +  /* mu has been changed from pow(1+...,1/3) to 1+.../3, since this is
 +   * necessary for triclinic scaling
 +   */
 +  clear_mat(mu);
 +  switch (ir->epct) {
 +  case epctISOTROPIC:
 +    for(d=0; d<DIM; d++) 
 +      {
 +      mu[d][d] = 1.0 - factor(d,d)*(ir->ref_p[d][d] - scalar_pressure) /DIM;
 +      }
 +    break;
 +  case epctSEMIISOTROPIC:
 +    for(d=0; d<ZZ; d++)
 +      mu[d][d] = 1.0 - factor(d,d)*(ir->ref_p[d][d]-xy_pressure)/DIM;
 +    mu[ZZ][ZZ] = 
 +      1.0 - factor(ZZ,ZZ)*(ir->ref_p[ZZ][ZZ] - pres[ZZ][ZZ])/DIM;
 +    break;
 +  case epctANISOTROPIC:
 +    for(d=0; d<DIM; d++)
 +      for(n=0; n<DIM; n++)
 +      mu[d][n] = (d==n ? 1.0 : 0.0) 
 +        -factor(d,n)*(ir->ref_p[d][n] - pres[d][n])/DIM;
 +    break;
 +  case epctSURFACETENSION:
 +    /* ir->ref_p[0/1] is the reference surface-tension times *
 +     * the number of surfaces                                */
 +    if (ir->compress[ZZ][ZZ])
 +      p_corr_z = dt/ir->tau_p*(ir->ref_p[ZZ][ZZ] - pres[ZZ][ZZ]);
 +    else
 +      /* when the compressibity is zero, set the pressure correction   *
 +       * in the z-direction to zero to get the correct surface tension */
 +      p_corr_z = 0;
 +    mu[ZZ][ZZ] = 1.0 - ir->compress[ZZ][ZZ]*p_corr_z;
 +    for(d=0; d<DIM-1; d++)
 +      mu[d][d] = 1.0 + factor(d,d)*(ir->ref_p[d][d]/(mu[ZZ][ZZ]*box[ZZ][ZZ])
 +                                  - (pres[ZZ][ZZ]+p_corr_z - xy_pressure))/(DIM-1);
 +    break;
 +  default:
 +    gmx_fatal(FARGS,"Berendsen pressure coupling type %s not supported yet\n",
 +              EPCOUPLTYPETYPE(ir->epct));
 +    break;
 +  }
 +  /* To fullfill the orientation restrictions on triclinic boxes
 +   * we will set mu_yx, mu_zx and mu_zy to 0 and correct
 +   * the other elements of mu to first order.
 +   */
 +  mu[YY][XX] += mu[XX][YY];
 +  mu[ZZ][XX] += mu[XX][ZZ];
 +  mu[ZZ][YY] += mu[YY][ZZ];
 +  mu[XX][YY] = 0;
 +  mu[XX][ZZ] = 0;
 +  mu[YY][ZZ] = 0;
 +
 +  if (debug) {
 +    pr_rvecs(debug,0,"PC: pres ",pres,3);
 +    pr_rvecs(debug,0,"PC: mu   ",mu,3);
 +  }
 +  
 +  if (mu[XX][XX]<0.99 || mu[XX][XX]>1.01 ||
 +      mu[YY][YY]<0.99 || mu[YY][YY]>1.01 ||
 +      mu[ZZ][ZZ]<0.99 || mu[ZZ][ZZ]>1.01) {
 +    char buf2[22];
 +    sprintf(buf,"\nStep %s  Warning: pressure scaling more than 1%%, "
 +          "mu: %g %g %g\n",
 +          gmx_step_str(step,buf2),mu[XX][XX],mu[YY][YY],mu[ZZ][ZZ]);
 +    if (fplog)
 +      fprintf(fplog,"%s",buf);
 +    fprintf(stderr,"%s",buf);
 +  }
 +}
 +
 +void berendsen_pscale(t_inputrec *ir,matrix mu,
 +                    matrix box,matrix box_rel,
 +                    int start,int nr_atoms,
 +                    rvec x[],unsigned short cFREEZE[],
 +                    t_nrnb *nrnb)
 +{
 +  ivec   *nFreeze=ir->opts.nFreeze;
 +  int    n,d,g=0;
 +      
 +  /* Scale the positions */
 +  for (n=start; n<start+nr_atoms; n++) {
 +    if (cFREEZE)
 +      g = cFREEZE[n];
 +    
 +    if (!nFreeze[g][XX])
 +      x[n][XX] = mu[XX][XX]*x[n][XX]+mu[YY][XX]*x[n][YY]+mu[ZZ][XX]*x[n][ZZ];
 +    if (!nFreeze[g][YY])
 +      x[n][YY] = mu[YY][YY]*x[n][YY]+mu[ZZ][YY]*x[n][ZZ];
 +    if (!nFreeze[g][ZZ])
 +      x[n][ZZ] = mu[ZZ][ZZ]*x[n][ZZ];
 +  }
 +  /* compute final boxlengths */
 +  for (d=0; d<DIM; d++) {
 +    box[d][XX] = mu[XX][XX]*box[d][XX]+mu[YY][XX]*box[d][YY]+mu[ZZ][XX]*box[d][ZZ];
 +    box[d][YY] = mu[YY][YY]*box[d][YY]+mu[ZZ][YY]*box[d][ZZ];
 +    box[d][ZZ] = mu[ZZ][ZZ]*box[d][ZZ];
 +  }      
 +
 +  preserve_box_shape(ir,box_rel,box);
 +  
 +  /* (un)shifting should NOT be done after this,
 +   * since the box vectors might have changed
 +   */
 +  inc_nrnb(nrnb,eNR_PCOUPL,nr_atoms);
 +}
 +
 +void berendsen_tcoupl(t_inputrec *ir,gmx_ekindata_t *ekind,real dt)
 +{
 +    t_grpopts *opts;
 +    int    i;
 +    real   T,reft=0,lll;
 +
 +    opts = &ir->opts;
 +
 +    for(i=0; (i<opts->ngtc); i++)
 +    {
 +        if (ir->eI == eiVV)
 +        {
 +            T = ekind->tcstat[i].T;
 +        }
 +        else
 +        {
 +            T = ekind->tcstat[i].Th;
 +        }
 +    
 +    if ((opts->tau_t[i] > 0) && (T > 0.0)) {
 + 
 +      reft = max(0.0,opts->ref_t[i]);
 +      lll  = sqrt(1.0 + (dt/opts->tau_t[i])*(reft/T-1.0));
 +      ekind->tcstat[i].lambda = max(min(lll,1.25),0.8);
 +    }
 +    else {
 +       ekind->tcstat[i].lambda = 1.0;
 +    }
 +
 +    if (debug)
 +      fprintf(debug,"TC: group %d: T: %g, Lambda: %g\n",
 +            i,T,ekind->tcstat[i].lambda);
 +  }
 +}
 +
 +void nosehoover_tcoupl(t_grpopts *opts,gmx_ekindata_t *ekind,real dt,
 +                       double xi[],double vxi[], t_extmass *MassQ)
 +{
 +    int   i;
 +    real  reft,oldvxi;
 +    
 +    /* note that this routine does not include Nose-hoover chains yet. Should be easy to add. */
 +    
 +    for(i=0; (i<opts->ngtc); i++) {
 +        reft = max(0.0,opts->ref_t[i]);
 +        oldvxi = vxi[i];
 +        vxi[i]  += dt*MassQ->Qinv[i]*(ekind->tcstat[i].Th - reft);
 +        xi[i] += dt*(oldvxi + vxi[i])*0.5;
 +    }
 +}
 +
 +t_state *init_bufstate(const t_state *template_state) 
 +{
 +    t_state *state;
 +    int nc = template_state->nhchainlength;
 +    snew(state,1);
 +    snew(state->nosehoover_xi,nc*template_state->ngtc);
 +    snew(state->nosehoover_vxi,nc*template_state->ngtc);
 +    snew(state->therm_integral,template_state->ngtc);
 +    snew(state->nhpres_xi,nc*template_state->nnhpres);
 +    snew(state->nhpres_vxi,nc*template_state->nnhpres);
 +
 +    return state;
 +}  
 +
 +void destroy_bufstate(t_state *state) 
 +{
 +    sfree(state->x);
 +    sfree(state->v);
 +    sfree(state->nosehoover_xi);
 +    sfree(state->nosehoover_vxi);
 +    sfree(state->therm_integral);
 +    sfree(state->nhpres_xi);
 +    sfree(state->nhpres_vxi);
 +    sfree(state);
 +}  
 +
 +void trotter_update(t_inputrec *ir,gmx_large_int_t step, gmx_ekindata_t *ekind, 
 +                    gmx_enerdata_t *enerd, t_state *state, 
 +                    tensor vir, t_mdatoms *md, 
 +                    t_extmass *MassQ, int **trotter_seqlist, int trotter_seqno) 
 +{
 +    
 +    int n,i,j,d,ntgrp,ngtc,gc=0;
 +    t_grp_tcstat *tcstat;
 +    t_grpopts *opts;
 +    gmx_large_int_t step_eff;
 +    real ecorr,pcorr,dvdlcorr;
 +    real bmass,qmass,reft,kT,dt,nd;
 +    tensor dumpres,dumvir;
 +    double *scalefac,dtc;
 +    int *trotter_seq;
++    rvec sumv={0,0,0},consk;
 +    gmx_bool bCouple;
 +
 +    if (trotter_seqno <= ettTSEQ2)
 +    {
 +        step_eff = step-1;  /* the velocity verlet calls are actually out of order -- the first half step
 +                               is actually the last half step from the previous step.  Thus the first half step
 +                               actually corresponds to the n-1 step*/
 +                               
 +    } else {
 +        step_eff = step;
 +    }
 +
 +    bCouple = (ir->nsttcouple == 1 ||
 +               do_per_step(step_eff+ir->nsttcouple,ir->nsttcouple));
 +
 +    trotter_seq = trotter_seqlist[trotter_seqno];
 +
 +    /* signal we are returning if nothing is going to be done in this routine */
 +    if ((trotter_seq[0] == etrtSKIPALL)  || !(bCouple))
 +    {
 +        return;
 +    }
 +
 +    dtc = ir->nsttcouple*ir->delta_t;
 +    opts = &(ir->opts); /* just for ease of referencing */
 +    ngtc = opts->ngtc;
 +    snew(scalefac,opts->ngtc);
 +    for (i=0;i<ngtc;i++) 
 +    {
 +        scalefac[i] = 1;
 +    }
 +    /* execute the series of trotter updates specified in the trotterpart array */
 +    
 +    for (i=0;i<NTROTTERPARTS;i++){
 +        /* allow for doubled intgrators by doubling dt instead of making 2 calls */
 +        if ((trotter_seq[i] == etrtBAROV2) || (trotter_seq[i] == etrtBARONHC2) || (trotter_seq[i] == etrtNHC2))
 +        {
 +            dt = 2 * dtc;
 +        }
 +        else 
 +        {
 +            dt = dtc;
 +        }
 +
 +        switch (trotter_seq[i])
 +        {
 +        case etrtBAROV:
 +        case etrtBAROV2:
 +            boxv_trotter(ir,&(state->veta),dt,state->box,ekind,vir,
 +                         enerd->term[F_PDISPCORR],enerd->term[F_DISPCORR],MassQ);
 +            break;
 +        case etrtBARONHC:
 +        case etrtBARONHC2:
 +            NHC_trotter(opts,state->nnhpres,ekind,dt,state->nhpres_xi,
 +                        state->nhpres_vxi,NULL,&(state->veta),MassQ,FALSE);      
 +            break;
 +        case etrtNHC:
 +        case etrtNHC2:
 +            NHC_trotter(opts,opts->ngtc,ekind,dt,state->nosehoover_xi,
 +                        state->nosehoover_vxi,scalefac,NULL,MassQ,(ir->eI==eiVV));
 +            /* need to rescale the kinetic energies and velocities here.  Could 
 +               scale the velocities later, but we need them scaled in order to 
 +               produce the correct outputs, so we'll scale them here. */
 +            
 +            for (i=0; i<ngtc;i++) 
 +            {
 +                tcstat = &ekind->tcstat[i];
 +                tcstat->vscale_nhc = scalefac[i]; 
 +                tcstat->ekinscaleh_nhc *= (scalefac[i]*scalefac[i]); 
 +                tcstat->ekinscalef_nhc *= (scalefac[i]*scalefac[i]); 
 +            }
 +            /* now that we've scaled the groupwise velocities, we can add them up to get the total */
 +            /* but do we actually need the total? */
 +            
 +            /* modify the velocities as well */
 +            for (n=md->start;n<md->start+md->homenr;n++) 
 +            {
 +                if (md->cTC) 
 +                { 
 +                    gc = md->cTC[n];
 +                }
 +                for (d=0;d<DIM;d++) 
 +                {
 +                    state->v[n][d] *= scalefac[gc];
 +                }
 +                
 +                if (debug) 
 +                {
 +                    for (d=0;d<DIM;d++) 
 +                    {
 +                        sumv[d] += (state->v[n][d])/md->invmass[n];
 +                    }
 +                }
 +            }          
 +            break;
 +        default:
 +            break;
 +        }
 +    }
 +    /* check for conserved momentum -- worth looking at this again eventually, but not working right now.*/  
 +#if 0
 +    if (debug) 
 +    {
 +        if (bFirstHalf) 
 +        {
 +            for (d=0;d<DIM;d++) 
 +            {
 +                consk[d] = sumv[d]*exp((1 + 1.0/opts->nrdf[0])*((1.0/DIM)*log(det(state->box)/state->vol0)) + state->nosehoover_xi[0]); 
 +            }
 +            fprintf(debug,"Conserved kappa: %15.8f %15.8f %15.8f\n",consk[0],consk[1],consk[2]);    
 +        }
 +    }
 +#endif
 +    sfree(scalefac);
 +}
 +
 +int **init_npt_vars(t_inputrec *ir, t_state *state, t_extmass *MassQ, gmx_bool bTrotter) 
 +{
 +    int n,i,j,d,ntgrp,ngtc,nnhpres,nh,gc=0;
 +    t_grp_tcstat *tcstat;
 +    t_grpopts *opts;
 +    real ecorr,pcorr,dvdlcorr;
 +    real bmass,qmass,reft,kT,dt,ndj,nd;
 +    tensor dumpres,dumvir;
 +    int **trotter_seq;
 +
 +    opts = &(ir->opts); /* just for ease of referencing */
 +    ngtc = state->ngtc;
 +    nnhpres = state->nnhpres;
 +    nh = state->nhchainlength; 
 +
 +    if (ir->eI == eiMD) {
 +        snew(MassQ->Qinv,ngtc);
 +        for(i=0; (i<ngtc); i++) 
 +        { 
 +            if ((opts->tau_t[i] > 0) && (opts->ref_t[i] > 0)) 
 +            {
 +                MassQ->Qinv[i]=1.0/(sqr(opts->tau_t[i]/M_2PI)*opts->ref_t[i]);
 +            } 
 +            else 
 +            {
 +                MassQ->Qinv[i]=0.0;     
 +            }
 +        }
 +    }
 +    else if (EI_VV(ir->eI))
 +    {
 +    /* Set pressure variables */
 +        
 +        if (state->vol0 == 0) 
 +        {
 +            state->vol0 = det(state->box); /* because we start by defining a fixed compressibility, 
 +                                              we need the volume at this compressibility to solve the problem */ 
 +        }
 +
 +        /* units are nm^3 * ns^2 / (nm^3 * bar / kJ/mol) = kJ/mol  */
 +        /* Investigate this more -- is this the right mass to make it? */
 +        MassQ->Winv = (PRESFAC*trace(ir->compress)*BOLTZ*opts->ref_t[0])/(DIM*state->vol0*sqr(ir->tau_p/M_2PI));
 +        /* An alternate mass definition, from Tuckerman et al. */ 
 +        /* MassQ->Winv = 1.0/(sqr(ir->tau_p/M_2PI)*(opts->nrdf[0]+DIM)*BOLTZ*opts->ref_t[0]); */
 +        for (d=0;d<DIM;d++) 
 +        {
 +            for (n=0;n<DIM;n++) 
 +            {
 +                MassQ->Winvm[d][n]= PRESFAC*ir->compress[d][n]/(state->vol0*sqr(ir->tau_p/M_2PI)); 
 +                /* not clear this is correct yet for the anisotropic case*/
 +            } 
 +        }           
 +        /* Allocate space for thermostat variables */
 +        snew(MassQ->Qinv,ngtc*nh);
 +        
 +        /* now, set temperature variables */
 +        for(i=0; i<ngtc; i++) 
 +        {
 +            if ((opts->tau_t[i] > 0) && (opts->ref_t[i] > 0)) 
 +            {
 +                reft = max(0.0,opts->ref_t[i]);
 +                nd = opts->nrdf[i];
 +                kT = BOLTZ*reft;
 +                for (j=0;j<nh;j++) 
 +                {
 +                    if (j==0) 
 +                    {
 +                        ndj = nd;
 +                    } 
 +                    else 
 +                    {
 +                        ndj = 1;
 +                    }
 +                    MassQ->Qinv[i*nh+j]   = 1.0/(sqr(opts->tau_t[i]/M_2PI)*ndj*kT);
 +                }
 +            }
 +            else 
 +            {
 +                reft=0.0;
 +                for (j=0;j<nh;j++) 
 +                {
 +                    MassQ->Qinv[i*nh+j] = 0.0;
 +                }
 +            }
 +        }
 +    }
 +    
 +    /* first, initialize clear all the trotter calls */
 +    snew(trotter_seq,ettTSEQMAX);
 +    for (i=0;i<ettTSEQMAX;i++) 
 +    {
 +        snew(trotter_seq[i],NTROTTERPARTS);
 +        for (j=0;j<NTROTTERPARTS;j++) {
 +            trotter_seq[i][j] = etrtNONE;
 +        }
 +        trotter_seq[i][0] = etrtSKIPALL;
 +    }
 +    
 +    if (!bTrotter) 
 +    {
 +        /* no trotter calls, so we never use the values in the array.
 +         * We access them (so we need to define them, but ignore
 +         * then.*/
 +
 +        return trotter_seq;
 +    }
 +
 +    /* compute the kinetic energy by using the half step velocities or
 +     * the kinetic energies, depending on the order of the trotter calls */
 +
 +    if (ir->eI==eiVV)
 +    {
 +        if (IR_NPT_TROTTER(ir)) 
 +        {
 +            /* This is the complicated version - there are 4 possible calls, depending on ordering.
 +               We start with the initial one. */
 +            /* first, a round that estimates veta. */
 +            trotter_seq[0][0] = etrtBAROV; 
 +            
 +            /* trotter_seq[1] is etrtNHC for 1/2 step velocities - leave zero */
 +            
 +            /* The first half trotter update */
 +            trotter_seq[2][0] = etrtBAROV;
 +            trotter_seq[2][1] = etrtNHC;
 +            trotter_seq[2][2] = etrtBARONHC;
 +            
 +            /* The second half trotter update */
 +            trotter_seq[3][0] = etrtBARONHC;
 +            trotter_seq[3][1] = etrtNHC;
 +            trotter_seq[3][2] = etrtBAROV;
 +
 +            /* trotter_seq[4] is etrtNHC for second 1/2 step velocities - leave zero */
 +
 +        } 
 +        else 
 +        {
 +            if (IR_NVT_TROTTER(ir)) 
 +            {
 +                /* This is the easy version - there are only two calls, both the same. 
 +                   Otherwise, even easier -- no calls  */
 +                trotter_seq[2][0] = etrtNHC;
 +                trotter_seq[3][0] = etrtNHC;
 +            }
 +        }
 +    } else if (ir->eI==eiVVAK) {
 +        if (IR_NPT_TROTTER(ir)) 
 +        {
 +            /* This is the complicated version - there are 4 possible calls, depending on ordering.
 +               We start with the initial one. */
 +            /* first, a round that estimates veta. */
 +            trotter_seq[0][0] = etrtBAROV; 
 +            
 +            /* The first half trotter update, part 1 -- double update, because it commutes */
 +            trotter_seq[1][0] = etrtNHC;
 +
 +            /* The first half trotter update, part 2 */
 +            trotter_seq[2][0] = etrtBAROV;
 +            trotter_seq[2][1] = etrtBARONHC;
 +            
 +            /* The second half trotter update, part 1 */
 +            trotter_seq[3][0] = etrtBARONHC;
 +            trotter_seq[3][1] = etrtBAROV;
 +
 +            /* The second half trotter update -- blank for now */
 +            trotter_seq[4][0] = etrtNHC;
 +        } 
 +        else 
 +        {
 +            if (IR_NVT_TROTTER(ir)) 
 +            {
 +                /* This is the easy version - there is only one call, both the same. 
 +                   Otherwise, even easier -- no calls  */
 +                trotter_seq[1][0] = etrtNHC;
 +                trotter_seq[4][0] = etrtNHC;
 +            }
 +        }
 +    }
 +
 +    switch (ir->epct) 
 +    {
 +    case epctISOTROPIC:  
 +    default:
 +        bmass = DIM*DIM;  /* recommended mass parameters for isotropic barostat */
 +    }    
 +
 +    snew(MassQ->QPinv,nnhpres*opts->nhchainlength);
 +
 +    /* barostat temperature */
 +    if ((ir->tau_p > 0) && (opts->ref_t[0] > 0)) 
 +    {
 +        reft = max(0.0,opts->ref_t[0]);
 +        kT = BOLTZ*reft;
 +        for (i=0;i<nnhpres;i++) {
 +            for (j=0;j<nh;j++) 
 +            {
 +                if (j==0) {
 +                    qmass = bmass;
 +                } 
 +                else 
 +                {
 +                    qmass = 1;
 +                }
 +                MassQ->QPinv[i*opts->nhchainlength+j]   = 1.0/(sqr(opts->tau_t[0]/M_2PI)*qmass*kT);
 +            }
 +        }
 +    }
 +    else 
 +    {
 +        for (i=0;i<nnhpres;i++) {
 +            for (j=0;j<nh;j++) 
 +            {
 +                MassQ->QPinv[i*nh+j]=0.0;
 +            }
 +        }
 +    }    
 +    return trotter_seq;
 +}
 +
 +real NPT_energy(t_inputrec *ir, t_state *state, t_extmass *MassQ)
 +{
 +    int  i,j,nd,ndj,bmass,qmass,ngtcall;
 +    real ener_npt,reft,eta,kT,tau;
 +    double *ivxi, *ixi;
 +    double *iQinv;
 +    real vol,dbaro,W,Q;
 +    int nh = state->nhchainlength;
 +
 +    ener_npt = 0;
 +    
 +    /* now we compute the contribution of the pressure to the conserved quantity*/
 +    
 +    if (ir->epc==epcMTTK) 
 +    {
 +        /* find the volume, and the kinetic energy of the volume */
 +        
 +        switch (ir->epct) {
 +            
 +        case epctISOTROPIC:
 +            /* contribution from the pressure momenenta */
 +            ener_npt += 0.5*sqr(state->veta)/MassQ->Winv;
 +            
 +            /* contribution from the PV term */
 +            vol = det(state->box);
 +            ener_npt += vol*trace(ir->ref_p)/(DIM*PRESFAC);
 +
 +            break;
 +        case epctANISOTROPIC:
 +            
 +            break;
 +            
 +        case epctSURFACETENSION:
 +            
 +            break;
 +        case epctSEMIISOTROPIC:
 +            
 +            break;
 +        default:
 +            break;
 +        }
 +    }
 +    
 +    if (IR_NPT_TROTTER(ir)) 
 +    {
 +        /* add the energy from the barostat thermostat chain */
 +        for (i=0;i<state->nnhpres;i++) {
 +
 +            /* note -- assumes only one degree of freedom that is thermostatted in barostat */
 +            ivxi = &state->nhpres_vxi[i*nh];
 +            ixi = &state->nhpres_xi[i*nh];
 +            iQinv = &(MassQ->QPinv[i*nh]);
 +            reft = max(ir->opts.ref_t[0],0); /* using 'System' temperature */
 +            kT = BOLTZ * reft;
 +        
 +            for (j=0;j<nh;j++) 
 +            {
 +                if (iQinv[j] > 0)
 +                {
 +                    ener_npt += 0.5*sqr(ivxi[j])/iQinv[j];
 +                    /* contribution from the thermal variable of the NH chain */
 +                    ener_npt += ixi[j]*kT;
 +                }
 +                if (debug) 
 +                {
 +                    fprintf(debug,"P-T-group: %10d Chain %4d ThermV: %15.8f ThermX: %15.8f",i,j,ivxi[j],ixi[j]);
 +                }
 +            }
 +        }
 +    }
 +        
 +    if (ir->etc) 
 +    {
 +        for(i=0; i<ir->opts.ngtc; i++) 
 +        {
 +            ixi = &state->nosehoover_xi[i*nh];
 +            ivxi = &state->nosehoover_vxi[i*nh];
 +            iQinv = &(MassQ->Qinv[i*nh]);
 +            
 +            nd = ir->opts.nrdf[i];
 +            reft = max(ir->opts.ref_t[i],0);
 +            kT = BOLTZ * reft;
 +            
 +            if (nd > 0) 
 +            {
 +                if (IR_NVT_TROTTER(ir))
 +                {
 +                    /* contribution from the thermal momenta of the NH chain */
 +                    for (j=0;j<nh;j++) 
 +                    {
 +                        if (iQinv[j] > 0) {
 +                            ener_npt += 0.5*sqr(ivxi[j])/iQinv[j];
 +                            /* contribution from the thermal variable of the NH chain */
 +                            if (j==0) {
 +                                ndj = nd;
 +                            } 
 +                            else 
 +                            {
 +                                ndj = 1;
 +                            } 
 +                            ener_npt += ndj*ixi[j]*kT;
 +                        }
 +                    }
 +                }
 +                else  /* Other non Trotter temperature NH control  -- no chains yet. */
 +                { 
 +                    ener_npt += 0.5*BOLTZ*nd*sqr(ivxi[0])/iQinv[0];
 +                    ener_npt += nd*ixi[0]*kT;
 +                }
 +            }
 +        }
 +    }
 +    return ener_npt;
 +}
 +
 +static real vrescale_gamdev(int ia, gmx_rng_t rng)
 +/* Gamma distribution, adapted from numerical recipes */
 +{
 +    int j;
 +    real am,e,s,v1,v2,x,y;
 +
 +    if (ia < 6)
 +    {
 +        do
 +        {
 +            x = 1.0;
 +            for(j=1; j<=ia; j++)
 +            {
 +                x *= gmx_rng_uniform_real(rng);
 +            }
 +        }
 +        while (x == 0);
 +        x = -log(x);
 +    }
 +    else
 +    {
 +        do
 +        {
 +            do
 +            {
 +                do
 +                {
 +                    v1 = gmx_rng_uniform_real(rng);
 +                    v2 = 2.0*gmx_rng_uniform_real(rng)-1.0;
 +                }
 +                while (v1*v1 + v2*v2 > 1.0 ||
 +                       v1*v1*GMX_REAL_MAX < 3.0*ia);
 +                /* The last check above ensures that both x (3.0 > 2.0 in s)
 +                 * and the pre-factor for e do not go out of range.
 +                 */
 +                y = v2/v1;
 +                am = ia - 1;
 +                s = sqrt(2.0*am + 1.0);
 +                x = s*y + am;
 +            }
 +            while (x <= 0.0);
 +            e = (1.0 + y*y)*exp(am*log(x/am) - s*y);
 +        }
 +        while (gmx_rng_uniform_real(rng) > e);
 +    }
 +
 +    return x;
 +}
 +
 +static real vrescale_sumnoises(int nn,gmx_rng_t rng)
 +{
 +/*
 + * Returns the sum of n independent gaussian noises squared
 + * (i.e. equivalent to summing the square of the return values
 + * of nn calls to gmx_rng_gaussian_real).xs
 + */
 +  real rr;
 +
 +  if (nn == 0) {
 +    return 0.0;
 +  } else if (nn == 1) {
 +    rr = gmx_rng_gaussian_real(rng);
 +    return rr*rr;
 +  } else if (nn % 2 == 0) {
 +    return 2.0*vrescale_gamdev(nn/2,rng);
 +  } else {
 +    rr = gmx_rng_gaussian_real(rng);
 +    return 2.0*vrescale_gamdev((nn-1)/2,rng) + rr*rr;
 +  }
 +}
 +
 +static real vrescale_resamplekin(real kk,real sigma, int ndeg, real taut,
 +                               gmx_rng_t rng)
 +{
 +/*
 + * Generates a new value for the kinetic energy,
 + * according to Bussi et al JCP (2007), Eq. (A7)
 + * kk:    present value of the kinetic energy of the atoms to be thermalized (in arbitrary units)
 + * sigma: target average value of the kinetic energy (ndeg k_b T/2)  (in the same units as kk)
 + * ndeg:  number of degrees of freedom of the atoms to be thermalized
 + * taut:  relaxation time of the thermostat, in units of 'how often this routine is called'
 + */
 +  real factor,rr;
 +
 +  if (taut > 0.1) {
 +    factor = exp(-1.0/taut);
 +  } else {
 +    factor = 0.0;
 +  }
 +  rr = gmx_rng_gaussian_real(rng);
 +  return
 +    kk +
 +    (1.0 - factor)*(sigma*(vrescale_sumnoises(ndeg-1,rng) + rr*rr)/ndeg - kk) +
 +    2.0*rr*sqrt(kk*sigma/ndeg*(1.0 - factor)*factor);
 +}
 +
 +void vrescale_tcoupl(t_inputrec *ir,gmx_ekindata_t *ekind,real dt,
 +                     double therm_integral[],gmx_rng_t rng)
 +{
 +    t_grpopts *opts;
 +    int    i;
 +    real   Ek,Ek_ref1,Ek_ref,Ek_new; 
 +    
 +    opts = &ir->opts;
 +
 +    for(i=0; (i<opts->ngtc); i++)
 +    {
 +        if (ir->eI == eiVV)
 +        {
 +            Ek = trace(ekind->tcstat[i].ekinf);
 +        }
 +        else
 +        {
 +            Ek = trace(ekind->tcstat[i].ekinh);
 +        }
 +        
 +        if (opts->tau_t[i] >= 0 && opts->nrdf[i] > 0 && Ek > 0)
 +        {
 +            Ek_ref1 = 0.5*opts->ref_t[i]*BOLTZ;
 +            Ek_ref  = Ek_ref1*opts->nrdf[i];
 +
 +            Ek_new  = vrescale_resamplekin(Ek,Ek_ref,opts->nrdf[i],
 +                                           opts->tau_t[i]/dt,rng);
 +
 +            /* Analytically Ek_new>=0, but we check for rounding errors */
 +            if (Ek_new <= 0)
 +            {
 +                ekind->tcstat[i].lambda = 0.0;
 +            }
 +            else
 +            {
 +                ekind->tcstat[i].lambda = sqrt(Ek_new/Ek);
 +            }
 +
 +            therm_integral[i] -= Ek_new - Ek;
 +
 +            if (debug)
 +            {
 +                fprintf(debug,"TC: group %d: Ekr %g, Ek %g, Ek_new %g, Lambda: %g\n",
 +                        i,Ek_ref,Ek,Ek_new,ekind->tcstat[i].lambda);
 +            }
 +        }
 +        else
 +        {
 +            ekind->tcstat[i].lambda = 1.0;
 +        }
 +    }
 +}
 +
 +real vrescale_energy(t_grpopts *opts,double therm_integral[])
 +{
 +  int i;
 +  real ener;
 +
 +  ener = 0;
 +  for(i=0; i<opts->ngtc; i++) {
 +    ener += therm_integral[i];
 +  }
 +  
 +  return ener;
 +}
 +
 +void rescale_velocities(gmx_ekindata_t *ekind,t_mdatoms *mdatoms,
 +                        int start,int end,rvec v[])
 +{
 +    t_grp_acc      *gstat;
 +    t_grp_tcstat   *tcstat;
 +    unsigned short *cACC,*cTC;
 +    int  ga,gt,n,d;
 +    real lg;
 +    rvec vrel;
 +
 +    tcstat = ekind->tcstat;
 +    cTC    = mdatoms->cTC;
 +
 +    if (ekind->bNEMD)
 +    {
 +        gstat  = ekind->grpstat;
 +        cACC   = mdatoms->cACC;
 +
 +        ga = 0;
 +        gt = 0;
 +        for(n=start; n<end; n++) 
 +        {
 +            if (cACC) 
 +            {
 +                ga   = cACC[n];
 +            }
 +            if (cTC)
 +            {
 +                gt   = cTC[n];
 +            }
 +            /* Only scale the velocity component relative to the COM velocity */
 +            rvec_sub(v[n],gstat[ga].u,vrel);
 +            lg = tcstat[gt].lambda;
 +            for(d=0; d<DIM; d++)
 +            {
 +                v[n][d] = gstat[ga].u[d] + lg*vrel[d];
 +            }
 +        }
 +    }
 +    else
 +    {
 +        gt = 0;
 +        for(n=start; n<end; n++) 
 +        {
 +            if (cTC)
 +            {
 +                gt   = cTC[n];
 +            }
 +            lg = tcstat[gt].lambda;
 +            for(d=0; d<DIM; d++)
 +            {
 +                v[n][d] *= lg;
 +            }
 +        }
 +    }
 +}
 +
 +
 +/* set target temperatures if we are annealing */
 +void update_annealing_target_temp(t_grpopts *opts,real t)
 +{
 +  int i,j,n,npoints;
 +  real pert,thist=0,x;
 +
 +  for(i=0;i<opts->ngtc;i++) {
 +    npoints = opts->anneal_npoints[i];
 +    switch (opts->annealing[i]) {
 +    case eannNO:
 +      continue;
 +    case  eannPERIODIC:
 +      /* calculate time modulo the period */
 +      pert  = opts->anneal_time[i][npoints-1];
 +      n     = t / pert;
 +      thist = t - n*pert; /* modulo time */
 +      /* Make sure rounding didn't get us outside the interval */
 +      if (fabs(thist-pert) < GMX_REAL_EPS*100)
 +      thist=0;
 +      break;
 +    case eannSINGLE:
 +      thist = t;
 +      break;
 +    default:
 +      gmx_fatal(FARGS,"Death horror in update_annealing_target_temp (i=%d/%d npoints=%d)",i,opts->ngtc,npoints);
 +    }
 +    /* We are doing annealing for this group if we got here, 
 +     * and we have the (relative) time as thist.
 +     * calculate target temp */
 +    j=0;
 +    while ((j < npoints-1) && (thist>(opts->anneal_time[i][j+1])))
 +      j++;
 +    if (j < npoints-1) {
 +      /* Found our position between points j and j+1. 
 +       * Interpolate: x is the amount from j+1, (1-x) from point j 
 +       * First treat possible jumps in temperature as a special case.
 +       */
 +      if ((opts->anneal_time[i][j+1]-opts->anneal_time[i][j]) < GMX_REAL_EPS*100)
 +      opts->ref_t[i]=opts->anneal_temp[i][j+1];
 +      else {
 +      x = ((thist-opts->anneal_time[i][j])/
 +           (opts->anneal_time[i][j+1]-opts->anneal_time[i][j]));
 +      opts->ref_t[i] = x*opts->anneal_temp[i][j+1]+(1-x)*opts->anneal_temp[i][j];
 +      }
 +    }
 +    else {
 +      opts->ref_t[i] = opts->anneal_temp[i][npoints-1];
 +    }
 +  }
 +}
index aee11f661341e8aae5557369ba1e69417a75e49b,0000000000000000000000000000000000000000..f88f40a3532dd660ab107eaf1a2071d77fd89244
mode 100644,000000..100644
--- /dev/null
@@@ -1,8657 -1,0 +1,8657 @@@
-     
-     if (DDMASTER(dd))
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + * This file is part of Gromacs        Copyright (c) 1991-2008
 + * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the research papers on the package. Check out http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gnomes, ROck Monsters And Chili Sauce
 + */
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <stdio.h>
 +#include <time.h>
 +#include <math.h>
 +#include <string.h>
 +#include <stdlib.h>
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "vec.h"
 +#include "domdec.h"
 +#include "domdec_network.h"
 +#include "nrnb.h"
 +#include "pbc.h"
 +#include "chargegroup.h"
 +#include "constr.h"
 +#include "mdatoms.h"
 +#include "names.h"
 +#include "pdbio.h"
 +#include "futil.h"
 +#include "force.h"
 +#include "pme.h"
 +#include "pull.h"
 +#include "pull_rotation.h"
 +#include "gmx_wallcycle.h"
 +#include "mdrun.h"
 +#include "nsgrid.h"
 +#include "shellfc.h"
 +#include "mtop_util.h"
 +#include "gmxfio.h"
 +#include "gmx_ga2la.h"
 +#include "gmx_sort.h"
 +#include "macros.h"
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#endif
 +#ifdef GMX_THREAD_MPI
 +#include "tmpi.h"
 +#endif
 +
 +#define DDRANK(dd,rank)    (rank)
 +#define DDMASTERRANK(dd)   (dd->masterrank)
 +
 +typedef struct gmx_domdec_master
 +{
 +    /* The cell boundaries */
 +    real **cell_x;
 +    /* The global charge group division */
 +    int  *ncg;     /* Number of home charge groups for each node */
 +    int  *index;   /* Index of nnodes+1 into cg */
 +    int  *cg;      /* Global charge group index */
 +    int  *nat;     /* Number of home atoms for each node. */
 +    int  *ibuf;    /* Buffer for communication */
 +    rvec *vbuf;    /* Buffer for state scattering and gathering */
 +} gmx_domdec_master_t;
 +
 +typedef struct
 +{
 +    /* The numbers of charge groups to send and receive for each cell
 +     * that requires communication, the last entry contains the total
 +     * number of atoms that needs to be communicated.
 +     */
 +    int nsend[DD_MAXIZONE+2];
 +    int nrecv[DD_MAXIZONE+2];
 +    /* The charge groups to send */
 +    int *index;
 +    int nalloc;
 +    /* The atom range for non-in-place communication */
 +    int cell2at0[DD_MAXIZONE];
 +    int cell2at1[DD_MAXIZONE];
 +} gmx_domdec_ind_t;
 +
 +typedef struct
 +{
 +    int  np;                   /* Number of grid pulses in this dimension */
 +    int  np_dlb;               /* For dlb, for use with edlbAUTO          */
 +    gmx_domdec_ind_t *ind;     /* The indices to communicate, size np     */
 +    int  np_nalloc;
 +    gmx_bool bInPlace;             /* Can we communicate in place?            */
 +} gmx_domdec_comm_dim_t;
 +
 +typedef struct
 +{
 +    gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
 +    real *cell_f;      /* State var.: cell boundaries, box relative      */
 +    real *old_cell_f;  /* Temp. var.: old cell size                      */
 +    real *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
 +    real *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
 +    real *bound_min;   /* Temp. var.: lower limit for cell boundary      */
 +    real *bound_max;   /* Temp. var.: upper limit for cell boundary      */
 +    gmx_bool bLimited;     /* State var.: is DLB limited in this dim and row */
 +    real *buf_ncd;     /* Temp. var.                                     */
 +} gmx_domdec_root_t;
 +
 +#define DD_NLOAD_MAX 9
 +
 +/* Here floats are accurate enough, since these variables
 + * only influence the load balancing, not the actual MD results.
 + */
 +typedef struct
 +{
 +    int  nload;
 +    float *load;
 +    float sum;
 +    float max;
 +    float sum_m;
 +    float cvol_min;
 +    float mdf;
 +    float pme;
 +    int   flags;
 +} gmx_domdec_load_t;
 +
 +typedef struct
 +{
 +    int  nsc;
 +    int  ind_gl;
 +    int  ind;
 +} gmx_cgsort_t;
 +
 +typedef struct
 +{
 +    gmx_cgsort_t *sort1,*sort2;
 +    int  sort_nalloc;
 +    gmx_cgsort_t *sort_new;
 +    int  sort_new_nalloc;
 +    int  *ibuf;
 +    int  ibuf_nalloc;
 +} gmx_domdec_sort_t;
 +
 +typedef struct
 +{
 +    rvec *v;
 +    int  nalloc;
 +} vec_rvec_t;
 +
 +/* This enum determines the order of the coordinates.
 + * ddnatHOME and ddnatZONE should be first and second,
 + * the others can be ordered as wanted.
 + */
 +enum { ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR };
 +
 +enum { edlbAUTO, edlbNO, edlbYES, edlbNR };
 +const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
 +
 +typedef struct
 +{
 +    int  dim;      /* The dimension                                          */
 +    gmx_bool dim_match;/* Tells if DD and PME dims match                         */
 +    int  nslab;    /* The number of PME slabs in this dimension              */
 +    real *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
 +    int  *pp_min;  /* The minimum pp node location, size nslab               */
 +    int  *pp_max;  /* The maximum pp node location,size nslab                */
 +    int  maxshift; /* The maximum shift for coordinate redistribution in PME */
 +} gmx_ddpme_t;
 +
 +typedef struct
 +{
 +    real min0;    /* The minimum bottom of this zone                        */
 +    real max1;    /* The maximum top of this zone                           */
 +    real mch0;    /* The maximum bottom communicaton height for this zone   */
 +    real mch1;    /* The maximum top communicaton height for this zone      */
 +    real p1_0;    /* The bottom value of the first cell in this zone        */
 +    real p1_1;    /* The top value of the first cell in this zone           */
 +} gmx_ddzone_t;
 +
 +typedef struct gmx_domdec_comm
 +{
 +    /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
 +     * unless stated otherwise.
 +     */
 +
 +    /* The number of decomposition dimensions for PME, 0: no PME */
 +    int  npmedecompdim;
 +    /* The number of nodes doing PME (PP/PME or only PME) */
 +    int  npmenodes;
 +    int  npmenodes_x;
 +    int  npmenodes_y;
 +    /* The communication setup including the PME only nodes */
 +    gmx_bool bCartesianPP_PME;
 +    ivec ntot;
 +    int  cartpmedim;
 +    int  *pmenodes;          /* size npmenodes                         */
 +    int  *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
 +                              * but with bCartesianPP_PME              */
 +    gmx_ddpme_t ddpme[2];
 +    
 +    /* The DD particle-particle nodes only */
 +    gmx_bool bCartesianPP;
 +    int  *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
 +    
 +    /* The global charge groups */
 +    t_block cgs_gl;
 +
 +    /* Should we sort the cgs */
 +    int  nstSortCG;
 +    gmx_domdec_sort_t *sort;
 +    
 +    /* Are there bonded and multi-body interactions between charge groups? */
 +    gmx_bool bInterCGBondeds;
 +    gmx_bool bInterCGMultiBody;
 +
 +    /* Data for the optional bonded interaction atom communication range */
 +    gmx_bool bBondComm;
 +    t_blocka *cglink;
 +    char *bLocalCG;
 +
 +    /* The DLB option */
 +    int  eDLB;
 +    /* Are we actually using DLB? */
 +    gmx_bool bDynLoadBal;
 +
 +    /* Cell sizes for static load balancing, first index cartesian */
 +    real **slb_frac;
 +    
 +    /* The width of the communicated boundaries */
 +    real cutoff_mbody;
 +    real cutoff;
 +    /* The minimum cell size (including triclinic correction) */
 +    rvec cellsize_min;
 +    /* For dlb, for use with edlbAUTO */
 +    rvec cellsize_min_dlb;
 +    /* The lower limit for the DD cell size with DLB */
 +    real cellsize_limit;
 +    /* Effectively no NB cut-off limit with DLB for systems without PBC? */
 +    gmx_bool bVacDLBNoLimit;
 +
 +    /* tric_dir is only stored here because dd_get_ns_ranges needs it */
 +    ivec tric_dir;
 +    /* box0 and box_size are required with dim's without pbc and -gcom */
 +    rvec box0;
 +    rvec box_size;
 +    
 +    /* The cell boundaries */
 +    rvec cell_x0;
 +    rvec cell_x1;
 +
 +    /* The old location of the cell boundaries, to check cg displacements */
 +    rvec old_cell_x0;
 +    rvec old_cell_x1;
 +
 +    /* The communication setup and charge group boundaries for the zones */
 +    gmx_domdec_zones_t zones;
 +    
 +    /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
 +     * cell boundaries of neighboring cells for dynamic load balancing.
 +     */
 +    gmx_ddzone_t zone_d1[2];
 +    gmx_ddzone_t zone_d2[2][2];
 +    
 +    /* The coordinate/force communication setup and indices */
 +    gmx_domdec_comm_dim_t cd[DIM];
 +    /* The maximum number of cells to communicate with in one dimension */
 +    int  maxpulse;
 +    
 +    /* Which cg distribution is stored on the master node */
 +    int master_cg_ddp_count;
 +    
 +    /* The number of cg's received from the direct neighbors */
 +    int  zone_ncg1[DD_MAXZONE];
 +    
 +    /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
 +    int  nat[ddnatNR];
 +    
 +    /* Communication buffer for general use */
 +    int  *buf_int;
 +    int  nalloc_int;
 +
 +     /* Communication buffer for general use */
 +    vec_rvec_t vbuf;
 +    
 +    /* Communication buffers only used with multiple grid pulses */
 +    int  *buf_int2;
 +    int  nalloc_int2;
 +    vec_rvec_t vbuf2;
 +    
 +    /* Communication buffers for local redistribution */
 +    int  **cggl_flag;
 +    int  cggl_flag_nalloc[DIM*2];
 +    rvec **cgcm_state;
 +    int  cgcm_state_nalloc[DIM*2];
 +    
 +    /* Cell sizes for dynamic load balancing */
 +    gmx_domdec_root_t **root;
 +    real *cell_f_row;
 +    real cell_f0[DIM];
 +    real cell_f1[DIM];
 +    real cell_f_max0[DIM];
 +    real cell_f_min1[DIM];
 +    
 +    /* Stuff for load communication */
 +    gmx_bool bRecordLoad;
 +    gmx_domdec_load_t *load;
 +#ifdef GMX_MPI
 +    MPI_Comm *mpi_comm_load;
 +#endif
 +
 +    /* Maximum DLB scaling per load balancing step in percent */
 +    int dlb_scale_lim;
 +
 +    /* Cycle counters */
 +    float cycl[ddCyclNr];
 +    int   cycl_n[ddCyclNr];
 +    float cycl_max[ddCyclNr];
 +    /* Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise */
 +    int eFlop;
 +    double flop;
 +    int    flop_n;
 +    /* Have often have did we have load measurements */
 +    int    n_load_have;
 +    /* Have often have we collected the load measurements */
 +    int    n_load_collect;
 +    
 +    /* Statistics */
 +    double sum_nat[ddnatNR-ddnatZONE];
 +    int    ndecomp;
 +    int    nload;
 +    double load_step;
 +    double load_sum;
 +    double load_max;
 +    ivec   load_lim;
 +    double load_mdf;
 +    double load_pme;
 +
 +    /* The last partition step */
 +    gmx_large_int_t globalcomm_step;
 +
 +    /* Debugging */
 +    int  nstDDDump;
 +    int  nstDDDumpGrid;
 +    int  DD_debug;
 +} gmx_domdec_comm_t;
 +
 +/* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
 +#define DD_CGIBS 2
 +
 +/* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 +#define DD_FLAG_NRCG  65535
 +#define DD_FLAG_FW(d) (1<<(16+(d)*2))
 +#define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 +
 +/* Zone permutation required to obtain consecutive charge groups
 + * for neighbor searching.
 + */
 +static const int zone_perm[3][4] = { {0,0,0,0},{1,0,0,0},{3,0,1,2} };
 +
 +/* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
 + * components see only j zones with that component 0.
 + */
 +
 +/* The DD zone order */
 +static const ivec dd_zo[DD_MAXZONE] =
 +  {{0,0,0},{1,0,0},{1,1,0},{0,1,0},{0,1,1},{0,0,1},{1,0,1},{1,1,1}};
 +
 +/* The 3D setup */
 +#define dd_z3n  8
 +#define dd_zp3n 4
 +static const ivec dd_zp3[dd_zp3n] = {{0,0,8},{1,3,6},{2,5,6},{3,5,7}};
 +
 +/* The 2D setup */
 +#define dd_z2n  4
 +#define dd_zp2n 2
 +static const ivec dd_zp2[dd_zp2n] = {{0,0,4},{1,3,4}};
 +
 +/* The 1D setup */
 +#define dd_z1n  2
 +#define dd_zp1n 1
 +static const ivec dd_zp1[dd_zp1n] = {{0,0,2}};
 +
 +/* Factors used to avoid problems due to rounding issues */
 +#define DD_CELL_MARGIN       1.0001
 +#define DD_CELL_MARGIN2      1.00005
 +/* Factor to account for pressure scaling during nstlist steps */
 +#define DD_PRES_SCALE_MARGIN 1.02
 +
 +/* Allowed performance loss before we DLB or warn */
 +#define DD_PERF_LOSS 0.05
 +
 +#define DD_CELL_F_SIZE(dd,di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
 +
 +/* Use separate MPI send and receive commands
 + * when nnodes <= GMX_DD_NNODES_SENDRECV.
 + * This saves memory (and some copying for small nnodes).
 + * For high parallelization scatter and gather calls are used.
 + */
 +#define GMX_DD_NNODES_SENDRECV 4
 +
 +
 +/*
 +#define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 +
 +static void index2xyz(ivec nc,int ind,ivec xyz)
 +{
 +  xyz[XX] = ind % nc[XX];
 +  xyz[YY] = (ind / nc[XX]) % nc[YY];
 +  xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 +}
 +*/
 +
 +/* This order is required to minimize the coordinate communication in PME
 + * which uses decomposition in the x direction.
 + */
 +#define dd_index(n,i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
 +
 +static void ddindex2xyz(ivec nc,int ind,ivec xyz)
 +{
 +    xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 +    xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 +    xyz[ZZ] = ind % nc[ZZ];
 +}
 +
 +static int ddcoord2ddnodeid(gmx_domdec_t *dd,ivec c)
 +{
 +    int ddindex;
 +    int ddnodeid=-1;
 +    
 +    ddindex = dd_index(dd->nc,c);
 +    if (dd->comm->bCartesianPP_PME)
 +    {
 +        ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 +    }
 +    else if (dd->comm->bCartesianPP)
 +    {
 +#ifdef GMX_MPI
 +        MPI_Cart_rank(dd->mpi_comm_all,c,&ddnodeid);
 +#endif
 +    }
 +    else
 +    {
 +        ddnodeid = ddindex;
 +    }
 +    
 +    return ddnodeid;
 +}
 +
 +static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox,t_inputrec *ir)
 +{
 +    return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
 +}
 +
 +int ddglatnr(gmx_domdec_t *dd,int i)
 +{
 +    int atnr;
 +    
 +    if (dd == NULL)
 +    {
 +        atnr = i + 1;
 +    }
 +    else
 +    {
 +        if (i >= dd->comm->nat[ddnatNR-1])
 +        {
 +            gmx_fatal(FARGS,"glatnr called with %d, which is larger than the local number of atoms (%d)",i,dd->comm->nat[ddnatNR-1]);
 +        }
 +        atnr = dd->gatindex[i] + 1;
 +    }
 +    
 +    return atnr;
 +}
 +
 +t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 +{
 +    return &dd->comm->cgs_gl;
 +}
 +
 +static void vec_rvec_init(vec_rvec_t *v)
 +{
 +    v->nalloc = 0;
 +    v->v      = NULL;
 +}
 +
 +static void vec_rvec_check_alloc(vec_rvec_t *v,int n)
 +{
 +    if (n > v->nalloc)
 +    {
 +        v->nalloc = over_alloc_dd(n);
 +        srenew(v->v,v->nalloc);
 +    }
 +}
 +
 +void dd_store_state(gmx_domdec_t *dd,t_state *state)
 +{
 +    int i;
 +    
 +    if (state->ddp_count != dd->ddp_count)
 +    {
 +        gmx_incons("The state does not the domain decomposition state");
 +    }
 +    
 +    state->ncg_gl = dd->ncg_home;
 +    if (state->ncg_gl > state->cg_gl_nalloc)
 +    {
 +        state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
 +        srenew(state->cg_gl,state->cg_gl_nalloc);
 +    }
 +    for(i=0; i<state->ncg_gl; i++)
 +    {
 +        state->cg_gl[i] = dd->index_gl[i];
 +    }
 +    
 +    state->ddp_count_cg_gl = dd->ddp_count;
 +}
 +
 +gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 +{
 +    return &dd->comm->zones;
 +}
 +
 +void dd_get_ns_ranges(gmx_domdec_t *dd,int icg,
 +                      int *jcg0,int *jcg1,ivec shift0,ivec shift1)
 +{
 +    gmx_domdec_zones_t *zones;
 +    int izone,d,dim;
 +
 +    zones = &dd->comm->zones;
 +
 +    izone = 0;
 +    while (icg >= zones->izone[izone].cg1)
 +    {
 +        izone++;
 +    }
 +    
 +    if (izone == 0)
 +    {
 +        *jcg0 = icg;
 +    }
 +    else if (izone < zones->nizone)
 +    {
 +        *jcg0 = zones->izone[izone].jcg0;
 +    }
 +    else
 +    {
 +        gmx_fatal(FARGS,"DD icg %d out of range: izone (%d) >= nizone (%d)",
 +                  icg,izone,zones->nizone);
 +    }
 +        
 +    *jcg1 = zones->izone[izone].jcg1;
 +    
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        shift0[dim] = zones->izone[izone].shift0[dim];
 +        shift1[dim] = zones->izone[izone].shift1[dim];
 +        if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
 +        {
 +            /* A conservative approach, this can be optimized */
 +            shift0[dim] -= 1;
 +            shift1[dim] += 1;
 +        }
 +    }
 +}
 +
 +int dd_natoms_vsite(gmx_domdec_t *dd)
 +{
 +    return dd->comm->nat[ddnatVSITE];
 +}
 +
 +void dd_get_constraint_range(gmx_domdec_t *dd,int *at_start,int *at_end)
 +{
 +    *at_start = dd->comm->nat[ddnatCON-1];
 +    *at_end   = dd->comm->nat[ddnatCON];
 +}
 +
 +void dd_move_x(gmx_domdec_t *dd,matrix box,rvec x[])
 +{
 +    int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 +    int  *index,*cgindex;
 +    gmx_domdec_comm_t *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t *ind;
 +    rvec shift={0,0,0},*buf,*rbuf;
 +    gmx_bool bPBC,bScrew;
 +    
 +    comm = dd->comm;
 +    
 +    cgindex = dd->cgindex;
 +    
 +    buf = comm->vbuf.v;
 +
 +    nzone = 1;
 +    nat_tot = dd->nat_home;
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        bPBC   = (dd->ci[dd->dim[d]] == 0);
 +        bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 +        if (bPBC)
 +        {
 +            copy_rvec(box[dd->dim[d]],shift);
 +        }
 +        cd = &comm->cd[d];
 +        for(p=0; p<cd->np; p++)
 +        {
 +            ind = &cd->ind[p];
 +            index = ind->index;
 +            n = 0;
 +            if (!bPBC)
 +            {
 +                for(i=0; i<ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for(j=at0; j<at1; j++)
 +                    {
 +                        copy_rvec(x[j],buf[n]);
 +                        n++;
 +                    }
 +                }
 +            }
 +            else if (!bScrew)
 +            {
 +                for(i=0; i<ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for(j=at0; j<at1; j++)
 +                    {
 +                        /* We need to shift the coordinates */
 +                        rvec_add(x[j],shift,buf[n]);
 +                        n++;
 +                    }
 +                }
 +            }
 +            else
 +            {
 +                for(i=0; i<ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for(j=at0; j<at1; j++)
 +                    {
 +                        /* Shift x */
 +                        buf[n][XX] = x[j][XX] + shift[XX];
 +                        /* Rotate y and z.
 +                         * This operation requires a special shift force
 +                         * treatment, which is performed in calc_vir.
 +                         */
 +                        buf[n][YY] = box[YY][YY] - x[j][YY];
 +                        buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 +                        n++;
 +                    }
 +                }
 +            }
 +            
 +            if (cd->bInPlace)
 +            {
 +                rbuf = x + nat_tot;
 +            }
 +            else
 +            {
 +                rbuf = comm->vbuf2.v;
 +            }
 +            /* Send and receive the coordinates */
 +            dd_sendrecv_rvec(dd, d, dddirBackward,
 +                             buf,  ind->nsend[nzone+1],
 +                             rbuf, ind->nrecv[nzone+1]);
 +            if (!cd->bInPlace)
 +            {
 +                j = 0;
 +                for(zone=0; zone<nzone; zone++)
 +                {
 +                    for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 +                    {
 +                        copy_rvec(rbuf[j],x[i]);
 +                        j++;
 +                    }
 +                }
 +            }
 +            nat_tot += ind->nrecv[nzone+1];
 +        }
 +        nzone += nzone;
 +    }
 +}
 +
 +void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift)
 +{
 +    int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 +    int  *index,*cgindex;
 +    gmx_domdec_comm_t *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t *ind;
 +    rvec *buf,*sbuf;
 +    ivec vis;
 +    int  is;
 +    gmx_bool bPBC,bScrew;
 +    
 +    comm = dd->comm;
 +    
 +    cgindex = dd->cgindex;
 +
 +    buf = comm->vbuf.v;
 +
 +    n = 0;
 +    nzone = comm->zones.n/2;
 +    nat_tot = dd->nat_tot;
 +    for(d=dd->ndim-1; d>=0; d--)
 +    {
 +        bPBC   = (dd->ci[dd->dim[d]] == 0);
 +        bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 +        if (fshift == NULL && !bScrew)
 +        {
 +            bPBC = FALSE;
 +        }
 +        /* Determine which shift vector we need */
 +        clear_ivec(vis);
 +        vis[dd->dim[d]] = 1;
 +        is = IVEC2IS(vis);
 +        
 +        cd = &comm->cd[d];
 +        for(p=cd->np-1; p>=0; p--) {
 +            ind = &cd->ind[p];
 +            nat_tot -= ind->nrecv[nzone+1];
 +            if (cd->bInPlace)
 +            {
 +                sbuf = f + nat_tot;
 +            }
 +            else
 +            {
 +                sbuf = comm->vbuf2.v;
 +                j = 0;
 +                for(zone=0; zone<nzone; zone++)
 +                {
 +                    for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 +                    {
 +                        copy_rvec(f[i],sbuf[j]);
 +                        j++;
 +                    }
 +                }
 +            }
 +            /* Communicate the forces */
 +            dd_sendrecv_rvec(dd, d, dddirForward,
 +                             sbuf, ind->nrecv[nzone+1],
 +                             buf,  ind->nsend[nzone+1]);
 +            index = ind->index;
 +            /* Add the received forces */
 +            n = 0;
 +            if (!bPBC)
 +            {
 +                for(i=0; i<ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for(j=at0; j<at1; j++)
 +                    {
 +                        rvec_inc(f[j],buf[n]);
 +                        n++;
 +                    }
 +                } 
 +            }
 +            else if (!bScrew)
 +            {
 +                for(i=0; i<ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for(j=at0; j<at1; j++)
 +                    {
 +                        rvec_inc(f[j],buf[n]);
 +                        /* Add this force to the shift force */
 +                        rvec_inc(fshift[is],buf[n]);
 +                        n++;
 +                    }
 +                }
 +            }
 +            else
 +            {
 +                for(i=0; i<ind->nsend[nzone]; i++)
 +                {
 +                    at0 = cgindex[index[i]];
 +                    at1 = cgindex[index[i]+1];
 +                    for(j=at0; j<at1; j++)
 +                    {
 +                        /* Rotate the force */
 +                        f[j][XX] += buf[n][XX];
 +                        f[j][YY] -= buf[n][YY];
 +                        f[j][ZZ] -= buf[n][ZZ];
 +                        if (fshift)
 +                        {
 +                            /* Add this force to the shift force */
 +                            rvec_inc(fshift[is],buf[n]);
 +                        }
 +                        n++;
 +                    }
 +                }
 +            }
 +        }
 +        nzone /= 2;
 +    }
 +}
 +
 +void dd_atom_spread_real(gmx_domdec_t *dd,real v[])
 +{
 +    int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 +    int  *index,*cgindex;
 +    gmx_domdec_comm_t *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t *ind;
 +    real *buf,*rbuf;
 +    
 +    comm = dd->comm;
 +    
 +    cgindex = dd->cgindex;
 +    
 +    buf = &comm->vbuf.v[0][0];
 +
 +    nzone = 1;
 +    nat_tot = dd->nat_home;
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        cd = &comm->cd[d];
 +        for(p=0; p<cd->np; p++)
 +        {
 +            ind = &cd->ind[p];
 +            index = ind->index;
 +            n = 0;
 +            for(i=0; i<ind->nsend[nzone]; i++)
 +            {
 +                at0 = cgindex[index[i]];
 +                at1 = cgindex[index[i]+1];
 +                for(j=at0; j<at1; j++)
 +                {
 +                    buf[n] = v[j];
 +                    n++;
 +                }
 +            }
 +            
 +            if (cd->bInPlace)
 +            {
 +                rbuf = v + nat_tot;
 +            }
 +            else
 +            {
 +                rbuf = &comm->vbuf2.v[0][0];
 +            }
 +            /* Send and receive the coordinates */
 +            dd_sendrecv_real(dd, d, dddirBackward,
 +                             buf,  ind->nsend[nzone+1],
 +                             rbuf, ind->nrecv[nzone+1]);
 +            if (!cd->bInPlace)
 +            {
 +                j = 0;
 +                for(zone=0; zone<nzone; zone++)
 +                {
 +                    for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 +                    {
 +                        v[i] = rbuf[j];
 +                        j++;
 +                    }
 +                }
 +            }
 +            nat_tot += ind->nrecv[nzone+1];
 +        }
 +        nzone += nzone;
 +    }
 +}
 +
 +void dd_atom_sum_real(gmx_domdec_t *dd,real v[])
 +{
 +    int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 +    int  *index,*cgindex;
 +    gmx_domdec_comm_t *comm;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t *ind;
 +    real *buf,*sbuf;
 +    
 +    comm = dd->comm;
 +    
 +    cgindex = dd->cgindex;
 +
 +    buf = &comm->vbuf.v[0][0];
 +
 +    n = 0;
 +    nzone = comm->zones.n/2;
 +    nat_tot = dd->nat_tot;
 +    for(d=dd->ndim-1; d>=0; d--)
 +    {
 +        cd = &comm->cd[d];
 +        for(p=cd->np-1; p>=0; p--) {
 +            ind = &cd->ind[p];
 +            nat_tot -= ind->nrecv[nzone+1];
 +            if (cd->bInPlace)
 +            {
 +                sbuf = v + nat_tot;
 +            }
 +            else
 +            {
 +                sbuf = &comm->vbuf2.v[0][0];
 +                j = 0;
 +                for(zone=0; zone<nzone; zone++)
 +                {
 +                    for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 +                    {
 +                        sbuf[j] = v[i];
 +                        j++;
 +                    }
 +                }
 +            }
 +            /* Communicate the forces */
 +            dd_sendrecv_real(dd, d, dddirForward,
 +                             sbuf, ind->nrecv[nzone+1],
 +                             buf,  ind->nsend[nzone+1]);
 +            index = ind->index;
 +            /* Add the received forces */
 +            n = 0;
 +            for(i=0; i<ind->nsend[nzone]; i++)
 +            {
 +                at0 = cgindex[index[i]];
 +                at1 = cgindex[index[i]+1];
 +                for(j=at0; j<at1; j++)
 +                {
 +                    v[j] += buf[n];
 +                    n++;
 +                }
 +            } 
 +        }
 +        nzone /= 2;
 +    }
 +}
 +
 +static void print_ddzone(FILE *fp,int d,int i,int j,gmx_ddzone_t *zone)
 +{
 +    fprintf(fp,"zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
 +            d,i,j,
 +            zone->min0,zone->max1,
 +            zone->mch0,zone->mch0,
 +            zone->p1_0,zone->p1_1);
 +}
 +
 +static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
 +                               int ddimind,int direction,
 +                               gmx_ddzone_t *buf_s,int n_s,
 +                               gmx_ddzone_t *buf_r,int n_r)
 +{
 +    rvec vbuf_s[5*2],vbuf_r[5*2];
 +    int i;
 +
 +    for(i=0; i<n_s; i++)
 +    {
 +        vbuf_s[i*2  ][0] = buf_s[i].min0;
 +        vbuf_s[i*2  ][1] = buf_s[i].max1;
 +        vbuf_s[i*2  ][2] = buf_s[i].mch0;
 +        vbuf_s[i*2+1][0] = buf_s[i].mch1;
 +        vbuf_s[i*2+1][1] = buf_s[i].p1_0;
 +        vbuf_s[i*2+1][2] = buf_s[i].p1_1;
 +    }
 +
 +    dd_sendrecv_rvec(dd, ddimind, direction,
 +                     vbuf_s, n_s*2,
 +                     vbuf_r, n_r*2);
 +
 +    for(i=0; i<n_r; i++)
 +    {
 +        buf_r[i].min0 = vbuf_r[i*2  ][0];
 +        buf_r[i].max1 = vbuf_r[i*2  ][1];
 +        buf_r[i].mch0 = vbuf_r[i*2  ][2];
 +        buf_r[i].mch1 = vbuf_r[i*2+1][0];
 +        buf_r[i].p1_0 = vbuf_r[i*2+1][1];
 +        buf_r[i].p1_1 = vbuf_r[i*2+1][2];
 +    }
 +}
 +
 +static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
 +                          rvec cell_ns_x0,rvec cell_ns_x1)
 +{
 +    int  d,d1,dim,dim1,pos,buf_size,i,j,k,p,npulse,npulse_min;
 +    gmx_ddzone_t *zp,buf_s[5],buf_r[5],buf_e[5];
 +    rvec extr_s[2],extr_r[2];
 +    rvec dh;
 +    real dist_d,c=0,det;
 +    gmx_domdec_comm_t *comm;
 +    gmx_bool bPBC,bUse;
 +
 +    comm = dd->comm;
 +
 +    for(d=1; d<dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
 +        zp->min0 = cell_ns_x0[dim];
 +        zp->max1 = cell_ns_x1[dim];
 +        zp->mch0 = cell_ns_x0[dim];
 +        zp->mch1 = cell_ns_x1[dim];
 +        zp->p1_0 = cell_ns_x0[dim];
 +        zp->p1_1 = cell_ns_x1[dim];
 +    }
 +    
 +    for(d=dd->ndim-2; d>=0; d--)
 +    {
 +        dim  = dd->dim[d];
 +        bPBC = (dim < ddbox->npbcdim);
 +
 +        /* Use an rvec to store two reals */
 +        extr_s[d][0] = comm->cell_f0[d+1];
 +        extr_s[d][1] = comm->cell_f1[d+1];
 +        extr_s[d][2] = 0;
 +
 +        pos = 0;
 +        /* Store the extremes in the backward sending buffer,
 +         * so the get updated separately from the forward communication.
 +         */
 +        for(d1=d; d1<dd->ndim-1; d1++)
 +        {
 +            /* We invert the order to be able to use the same loop for buf_e */
 +            buf_s[pos].min0 = extr_s[d1][1];
 +            buf_s[pos].max1 = extr_s[d1][0];
 +            buf_s[pos].mch0 = 0;
 +            buf_s[pos].mch1 = 0;
 +            /* Store the cell corner of the dimension we communicate along */
 +            buf_s[pos].p1_0 = comm->cell_x0[dim];
 +            buf_s[pos].p1_1 = 0;
 +            pos++;
 +        }
 +
 +        buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
 +        pos++;
 +
 +        if (dd->ndim == 3 && d == 0)
 +        {
 +            buf_s[pos] = comm->zone_d2[0][1];
 +            pos++;
 +            buf_s[pos] = comm->zone_d1[0];
 +            pos++;
 +        }
 +
 +        /* We only need to communicate the extremes
 +         * in the forward direction
 +         */
 +        npulse = comm->cd[d].np;
 +        if (bPBC)
 +        {
 +            /* Take the minimum to avoid double communication */
 +            npulse_min = min(npulse,dd->nc[dim]-1-npulse);
 +        }
 +        else
 +        {
 +            /* Without PBC we should really not communicate over
 +             * the boundaries, but implementing that complicates
 +             * the communication setup and therefore we simply
 +             * do all communication, but ignore some data.
 +             */
 +            npulse_min = npulse;
 +        }
 +        for(p=0; p<npulse_min; p++)
 +        {
 +            /* Communicate the extremes forward */
 +            bUse = (bPBC || dd->ci[dim] > 0);
 +
 +            dd_sendrecv_rvec(dd, d, dddirForward,
 +                             extr_s+d, dd->ndim-d-1,
 +                             extr_r+d, dd->ndim-d-1);
 +
 +            if (bUse)
 +            {
 +                for(d1=d; d1<dd->ndim-1; d1++)
 +                {
 +                    extr_s[d1][0] = max(extr_s[d1][0],extr_r[d1][0]);
 +                    extr_s[d1][1] = min(extr_s[d1][1],extr_r[d1][1]);
 +                }
 +            }
 +        }
 +
 +        buf_size = pos;
 +        for(p=0; p<npulse; p++)
 +        {
 +            /* Communicate all the zone information backward */
 +            bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
 +
 +            dd_sendrecv_ddzone(dd, d, dddirBackward,
 +                               buf_s, buf_size,
 +                               buf_r, buf_size);
 +
 +            clear_rvec(dh);
 +            if (p > 0)
 +            {
 +                for(d1=d+1; d1<dd->ndim; d1++)
 +                {
 +                    /* Determine the decrease of maximum required
 +                     * communication height along d1 due to the distance along d,
 +                     * this avoids a lot of useless atom communication.
 +                     */
 +                    dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
 +
 +                    if (ddbox->tric_dir[dim])
 +                    {
 +                        /* c is the off-diagonal coupling between the cell planes
 +                         * along directions d and d1.
 +                         */
 +                        c = ddbox->v[dim][dd->dim[d1]][dim];
 +                    }
 +                    else
 +                    {
 +                        c = 0;
 +                    }
 +                    det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
 +                    if (det > 0)
 +                    {
 +                        dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
 +                    }
 +                    else
 +                    {
 +                        /* A negative value signals out of range */
 +                        dh[d1] = -1;
 +                    }
 +                }
 +            }
 +
 +            /* Accumulate the extremes over all pulses */
 +            for(i=0; i<buf_size; i++)
 +            {
 +                if (p == 0)
 +                {
 +                    buf_e[i] = buf_r[i];
 +                }
 +                else
 +                {
 +                    if (bUse)
 +                    {
 +                        buf_e[i].min0 = min(buf_e[i].min0,buf_r[i].min0);
 +                        buf_e[i].max1 = max(buf_e[i].max1,buf_r[i].max1);
 +                    }
 +
 +                    if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
 +                    {
 +                        d1 = 1;
 +                    }
 +                    else
 +                    {
 +                        d1 = d + 1;
 +                    }
 +                    if (bUse && dh[d1] >= 0)
 +                    {
 +                        buf_e[i].mch0 = max(buf_e[i].mch0,buf_r[i].mch0-dh[d1]);
 +                        buf_e[i].mch1 = max(buf_e[i].mch1,buf_r[i].mch1-dh[d1]);
 +                    }
 +                }
 +                /* Copy the received buffer to the send buffer,
 +                 * to pass the data through with the next pulse.
 +                 */
 +                buf_s[i] = buf_r[i];
 +            }
 +            if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
 +                (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
 +            {
 +                /* Store the extremes */ 
 +                pos = 0;
 +
 +                for(d1=d; d1<dd->ndim-1; d1++)
 +                {
 +                    extr_s[d1][1] = min(extr_s[d1][1],buf_e[pos].min0);
 +                    extr_s[d1][0] = max(extr_s[d1][0],buf_e[pos].max1);
 +                    pos++;
 +                }
 +
 +                if (d == 1 || (d == 0 && dd->ndim == 3))
 +                {
 +                    for(i=d; i<2; i++)
 +                    {
 +                        comm->zone_d2[1-d][i] = buf_e[pos];
 +                        pos++;
 +                    }
 +                }
 +                if (d == 0)
 +                {
 +                    comm->zone_d1[1] = buf_e[pos];
 +                    pos++;
 +                }
 +            }
 +        }
 +    }
 +    
 +    if (dd->ndim >= 2)
 +    {
 +        dim = dd->dim[1];
 +        for(i=0; i<2; i++)
 +        {
 +            if (debug)
 +            {
 +                print_ddzone(debug,1,i,0,&comm->zone_d1[i]);
 +            }
 +            cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d1[i].min0);
 +            cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d1[i].max1);
 +        }
 +    }
 +    if (dd->ndim >= 3)
 +    {
 +        dim = dd->dim[2];
 +        for(i=0; i<2; i++)
 +        {
 +            for(j=0; j<2; j++)
 +            {
 +                if (debug)
 +                {
 +                    print_ddzone(debug,2,i,j,&comm->zone_d2[i][j]);
 +                }
 +                cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d2[i][j].min0);
 +                cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d2[i][j].max1);
 +            }
 +        }
 +    }
 +    for(d=1; d<dd->ndim; d++)
 +    {
 +        comm->cell_f_max0[d] = extr_s[d-1][0];
 +        comm->cell_f_min1[d] = extr_s[d-1][1];
 +        if (debug)
 +        {
 +            fprintf(debug,"Cell fraction d %d, max0 %f, min1 %f\n",
 +                    d,comm->cell_f_max0[d],comm->cell_f_min1[d]);
 +        }
 +    }
 +}
 +
 +static void dd_collect_cg(gmx_domdec_t *dd,
 +                          t_state *state_local)
 +{
 +    gmx_domdec_master_t *ma=NULL;
 +    int buf2[2],*ibuf,i,ncg_home=0,*cg=NULL,nat_home=0;
 +    t_block *cgs_gl;
 +
 +    if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
 +    {
 +        /* The master has the correct distribution */
 +        return;
 +    }
 +    
 +    if (state_local->ddp_count == dd->ddp_count)
 +    {
 +        ncg_home = dd->ncg_home;
 +        cg       = dd->index_gl;
 +        nat_home = dd->nat_home;
 +    } 
 +    else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
 +    {
 +        cgs_gl = &dd->comm->cgs_gl;
 +
 +        ncg_home = state_local->ncg_gl;
 +        cg       = state_local->cg_gl;
 +        nat_home = 0;
 +        for(i=0; i<ncg_home; i++)
 +        {
 +            nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
 +        }
 +    }
 +    else
 +    {
 +        gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
 +    }
 +    
 +    buf2[0] = dd->ncg_home;
 +    buf2[1] = dd->nat_home;
 +    if (DDMASTER(dd))
 +    {
 +        ma = dd->ma;
 +        ibuf = ma->ibuf;
 +    }
 +    else
 +    {
 +        ibuf = NULL;
 +    }
 +    /* Collect the charge group and atom counts on the master */
 +    dd_gather(dd,2*sizeof(int),buf2,ibuf);
 +    
 +    if (DDMASTER(dd))
 +    {
 +        ma->index[0] = 0;
 +        for(i=0; i<dd->nnodes; i++)
 +        {
 +            ma->ncg[i] = ma->ibuf[2*i];
 +            ma->nat[i] = ma->ibuf[2*i+1];
 +            ma->index[i+1] = ma->index[i] + ma->ncg[i];
 +            
 +        }
 +        /* Make byte counts and indices */
 +        for(i=0; i<dd->nnodes; i++)
 +        {
 +            ma->ibuf[i] = ma->ncg[i]*sizeof(int);
 +            ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug,"Initial charge group distribution: ");
 +            for(i=0; i<dd->nnodes; i++)
 +                fprintf(debug," %d",ma->ncg[i]);
 +            fprintf(debug,"\n");
 +        }
 +    }
 +    
 +    /* Collect the charge group indices on the master */
 +    dd_gatherv(dd,
 +               dd->ncg_home*sizeof(int),dd->index_gl,
 +               DDMASTER(dd) ? ma->ibuf : NULL,
 +               DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
 +               DDMASTER(dd) ? ma->cg : NULL);
 +    
 +    dd->comm->master_cg_ddp_count = state_local->ddp_count;
 +}
 +
 +static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
 +                                    rvec *lv,rvec *v)
 +{
 +    gmx_domdec_master_t *ma;
 +    int  n,i,c,a,nalloc=0;
 +    rvec *buf=NULL;
 +    t_block *cgs_gl;
 +
 +    ma = dd->ma;
 +    
 +    if (!DDMASTER(dd))
 +    {
 +#ifdef GMX_MPI
 +        MPI_Send(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
 +                 dd->rank,dd->mpi_comm_all);
 +#endif
 +    } else {
 +        /* Copy the master coordinates to the global array */
 +        cgs_gl = &dd->comm->cgs_gl;
 +
 +        n = DDMASTERRANK(dd);
 +        a = 0;
 +        for(i=ma->index[n]; i<ma->index[n+1]; i++)
 +        {
 +            for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
 +            {
 +                copy_rvec(lv[a++],v[c]);
 +            }
 +        }
 +        
 +        for(n=0; n<dd->nnodes; n++)
 +        {
 +            if (n != dd->rank)
 +            {
 +                if (ma->nat[n] > nalloc)
 +                {
 +                    nalloc = over_alloc_dd(ma->nat[n]);
 +                    srenew(buf,nalloc);
 +                }
 +#ifdef GMX_MPI
 +                MPI_Recv(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,DDRANK(dd,n),
 +                         n,dd->mpi_comm_all,MPI_STATUS_IGNORE);
 +#endif
 +                a = 0;
 +                for(i=ma->index[n]; i<ma->index[n+1]; i++)
 +                {
 +                    for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
 +                    {
 +                        copy_rvec(buf[a++],v[c]);
 +                    }
 +                }
 +            }
 +        }
 +        sfree(buf);
 +    }
 +}
 +
 +static void get_commbuffer_counts(gmx_domdec_t *dd,
 +                                  int **counts,int **disps)
 +{
 +    gmx_domdec_master_t *ma;
 +    int n;
 +
 +    ma = dd->ma;
 +    
 +    /* Make the rvec count and displacment arrays */
 +    *counts  = ma->ibuf;
 +    *disps   = ma->ibuf + dd->nnodes;
 +    for(n=0; n<dd->nnodes; n++)
 +    {
 +        (*counts)[n] = ma->nat[n]*sizeof(rvec);
 +        (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
 +    }
 +}
 +
 +static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
 +                                   rvec *lv,rvec *v)
 +{
 +    gmx_domdec_master_t *ma;
 +    int  *rcounts=NULL,*disps=NULL;
 +    int  n,i,c,a;
 +    rvec *buf=NULL;
 +    t_block *cgs_gl;
 +    
 +    ma = dd->ma;
 +    
 +    if (DDMASTER(dd))
 +    {
 +        get_commbuffer_counts(dd,&rcounts,&disps);
 +
 +        buf = ma->vbuf;
 +    }
 +    
 +    dd_gatherv(dd,dd->nat_home*sizeof(rvec),lv,rcounts,disps,buf);
 +
 +    if (DDMASTER(dd))
 +    {
 +        cgs_gl = &dd->comm->cgs_gl;
 +
 +        a = 0;
 +        for(n=0; n<dd->nnodes; n++)
 +        {
 +            for(i=ma->index[n]; i<ma->index[n+1]; i++)
 +            {
 +                for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
 +                {
 +                    copy_rvec(buf[a++],v[c]);
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +void dd_collect_vec(gmx_domdec_t *dd,
 +                    t_state *state_local,rvec *lv,rvec *v)
 +{
 +    gmx_domdec_master_t *ma;
 +    int  n,i,c,a,nalloc=0;
 +    rvec *buf=NULL;
 +    
 +    dd_collect_cg(dd,state_local);
 +
 +    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
 +    {
 +        dd_collect_vec_sendrecv(dd,lv,v);
 +    }
 +    else
 +    {
 +        dd_collect_vec_gatherv(dd,lv,v);
 +    }
 +}
 +
 +
 +void dd_collect_state(gmx_domdec_t *dd,
 +                      t_state *state_local,t_state *state)
 +{
 +    int est,i,j,nh;
 +
 +    nh = state->nhchainlength;
 +
 +    if (DDMASTER(dd))
 +    {
 +        state->lambda = state_local->lambda;
 +        state->veta = state_local->veta;
 +        state->vol0 = state_local->vol0;
 +        copy_mat(state_local->box,state->box);
 +        copy_mat(state_local->boxv,state->boxv);
 +        copy_mat(state_local->svir_prev,state->svir_prev);
 +        copy_mat(state_local->fvir_prev,state->fvir_prev);
 +        copy_mat(state_local->pres_prev,state->pres_prev);
 +
 +
 +        for(i=0; i<state_local->ngtc; i++)
 +        {
 +            for(j=0; j<nh; j++) {
 +                state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
 +                state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
 +            }
 +            state->therm_integral[i] = state_local->therm_integral[i];            
 +        }
 +        for(i=0; i<state_local->nnhpres; i++) 
 +        {
 +            for(j=0; j<nh; j++) {
 +                state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
 +                state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
 +            }
 +        }
 +    }
 +    for(est=0; est<estNR; est++)
 +    {
 +        if (EST_DISTR(est) && (state_local->flags & (1<<est)))
 +        {
 +            switch (est) {
 +            case estX:
 +                dd_collect_vec(dd,state_local,state_local->x,state->x);
 +                break;
 +            case estV:
 +                dd_collect_vec(dd,state_local,state_local->v,state->v);
 +                break;
 +            case estSDX:
 +                dd_collect_vec(dd,state_local,state_local->sd_X,state->sd_X);
 +                break;
 +            case estCGP:
 +                dd_collect_vec(dd,state_local,state_local->cg_p,state->cg_p);
 +                break;
 +            case estLD_RNG:
 +                if (state->nrngi == 1)
 +                {
 +                    if (DDMASTER(dd))
 +                    {
 +                        for(i=0; i<state_local->nrng; i++)
 +                        {
 +                            state->ld_rng[i] = state_local->ld_rng[i];
 +                        }
 +                    }
 +                }
 +                else
 +                {
 +                    dd_gather(dd,state_local->nrng*sizeof(state->ld_rng[0]),
 +                              state_local->ld_rng,state->ld_rng);
 +                }
 +                break;
 +            case estLD_RNGI:
 +                if (state->nrngi == 1)
 +                {
 +                   if (DDMASTER(dd))
 +                    {
 +                        state->ld_rngi[0] = state_local->ld_rngi[0];
 +                    } 
 +                }
 +                else
 +                {
 +                    dd_gather(dd,sizeof(state->ld_rngi[0]),
 +                              state_local->ld_rngi,state->ld_rngi);
 +                }
 +                break;
 +            case estDISRE_INITF:
 +            case estDISRE_RM3TAV:
 +            case estORIRE_INITF:
 +            case estORIRE_DTAV:
 +                break;
 +            default:
 +                gmx_incons("Unknown state entry encountered in dd_collect_state");
 +            }
 +        }
 +    }
 +}
 +
 +static void dd_realloc_fr_cg(t_forcerec *fr,int nalloc)
 +{
 +    if (debug)
 +    {
 +        fprintf(debug,"Reallocating forcerec: currently %d, required %d, allocating %d\n",fr->cg_nalloc,nalloc,over_alloc_dd(nalloc));
 +    }
 +    fr->cg_nalloc = over_alloc_dd(nalloc);
 +    srenew(fr->cg_cm,fr->cg_nalloc);
 +    srenew(fr->cginfo,fr->cg_nalloc);
 +}
 +
 +static void dd_realloc_state(t_state *state,rvec **f,int nalloc)
 +{
 +    int est;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"Reallocating state: currently %d, required %d, allocating %d\n",state->nalloc,nalloc,over_alloc_dd(nalloc));
 +    }
 +
 +    state->nalloc = over_alloc_dd(nalloc);
 +    
 +    for(est=0; est<estNR; est++)
 +    {
 +        if (EST_DISTR(est) && (state->flags & (1<<est)))
 +        {
 +            switch(est) {
 +            case estX:
 +                srenew(state->x,state->nalloc);
 +                break;
 +            case estV:
 +                srenew(state->v,state->nalloc);
 +                break;
 +            case estSDX:
 +                srenew(state->sd_X,state->nalloc);
 +                break;
 +            case estCGP:
 +                srenew(state->cg_p,state->nalloc);
 +                break;
 +            case estLD_RNG:
 +            case estLD_RNGI:
 +            case estDISRE_INITF:
 +            case estDISRE_RM3TAV:
 +            case estORIRE_INITF:
 +            case estORIRE_DTAV:
 +                /* No reallocation required */
 +                break;
 +            default:
 +                gmx_incons("Unknown state entry encountered in dd_realloc_state");            
 +            }
 +        }
 +    }
 +    
 +    if (f != NULL)
 +    {
 +        srenew(*f,state->nalloc);
 +    }
 +}
 +
 +static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd,t_block *cgs,
 +                                       rvec *v,rvec *lv)
 +{
 +    gmx_domdec_master_t *ma;
 +    int  n,i,c,a,nalloc=0;
 +    rvec *buf=NULL;
 +    
 +    if (DDMASTER(dd))
 +    {
 +        ma  = dd->ma;
 +        
 +        for(n=0; n<dd->nnodes; n++)
 +        {
 +            if (n != dd->rank)
 +            {
 +                if (ma->nat[n] > nalloc)
 +                {
 +                    nalloc = over_alloc_dd(ma->nat[n]);
 +                    srenew(buf,nalloc);
 +                }
 +                /* Use lv as a temporary buffer */
 +                a = 0;
 +                for(i=ma->index[n]; i<ma->index[n+1]; i++)
 +                {
 +                    for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
 +                    {
 +                        copy_rvec(v[c],buf[a++]);
 +                    }
 +                }
 +                if (a != ma->nat[n])
 +                {
 +                    gmx_fatal(FARGS,"Internal error a (%d) != nat (%d)",
 +                              a,ma->nat[n]);
 +                }
 +                
 +#ifdef GMX_MPI
 +                MPI_Send(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,
 +                         DDRANK(dd,n),n,dd->mpi_comm_all);
 +#endif
 +            }
 +        }
 +        sfree(buf);
 +        n = DDMASTERRANK(dd);
 +        a = 0;
 +        for(i=ma->index[n]; i<ma->index[n+1]; i++)
 +        {
 +            for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
 +            {
 +                copy_rvec(v[c],lv[a++]);
 +            }
 +        }
 +    }
 +    else
 +    {
 +#ifdef GMX_MPI
 +        MPI_Recv(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
 +                 MPI_ANY_TAG,dd->mpi_comm_all,MPI_STATUS_IGNORE);
 +#endif
 +    }
 +}
 +
 +static void dd_distribute_vec_scatterv(gmx_domdec_t *dd,t_block *cgs,
 +                                       rvec *v,rvec *lv)
 +{
 +    gmx_domdec_master_t *ma;
 +    int  *scounts=NULL,*disps=NULL;
 +    int  n,i,c,a,nalloc=0;
 +    rvec *buf=NULL;
 +    
 +    if (DDMASTER(dd))
 +    {
 +        ma  = dd->ma;
 +     
 +        get_commbuffer_counts(dd,&scounts,&disps);
 +
 +        buf = ma->vbuf;
 +        a = 0;
 +        for(n=0; n<dd->nnodes; n++)
 +        {
 +            for(i=ma->index[n]; i<ma->index[n+1]; i++)
 +            {
 +                for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
 +                {
 +                    copy_rvec(v[c],buf[a++]);
 +                }
 +            }
 +        }
 +    }
 +
 +    dd_scatterv(dd,scounts,disps,buf,dd->nat_home*sizeof(rvec),lv);
 +}
 +
 +static void dd_distribute_vec(gmx_domdec_t *dd,t_block *cgs,rvec *v,rvec *lv)
 +{
 +    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
 +    {
 +        dd_distribute_vec_sendrecv(dd,cgs,v,lv);
 +    }
 +    else
 +    {
 +        dd_distribute_vec_scatterv(dd,cgs,v,lv);
 +    }
 +}
 +
 +static void dd_distribute_state(gmx_domdec_t *dd,t_block *cgs,
 +                                t_state *state,t_state *state_local,
 +                                rvec **f)
 +{
 +    int  i,j,ngtch,ngtcp,nh;
 +
 +    nh = state->nhchainlength;
 +
 +    if (DDMASTER(dd))
 +    {
 +        state_local->lambda = state->lambda;
 +        state_local->veta   = state->veta;
 +        state_local->vol0   = state->vol0;
 +        copy_mat(state->box,state_local->box);
 +        copy_mat(state->box_rel,state_local->box_rel);
 +        copy_mat(state->boxv,state_local->boxv);
 +        copy_mat(state->svir_prev,state_local->svir_prev);
 +        copy_mat(state->fvir_prev,state_local->fvir_prev);
 +        for(i=0; i<state_local->ngtc; i++)
 +        {
 +            for(j=0; j<nh; j++) {
 +                state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
 +                state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
 +            }
 +            state_local->therm_integral[i] = state->therm_integral[i];
 +        }
 +        for(i=0; i<state_local->nnhpres; i++)
 +        {
 +            for(j=0; j<nh; j++) {
 +                state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
 +                state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
 +            }
 +        }
 +    }
 +    dd_bcast(dd,sizeof(real),&state_local->lambda);
 +    dd_bcast(dd,sizeof(real),&state_local->veta);
 +    dd_bcast(dd,sizeof(real),&state_local->vol0);
 +    dd_bcast(dd,sizeof(state_local->box),state_local->box);
 +    dd_bcast(dd,sizeof(state_local->box_rel),state_local->box_rel);
 +    dd_bcast(dd,sizeof(state_local->boxv),state_local->boxv);
 +    dd_bcast(dd,sizeof(state_local->svir_prev),state_local->svir_prev);
 +    dd_bcast(dd,sizeof(state_local->fvir_prev),state_local->fvir_prev);
 +    dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_xi);
 +    dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_vxi);
 +    dd_bcast(dd,state_local->ngtc*sizeof(double),state_local->therm_integral);
 +    dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_xi);
 +    dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_vxi);
 +
 +    if (dd->nat_home > state_local->nalloc)
 +    {
 +        dd_realloc_state(state_local,f,dd->nat_home);
 +    }
 +    for(i=0; i<estNR; i++)
 +    {
 +        if (EST_DISTR(i) && (state_local->flags & (1<<i)))
 +        {
 +            switch (i) {
 +            case estX:
 +                dd_distribute_vec(dd,cgs,state->x,state_local->x);
 +                break;
 +            case estV:
 +                dd_distribute_vec(dd,cgs,state->v,state_local->v);
 +                break;
 +            case estSDX:
 +                dd_distribute_vec(dd,cgs,state->sd_X,state_local->sd_X);
 +                break;
 +            case estCGP:
 +                dd_distribute_vec(dd,cgs,state->cg_p,state_local->cg_p);
 +                break;
 +            case estLD_RNG:
 +                if (state->nrngi == 1)
 +                {
 +                    dd_bcastc(dd,
 +                              state_local->nrng*sizeof(state_local->ld_rng[0]),
 +                              state->ld_rng,state_local->ld_rng);
 +                }
 +                else
 +                {
 +                    dd_scatter(dd,
 +                               state_local->nrng*sizeof(state_local->ld_rng[0]),
 +                               state->ld_rng,state_local->ld_rng);
 +                }
 +                break;
 +            case estLD_RNGI:
 +                if (state->nrngi == 1)
 +                {
 +                    dd_bcastc(dd,sizeof(state_local->ld_rngi[0]),
 +                              state->ld_rngi,state_local->ld_rngi);
 +                }
 +                else
 +                {
 +                     dd_scatter(dd,sizeof(state_local->ld_rngi[0]),
 +                               state->ld_rngi,state_local->ld_rngi);
 +                }   
 +                break;
 +            case estDISRE_INITF:
 +            case estDISRE_RM3TAV:
 +            case estORIRE_INITF:
 +            case estORIRE_DTAV:
 +                /* Not implemented yet */
 +                break;
 +            default:
 +                gmx_incons("Unknown state entry encountered in dd_distribute_state");
 +            }
 +        }
 +    }
 +}
 +
 +static char dim2char(int dim)
 +{
 +    char c='?';
 +    
 +    switch (dim)
 +    {
 +    case XX: c = 'X'; break;
 +    case YY: c = 'Y'; break;
 +    case ZZ: c = 'Z'; break;
 +    default: gmx_fatal(FARGS,"Unknown dim %d",dim);
 +    }
 +    
 +    return c;
 +}
 +
 +static void write_dd_grid_pdb(const char *fn,gmx_large_int_t step,
 +                              gmx_domdec_t *dd,matrix box,gmx_ddbox_t *ddbox)
 +{
 +    rvec grid_s[2],*grid_r=NULL,cx,r;
 +    char fname[STRLEN],format[STRLEN],buf[22];
 +    FILE *out;
 +    int  a,i,d,z,y,x;
 +    matrix tric;
 +    real vol;
 +
 +    copy_rvec(dd->comm->cell_x0,grid_s[0]);
 +    copy_rvec(dd->comm->cell_x1,grid_s[1]);
 +    
 +    if (DDMASTER(dd))
 +    {
 +        snew(grid_r,2*dd->nnodes);
 +    }
 +    
 +    dd_gather(dd,2*sizeof(rvec),grid_s[0],DDMASTER(dd) ? grid_r[0] : NULL);
 +    
 +    if (DDMASTER(dd))
 +    {
 +        for(d=0; d<DIM; d++)
 +        {
 +            for(i=0; i<DIM; i++)
 +            {
 +                if (d == i)
 +                {
 +                    tric[d][i] = 1;
 +                }
 +                else
 +                {
 +                    if (d < ddbox->npbcdim && dd->nc[d] > 1)
 +                    {
 +                        tric[d][i] = box[i][d]/box[i][i];
 +                    }
 +                    else
 +                    {
 +                        tric[d][i] = 0;
 +                    }
 +                }
 +            }
 +        }
 +        sprintf(fname,"%s_%s.pdb",fn,gmx_step_str(step,buf));
 +        sprintf(format,"%s%s\n",get_pdbformat(),"%6.2f%6.2f");
 +        out = gmx_fio_fopen(fname,"w");
 +        gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
 +        a = 1;
 +        for(i=0; i<dd->nnodes; i++)
 +        {
 +            vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
 +            for(d=0; d<DIM; d++)
 +            {
 +                vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
 +            }
 +            for(z=0; z<2; z++)
 +            {
 +                for(y=0; y<2; y++)
 +                {
 +                    for(x=0; x<2; x++)
 +                    {
 +                        cx[XX] = grid_r[i*2+x][XX];
 +                        cx[YY] = grid_r[i*2+y][YY];
 +                        cx[ZZ] = grid_r[i*2+z][ZZ];
 +                        mvmul(tric,cx,r);
 +                        fprintf(out,format,"ATOM",a++,"CA","GLY",' ',1+i,
 +                                10*r[XX],10*r[YY],10*r[ZZ],1.0,vol);
 +                    }
 +                }
 +            }
 +            for(d=0; d<DIM; d++)
 +            {
 +                for(x=0; x<4; x++)
 +                {
 +                    switch(d)
 +                    {
 +                    case 0: y = 1 + i*8 + 2*x; break;
 +                    case 1: y = 1 + i*8 + 2*x - (x % 2); break;
 +                    case 2: y = 1 + i*8 + x; break;
 +                    }
 +                    fprintf(out,"%6s%5d%5d\n","CONECT",y,y+(1<<d));
 +                }
 +            }
 +        }
 +        gmx_fio_fclose(out);
 +        sfree(grid_r);
 +    }
 +}
 +
 +void write_dd_pdb(const char *fn,gmx_large_int_t step,const char *title,
 +                  gmx_mtop_t *mtop,t_commrec *cr,
 +                  int natoms,rvec x[],matrix box)
 +{
 +    char fname[STRLEN],format[STRLEN],format4[STRLEN],buf[22];
 +    FILE *out;
 +    int  i,ii,resnr,c;
 +    char *atomname,*resname;
 +    real b;
 +    gmx_domdec_t *dd;
 +    
 +    dd = cr->dd;
 +    if (natoms == -1)
 +    {
 +        natoms = dd->comm->nat[ddnatVSITE];
 +    }
 +    
 +    sprintf(fname,"%s_%s_n%d.pdb",fn,gmx_step_str(step,buf),cr->sim_nodeid);
 +    
 +    sprintf(format,"%s%s\n",get_pdbformat(),"%6.2f%6.2f");
 +    sprintf(format4,"%s%s\n",get_pdbformat4(),"%6.2f%6.2f");
 +    
 +    out = gmx_fio_fopen(fname,"w");
 +    
 +    fprintf(out,"TITLE     %s\n",title);
 +    gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
 +    for(i=0; i<natoms; i++)
 +    {
 +        ii = dd->gatindex[i];
 +        gmx_mtop_atominfo_global(mtop,ii,&atomname,&resnr,&resname);
 +        if (i < dd->comm->nat[ddnatZONE])
 +        {
 +            c = 0;
 +            while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
 +            {
 +                c++;
 +            }
 +            b = c;
 +        }
 +        else if (i < dd->comm->nat[ddnatVSITE])
 +        {
 +            b = dd->comm->zones.n;
 +        }
 +        else
 +        {
 +            b = dd->comm->zones.n + 1;
 +        }
 +        fprintf(out,strlen(atomname)<4 ? format : format4,
 +                "ATOM",(ii+1)%100000,
 +                atomname,resname,' ',resnr%10000,' ',
 +                10*x[i][XX],10*x[i][YY],10*x[i][ZZ],1.0,b);
 +    }
 +    fprintf(out,"TER\n");
 +    
 +    gmx_fio_fclose(out);
 +}
 +
 +real dd_cutoff_mbody(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  di;
 +    real r;
 +
 +    comm = dd->comm;
 +
 +    r = -1;
 +    if (comm->bInterCGBondeds)
 +    {
 +        if (comm->cutoff_mbody > 0)
 +        {
 +            r = comm->cutoff_mbody;
 +        }
 +        else
 +        {
 +            /* cutoff_mbody=0 means we do not have DLB */
 +            r = comm->cellsize_min[dd->dim[0]];
 +            for(di=1; di<dd->ndim; di++)
 +            {
 +                r = min(r,comm->cellsize_min[dd->dim[di]]);
 +            }
 +            if (comm->bBondComm)
 +            {
 +                r = max(r,comm->cutoff_mbody);
 +            }
 +            else
 +            {
 +                r = min(r,comm->cutoff);
 +            }
 +        }
 +    }
 +
 +    return r;
 +}
 +
 +real dd_cutoff_twobody(gmx_domdec_t *dd)
 +{
 +    real r_mb;
 +
 +    r_mb = dd_cutoff_mbody(dd);
 +
 +    return max(dd->comm->cutoff,r_mb);
 +}
 +
 +
 +static void dd_cart_coord2pmecoord(gmx_domdec_t *dd,ivec coord,ivec coord_pme)
 +{
 +    int nc,ntot;
 +    
 +    nc   = dd->nc[dd->comm->cartpmedim];
 +    ntot = dd->comm->ntot[dd->comm->cartpmedim];
 +    copy_ivec(coord,coord_pme);
 +    coord_pme[dd->comm->cartpmedim] =
 +        nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
 +}
 +
 +static int low_ddindex2pmeindex(int ndd,int npme,int ddindex)
 +{
 +    /* Here we assign a PME node to communicate with this DD node
 +     * by assuming that the major index of both is x.
 +     * We add cr->npmenodes/2 to obtain an even distribution.
 +     */
 +    return (ddindex*npme + npme/2)/ndd;
 +}
 +
 +static int ddindex2pmeindex(const gmx_domdec_t *dd,int ddindex)
 +{
 +    return low_ddindex2pmeindex(dd->nnodes,dd->comm->npmenodes,ddindex);
 +}
 +
 +static int cr_ddindex2pmeindex(const t_commrec *cr,int ddindex)
 +{
 +    return low_ddindex2pmeindex(cr->dd->nnodes,cr->npmenodes,ddindex);
 +}
 +
 +static int *dd_pmenodes(t_commrec *cr)
 +{
 +    int *pmenodes;
 +    int n,i,p0,p1;
 +    
 +    snew(pmenodes,cr->npmenodes);
 +    n = 0;
 +    for(i=0; i<cr->dd->nnodes; i++) {
 +        p0 = cr_ddindex2pmeindex(cr,i);
 +        p1 = cr_ddindex2pmeindex(cr,i+1);
 +        if (i+1 == cr->dd->nnodes || p1 > p0) {
 +            if (debug)
 +                fprintf(debug,"pmenode[%d] = %d\n",n,i+1+n);
 +            pmenodes[n] = i + 1 + n;
 +            n++;
 +        }
 +    }
 +
 +    return pmenodes;
 +}
 +
 +static int gmx_ddcoord2pmeindex(t_commrec *cr,int x,int y,int z)
 +{
 +    gmx_domdec_t *dd;
 +    ivec coords,coords_pme,nc;
 +    int  slab;
 +    
 +    dd = cr->dd;
 +    /*
 +      if (dd->comm->bCartesian) {
 +      gmx_ddindex2xyz(dd->nc,ddindex,coords);
 +      dd_coords2pmecoords(dd,coords,coords_pme);
 +      copy_ivec(dd->ntot,nc);
 +      nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
 +      coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
 +      
 +      slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
 +      } else {
 +      slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
 +      }
 +    */
 +    coords[XX] = x;
 +    coords[YY] = y;
 +    coords[ZZ] = z;
 +    slab = ddindex2pmeindex(dd,dd_index(dd->nc,coords));
 +    
 +    return slab;
 +}
 +
 +static int ddcoord2simnodeid(t_commrec *cr,int x,int y,int z)
 +{
 +    gmx_domdec_comm_t *comm;
 +    ivec coords;
 +    int  ddindex,nodeid=-1;
 +    
 +    comm = cr->dd->comm;
 +    
 +    coords[XX] = x;
 +    coords[YY] = y;
 +    coords[ZZ] = z;
 +    if (comm->bCartesianPP_PME)
 +    {
 +#ifdef GMX_MPI
 +        MPI_Cart_rank(cr->mpi_comm_mysim,coords,&nodeid);
 +#endif
 +    }
 +    else
 +    {
 +        ddindex = dd_index(cr->dd->nc,coords);
 +        if (comm->bCartesianPP)
 +        {
 +            nodeid = comm->ddindex2simnodeid[ddindex];
 +        }
 +        else
 +        {
 +            if (comm->pmenodes)
 +            {
 +                nodeid = ddindex + gmx_ddcoord2pmeindex(cr,x,y,z);
 +            }
 +            else
 +            {
 +                nodeid = ddindex;
 +            }
 +        }
 +    }
 +  
 +    return nodeid;
 +}
 +
 +static int dd_simnode2pmenode(t_commrec *cr,int sim_nodeid)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_domdec_comm_t *comm;
 +    ivec coord,coord_pme;
 +    int  i;
 +    int  pmenode=-1;
 +    
 +    dd = cr->dd;
 +    comm = dd->comm;
 +    
 +    /* This assumes a uniform x domain decomposition grid cell size */
 +    if (comm->bCartesianPP_PME)
 +    {
 +#ifdef GMX_MPI
 +        MPI_Cart_coords(cr->mpi_comm_mysim,sim_nodeid,DIM,coord);
 +        if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
 +        {
 +            /* This is a PP node */
 +            dd_cart_coord2pmecoord(dd,coord,coord_pme);
 +            MPI_Cart_rank(cr->mpi_comm_mysim,coord_pme,&pmenode);
 +        }
 +#endif
 +    }
 +    else if (comm->bCartesianPP)
 +    {
 +        if (sim_nodeid < dd->nnodes)
 +        {
 +            pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
 +        }
 +    }
 +    else
 +    {
 +        /* This assumes DD cells with identical x coordinates
 +         * are numbered sequentially.
 +         */
 +        if (dd->comm->pmenodes == NULL)
 +        {
 +            if (sim_nodeid < dd->nnodes)
 +            {
 +                /* The DD index equals the nodeid */
 +                pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
 +            }
 +        }
 +        else
 +        {
 +            i = 0;
 +            while (sim_nodeid > dd->comm->pmenodes[i])
 +            {
 +                i++;
 +            }
 +            if (sim_nodeid < dd->comm->pmenodes[i])
 +            {
 +                pmenode = dd->comm->pmenodes[i];
 +            }
 +        }
 +    }
 +    
 +    return pmenode;
 +}
 +
 +gmx_bool gmx_pmeonlynode(t_commrec *cr,int sim_nodeid)
 +{
 +    gmx_bool bPMEOnlyNode;
 +    
 +    if (DOMAINDECOMP(cr))
 +    {
 +        bPMEOnlyNode = (dd_simnode2pmenode(cr,sim_nodeid) == -1);
 +    }
 +    else
 +    {
 +        bPMEOnlyNode = FALSE;
 +    }
 +    
 +    return bPMEOnlyNode;
 +}
 +
 +void get_pme_ddnodes(t_commrec *cr,int pmenodeid,
 +                     int *nmy_ddnodes,int **my_ddnodes,int *node_peer)
 +{
 +    gmx_domdec_t *dd;
 +    int x,y,z;
 +    ivec coord,coord_pme;
 +    
 +    dd = cr->dd;
 +    
 +    snew(*my_ddnodes,(dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
 +    
 +    *nmy_ddnodes = 0;
 +    for(x=0; x<dd->nc[XX]; x++)
 +    {
 +        for(y=0; y<dd->nc[YY]; y++)
 +        {
 +            for(z=0; z<dd->nc[ZZ]; z++)
 +            {
 +                if (dd->comm->bCartesianPP_PME)
 +                {
 +                    coord[XX] = x;
 +                    coord[YY] = y;
 +                    coord[ZZ] = z;
 +                    dd_cart_coord2pmecoord(dd,coord,coord_pme);
 +                    if (dd->ci[XX] == coord_pme[XX] &&
 +                        dd->ci[YY] == coord_pme[YY] &&
 +                        dd->ci[ZZ] == coord_pme[ZZ])
 +                        (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
 +                }
 +                else
 +                {
 +                    /* The slab corresponds to the nodeid in the PME group */
 +                    if (gmx_ddcoord2pmeindex(cr,x,y,z) == pmenodeid)
 +                    {
 +                        (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
 +                    }
 +                }
 +            }
 +        }
 +    }
 +    
 +    /* The last PP-only node is the peer node */
 +    *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
 +    
 +    if (debug)
 +    {
 +        fprintf(debug,"Receive coordinates from PP nodes:");
 +        for(x=0; x<*nmy_ddnodes; x++)
 +        {
 +            fprintf(debug," %d",(*my_ddnodes)[x]);
 +        }
 +        fprintf(debug,"\n");
 +    }
 +}
 +
 +static gmx_bool receive_vir_ener(t_commrec *cr)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  pmenode,coords[DIM],rank;
 +    gmx_bool bReceive;
 +    
 +    bReceive = TRUE;
 +    if (cr->npmenodes < cr->dd->nnodes)
 +    {
 +        comm = cr->dd->comm;
 +        if (comm->bCartesianPP_PME)
 +        {
 +            pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
 +#ifdef GMX_MPI
 +            MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,coords);
 +            coords[comm->cartpmedim]++;
 +            if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
 +            {
 +                MPI_Cart_rank(cr->mpi_comm_mysim,coords,&rank);
 +                if (dd_simnode2pmenode(cr,rank) == pmenode)
 +                {
 +                    /* This is not the last PP node for pmenode */
 +                    bReceive = FALSE;
 +                }
 +            }
 +#endif  
 +        }
 +        else
 +        {
 +            pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
 +            if (cr->sim_nodeid+1 < cr->nnodes &&
 +                dd_simnode2pmenode(cr,cr->sim_nodeid+1) == pmenode)
 +            {
 +                /* This is not the last PP node for pmenode */
 +                bReceive = FALSE;
 +            }
 +        }
 +    }
 +    
 +    return bReceive;
 +}
 +
 +static void set_zones_ncg_home(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_zones_t *zones;
 +    int i;
 +
 +    zones = &dd->comm->zones;
 +
 +    zones->cg_range[0] = 0;
 +    for(i=1; i<zones->n+1; i++)
 +    {
 +        zones->cg_range[i] = dd->ncg_home;
 +    }
 +}
 +
 +static void rebuild_cgindex(gmx_domdec_t *dd,int *gcgs_index,t_state *state)
 +{
 +    int nat,i,*ind,*dd_cg_gl,*cgindex,cg_gl;
 +    
 +    ind = state->cg_gl;
 +    dd_cg_gl = dd->index_gl;
 +    cgindex  = dd->cgindex;
 +    nat = 0;
 +    cgindex[0] = nat;
 +    for(i=0; i<state->ncg_gl; i++)
 +    {
 +        cgindex[i] = nat;
 +        cg_gl = ind[i];
 +        dd_cg_gl[i] = cg_gl;
 +        nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
 +    }
 +    cgindex[i] = nat;
 +    
 +    dd->ncg_home = state->ncg_gl;
 +    dd->nat_home = nat;
 +
 +    set_zones_ncg_home(dd);
 +}
 +
 +static int ddcginfo(const cginfo_mb_t *cginfo_mb,int cg)
 +{
 +    while (cg >= cginfo_mb->cg_end)
 +    {
 +        cginfo_mb++;
 +    }
 +
 +    return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
 +}
 +
 +static void dd_set_cginfo(int *index_gl,int cg0,int cg1,
 +                          t_forcerec *fr,char *bLocalCG)
 +{
 +    cginfo_mb_t *cginfo_mb;
 +    int *cginfo;
 +    int cg;
 +
 +    if (fr != NULL)
 +    {
 +        cginfo_mb = fr->cginfo_mb;
 +        cginfo    = fr->cginfo;
 +
 +        for(cg=cg0; cg<cg1; cg++)
 +        {
 +            cginfo[cg] = ddcginfo(cginfo_mb,index_gl[cg]);
 +        }
 +    }
 +
 +    if (bLocalCG != NULL)
 +    {
 +        for(cg=cg0; cg<cg1; cg++)
 +        {
 +            bLocalCG[index_gl[cg]] = TRUE;
 +        }
 +    }
 +}
 +
 +static void make_dd_indices(gmx_domdec_t *dd,int *gcgs_index,int cg_start)
 +{
 +    int nzone,zone,zone1,cg0,cg,cg_gl,a,a_gl;
 +    int *zone2cg,*zone_ncg1,*index_gl,*gatindex;
 +    gmx_ga2la_t *ga2la;
 +    char *bLocalCG;
 +
 +    bLocalCG = dd->comm->bLocalCG;
 +
 +    if (dd->nat_tot > dd->gatindex_nalloc)
 +    {
 +        dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
 +        srenew(dd->gatindex,dd->gatindex_nalloc);
 +    }
 +
 +    nzone      = dd->comm->zones.n;
 +    zone2cg    = dd->comm->zones.cg_range;
 +    zone_ncg1  = dd->comm->zone_ncg1;
 +    index_gl   = dd->index_gl;
 +    gatindex   = dd->gatindex;
 +
 +    if (zone2cg[1] != dd->ncg_home)
 +    {
 +        gmx_incons("dd->ncg_zone is not up to date");
 +    }
 +    
 +    /* Make the local to global and global to local atom index */
 +    a = dd->cgindex[cg_start];
 +    for(zone=0; zone<nzone; zone++)
 +    {
 +        if (zone == 0)
 +        {
 +            cg0 = cg_start;
 +        }
 +        else
 +        {
 +            cg0 = zone2cg[zone];
 +        }
 +        for(cg=cg0; cg<zone2cg[zone+1]; cg++)
 +        {
 +            zone1 = zone;
 +            if (cg - cg0 >= zone_ncg1[zone])
 +            {
 +                /* Signal that this cg is from more than one zone away */
 +                zone1 += nzone;
 +            }
 +            cg_gl = index_gl[cg];
 +            for(a_gl=gcgs_index[cg_gl]; a_gl<gcgs_index[cg_gl+1]; a_gl++)
 +            {
 +                gatindex[a] = a_gl;
 +                ga2la_set(dd->ga2la,a_gl,a,zone1);
 +                a++;
 +            }
 +        }
 +    }
 +}
 +
 +static int check_bLocalCG(gmx_domdec_t *dd,int ncg_sys,const char *bLocalCG,
 +                          const char *where)
 +{
 +    int ncg,i,ngl,nerr;
 +
 +    nerr = 0;
 +    if (bLocalCG == NULL)
 +    {
 +        return nerr;
 +    }
 +    for(i=0; i<dd->ncg_tot; i++)
 +    {
 +        if (!bLocalCG[dd->index_gl[i]])
 +        {
 +            fprintf(stderr,
 +                    "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n",dd->rank,where,i+1,dd->index_gl[i]+1,dd->ncg_home);
 +            nerr++;
 +        }
 +    }
 +    ngl = 0;
 +    for(i=0; i<ncg_sys; i++)
 +    {
 +        if (bLocalCG[i])
 +        {
 +            ngl++;
 +        }
 +    }
 +    if (ngl != dd->ncg_tot)
 +    {
 +        fprintf(stderr,"DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n",dd->rank,where,ngl,dd->ncg_tot);
 +        nerr++;
 +    }
 +
 +    return nerr;
 +}
 +
 +static void check_index_consistency(gmx_domdec_t *dd,
 +                                    int natoms_sys,int ncg_sys,
 +                                    const char *where)
 +{
 +    int  nerr,ngl,i,a,cell;
 +    int  *have;
 +
 +    nerr = 0;
 +
 +    if (dd->comm->DD_debug > 1)
 +    {
 +        snew(have,natoms_sys);
 +        for(a=0; a<dd->nat_tot; a++)
 +        {
 +            if (have[dd->gatindex[a]] > 0)
 +            {
 +                fprintf(stderr,"DD node %d: global atom %d occurs twice: index %d and %d\n",dd->rank,dd->gatindex[a]+1,have[dd->gatindex[a]],a+1);
 +            }
 +            else
 +            {
 +                have[dd->gatindex[a]] = a + 1;
 +            }
 +        }
 +        sfree(have);
 +    }
 +
 +    snew(have,dd->nat_tot);
 +
 +    ngl  = 0;
 +    for(i=0; i<natoms_sys; i++)
 +    {
 +        if (ga2la_get(dd->ga2la,i,&a,&cell))
 +        {
 +            if (a >= dd->nat_tot)
 +            {
 +                fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n",dd->rank,i+1,a+1,dd->nat_tot);
 +                nerr++;
 +            }
 +            else
 +            {
 +                have[a] = 1;
 +                if (dd->gatindex[a] != i)
 +                {
 +                    fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n",dd->rank,i+1,a+1,dd->gatindex[a]+1);
 +                    nerr++;
 +                }
 +            }
 +            ngl++;
 +        }
 +    }
 +    if (ngl != dd->nat_tot)
 +    {
 +        fprintf(stderr,
 +                "DD node %d, %s: %d global atom indices, %d local atoms\n",
 +                dd->rank,where,ngl,dd->nat_tot);
 +    }
 +    for(a=0; a<dd->nat_tot; a++)
 +    {
 +        if (have[a] == 0)
 +        {
 +            fprintf(stderr,
 +                    "DD node %d, %s: local atom %d, global %d has no global index\n",
 +                    dd->rank,where,a+1,dd->gatindex[a]+1);
 +        }
 +    }
 +    sfree(have);
 +
 +    nerr += check_bLocalCG(dd,ncg_sys,dd->comm->bLocalCG,where);
 +
 +    if (nerr > 0) {
 +        gmx_fatal(FARGS,"DD node %d, %s: %d atom/cg index inconsistencies",
 +                  dd->rank,where,nerr);
 +    }
 +}
 +
 +static void clear_dd_indices(gmx_domdec_t *dd,int cg_start,int a_start)
 +{
 +    int  i;
 +    char *bLocalCG;
 +
 +    if (a_start == 0)
 +    {
 +        /* Clear the whole list without searching */
 +        ga2la_clear(dd->ga2la);
 +    }
 +    else
 +    {
 +        for(i=a_start; i<dd->nat_tot; i++)
 +        {
 +            ga2la_del(dd->ga2la,dd->gatindex[i]);
 +        }
 +    }
 +
 +    bLocalCG = dd->comm->bLocalCG;
 +    if (bLocalCG)
 +    {
 +        for(i=cg_start; i<dd->ncg_tot; i++)
 +        {
 +            bLocalCG[dd->index_gl[i]] = FALSE;
 +        }
 +    }
 +
 +    dd_clear_local_vsite_indices(dd);
 +    
 +    if (dd->constraints)
 +    {
 +        dd_clear_local_constraint_indices(dd);
 +    }
 +}
 +
 +static real grid_jump_limit(gmx_domdec_comm_t *comm,int dim_ind)
 +{
 +    real grid_jump_limit;
 +
 +    /* The distance between the boundaries of cells at distance
 +     * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
 +     * and by the fact that cells should not be shifted by more than
 +     * half their size, such that cg's only shift by one cell
 +     * at redecomposition.
 +     */
 +    grid_jump_limit = comm->cellsize_limit;
 +    if (!comm->bVacDLBNoLimit)
 +    {
 +        grid_jump_limit = max(grid_jump_limit,
 +                              comm->cutoff/comm->cd[dim_ind].np);
 +    }
 +
 +    return grid_jump_limit;
 +}
 +
 +static void check_grid_jump(gmx_large_int_t step,gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  d,dim;
 +    real limit,bfac;
 +    
 +    comm = dd->comm;
 +    
 +    for(d=1; d<dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        limit = grid_jump_limit(comm,d);
 +        bfac = ddbox->box_size[dim];
 +        if (ddbox->tric_dir[dim])
 +        {
 +            bfac *= ddbox->skew_fac[dim];
 +        }
 +        if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
 +            (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
 +        {
 +            char buf[22];
 +            gmx_fatal(FARGS,"Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d\n",
 +                      gmx_step_str(step,buf),
 +                      dim2char(dim),dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
 +        }
 +    }
 +}
 +
 +static int dd_load_count(gmx_domdec_comm_t *comm)
 +{
 +    return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
 +}
 +
 +static float dd_force_load(gmx_domdec_comm_t *comm)
 +{
 +    float load;
 +    
 +    if (comm->eFlop)
 +    {
 +        load = comm->flop;
 +        if (comm->eFlop > 1)
 +        {
 +            load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
 +        }
 +    } 
 +    else
 +    {
 +        load = comm->cycl[ddCyclF];
 +        if (comm->cycl_n[ddCyclF] > 1)
 +        {
 +            /* Subtract the maximum of the last n cycle counts
 +             * to get rid of possible high counts due to other soures,
 +             * for instance system activity, that would otherwise
 +             * affect the dynamic load balancing.
 +             */
 +            load -= comm->cycl_max[ddCyclF];
 +        }
 +    }
 +    
 +    return load;
 +}
 +
 +static void set_slb_pme_dim_f(gmx_domdec_t *dd,int dim,real **dim_f)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int i;
 +    
 +    comm = dd->comm;
 +    
 +    snew(*dim_f,dd->nc[dim]+1);
 +    (*dim_f)[0] = 0;
 +    for(i=1; i<dd->nc[dim]; i++)
 +    {
 +        if (comm->slb_frac[dim])
 +        {
 +            (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
 +        }
 +        else
 +        {
 +            (*dim_f)[i] = (real)i/(real)dd->nc[dim];
 +        }
 +    }
 +    (*dim_f)[dd->nc[dim]] = 1;
 +}
 +
 +static void init_ddpme(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,int dimind)
 +{
 +    int        pmeindex,slab,nso,i;
 +    ivec xyz;
 +    
 +    if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
 +    {
 +        ddpme->dim = YY;
 +    }
 +    else
 +    {
 +        ddpme->dim = dimind;
 +    }
 +    ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
 +    
 +    ddpme->nslab = (ddpme->dim == 0 ?
 +                    dd->comm->npmenodes_x :
 +                    dd->comm->npmenodes_y);
 +
 +    if (ddpme->nslab <= 1)
 +    {
 +        return;
 +    }
 +
 +    nso = dd->comm->npmenodes/ddpme->nslab;
 +    /* Determine for each PME slab the PP location range for dimension dim */
 +    snew(ddpme->pp_min,ddpme->nslab);
 +    snew(ddpme->pp_max,ddpme->nslab);
 +    for(slab=0; slab<ddpme->nslab; slab++) {
 +        ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
 +        ddpme->pp_max[slab] = 0;
 +    }
 +    for(i=0; i<dd->nnodes; i++) {
 +        ddindex2xyz(dd->nc,i,xyz);
 +        /* For y only use our y/z slab.
 +         * This assumes that the PME x grid size matches the DD grid size.
 +         */
 +        if (dimind == 0 || xyz[XX] == dd->ci[XX]) {
 +            pmeindex = ddindex2pmeindex(dd,i);
 +            if (dimind == 0) {
 +                slab = pmeindex/nso;
 +            } else {
 +                slab = pmeindex % ddpme->nslab;
 +            }
 +            ddpme->pp_min[slab] = min(ddpme->pp_min[slab],xyz[dimind]);
 +            ddpme->pp_max[slab] = max(ddpme->pp_max[slab],xyz[dimind]);
 +        }
 +    }
 +
 +    set_slb_pme_dim_f(dd,ddpme->dim,&ddpme->slb_dim_f);
 +}
 +
 +int dd_pme_maxshift_x(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->ddpme[0].dim == XX)
 +    {
 +        return dd->comm->ddpme[0].maxshift;
 +    }
 +    else
 +    {
 +        return 0;
 +    }
 +}
 +
 +int dd_pme_maxshift_y(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->ddpme[0].dim == YY)
 +    {
 +        return dd->comm->ddpme[0].maxshift;
 +    }
 +    else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
 +    {
 +        return dd->comm->ddpme[1].maxshift;
 +    }
 +    else
 +    {
 +        return 0;
 +    }
 +}
 +
 +static void set_pme_maxshift(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,
 +                             gmx_bool bUniform,gmx_ddbox_t *ddbox,real *cell_f)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  nc,ns,s;
 +    int  *xmin,*xmax;
 +    real range,pme_boundary;
 +    int  sh;
 +    
 +    comm = dd->comm;
 +    nc  = dd->nc[ddpme->dim];
 +    ns  = ddpme->nslab;
 +    
 +    if (!ddpme->dim_match)
 +    {
 +        /* PP decomposition is not along dim: the worst situation */
 +        sh = ns/2;
 +    }
 +    else if (ns <= 3 || (bUniform && ns == nc))
 +    {
 +        /* The optimal situation */
 +        sh = 1;
 +    }
 +    else
 +    {
 +        /* We need to check for all pme nodes which nodes they
 +         * could possibly need to communicate with.
 +         */
 +        xmin = ddpme->pp_min;
 +        xmax = ddpme->pp_max;
 +        /* Allow for atoms to be maximally 2/3 times the cut-off
 +         * out of their DD cell. This is a reasonable balance between
 +         * between performance and support for most charge-group/cut-off
 +         * combinations.
 +         */
 +        range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
 +        /* Avoid extra communication when we are exactly at a boundary */
 +        range *= 0.999;
 +        
 +        sh = 1;
 +        for(s=0; s<ns; s++)
 +        {
 +            /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
 +            pme_boundary = (real)s/ns;
 +            while (sh+1 < ns &&
 +                   ((s-(sh+1) >= 0 &&
 +                     cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
 +                    (s-(sh+1) <  0 &&
 +                     cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
 +            {
 +                sh++;
 +            }
 +            pme_boundary = (real)(s+1)/ns;
 +            while (sh+1 < ns &&
 +                   ((s+(sh+1) <  ns &&
 +                     cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
 +                    (s+(sh+1) >= ns &&
 +                     cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
 +            {
 +                sh++;
 +            }
 +        }
 +    }
 +    
 +    ddpme->maxshift = sh;
 +    
 +    if (debug)
 +    {
 +        fprintf(debug,"PME slab communication range for dim %d is %d\n",
 +                ddpme->dim,ddpme->maxshift);
 +    }
 +}
 +
 +static void check_box_size(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
 +{
 +    int d,dim;
 +    
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        if (dim < ddbox->nboundeddim &&
 +            ddbox->box_size[dim]*ddbox->skew_fac[dim] <
 +            dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
 +        {
 +            gmx_fatal(FARGS,"The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
 +                      dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
 +                      dd->nc[dim],dd->comm->cellsize_limit);
 +        }
 +    }
 +}
 +
 +static void set_dd_cell_sizes_slb(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
 +                                  gmx_bool bMaster,ivec npulse)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  d,j;
 +    rvec cellsize_min;
 +    real *cell_x,cell_dx,cellsize;
 +    
 +    comm = dd->comm;
 +    
 +    for(d=0; d<DIM; d++)
 +    {
 +        cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
 +        npulse[d] = 1;
 +        if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
 +        {
 +            /* Uniform grid */
 +            cell_dx = ddbox->box_size[d]/dd->nc[d];
 +            if (bMaster)
 +            {
 +                for(j=0; j<dd->nc[d]+1; j++)
 +                {
 +                    dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
 +                }
 +            }
 +            else
 +            {
 +                comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
 +                comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
 +            }
 +            cellsize = cell_dx*ddbox->skew_fac[d];
 +            while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
 +            {
 +                npulse[d]++;
 +            }
 +            cellsize_min[d] = cellsize;
 +        }
 +        else
 +        {
 +            /* Statically load balanced grid */
 +            /* Also when we are not doing a master distribution we determine
 +             * all cell borders in a loop to obtain identical values
 +             * to the master distribution case and to determine npulse.
 +             */
 +            if (bMaster)
 +            {
 +                cell_x = dd->ma->cell_x[d];
 +            }
 +            else
 +            {
 +                snew(cell_x,dd->nc[d]+1);
 +            }
 +            cell_x[0] = ddbox->box0[d];
 +            for(j=0; j<dd->nc[d]; j++)
 +            {
 +                cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
 +                cell_x[j+1] = cell_x[j] + cell_dx;
 +                cellsize = cell_dx*ddbox->skew_fac[d];
 +                while (cellsize*npulse[d] < comm->cutoff &&
 +                       npulse[d] < dd->nc[d]-1)
 +                {
 +                    npulse[d]++;
 +                }
 +                cellsize_min[d] = min(cellsize_min[d],cellsize);
 +            }
 +            if (!bMaster)
 +            {
 +                comm->cell_x0[d] = cell_x[dd->ci[d]];
 +                comm->cell_x1[d] = cell_x[dd->ci[d]+1];
 +                sfree(cell_x);
 +            }
 +        }
 +        /* The following limitation is to avoid that a cell would receive
 +         * some of its own home charge groups back over the periodic boundary.
 +         * Double charge groups cause trouble with the global indices.
 +         */
 +        if (d < ddbox->npbcdim &&
 +            dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
 +        {
 +            gmx_fatal_collective(FARGS,NULL,dd,
 +                                 "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
 +                                 dim2char(d),ddbox->box_size[d],ddbox->skew_fac[d],
 +                                 comm->cutoff,
 +                                 dd->nc[d],dd->nc[d],
 +                                 dd->nnodes > dd->nc[d] ? "cells" : "processors");
 +        }
 +    }
 +    
 +    if (!comm->bDynLoadBal)
 +    {
 +        copy_rvec(cellsize_min,comm->cellsize_min);
 +    }
 +   
 +    for(d=0; d<comm->npmedecompdim; d++)
 +    {
 +        set_pme_maxshift(dd,&comm->ddpme[d],
 +                         comm->slb_frac[dd->dim[d]]==NULL,ddbox,
 +                         comm->ddpme[d].slb_dim_f);
 +    }
 +}
 +
 +
 +static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
 +                                       int d,int dim,gmx_domdec_root_t *root,
 +                                       gmx_ddbox_t *ddbox,
 +                                       gmx_bool bUniform,gmx_large_int_t step, real cellsize_limit_f, int range[])
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  ncd,i,j,nmin,nmin_old;
 +    gmx_bool bLimLo,bLimHi;
 +    real *cell_size;
 +    real fac,halfway,cellsize_limit_f_i,region_size;
 +    gmx_bool bPBC,bLastHi=FALSE;
 +    int nrange[]={range[0],range[1]};
 +
 +    region_size= root->cell_f[range[1]]-root->cell_f[range[0]];  
 +
 +    comm = dd->comm;
 +
 +    ncd = dd->nc[dim];
 +
 +    bPBC = (dim < ddbox->npbcdim);
 +
 +    cell_size = root->buf_ncd;
 +
 +    if (debug) 
 +    {
 +        fprintf(debug,"enforce_limits: %d %d\n",range[0],range[1]);
 +    }
 +
 +    /* First we need to check if the scaling does not make cells
 +     * smaller than the smallest allowed size.
 +     * We need to do this iteratively, since if a cell is too small,
 +     * it needs to be enlarged, which makes all the other cells smaller,
 +     * which could in turn make another cell smaller than allowed.
 +     */
 +    for(i=range[0]; i<range[1]; i++)
 +    {
 +        root->bCellMin[i] = FALSE;
 +    }
 +    nmin = 0;
 +    do
 +    {
 +        nmin_old = nmin;
 +        /* We need the total for normalization */
 +        fac = 0;
 +        for(i=range[0]; i<range[1]; i++)
 +        {
 +            if (root->bCellMin[i] == FALSE)
 +            {
 +                fac += cell_size[i];
 +            }
 +        }
 +        fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */
 +        /* Determine the cell boundaries */
 +        for(i=range[0]; i<range[1]; i++)
 +        {
 +            if (root->bCellMin[i] == FALSE)
 +            {
 +                cell_size[i] *= fac;
 +                if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
 +                {
 +                    cellsize_limit_f_i = 0;
 +                }
 +                else
 +                {
 +                    cellsize_limit_f_i = cellsize_limit_f;
 +                }
 +                if (cell_size[i] < cellsize_limit_f_i)
 +                {
 +                    root->bCellMin[i] = TRUE;
 +                    cell_size[i] = cellsize_limit_f_i;
 +                    nmin++;
 +                }
 +            }
 +            root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
 +        }
 +    }
 +    while (nmin > nmin_old);
 +    
 +    i=range[1]-1;
 +    cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
 +    /* For this check we should not use DD_CELL_MARGIN,
 +     * but a slightly smaller factor,
 +     * since rounding could get use below the limit.
 +     */
 +    if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
 +    {
 +        char buf[22];
 +        gmx_fatal(FARGS,"Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
 +                  gmx_step_str(step,buf),
 +                  dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
 +                  ncd,comm->cellsize_min[dim]);
 +    }
 +    
 +    root->bLimited = (nmin > 0) || (range[0]>0) || (range[1]<ncd);
 +    
 +    if (!bUniform)
 +    {
 +        /* Check if the boundary did not displace more than halfway
 +         * each of the cells it bounds, as this could cause problems,
 +         * especially when the differences between cell sizes are large.
 +         * If changes are applied, they will not make cells smaller
 +         * than the cut-off, as we check all the boundaries which
 +         * might be affected by a change and if the old state was ok,
 +         * the cells will at most be shrunk back to their old size.
 +         */
 +        for(i=range[0]+1; i<range[1]; i++)
 +        {
 +            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
 +            if (root->cell_f[i] < halfway)
 +            {
 +                root->cell_f[i] = halfway;
 +                /* Check if the change also causes shifts of the next boundaries */
 +                for(j=i+1; j<range[1]; j++)
 +                {
 +                    if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
 +                        root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
 +                }
 +            }
 +            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
 +            if (root->cell_f[i] > halfway)
 +            {
 +                root->cell_f[i] = halfway;
 +                /* Check if the change also causes shifts of the next boundaries */
 +                for(j=i-1; j>=range[0]+1; j--)
 +                {
 +                    if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
 +                        root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
 +                }
 +            }
 +        }
 +    }
 +    
 +    /* nrange is defined as [lower, upper) range for new call to enforce_limits */
 +    /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
 +     * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
 +     * for a and b nrange is used */
 +    if (d > 0)
 +    {
 +        /* Take care of the staggering of the cell boundaries */
 +        if (bUniform)
 +        {
 +            for(i=range[0]; i<range[1]; i++)
 +            {
 +                root->cell_f_max0[i] = root->cell_f[i];
 +                root->cell_f_min1[i] = root->cell_f[i+1];
 +            }
 +        }
 +        else
 +        {
 +            for(i=range[0]+1; i<range[1]; i++)
 +            {
 +                bLimLo = (root->cell_f[i] < root->bound_min[i]);
 +                bLimHi = (root->cell_f[i] > root->bound_max[i]);
 +                if (bLimLo && bLimHi)
 +                {
 +                    /* Both limits violated, try the best we can */
 +                    /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */
 +                    root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
 +                    nrange[0]=range[0];
 +                    nrange[1]=i;
 +                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +
 +                    nrange[0]=i;
 +                    nrange[1]=range[1];
 +                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +
 +                    return;
 +                }
 +                else if (bLimLo)
 +                {
 +                    /* root->cell_f[i] = root->bound_min[i]; */
 +                    nrange[1]=i;  /* only store violation location. There could be a LimLo violation following with an higher index */
 +                    bLastHi=FALSE;
 +                }
 +                else if (bLimHi && !bLastHi)
 +                {
 +                    bLastHi=TRUE;
 +                    if (nrange[1] < range[1])   /* found a LimLo before */
 +                    {
 +                        root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
 +                        dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +                        nrange[0]=nrange[1];
 +                    }
 +                    root->cell_f[i] = root->bound_max[i];
 +                    nrange[1]=i; 
 +                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +                    nrange[0]=i;
 +                    nrange[1]=range[1];
 +                }
 +            }
 +            if (nrange[1] < range[1])   /* found last a LimLo */
 +            {
 +                root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
 +                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +                nrange[0]=nrange[1];
 +                nrange[1]=range[1];
 +                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +            } 
 +            else if (nrange[0] > range[0]) /* found at least one LimHi */
 +            {
 +                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 +            }
 +        }
 +    }
 +}
 +
 +
 +static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
 +                                       int d,int dim,gmx_domdec_root_t *root,
 +                                       gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
 +                                       gmx_bool bUniform,gmx_large_int_t step)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  ncd,d1,i,j,pos;
 +    real *cell_size;
 +    real load_aver,load_i,imbalance,change,change_max,sc;
 +    real cellsize_limit_f,dist_min_f,dist_min_f_hard,space;
 +    real change_limit;
 +    real relax = 0.5;
 +    gmx_bool bPBC;
 +    int range[] = { 0, 0 };
 +
 +    comm = dd->comm;
 +
 +    /* Convert the maximum change from the input percentage to a fraction */
 +    change_limit = comm->dlb_scale_lim*0.01;
 +
 +    ncd = dd->nc[dim];
 +
 +    bPBC = (dim < ddbox->npbcdim);
 +
 +    cell_size = root->buf_ncd;
 +
 +    /* Store the original boundaries */
 +    for(i=0; i<ncd+1; i++)
 +    {
 +        root->old_cell_f[i] = root->cell_f[i];
 +    }
 +    if (bUniform) {
 +        for(i=0; i<ncd; i++)
 +        {
 +            cell_size[i] = 1.0/ncd;
 +        }
 +    }
 +    else if (dd_load_count(comm))
 +    {
 +        load_aver = comm->load[d].sum_m/ncd;
 +        change_max = 0;
 +        for(i=0; i<ncd; i++)
 +        {
 +            /* Determine the relative imbalance of cell i */
 +            load_i = comm->load[d].load[i*comm->load[d].nload+2];
 +            imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
 +            /* Determine the change of the cell size using underrelaxation */
 +            change = -relax*imbalance;
 +            change_max = max(change_max,max(change,-change));
 +        }
 +        /* Limit the amount of scaling.
 +         * We need to use the same rescaling for all cells in one row,
 +         * otherwise the load balancing might not converge.
 +         */
 +        sc = relax;
 +        if (change_max > change_limit)
 +        {
 +            sc *= change_limit/change_max;
 +        }
 +        for(i=0; i<ncd; i++)
 +        {
 +            /* Determine the relative imbalance of cell i */
 +            load_i = comm->load[d].load[i*comm->load[d].nload+2];
 +            imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
 +            /* Determine the change of the cell size using underrelaxation */
 +            change = -sc*imbalance;
 +            cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
 +        }
 +    }
 +    
 +    cellsize_limit_f  = comm->cellsize_min[dim]/ddbox->box_size[dim];
 +    cellsize_limit_f *= DD_CELL_MARGIN;
 +    dist_min_f_hard        = grid_jump_limit(comm,d)/ddbox->box_size[dim];
 +    dist_min_f       = dist_min_f_hard * DD_CELL_MARGIN;
 +    if (ddbox->tric_dir[dim])
 +    {
 +        cellsize_limit_f /= ddbox->skew_fac[dim];
 +        dist_min_f       /= ddbox->skew_fac[dim];
 +    }
 +    if (bDynamicBox && d > 0)
 +    {
 +        dist_min_f *= DD_PRES_SCALE_MARGIN;
 +    }
 +    if (d > 0 && !bUniform)
 +    {
 +        /* Make sure that the grid is not shifted too much */
 +        for(i=1; i<ncd; i++) {
 +            if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard) 
 +            {
 +                gmx_incons("Inconsistent DD boundary staggering limits!");
 +            }
 +            root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
 +            space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
 +            if (space > 0) {
 +                root->bound_min[i] += 0.5*space;
 +            }
 +            root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
 +            space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
 +            if (space < 0) {
 +                root->bound_max[i] += 0.5*space;
 +            }
 +            if (debug)
 +            {
 +                fprintf(debug,
 +                        "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
 +                        d,i,
 +                        root->cell_f_max0[i-1] + dist_min_f,
 +                        root->bound_min[i],root->cell_f[i],root->bound_max[i],
 +                        root->cell_f_min1[i] - dist_min_f);
 +            }
 +        }
 +    }
 +    range[1]=ncd;
 +    root->cell_f[0] = 0;
 +    root->cell_f[ncd] = 1;
 +    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
 +
 +
 +    /* After the checks above, the cells should obey the cut-off
 +     * restrictions, but it does not hurt to check.
 +     */
 +    for(i=0; i<ncd; i++)
 +    {
 +        if (debug)
 +        {
 +            fprintf(debug,"Relative bounds dim %d  cell %d: %f %f\n",
 +                    dim,i,root->cell_f[i],root->cell_f[i+1]);
 +        }
 +
 +        if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
 +            root->cell_f[i+1] - root->cell_f[i] <
 +            cellsize_limit_f/DD_CELL_MARGIN)
 +        {
 +            char buf[22];
 +            fprintf(stderr,
 +                    "\nWARNING step %s: direction %c, cell %d too small: %f\n",
 +                    gmx_step_str(step,buf),dim2char(dim),i,
 +                    (root->cell_f[i+1] - root->cell_f[i])
 +                    *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
 +        }
 +    }
 +    
 +    pos = ncd + 1;
 +    /* Store the cell boundaries of the lower dimensions at the end */
 +    for(d1=0; d1<d; d1++)
 +    {
 +        root->cell_f[pos++] = comm->cell_f0[d1];
 +        root->cell_f[pos++] = comm->cell_f1[d1];
 +    }
 +    
 +    if (d < comm->npmedecompdim)
 +    {
 +        /* The master determines the maximum shift for
 +         * the coordinate communication between separate PME nodes.
 +         */
 +        set_pme_maxshift(dd,&comm->ddpme[d],bUniform,ddbox,root->cell_f);
 +    }
 +    root->cell_f[pos++] = comm->ddpme[0].maxshift;
 +    if (d >= 1)
 +    {
 +        root->cell_f[pos++] = comm->ddpme[1].maxshift;
 +    }
 +}    
 +
 +static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
 +                                             gmx_ddbox_t *ddbox,int dimind)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int dim;
 +
 +    comm = dd->comm;
 +
 +    /* Set the cell dimensions */
 +    dim = dd->dim[dimind];
 +    comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
 +    comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
 +    if (dim >= ddbox->nboundeddim)
 +    {
 +        comm->cell_x0[dim] += ddbox->box0[dim];
 +        comm->cell_x1[dim] += ddbox->box0[dim];
 +    }
 +}
 +
 +static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
 +                                         int d,int dim,real *cell_f_row,
 +                                         gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int d1,dim1,pos;
 +
 +    comm = dd->comm;
 +
 +#ifdef GMX_MPI
 +    /* Each node would only need to know two fractions,
 +     * but it is probably cheaper to broadcast the whole array.
 +     */
 +    MPI_Bcast(cell_f_row,DD_CELL_F_SIZE(dd,d)*sizeof(real),MPI_BYTE,
 +              0,comm->mpi_comm_load[d]);
 +#endif
 +    /* Copy the fractions for this dimension from the buffer */
 +    comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
 +    comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
 +    /* The whole array was communicated, so set the buffer position */
 +    pos = dd->nc[dim] + 1;
 +    for(d1=0; d1<=d; d1++)
 +    {
 +        if (d1 < d)
 +        {
 +            /* Copy the cell fractions of the lower dimensions */
 +            comm->cell_f0[d1] = cell_f_row[pos++];
 +            comm->cell_f1[d1] = cell_f_row[pos++];
 +        }
 +        relative_to_absolute_cell_bounds(dd,ddbox,d1);
 +    }
 +    /* Convert the communicated shift from float to int */
 +    comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
 +    if (d >= 1)
 +    {
 +        comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
 +    }
 +}
 +
 +static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
 +                                         gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
 +                                         gmx_bool bUniform,gmx_large_int_t step)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int d,dim,d1;
 +    gmx_bool bRowMember,bRowRoot;
 +    real *cell_f_row;
 +    
 +    comm = dd->comm;
 +
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        bRowMember = TRUE;
 +        bRowRoot = TRUE;
 +        for(d1=d; d1<dd->ndim; d1++)
 +        {
 +            if (dd->ci[dd->dim[d1]] > 0)
 +            {
 +                if (d1 > d)
 +                {
 +                    bRowMember = FALSE;
 +                }
 +                bRowRoot = FALSE;
 +            }
 +        }
 +        if (bRowMember)
 +        {
 +            if (bRowRoot)
 +            {
 +                set_dd_cell_sizes_dlb_root(dd,d,dim,comm->root[d],
 +                                           ddbox,bDynamicBox,bUniform,step);
 +                cell_f_row = comm->root[d]->cell_f;
 +            }
 +            else
 +            {
 +                cell_f_row = comm->cell_f_row;
 +            }
 +            distribute_dd_cell_sizes_dlb(dd,d,dim,cell_f_row,ddbox);
 +        }
 +    }
 +}    
 +
 +static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
 +{
 +    int d;
 +
 +    /* This function assumes the box is static and should therefore
 +     * not be called when the box has changed since the last
 +     * call to dd_partition_system.
 +     */
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        relative_to_absolute_cell_bounds(dd,ddbox,d); 
 +    }
 +}
 +
 +
 +
 +static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
 +                                  gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
 +                                  gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
 +                                  gmx_wallcycle_t wcycle)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int dim;
 +
 +    comm = dd->comm;
 +    
 +    if (bDoDLB)
 +    {
 +        wallcycle_start(wcycle,ewcDDCOMMBOUND);
 +        set_dd_cell_sizes_dlb_change(dd,ddbox,bDynamicBox,bUniform,step);
 +        wallcycle_stop(wcycle,ewcDDCOMMBOUND);
 +    }
 +    else if (bDynamicBox)
 +    {
 +        set_dd_cell_sizes_dlb_nochange(dd,ddbox);
 +    }
 +    
 +    /* Set the dimensions for which no DD is used */
 +    for(dim=0; dim<DIM; dim++) {
 +        if (dd->nc[dim] == 1) {
 +            comm->cell_x0[dim] = 0;
 +            comm->cell_x1[dim] = ddbox->box_size[dim];
 +            if (dim >= ddbox->nboundeddim)
 +            {
 +                comm->cell_x0[dim] += ddbox->box0[dim];
 +                comm->cell_x1[dim] += ddbox->box0[dim];
 +            }
 +        }
 +    }
 +}
 +
 +static void realloc_comm_ind(gmx_domdec_t *dd,ivec npulse)
 +{
 +    int d,np,i;
 +    gmx_domdec_comm_dim_t *cd;
 +    
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        cd = &dd->comm->cd[d];
 +        np = npulse[dd->dim[d]];
 +        if (np > cd->np_nalloc)
 +        {
 +            if (debug)
 +            {
 +                fprintf(debug,"(Re)allocing cd for %c to %d pulses\n",
 +                        dim2char(dd->dim[d]),np);
 +            }
 +            if (DDMASTER(dd) && cd->np_nalloc > 0)
 +            {
 +                fprintf(stderr,"\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n",dim2char(dd->dim[d]),np);
 +            }
 +            srenew(cd->ind,np);
 +            for(i=cd->np_nalloc; i<np; i++)
 +            {
 +                cd->ind[i].index  = NULL;
 +                cd->ind[i].nalloc = 0;
 +            }
 +            cd->np_nalloc = np;
 +        }
 +        cd->np = np;
 +    }
 +}
 +
 +
 +static void set_dd_cell_sizes(gmx_domdec_t *dd,
 +                              gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
 +                              gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
 +                              gmx_wallcycle_t wcycle)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  d;
 +    ivec npulse;
 +    
 +    comm = dd->comm;
 +
 +    /* Copy the old cell boundaries for the cg displacement check */
 +    copy_rvec(comm->cell_x0,comm->old_cell_x0);
 +    copy_rvec(comm->cell_x1,comm->old_cell_x1);
 +    
 +    if (comm->bDynLoadBal)
 +    {
 +        if (DDMASTER(dd))
 +        {
 +            check_box_size(dd,ddbox);
 +        }
 +        set_dd_cell_sizes_dlb(dd,ddbox,bDynamicBox,bUniform,bDoDLB,step,wcycle);
 +    }
 +    else
 +    {
 +        set_dd_cell_sizes_slb(dd,ddbox,FALSE,npulse);
 +        realloc_comm_ind(dd,npulse);
 +    }
 +    
 +    if (debug)
 +    {
 +        for(d=0; d<DIM; d++)
 +        {
 +            fprintf(debug,"cell_x[%d] %f - %f skew_fac %f\n",
 +                    d,comm->cell_x0[d],comm->cell_x1[d],ddbox->skew_fac[d]);
 +        }
 +    }
 +}
 +
 +static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
 +                                  gmx_ddbox_t *ddbox,
 +                                  rvec cell_ns_x0,rvec cell_ns_x1,
 +                                  gmx_large_int_t step)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int dim_ind,dim;
 +    
 +    comm = dd->comm;
 +
 +    for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
 +    {
 +        dim = dd->dim[dim_ind];
 +        
 +        /* Without PBC we don't have restrictions on the outer cells */
 +        if (!(dim >= ddbox->npbcdim && 
 +              (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
 +            comm->bDynLoadBal &&
 +            (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
 +            comm->cellsize_min[dim])
 +        {
 +            char buf[22];
 +            gmx_fatal(FARGS,"Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
 +                      gmx_step_str(step,buf),dim2char(dim),
 +                      comm->cell_x1[dim] - comm->cell_x0[dim],
 +                      ddbox->skew_fac[dim],
 +                      dd->comm->cellsize_min[dim],
 +                      dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
 +        }
 +    }
 +    
 +    if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
 +    {
 +        /* Communicate the boundaries and update cell_ns_x0/1 */
 +        dd_move_cellx(dd,ddbox,cell_ns_x0,cell_ns_x1);
 +        if (dd->bGridJump && dd->ndim > 1)
 +        {
 +            check_grid_jump(step,dd,ddbox);
 +        }
 +    }
 +}
 +
 +static void make_tric_corr_matrix(int npbcdim,matrix box,matrix tcm)
 +{
 +    if (YY < npbcdim)
 +    {
 +        tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
 +    }
 +    else
 +    {
 +        tcm[YY][XX] = 0;
 +    }
 +    if (ZZ < npbcdim)
 +    {
 +        tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
 +        tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
 +    }
 +    else
 +    {
 +        tcm[ZZ][XX] = 0;
 +        tcm[ZZ][YY] = 0;
 +    }
 +}
 +
 +static void check_screw_box(matrix box)
 +{
 +    /* Mathematical limitation */
 +    if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
 +    {
 +        gmx_fatal(FARGS,"With screw pbc the unit cell can not have non-zero off-diagonal x-components");
 +    }
 +    
 +    /* Limitation due to the asymmetry of the eighth shell method */
 +    if (box[ZZ][YY] != 0)
 +    {
 +        gmx_fatal(FARGS,"pbc=screw with non-zero box_zy is not supported");
 +    }
 +}
 +
 +static void distribute_cg(FILE *fplog,gmx_large_int_t step,
 +                          matrix box,ivec tric_dir,t_block *cgs,rvec pos[],
 +                          gmx_domdec_t *dd)
 +{
 +    gmx_domdec_master_t *ma;
 +    int **tmp_ind=NULL,*tmp_nalloc=NULL;
 +    int  i,icg,j,k,k0,k1,d,npbcdim;
 +    matrix tcm;
 +    rvec box_size,cg_cm;
 +    ivec ind;
 +    real nrcg,inv_ncg,pos_d;
 +    atom_id *cgindex;
 +    gmx_bool bUnbounded,bScrew;
 +
 +    ma = dd->ma;
 +    
 +    if (tmp_ind == NULL)
 +    {
 +        snew(tmp_nalloc,dd->nnodes);
 +        snew(tmp_ind,dd->nnodes);
 +        for(i=0; i<dd->nnodes; i++)
 +        {
 +            tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
 +            snew(tmp_ind[i],tmp_nalloc[i]);
 +        }
 +    }
 +    
 +    /* Clear the count */
 +    for(i=0; i<dd->nnodes; i++)
 +    {
 +        ma->ncg[i] = 0;
 +        ma->nat[i] = 0;
 +    }
 +    
 +    make_tric_corr_matrix(dd->npbcdim,box,tcm);
 +    
 +    cgindex = cgs->index;
 +    
 +    /* Compute the center of geometry for all charge groups */
 +    for(icg=0; icg<cgs->nr; icg++)
 +    {
 +        k0      = cgindex[icg];
 +        k1      = cgindex[icg+1];
 +        nrcg    = k1 - k0;
 +        if (nrcg == 1)
 +        {
 +            copy_rvec(pos[k0],cg_cm);
 +        }
 +        else
 +        {
 +            inv_ncg = 1.0/nrcg;
 +            
 +            clear_rvec(cg_cm);
 +            for(k=k0; (k<k1); k++)
 +            {
 +                rvec_inc(cg_cm,pos[k]);
 +            }
 +            for(d=0; (d<DIM); d++)
 +            {
 +                cg_cm[d] *= inv_ncg;
 +            }
 +        }
 +        /* Put the charge group in the box and determine the cell index */
 +        for(d=DIM-1; d>=0; d--) {
 +            pos_d = cg_cm[d];
 +            if (d < dd->npbcdim)
 +            {
 +                bScrew = (dd->bScrewPBC && d == XX);
 +                if (tric_dir[d] && dd->nc[d] > 1)
 +                {
 +                    /* Use triclinic coordintates for this dimension */
 +                    for(j=d+1; j<DIM; j++)
 +                    {
 +                        pos_d += cg_cm[j]*tcm[j][d];
 +                    }
 +                }
 +                while(pos_d >= box[d][d])
 +                {
 +                    pos_d -= box[d][d];
 +                    rvec_dec(cg_cm,box[d]);
 +                    if (bScrew)
 +                    {
 +                        cg_cm[YY] = box[YY][YY] - cg_cm[YY];
 +                        cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
 +                    }
 +                    for(k=k0; (k<k1); k++)
 +                    {
 +                        rvec_dec(pos[k],box[d]);
 +                        if (bScrew)
 +                        {
 +                            pos[k][YY] = box[YY][YY] - pos[k][YY];
 +                            pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
 +                        }
 +                    }
 +                }
 +                while(pos_d < 0)
 +                {
 +                    pos_d += box[d][d];
 +                    rvec_inc(cg_cm,box[d]);
 +                    if (bScrew)
 +                    {
 +                        cg_cm[YY] = box[YY][YY] - cg_cm[YY];
 +                        cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
 +                    }
 +                    for(k=k0; (k<k1); k++)
 +                    {
 +                        rvec_inc(pos[k],box[d]);
 +                        if (bScrew) {
 +                            pos[k][YY] = box[YY][YY] - pos[k][YY];
 +                            pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
 +                        }
 +                    }
 +                }
 +            }
 +            /* This could be done more efficiently */
 +            ind[d] = 0;
 +            while(ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
 +            {
 +                ind[d]++;
 +            }
 +        }
 +        i = dd_index(dd->nc,ind);
 +        if (ma->ncg[i] == tmp_nalloc[i])
 +        {
 +            tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
 +            srenew(tmp_ind[i],tmp_nalloc[i]);
 +        }
 +        tmp_ind[i][ma->ncg[i]] = icg;
 +        ma->ncg[i]++;
 +        ma->nat[i] += cgindex[icg+1] - cgindex[icg];
 +    }
 +    
 +    k1 = 0;
 +    for(i=0; i<dd->nnodes; i++)
 +    {
 +        ma->index[i] = k1;
 +        for(k=0; k<ma->ncg[i]; k++)
 +        {
 +            ma->cg[k1++] = tmp_ind[i][k];
 +        }
 +    }
 +    ma->index[dd->nnodes] = k1;
 +    
 +    for(i=0; i<dd->nnodes; i++)
 +    {
 +        sfree(tmp_ind[i]);
 +    }
 +    sfree(tmp_ind);
 +    sfree(tmp_nalloc);
 +    
 +    if (fplog)
 +    {
 +        char buf[22];
 +        fprintf(fplog,"Charge group distribution at step %s:",
 +                gmx_step_str(step,buf));
 +        for(i=0; i<dd->nnodes; i++)
 +        {
 +            fprintf(fplog," %d",ma->ncg[i]);
 +        }
 +        fprintf(fplog,"\n");
 +    }
 +}
 +
 +static void get_cg_distribution(FILE *fplog,gmx_large_int_t step,gmx_domdec_t *dd,
 +                                t_block *cgs,matrix box,gmx_ddbox_t *ddbox,
 +                                rvec pos[])
 +{
 +    gmx_domdec_master_t *ma=NULL;
 +    ivec npulse;
 +    int  i,cg_gl;
 +    int  *ibuf,buf2[2] = { 0, 0 };
-     if (DDMASTER(dd))
++    gmx_bool bMaster = DDMASTER(dd);
++    if (bMaster)
 +    {
 +        ma = dd->ma;
 +        
 +        if (dd->bScrewPBC)
 +        {
 +            check_screw_box(box);
 +        }
 +    
 +        set_dd_cell_sizes_slb(dd,ddbox,TRUE,npulse);
 +    
 +        distribute_cg(fplog,step,box,ddbox->tric_dir,cgs,pos,dd);
 +        for(i=0; i<dd->nnodes; i++)
 +        {
 +            ma->ibuf[2*i]   = ma->ncg[i];
 +            ma->ibuf[2*i+1] = ma->nat[i];
 +        }
 +        ibuf = ma->ibuf;
 +    }
 +    else
 +    {
 +        ibuf = NULL;
 +    }
 +    dd_scatter(dd,2*sizeof(int),ibuf,buf2);
 +    
 +    dd->ncg_home = buf2[0];
 +    dd->nat_home = buf2[1];
 +    dd->ncg_tot  = dd->ncg_home;
 +    dd->nat_tot  = dd->nat_home;
 +    if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
 +    {
 +        dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
 +        srenew(dd->index_gl,dd->cg_nalloc);
 +        srenew(dd->cgindex,dd->cg_nalloc+1);
 +    }
++    if (bMaster)
 +    {
 +        for(i=0; i<dd->nnodes; i++)
 +        {
 +            ma->ibuf[i] = ma->ncg[i]*sizeof(int);
 +            ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
 +        }
 +    }
 +    
 +    dd_scatterv(dd,
 +                DDMASTER(dd) ? ma->ibuf : NULL,
 +                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
 +                DDMASTER(dd) ? ma->cg : NULL,
 +                dd->ncg_home*sizeof(int),dd->index_gl);
 +    
 +    /* Determine the home charge group sizes */
 +    dd->cgindex[0] = 0;
 +    for(i=0; i<dd->ncg_home; i++)
 +    {
 +        cg_gl = dd->index_gl[i];
 +        dd->cgindex[i+1] =
 +            dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
 +    }
 +    
 +    if (debug)
 +    {
 +        fprintf(debug,"Home charge groups:\n");
 +        for(i=0; i<dd->ncg_home; i++)
 +        {
 +            fprintf(debug," %d",dd->index_gl[i]);
 +            if (i % 10 == 9) 
 +                fprintf(debug,"\n");
 +        }
 +        fprintf(debug,"\n");
 +    }
 +}
 +
 +static int compact_and_copy_vec_at(int ncg,int *move,
 +                                   int *cgindex,
 +                                   int nvec,int vec,
 +                                   rvec *src,gmx_domdec_comm_t *comm,
 +                                   gmx_bool bCompact)
 +{
 +    int m,icg,i,i0,i1,nrcg;
 +    int home_pos;
 +    int pos_vec[DIM*2];
 +    
 +    home_pos = 0;
 +
 +    for(m=0; m<DIM*2; m++)
 +    {
 +        pos_vec[m] = 0;
 +    }
 +    
 +    i0 = 0;
 +    for(icg=0; icg<ncg; icg++)
 +    {
 +        i1 = cgindex[icg+1];
 +        m = move[icg];
 +        if (m == -1)
 +        {
 +            if (bCompact)
 +            {
 +                /* Compact the home array in place */
 +                for(i=i0; i<i1; i++)
 +                {
 +                    copy_rvec(src[i],src[home_pos++]);
 +                }
 +            }
 +        }
 +        else
 +        {
 +            /* Copy to the communication buffer */
 +            nrcg = i1 - i0;
 +            pos_vec[m] += 1 + vec*nrcg;
 +            for(i=i0; i<i1; i++)
 +            {
 +                copy_rvec(src[i],comm->cgcm_state[m][pos_vec[m]++]);
 +            }
 +            pos_vec[m] += (nvec - vec - 1)*nrcg;
 +        }
 +        if (!bCompact)
 +        {
 +            home_pos += i1 - i0;
 +        }
 +        i0 = i1;
 +    }
 +    
 +    return home_pos;
 +}
 +
 +static int compact_and_copy_vec_cg(int ncg,int *move,
 +                                   int *cgindex,
 +                                   int nvec,rvec *src,gmx_domdec_comm_t *comm,
 +                                   gmx_bool bCompact)
 +{
 +    int m,icg,i0,i1,nrcg;
 +    int home_pos;
 +    int pos_vec[DIM*2];
 +    
 +    home_pos = 0;
 +    
 +    for(m=0; m<DIM*2; m++)
 +    {
 +        pos_vec[m] = 0;
 +    }
 +    
 +    i0 = 0;
 +    for(icg=0; icg<ncg; icg++)
 +    {
 +        i1 = cgindex[icg+1];
 +        m = move[icg];
 +        if (m == -1)
 +        {
 +            if (bCompact)
 +            {
 +                /* Compact the home array in place */
 +                copy_rvec(src[icg],src[home_pos++]);
 +            }
 +        }
 +        else
 +        {
 +            nrcg = i1 - i0;
 +            /* Copy to the communication buffer */
 +            copy_rvec(src[icg],comm->cgcm_state[m][pos_vec[m]]);
 +            pos_vec[m] += 1 + nrcg*nvec;
 +        }
 +        i0 = i1;
 +    }
 +    if (!bCompact)
 +    {
 +        home_pos = ncg;
 +    }
 +    
 +    return home_pos;
 +}
 +
 +static int compact_ind(int ncg,int *move,
 +                       int *index_gl,int *cgindex,
 +                       int *gatindex,
 +                       gmx_ga2la_t ga2la,char *bLocalCG,
 +                       int *cginfo)
 +{
 +    int cg,nat,a0,a1,a,a_gl;
 +    int home_pos;
 +
 +    home_pos = 0;
 +    nat = 0;
 +    for(cg=0; cg<ncg; cg++)
 +    {
 +        a0 = cgindex[cg];
 +        a1 = cgindex[cg+1];
 +        if (move[cg] == -1)
 +        {
 +            /* Compact the home arrays in place.
 +             * Anything that can be done here avoids access to global arrays.
 +             */
 +            cgindex[home_pos] = nat;
 +            for(a=a0; a<a1; a++)
 +            {
 +                a_gl = gatindex[a];
 +                gatindex[nat] = a_gl;
 +                /* The cell number stays 0, so we don't need to set it */
 +                ga2la_change_la(ga2la,a_gl,nat);
 +                nat++;
 +            }
 +            index_gl[home_pos] = index_gl[cg];
 +            cginfo[home_pos]   = cginfo[cg];
 +            /* The charge group remains local, so bLocalCG does not change */
 +            home_pos++;
 +        }
 +        else
 +        {
 +            /* Clear the global indices */
 +            for(a=a0; a<a1; a++)
 +            {
 +                ga2la_del(ga2la,gatindex[a]);
 +            }
 +            if (bLocalCG)
 +            {
 +                bLocalCG[index_gl[cg]] = FALSE;
 +            }
 +        }
 +    }
 +    cgindex[home_pos] = nat;
 +    
 +    return home_pos;
 +}
 +
 +static void clear_and_mark_ind(int ncg,int *move,
 +                               int *index_gl,int *cgindex,int *gatindex,
 +                               gmx_ga2la_t ga2la,char *bLocalCG,
 +                               int *cell_index)
 +{
 +    int cg,a0,a1,a;
 +    
 +    for(cg=0; cg<ncg; cg++)
 +    {
 +        if (move[cg] >= 0)
 +        {
 +            a0 = cgindex[cg];
 +            a1 = cgindex[cg+1];
 +            /* Clear the global indices */
 +            for(a=a0; a<a1; a++)
 +            {
 +                ga2la_del(ga2la,gatindex[a]);
 +            }
 +            if (bLocalCG)
 +            {
 +                bLocalCG[index_gl[cg]] = FALSE;
 +            }
 +            /* Signal that this cg has moved using the ns cell index.
 +             * Here we set it to -1.
 +             * fill_grid will change it from -1 to 4*grid->ncells.
 +             */
 +            cell_index[cg] = -1;
 +        }
 +    }
 +}
 +
 +static void print_cg_move(FILE *fplog,
 +                          gmx_domdec_t *dd,
 +                          gmx_large_int_t step,int cg,int dim,int dir,
 +                          gmx_bool bHaveLimitdAndCMOld,real limitd,
 +                          rvec cm_old,rvec cm_new,real pos_d)
 +{
 +    gmx_domdec_comm_t *comm;
 +    char buf[22];
 +
 +    comm = dd->comm;
 +
 +    fprintf(fplog,"\nStep %s:\n",gmx_step_str(step,buf));
 +    if (bHaveLimitdAndCMOld)
 +    {
 +        fprintf(fplog,"The charge group starting at atom %d moved than the distance allowed by the domain decomposition (%f) in direction %c\n",
 +                ddglatnr(dd,dd->cgindex[cg]),limitd,dim2char(dim));
 +    }
 +    else
 +    {
 +        fprintf(fplog,"The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
 +                ddglatnr(dd,dd->cgindex[cg]),dim2char(dim));
 +    }
 +    fprintf(fplog,"distance out of cell %f\n",
 +            dir==1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
 +    if (bHaveLimitdAndCMOld)
 +    {
 +        fprintf(fplog,"Old coordinates: %8.3f %8.3f %8.3f\n",
 +                cm_old[XX],cm_old[YY],cm_old[ZZ]);
 +    }
 +    fprintf(fplog,"New coordinates: %8.3f %8.3f %8.3f\n",
 +            cm_new[XX],cm_new[YY],cm_new[ZZ]);
 +    fprintf(fplog,"Old cell boundaries in direction %c: %8.3f %8.3f\n",
 +            dim2char(dim),
 +            comm->old_cell_x0[dim],comm->old_cell_x1[dim]);
 +    fprintf(fplog,"New cell boundaries in direction %c: %8.3f %8.3f\n",
 +            dim2char(dim),
 +            comm->cell_x0[dim],comm->cell_x1[dim]);
 +}
 +
 +static void cg_move_error(FILE *fplog,
 +                          gmx_domdec_t *dd,
 +                          gmx_large_int_t step,int cg,int dim,int dir,
 +                          gmx_bool bHaveLimitdAndCMOld,real limitd,
 +                          rvec cm_old,rvec cm_new,real pos_d)
 +{
 +    if (fplog)
 +    {
 +        print_cg_move(fplog, dd,step,cg,dim,dir,
 +                      bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
 +    }
 +    print_cg_move(stderr,dd,step,cg,dim,dir,
 +                  bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
 +    gmx_fatal(FARGS,
 +              "A charge group moved too far between two domain decomposition steps\n"
 +              "This usually means that your system is not well equilibrated");
 +}
 +
 +static void rotate_state_atom(t_state *state,int a)
 +{
 +    int est;
 +
 +    for(est=0; est<estNR; est++)
 +    {
 +        if (EST_DISTR(est) && (state->flags & (1<<est))) {
 +            switch (est) {
 +            case estX:
 +                /* Rotate the complete state; for a rectangular box only */
 +                state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
 +                state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
 +                break;
 +            case estV:
 +                state->v[a][YY] = -state->v[a][YY];
 +                state->v[a][ZZ] = -state->v[a][ZZ];
 +                break;
 +            case estSDX:
 +                state->sd_X[a][YY] = -state->sd_X[a][YY];
 +                state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
 +                break;
 +            case estCGP:
 +                state->cg_p[a][YY] = -state->cg_p[a][YY];
 +                state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
 +                break;
 +            case estDISRE_INITF:
 +            case estDISRE_RM3TAV:
 +            case estORIRE_INITF:
 +            case estORIRE_DTAV:
 +                /* These are distances, so not affected by rotation */
 +                break;
 +            default:
 +                gmx_incons("Unknown state entry encountered in rotate_state_atom");            
 +            }
 +        }
 +    }
 +}
 +
 +static int dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
 +                              gmx_domdec_t *dd,ivec tric_dir,
 +                              t_state *state,rvec **f,
 +                              t_forcerec *fr,t_mdatoms *md,
 +                              gmx_bool bCompact,
 +                              t_nrnb *nrnb)
 +{
 +    int  *move;
 +    int  npbcdim;
 +    int  ncg[DIM*2],nat[DIM*2];
 +    int  c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
 +    int  mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
 +    int  sbuf[2],rbuf[2];
 +    int  home_pos_cg,home_pos_at,ncg_stay_home,buf_pos;
 +    int  flag;
 +    gmx_bool bV=FALSE,bSDX=FALSE,bCGP=FALSE;
 +    gmx_bool bScrew;
 +    ivec dev;
 +    real inv_ncg,pos_d;
 +    matrix tcm;
 +    rvec *cg_cm,cell_x0,cell_x1,limitd,limit0,limit1,cm_new;
 +    atom_id *cgindex;
 +    cginfo_mb_t *cginfo_mb;
 +    gmx_domdec_comm_t *comm;
 +    
 +    if (dd->bScrewPBC)
 +    {
 +        check_screw_box(state->box);
 +    }
 +    
 +    comm  = dd->comm;
 +    cg_cm = fr->cg_cm;
 +    
 +    for(i=0; i<estNR; i++)
 +    {
 +        if (EST_DISTR(i))
 +        {
 +            switch (i)
 +            {
 +            case estX:   /* Always present */            break;
 +            case estV:   bV   = (state->flags & (1<<i)); break;
 +            case estSDX: bSDX = (state->flags & (1<<i)); break;
 +            case estCGP: bCGP = (state->flags & (1<<i)); break;
 +            case estLD_RNG:
 +            case estLD_RNGI:
 +            case estDISRE_INITF:
 +            case estDISRE_RM3TAV:
 +            case estORIRE_INITF:
 +            case estORIRE_DTAV:
 +                /* No processing required */
 +                break;
 +            default:
 +            gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
 +            }
 +        }
 +    }
 +    
 +    if (dd->ncg_tot > comm->nalloc_int)
 +    {
 +        comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
 +        srenew(comm->buf_int,comm->nalloc_int);
 +    }
 +    move = comm->buf_int;
 +    
 +    /* Clear the count */
 +    for(c=0; c<dd->ndim*2; c++)
 +    {
 +        ncg[c] = 0;
 +        nat[c] = 0;
 +    }
 +
 +    npbcdim = dd->npbcdim;
 +
 +    for(d=0; (d<DIM); d++)
 +    {
 +        limitd[d] = dd->comm->cellsize_min[d];
 +        if (d >= npbcdim && dd->ci[d] == 0)
 +        {
 +            cell_x0[d] = -GMX_FLOAT_MAX;
 +        }
 +        else
 +        {
 +            cell_x0[d] = comm->cell_x0[d];
 +        }
 +        if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
 +        {
 +            cell_x1[d] = GMX_FLOAT_MAX;
 +        }
 +        else
 +        {
 +            cell_x1[d] = comm->cell_x1[d];
 +        }
 +        if (d < npbcdim)
 +        {
 +            limit0[d] = comm->old_cell_x0[d] - limitd[d];
 +            limit1[d] = comm->old_cell_x1[d] + limitd[d];
 +        }
 +        else
 +        {
 +            /* We check after communication if a charge group moved
 +             * more than one cell. Set the pre-comm check limit to float_max.
 +             */
 +            limit0[d] = -GMX_FLOAT_MAX;
 +            limit1[d] =  GMX_FLOAT_MAX;
 +        }
 +    }
 +    
 +    make_tric_corr_matrix(npbcdim,state->box,tcm);
 +    
 +    cgindex = dd->cgindex;
 +    
 +    /* Compute the center of geometry for all home charge groups
 +     * and put them in the box and determine where they should go.
 +     */
 +    for(cg=0; cg<dd->ncg_home; cg++)
 +    {
 +        k0   = cgindex[cg];
 +        k1   = cgindex[cg+1];
 +        nrcg = k1 - k0;
 +        if (nrcg == 1)
 +        {
 +            copy_rvec(state->x[k0],cm_new);
 +        }
 +        else
 +        {
 +            inv_ncg = 1.0/nrcg;
 +            
 +            clear_rvec(cm_new);
 +            for(k=k0; (k<k1); k++)
 +            {
 +                rvec_inc(cm_new,state->x[k]);
 +            }
 +            for(d=0; (d<DIM); d++)
 +            {
 +                cm_new[d] = inv_ncg*cm_new[d];
 +            }
 +        }
 +        
 +        clear_ivec(dev);
 +        /* Do pbc and check DD cell boundary crossings */
 +        for(d=DIM-1; d>=0; d--)
 +        {
 +            if (dd->nc[d] > 1)
 +            {
 +                bScrew = (dd->bScrewPBC && d == XX);
 +                /* Determine the location of this cg in lattice coordinates */
 +                pos_d = cm_new[d];
 +                if (tric_dir[d])
 +                {
 +                    for(d2=d+1; d2<DIM; d2++)
 +                    {
 +                        pos_d += cm_new[d2]*tcm[d2][d];
 +                    }
 +                }
 +                /* Put the charge group in the triclinic unit-cell */
 +                if (pos_d >= cell_x1[d])
 +                {
 +                    if (pos_d >= limit1[d])
 +                    {
 +                        cg_move_error(fplog,dd,step,cg,d,1,TRUE,limitd[d],
 +                                      cg_cm[cg],cm_new,pos_d);
 +                    }
 +                    dev[d] = 1;
 +                    if (dd->ci[d] == dd->nc[d] - 1)
 +                    {
 +                        rvec_dec(cm_new,state->box[d]);
 +                        if (bScrew)
 +                        {
 +                            cm_new[YY] = state->box[YY][YY] - cm_new[YY];
 +                            cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
 +                        }
 +                        for(k=k0; (k<k1); k++)
 +                        {
 +                            rvec_dec(state->x[k],state->box[d]);
 +                            if (bScrew)
 +                            {
 +                                rotate_state_atom(state,k);
 +                            }
 +                        }
 +                    }
 +                }
 +                else if (pos_d < cell_x0[d])
 +                {
 +                    if (pos_d < limit0[d])
 +                    {
 +                        cg_move_error(fplog,dd,step,cg,d,-1,TRUE,limitd[d],
 +                                      cg_cm[cg],cm_new,pos_d);
 +                    }
 +                    dev[d] = -1;
 +                    if (dd->ci[d] == 0)
 +                    {
 +                        rvec_inc(cm_new,state->box[d]);
 +                        if (bScrew)
 +                        {
 +                            cm_new[YY] = state->box[YY][YY] - cm_new[YY];
 +                            cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
 +                        }
 +                        for(k=k0; (k<k1); k++)
 +                        {
 +                            rvec_inc(state->x[k],state->box[d]);
 +                            if (bScrew)
 +                            {
 +                                rotate_state_atom(state,k);
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +            else if (d < npbcdim)
 +            {
 +                /* Put the charge group in the rectangular unit-cell */
 +                while (cm_new[d] >= state->box[d][d])
 +                {
 +                    rvec_dec(cm_new,state->box[d]);
 +                    for(k=k0; (k<k1); k++)
 +                    {
 +                        rvec_dec(state->x[k],state->box[d]);
 +                    }
 +                }
 +                while (cm_new[d] < 0)
 +                {
 +                    rvec_inc(cm_new,state->box[d]);
 +                    for(k=k0; (k<k1); k++)
 +                    {
 +                        rvec_inc(state->x[k],state->box[d]);
 +                    }
 +                }
 +            }
 +        }
 +    
 +        copy_rvec(cm_new,cg_cm[cg]);
 +        
 +        /* Determine where this cg should go */
 +        flag = 0;
 +        mc = -1;
 +        for(d=0; d<dd->ndim; d++)
 +        {
 +            dim = dd->dim[d];
 +            if (dev[dim] == 1)
 +            {
 +                flag |= DD_FLAG_FW(d);
 +                if (mc == -1)
 +                {
 +                    mc = d*2;
 +                }
 +            }
 +            else if (dev[dim] == -1)
 +            {
 +                flag |= DD_FLAG_BW(d);
 +                if (mc == -1) {
 +                    if (dd->nc[dim] > 2)
 +                    {
 +                        mc = d*2 + 1;
 +                    }
 +                    else
 +                    {
 +                        mc = d*2;
 +                    }
 +                }
 +            }
 +        }
 +        move[cg] = mc;
 +        if (mc >= 0)
 +        {
 +            if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
 +            {
 +                comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
 +                srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
 +            }
 +            comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
 +            /* We store the cg size in the lower 16 bits
 +             * and the place where the charge group should go
 +             * in the next 6 bits. This saves some communication volume.
 +             */
 +            comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
 +            ncg[mc] += 1;
 +            nat[mc] += nrcg;
 +        }
 +    }
 +    
 +    inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
 +    inc_nrnb(nrnb,eNR_RESETX,dd->ncg_home);
 +    
 +    nvec = 1;
 +    if (bV)
 +    {
 +        nvec++;
 +    }
 +    if (bSDX)
 +    {
 +        nvec++;
 +    }
 +    if (bCGP)
 +    {
 +        nvec++;
 +    }
 +    
 +    /* Make sure the communication buffers are large enough */
 +    for(mc=0; mc<dd->ndim*2; mc++)
 +    {
 +        nvr = ncg[mc] + nat[mc]*nvec;
 +        if (nvr > comm->cgcm_state_nalloc[mc])
 +        {
 +            comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
 +            srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
 +        }
 +    }
 +    
 +    /* Recalculating cg_cm might be cheaper than communicating,
 +     * but that could give rise to rounding issues.
 +     */
 +    home_pos_cg =
 +        compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
 +                                nvec,cg_cm,comm,bCompact);
 +    
 +    vec = 0;
 +    home_pos_at =
 +        compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
 +                                nvec,vec++,state->x,comm,bCompact);
 +    if (bV)
 +    {
 +        compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
 +                                nvec,vec++,state->v,comm,bCompact);
 +    }
 +    if (bSDX)
 +    {
 +        compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
 +                                nvec,vec++,state->sd_X,comm,bCompact);
 +    }
 +    if (bCGP)
 +    {
 +        compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
 +                                nvec,vec++,state->cg_p,comm,bCompact);
 +    }
 +    
 +    if (bCompact)
 +    {
 +        compact_ind(dd->ncg_home,move,
 +                    dd->index_gl,dd->cgindex,dd->gatindex,
 +                    dd->ga2la,comm->bLocalCG,
 +                    fr->cginfo);
 +    }
 +    else
 +    {
 +        clear_and_mark_ind(dd->ncg_home,move,
 +                           dd->index_gl,dd->cgindex,dd->gatindex,
 +                           dd->ga2la,comm->bLocalCG,
 +                           fr->ns.grid->cell_index);
 +    }
 +    
 +    cginfo_mb = fr->cginfo_mb;
 +
 +    ncg_stay_home = home_pos_cg;
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        ncg_recv = 0;
 +        nat_recv = 0;
 +        nvr      = 0;
 +        for(dir=0; dir<(dd->nc[dim]==2 ? 1 : 2); dir++)
 +        {
 +            cdd = d*2 + dir;
 +            /* Communicate the cg and atom counts */
 +            sbuf[0] = ncg[cdd];
 +            sbuf[1] = nat[cdd];
 +            if (debug)
 +            {
 +                fprintf(debug,"Sending ddim %d dir %d: ncg %d nat %d\n",
 +                        d,dir,sbuf[0],sbuf[1]);
 +            }
 +            dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
 +            
 +            if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
 +            {
 +                comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
 +                srenew(comm->buf_int,comm->nalloc_int);
 +            }
 +            
 +            /* Communicate the charge group indices, sizes and flags */
 +            dd_sendrecv_int(dd, d, dir,
 +                            comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
 +                            comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
 +            
 +            nvs = ncg[cdd] + nat[cdd]*nvec;
 +            i   = rbuf[0]  + rbuf[1] *nvec;
 +            vec_rvec_check_alloc(&comm->vbuf,nvr+i);
 +            
 +            /* Communicate cgcm and state */
 +            dd_sendrecv_rvec(dd, d, dir,
 +                             comm->cgcm_state[cdd], nvs,
 +                             comm->vbuf.v+nvr, i);
 +            ncg_recv += rbuf[0];
 +            nat_recv += rbuf[1];
 +            nvr      += i;
 +        }
 +        
 +        /* Process the received charge groups */
 +        buf_pos = 0;
 +        for(cg=0; cg<ncg_recv; cg++)
 +        {
 +            flag = comm->buf_int[cg*DD_CGIBS+1];
 +
 +            if (dim >= npbcdim && dd->nc[dim] > 2)
 +            {
 +                /* No pbc in this dim and more than one domain boundary.
 +                 * We to a separate check if a charge did not move too far.
 +                 */
 +                if (((flag & DD_FLAG_FW(d)) &&
 +                     comm->vbuf.v[buf_pos][d] > cell_x1[dim]) ||
 +                    ((flag & DD_FLAG_BW(d)) &&
 +                     comm->vbuf.v[buf_pos][d] < cell_x0[dim]))
 +                {
 +                    cg_move_error(fplog,dd,step,cg,d,
 +                                  (flag & DD_FLAG_FW(d)) ? 1 : 0,
 +                                   FALSE,0,
 +                                   comm->vbuf.v[buf_pos],
 +                                   comm->vbuf.v[buf_pos],
 +                                   comm->vbuf.v[buf_pos][d]);
 +                }
 +            }
 +
 +            mc = -1;
 +            if (d < dd->ndim-1)
 +            {
 +                /* Check which direction this cg should go */
 +                for(d2=d+1; (d2<dd->ndim && mc==-1); d2++)
 +                {
 +                    if (dd->bGridJump)
 +                    {
 +                        /* The cell boundaries for dimension d2 are not equal
 +                         * for each cell row of the lower dimension(s),
 +                         * therefore we might need to redetermine where
 +                         * this cg should go.
 +                         */
 +                        dim2 = dd->dim[d2];
 +                        /* If this cg crosses the box boundary in dimension d2
 +                         * we can use the communicated flag, so we do not
 +                         * have to worry about pbc.
 +                         */
 +                        if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
 +                               (flag & DD_FLAG_FW(d2))) ||
 +                              (dd->ci[dim2] == 0 &&
 +                               (flag & DD_FLAG_BW(d2)))))
 +                        {
 +                            /* Clear the two flags for this dimension */
 +                            flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
 +                            /* Determine the location of this cg
 +                             * in lattice coordinates
 +                             */
 +                            pos_d = comm->vbuf.v[buf_pos][dim2];
 +                            if (tric_dir[dim2])
 +                            {
 +                                for(d3=dim2+1; d3<DIM; d3++)
 +                                {
 +                                    pos_d +=
 +                                        comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
 +                                }
 +                            }
 +                            /* Check of we are not at the box edge.
 +                             * pbc is only handled in the first step above,
 +                             * but this check could move over pbc while
 +                             * the first step did not due to different rounding.
 +                             */
 +                            if (pos_d >= cell_x1[dim2] &&
 +                                dd->ci[dim2] != dd->nc[dim2]-1)
 +                            {
 +                                flag |= DD_FLAG_FW(d2);
 +                            }
 +                            else if (pos_d < cell_x0[dim2] &&
 +                                     dd->ci[dim2] != 0)
 +                            {
 +                                flag |= DD_FLAG_BW(d2);
 +                            }
 +                            comm->buf_int[cg*DD_CGIBS+1] = flag;
 +                        }
 +                    }
 +                    /* Set to which neighboring cell this cg should go */
 +                    if (flag & DD_FLAG_FW(d2))
 +                    {
 +                        mc = d2*2;
 +                    }
 +                    else if (flag & DD_FLAG_BW(d2))
 +                    {
 +                        if (dd->nc[dd->dim[d2]] > 2)
 +                        {
 +                            mc = d2*2+1;
 +                        }
 +                        else
 +                        {
 +                            mc = d2*2;
 +                        }
 +                    }
 +                }
 +            }
 +            
 +            nrcg = flag & DD_FLAG_NRCG;
 +            if (mc == -1)
 +            {
 +                if (home_pos_cg+1 > dd->cg_nalloc)
 +                {
 +                    dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
 +                    srenew(dd->index_gl,dd->cg_nalloc);
 +                    srenew(dd->cgindex,dd->cg_nalloc+1);
 +                }
 +                /* Set the global charge group index and size */
 +                dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
 +                dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
 +                /* Copy the state from the buffer */
 +                if (home_pos_cg >= fr->cg_nalloc)
 +                {
 +                    dd_realloc_fr_cg(fr,home_pos_cg+1);
 +                    cg_cm = fr->cg_cm;
 +                }
 +                copy_rvec(comm->vbuf.v[buf_pos++],cg_cm[home_pos_cg]);
 +                /* Set the cginfo */
 +                fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
 +                                                   dd->index_gl[home_pos_cg]);
 +                if (comm->bLocalCG)
 +                {
 +                    comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
 +                }
 +
 +                if (home_pos_at+nrcg > state->nalloc)
 +                {
 +                    dd_realloc_state(state,f,home_pos_at+nrcg);
 +                }
 +                for(i=0; i<nrcg; i++)
 +                {
 +                    copy_rvec(comm->vbuf.v[buf_pos++],
 +                              state->x[home_pos_at+i]);
 +                }
 +                if (bV)
 +                {
 +                    for(i=0; i<nrcg; i++)
 +                    {
 +                        copy_rvec(comm->vbuf.v[buf_pos++],
 +                                  state->v[home_pos_at+i]);
 +                    }
 +                }
 +                if (bSDX)
 +                {
 +                    for(i=0; i<nrcg; i++)
 +                    {
 +                        copy_rvec(comm->vbuf.v[buf_pos++],
 +                                  state->sd_X[home_pos_at+i]);
 +                    }
 +                }
 +                if (bCGP)
 +                {
 +                    for(i=0; i<nrcg; i++)
 +                    {
 +                        copy_rvec(comm->vbuf.v[buf_pos++],
 +                                  state->cg_p[home_pos_at+i]);
 +                    }
 +                }
 +                home_pos_cg += 1;
 +                home_pos_at += nrcg;
 +            }
 +            else
 +            {
 +                /* Reallocate the buffers if necessary  */
 +                if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
 +                {
 +                    comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
 +                    srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
 +                }
 +                nvr = ncg[mc] + nat[mc]*nvec;
 +                if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
 +                {
 +                    comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
 +                    srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
 +                }
 +                /* Copy from the receive to the send buffers */
 +                memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
 +                       comm->buf_int + cg*DD_CGIBS,
 +                       DD_CGIBS*sizeof(int));
 +                memcpy(comm->cgcm_state[mc][nvr],
 +                       comm->vbuf.v[buf_pos],
 +                       (1+nrcg*nvec)*sizeof(rvec));
 +                buf_pos += 1 + nrcg*nvec;
 +                ncg[mc] += 1;
 +                nat[mc] += nrcg;
 +            }
 +        }
 +    }
 +    
 +    /* With sorting (!bCompact) the indices are now only partially up to date
 +     * and ncg_home and nat_home are not the real count, since there are
 +     * "holes" in the arrays for the charge groups that moved to neighbors.
 +     */
 +    dd->ncg_home = home_pos_cg;
 +    dd->nat_home = home_pos_at;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"Finished repartitioning\n");
 +    }
 +
 +    return ncg_stay_home;
 +}
 +
 +void dd_cycles_add(gmx_domdec_t *dd,float cycles,int ddCycl)
 +{
 +    dd->comm->cycl[ddCycl] += cycles;
 +    dd->comm->cycl_n[ddCycl]++;
 +    if (cycles > dd->comm->cycl_max[ddCycl])
 +    {
 +        dd->comm->cycl_max[ddCycl] = cycles;
 +    }
 +}
 +
 +static double force_flop_count(t_nrnb *nrnb)
 +{
 +    int i;
 +    double sum;
 +    const char *name;
 +
 +    sum = 0;
 +    for(i=eNR_NBKERNEL010; i<eNR_NBKERNEL_FREE_ENERGY; i++)
 +    {
 +        /* To get closer to the real timings, we half the count
 +         * for the normal loops and again half it for water loops.
 +         */
 +        name = nrnb_str(i);
 +        if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
 +        {
 +            sum += nrnb->n[i]*0.25*cost_nrnb(i);
 +        }
 +        else
 +        {
 +            sum += nrnb->n[i]*0.50*cost_nrnb(i);
 +        }
 +    }
 +    for(i=eNR_NBKERNEL_FREE_ENERGY; i<=eNR_NB14; i++)
 +    {
 +        name = nrnb_str(i);
 +        if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
 +        sum += nrnb->n[i]*cost_nrnb(i);
 +    }
 +    for(i=eNR_BONDS; i<=eNR_WALLS; i++)
 +    {
 +        sum += nrnb->n[i]*cost_nrnb(i);
 +    }
 +
 +    return sum;
 +}
 +
 +void dd_force_flop_start(gmx_domdec_t *dd,t_nrnb *nrnb)
 +{
 +    if (dd->comm->eFlop)
 +    {
 +        dd->comm->flop -= force_flop_count(nrnb);
 +    }
 +}
 +void dd_force_flop_stop(gmx_domdec_t *dd,t_nrnb *nrnb)
 +{
 +    if (dd->comm->eFlop)
 +    {
 +        dd->comm->flop += force_flop_count(nrnb);
 +        dd->comm->flop_n++;
 +    }
 +}  
 +
 +static void clear_dd_cycle_counts(gmx_domdec_t *dd)
 +{
 +    int i;
 +    
 +    for(i=0; i<ddCyclNr; i++)
 +    {
 +        dd->comm->cycl[i] = 0;
 +        dd->comm->cycl_n[i] = 0;
 +        dd->comm->cycl_max[i] = 0;
 +    }
 +    dd->comm->flop = 0;
 +    dd->comm->flop_n = 0;
 +}
 +
 +static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
 +{
 +    gmx_domdec_comm_t *comm;
 +    gmx_domdec_load_t *load;
 +    gmx_domdec_root_t *root=NULL;
 +    int  d,dim,cid,i,pos;
 +    float cell_frac=0,sbuf[DD_NLOAD_MAX];
 +    gmx_bool bSepPME;
 +    
 +    if (debug)
 +    {
 +        fprintf(debug,"get_load_distribution start\n");
 +    }
 +
 +    wallcycle_start(wcycle,ewcDDCOMMLOAD);
 +    
 +    comm = dd->comm;
 +    
 +    bSepPME = (dd->pme_nodeid >= 0);
 +    
 +    for(d=dd->ndim-1; d>=0; d--)
 +    {
 +        dim = dd->dim[d];
 +        /* Check if we participate in the communication in this dimension */
 +        if (d == dd->ndim-1 || 
 +            (dd->ci[dd->dim[d+1]]==0 && dd->ci[dd->dim[dd->ndim-1]]==0))
 +        {
 +            load = &comm->load[d];
 +            if (dd->bGridJump)
 +            {
 +                cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
 +            }
 +            pos = 0;
 +            if (d == dd->ndim-1)
 +            {
 +                sbuf[pos++] = dd_force_load(comm);
 +                sbuf[pos++] = sbuf[0];
 +                if (dd->bGridJump)
 +                {
 +                    sbuf[pos++] = sbuf[0];
 +                    sbuf[pos++] = cell_frac;
 +                    if (d > 0)
 +                    {
 +                        sbuf[pos++] = comm->cell_f_max0[d];
 +                        sbuf[pos++] = comm->cell_f_min1[d];
 +                    }
 +                }
 +                if (bSepPME)
 +                {
 +                    sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
 +                    sbuf[pos++] = comm->cycl[ddCyclPME];
 +                }
 +            }
 +            else
 +            {
 +                sbuf[pos++] = comm->load[d+1].sum;
 +                sbuf[pos++] = comm->load[d+1].max;
 +                if (dd->bGridJump)
 +                {
 +                    sbuf[pos++] = comm->load[d+1].sum_m;
 +                    sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
 +                    sbuf[pos++] = comm->load[d+1].flags;
 +                    if (d > 0)
 +                    {
 +                        sbuf[pos++] = comm->cell_f_max0[d];
 +                        sbuf[pos++] = comm->cell_f_min1[d];
 +                    }
 +                }
 +                if (bSepPME)
 +                {
 +                    sbuf[pos++] = comm->load[d+1].mdf;
 +                    sbuf[pos++] = comm->load[d+1].pme;
 +                }
 +            }
 +            load->nload = pos;
 +            /* Communicate a row in DD direction d.
 +             * The communicators are setup such that the root always has rank 0.
 +             */
 +#ifdef GMX_MPI
 +            MPI_Gather(sbuf      ,load->nload*sizeof(float),MPI_BYTE,
 +                       load->load,load->nload*sizeof(float),MPI_BYTE,
 +                       0,comm->mpi_comm_load[d]);
 +#endif
 +            if (dd->ci[dim] == dd->master_ci[dim])
 +            {
 +                /* We are the root, process this row */
 +                if (comm->bDynLoadBal)
 +                {
 +                    root = comm->root[d];
 +                }
 +                load->sum = 0;
 +                load->max = 0;
 +                load->sum_m = 0;
 +                load->cvol_min = 1;
 +                load->flags = 0;
 +                load->mdf = 0;
 +                load->pme = 0;
 +                pos = 0;
 +                for(i=0; i<dd->nc[dim]; i++)
 +                {
 +                    load->sum += load->load[pos++];
 +                    load->max = max(load->max,load->load[pos]);
 +                    pos++;
 +                    if (dd->bGridJump)
 +                    {
 +                        if (root->bLimited)
 +                        {
 +                            /* This direction could not be load balanced properly,
 +                             * therefore we need to use the maximum iso the average load.
 +                             */
 +                            load->sum_m = max(load->sum_m,load->load[pos]);
 +                        }
 +                        else
 +                        {
 +                            load->sum_m += load->load[pos];
 +                        }
 +                        pos++;
 +                        load->cvol_min = min(load->cvol_min,load->load[pos]);
 +                        pos++;
 +                        if (d < dd->ndim-1)
 +                        {
 +                            load->flags = (int)(load->load[pos++] + 0.5);
 +                        }
 +                        if (d > 0)
 +                        {
 +                            root->cell_f_max0[i] = load->load[pos++];
 +                            root->cell_f_min1[i] = load->load[pos++];
 +                        }
 +                    }
 +                    if (bSepPME)
 +                    {
 +                        load->mdf = max(load->mdf,load->load[pos]);
 +                        pos++;
 +                        load->pme = max(load->pme,load->load[pos]);
 +                        pos++;
 +                    }
 +                }
 +                if (comm->bDynLoadBal && root->bLimited)
 +                {
 +                    load->sum_m *= dd->nc[dim];
 +                    load->flags |= (1<<d);
 +                }
 +            }
 +        }
 +    }
 +
 +    if (DDMASTER(dd))
 +    {
 +        comm->nload      += dd_load_count(comm);
 +        comm->load_step  += comm->cycl[ddCyclStep];
 +        comm->load_sum   += comm->load[0].sum;
 +        comm->load_max   += comm->load[0].max;
 +        if (comm->bDynLoadBal)
 +        {
 +            for(d=0; d<dd->ndim; d++)
 +            {
 +                if (comm->load[0].flags & (1<<d))
 +                {
 +                    comm->load_lim[d]++;
 +                }
 +            }
 +        }
 +        if (bSepPME)
 +        {
 +            comm->load_mdf += comm->load[0].mdf;
 +            comm->load_pme += comm->load[0].pme;
 +        }
 +    }
 +
 +    wallcycle_stop(wcycle,ewcDDCOMMLOAD);
 +    
 +    if (debug)
 +    {
 +        fprintf(debug,"get_load_distribution finished\n");
 +    }
 +}
 +
 +static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
 +{
 +    /* Return the relative performance loss on the total run time
 +     * due to the force calculation load imbalance.
 +     */
 +    if (dd->comm->nload > 0)
 +    {
 +        return
 +            (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
 +            (dd->comm->load_step*dd->nnodes);
 +    }
 +    else
 +    {
 +        return 0;
 +    }
 +}
 +
 +static void print_dd_load_av(FILE *fplog,gmx_domdec_t *dd)
 +{
 +    char  buf[STRLEN];
 +    int   npp,npme,nnodes,d,limp;
 +    float imbal,pme_f_ratio,lossf,lossp=0;
 +    gmx_bool  bLim;
 +    gmx_domdec_comm_t *comm;
 +
 +    comm = dd->comm;
 +    if (DDMASTER(dd) && comm->nload > 0)
 +    {
 +        npp    = dd->nnodes;
 +        npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
 +        nnodes = npp + npme;
 +        imbal = comm->load_max*npp/comm->load_sum - 1;
 +        lossf = dd_force_imb_perf_loss(dd);
 +        sprintf(buf," Average load imbalance: %.1f %%\n",imbal*100);
 +        fprintf(fplog,"%s",buf);
 +        fprintf(stderr,"\n");
 +        fprintf(stderr,"%s",buf);
 +        sprintf(buf," Part of the total run time spent waiting due to load imbalance: %.1f %%\n",lossf*100);
 +        fprintf(fplog,"%s",buf);
 +        fprintf(stderr,"%s",buf);
 +        bLim = FALSE;
 +        if (comm->bDynLoadBal)
 +        {
 +            sprintf(buf," Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
 +            for(d=0; d<dd->ndim; d++)
 +            {
 +                limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
 +                sprintf(buf+strlen(buf)," %c %d %%",dim2char(dd->dim[d]),limp);
 +                if (limp >= 50)
 +                {
 +                    bLim = TRUE;
 +                }
 +            }
 +            sprintf(buf+strlen(buf),"\n");
 +            fprintf(fplog,"%s",buf);
 +            fprintf(stderr,"%s",buf);
 +        }
 +        if (npme > 0)
 +        {
 +            pme_f_ratio = comm->load_pme/comm->load_mdf;
 +            lossp = (comm->load_pme -comm->load_mdf)/comm->load_step;
 +            if (lossp <= 0)
 +            {
 +                lossp *= (float)npme/(float)nnodes;
 +            }
 +            else
 +            {
 +                lossp *= (float)npp/(float)nnodes;
 +            }
 +            sprintf(buf," Average PME mesh/force load: %5.3f\n",pme_f_ratio);
 +            fprintf(fplog,"%s",buf);
 +            fprintf(stderr,"%s",buf);
 +            sprintf(buf," Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",fabs(lossp)*100);
 +            fprintf(fplog,"%s",buf);
 +            fprintf(stderr,"%s",buf);
 +        }
 +        fprintf(fplog,"\n");
 +        fprintf(stderr,"\n");
 +        
 +        if (lossf >= DD_PERF_LOSS)
 +        {
 +            sprintf(buf,
 +                    "NOTE: %.1f %% performance was lost due to load imbalance\n"
 +                    "      in the domain decomposition.\n",lossf*100);
 +            if (!comm->bDynLoadBal)
 +            {
 +                sprintf(buf+strlen(buf),"      You might want to use dynamic load balancing (option -dlb.)\n");
 +            }
 +            else if (bLim)
 +            {
 +                sprintf(buf+strlen(buf),"      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
 +            }
 +            fprintf(fplog,"%s\n",buf);
 +            fprintf(stderr,"%s\n",buf);
 +        }
 +        if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
 +        {
 +            sprintf(buf,
 +                    "NOTE: %.1f %% performance was lost because the PME nodes\n"
 +                    "      had %s work to do than the PP nodes.\n"
 +                    "      You might want to %s the number of PME nodes\n"
 +                    "      or %s the cut-off and the grid spacing.\n",
 +                    fabs(lossp*100),
 +                    (lossp < 0) ? "less"     : "more",
 +                    (lossp < 0) ? "decrease" : "increase",
 +                    (lossp < 0) ? "decrease" : "increase");
 +            fprintf(fplog,"%s\n",buf);
 +            fprintf(stderr,"%s\n",buf);
 +        }
 +    }
 +}
 +
 +static float dd_vol_min(gmx_domdec_t *dd)
 +{
 +    return dd->comm->load[0].cvol_min*dd->nnodes;
 +}
 +
 +static gmx_bool dd_load_flags(gmx_domdec_t *dd)
 +{
 +    return dd->comm->load[0].flags;
 +}
 +
 +static float dd_f_imbal(gmx_domdec_t *dd)
 +{
 +    return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
 +}
 +
 +static float dd_pme_f_ratio(gmx_domdec_t *dd)
 +{
 +    return dd->comm->load[0].pme/dd->comm->load[0].mdf;
 +}
 +
 +static void dd_print_load(FILE *fplog,gmx_domdec_t *dd,gmx_large_int_t step)
 +{
 +    int flags,d;
 +    char buf[22];
 +    
 +    flags = dd_load_flags(dd);
 +    if (flags)
 +    {
 +        fprintf(fplog,
 +                "DD  load balancing is limited by minimum cell size in dimension");
 +        for(d=0; d<dd->ndim; d++)
 +        {
 +            if (flags & (1<<d))
 +            {
 +                fprintf(fplog," %c",dim2char(dd->dim[d]));
 +            }
 +        }
 +        fprintf(fplog,"\n");
 +    }
 +    fprintf(fplog,"DD  step %s",gmx_step_str(step,buf));
 +    if (dd->comm->bDynLoadBal)
 +    {
 +        fprintf(fplog,"  vol min/aver %5.3f%c",
 +                dd_vol_min(dd),flags ? '!' : ' ');
 +    }
 +    fprintf(fplog," load imb.: force %4.1f%%",dd_f_imbal(dd)*100);
 +    if (dd->comm->cycl_n[ddCyclPME])
 +    {
 +        fprintf(fplog,"  pme mesh/force %5.3f",dd_pme_f_ratio(dd));
 +    }
 +    fprintf(fplog,"\n\n");
 +}
 +
 +static void dd_print_load_verbose(gmx_domdec_t *dd)
 +{
 +    if (dd->comm->bDynLoadBal)
 +    {
 +        fprintf(stderr,"vol %4.2f%c ",
 +                dd_vol_min(dd),dd_load_flags(dd) ? '!' : ' ');
 +    }
 +    fprintf(stderr,"imb F %2d%% ",(int)(dd_f_imbal(dd)*100+0.5));
 +    if (dd->comm->cycl_n[ddCyclPME])
 +    {
 +        fprintf(stderr,"pme/F %4.2f ",dd_pme_f_ratio(dd));
 +    }
 +}
 +
 +#ifdef GMX_MPI
 +static void make_load_communicator(gmx_domdec_t *dd,MPI_Group g_all,
 +                                   int dim_ind,ivec loc)
 +{
 +    MPI_Group g_row = MPI_GROUP_EMPTY;
 +    MPI_Comm  c_row;
 +    int  dim,i,*rank;
 +    ivec loc_c;
 +    gmx_domdec_root_t *root;
 +    gmx_bool bPartOfGroup = FALSE;
 +    
 +    dim = dd->dim[dim_ind];
 +    copy_ivec(loc,loc_c);
 +    snew(rank,dd->nc[dim]);
 +    for(i=0; i<dd->nc[dim]; i++)
 +    {
 +        loc_c[dim] = i;
 +        rank[i] = dd_index(dd->nc,loc_c);
 +        if (rank[i] == dd->rank)
 +        {
 +            /* This process is part of the group */
 +            bPartOfGroup = TRUE;
 +        }
 +    }
 +    if (bPartOfGroup)
 +    {
 +        MPI_Group_incl(g_all,dd->nc[dim],rank,&g_row);
 +    }
 +    MPI_Comm_create(dd->mpi_comm_all,g_row,&c_row);
 +    if (bPartOfGroup)
 +    {
 +        dd->comm->mpi_comm_load[dim_ind] = c_row;
 +        if (dd->comm->eDLB != edlbNO)
 +        {
 +            if (dd->ci[dim] == dd->master_ci[dim])
 +            {
 +                /* This is the root process of this row */
 +                snew(dd->comm->root[dim_ind],1);
 +                root = dd->comm->root[dim_ind];
 +                snew(root->cell_f,DD_CELL_F_SIZE(dd,dim_ind));
 +                snew(root->old_cell_f,dd->nc[dim]+1);
 +                snew(root->bCellMin,dd->nc[dim]);
 +                if (dim_ind > 0)
 +                {
 +                    snew(root->cell_f_max0,dd->nc[dim]);
 +                    snew(root->cell_f_min1,dd->nc[dim]);
 +                    snew(root->bound_min,dd->nc[dim]);
 +                    snew(root->bound_max,dd->nc[dim]);
 +                }
 +                snew(root->buf_ncd,dd->nc[dim]);
 +            }
 +            else
 +            {
 +                /* This is not a root process, we only need to receive cell_f */
 +                snew(dd->comm->cell_f_row,DD_CELL_F_SIZE(dd,dim_ind));
 +            }
 +        }
 +        if (dd->ci[dim] == dd->master_ci[dim])
 +        {
 +            snew(dd->comm->load[dim_ind].load,dd->nc[dim]*DD_NLOAD_MAX);
 +        }
 +    }
 +    sfree(rank);
 +}
 +#endif
 +
 +static void make_load_communicators(gmx_domdec_t *dd)
 +{
 +#ifdef GMX_MPI
 +  MPI_Group g_all;
 +  int  dim0,dim1,i,j;
 +  ivec loc;
 +
 +  if (debug)
 +    fprintf(debug,"Making load communicators\n");
 +
 +  MPI_Comm_group(dd->mpi_comm_all,&g_all);
 +  
 +  snew(dd->comm->load,dd->ndim);
 +  snew(dd->comm->mpi_comm_load,dd->ndim);
 +  
 +  clear_ivec(loc);
 +  make_load_communicator(dd,g_all,0,loc);
 +  if (dd->ndim > 1) {
 +    dim0 = dd->dim[0];
 +    for(i=0; i<dd->nc[dim0]; i++) {
 +      loc[dim0] = i;
 +      make_load_communicator(dd,g_all,1,loc);
 +    }
 +  }
 +  if (dd->ndim > 2) {
 +    dim0 = dd->dim[0];
 +    for(i=0; i<dd->nc[dim0]; i++) {
 +      loc[dim0] = i;
 +      dim1 = dd->dim[1];
 +      for(j=0; j<dd->nc[dim1]; j++) {
 +        loc[dim1] = j;
 +        make_load_communicator(dd,g_all,2,loc);
 +      }
 +    }
 +  }
 +
 +  MPI_Group_free(&g_all);
 +
 +  if (debug)
 +    fprintf(debug,"Finished making load communicators\n");
 +#endif
 +}
 +
 +void setup_dd_grid(FILE *fplog,gmx_domdec_t *dd)
 +{
 +    gmx_bool bZYX;
 +    int  d,dim,i,j,m;
 +    ivec tmp,s;
 +    int  nzone,nzonep;
 +    ivec dd_zp[DD_MAXIZONE];
 +    gmx_domdec_zones_t *zones;
 +    gmx_domdec_ns_ranges_t *izone;
 +    
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        dim = dd->dim[d];
 +        copy_ivec(dd->ci,tmp);
 +        tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
 +        dd->neighbor[d][0] = ddcoord2ddnodeid(dd,tmp);
 +        copy_ivec(dd->ci,tmp);
 +        tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
 +        dd->neighbor[d][1] = ddcoord2ddnodeid(dd,tmp);
 +        if (debug)
 +        {
 +            fprintf(debug,"DD rank %d neighbor ranks in dir %d are + %d - %d\n",
 +                    dd->rank,dim,
 +                    dd->neighbor[d][0],
 +                    dd->neighbor[d][1]);
 +        }
 +    }
 +    
 +    if (DDMASTER(dd))
 +    {
 +        fprintf(stderr,"Making %dD domain decomposition %d x %d x %d\n",
 +          dd->ndim,dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
 +    }
 +    if (fplog)
 +    {
 +        fprintf(fplog,"\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
 +                dd->ndim,
 +                dd->nc[XX],dd->nc[YY],dd->nc[ZZ],
 +                dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
 +    }
 +    switch (dd->ndim)
 +    {
 +    case 3:
 +        nzone  = dd_z3n;
 +        nzonep = dd_zp3n;
 +        for(i=0; i<nzonep; i++)
 +        {
 +            copy_ivec(dd_zp3[i],dd_zp[i]);
 +        }
 +        break;
 +    case 2:
 +        nzone  = dd_z2n;
 +        nzonep = dd_zp2n;
 +        for(i=0; i<nzonep; i++)
 +        {
 +            copy_ivec(dd_zp2[i],dd_zp[i]);
 +        }
 +        break;
 +    case 1:
 +        nzone  = dd_z1n;
 +        nzonep = dd_zp1n;
 +        for(i=0; i<nzonep; i++)
 +        {
 +            copy_ivec(dd_zp1[i],dd_zp[i]);
 +        }
 +        break;
 +    default:
 +        gmx_fatal(FARGS,"Can only do 1, 2 or 3D domain decomposition");
 +        nzone = 0;
 +        nzonep = 0;
 +    }
 +
 +    zones = &dd->comm->zones;
 +
 +    for(i=0; i<nzone; i++)
 +    {
 +        m = 0;
 +        clear_ivec(zones->shift[i]);
 +        for(d=0; d<dd->ndim; d++)
 +        {
 +            zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
 +        }
 +    }
 +    
 +    zones->n = nzone;
 +    for(i=0; i<nzone; i++)
 +    {
 +        for(d=0; d<DIM; d++)
 +        {
 +            s[d] = dd->ci[d] - zones->shift[i][d];
 +            if (s[d] < 0)
 +            {
 +                s[d] += dd->nc[d];
 +            }
 +            else if (s[d] >= dd->nc[d])
 +            {
 +                s[d] -= dd->nc[d];
 +            }
 +        }
 +    }
 +    zones->nizone = nzonep;
 +    for(i=0; i<zones->nizone; i++)
 +    {
 +        if (dd_zp[i][0] != i)
 +        {
 +            gmx_fatal(FARGS,"Internal inconsistency in the dd grid setup");
 +        }
 +        izone = &zones->izone[i];
 +        izone->j0 = dd_zp[i][1];
 +        izone->j1 = dd_zp[i][2];
 +        for(dim=0; dim<DIM; dim++)
 +        {
 +            if (dd->nc[dim] == 1)
 +            {
 +                /* All shifts should be allowed */
 +                izone->shift0[dim] = -1;
 +                izone->shift1[dim] = 1;
 +            }
 +            else
 +            {
 +                /*
 +                  izone->shift0[d] = 0;
 +                  izone->shift1[d] = 0;
 +                  for(j=izone->j0; j<izone->j1; j++) {
 +                  if (dd->shift[j][d] > dd->shift[i][d])
 +                  izone->shift0[d] = -1;
 +                  if (dd->shift[j][d] < dd->shift[i][d])
 +                  izone->shift1[d] = 1;
 +                  }
 +                */
 +                
 +                int shift_diff;
 +                
 +                /* Assume the shift are not more than 1 cell */
 +                izone->shift0[dim] = 1;
 +                izone->shift1[dim] = -1;
 +                for(j=izone->j0; j<izone->j1; j++)
 +                {
 +                    shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
 +                    if (shift_diff < izone->shift0[dim])
 +                    {
 +                        izone->shift0[dim] = shift_diff;
 +                    }
 +                    if (shift_diff > izone->shift1[dim])
 +                    {
 +                        izone->shift1[dim] = shift_diff;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +    
 +    if (dd->comm->eDLB != edlbNO)
 +    {
 +        snew(dd->comm->root,dd->ndim);
 +    }
 +    
 +    if (dd->comm->bRecordLoad)
 +    {
 +        make_load_communicators(dd);
 +    }
 +}
 +
 +static void make_pp_communicator(FILE *fplog,t_commrec *cr,int reorder)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_domdec_comm_t *comm;
 +    int  i,rank,*buf;
 +    ivec periods;
 +#ifdef GMX_MPI
 +    MPI_Comm comm_cart;
 +#endif
 +    
 +    dd = cr->dd;
 +    comm = dd->comm;
 +    
 +#ifdef GMX_MPI
 +    if (comm->bCartesianPP)
 +    {
 +        /* Set up cartesian communication for the particle-particle part */
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Will use a Cartesian communicator: %d x %d x %d\n",
 +                    dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
 +        }
 +        
 +        for(i=0; i<DIM; i++)
 +        {
 +            periods[i] = TRUE;
 +        }
 +        MPI_Cart_create(cr->mpi_comm_mygroup,DIM,dd->nc,periods,reorder,
 +                        &comm_cart);
 +        /* We overwrite the old communicator with the new cartesian one */
 +        cr->mpi_comm_mygroup = comm_cart;
 +    }
 +    
 +    dd->mpi_comm_all = cr->mpi_comm_mygroup;
 +    MPI_Comm_rank(dd->mpi_comm_all,&dd->rank);
 +    
 +    if (comm->bCartesianPP_PME)
 +    {
 +        /* Since we want to use the original cartesian setup for sim,
 +         * and not the one after split, we need to make an index.
 +         */
 +        snew(comm->ddindex2ddnodeid,dd->nnodes);
 +        comm->ddindex2ddnodeid[dd_index(dd->nc,dd->ci)] = dd->rank;
 +        gmx_sumi(dd->nnodes,comm->ddindex2ddnodeid,cr);
 +        /* Get the rank of the DD master,
 +         * above we made sure that the master node is a PP node.
 +         */
 +        if (MASTER(cr))
 +        {
 +            rank = dd->rank;
 +        }
 +        else
 +        {
 +            rank = 0;
 +        }
 +        MPI_Allreduce(&rank,&dd->masterrank,1,MPI_INT,MPI_SUM,dd->mpi_comm_all);
 +    }
 +    else if (comm->bCartesianPP)
 +    {
 +        if (cr->npmenodes == 0)
 +        {
 +            /* The PP communicator is also
 +             * the communicator for this simulation
 +             */
 +            cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
 +        }
 +        cr->nodeid = dd->rank;
 +        
 +        MPI_Cart_coords(dd->mpi_comm_all,dd->rank,DIM,dd->ci);
 +        
 +        /* We need to make an index to go from the coordinates
 +         * to the nodeid of this simulation.
 +         */
 +        snew(comm->ddindex2simnodeid,dd->nnodes);
 +        snew(buf,dd->nnodes);
 +        if (cr->duty & DUTY_PP)
 +        {
 +            buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
 +        }
 +        /* Communicate the ddindex to simulation nodeid index */
 +        MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
 +                      cr->mpi_comm_mysim);
 +        sfree(buf);
 +        
 +        /* Determine the master coordinates and rank.
 +         * The DD master should be the same node as the master of this sim.
 +         */
 +        for(i=0; i<dd->nnodes; i++)
 +        {
 +            if (comm->ddindex2simnodeid[i] == 0)
 +            {
 +                ddindex2xyz(dd->nc,i,dd->master_ci);
 +                MPI_Cart_rank(dd->mpi_comm_all,dd->master_ci,&dd->masterrank);
 +            }
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug,"The master rank is %d\n",dd->masterrank);
 +        }
 +    }
 +    else
 +    {
 +        /* No Cartesian communicators */
 +        /* We use the rank in dd->comm->all as DD index */
 +        ddindex2xyz(dd->nc,dd->rank,dd->ci);
 +        /* The simulation master nodeid is 0, so the DD master rank is also 0 */
 +        dd->masterrank = 0;
 +        clear_ivec(dd->master_ci);
 +    }
 +#endif
 +  
 +    if (fplog)
 +    {
 +        fprintf(fplog,
 +                "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
 +                dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug,
 +                "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
 +                dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
 +    }
 +}
 +
 +static void receive_ddindex2simnodeid(t_commrec *cr)
 +{
 +    gmx_domdec_t *dd;
 +    
 +    gmx_domdec_comm_t *comm;
 +    int  *buf;
 +    
 +    dd = cr->dd;
 +    comm = dd->comm;
 +    
 +#ifdef GMX_MPI
 +    if (!comm->bCartesianPP_PME && comm->bCartesianPP)
 +    {
 +        snew(comm->ddindex2simnodeid,dd->nnodes);
 +        snew(buf,dd->nnodes);
 +        if (cr->duty & DUTY_PP)
 +        {
 +            buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
 +        }
 +#ifdef GMX_MPI
 +        /* Communicate the ddindex to simulation nodeid index */
 +        MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
 +                      cr->mpi_comm_mysim);
 +#endif
 +        sfree(buf);
 +    }
 +#endif
 +}
 +
 +static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
 +                                                     int ncg,int natoms)
 +{
 +    gmx_domdec_master_t *ma;
 +    int i;
 +
 +    snew(ma,1);
 +    
 +    snew(ma->ncg,dd->nnodes);
 +    snew(ma->index,dd->nnodes+1);
 +    snew(ma->cg,ncg);
 +    snew(ma->nat,dd->nnodes);
 +    snew(ma->ibuf,dd->nnodes*2);
 +    snew(ma->cell_x,DIM);
 +    for(i=0; i<DIM; i++)
 +    {
 +        snew(ma->cell_x[i],dd->nc[i]+1);
 +    }
 +
 +    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
 +    {
 +        ma->vbuf = NULL;
 +    }
 +    else
 +    {
 +        snew(ma->vbuf,natoms);
 +    }
 +
 +    return ma;
 +}
 +
 +static void split_communicator(FILE *fplog,t_commrec *cr,int dd_node_order,
 +                               int reorder)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_domdec_comm_t *comm;
 +    int  i,rank;
 +    gmx_bool bDiv[DIM];
 +    ivec periods;
 +#ifdef GMX_MPI
 +    MPI_Comm comm_cart;
 +#endif
 +    
 +    dd = cr->dd;
 +    comm = dd->comm;
 +    
 +    if (comm->bCartesianPP)
 +    {
 +        for(i=1; i<DIM; i++)
 +        {
 +            bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
 +        }
 +        if (bDiv[YY] || bDiv[ZZ])
 +        {
 +            comm->bCartesianPP_PME = TRUE;
 +            /* If we have 2D PME decomposition, which is always in x+y,
 +             * we stack the PME only nodes in z.
 +             * Otherwise we choose the direction that provides the thinnest slab
 +             * of PME only nodes as this will have the least effect
 +             * on the PP communication.
 +             * But for the PME communication the opposite might be better.
 +             */
 +            if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
 +                             !bDiv[YY] ||
 +                             dd->nc[YY] > dd->nc[ZZ]))
 +            {
 +                comm->cartpmedim = ZZ;
 +            }
 +            else
 +            {
 +                comm->cartpmedim = YY;
 +            }
 +            comm->ntot[comm->cartpmedim]
 +                += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
 +        }
 +        else if (fplog)
 +        {
 +            fprintf(fplog,"#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n",cr->npmenodes,dd->nc[XX],dd->nc[YY],dd->nc[XX],dd->nc[ZZ]);
 +            fprintf(fplog,
 +                    "Will not use a Cartesian communicator for PP <-> PME\n\n");
 +        }
 +    }
 +    
 +#ifdef GMX_MPI
 +    if (comm->bCartesianPP_PME)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n",comm->ntot[XX],comm->ntot[YY],comm->ntot[ZZ]);
 +        }
 +        
 +        for(i=0; i<DIM; i++)
 +        {
 +            periods[i] = TRUE;
 +        }
 +        MPI_Cart_create(cr->mpi_comm_mysim,DIM,comm->ntot,periods,reorder,
 +                        &comm_cart);
 +        
 +        MPI_Comm_rank(comm_cart,&rank);
 +        if (MASTERNODE(cr) && rank != 0)
 +        {
 +            gmx_fatal(FARGS,"MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
 +        }
 +        
 +        /* With this assigment we loose the link to the original communicator
 +         * which will usually be MPI_COMM_WORLD, unless have multisim.
 +         */
 +        cr->mpi_comm_mysim = comm_cart;
 +        cr->sim_nodeid = rank;
 +        
 +        MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,dd->ci);
 +        
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Cartesian nodeid %d, coordinates %d %d %d\n\n",
 +                    cr->sim_nodeid,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
 +        }
 +        
 +        if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
 +        {
 +            cr->duty = DUTY_PP;
 +        }
 +        if (cr->npmenodes == 0 ||
 +            dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
 +        {
 +            cr->duty = DUTY_PME;
 +        }
 +        
 +        /* Split the sim communicator into PP and PME only nodes */
 +        MPI_Comm_split(cr->mpi_comm_mysim,
 +                       cr->duty,
 +                       dd_index(comm->ntot,dd->ci),
 +                       &cr->mpi_comm_mygroup);
 +    }
 +    else
 +    {
 +        switch (dd_node_order)
 +        {
 +        case ddnoPP_PME:
 +            if (fplog)
 +            {
 +                fprintf(fplog,"Order of the nodes: PP first, PME last\n");
 +            }
 +            break;
 +        case ddnoINTERLEAVE:
 +            /* Interleave the PP-only and PME-only nodes,
 +             * as on clusters with dual-core machines this will double
 +             * the communication bandwidth of the PME processes
 +             * and thus speed up the PP <-> PME and inter PME communication.
 +             */
 +            if (fplog)
 +            {
 +                fprintf(fplog,"Interleaving PP and PME nodes\n");
 +            }
 +            comm->pmenodes = dd_pmenodes(cr);
 +            break;
 +        case ddnoCARTESIAN:
 +            break;
 +        default:
 +            gmx_fatal(FARGS,"Unknown dd_node_order=%d",dd_node_order);
 +        }
 +    
 +        if (dd_simnode2pmenode(cr,cr->sim_nodeid) == -1)
 +        {
 +            cr->duty = DUTY_PME;
 +        }
 +        else
 +        {
 +            cr->duty = DUTY_PP;
 +        }
 +        
 +        /* Split the sim communicator into PP and PME only nodes */
 +        MPI_Comm_split(cr->mpi_comm_mysim,
 +                       cr->duty,
 +                       cr->nodeid,
 +                       &cr->mpi_comm_mygroup);
 +        MPI_Comm_rank(cr->mpi_comm_mygroup,&cr->nodeid);
 +    }
 +#endif
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog,"This is a %s only node\n\n",
 +                (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
 +    }
 +}
 +
 +void make_dd_communicators(FILE *fplog,t_commrec *cr,int dd_node_order)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_domdec_comm_t *comm;
 +    int CartReorder;
 +    
 +    dd = cr->dd;
 +    comm = dd->comm;
 +    
 +    copy_ivec(dd->nc,comm->ntot);
 +    
 +    comm->bCartesianPP = (dd_node_order == ddnoCARTESIAN);
 +    comm->bCartesianPP_PME = FALSE;
 +    
 +    /* Reorder the nodes by default. This might change the MPI ranks.
 +     * Real reordering is only supported on very few architectures,
 +     * Blue Gene is one of them.
 +     */
 +    CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
 +    
 +    if (cr->npmenodes > 0)
 +    {
 +        /* Split the communicator into a PP and PME part */
 +        split_communicator(fplog,cr,dd_node_order,CartReorder);
 +        if (comm->bCartesianPP_PME)
 +        {
 +            /* We (possibly) reordered the nodes in split_communicator,
 +             * so it is no longer required in make_pp_communicator.
 +             */
 +            CartReorder = FALSE;
 +        }
 +    }
 +    else
 +    {
 +        /* All nodes do PP and PME */
 +#ifdef GMX_MPI    
 +        /* We do not require separate communicators */
 +        cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
 +#endif
 +    }
 +    
 +    if (cr->duty & DUTY_PP)
 +    {
 +        /* Copy or make a new PP communicator */
 +        make_pp_communicator(fplog,cr,CartReorder);
 +    }
 +    else
 +    {
 +        receive_ddindex2simnodeid(cr);
 +    }
 +    
 +    if (!(cr->duty & DUTY_PME))
 +    {
 +        /* Set up the commnuication to our PME node */
 +        dd->pme_nodeid = dd_simnode2pmenode(cr,cr->sim_nodeid);
 +        dd->pme_receive_vir_ener = receive_vir_ener(cr);
 +        if (debug)
 +        {
 +            fprintf(debug,"My pme_nodeid %d receive ener %d\n",
 +                    dd->pme_nodeid,dd->pme_receive_vir_ener);
 +        }
 +    }
 +    else
 +    {
 +        dd->pme_nodeid = -1;
 +    }
 +
 +    if (DDMASTER(dd))
 +    {
 +        dd->ma = init_gmx_domdec_master_t(dd,
 +                                          comm->cgs_gl.nr,
 +                                          comm->cgs_gl.index[comm->cgs_gl.nr]);
 +    }
 +}
 +
 +static real *get_slb_frac(FILE *fplog,const char *dir,int nc,const char *size_string)
 +{
 +    real *slb_frac,tot;
 +    int  i,n;
 +    double dbl;
 +    
 +    slb_frac = NULL;
 +    if (nc > 1 && size_string != NULL)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Using static load balancing for the %s direction\n",
 +                    dir);
 +        }
 +        snew(slb_frac,nc);
 +        tot = 0;
 +        for (i=0; i<nc; i++)
 +        {
 +            dbl = 0;
 +            sscanf(size_string,"%lf%n",&dbl,&n);
 +            if (dbl == 0)
 +            {
 +                gmx_fatal(FARGS,"Incorrect or not enough DD cell size entries for direction %s: '%s'",dir,size_string);
 +            }
 +            slb_frac[i] = dbl;
 +            size_string += n;
 +            tot += slb_frac[i];
 +        }
 +        /* Normalize */
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Relative cell sizes:");
 +        }
 +        for (i=0; i<nc; i++)
 +        {
 +            slb_frac[i] /= tot;
 +            if (fplog)
 +            {
 +                fprintf(fplog," %5.3f",slb_frac[i]);
 +            }
 +        }
 +        if (fplog)
 +        {
 +            fprintf(fplog,"\n");
 +        }
 +    }
 +    
 +    return slb_frac;
 +}
 +
 +static int multi_body_bondeds_count(gmx_mtop_t *mtop)
 +{
 +    int n,nmol,ftype;
 +    gmx_mtop_ilistloop_t iloop;
 +    t_ilist *il;
 +    
 +    n = 0;
 +    iloop = gmx_mtop_ilistloop_init(mtop);
 +    while (gmx_mtop_ilistloop_next(iloop,&il,&nmol))
 +    {
 +        for(ftype=0; ftype<F_NRE; ftype++)
 +        {
 +            if ((interaction_function[ftype].flags & IF_BOND) &&
 +                NRAL(ftype) >  2)
 +            {
 +                n += nmol*il[ftype].nr/(1 + NRAL(ftype));
 +            }
 +        }
 +  }
 +
 +  return n;
 +}
 +
 +static int dd_nst_env(FILE *fplog,const char *env_var,int def)
 +{
 +    char *val;
 +    int  nst;
 +    
 +    nst = def;
 +    val = getenv(env_var);
 +    if (val)
 +    {
 +        if (sscanf(val,"%d",&nst) <= 0)
 +        {
 +            nst = 1;
 +        }
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Found env.var. %s = %s, using value %d\n",
 +                    env_var,val,nst);
 +        }
 +    }
 +    
 +    return nst;
 +}
 +
 +static void dd_warning(t_commrec *cr,FILE *fplog,const char *warn_string)
 +{
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr,"\n%s\n",warn_string);
 +    }
 +    if (fplog)
 +    {
 +        fprintf(fplog,"\n%s\n",warn_string);
 +    }
 +}
 +
 +static void check_dd_restrictions(t_commrec *cr,gmx_domdec_t *dd,
 +                                  t_inputrec *ir,FILE *fplog)
 +{
 +    if (ir->ePBC == epbcSCREW &&
 +        (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
 +    {
 +        gmx_fatal(FARGS,"With pbc=%s can only do domain decomposition in the x-direction",epbc_names[ir->ePBC]);
 +    }
 +
 +    if (ir->ns_type == ensSIMPLE)
 +    {
 +        gmx_fatal(FARGS,"Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
 +    }
 +
 +    if (ir->nstlist == 0)
 +    {
 +        gmx_fatal(FARGS,"Domain decomposition does not work with nstlist=0");
 +    }
 +
 +    if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
 +    {
 +        dd_warning(cr,fplog,"comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
 +    }
 +}
 +
 +static real average_cellsize_min(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
 +{
 +    int  di,d;
 +    real r;
 +
 +    r = ddbox->box_size[XX];
 +    for(di=0; di<dd->ndim; di++)
 +    {
 +        d = dd->dim[di];
 +        /* Check using the initial average cell size */
 +        r = min(r,ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
 +    }
 +
 +    return r;
 +}
 +
 +static int check_dlb_support(FILE *fplog,t_commrec *cr,
 +                             const char *dlb_opt,gmx_bool bRecordLoad,
 +                             unsigned long Flags,t_inputrec *ir)
 +{
 +    gmx_domdec_t *dd;
 +    int  eDLB=-1;
 +    char buf[STRLEN];
 +
 +    switch (dlb_opt[0])
 +    {
 +    case 'a': eDLB = edlbAUTO; break;
 +    case 'n': eDLB = edlbNO;   break;
 +    case 'y': eDLB = edlbYES;  break;
 +    default: gmx_incons("Unknown dlb_opt");
 +    }
 +
 +    if (Flags & MD_RERUN)
 +    {
 +        return edlbNO;
 +    }
 +
 +    if (!EI_DYNAMICS(ir->eI))
 +    {
 +        if (eDLB == edlbYES)
 +        {
 +            sprintf(buf,"NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n",EI(ir->eI));
 +            dd_warning(cr,fplog,buf);
 +        }
 +            
 +        return edlbNO;
 +    }
 +
 +    if (!bRecordLoad)
 +    {
 +        dd_warning(cr,fplog,"NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
 +
 +        return edlbNO;
 +    }
 +
 +    if (Flags & MD_REPRODUCIBLE)
 +    {
 +        switch (eDLB)
 +        {
 +                      case edlbNO: 
 +                              break;
 +                      case edlbAUTO:
 +                              dd_warning(cr,fplog,"NOTE: reproducibility requested, will not use dynamic load balancing\n");
 +                              eDLB = edlbNO;
 +                              break;
 +                      case edlbYES:
 +                              dd_warning(cr,fplog,"WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
 +                              break;
 +                      default:
 +                              gmx_fatal(FARGS,"Death horror: undefined case (%d) for load balancing choice",eDLB);
 +                              break;
 +        }
 +    }
 +
 +    return eDLB;
 +}
 +
 +static void set_dd_dim(FILE *fplog,gmx_domdec_t *dd)
 +{
 +    int dim;
 +
 +    dd->ndim = 0;
 +    if (getenv("GMX_DD_ORDER_ZYX") != NULL)
 +    {
 +        /* Decomposition order z,y,x */
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Using domain decomposition order z, y, x\n");
 +        }
 +        for(dim=DIM-1; dim>=0; dim--)
 +        {
 +            if (dd->nc[dim] > 1)
 +            {
 +                dd->dim[dd->ndim++] = dim;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        /* Decomposition order x,y,z */
 +        for(dim=0; dim<DIM; dim++)
 +        {
 +            if (dd->nc[dim] > 1)
 +            {
 +                dd->dim[dd->ndim++] = dim;
 +            }
 +        }
 +    }
 +}
 +
 +static gmx_domdec_comm_t *init_dd_comm()
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  i;
 +
 +    snew(comm,1);
 +    snew(comm->cggl_flag,DIM*2);
 +    snew(comm->cgcm_state,DIM*2);
 +    for(i=0; i<DIM*2; i++)
 +    {
 +        comm->cggl_flag_nalloc[i]  = 0;
 +        comm->cgcm_state_nalloc[i] = 0;
 +    }
 +    
 +    comm->nalloc_int = 0;
 +    comm->buf_int    = NULL;
 +
 +    vec_rvec_init(&comm->vbuf);
 +
 +    comm->n_load_have    = 0;
 +    comm->n_load_collect = 0;
 +
 +    for(i=0; i<ddnatNR-ddnatZONE; i++)
 +    {
 +        comm->sum_nat[i] = 0;
 +    }
 +    comm->ndecomp = 0;
 +    comm->nload   = 0;
 +    comm->load_step = 0;
 +    comm->load_sum  = 0;
 +    comm->load_max  = 0;
 +    clear_ivec(comm->load_lim);
 +    comm->load_mdf  = 0;
 +    comm->load_pme  = 0;
 +
 +    return comm;
 +}
 +
 +gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
 +                                        unsigned long Flags,
 +                                        ivec nc,
 +                                        real comm_distance_min,real rconstr,
 +                                        const char *dlb_opt,real dlb_scale,
 +                                        const char *sizex,const char *sizey,const char *sizez,
 +                                        gmx_mtop_t *mtop,t_inputrec *ir,
 +                                        matrix box,rvec *x,
 +                                        gmx_ddbox_t *ddbox,
 +                                        int *npme_x,int *npme_y)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_domdec_comm_t *comm;
 +    int  recload;
 +    int  d,i,j;
 +    real r_2b,r_mb,r_bonded=-1,r_bonded_limit=-1,limit,acs;
 +    gmx_bool bC;
 +    char buf[STRLEN];
 +    
 +    if (fplog)
 +    {
 +        fprintf(fplog,
 +                "\nInitializing Domain Decomposition on %d nodes\n",cr->nnodes);
 +    }
 +    
 +    snew(dd,1);
 +
 +    dd->comm = init_dd_comm();
 +    comm = dd->comm;
 +    snew(comm->cggl_flag,DIM*2);
 +    snew(comm->cgcm_state,DIM*2);
 +
 +    dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
 +    dd->bScrewPBC = (ir->ePBC == epbcSCREW);
 +    
 +    dd->bSendRecv2      = dd_nst_env(fplog,"GMX_DD_SENDRECV2",0);
 +    comm->dlb_scale_lim = dd_nst_env(fplog,"GMX_DLB_MAX",10);
 +    comm->eFlop         = dd_nst_env(fplog,"GMX_DLB_FLOP",0);
 +    recload             = dd_nst_env(fplog,"GMX_DD_LOAD",1);
 +    comm->nstSortCG     = dd_nst_env(fplog,"GMX_DD_SORT",1);
 +    comm->nstDDDump     = dd_nst_env(fplog,"GMX_DD_DUMP",0);
 +    comm->nstDDDumpGrid = dd_nst_env(fplog,"GMX_DD_DUMP_GRID",0);
 +    comm->DD_debug      = dd_nst_env(fplog,"GMX_DD_DEBUG",0);
 +
 +    dd->pme_recv_f_alloc = 0;
 +    dd->pme_recv_f_buf = NULL;
 +
 +    if (dd->bSendRecv2 && fplog)
 +    {
 +        fprintf(fplog,"Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
 +    }
 +    if (comm->eFlop)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Will load balance based on FLOP count\n");
 +        }
 +        if (comm->eFlop > 1)
 +        {
 +            srand(1+cr->nodeid);
 +        }
 +        comm->bRecordLoad = TRUE;
 +    }
 +    else
 +    {
 +        comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
 +                             
 +    }
 +    
 +    comm->eDLB = check_dlb_support(fplog,cr,dlb_opt,comm->bRecordLoad,Flags,ir);
 +    
 +    comm->bDynLoadBal = (comm->eDLB == edlbYES);
 +    if (fplog)
 +    {
 +        fprintf(fplog,"Dynamic load balancing: %s\n",edlb_names[comm->eDLB]);
 +    }
 +    dd->bGridJump = comm->bDynLoadBal;
 +    
 +    if (comm->nstSortCG)
 +    {
 +        if (fplog)
 +        {
 +            if (comm->nstSortCG == 1)
 +            {
 +                fprintf(fplog,"Will sort the charge groups at every domain (re)decomposition\n");
 +            }
 +            else
 +            {
 +                fprintf(fplog,"Will sort the charge groups every %d steps\n",
 +                        comm->nstSortCG);
 +            }
 +        }
 +        snew(comm->sort,1);
 +    }
 +    else
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog,"Will not sort the charge groups\n");
 +        }
 +    }
 +    
 +    comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
 +    if (comm->bInterCGBondeds)
 +    {
 +        comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
 +    }
 +    else
 +    {
 +        comm->bInterCGMultiBody = FALSE;
 +    }
 +    
 +    dd->bInterCGcons = inter_charge_group_constraints(mtop);
 +
 +    if (ir->rlistlong == 0)
 +    {
 +        /* Set the cut-off to some very large value,
 +         * so we don't need if statements everywhere in the code.
 +         * We use sqrt, since the cut-off is squared in some places.
 +         */
 +        comm->cutoff   = GMX_CUTOFF_INF;
 +    }
 +    else
 +    {
 +        comm->cutoff   = ir->rlistlong;
 +    }
 +    comm->cutoff_mbody = 0;
 +    
 +    comm->cellsize_limit = 0;
 +    comm->bBondComm = FALSE;
 +
 +    if (comm->bInterCGBondeds)
 +    {
 +        if (comm_distance_min > 0)
 +        {
 +            comm->cutoff_mbody = comm_distance_min;
 +            if (Flags & MD_DDBONDCOMM)
 +            {
 +                comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
 +            }
 +            else
 +            {
 +                comm->cutoff = max(comm->cutoff,comm->cutoff_mbody);
 +            }
 +            r_bonded_limit = comm->cutoff_mbody;
 +        }
 +        else if (ir->bPeriodicMols)
 +        {
 +            /* Can not easily determine the required cut-off */
 +            dd_warning(cr,fplog,"NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
 +            comm->cutoff_mbody = comm->cutoff/2;
 +            r_bonded_limit = comm->cutoff_mbody;
 +        }
 +        else
 +        {
 +            if (MASTER(cr))
 +            {
 +                dd_bonded_cg_distance(fplog,dd,mtop,ir,x,box,
 +                                      Flags & MD_DDBONDCHECK,&r_2b,&r_mb);
 +            }
 +            gmx_bcast(sizeof(r_2b),&r_2b,cr);
 +            gmx_bcast(sizeof(r_mb),&r_mb,cr);
 +
 +            /* We use an initial margin of 10% for the minimum cell size,
 +             * except when we are just below the non-bonded cut-off.
 +             */
 +            if (Flags & MD_DDBONDCOMM)
 +            {
 +                if (max(r_2b,r_mb) > comm->cutoff)
 +                {
 +                    r_bonded       = max(r_2b,r_mb);
 +                    r_bonded_limit = 1.1*r_bonded;
 +                    comm->bBondComm = TRUE;
 +                }
 +                else
 +                {
 +                    r_bonded       = r_mb;
 +                    r_bonded_limit = min(1.1*r_bonded,comm->cutoff);
 +                }
 +                /* We determine cutoff_mbody later */
 +            }
 +            else
 +            {
 +                /* No special bonded communication,
 +                 * simply increase the DD cut-off.
 +                 */
 +                r_bonded_limit     = 1.1*max(r_2b,r_mb);
 +                comm->cutoff_mbody = r_bonded_limit;
 +                comm->cutoff       = max(comm->cutoff,comm->cutoff_mbody);
 +            }
 +        }
 +        comm->cellsize_limit = max(comm->cellsize_limit,r_bonded_limit);
 +        if (fplog)
 +        {
 +            fprintf(fplog,
 +                    "Minimum cell size due to bonded interactions: %.3f nm\n",
 +                    comm->cellsize_limit);
 +        }
 +    }
 +
 +    if (dd->bInterCGcons && rconstr <= 0)
 +    {
 +        /* There is a cell size limit due to the constraints (P-LINCS) */
 +        rconstr = constr_r_max(fplog,mtop,ir);
 +        if (fplog)
 +        {
 +            fprintf(fplog,
 +                    "Estimated maximum distance required for P-LINCS: %.3f nm\n",
 +                    rconstr);
 +            if (rconstr > comm->cellsize_limit)
 +            {
 +                fprintf(fplog,"This distance will limit the DD cell size, you can override this with -rcon\n");
 +            }
 +        }
 +    }
 +    else if (rconstr > 0 && fplog)
 +    {
 +        /* Here we do not check for dd->bInterCGcons,
 +         * because one can also set a cell size limit for virtual sites only
 +         * and at this point we don't know yet if there are intercg v-sites.
 +         */
 +        fprintf(fplog,
 +                "User supplied maximum distance required for P-LINCS: %.3f nm\n",
 +                rconstr);
 +    }
 +    comm->cellsize_limit = max(comm->cellsize_limit,rconstr);
 +
 +    comm->cgs_gl = gmx_mtop_global_cgs(mtop);
 +
 +    if (nc[XX] > 0)
 +    {
 +        copy_ivec(nc,dd->nc);
 +        set_dd_dim(fplog,dd);
 +        set_ddbox_cr(cr,&dd->nc,ir,box,&comm->cgs_gl,x,ddbox);
 +
 +        if (cr->npmenodes == -1)
 +        {
 +            cr->npmenodes = 0;
 +        }
 +        acs = average_cellsize_min(dd,ddbox);
 +        if (acs < comm->cellsize_limit)
 +        {
 +            if (fplog)
 +            {
 +                fprintf(fplog,"ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n",acs,comm->cellsize_limit);
 +            }
 +            gmx_fatal_collective(FARGS,cr,NULL,
 +                                 "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
 +                                 acs,comm->cellsize_limit);
 +        }
 +    }
 +    else
 +    {
 +        set_ddbox_cr(cr,NULL,ir,box,&comm->cgs_gl,x,ddbox);
 +
 +        /* We need to choose the optimal DD grid and possibly PME nodes */
 +        limit = dd_choose_grid(fplog,cr,dd,ir,mtop,box,ddbox,
 +                               comm->eDLB!=edlbNO,dlb_scale,
 +                               comm->cellsize_limit,comm->cutoff,
 +                               comm->bInterCGBondeds,comm->bInterCGMultiBody);
 +        
 +        if (dd->nc[XX] == 0)
 +        {
 +            bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
 +            sprintf(buf,"Change the number of nodes or mdrun option %s%s%s",
 +                    !bC ? "-rdd" : "-rcon",
 +                    comm->eDLB!=edlbNO ? " or -dds" : "",
 +                    bC ? " or your LINCS settings" : "");
 +
 +            gmx_fatal_collective(FARGS,cr,NULL,
 +                                 "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
 +                                 "%s\n"
 +                                 "Look in the log file for details on the domain decomposition",
 +                                 cr->nnodes-cr->npmenodes,limit,buf);
 +        }
 +        set_dd_dim(fplog,dd);
 +    }
 +
 +    if (fplog)
 +    {
 +        fprintf(fplog,
 +                "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
 +                dd->nc[XX],dd->nc[YY],dd->nc[ZZ],cr->npmenodes);
 +    }
 +    
 +    dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
 +    if (cr->nnodes - dd->nnodes != cr->npmenodes)
 +    {
 +        gmx_fatal_collective(FARGS,cr,NULL,
 +                             "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
 +                             dd->nnodes,cr->nnodes - cr->npmenodes,cr->nnodes);
 +    }
 +    if (cr->npmenodes > dd->nnodes)
 +    {
 +        gmx_fatal_collective(FARGS,cr,NULL,
 +                             "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.",cr->npmenodes,dd->nnodes);
 +    }
 +    if (cr->npmenodes > 0)
 +    {
 +        comm->npmenodes = cr->npmenodes;
 +    }
 +    else
 +    {
 +        comm->npmenodes = dd->nnodes;
 +    }
 +
 +    if (EEL_PME(ir->coulombtype))
 +    {
 +        /* The following choices should match those
 +         * in comm_cost_est in domdec_setup.c.
 +         * Note that here the checks have to take into account
 +         * that the decomposition might occur in a different order than xyz
 +         * (for instance through the env.var. GMX_DD_ORDER_ZYX),
 +         * in which case they will not match those in comm_cost_est,
 +         * but since that is mainly for testing purposes that's fine.
 +         */
 +        if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
 +            comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
 +            getenv("GMX_PMEONEDD") == NULL)
 +        {
 +            comm->npmedecompdim = 2;
 +            comm->npmenodes_x   = dd->nc[XX];
 +            comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
 +        }
 +        else
 +        {
 +            /* In case nc is 1 in both x and y we could still choose to
 +             * decompose pme in y instead of x, but we use x for simplicity.
 +             */
 +            comm->npmedecompdim = 1;
 +            if (dd->dim[0] == YY)
 +            {
 +                comm->npmenodes_x = 1;
 +                comm->npmenodes_y = comm->npmenodes;
 +            }
 +            else
 +            {
 +                comm->npmenodes_x = comm->npmenodes;
 +                comm->npmenodes_y = 1;
 +            }
 +        }    
 +        if (fplog)
 +        {
 +            fprintf(fplog,"PME domain decomposition: %d x %d x %d\n",
 +                    comm->npmenodes_x,comm->npmenodes_y,1);
 +        }
 +    }
 +    else
 +    {
 +        comm->npmedecompdim = 0;
 +        comm->npmenodes_x   = 0;
 +        comm->npmenodes_y   = 0;
 +    }
 +    
 +    /* Technically we don't need both of these,
 +     * but it simplifies code not having to recalculate it.
 +     */
 +    *npme_x = comm->npmenodes_x;
 +    *npme_y = comm->npmenodes_y;
 +        
 +    snew(comm->slb_frac,DIM);
 +    if (comm->eDLB == edlbNO)
 +    {
 +        comm->slb_frac[XX] = get_slb_frac(fplog,"x",dd->nc[XX],sizex);
 +        comm->slb_frac[YY] = get_slb_frac(fplog,"y",dd->nc[YY],sizey);
 +        comm->slb_frac[ZZ] = get_slb_frac(fplog,"z",dd->nc[ZZ],sizez);
 +    }
 +
 +    if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
 +    {
 +        if (comm->bBondComm || comm->eDLB != edlbNO)
 +        {
 +            /* Set the bonded communication distance to halfway
 +             * the minimum and the maximum,
 +             * since the extra communication cost is nearly zero.
 +             */
 +            acs = average_cellsize_min(dd,ddbox);
 +            comm->cutoff_mbody = 0.5*(r_bonded + acs);
 +            if (comm->eDLB != edlbNO)
 +            {
 +                /* Check if this does not limit the scaling */
 +                comm->cutoff_mbody = min(comm->cutoff_mbody,dlb_scale*acs);
 +            }
 +            if (!comm->bBondComm)
 +            {
 +                /* Without bBondComm do not go beyond the n.b. cut-off */
 +                comm->cutoff_mbody = min(comm->cutoff_mbody,comm->cutoff);
 +                if (comm->cellsize_limit >= comm->cutoff)
 +                {
 +                    /* We don't loose a lot of efficieny
 +                     * when increasing it to the n.b. cut-off.
 +                     * It can even be slightly faster, because we need
 +                     * less checks for the communication setup.
 +                     */
 +                    comm->cutoff_mbody = comm->cutoff;
 +                }
 +            }
 +            /* Check if we did not end up below our original limit */
 +            comm->cutoff_mbody = max(comm->cutoff_mbody,r_bonded_limit);
 +
 +            if (comm->cutoff_mbody > comm->cellsize_limit)
 +            {
 +                comm->cellsize_limit = comm->cutoff_mbody;
 +            }
 +        }
 +        /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"Bonded atom communication beyond the cut-off: %d\n"
 +                "cellsize limit %f\n",
 +                comm->bBondComm,comm->cellsize_limit);
 +    }
 +    
 +    if (MASTER(cr))
 +    {
 +        check_dd_restrictions(cr,dd,ir,fplog);
 +    }
 +
 +    comm->globalcomm_step = INT_MIN;
 +    dd->ddp_count = 0;
 +
 +    clear_dd_cycle_counts(dd);
 +
 +    return dd;
 +}
 +
 +static void set_dlb_limits(gmx_domdec_t *dd)
 +
 +{
 +    int d;
 +
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
 +        dd->comm->cellsize_min[dd->dim[d]] =
 +            dd->comm->cellsize_min_dlb[dd->dim[d]];
 +    }
 +}
 +
 +
 +static void turn_on_dlb(FILE *fplog,t_commrec *cr,gmx_large_int_t step)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_domdec_comm_t *comm;
 +    real cellsize_min;
 +    int  d,nc,i;
 +    char buf[STRLEN];
 +    
 +    dd = cr->dd;
 +    comm = dd->comm;
 +    
 +    if (fplog)
 +    {
 +        fprintf(fplog,"At step %s the performance loss due to force load imbalance is %.1f %%\n",gmx_step_str(step,buf),dd_force_imb_perf_loss(dd)*100);
 +    }
 +
 +    cellsize_min = comm->cellsize_min[dd->dim[0]];
 +    for(d=1; d<dd->ndim; d++)
 +    {
 +        cellsize_min = min(cellsize_min,comm->cellsize_min[dd->dim[d]]);
 +    }
 +
 +    if (cellsize_min < comm->cellsize_limit*1.05)
 +    {
 +        dd_warning(cr,fplog,"NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
 +
 +        /* Change DLB from "auto" to "no". */
 +        comm->eDLB = edlbNO;
 +
 +        return;
 +    }
 +
 +    dd_warning(cr,fplog,"NOTE: Turning on dynamic load balancing\n");
 +    comm->bDynLoadBal = TRUE;
 +    dd->bGridJump = TRUE;
 +    
 +    set_dlb_limits(dd);
 +
 +    /* We can set the required cell size info here,
 +     * so we do not need to communicate this.
 +     * The grid is completely uniform.
 +     */
 +    for(d=0; d<dd->ndim; d++)
 +    {
 +        if (comm->root[d])
 +        {
 +            comm->load[d].sum_m = comm->load[d].sum;
 +
 +            nc = dd->nc[dd->dim[d]];
 +            for(i=0; i<nc; i++)
 +            {
 +                comm->root[d]->cell_f[i]    = i/(real)nc;
 +                if (d > 0)
 +                {
 +                    comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
 +                    comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
 +                }
 +            }
 +            comm->root[d]->cell_f[nc] = 1.0;
 +        }
 +    }
 +}
 +
 +static char *init_bLocalCG(gmx_mtop_t *mtop)
 +{
 +    int  ncg,cg;
 +    char *bLocalCG;
 +    
 +    ncg = ncg_mtop(mtop);
 +    snew(bLocalCG,ncg);
 +    for(cg=0; cg<ncg; cg++)
 +    {
 +        bLocalCG[cg] = FALSE;
 +    }
 +
 +    return bLocalCG;
 +}
 +
 +void dd_init_bondeds(FILE *fplog,
 +                     gmx_domdec_t *dd,gmx_mtop_t *mtop,
 +                     gmx_vsite_t *vsite,gmx_constr_t constr,
 +                     t_inputrec *ir,gmx_bool bBCheck,cginfo_mb_t *cginfo_mb)
 +{
 +    gmx_domdec_comm_t *comm;
 +    gmx_bool bBondComm;
 +    int  d;
 +
 +    dd_make_reverse_top(fplog,dd,mtop,vsite,constr,ir,bBCheck);
 +
 +    comm = dd->comm;
 +
 +    if (comm->bBondComm)
 +    {
 +        /* Communicate atoms beyond the cut-off for bonded interactions */
 +        comm = dd->comm;
 +
 +        comm->cglink = make_charge_group_links(mtop,dd,cginfo_mb);
 +
 +        comm->bLocalCG = init_bLocalCG(mtop);
 +    }
 +    else
 +    {
 +        /* Only communicate atoms based on cut-off */
 +        comm->cglink   = NULL;
 +        comm->bLocalCG = NULL;
 +    }
 +}
 +
 +static void print_dd_settings(FILE *fplog,gmx_domdec_t *dd,
 +                              t_inputrec *ir,
 +                              gmx_bool bDynLoadBal,real dlb_scale,
 +                              gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  d;
 +    ivec np;
 +    real limit,shrink;
 +    char buf[64];
 +
 +    if (fplog == NULL)
 +    {
 +        return;
 +    }
 +
 +    comm = dd->comm;
 +
 +    if (bDynLoadBal)
 +    {
 +        fprintf(fplog,"The maximum number of communication pulses is:");
 +        for(d=0; d<dd->ndim; d++)
 +        {
 +            fprintf(fplog," %c %d",dim2char(dd->dim[d]),comm->cd[d].np_dlb);
 +        }
 +        fprintf(fplog,"\n");
 +        fprintf(fplog,"The minimum size for domain decomposition cells is %.3f nm\n",comm->cellsize_limit);
 +        fprintf(fplog,"The requested allowed shrink of DD cells (option -dds) is: %.2f\n",dlb_scale);
 +        fprintf(fplog,"The allowed shrink of domain decomposition cells is:");
 +        for(d=0; d<DIM; d++)
 +        {
 +            if (dd->nc[d] > 1)
 +            {
 +                if (d >= ddbox->npbcdim && dd->nc[d] == 2)
 +                {
 +                    shrink = 0;
 +                }
 +                else
 +                {
 +                    shrink =
 +                        comm->cellsize_min_dlb[d]/
 +                        (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
 +                }
 +                fprintf(fplog," %c %.2f",dim2char(d),shrink);
 +            }
 +        }
 +        fprintf(fplog,"\n");
 +    }
 +    else
 +    {
 +        set_dd_cell_sizes_slb(dd,ddbox,FALSE,np);
 +        fprintf(fplog,"The initial number of communication pulses is:");
 +        for(d=0; d<dd->ndim; d++)
 +        {
 +            fprintf(fplog," %c %d",dim2char(dd->dim[d]),np[dd->dim[d]]);
 +        }
 +        fprintf(fplog,"\n");
 +        fprintf(fplog,"The initial domain decomposition cell size is:");
 +        for(d=0; d<DIM; d++) {
 +            if (dd->nc[d] > 1)
 +            {
 +                fprintf(fplog," %c %.2f nm",
 +                        dim2char(d),dd->comm->cellsize_min[d]);
 +            }
 +        }
 +        fprintf(fplog,"\n\n");
 +    }
 +    
 +    if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
 +    {
 +        fprintf(fplog,"The maximum allowed distance for charge groups involved in interactions is:\n");
 +        fprintf(fplog,"%40s  %-7s %6.3f nm\n",
 +                "non-bonded interactions","",comm->cutoff);
 +
 +        if (bDynLoadBal)
 +        {
 +            limit = dd->comm->cellsize_limit;
 +        }
 +        else
 +        {
 +            if (dynamic_dd_box(ddbox,ir))
 +            {
 +                fprintf(fplog,"(the following are initial values, they could change due to box deformation)\n");
 +            }
 +            limit = dd->comm->cellsize_min[XX];
 +            for(d=1; d<DIM; d++)
 +            {
 +                limit = min(limit,dd->comm->cellsize_min[d]);
 +            }
 +        }
 +
 +        if (comm->bInterCGBondeds)
 +        {
 +            fprintf(fplog,"%40s  %-7s %6.3f nm\n",
 +                    "two-body bonded interactions","(-rdd)",
 +                    max(comm->cutoff,comm->cutoff_mbody));
 +            fprintf(fplog,"%40s  %-7s %6.3f nm\n",
 +                    "multi-body bonded interactions","(-rdd)",
 +                    (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff,limit));
 +        }
 +        if (dd->vsite_comm)
 +        {
 +            fprintf(fplog,"%40s  %-7s %6.3f nm\n",
 +                    "virtual site constructions","(-rcon)",limit);
 +        }
 +        if (dd->constraint_comm)
 +        {
 +            sprintf(buf,"atoms separated by up to %d constraints",
 +                    1+ir->nProjOrder);
 +            fprintf(fplog,"%40s  %-7s %6.3f nm\n",
 +                    buf,"(-rcon)",limit);
 +        }
 +        fprintf(fplog,"\n");
 +    }
 +    
 +    fflush(fplog);
 +}
 +
 +void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale,
 +                       t_inputrec *ir,t_forcerec *fr,
 +                       gmx_ddbox_t *ddbox)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int  d,dim,npulse,npulse_d_max,npulse_d;
 +    gmx_bool bNoCutOff;
 +    int  natoms_tot;
 +    real vol_frac;
 +
 +    comm = dd->comm;
 +
 +    bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
 +
 +    if (EEL_PME(ir->coulombtype))
 +    {
 +        init_ddpme(dd,&comm->ddpme[0],0);
 +        if (comm->npmedecompdim >= 2)
 +        {
 +            init_ddpme(dd,&comm->ddpme[1],1);
 +        }
 +    }
 +    else
 +    {
 +        comm->npmenodes = 0;
 +        if (dd->pme_nodeid >= 0)
 +        {
 +            gmx_fatal_collective(FARGS,NULL,dd,
 +                                 "Can not have separate PME nodes without PME electrostatics");
 +        }
 +    }
 +    
 +    /* If each molecule is a single charge group
 +     * or we use domain decomposition for each periodic dimension,
 +     * we do not need to take pbc into account for the bonded interactions.
 +     */
 +    if (fr->ePBC == epbcNONE || !comm->bInterCGBondeds ||
 +        (dd->nc[XX]>1 && dd->nc[YY]>1 && (dd->nc[ZZ]>1 || fr->ePBC==epbcXY)))
 +    {
 +        fr->bMolPBC = FALSE;
 +    }
 +    else
 +    {
 +        fr->bMolPBC = TRUE;
 +    }
 +        
 +    if (debug)
 +    {
 +        fprintf(debug,"The DD cut-off is %f\n",comm->cutoff);
 +    }
 +    if (comm->eDLB != edlbNO)
 +    {
 +        /* Determine the maximum number of comm. pulses in one dimension */
 +        
 +        comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
 +        
 +        /* Determine the maximum required number of grid pulses */
 +        if (comm->cellsize_limit >= comm->cutoff)
 +        {
 +            /* Only a single pulse is required */
 +            npulse = 1;
 +        }
 +        else if (!bNoCutOff && comm->cellsize_limit > 0)
 +        {
 +            /* We round down slightly here to avoid overhead due to the latency
 +             * of extra communication calls when the cut-off
 +             * would be only slightly longer than the cell size.
 +             * Later cellsize_limit is redetermined,
 +             * so we can not miss interactions due to this rounding.
 +             */
 +            npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
 +        }
 +        else
 +        {
 +            /* There is no cell size limit */
 +            npulse = max(dd->nc[XX]-1,max(dd->nc[YY]-1,dd->nc[ZZ]-1));
 +        }
 +
 +        if (!bNoCutOff && npulse > 1)
 +        {
 +            /* See if we can do with less pulses, based on dlb_scale */
 +            npulse_d_max = 0;
 +            for(d=0; d<dd->ndim; d++)
 +            {
 +                dim = dd->dim[d];
 +                npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
 +                                 /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
 +                npulse_d_max = max(npulse_d_max,npulse_d);
 +            }
 +            npulse = min(npulse,npulse_d_max);
 +        }
 +        
 +        /* This env var can override npulse */
 +        d = dd_nst_env(fplog,"GMX_DD_NPULSE",0);
 +        if (d > 0)
 +        {
 +            npulse = d;
 +        }
 +
 +        comm->maxpulse = 1;
 +        comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
 +        for(d=0; d<dd->ndim; d++)
 +        {
 +            comm->cd[d].np_dlb = min(npulse,dd->nc[dd->dim[d]]-1);
 +            comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
 +            snew(comm->cd[d].ind,comm->cd[d].np_nalloc);
 +            comm->maxpulse = max(comm->maxpulse,comm->cd[d].np_dlb);
 +            if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
 +            {
 +                comm->bVacDLBNoLimit = FALSE;
 +            }
 +        }
 +        
 +        /* cellsize_limit is set for LINCS in init_domain_decomposition */
 +        if (!comm->bVacDLBNoLimit)
 +        {
 +            comm->cellsize_limit = max(comm->cellsize_limit,
 +                                       comm->cutoff/comm->maxpulse);
 +        }
 +        comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
 +        /* Set the minimum cell size for each DD dimension */
 +        for(d=0; d<dd->ndim; d++)
 +        {
 +            if (comm->bVacDLBNoLimit ||
 +                comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
 +            {
 +                comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
 +            }
 +            else
 +            {
 +                comm->cellsize_min_dlb[dd->dim[d]] =
 +                    comm->cutoff/comm->cd[d].np_dlb;
 +            }
 +        }
 +        if (comm->cutoff_mbody <= 0)
 +        {
 +            comm->cutoff_mbody = min(comm->cutoff,comm->cellsize_limit);
 +        }
 +        if (comm->bDynLoadBal)
 +        {
 +            set_dlb_limits(dd);
 +        }
 +    }
 +    
 +    print_dd_settings(fplog,dd,ir,comm->bDynLoadBal,dlb_scale,ddbox);
 +    if (comm->eDLB == edlbAUTO)
 +    {
 +        if (fplog)
 +        {
 +            fprintf(fplog,"When dynamic load balancing gets turned on, these settings will change to:\n");
 +        }
 +        print_dd_settings(fplog,dd,ir,TRUE,dlb_scale,ddbox);
 +    }
 +
 +    if (ir->ePBC == epbcNONE)
 +    {
 +        vol_frac = 1 - 1/(double)dd->nnodes;
 +    }
 +    else
 +    {
 +        vol_frac =
 +            (1 + comm_box_frac(dd->nc,comm->cutoff,ddbox))/(double)dd->nnodes;
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug,"Volume fraction for all DD zones: %f\n",vol_frac);
 +    }
 +    natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
 +   
 +    dd->ga2la = ga2la_init(natoms_tot,vol_frac*natoms_tot);
 +}
 +
 +static void merge_cg_buffers(int ncell,
 +                             gmx_domdec_comm_dim_t *cd, int pulse,
 +                             int  *ncg_cell,
 +                             int  *index_gl, int  *recv_i,
 +                             rvec *cg_cm,    rvec *recv_vr,
 +                             int *cgindex,
 +                             cginfo_mb_t *cginfo_mb,int *cginfo)
 +{
 +    gmx_domdec_ind_t *ind,*ind_p;
 +    int p,cell,c,cg,cg0,cg1,cg_gl,nat;
 +    int shift,shift_at;
 +    
 +    ind = &cd->ind[pulse];
 +    
 +    /* First correct the already stored data */
 +    shift = ind->nrecv[ncell];
 +    for(cell=ncell-1; cell>=0; cell--)
 +    {
 +        shift -= ind->nrecv[cell];
 +        if (shift > 0)
 +        {
 +            /* Move the cg's present from previous grid pulses */
 +            cg0 = ncg_cell[ncell+cell];
 +            cg1 = ncg_cell[ncell+cell+1];
 +            cgindex[cg1+shift] = cgindex[cg1];
 +            for(cg=cg1-1; cg>=cg0; cg--)
 +            {
 +                index_gl[cg+shift] = index_gl[cg];
 +                copy_rvec(cg_cm[cg],cg_cm[cg+shift]);
 +                cgindex[cg+shift] = cgindex[cg];
 +                cginfo[cg+shift] = cginfo[cg];
 +            }
 +            /* Correct the already stored send indices for the shift */
 +            for(p=1; p<=pulse; p++)
 +            {
 +                ind_p = &cd->ind[p];
 +                cg0 = 0;
 +                for(c=0; c<cell; c++)
 +                {
 +                    cg0 += ind_p->nsend[c];
 +                }
 +                cg1 = cg0 + ind_p->nsend[cell];
 +                for(cg=cg0; cg<cg1; cg++)
 +                {
 +                    ind_p->index[cg] += shift;
 +                }
 +            }
 +        }
 +    }
 +
 +    /* Merge in the communicated buffers */
 +    shift = 0;
 +    shift_at = 0;
 +    cg0 = 0;
 +    for(cell=0; cell<ncell; cell++)
 +    {
 +        cg1 = ncg_cell[ncell+cell+1] + shift;
 +        if (shift_at > 0)
 +        {
 +            /* Correct the old cg indices */
 +            for(cg=ncg_cell[ncell+cell]; cg<cg1; cg++)
 +            {
 +                cgindex[cg+1] += shift_at;
 +            }
 +        }
 +        for(cg=0; cg<ind->nrecv[cell]; cg++)
 +        {
 +            /* Copy this charge group from the buffer */
 +            index_gl[cg1] = recv_i[cg0];
 +            copy_rvec(recv_vr[cg0],cg_cm[cg1]);
 +            /* Add it to the cgindex */
 +            cg_gl = index_gl[cg1];
 +            cginfo[cg1] = ddcginfo(cginfo_mb,cg_gl);
 +            nat = GET_CGINFO_NATOMS(cginfo[cg1]);
 +            cgindex[cg1+1] = cgindex[cg1] + nat;
 +            cg0++;
 +            cg1++;
 +            shift_at += nat;
 +        }
 +        shift += ind->nrecv[cell];
 +        ncg_cell[ncell+cell+1] = cg1;
 +    }
 +}
 +
 +static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
 +                               int nzone,int cg0,const int *cgindex)
 +{
 +    int cg,zone,p;
 +    
 +    /* Store the atom block boundaries for easy copying of communication buffers
 +     */
 +    cg = cg0;
 +    for(zone=0; zone<nzone; zone++)
 +    {
 +        for(p=0; p<cd->np; p++) {
 +            cd->ind[p].cell2at0[zone] = cgindex[cg];
 +            cg += cd->ind[p].nrecv[zone];
 +            cd->ind[p].cell2at1[zone] = cgindex[cg];
 +        }
 +    }
 +}
 +
 +static gmx_bool missing_link(t_blocka *link,int cg_gl,char *bLocalCG)
 +{
 +    int  i;
 +    gmx_bool bMiss;
 +
 +    bMiss = FALSE;
 +    for(i=link->index[cg_gl]; i<link->index[cg_gl+1]; i++)
 +    {
 +        if (!bLocalCG[link->a[i]])
 +        {
 +            bMiss = TRUE;
 +        }
 +    }
 +
 +    return bMiss;
 +}
 +
 +static void setup_dd_communication(gmx_domdec_t *dd,
 +                                   matrix box,gmx_ddbox_t *ddbox,t_forcerec *fr)
 +{
 +    int dim_ind,dim,dim0,dim1=-1,dim2=-1,dimd,p,nat_tot;
 +    int nzone,nzone_send,zone,zonei,cg0,cg1;
 +    int c,i,j,cg,cg_gl,nrcg;
 +    int *zone_cg_range,pos_cg,*index_gl,*cgindex,*recv_i;
 +    gmx_domdec_comm_t *comm;
 +    gmx_domdec_zones_t *zones;
 +    gmx_domdec_comm_dim_t *cd;
 +    gmx_domdec_ind_t *ind;
 +    cginfo_mb_t *cginfo_mb;
 +    gmx_bool bBondComm,bDist2B,bDistMB,bDistMB_pulse,bDistBonded,bScrew;
 +    real r_mb,r_comm2,r_scomm2,r_bcomm2,r,r_0,r_1,r2,rb2,r2inc,inv_ncg,tric_sh;
 +    rvec rb,rn;
 +    real corner[DIM][4],corner_round_0=0,corner_round_1[4];
 +    real bcorner[DIM],bcorner_round_1=0;
 +    ivec tric_dist;
 +    rvec *cg_cm,*normal,*v_d,*v_0=NULL,*v_1=NULL,*recv_vr;
 +    real skew_fac2_d,skew_fac_01;
 +    rvec sf2_round;
 +    int  nsend,nat;
 +    
 +    if (debug)
 +    {
 +        fprintf(debug,"Setting up DD communication\n");
 +    }
 +    
 +    comm  = dd->comm;
 +    cg_cm = fr->cg_cm;
 +
 +    for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
 +    {
 +        dim = dd->dim[dim_ind];
 +
 +        /* Check if we need to use triclinic distances */
 +        tric_dist[dim_ind] = 0;
 +        for(i=0; i<=dim_ind; i++)
 +        {
 +            if (ddbox->tric_dir[dd->dim[i]])
 +            {
 +                tric_dist[dim_ind] = 1;
 +            }
 +        }
 +    }
 +
 +    bBondComm = comm->bBondComm;
 +
 +    /* Do we need to determine extra distances for multi-body bondeds? */
 +    bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
 +    
 +    /* Do we need to determine extra distances for only two-body bondeds? */
 +    bDist2B = (bBondComm && !bDistMB);
 +
 +    r_comm2  = sqr(comm->cutoff);
 +    r_bcomm2 = sqr(comm->cutoff_mbody);
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"bBondComm %d, r_bc %f\n",bBondComm,sqrt(r_bcomm2));
 +    }
 +
 +    zones = &comm->zones;
 +    
 +    dim0 = dd->dim[0];
 +    /* The first dimension is equal for all cells */
 +    corner[0][0] = comm->cell_x0[dim0];
 +    if (bDistMB)
 +    {
 +        bcorner[0] = corner[0][0];
 +    }
 +    if (dd->ndim >= 2)
 +    {
 +        dim1 = dd->dim[1];
 +        /* This cell row is only seen from the first row */
 +        corner[1][0] = comm->cell_x0[dim1];
 +        /* All rows can see this row */
 +        corner[1][1] = comm->cell_x0[dim1];
 +        if (dd->bGridJump)
 +        {
 +            corner[1][1] = max(comm->cell_x0[dim1],comm->zone_d1[1].mch0);
 +            if (bDistMB)
 +            {
 +                /* For the multi-body distance we need the maximum */
 +                bcorner[1] = max(comm->cell_x0[dim1],comm->zone_d1[1].p1_0);
 +            }
 +        }
 +        /* Set the upper-right corner for rounding */
 +        corner_round_0 = comm->cell_x1[dim0];
 +        
 +        if (dd->ndim >= 3)
 +        {
 +            dim2 = dd->dim[2];
 +            for(j=0; j<4; j++)
 +            {
 +                corner[2][j] = comm->cell_x0[dim2];
 +            }
 +            if (dd->bGridJump)
 +            {
 +                /* Use the maximum of the i-cells that see a j-cell */
 +                for(i=0; i<zones->nizone; i++)
 +                {
 +                    for(j=zones->izone[i].j0; j<zones->izone[i].j1; j++)
 +                    {
 +                        if (j >= 4)
 +                        {
 +                            corner[2][j-4] =
 +                                max(corner[2][j-4],
 +                                    comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
 +                        }
 +                    }
 +                }
 +                if (bDistMB)
 +                {
 +                    /* For the multi-body distance we need the maximum */
 +                    bcorner[2] = comm->cell_x0[dim2];
 +                    for(i=0; i<2; i++)
 +                    {
 +                        for(j=0; j<2; j++)
 +                        {
 +                            bcorner[2] = max(bcorner[2],
 +                                             comm->zone_d2[i][j].p1_0);
 +                        }
 +                    }
 +                }
 +            }
 +            
 +            /* Set the upper-right corner for rounding */
 +            /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
 +             * Only cell (0,0,0) can see cell 7 (1,1,1)
 +             */
 +            corner_round_1[0] = comm->cell_x1[dim1];
 +            corner_round_1[3] = comm->cell_x1[dim1];
 +            if (dd->bGridJump)
 +            {
 +                corner_round_1[0] = max(comm->cell_x1[dim1],
 +                                        comm->zone_d1[1].mch1);
 +                if (bDistMB)
 +                {
 +                    /* For the multi-body distance we need the maximum */
 +                    bcorner_round_1 = max(comm->cell_x1[dim1],
 +                                          comm->zone_d1[1].p1_1);
 +                }
 +            }
 +        }
 +    }
 +    
 +    /* Triclinic stuff */
 +    normal = ddbox->normal;
 +    skew_fac_01 = 0;
 +    if (dd->ndim >= 2)
 +    {
 +        v_0 = ddbox->v[dim0];
 +        if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
 +        {
 +            /* Determine the coupling coefficient for the distances
 +             * to the cell planes along dim0 and dim1 through dim2.
 +             * This is required for correct rounding.
 +             */
 +            skew_fac_01 =
 +                ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
 +            if (debug)
 +            {
 +                fprintf(debug,"\nskew_fac_01 %f\n",skew_fac_01);
 +            }
 +        }
 +    }
 +    if (dd->ndim >= 3)
 +    {
 +        v_1 = ddbox->v[dim1];
 +    }
 +    
 +    zone_cg_range = zones->cg_range;
 +    index_gl = dd->index_gl;
 +    cgindex  = dd->cgindex;
 +    cginfo_mb = fr->cginfo_mb;
 +    
 +    zone_cg_range[0]   = 0;
 +    zone_cg_range[1]   = dd->ncg_home;
 +    comm->zone_ncg1[0] = dd->ncg_home;
 +    pos_cg             = dd->ncg_home;
 +    
 +    nat_tot = dd->nat_home;
 +    nzone = 1;
 +    for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
 +    {
 +        dim = dd->dim[dim_ind];
 +        cd = &comm->cd[dim_ind];
 +        
 +        if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
 +        {
 +            /* No pbc in this dimension, the first node should not comm. */
 +            nzone_send = 0;
 +        }
 +        else
 +        {
 +            nzone_send = nzone;
 +        }
 +
 +        bScrew = (dd->bScrewPBC && dim == XX);
 +        
 +        v_d = ddbox->v[dim];
 +        skew_fac2_d = sqr(ddbox->skew_fac[dim]);
 +
 +        cd->bInPlace = TRUE;
 +        for(p=0; p<cd->np; p++)
 +        {
 +            /* Only atoms communicated in the first pulse are used
 +             * for multi-body bonded interactions or for bBondComm.
 +             */
 +            bDistBonded   = ((bDistMB || bDist2B) && p == 0);
 +            bDistMB_pulse = (bDistMB && bDistBonded);
 +
 +            ind = &cd->ind[p];
 +            nsend = 0;
 +            nat = 0;
 +            for(zone=0; zone<nzone_send; zone++)
 +            {
 +                if (tric_dist[dim_ind] && dim_ind > 0)
 +                {
 +                    /* Determine slightly more optimized skew_fac's
 +                     * for rounding.
 +                     * This reduces the number of communicated atoms
 +                     * by about 10% for 3D DD of rhombic dodecahedra.
 +                     */
 +                    for(dimd=0; dimd<dim; dimd++)
 +                    {
 +                        sf2_round[dimd] = 1;
 +                        if (ddbox->tric_dir[dimd])
 +                        {
 +                            for(i=dd->dim[dimd]+1; i<DIM; i++)
 +                            {
 +                                /* If we are shifted in dimension i
 +                                 * and the cell plane is tilted forward
 +                                 * in dimension i, skip this coupling.
 +                                 */
 +                                if (!(zones->shift[nzone+zone][i] &&
 +                                      ddbox->v[dimd][i][dimd] >= 0))
 +                                {
 +                                    sf2_round[dimd] +=
 +                                        sqr(ddbox->v[dimd][i][dimd]);
 +                                }
 +                            }
 +                            sf2_round[dimd] = 1/sf2_round[dimd];
 +                        }
 +                    }
 +                }
 +
 +                zonei = zone_perm[dim_ind][zone];
 +                if (p == 0)
 +                {
 +                    /* Here we permutate the zones to obtain a convenient order
 +                     * for neighbor searching
 +                     */
 +                    cg0 = zone_cg_range[zonei];
 +                    cg1 = zone_cg_range[zonei+1];
 +                }
 +                else
 +                {
 +                    /* Look only at the cg's received in the previous grid pulse
 +                     */
 +                    cg1 = zone_cg_range[nzone+zone+1];
 +                    cg0 = cg1 - cd->ind[p-1].nrecv[zone];
 +                }
 +                ind->nsend[zone] = 0;
 +                for(cg=cg0; cg<cg1; cg++)
 +                {
 +                    r2  = 0;
 +                    rb2 = 0;
 +                    if (tric_dist[dim_ind] == 0)
 +                    {
 +                        /* Rectangular direction, easy */
 +                        r = cg_cm[cg][dim] - corner[dim_ind][zone];
 +                        if (r > 0)
 +                        {
 +                            r2 += r*r;
 +                        }
 +                        if (bDistMB_pulse)
 +                        {
 +                            r = cg_cm[cg][dim] - bcorner[dim_ind];
 +                            if (r > 0)
 +                            {
 +                                rb2 += r*r;
 +                            }
 +                        }
 +                        /* Rounding gives at most a 16% reduction
 +                         * in communicated atoms
 +                         */
 +                        if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
 +                        {
 +                            r = cg_cm[cg][dim0] - corner_round_0;
 +                            /* This is the first dimension, so always r >= 0 */
 +                            r2 += r*r;
 +                            if (bDistMB_pulse)
 +                            {
 +                                rb2 += r*r;
 +                            }
 +                        }
 +                        if (dim_ind == 2 && (zonei == 2 || zonei == 3))
 +                        {
 +                            r = cg_cm[cg][dim1] - corner_round_1[zone];
 +                            if (r > 0)
 +                            {
 +                                r2 += r*r;
 +                            }
 +                            if (bDistMB_pulse)
 +                            {
 +                                r = cg_cm[cg][dim1] - bcorner_round_1;
 +                                if (r > 0)
 +                                {
 +                                    rb2 += r*r;
 +                                }
 +                            }
 +                        }
 +                    }
 +                    else
 +                    {
 +                        /* Triclinic direction, more complicated */
 +                        clear_rvec(rn);
 +                        clear_rvec(rb);
 +                        /* Rounding, conservative as the skew_fac multiplication
 +                         * will slightly underestimate the distance.
 +                         */
 +                        if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
 +                        {
 +                            rn[dim0] = cg_cm[cg][dim0] - corner_round_0;
 +                            for(i=dim0+1; i<DIM; i++)
 +                            {
 +                                rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
 +                            }
 +                            r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
 +                            if (bDistMB_pulse)
 +                            {
 +                                rb[dim0] = rn[dim0];
 +                                rb2 = r2;
 +                            }
 +                            /* Take care that the cell planes along dim0 might not
 +                             * be orthogonal to those along dim1 and dim2.
 +                             */
 +                            for(i=1; i<=dim_ind; i++)
 +                            {
 +                                dimd = dd->dim[i];
 +                                if (normal[dim0][dimd] > 0)
 +                                {
 +                                    rn[dimd] -= rn[dim0]*normal[dim0][dimd];
 +                                    if (bDistMB_pulse)
 +                                    {
 +                                        rb[dimd] -= rb[dim0]*normal[dim0][dimd];
 +                                    }
 +                                }
 +                            }
 +                        }
 +                        if (dim_ind == 2 && (zonei == 2 || zonei == 3))
 +                        {
 +                            rn[dim1] += cg_cm[cg][dim1] - corner_round_1[zone];
 +                            tric_sh = 0;
 +                            for(i=dim1+1; i<DIM; i++)
 +                            {
 +                                tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
 +                            }
 +                            rn[dim1] += tric_sh;
 +                            if (rn[dim1] > 0)
 +                            {
 +                                r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
 +                                /* Take care of coupling of the distances
 +                                 * to the planes along dim0 and dim1 through dim2.
 +                                 */
 +                                r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
 +                                /* Take care that the cell planes along dim1
 +                                 * might not be orthogonal to that along dim2.
 +                                 */
 +                                if (normal[dim1][dim2] > 0)
 +                                {
 +                                    rn[dim2] -= rn[dim1]*normal[dim1][dim2];
 +                                }
 +                            }
 +                            if (bDistMB_pulse)
 +                            {
 +                                rb[dim1] +=
 +                                    cg_cm[cg][dim1] - bcorner_round_1 + tric_sh;
 +                                if (rb[dim1] > 0)
 +                                {
 +                                    rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
 +                                    /* Take care of coupling of the distances
 +                                     * to the planes along dim0 and dim1 through dim2.
 +                                     */
 +                                    rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
 +                                    /* Take care that the cell planes along dim1
 +                                     * might not be orthogonal to that along dim2.
 +                                     */
 +                                    if (normal[dim1][dim2] > 0)
 +                                    {
 +                                        rb[dim2] -= rb[dim1]*normal[dim1][dim2];
 +                                    }
 +                                }
 +                            }
 +                        }
 +                        /* The distance along the communication direction */
 +                        rn[dim] += cg_cm[cg][dim] - corner[dim_ind][zone];
 +                        tric_sh = 0;
 +                        for(i=dim+1; i<DIM; i++)
 +                        {
 +                            tric_sh -= cg_cm[cg][i]*v_d[i][dim];
 +                        }
 +                        rn[dim] += tric_sh;
 +                        if (rn[dim] > 0)
 +                        {
 +                            r2 += rn[dim]*rn[dim]*skew_fac2_d;
 +                            /* Take care of coupling of the distances
 +                             * to the planes along dim0 and dim1 through dim2.
 +                             */
 +                            if (dim_ind == 1 && zonei == 1)
 +                            {
 +                                r2 -= rn[dim0]*rn[dim]*skew_fac_01;
 +                            }
 +                        }
 +                        if (bDistMB_pulse)
 +                        {
 +                            clear_rvec(rb);
 +                            rb[dim] += cg_cm[cg][dim] - bcorner[dim_ind] + tric_sh;
 +                            if (rb[dim] > 0)
 +                            {
 +                                rb2 += rb[dim]*rb[dim]*skew_fac2_d;
 +                                /* Take care of coupling of the distances
 +                                 * to the planes along dim0 and dim1 through dim2.
 +                                 */
 +                                if (dim_ind == 1 && zonei == 1)
 +                                {
 +                                    rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
 +                                }
 +                            }
 +                        }
 +                    }
 +                    
 +                    if (r2 < r_comm2 ||
 +                        (bDistBonded &&
 +                         ((bDistMB && rb2 < r_bcomm2) ||
 +                          (bDist2B && r2  < r_bcomm2)) &&
 +                         (!bBondComm ||
 +                          (GET_CGINFO_BOND_INTER(fr->cginfo[cg]) &&
 +                           missing_link(comm->cglink,index_gl[cg],
 +                                        comm->bLocalCG)))))
 +                    {
 +                        /* Make an index to the local charge groups */
 +                        if (nsend+1 > ind->nalloc)
 +                        {
 +                            ind->nalloc = over_alloc_large(nsend+1);
 +                            srenew(ind->index,ind->nalloc);
 +                        }
 +                        if (nsend+1 > comm->nalloc_int)
 +                        {
 +                            comm->nalloc_int = over_alloc_large(nsend+1);
 +                            srenew(comm->buf_int,comm->nalloc_int);
 +                        }
 +                        ind->index[nsend] = cg;
 +                        comm->buf_int[nsend] = index_gl[cg];
 +                        ind->nsend[zone]++;
 +                        vec_rvec_check_alloc(&comm->vbuf,nsend+1);
 +
 +                        if (dd->ci[dim] == 0)
 +                        {
 +                            /* Correct cg_cm for pbc */
 +                            rvec_add(cg_cm[cg],box[dim],comm->vbuf.v[nsend]);
 +                            if (bScrew)
 +                            {
 +                                comm->vbuf.v[nsend][YY] =
 +                                    box[YY][YY]-comm->vbuf.v[nsend][YY];
 +                                comm->vbuf.v[nsend][ZZ] =
 +                                    box[ZZ][ZZ]-comm->vbuf.v[nsend][ZZ];
 +                            }
 +                        }
 +                        else
 +                        {
 +                            copy_rvec(cg_cm[cg],comm->vbuf.v[nsend]);
 +                        }
 +                        nsend++;
 +                        nat += cgindex[cg+1] - cgindex[cg];
 +                    }
 +                }
 +            }
 +            /* Clear the counts in case we do not have pbc */
 +            for(zone=nzone_send; zone<nzone; zone++)
 +            {
 +                ind->nsend[zone] = 0;
 +            }
 +            ind->nsend[nzone]   = nsend;
 +            ind->nsend[nzone+1] = nat;
 +            /* Communicate the number of cg's and atoms to receive */
 +            dd_sendrecv_int(dd, dim_ind, dddirBackward,
 +                            ind->nsend, nzone+2,
 +                            ind->nrecv, nzone+2);
 +            
 +            /* The rvec buffer is also required for atom buffers of size nsend
 +             * in dd_move_x and dd_move_f.
 +             */
 +            vec_rvec_check_alloc(&comm->vbuf,ind->nsend[nzone+1]);
 +
 +            if (p > 0)
 +            {
 +                /* We can receive in place if only the last zone is not empty */
 +                for(zone=0; zone<nzone-1; zone++)
 +                {
 +                    if (ind->nrecv[zone] > 0)
 +                    {
 +                        cd->bInPlace = FALSE;
 +                    }
 +                }
 +                if (!cd->bInPlace)
 +                {
 +                    /* The int buffer is only required here for the cg indices */
 +                    if (ind->nrecv[nzone] > comm->nalloc_int2)
 +                    {
 +                        comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
 +                        srenew(comm->buf_int2,comm->nalloc_int2);
 +                    }
 +                    /* The rvec buffer is also required for atom buffers
 +                     * of size nrecv in dd_move_x and dd_move_f.
 +                     */
 +                    i = max(cd->ind[0].nrecv[nzone+1],ind->nrecv[nzone+1]);
 +                    vec_rvec_check_alloc(&comm->vbuf2,i);
 +                }
 +            }
 +            
 +            /* Make space for the global cg indices */
 +            if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
 +                || dd->cg_nalloc == 0)
 +            {
 +                dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
 +                srenew(index_gl,dd->cg_nalloc);
 +                srenew(cgindex,dd->cg_nalloc+1);
 +            }
 +            /* Communicate the global cg indices */
 +            if (cd->bInPlace)
 +            {
 +                recv_i = index_gl + pos_cg;
 +            }
 +            else
 +            {
 +                recv_i = comm->buf_int2;
 +            }
 +            dd_sendrecv_int(dd, dim_ind, dddirBackward,
 +                            comm->buf_int, nsend,
 +                            recv_i,        ind->nrecv[nzone]);
 +
 +            /* Make space for cg_cm */
 +            if (pos_cg + ind->nrecv[nzone] > fr->cg_nalloc)
 +            {
 +                dd_realloc_fr_cg(fr,pos_cg + ind->nrecv[nzone]);
 +                cg_cm = fr->cg_cm;
 +            }
 +            /* Communicate cg_cm */
 +            if (cd->bInPlace)
 +            {
 +                recv_vr = cg_cm + pos_cg;
 +            }
 +            else
 +            {
 +                recv_vr = comm->vbuf2.v;
 +            }
 +            dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
 +                             comm->vbuf.v, nsend,
 +                             recv_vr,      ind->nrecv[nzone]);
 +            
 +            /* Make the charge group index */
 +            if (cd->bInPlace)
 +            {
 +                zone = (p == 0 ? 0 : nzone - 1);
 +                while (zone < nzone)
 +                {
 +                    for(cg=0; cg<ind->nrecv[zone]; cg++)
 +                    {
 +                        cg_gl = index_gl[pos_cg];
 +                        fr->cginfo[pos_cg] = ddcginfo(cginfo_mb,cg_gl);
 +                        nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
 +                        cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
 +                        if (bBondComm)
 +                        {
 +                            /* Update the charge group presence,
 +                             * so we can use it in the next pass of the loop.
 +                             */
 +                            comm->bLocalCG[cg_gl] = TRUE;
 +                        }
 +                        pos_cg++;
 +                    }
 +                    if (p == 0)
 +                    {
 +                        comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
 +                    }
 +                    zone++;
 +                    zone_cg_range[nzone+zone] = pos_cg;
 +                }
 +            }
 +            else
 +            {
 +                /* This part of the code is never executed with bBondComm. */
 +                merge_cg_buffers(nzone,cd,p,zone_cg_range,
 +                                 index_gl,recv_i,cg_cm,recv_vr,
 +                                 cgindex,fr->cginfo_mb,fr->cginfo);
 +                pos_cg += ind->nrecv[nzone];
 +            }
 +            nat_tot += ind->nrecv[nzone+1];
 +        }
 +        if (!cd->bInPlace)
 +        {
 +            /* Store the atom block for easy copying of communication buffers */
 +            make_cell2at_index(cd,nzone,zone_cg_range[nzone],cgindex);
 +        }
 +        nzone += nzone;
 +    }
 +    dd->index_gl = index_gl;
 +    dd->cgindex  = cgindex;
 +    
 +    dd->ncg_tot = zone_cg_range[zones->n];
 +    dd->nat_tot = nat_tot;
 +    comm->nat[ddnatHOME] = dd->nat_home;
 +    for(i=ddnatZONE; i<ddnatNR; i++)
 +    {
 +        comm->nat[i] = dd->nat_tot;
 +    }
 +
 +    if (!bBondComm)
 +    {
 +        /* We don't need to update cginfo, since that was alrady done above.
 +         * So we pass NULL for the forcerec.
 +         */
 +        dd_set_cginfo(dd->index_gl,dd->ncg_home,dd->ncg_tot,
 +                      NULL,comm->bLocalCG);
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"Finished setting up DD communication, zones:");
 +        for(c=0; c<zones->n; c++)
 +        {
 +            fprintf(debug," %d",zones->cg_range[c+1]-zones->cg_range[c]);
 +        }
 +        fprintf(debug,"\n");
 +    }
 +}
 +
 +static void set_cg_boundaries(gmx_domdec_zones_t *zones)
 +{
 +    int c;
 +    
 +    for(c=0; c<zones->nizone; c++)
 +    {
 +        zones->izone[c].cg1  = zones->cg_range[c+1];
 +        zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
 +        zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
 +    }
 +}
 +
 +static int comp_cgsort(const void *a,const void *b)
 +{
 +    int comp;
 +    
 +    gmx_cgsort_t *cga,*cgb;
 +    cga = (gmx_cgsort_t *)a;
 +    cgb = (gmx_cgsort_t *)b;
 +    
 +    comp = cga->nsc - cgb->nsc;
 +    if (comp == 0)
 +    {
 +        comp = cga->ind_gl - cgb->ind_gl;
 +    }
 +    
 +    return comp;
 +}
 +
 +static void order_int_cg(int n,gmx_cgsort_t *sort,
 +                         int *a,int *buf)
 +{
 +    int i;
 +    
 +    /* Order the data */
 +    for(i=0; i<n; i++)
 +    {
 +        buf[i] = a[sort[i].ind];
 +    }
 +    
 +    /* Copy back to the original array */
 +    for(i=0; i<n; i++)
 +    {
 +        a[i] = buf[i];
 +    }
 +}
 +
 +static void order_vec_cg(int n,gmx_cgsort_t *sort,
 +                         rvec *v,rvec *buf)
 +{
 +    int i;
 +    
 +    /* Order the data */
 +    for(i=0; i<n; i++)
 +    {
 +        copy_rvec(v[sort[i].ind],buf[i]);
 +    }
 +    
 +    /* Copy back to the original array */
 +    for(i=0; i<n; i++)
 +    {
 +        copy_rvec(buf[i],v[i]);
 +    }
 +}
 +
 +static void order_vec_atom(int ncg,int *cgindex,gmx_cgsort_t *sort,
 +                           rvec *v,rvec *buf)
 +{
 +    int a,atot,cg,cg0,cg1,i;
 +    
 +    /* Order the data */
 +    a = 0;
 +    for(cg=0; cg<ncg; cg++)
 +    {
 +        cg0 = cgindex[sort[cg].ind];
 +        cg1 = cgindex[sort[cg].ind+1];
 +        for(i=cg0; i<cg1; i++)
 +        {
 +            copy_rvec(v[i],buf[a]);
 +            a++;
 +        }
 +    }
 +    atot = a;
 +    
 +    /* Copy back to the original array */
 +    for(a=0; a<atot; a++)
 +    {
 +        copy_rvec(buf[a],v[a]);
 +    }
 +}
 +
 +static void ordered_sort(int nsort2,gmx_cgsort_t *sort2,
 +                         int nsort_new,gmx_cgsort_t *sort_new,
 +                         gmx_cgsort_t *sort1)
 +{
 +    int i1,i2,i_new;
 +    
 +    /* The new indices are not very ordered, so we qsort them */
 +    qsort_threadsafe(sort_new,nsort_new,sizeof(sort_new[0]),comp_cgsort);
 +    
 +    /* sort2 is already ordered, so now we can merge the two arrays */
 +    i1 = 0;
 +    i2 = 0;
 +    i_new = 0;
 +    while(i2 < nsort2 || i_new < nsort_new)
 +    {
 +        if (i2 == nsort2)
 +        {
 +            sort1[i1++] = sort_new[i_new++];
 +        }
 +        else if (i_new == nsort_new)
 +        {
 +            sort1[i1++] = sort2[i2++];
 +        }
 +        else if (sort2[i2].nsc < sort_new[i_new].nsc ||
 +                 (sort2[i2].nsc == sort_new[i_new].nsc &&
 +                  sort2[i2].ind_gl < sort_new[i_new].ind_gl))
 +        {
 +            sort1[i1++] = sort2[i2++];
 +        }
 +        else
 +        {
 +            sort1[i1++] = sort_new[i_new++];
 +        }
 +    }
 +}
 +
 +static void dd_sort_state(gmx_domdec_t *dd,int ePBC,
 +                          rvec *cgcm,t_forcerec *fr,t_state *state,
 +                          int ncg_home_old)
 +{
 +    gmx_domdec_sort_t *sort;
 +    gmx_cgsort_t *cgsort,*sort_i;
 +    int  ncg_new,nsort2,nsort_new,i,cell_index,*ibuf,cgsize;
 +    rvec *vbuf;
 +    
 +    sort = dd->comm->sort;
 +    
 +    if (dd->ncg_home > sort->sort_nalloc)
 +    {
 +        sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
 +        srenew(sort->sort1,sort->sort_nalloc);
 +        srenew(sort->sort2,sort->sort_nalloc);
 +    }
 +    
 +    if (ncg_home_old >= 0)
 +    {
 +        /* The charge groups that remained in the same ns grid cell
 +         * are completely ordered. So we can sort efficiently by sorting
 +         * the charge groups that did move into the stationary list.
 +         */
 +        ncg_new = 0;
 +        nsort2 = 0;
 +        nsort_new = 0;
 +        for(i=0; i<dd->ncg_home; i++)
 +        {
 +            /* Check if this cg did not move to another node */
 +            cell_index = fr->ns.grid->cell_index[i];
 +            if (cell_index !=  4*fr->ns.grid->ncells)
 +            {
 +                if (i >= ncg_home_old || cell_index != sort->sort1[i].nsc)
 +                {
 +                    /* This cg is new on this node or moved ns grid cell */
 +                    if (nsort_new >= sort->sort_new_nalloc)
 +                    {
 +                        sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
 +                        srenew(sort->sort_new,sort->sort_new_nalloc);
 +                    }
 +                    sort_i = &(sort->sort_new[nsort_new++]);
 +                }
 +                else
 +                {
 +                    /* This cg did not move */
 +                    sort_i = &(sort->sort2[nsort2++]);
 +                }
 +                /* Sort on the ns grid cell indices
 +                 * and the global topology index
 +                 */
 +                sort_i->nsc    = cell_index;
 +                sort_i->ind_gl = dd->index_gl[i];
 +                sort_i->ind    = i;
 +                ncg_new++;
 +            }
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug,"ordered sort cgs: stationary %d moved %d\n",
 +                    nsort2,nsort_new);
 +        }
 +        /* Sort efficiently */
 +        ordered_sort(nsort2,sort->sort2,nsort_new,sort->sort_new,sort->sort1);
 +    }
 +    else
 +    {
 +        cgsort = sort->sort1;
 +        ncg_new = 0;
 +        for(i=0; i<dd->ncg_home; i++)
 +        {
 +            /* Sort on the ns grid cell indices
 +             * and the global topology index
 +             */
 +            cgsort[i].nsc    = fr->ns.grid->cell_index[i];
 +            cgsort[i].ind_gl = dd->index_gl[i];
 +            cgsort[i].ind    = i;
 +            if (cgsort[i].nsc != 4*fr->ns.grid->ncells)
 +            {
 +                ncg_new++;
 +            }
 +        }
 +        if (debug)
 +        {
 +            fprintf(debug,"qsort cgs: %d new home %d\n",dd->ncg_home,ncg_new);
 +        }
 +        /* Determine the order of the charge groups using qsort */
 +        qsort_threadsafe(cgsort,dd->ncg_home,sizeof(cgsort[0]),comp_cgsort);
 +    }
 +    cgsort = sort->sort1;
 +    
 +    /* We alloc with the old size, since cgindex is still old */
 +    vec_rvec_check_alloc(&dd->comm->vbuf,dd->cgindex[dd->ncg_home]);
 +    vbuf = dd->comm->vbuf.v;
 +    
 +    /* Remove the charge groups which are no longer at home here */
 +    dd->ncg_home = ncg_new;
 +    
 +    /* Reorder the state */
 +    for(i=0; i<estNR; i++)
 +    {
 +        if (EST_DISTR(i) && (state->flags & (1<<i)))
 +        {
 +            switch (i)
 +            {
 +            case estX:
 +                order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->x,vbuf);
 +                break;
 +            case estV:
 +                order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->v,vbuf);
 +                break;
 +            case estSDX:
 +                order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->sd_X,vbuf);
 +                break;
 +            case estCGP:
 +                order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->cg_p,vbuf);
 +                break;
 +            case estLD_RNG:
 +            case estLD_RNGI:
 +            case estDISRE_INITF:
 +            case estDISRE_RM3TAV:
 +            case estORIRE_INITF:
 +            case estORIRE_DTAV:
 +                /* No ordering required */
 +                break;
 +            default:
 +                gmx_incons("Unknown state entry encountered in dd_sort_state");
 +                break;
 +            }
 +        }
 +    }
 +    /* Reorder cgcm */
 +    order_vec_cg(dd->ncg_home,cgsort,cgcm,vbuf);
 +    
 +    if (dd->ncg_home+1 > sort->ibuf_nalloc)
 +    {
 +        sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
 +        srenew(sort->ibuf,sort->ibuf_nalloc);
 +    }
 +    ibuf = sort->ibuf;
 +    /* Reorder the global cg index */
 +    order_int_cg(dd->ncg_home,cgsort,dd->index_gl,ibuf);
 +    /* Reorder the cginfo */
 +    order_int_cg(dd->ncg_home,cgsort,fr->cginfo,ibuf);
 +    /* Rebuild the local cg index */
 +    ibuf[0] = 0;
 +    for(i=0; i<dd->ncg_home; i++)
 +    {
 +        cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
 +        ibuf[i+1] = ibuf[i] + cgsize;
 +    }
 +    for(i=0; i<dd->ncg_home+1; i++)
 +    {
 +        dd->cgindex[i] = ibuf[i];
 +    }
 +    /* Set the home atom number */
 +    dd->nat_home = dd->cgindex[dd->ncg_home];
 +    
 +    /* Copy the sorted ns cell indices back to the ns grid struct */
 +    for(i=0; i<dd->ncg_home; i++)
 +    {
 +        fr->ns.grid->cell_index[i] = cgsort[i].nsc;
 +    }
 +    fr->ns.grid->nr = dd->ncg_home;
 +}
 +
 +static void add_dd_statistics(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int ddnat;
 +    
 +    comm = dd->comm;
 +    
 +    for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
 +    {
 +        comm->sum_nat[ddnat-ddnatZONE] +=
 +            comm->nat[ddnat] - comm->nat[ddnat-1];
 +    }
 +    comm->ndecomp++;
 +}
 +
 +void reset_dd_statistics_counters(gmx_domdec_t *dd)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int ddnat;
 +    
 +    comm = dd->comm;
 +
 +    /* Reset all the statistics and counters for total run counting */
 +    for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
 +    {
 +        comm->sum_nat[ddnat-ddnatZONE] = 0;
 +    }
 +    comm->ndecomp = 0;
 +    comm->nload = 0;
 +    comm->load_step = 0;
 +    comm->load_sum = 0;
 +    comm->load_max = 0;
 +    clear_ivec(comm->load_lim);
 +    comm->load_mdf = 0;
 +    comm->load_pme = 0;
 +}
 +
 +void print_dd_statistics(t_commrec *cr,t_inputrec *ir,FILE *fplog)
 +{
 +    gmx_domdec_comm_t *comm;
 +    int ddnat;
 +    double av;
 +   
 +    comm = cr->dd->comm;
 +    
 +    gmx_sumd(ddnatNR-ddnatZONE,comm->sum_nat,cr);
 +    
 +    if (fplog == NULL)
 +    {
 +        return;
 +    }
 +    
 +    fprintf(fplog,"\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
 +            
 +    for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
 +    {
 +        av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
 +        switch(ddnat)
 +        {
 +        case ddnatZONE:
 +            fprintf(fplog,
 +                    " av. #atoms communicated per step for force:  %d x %.1f\n",
 +                    2,av);
 +            break;
 +        case ddnatVSITE:
 +            if (cr->dd->vsite_comm)
 +            {
 +                fprintf(fplog,
 +                        " av. #atoms communicated per step for vsites: %d x %.1f\n",
 +                        (EEL_PME(ir->coulombtype) || ir->coulombtype==eelEWALD) ? 3 : 2,
 +                        av);
 +            }
 +            break;
 +        case ddnatCON:
 +            if (cr->dd->constraint_comm)
 +            {
 +                fprintf(fplog,
 +                        " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
 +                        1 + ir->nLincsIter,av);
 +            }
 +            break;
 +        default:
 +            gmx_incons(" Unknown type for DD statistics");
 +        }
 +    }
 +    fprintf(fplog,"\n");
 +    
 +    if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
 +    {
 +        print_dd_load_av(fplog,cr->dd);
 +    }
 +}
 +
 +void dd_partition_system(FILE            *fplog,
 +                         gmx_large_int_t      step,
 +                         t_commrec       *cr,
 +                         gmx_bool            bMasterState,
 +                         int             nstglobalcomm,
 +                         t_state         *state_global,
 +                         gmx_mtop_t      *top_global,
 +                         t_inputrec      *ir,
 +                         t_state         *state_local,
 +                         rvec            **f,
 +                         t_mdatoms       *mdatoms,
 +                         gmx_localtop_t  *top_local,
 +                         t_forcerec      *fr,
 +                         gmx_vsite_t     *vsite,
 +                         gmx_shellfc_t   shellfc,
 +                         gmx_constr_t    constr,
 +                         t_nrnb          *nrnb,
 +                         gmx_wallcycle_t wcycle,
 +                         gmx_bool            bVerbose)
 +{
 +    gmx_domdec_t *dd;
 +    gmx_domdec_comm_t *comm;
 +    gmx_ddbox_t ddbox={0};
 +    t_block *cgs_gl;
 +    gmx_large_int_t step_pcoupl;
 +    rvec cell_ns_x0,cell_ns_x1;
 +    int  i,j,n,cg0=0,ncg_home_old=-1,nat_f_novirsum;
 +    gmx_bool bBoxChanged,bNStGlobalComm,bDoDLB,bCheckDLB,bTurnOnDLB,bLogLoad;
 +    gmx_bool bRedist,bSortCG,bResortAll;
 +    ivec ncells_old,np;
 +    real grid_density;
 +    char sbuf[22];
 +      
 +    dd = cr->dd;
 +    comm = dd->comm;
 +
 +    bBoxChanged = (bMasterState || DEFORM(*ir));
 +    if (ir->epc != epcNO)
 +    {
 +        /* With nstpcouple > 1 pressure coupling happens.
 +         * one step after calculating the pressure.
 +         * Box scaling happens at the end of the MD step,
 +         * after the DD partitioning.
 +         * We therefore have to do DLB in the first partitioning
 +         * after an MD step where P-coupling occured.
 +         * We need to determine the last step in which p-coupling occurred.
 +         * MRS -- need to validate this for vv?
 +         */
 +        n = ir->nstpcouple;
 +        if (n == 1)
 +        {
 +            step_pcoupl = step - 1;
 +        }
 +        else
 +        {
 +            step_pcoupl = ((step - 1)/n)*n + 1;
 +        }
 +        if (step_pcoupl >= comm->globalcomm_step)
 +        {
 +            bBoxChanged = TRUE;
 +        }
 +    }
 +
 +    bNStGlobalComm = (step >= comm->globalcomm_step + nstglobalcomm);
 +
 +    if (!comm->bDynLoadBal)
 +    {
 +        bDoDLB = FALSE;
 +    }
 +    else
 +    {
 +        /* Should we do dynamic load balacing this step?
 +         * Since it requires (possibly expensive) global communication,
 +         * we might want to do DLB less frequently.
 +         */
 +        if (bBoxChanged || ir->epc != epcNO)
 +        {
 +            bDoDLB = bBoxChanged;
 +        }
 +        else
 +        {
 +            bDoDLB = bNStGlobalComm;
 +        }
 +    }
 +
 +    /* Check if we have recorded loads on the nodes */
 +    if (comm->bRecordLoad && dd_load_count(comm))
 +    {
 +        if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
 +        {
 +            /* Check if we should use DLB at the second partitioning
 +             * and every 100 partitionings,
 +             * so the extra communication cost is negligible.
 +             */
 +            n = max(100,nstglobalcomm);
 +            bCheckDLB = (comm->n_load_collect == 0 ||
 +                         comm->n_load_have % n == n-1);
 +        }
 +        else
 +        {
 +            bCheckDLB = FALSE;
 +        }
 +        
 +        /* Print load every nstlog, first and last step to the log file */
 +        bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
 +                    comm->n_load_collect == 0 ||
 +                    (ir->nsteps >= 0 &&
 +                     (step + ir->nstlist > ir->init_step + ir->nsteps)));
 +
 +        /* Avoid extra communication due to verbose screen output
 +         * when nstglobalcomm is set.
 +         */
 +        if (bDoDLB || bLogLoad || bCheckDLB ||
 +            (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
 +        {
 +            get_load_distribution(dd,wcycle);
 +            if (DDMASTER(dd))
 +            {
 +                if (bLogLoad)
 +                {
 +                    dd_print_load(fplog,dd,step-1);
 +                }
 +                if (bVerbose)
 +                {
 +                    dd_print_load_verbose(dd);
 +                }
 +            }
 +            comm->n_load_collect++;
 +
 +            if (bCheckDLB) {
 +                /* Since the timings are node dependent, the master decides */
 +                if (DDMASTER(dd))
 +                {
 +                    bTurnOnDLB =
 +                        (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
 +                    if (debug)
 +                    {
 +                        fprintf(debug,"step %s, imb loss %f\n",
 +                                gmx_step_str(step,sbuf),
 +                                dd_force_imb_perf_loss(dd));
 +                    }
 +                }
 +                dd_bcast(dd,sizeof(bTurnOnDLB),&bTurnOnDLB);
 +                if (bTurnOnDLB)
 +                {
 +                    turn_on_dlb(fplog,cr,step);
 +                    bDoDLB = TRUE;
 +                }
 +            }
 +        }
 +        comm->n_load_have++;
 +    }
 +
 +    cgs_gl = &comm->cgs_gl;
 +
 +    bRedist = FALSE;
 +    if (bMasterState)
 +    {
 +        /* Clear the old state */
 +        clear_dd_indices(dd,0,0);
 +
 +        set_ddbox(dd,bMasterState,cr,ir,state_global->box,
 +                  TRUE,cgs_gl,state_global->x,&ddbox);
 +    
 +        get_cg_distribution(fplog,step,dd,cgs_gl,
 +                            state_global->box,&ddbox,state_global->x);
 +        
 +        dd_distribute_state(dd,cgs_gl,
 +                            state_global,state_local,f);
 +        
 +        dd_make_local_cgs(dd,&top_local->cgs);
 +        
 +        if (dd->ncg_home > fr->cg_nalloc)
 +        {
 +            dd_realloc_fr_cg(fr,dd->ncg_home);
 +        }
 +        calc_cgcm(fplog,0,dd->ncg_home,
 +                  &top_local->cgs,state_local->x,fr->cg_cm);
 +        
 +        inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
 +        
 +        dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
 +
 +        cg0 = 0;
 +    }
 +    else if (state_local->ddp_count != dd->ddp_count)
 +    {
 +        if (state_local->ddp_count > dd->ddp_count)
 +        {
 +            gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)",state_local->ddp_count,dd->ddp_count);
 +        }
 +        
 +        if (state_local->ddp_count_cg_gl != state_local->ddp_count)
 +        {
 +            gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)",state_local->ddp_count_cg_gl,state_local->ddp_count);
 +        }
 +        
 +        /* Clear the old state */
 +        clear_dd_indices(dd,0,0);
 +        
 +        /* Build the new indices */
 +        rebuild_cgindex(dd,cgs_gl->index,state_local);
 +        make_dd_indices(dd,cgs_gl->index,0);
 +        
 +        /* Redetermine the cg COMs */
 +        calc_cgcm(fplog,0,dd->ncg_home,
 +                  &top_local->cgs,state_local->x,fr->cg_cm);
 +        
 +        inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
 +
 +        dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
 +
 +        set_ddbox(dd,bMasterState,cr,ir,state_local->box,
 +                  TRUE,&top_local->cgs,state_local->x,&ddbox);
 +
 +        bRedist = comm->bDynLoadBal;
 +    }
 +    else
 +    {
 +        /* We have the full state, only redistribute the cgs */
 +
 +        /* Clear the non-home indices */
 +        clear_dd_indices(dd,dd->ncg_home,dd->nat_home);
 +
 +        /* Avoid global communication for dim's without pbc and -gcom */
 +        if (!bNStGlobalComm)
 +        {
 +            copy_rvec(comm->box0    ,ddbox.box0    );
 +            copy_rvec(comm->box_size,ddbox.box_size);
 +        }
 +        set_ddbox(dd,bMasterState,cr,ir,state_local->box,
 +                  bNStGlobalComm,&top_local->cgs,state_local->x,&ddbox);
 +
 +        bBoxChanged = TRUE;
 +        bRedist = TRUE;
 +    }
 +    /* For dim's without pbc and -gcom */
 +    copy_rvec(ddbox.box0    ,comm->box0    );
 +    copy_rvec(ddbox.box_size,comm->box_size);
 +    
 +    set_dd_cell_sizes(dd,&ddbox,dynamic_dd_box(&ddbox,ir),bMasterState,bDoDLB,
 +                      step,wcycle);
 +    
 +    if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
 +    {
 +        write_dd_grid_pdb("dd_grid",step,dd,state_local->box,&ddbox);
 +    }
 +    
 +    /* Check if we should sort the charge groups */
 +    if (comm->nstSortCG > 0)
 +    {
 +        bSortCG = (bMasterState ||
 +                   (bRedist && (step % comm->nstSortCG == 0)));
 +    }
 +    else
 +    {
 +        bSortCG = FALSE;
 +    }
 +
 +    ncg_home_old = dd->ncg_home;
 +
 +    if (bRedist)
 +    {
 +        cg0 = dd_redistribute_cg(fplog,step,dd,ddbox.tric_dir,
 +                                 state_local,f,fr,mdatoms,
 +                                 !bSortCG,nrnb);
 +    }
 +    
 +    get_nsgrid_boundaries(fr->ns.grid,dd,
 +                          state_local->box,&ddbox,&comm->cell_x0,&comm->cell_x1,
 +                          dd->ncg_home,fr->cg_cm,
 +                          cell_ns_x0,cell_ns_x1,&grid_density);
 +
 +    if (bBoxChanged)
 +    {
 +        comm_dd_ns_cell_sizes(dd,&ddbox,cell_ns_x0,cell_ns_x1,step);
 +    }
 +
 +    copy_ivec(fr->ns.grid->n,ncells_old);
 +    grid_first(fplog,fr->ns.grid,dd,&ddbox,fr->ePBC,
 +               state_local->box,cell_ns_x0,cell_ns_x1,
 +               fr->rlistlong,grid_density);
 +    /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
 +    copy_ivec(ddbox.tric_dir,comm->tric_dir);
 +
 +    if (bSortCG)
 +    {
 +        /* Sort the state on charge group position.
 +         * This enables exact restarts from this step.
 +         * It also improves performance by about 15% with larger numbers
 +         * of atoms per node.
 +         */
 +        
 +        /* Fill the ns grid with the home cell,
 +         * so we can sort with the indices.
 +         */
 +        set_zones_ncg_home(dd);
 +        fill_grid(fplog,&comm->zones,fr->ns.grid,dd->ncg_home,
 +                  0,dd->ncg_home,fr->cg_cm);
 +        
 +        /* Check if we can user the old order and ns grid cell indices
 +         * of the charge groups to sort the charge groups efficiently.
 +         */
 +        bResortAll = (bMasterState ||
 +                      fr->ns.grid->n[XX] != ncells_old[XX] ||
 +                      fr->ns.grid->n[YY] != ncells_old[YY] ||
 +                      fr->ns.grid->n[ZZ] != ncells_old[ZZ]);
 +
 +        if (debug)
 +        {
 +            fprintf(debug,"Step %s, sorting the %d home charge groups\n",
 +                    gmx_step_str(step,sbuf),dd->ncg_home);
 +        }
 +        dd_sort_state(dd,ir->ePBC,fr->cg_cm,fr,state_local,
 +                      bResortAll ? -1 : ncg_home_old);
 +        /* Rebuild all the indices */
 +        cg0 = 0;
 +        ga2la_clear(dd->ga2la);
 +    }
 +    
 +    /* Setup up the communication and communicate the coordinates */
 +    setup_dd_communication(dd,state_local->box,&ddbox,fr);
 +    
 +    /* Set the indices */
 +    make_dd_indices(dd,cgs_gl->index,cg0);
 +
 +    /* Set the charge group boundaries for neighbor searching */
 +    set_cg_boundaries(&comm->zones);
 +    
 +    /*
 +    write_dd_pdb("dd_home",step,"dump",top_global,cr,
 +                 -1,state_local->x,state_local->box);
 +    */
 +    
 +    /* Extract a local topology from the global topology */
 +    for(i=0; i<dd->ndim; i++)
 +    {
 +        np[dd->dim[i]] = comm->cd[i].np;
 +    }
 +    dd_make_local_top(fplog,dd,&comm->zones,dd->npbcdim,state_local->box,
 +                      comm->cellsize_min,np,
 +                      fr,vsite,top_global,top_local);
 +    
 +    /* Set up the special atom communication */
 +    n = comm->nat[ddnatZONE];
 +    for(i=ddnatZONE+1; i<ddnatNR; i++)
 +    {
 +        switch(i)
 +        {
 +        case ddnatVSITE:
 +            if (vsite && vsite->n_intercg_vsite)
 +            {
 +                n = dd_make_local_vsites(dd,n,top_local->idef.il);
 +            }
 +            break;
 +        case ddnatCON:
 +            if (dd->bInterCGcons)
 +            {
 +                /* Only for inter-cg constraints we need special code */
 +                n = dd_make_local_constraints(dd,n,top_global,
 +                                              constr,ir->nProjOrder,
 +                                              &top_local->idef.il[F_CONSTR]);
 +            }
 +            break;
 +        default:
 +            gmx_incons("Unknown special atom type setup");
 +        }
 +        comm->nat[i] = n;
 +    }
 +    
 +    /* Make space for the extra coordinates for virtual site
 +     * or constraint communication.
 +     */
 +    state_local->natoms = comm->nat[ddnatNR-1];
 +    if (state_local->natoms > state_local->nalloc)
 +    {
 +        dd_realloc_state(state_local,f,state_local->natoms);
 +    }
 +
 +    if (fr->bF_NoVirSum)
 +    {
 +        if (vsite && vsite->n_intercg_vsite)
 +        {
 +            nat_f_novirsum = comm->nat[ddnatVSITE];
 +        }
 +        else
 +        {
 +            if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
 +            {
 +                nat_f_novirsum = dd->nat_tot;
 +            }
 +            else
 +            {
 +                nat_f_novirsum = dd->nat_home;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        nat_f_novirsum = 0;
 +    }
 +
 +    /* Set the number of atoms required for the force calculation.
 +     * Forces need to be constrained when using a twin-range setup
 +     * or with energy minimization. For simple simulations we could
 +     * avoid some allocation, zeroing and copying, but this is
 +     * probably not worth the complications ande checking.
 +     */
 +    forcerec_set_ranges(fr,dd->ncg_home,dd->ncg_tot,
 +                        dd->nat_tot,comm->nat[ddnatCON],nat_f_novirsum);
 +
 +    /* We make the all mdatoms up to nat_tot_con.
 +     * We could save some work by only setting invmass
 +     * between nat_tot and nat_tot_con.
 +     */
 +    /* This call also sets the new number of home particles to dd->nat_home */
 +    atoms2md(top_global,ir,
 +             comm->nat[ddnatCON],dd->gatindex,0,dd->nat_home,mdatoms);
 +
 +    /* Now we have the charges we can sort the FE interactions */
 +    dd_sort_local_top(dd,mdatoms,top_local);
 +
 +    if (shellfc)
 +    {
 +        /* Make the local shell stuff, currently no communication is done */
 +        make_local_shells(cr,mdatoms,shellfc);
 +    }
 +    
 +      if (ir->implicit_solvent)
 +    {
 +        make_local_gb(cr,fr->born,ir->gb_algorithm);
 +    }
 +      
 +    if (!(cr->duty & DUTY_PME))
 +    {
 +        /* Send the charges to our PME only node */
 +        gmx_pme_send_q(cr,mdatoms->nChargePerturbed,
 +                       mdatoms->chargeA,mdatoms->chargeB,
 +                       dd_pme_maxshift_x(dd),dd_pme_maxshift_y(dd));
 +    }
 +    
 +    if (constr)
 +    {
 +        set_constraints(constr,top_local,ir,mdatoms,cr);
 +    }
 +    
 +    if (ir->ePull != epullNO)
 +    {
 +        /* Update the local pull groups */
 +        dd_make_local_pull_groups(dd,ir->pull,mdatoms);
 +    }
 +    
 +    if (ir->bRot)
 +    {
 +        /* Update the local rotation groups */
 +        dd_make_local_rotation_groups(dd,ir->rot);
 +    }
 +
 +
 +    add_dd_statistics(dd);
 +    
 +    /* Make sure we only count the cycles for this DD partitioning */
 +    clear_dd_cycle_counts(dd);
 +    
 +    /* Because the order of the atoms might have changed since
 +     * the last vsite construction, we need to communicate the constructing
 +     * atom coordinates again (for spreading the forces this MD step).
 +     */
 +    dd_move_x_vsites(dd,state_local->box,state_local->x);
 +    
 +    if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
 +    {
 +        dd_move_x(dd,state_local->box,state_local->x);
 +        write_dd_pdb("dd_dump",step,"dump",top_global,cr,
 +                     -1,state_local->x,state_local->box);
 +    }
 +
 +    if (bNStGlobalComm)
 +    {
 +        /* Store the global communication step */
 +        comm->globalcomm_step = step;
 +    }
 +    
 +    /* Increase the DD partitioning counter */
 +    dd->ddp_count++;
 +    /* The state currently matches this DD partitioning count, store it */
 +    state_local->ddp_count = dd->ddp_count;
 +    if (bMasterState)
 +    {
 +        /* The DD master node knows the complete cg distribution,
 +         * store the count so we can possibly skip the cg info communication.
 +         */
 +        comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
 +    }
 +
 +    if (comm->DD_debug > 0)
 +    {
 +        /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
 +        check_index_consistency(dd,top_global->natoms,ncg_mtop(top_global),
 +                                "after partitioning");
 +    }
 +}
Simple merge
Simple merge
index de28448a4583eee930ea97dd79da5d226423cd65,0000000000000000000000000000000000000000..fc73494ad0d2b5f3150cbcc62cc727a7218c8160
mode 100644,000000..100644
--- /dev/null
@@@ -1,739 -1,0 +1,727 @@@
- #include "pppm.h"
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <math.h>
 +#include <string.h>
 +#include "sysstuff.h"
 +#include "typedefs.h"
 +#include "macros.h"
 +#include "smalloc.h"
 +#include "macros.h"
 +#include "physics.h"
 +#include "force.h"
 +#include "nonbonded.h"
 +#include "names.h"
 +#include "network.h"
 +#include "pbc.h"
 +#include "ns.h"
 +#include "nrnb.h"
 +#include "bondf.h"
 +#include "mshift.h"
 +#include "txtdump.h"
 +#include "coulomb.h"
-         else
-         {
-             Vcorr = shift_LRcorrection(fplog,md->start,md->homenr,cr,fr,
-                                        md->chargeA,excl,x,TRUE,box,
-                                        fr->vir_el_recip);
-         }
 +#include "pme.h"
 +#include "mdrun.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "qmmm.h"
 +
 +
 +void ns(FILE *fp,
 +        t_forcerec *fr,
 +        rvec       x[],
 +        matrix     box,
 +        gmx_groups_t *groups,
 +        t_grpopts  *opts,
 +        gmx_localtop_t *top,
 +        t_mdatoms  *md,
 +        t_commrec  *cr,
 +        t_nrnb     *nrnb,
 +        real       lambda,
 +        real       *dvdlambda,
 +        gmx_grppairener_t *grppener,
 +        gmx_bool       bFillGrid,
 +        gmx_bool       bDoLongRange,
 +        gmx_bool       bDoForces,
 +        rvec       *f)
 +{
 +  char   *ptr;
 +  int    nsearch;
 +
 +
 +  if (!fr->ns.nblist_initialized)
 +  {
 +      init_neighbor_list(fp, fr, md->homenr);
 +  }
 +    
 +  if (fr->bTwinRange) 
 +    fr->nlr=0;
 +
 +    nsearch = search_neighbours(fp,fr,x,box,top,groups,cr,nrnb,md,
 +                                lambda,dvdlambda,grppener,
 +                                bFillGrid,bDoLongRange,
 +                                bDoForces,f);
 +  if (debug)
 +    fprintf(debug,"nsearch = %d\n",nsearch);
 +    
 +  /* Check whether we have to do dynamic load balancing */
 +  /*if ((nsb->nstDlb > 0) && (mod(step,nsb->nstDlb) == 0))
 +    count_nb(cr,nsb,&(top->blocks[ebCGS]),nns,fr->nlr,
 +    &(top->idef),opts->ngener);
 +  */
 +  if (fr->ns.dump_nl > 0)
 +    dump_nblist(fp,cr,fr,fr->ns.dump_nl);
 +}
 +
 +void do_force_lowlevel(FILE       *fplog,   gmx_large_int_t step,
 +                       t_forcerec *fr,      t_inputrec *ir,
 +                       t_idef     *idef,    t_commrec  *cr,
 +                       t_nrnb     *nrnb,    gmx_wallcycle_t wcycle,
 +                       t_mdatoms  *md,
 +                       t_grpopts  *opts,
 +                       rvec       x[],      history_t  *hist,
 +                       rvec       f[],
 +                       gmx_enerdata_t *enerd,
 +                       t_fcdata   *fcd,
 +                       gmx_mtop_t     *mtop,
 +                       gmx_localtop_t *top,
 +                       gmx_genborn_t *born,
 +                       t_atomtypes *atype,
 +                       gmx_bool       bBornRadii,
 +                       matrix     box,
 +                       real       lambda,  
 +                       t_graph    *graph,
 +                       t_blocka   *excl,    
 +                       rvec       mu_tot[],
 +                       int        flags,
 +                       float      *cycles_pme)
 +{
 +    int     i,status;
 +    int     donb_flags;
 +    gmx_bool    bDoEpot,bSepDVDL,bSB;
 +    int     pme_flags;
 +    matrix  boxs;
 +    rvec    box_size;
 +    real    dvdlambda,Vsr,Vlr,Vcorr=0,vdip,vcharge;
 +    t_pbc   pbc;
 +    real    dvdgb;
 +    char    buf[22];
 +    gmx_enerdata_t ed_lam;
 +    double  lam_i;
 +    real    dvdl_dum;
 +
 +#ifdef GMX_MPI
 +    double  t0=0.0,t1,t2,t3; /* time measurement for coarse load balancing */
 +#endif
 +    
 +#define PRINT_SEPDVDL(s,v,dvdl) if (bSepDVDL) fprintf(fplog,sepdvdlformat,s,v,dvdl);
 +    
 +
 +    set_pbc(&pbc,fr->ePBC,box);
 +    
 +    /* Reset box */
 +    for(i=0; (i<DIM); i++)
 +    {
 +        box_size[i]=box[i][i];
 +    }
 +    
 +    bSepDVDL=(fr->bSepDVDL && do_per_step(step,ir->nstlog));
 +    debug_gmx();
 +    
 +    /* do QMMM first if requested */
 +    if(fr->bQMMM)
 +    {
 +        enerd->term[F_EQM] = calculate_QMMM(cr,x,f,fr,md);
 +    }
 +    
 +    if (bSepDVDL)
 +    {
 +        fprintf(fplog,"Step %s: non-bonded V and dVdl for node %d:\n",
 +                gmx_step_str(step,buf),cr->nodeid);
 +    }
 +    
 +    /* Call the short range functions all in one go. */
 +    
 +    dvdlambda = 0;
 +    
 +#ifdef GMX_MPI
 +    /*#define TAKETIME ((cr->npmenodes) && (fr->timesteps < 12))*/
 +#define TAKETIME FALSE
 +    if (TAKETIME)
 +    {
 +        MPI_Barrier(cr->mpi_comm_mygroup);
 +        t0=MPI_Wtime();
 +    }
 +#endif
 +    
 +    if (ir->nwall)
 +    {
 +        dvdlambda = do_walls(ir,fr,box,md,x,f,lambda,
 +                             enerd->grpp.ener[egLJSR],nrnb);
 +        PRINT_SEPDVDL("Walls",0.0,dvdlambda);
 +        enerd->dvdl_lin += dvdlambda;
 +    }
 +              
 +      /* If doing GB, reset dvda and calculate the Born radii */
 +      if (ir->implicit_solvent)
 +      {
 +              /* wallcycle_start(wcycle,ewcGB); */
 +              
 +              for(i=0;i<born->nr;i++)
 +              {
 +                      fr->dvda[i]=0;
 +              }
 +              
 +              if(bBornRadii)
 +              {
 +                      calc_gb_rad(cr,fr,ir,top,atype,x,&(fr->gblist),born,md,nrnb);
 +              }
 +              
 +              /* wallcycle_stop(wcycle, ewcGB); */
 +      }
 +      
 +    where();
 +    donb_flags = 0;
 +    if (flags & GMX_FORCE_FORCES)
 +    {
 +        donb_flags |= GMX_DONB_FORCES;
 +    }
 +    do_nonbonded(cr,fr,x,f,md,excl,
 +                 fr->bBHAM ?
 +                 enerd->grpp.ener[egBHAMSR] :
 +                 enerd->grpp.ener[egLJSR],
 +                 enerd->grpp.ener[egCOULSR],
 +                               enerd->grpp.ener[egGB],box_size,nrnb,
 +                 lambda,&dvdlambda,-1,-1,donb_flags);
 +    /* If we do foreign lambda and we have soft-core interactions
 +     * we have to recalculate the (non-linear) energies contributions.
 +     */
 +    if (ir->n_flambda > 0 && (flags & GMX_FORCE_DHDL) && ir->sc_alpha != 0)
 +    {
 +        init_enerdata(mtop->groups.grps[egcENER].nr,ir->n_flambda,&ed_lam);
 +        
 +        for(i=0; i<enerd->n_lambda; i++)
 +        {
 +            lam_i = (i==0 ? lambda : ir->flambda[i-1]);
 +            dvdl_dum = 0;
 +            reset_enerdata(&ir->opts,fr,TRUE,&ed_lam,FALSE);
 +            do_nonbonded(cr,fr,x,f,md,excl,
 +                         fr->bBHAM ?
 +                         ed_lam.grpp.ener[egBHAMSR] :
 +                         ed_lam.grpp.ener[egLJSR],
 +                         ed_lam.grpp.ener[egCOULSR],
 +                         enerd->grpp.ener[egGB], box_size,nrnb,
 +                         lam_i,&dvdl_dum,-1,-1,
 +                         GMX_DONB_FOREIGNLAMBDA);
 +            sum_epot(&ir->opts,&ed_lam);
 +            enerd->enerpart_lambda[i] += ed_lam.term[F_EPOT];
 +        }
 +        destroy_enerdata(&ed_lam);
 +    }
 +    where();
 +      
 +      /* If we are doing GB, calculate bonded forces and apply corrections 
 +       * to the solvation forces */
 +      if (ir->implicit_solvent)  {
 +              calc_gb_forces(cr,md,born,top,atype,x,f,fr,idef,
 +                       ir->gb_algorithm,ir->sa_algorithm,nrnb,bBornRadii,&pbc,graph,enerd);
 +    }
 +
 +#ifdef GMX_MPI
 +    if (TAKETIME)
 +    {
 +        t1=MPI_Wtime();
 +        fr->t_fnbf += t1-t0;
 +    }
 +#endif
 +    
 +    if (ir->sc_alpha != 0)
 +    {
 +        enerd->dvdl_nonlin += dvdlambda;
 +    }
 +    else
 +    {
 +        enerd->dvdl_lin    += dvdlambda;
 +    }
 +    Vsr = 0;
 +    if (bSepDVDL)
 +    {
 +        for(i=0; i<enerd->grpp.nener; i++)
 +        {
 +            Vsr +=
 +                (fr->bBHAM ?
 +                 enerd->grpp.ener[egBHAMSR][i] :
 +                 enerd->grpp.ener[egLJSR][i])
 +                + enerd->grpp.ener[egCOULSR][i] + enerd->grpp.ener[egGB][i];
 +        }
 +    }
 +    PRINT_SEPDVDL("VdW and Coulomb SR particle-p.",Vsr,dvdlambda);
 +    debug_gmx();
 +    
 +    if (debug)
 +    {
 +        pr_rvecs(debug,0,"fshift after SR",fr->fshift,SHIFTS);
 +    }
 +    
 +    /* Shift the coordinates. Must be done before bonded forces and PPPM, 
 +     * but is also necessary for SHAKE and update, therefore it can NOT 
 +     * go when no bonded forces have to be evaluated.
 +     */
 +    
 +    /* Here sometimes we would not need to shift with NBFonly,
 +     * but we do so anyhow for consistency of the returned coordinates.
 +     */
 +    if (graph)
 +    {
 +        shift_self(graph,box,x);
 +        if (TRICLINIC(box))
 +        {
 +            inc_nrnb(nrnb,eNR_SHIFTX,2*graph->nnodes);
 +        }
 +        else
 +        {
 +            inc_nrnb(nrnb,eNR_SHIFTX,graph->nnodes);
 +        }
 +    }
 +    /* Check whether we need to do bondeds or correct for exclusions */
 +    if (fr->bMolPBC &&
 +        ((flags & GMX_FORCE_BONDED)
 +         || EEL_RF(fr->eeltype) || EEL_FULL(fr->eeltype)))
 +    {
 +        /* Since all atoms are in the rectangular or triclinic unit-cell,
 +         * only single box vector shifts (2 in x) are required.
 +         */
 +        set_pbc_dd(&pbc,fr->ePBC,cr->dd,TRUE,box);
 +    }
 +    debug_gmx();
 +    
 +    if (flags & GMX_FORCE_BONDED)
 +    {
 +        calc_bonds(fplog,cr->ms,
 +                   idef,x,hist,f,fr,&pbc,graph,enerd,nrnb,lambda,md,fcd,
 +                   DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL, atype, born,
 +                   fr->bSepDVDL && do_per_step(step,ir->nstlog),step);
 +        
 +        /* Check if we have to determine energy differences
 +         * at foreign lambda's.
 +         */
 +        if (ir->n_flambda > 0 && (flags & GMX_FORCE_DHDL) &&
 +            idef->ilsort != ilsortNO_FE)
 +        {
 +            if (idef->ilsort != ilsortFE_SORTED)
 +            {
 +                gmx_incons("The bonded interactions are not sorted for free energy");
 +            }
 +            init_enerdata(mtop->groups.grps[egcENER].nr,ir->n_flambda,&ed_lam);
 +            
 +            for(i=0; i<enerd->n_lambda; i++)
 +            {
 +                lam_i = (i==0 ? lambda : ir->flambda[i-1]);
 +                dvdl_dum = 0;
 +                reset_enerdata(&ir->opts,fr,TRUE,&ed_lam,FALSE);
 +                calc_bonds_lambda(fplog,
 +                                  idef,x,fr,&pbc,graph,&ed_lam,nrnb,lam_i,md,
 +                                  fcd,
 +                                  DOMAINDECOMP(cr) ? cr->dd->gatindex : NULL);
 +                sum_epot(&ir->opts,&ed_lam);
 +                enerd->enerpart_lambda[i] += ed_lam.term[F_EPOT];
 +            }
 +            destroy_enerdata(&ed_lam);
 +        }
 +        debug_gmx();
 +    }
 +
 +    where();
 +
 +    *cycles_pme = 0;
 +    if (EEL_FULL(fr->eeltype))
 +    {
 +        bSB = (ir->nwall == 2);
 +        if (bSB)
 +        {
 +            copy_mat(box,boxs);
 +            svmul(ir->wall_ewald_zfac,boxs[ZZ],boxs[ZZ]);
 +            box_size[ZZ] *= ir->wall_ewald_zfac;
 +        }
 +        
 +        clear_mat(fr->vir_el_recip);  
 +        
 +        if (fr->bEwald)
 +        {
 +            if (fr->n_tpi == 0)
 +            {
 +                dvdlambda = 0;
 +                Vcorr = ewald_LRcorrection(fplog,md->start,md->start+md->homenr,
 +                                           cr,fr,
 +                                           md->chargeA,
 +                                           md->nChargePerturbed ? md->chargeB : NULL,
 +                                           excl,x,bSB ? boxs : box,mu_tot,
 +                                           ir->ewald_geometry,
 +                                           ir->epsilon_surface,
 +                                           lambda,&dvdlambda,&vdip,&vcharge);
 +                PRINT_SEPDVDL("Ewald excl./charge/dip. corr.",Vcorr,dvdlambda);
 +                enerd->dvdl_lin += dvdlambda;
 +            }
 +            else
 +            {
 +                if (ir->ewald_geometry != eewg3D || ir->epsilon_surface != 0)
 +                {
 +                    gmx_fatal(FARGS,"TPI with PME currently only works in a 3D geometry with tin-foil boundary conditions");
 +                }
 +                /* The TPI molecule does not have exclusions with the rest
 +                 * of the system and no intra-molecular PME grid contributions
 +                 * will be calculated in gmx_pme_calc_energy.
 +                 */
 +                Vcorr = 0;
 +            }
 +        }
-         case eelPPPM:
-             status = gmx_pppm_do(fplog,fr->pmedata,FALSE,x,fr->f_novirsum,
-                                  md->chargeA,
-                                  box_size,fr->phi,cr,md->start,md->homenr,
-                                  nrnb,ir->pme_order,&Vlr);
-             break;
 +        
 +        dvdlambda = 0;
 +        status = 0;
 +        switch (fr->eeltype)
 +        {
 +        case eelPME:
 +        case eelPMESWITCH:
 +        case eelPMEUSER:
 +        case eelPMEUSERSWITCH:
++        case eelP3M_AD:
 +            if (cr->duty & DUTY_PME)
 +            {
 +                if (fr->n_tpi == 0 || (flags & GMX_FORCE_STATECHANGED))
 +                {
 +                    pme_flags = GMX_PME_SPREAD_Q | GMX_PME_SOLVE;
 +                    if (flags & GMX_FORCE_FORCES)
 +                    {
 +                        pme_flags |= GMX_PME_CALC_F;
 +                    }
 +                    if (flags & GMX_FORCE_VIRIAL)
 +                    {
 +                        pme_flags |= GMX_PME_CALC_ENER_VIR;
 +                    }
 +                    if (fr->n_tpi > 0)
 +                    {
 +                        /* We don't calculate f, but we do want the potential */
 +                        pme_flags |= GMX_PME_CALC_POT;
 +                    }
 +                    wallcycle_start(wcycle,ewcPMEMESH);
 +                    status = gmx_pme_do(fr->pmedata,
 +                                        md->start,md->homenr - fr->n_tpi,
 +                                        x,fr->f_novirsum,
 +                                        md->chargeA,md->chargeB,
 +                                        bSB ? boxs : box,cr,
 +                                        DOMAINDECOMP(cr) ? dd_pme_maxshift_x(cr->dd) : 0,
 +                                        DOMAINDECOMP(cr) ? dd_pme_maxshift_y(cr->dd) : 0,
 +                                        nrnb,wcycle,
 +                                        fr->vir_el_recip,fr->ewaldcoeff,
 +                                        &Vlr,lambda,&dvdlambda,
 +                                        pme_flags);
 +                    *cycles_pme = wallcycle_stop(wcycle,ewcPMEMESH);
 +
 +                    /* We should try to do as little computation after
 +                     * this as possible, because parallel PME synchronizes
 +                     * the nodes, so we want all load imbalance of the rest
 +                     * of the force calculation to be before the PME call.
 +                     * DD load balancing is done on the whole time of
 +                     * the force call (without PME).
 +                     */
 +                }
 +                if (fr->n_tpi > 0)
 +                {
 +                    /* Determine the PME grid energy of the test molecule
 +                     * with the PME grid potential of the other charges.
 +                     */
 +                    gmx_pme_calc_energy(fr->pmedata,fr->n_tpi,
 +                                        x + md->homenr - fr->n_tpi,
 +                                        md->chargeA + md->homenr - fr->n_tpi,
 +                                        &Vlr);
 +                }
 +                PRINT_SEPDVDL("PME mesh",Vlr,dvdlambda);
 +            } 
 +            else
 +            {
 +                /* Energies and virial are obtained later from the PME nodes */
 +                /* but values have to be zeroed out here */
 +                Vlr=0.0;
 +            }
 +            break;
 +        case eelEWALD:
 +            Vlr = do_ewald(fplog,FALSE,ir,x,fr->f_novirsum,
 +                           md->chargeA,md->chargeB,
 +                           box_size,cr,md->homenr,
 +                           fr->vir_el_recip,fr->ewaldcoeff,
 +                           lambda,&dvdlambda,fr->ewald_table);
 +            PRINT_SEPDVDL("Ewald long-range",Vlr,dvdlambda);
 +            break;
 +        default:
 +            Vlr = 0;
 +            gmx_fatal(FARGS,"No such electrostatics method implemented %s",
 +                      eel_names[fr->eeltype]);
 +        }
 +        if (status != 0)
 +        {
 +            gmx_fatal(FARGS,"Error %d in long range electrostatics routine %s",
 +                      status,EELTYPE(fr->eeltype));
 +              }
 +        enerd->dvdl_lin += dvdlambda;
 +        enerd->term[F_COUL_RECIP] = Vlr + Vcorr;
 +        if (debug)
 +        {
 +            fprintf(debug,"Vlr = %g, Vcorr = %g, Vlr_corr = %g\n",
 +                    Vlr,Vcorr,enerd->term[F_COUL_RECIP]);
 +            pr_rvecs(debug,0,"vir_el_recip after corr",fr->vir_el_recip,DIM);
 +            pr_rvecs(debug,0,"fshift after LR Corrections",fr->fshift,SHIFTS);
 +        }
 +    }
 +    else
 +    {
 +        if (EEL_RF(fr->eeltype))
 +        {
 +            dvdlambda = 0;
 +            
 +            if (fr->eeltype != eelRF_NEC)
 +            {
 +                enerd->term[F_RF_EXCL] =
 +                    RF_excl_correction(fplog,fr,graph,md,excl,x,f,
 +                                       fr->fshift,&pbc,lambda,&dvdlambda);
 +            }
 +            
 +            enerd->dvdl_lin += dvdlambda;
 +            PRINT_SEPDVDL("RF exclusion correction",
 +                          enerd->term[F_RF_EXCL],dvdlambda);
 +        }
 +    }
 +    where();
 +    debug_gmx();
 +      
 +    if (debug)
 +    {
 +        print_nrnb(debug,nrnb); 
 +    }
 +    debug_gmx();
 +    
 +#ifdef GMX_MPI
 +    if (TAKETIME)
 +    {
 +        t2=MPI_Wtime();
 +        MPI_Barrier(cr->mpi_comm_mygroup);
 +        t3=MPI_Wtime();
 +        fr->t_wait += t3-t2;
 +        if (fr->timesteps == 11)
 +        {
 +            fprintf(stderr,"* PP load balancing info: node %d, step %s, rel wait time=%3.0f%% , load string value: %7.2f\n", 
 +                    cr->nodeid, gmx_step_str(fr->timesteps,buf), 
 +                    100*fr->t_wait/(fr->t_wait+fr->t_fnbf), 
 +                    (fr->t_fnbf+fr->t_wait)/fr->t_fnbf);
 +        }       
 +        fr->timesteps++;
 +    }
 +#endif
 +    
 +    if (debug)
 +    {
 +        pr_rvecs(debug,0,"fshift after bondeds",fr->fshift,SHIFTS);
 +    }
 +    
 +}
 +
 +void init_enerdata(int ngener,int n_flambda,gmx_enerdata_t *enerd)
 +{
 +    int i,n2;
 +    
 +    for(i=0; i<F_NRE; i++)
 +    {
 +        enerd->term[i] = 0;
 +    }
 +    
 +    n2=ngener*ngener;
 +    if (debug)
 +    {
 +        fprintf(debug,"Creating %d sized group matrix for energies\n",n2);
 +    }
 +    enerd->grpp.nener = n2;
 +    for(i=0; (i<egNR); i++)
 +    {
 +        snew(enerd->grpp.ener[i],n2);
 +    }
 +
 +    if (n_flambda)
 +    {
 +        enerd->n_lambda = 1 + n_flambda;
 +        snew(enerd->enerpart_lambda,enerd->n_lambda);
 +    }
 +    else
 +    {
 +        enerd->n_lambda = 0;
 +    }
 +}
 +
 +void destroy_enerdata(gmx_enerdata_t *enerd)
 +{
 +    int i;
 +
 +    for(i=0; (i<egNR); i++)
 +    {
 +        sfree(enerd->grpp.ener[i]);
 +    }
 +
 +    if (enerd->n_lambda)
 +    {
 +        sfree(enerd->enerpart_lambda);
 +    }
 +}
 +
 +static real sum_v(int n,real v[])
 +{
 +  real t;
 +  int  i;
 +  
 +  t = 0.0;
 +  for(i=0; (i<n); i++)
 +    t = t + v[i];
 +    
 +  return t;
 +}
 +
 +void sum_epot(t_grpopts *opts,gmx_enerdata_t *enerd)
 +{
 +  gmx_grppairener_t *grpp;
 +  real *epot;
 +  int i;
 +  
 +  grpp = &enerd->grpp;
 +  epot = enerd->term;
 +
 +  /* Accumulate energies */
 +  epot[F_COUL_SR]  = sum_v(grpp->nener,grpp->ener[egCOULSR]);
 +  epot[F_LJ]       = sum_v(grpp->nener,grpp->ener[egLJSR]);
 +  epot[F_LJ14]     = sum_v(grpp->nener,grpp->ener[egLJ14]);
 +  epot[F_COUL14]   = sum_v(grpp->nener,grpp->ener[egCOUL14]);
 +  epot[F_COUL_LR]  = sum_v(grpp->nener,grpp->ener[egCOULLR]);
 +  epot[F_LJ_LR]    = sum_v(grpp->nener,grpp->ener[egLJLR]);
 +  /* We have already added 1-2,1-3, and 1-4 terms to F_GBPOL */
 +  epot[F_GBPOL]   += sum_v(grpp->nener,grpp->ener[egGB]);
 +    
 +/* lattice part of LR doesnt belong to any group
 + * and has been added earlier
 + */
 +  epot[F_BHAM]     = sum_v(grpp->nener,grpp->ener[egBHAMSR]);
 +  epot[F_BHAM_LR]  = sum_v(grpp->nener,grpp->ener[egBHAMLR]);
 +
 +  epot[F_EPOT] = 0;
 +  for(i=0; (i<F_EPOT); i++)
 +    if (i != F_DISRESVIOL && i != F_ORIRESDEV && i != F_DIHRESVIOL)
 +      epot[F_EPOT] += epot[i];
 +}
 +
 +void sum_dhdl(gmx_enerdata_t *enerd,double lambda,t_inputrec *ir)
 +{
 +    int i;
 +    double dlam,dhdl_lin;
 +
 +    enerd->term[F_DVDL] = enerd->dvdl_lin + enerd->dvdl_nonlin;
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"dvdl: %f, non-linear %f + linear %f\n",
 +                enerd->term[F_DVDL],enerd->dvdl_nonlin,enerd->dvdl_lin);
 +    }
 +
 +    /* Notes on the foreign lambda free energy difference evaluation:
 +     * Adding the potential and ekin terms that depend linearly on lambda
 +     * as delta lam * dvdl to the energy differences is exact.
 +     * For the constraint dvdl this is not exact, but we have no other option.
 +     * For the non-bonded LR term we assume that the soft-core (if present)
 +     * no longer affects the energy beyond the short-range cut-off,
 +     * which is a very good approximation (except for exotic settings).
 +     */
 +    for(i=1; i<enerd->n_lambda; i++)
 +    {
 +        dlam = (ir->flambda[i-1] - lambda);
 +        dhdl_lin =
 +            enerd->dvdl_lin + enerd->term[F_DKDL] + enerd->term[F_DHDL_CON];
 +        if (debug)
 +        {
 +            fprintf(debug,"enerdiff lam %g: non-linear %f linear %f*%f\n",
 +                    ir->flambda[i-1],
 +                    enerd->enerpart_lambda[i] - enerd->enerpart_lambda[0],
 +                    dlam,dhdl_lin);
 +        }
 +        enerd->enerpart_lambda[i] += dlam*dhdl_lin;
 +
 +    }
 +}
 +
 +void reset_enerdata(t_grpopts *opts,
 +                    t_forcerec *fr,gmx_bool bNS,
 +                    gmx_enerdata_t *enerd,
 +                    gmx_bool bMaster)
 +{
 +  gmx_bool bKeepLR;
 +  int  i,j;
 +  
 +  /* First reset all energy components, except for the long range terms
 +   * on the master at non neighbor search steps, since the long range
 +   * terms have already been summed at the last neighbor search step.
 +   */
 +  bKeepLR = (fr->bTwinRange && !bNS);
 +  for(i=0; (i<egNR); i++) {
 +    if (!(bKeepLR && bMaster && (i == egCOULLR || i == egLJLR))) {
 +      for(j=0; (j<enerd->grpp.nener); j++)
 +      enerd->grpp.ener[i][j] = 0.0;
 +    }
 +  }
 +  enerd->dvdl_lin    = 0.0;
 +  enerd->dvdl_nonlin = 0.0;
 +
 +  /* Normal potential energy components */
 +  for(i=0; (i<=F_EPOT); i++) {
 +    enerd->term[i] = 0.0;
 +  }
 +  /* Initialize the dVdlambda term with the long range contribution */
 +  enerd->term[F_DVDL]     = 0.0;
 +  enerd->term[F_DKDL]     = 0.0;
 +  enerd->term[F_DHDL_CON] = 0.0;
 +  if (enerd->n_lambda > 0)
 +  {
 +      for(i=0; i<enerd->n_lambda; i++)
 +      {
 +          enerd->enerpart_lambda[i] = 0.0;
 +      }
 +  }
 +}
Simple merge
index 3d2428f15a534486265c3b9c46160cbce2a41e41,0000000000000000000000000000000000000000..c08eaef67a0ee0b7692a7f0fb2088a68c376b37b
mode 100644,000000..100644
--- /dev/null
@@@ -1,747 -1,0 +1,742 @@@
- #if ((defined WIN32 || defined _WIN32 || defined WIN64 || defined _WIN64) && !defined __CYGWIN__ && !defined __CYGWIN32__)
- /* _isnan() */
- #include <float.h>
- #endif
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
-         
-         /* Calculate center of mass velocity if necessary, also parallellized */
-         if (bStopCM && !bRerunMD && bEner) 
-         {
-             calc_vcm_grp(fplog,mdatoms->start,mdatoms->homenr,mdatoms,
-                          state->x,state->v,vcm);
-         }
 +#include "typedefs.h"
 +#include "string2.h"
 +#include "smalloc.h"
 +#include "mdrun.h"
 +#include "domdec.h"
 +#include "mtop_util.h"
 +#include "gmx_wallcycle.h"
 +#include "vcm.h"
 +#include "nrnb.h"
 +#include "macros.h"
 +
 +/* Is the signal in one simulation independent of other simulations? */
 +gmx_bool gs_simlocal[eglsNR] = { TRUE, FALSE, FALSE, TRUE };
 +
 +/* check which of the multisim simulations has the shortest number of
 +   steps and return that number of nsteps */
 +gmx_large_int_t get_multisim_nsteps(const t_commrec *cr,
 +                                    gmx_large_int_t nsteps)
 +{
 +    gmx_large_int_t steps_out;
 +
 +    if MASTER(cr)
 +    {
 +        gmx_large_int_t *buf;
 +        int s;
 +
 +        snew(buf,cr->ms->nsim);
 +
 +        buf[cr->ms->sim] = nsteps;
 +        gmx_sumli_sim(cr->ms->nsim, buf, cr->ms);
 +
 +        steps_out=-1;
 +        for(s=0; s<cr->ms->nsim; s++)
 +        {
 +            /* find the smallest positive number */
 +            if (buf[s]>= 0 && ((steps_out < 0) || (buf[s]<steps_out)) )
 +            {
 +                steps_out=buf[s];
 +            }
 +        }
 +        sfree(buf);
 +
 +        /* if we're the limiting simulation, don't do anything */
 +        if (steps_out>=0 && steps_out<nsteps) 
 +        {
 +            char strbuf[255];
 +            snprintf(strbuf, 255, "Will stop simulation %%d after %s steps (another simulation will end then).\n", gmx_large_int_pfmt);
 +            fprintf(stderr, strbuf, cr->ms->sim, steps_out);
 +        }
 +    }
 +    /* broadcast to non-masters */
 +    gmx_bcast(sizeof(gmx_large_int_t), &steps_out, cr);
 +    return steps_out;
 +}
 +
 +int multisim_min(const gmx_multisim_t *ms,int nmin,int n)
 +{
 +    int  *buf;
 +    gmx_bool bPos,bEqual;
 +    int  s,d;
 +
 +    snew(buf,ms->nsim);
 +    buf[ms->sim] = n;
 +    gmx_sumi_sim(ms->nsim,buf,ms);
 +    bPos   = TRUE;
 +    bEqual = TRUE;
 +    for(s=0; s<ms->nsim; s++)
 +    {
 +        bPos   = bPos   && (buf[s] > 0);
 +        bEqual = bEqual && (buf[s] == buf[0]);
 +    }
 +    if (bPos)
 +    {
 +        if (bEqual)
 +        {
 +            nmin = min(nmin,buf[0]);
 +        }
 +        else
 +        {
 +            /* Find the least common multiple */
 +            for(d=2; d<nmin; d++)
 +            {
 +                s = 0;
 +                while (s < ms->nsim && d % buf[s] == 0)
 +                {
 +                    s++;
 +                }
 +                if (s == ms->nsim)
 +                {
 +                    /* We found the LCM and it is less than nmin */
 +                    nmin = d;
 +                    break;
 +                }
 +            }
 +        }
 +    }
 +    sfree(buf);
 +
 +    return nmin;
 +}
 +
 +int multisim_nstsimsync(const t_commrec *cr,
 +                        const t_inputrec *ir,int repl_ex_nst)
 +{
 +    int nmin;
 +
 +    if (MASTER(cr))
 +    {
 +        nmin = INT_MAX;
 +        nmin = multisim_min(cr->ms,nmin,ir->nstlist);
 +        nmin = multisim_min(cr->ms,nmin,ir->nstcalcenergy);
 +        nmin = multisim_min(cr->ms,nmin,repl_ex_nst);
 +        if (nmin == INT_MAX)
 +        {
 +            gmx_fatal(FARGS,"Can not find an appropriate interval for inter-simulation communication, since nstlist, nstcalcenergy and -replex are all <= 0");
 +        }
 +        /* Avoid inter-simulation communication at every (second) step */
 +        if (nmin <= 2)
 +        {
 +            nmin = 10;
 +        }
 +    }
 +
 +    gmx_bcast(sizeof(int),&nmin,cr);
 +
 +    return nmin;
 +}
 +
 +void init_global_signals(globsig_t *gs,const t_commrec *cr,
 +                         const t_inputrec *ir,int repl_ex_nst)
 +{
 +    int i;
 +
 +    if (MULTISIM(cr))
 +    {
 +        gs->nstms = multisim_nstsimsync(cr,ir,repl_ex_nst);
 +        if (debug)
 +        {
 +            fprintf(debug,"Syncing simulations for checkpointing and termination every %d steps\n",gs->nstms);
 +        }
 +    }
 +    else
 +    {
 +        gs->nstms = 1;
 +    }
 +
 +    for(i=0; i<eglsNR; i++)
 +    {
 +        gs->sig[i] = 0;
 +        gs->set[i] = 0;
 +    }
 +}
 +
 +void copy_coupling_state(t_state *statea,t_state *stateb, 
 +                         gmx_ekindata_t *ekinda,gmx_ekindata_t *ekindb, t_grpopts* opts) 
 +{
 +    
 +    /* MRS note -- might be able to get rid of some of the arguments.  Look over it when it's all debugged */
 +    
 +    int i,j,nc;
 +
 +    /* Make sure we have enough space for x and v */
 +    if (statea->nalloc > stateb->nalloc)
 +    {
 +        stateb->nalloc = statea->nalloc;
 +        srenew(stateb->x,stateb->nalloc);
 +        srenew(stateb->v,stateb->nalloc);
 +    }
 +
 +    stateb->natoms     = statea->natoms;
 +    stateb->ngtc       = statea->ngtc;
 +    stateb->nnhpres    = statea->nnhpres;
 +    stateb->veta       = statea->veta;
 +    if (ekinda) 
 +    {
 +        copy_mat(ekinda->ekin,ekindb->ekin);
 +        for (i=0; i<stateb->ngtc; i++) 
 +        {
 +            ekindb->tcstat[i].T = ekinda->tcstat[i].T;
 +            ekindb->tcstat[i].Th = ekinda->tcstat[i].Th;
 +            copy_mat(ekinda->tcstat[i].ekinh,ekindb->tcstat[i].ekinh);
 +            copy_mat(ekinda->tcstat[i].ekinf,ekindb->tcstat[i].ekinf);
 +            ekindb->tcstat[i].ekinscalef_nhc =  ekinda->tcstat[i].ekinscalef_nhc;
 +            ekindb->tcstat[i].ekinscaleh_nhc =  ekinda->tcstat[i].ekinscaleh_nhc;
 +            ekindb->tcstat[i].vscale_nhc =  ekinda->tcstat[i].vscale_nhc;
 +        }
 +    }
 +    copy_rvecn(statea->x,stateb->x,0,stateb->natoms);
 +    copy_rvecn(statea->v,stateb->v,0,stateb->natoms);
 +    copy_mat(statea->box,stateb->box);
 +    copy_mat(statea->box_rel,stateb->box_rel);
 +    copy_mat(statea->boxv,stateb->boxv);
 +
 +    for (i = 0; i<stateb->ngtc; i++) 
 +    { 
 +        nc = i*opts->nhchainlength;
 +        for (j=0; j<opts->nhchainlength; j++) 
 +        {
 +            stateb->nosehoover_xi[nc+j]  = statea->nosehoover_xi[nc+j];
 +            stateb->nosehoover_vxi[nc+j] = statea->nosehoover_vxi[nc+j];
 +        }
 +    }
 +    if (stateb->nhpres_xi != NULL)
 +    {
 +        for (i = 0; i<stateb->nnhpres; i++) 
 +        {
 +            nc = i*opts->nhchainlength;
 +            for (j=0; j<opts->nhchainlength; j++) 
 +            {
 +                stateb->nhpres_xi[nc+j]  = statea->nhpres_xi[nc+j];
 +                stateb->nhpres_vxi[nc+j] = statea->nhpres_vxi[nc+j];
 +            }
 +        }
 +    }
 +}
 +
 +real compute_conserved_from_auxiliary(t_inputrec *ir, t_state *state, t_extmass *MassQ)
 +{
 +    real quantity = 0;
 +    switch (ir->etc) 
 +    {
 +    case etcNO:
 +        break;
 +    case etcBERENDSEN:
 +        break;
 +    case etcNOSEHOOVER:
 +        quantity = NPT_energy(ir,state,MassQ);                
 +        break;
 +    case etcVRESCALE:
 +        quantity = vrescale_energy(&(ir->opts),state->therm_integral);
 +        break;
 +    default:
 +        break;
 +    }
 +    return quantity;
 +}
 +
 +void compute_globals(FILE *fplog, gmx_global_stat_t gstat, t_commrec *cr, t_inputrec *ir, 
 +                     t_forcerec *fr, gmx_ekindata_t *ekind, 
 +                     t_state *state, t_state *state_global, t_mdatoms *mdatoms, 
 +                     t_nrnb *nrnb, t_vcm *vcm, gmx_wallcycle_t wcycle,
 +                     gmx_enerdata_t *enerd,tensor force_vir, tensor shake_vir, tensor total_vir, 
 +                     tensor pres, rvec mu_tot, gmx_constr_t constr, 
 +                     globsig_t *gs,gmx_bool bInterSimGS,
 +                     matrix box, gmx_mtop_t *top_global, real *pcurr, 
 +                     int natoms, gmx_bool *bSumEkinhOld, int flags)
 +{
 +    int  i,gsi;
 +    real gs_buf[eglsNR];
 +    tensor corr_vir,corr_pres,shakeall_vir;
 +    gmx_bool bEner,bPres,bTemp, bVV;
 +    gmx_bool bRerunMD, bStopCM, bGStat, bIterate, 
 +        bFirstIterate,bReadEkin,bEkinAveVel,bScaleEkin, bConstrain;
 +    real ekin,temp,prescorr,enercorr,dvdlcorr;
 +    
 +    /* translate CGLO flags to gmx_booleans */
 +    bRerunMD = flags & CGLO_RERUNMD;
 +    bStopCM = flags & CGLO_STOPCM;
 +    bGStat = flags & CGLO_GSTAT;
 +
 +    bReadEkin = (flags & CGLO_READEKIN);
 +    bScaleEkin = (flags & CGLO_SCALEEKIN);
 +    bEner = flags & CGLO_ENERGY;
 +    bTemp = flags & CGLO_TEMPERATURE;
 +    bPres  = (flags & CGLO_PRESSURE);
 +    bConstrain = (flags & CGLO_CONSTRAINT);
 +    bIterate = (flags & CGLO_ITERATE);
 +    bFirstIterate = (flags & CGLO_FIRSTITERATE);
 +
 +    /* we calculate a full state kinetic energy either with full-step velocity verlet
 +       or half step where we need the pressure */
 +    
 +    bEkinAveVel = (ir->eI==eiVV || (ir->eI==eiVVAK && bPres) || bReadEkin);
 +    
 +    /* in initalization, it sums the shake virial in vv, and to 
 +       sums ekinh_old in leapfrog (or if we are calculating ekinh_old) for other reasons */
 +
 +    /* ########## Kinetic energy  ############## */
 +    
 +    if (bTemp) 
 +    {
 +        /* Non-equilibrium MD: this is parallellized, but only does communication
 +         * when there really is NEMD.
 +         */
 +        
 +        if (PAR(cr) && (ekind->bNEMD)) 
 +        {
 +            accumulate_u(cr,&(ir->opts),ekind);
 +        }
 +        debug_gmx();
 +        if (bReadEkin)
 +        {
 +            restore_ekinstate_from_state(cr,ekind,&state_global->ekinstate);
 +        }
 +        else 
 +        {
 +
 +            calc_ke_part(state,&(ir->opts),mdatoms,ekind,nrnb,bEkinAveVel,bIterate);
 +        }
 +        
 +        debug_gmx();
-     if (bTemp || bPres || bEner || bConstrain) 
 +    }
 +
-                             ir,ekind,constr,vcm,
++    /* Calculate center of mass velocity if necessary, also parallellized */
++    if (bStopCM)
++    {
++        calc_vcm_grp(fplog,mdatoms->start,mdatoms->homenr,mdatoms,
++                     state->x,state->v,vcm);
++    }
++
++    if (bTemp || bStopCM || bPres || bEner || bConstrain)
 +    {
 +        if (!bGStat)
 +        {
 +            /* We will not sum ekinh_old,                                                            
 +             * so signal that we still have to do it.                                                
 +             */
 +            *bSumEkinhOld = TRUE;
 +
 +        }
 +        else
 +        {
 +            if (gs != NULL)
 +            {
 +                for(i=0; i<eglsNR; i++)
 +                {
 +                    gs_buf[i] = gs->sig[i];
 +                }
 +            }
 +            if (PAR(cr)) 
 +            {
 +                wallcycle_start(wcycle,ewcMoveE);
 +                global_stat(fplog,gstat,cr,enerd,force_vir,shake_vir,mu_tot,
-     if (bEner) {
-         /* Do center of mass motion removal */
-         if (bStopCM && !bRerunMD) /* is this correct?  Does it get called too often with this logic? */
-         {
-             check_cm_grp(fplog,vcm,ir,1);
-             do_stopcm_grp(fplog,mdatoms->start,mdatoms->homenr,mdatoms->cVCM,
-                           state->x,state->v,vcm);
-             inc_nrnb(nrnb,eNR_STOPCM,mdatoms->homenr);
-         }
++                            ir,ekind,constr,bStopCM ? vcm : NULL,
 +                            gs != NULL ? eglsNR : 0,gs_buf,
 +                            top_global,state,
 +                            *bSumEkinhOld,flags);
 +                wallcycle_stop(wcycle,ewcMoveE);
 +            }
 +            if (gs != NULL)
 +            {
 +                if (MULTISIM(cr) && bInterSimGS)
 +                {
 +                    if (MASTER(cr))
 +                    {
 +                        /* Communicate the signals between the simulations */
 +                        gmx_sum_sim(eglsNR,gs_buf,cr->ms);
 +                    }
 +                    /* Communicate the signals form the master to the others */
 +                    gmx_bcast(eglsNR*sizeof(gs_buf[0]),gs_buf,cr);
 +                }
 +                for(i=0; i<eglsNR; i++)
 +                {
 +                    if (bInterSimGS || gs_simlocal[i])
 +                    {
 +                        /* Set the communicated signal only when it is non-zero,
 +                         * since signals might not be processed at each MD step.
 +                         */
 +                        gsi = (gs_buf[i] >= 0 ?
 +                               (int)(gs_buf[i] + 0.5) :
 +                               (int)(gs_buf[i] - 0.5));
 +                        if (gsi != 0)
 +                        {
 +                            gs->set[i] = gsi;
 +                        }
 +                        /* Turn off the local signal */
 +                        gs->sig[i] = 0;
 +                    }
 +                }
 +            }
 +            *bSumEkinhOld = FALSE;
 +        }
 +    }
 +    
 +    if (!ekind->bNEMD && debug && bTemp && (vcm->nr > 0))
 +    {
 +        correct_ekin(debug,
 +                     mdatoms->start,mdatoms->start+mdatoms->homenr,
 +                     state->v,vcm->group_p[0],
 +                     mdatoms->massT,mdatoms->tmass,ekind->ekin);
 +    }
 +    
-         enerd->term[F_PRES] = calc_pres(fr->ePBC,ir->nwall,box,ekind->ekin,total_vir,pres,
-                                         (fr->eeltype==eelPPPM)?enerd->term[F_COUL_RECIP]:0.0);
++    /* Do center of mass motion removal */
++    if (bStopCM)
++    {
++        check_cm_grp(fplog,vcm,ir,1);
++        do_stopcm_grp(fplog,mdatoms->start,mdatoms->homenr,mdatoms->cVCM,
++                      state->x,state->v,vcm);
++        inc_nrnb(nrnb,eNR_STOPCM,mdatoms->homenr);
++    }
 +
++    if (bEner)
++    {
 +        /* Calculate the amplitude of the cosine velocity profile */
 +        ekind->cosacc.vcos = ekind->cosacc.mvcos/mdatoms->tmass;
 +    }
 +
 +    if (bTemp) 
 +    {
 +        /* Sum the kinetic energies of the groups & calc temp */
 +        /* compute full step kinetic energies if vv, or if vv-avek and we are computing the pressure with IR_NPT_TROTTER */
 +        /* three maincase:  VV with AveVel (md-vv), vv with AveEkin (md-vv-avek), leap with AveEkin (md).  
 +           Leap with AveVel is not supported; it's not clear that it will actually work.  
 +           bEkinAveVel: If TRUE, we simply multiply ekin by ekinscale to get a full step kinetic energy. 
 +           If FALSE, we average ekinh_old and ekinh*ekinscale_nhc to get an averaged half step kinetic energy.
 +           bSaveEkinOld: If TRUE (in the case of iteration = bIterate is TRUE), we don't reset the ekinscale_nhc.  
 +           If FALSE, we go ahead and erase over it.
 +        */ 
 +        enerd->term[F_TEMP] = sum_ekin(&(ir->opts),ekind,&(enerd->term[F_DKDL]),
 +                                       bEkinAveVel,bIterate,bScaleEkin);
 + 
 +        enerd->term[F_EKIN] = trace(ekind->ekin);
 +    }
 +    
 +    /* ##########  Long range energy information ###### */
 +    
 +    if (bEner || bPres || bConstrain) 
 +    {
 +        calc_dispcorr(fplog,ir,fr,0,top_global->natoms,box,state->lambda,
 +                      corr_pres,corr_vir,&prescorr,&enercorr,&dvdlcorr);
 +    }
 +    
 +    if (bEner && bFirstIterate) 
 +    {
 +        enerd->term[F_DISPCORR] = enercorr;
 +        enerd->term[F_EPOT] += enercorr;
 +        enerd->term[F_DVDL] += dvdlcorr;
 +        if (fr->efep != efepNO) {
 +            enerd->dvdl_lin += dvdlcorr;
 +        }
 +    }
 +    
 +    /* ########## Now pressure ############## */
 +    if (bPres || bConstrain) 
 +    {
 +        
 +        m_add(force_vir,shake_vir,total_vir);
 +        
 +        /* Calculate pressure and apply LR correction if PPPM is used.
 +         * Use the box from last timestep since we already called update().
 +         */
 +        
++        enerd->term[F_PRES] = calc_pres(fr->ePBC,ir->nwall,box,ekind->ekin,total_vir,pres);
 +        
 +        /* Calculate long range corrections to pressure and energy */
 +        /* this adds to enerd->term[F_PRES] and enerd->term[F_ETOT], 
 +           and computes enerd->term[F_DISPCORR].  Also modifies the 
 +           total_vir and pres tesors */
 +        
 +        m_add(total_vir,corr_vir,total_vir);
 +        m_add(pres,corr_pres,pres);
 +        enerd->term[F_PDISPCORR] = prescorr;
 +        enerd->term[F_PRES] += prescorr;
 +        *pcurr = enerd->term[F_PRES];
 +        /* calculate temperature using virial */
 +        enerd->term[F_VTEMP] = calc_temp(trace(total_vir),ir->opts.nrdf[0]);
 +        
 +    }    
 +}
 +
 +void check_nst_param(FILE *fplog,t_commrec *cr,
 +                     const char *desc_nst,int nst,
 +                     const char *desc_p,int *p)
 +{
 +    char buf[STRLEN];
 +
 +    if (*p > 0 && *p % nst != 0)
 +    {
 +        /* Round up to the next multiple of nst */
 +        *p = ((*p)/nst + 1)*nst;
 +        sprintf(buf,"NOTE: %s changes %s to %d\n",desc_nst,desc_p,*p);
 +        md_print_warning(cr,fplog,buf);
 +    }
 +}
 +
 +void reset_all_counters(FILE *fplog,t_commrec *cr,
 +                        gmx_large_int_t step,
 +                        gmx_large_int_t *step_rel,t_inputrec *ir,
 +                        gmx_wallcycle_t wcycle,t_nrnb *nrnb,
 +                        gmx_runtime_t *runtime)
 +{
 +    char buf[STRLEN],sbuf[STEPSTRSIZE];
 +
 +    /* Reset all the counters related to performance over the run */
 +    sprintf(buf,"Step %s: resetting all time and cycle counters\n",
 +            gmx_step_str(step,sbuf));
 +    md_print_warning(cr,fplog,buf);
 +
 +    wallcycle_stop(wcycle,ewcRUN);
 +    wallcycle_reset_all(wcycle);
 +    if (DOMAINDECOMP(cr))
 +    {
 +        reset_dd_statistics_counters(cr->dd);
 +    }
 +    init_nrnb(nrnb);
 +    ir->init_step += *step_rel;
 +    ir->nsteps    -= *step_rel;
 +    *step_rel = 0;
 +    wallcycle_start(wcycle,ewcRUN);
 +    runtime_start(runtime);
 +    print_date_and_time(fplog,cr->nodeid,"Restarted time",runtime);
 +}
 +
 +void min_zero(int *n,int i)
 +{
 +    if (i > 0 && (*n == 0 || i < *n))
 +    {
 +        *n = i;
 +    }
 +}
 +
 +int lcd4(int i1,int i2,int i3,int i4)
 +{
 +    int nst;
 +
 +    nst = 0;
 +    min_zero(&nst,i1);
 +    min_zero(&nst,i2);
 +    min_zero(&nst,i3);
 +    min_zero(&nst,i4);
 +    if (nst == 0)
 +    {
 +        gmx_incons("All 4 inputs for determininig nstglobalcomm are <= 0");
 +    }
 +    
 +    while (nst > 1 && ((i1 > 0 && i1 % nst != 0)  ||
 +                       (i2 > 0 && i2 % nst != 0)  ||
 +                       (i3 > 0 && i3 % nst != 0)  ||
 +                       (i4 > 0 && i4 % nst != 0)))
 +    {
 +        nst--;
 +    }
 +
 +    return nst;
 +}
 +
 +int check_nstglobalcomm(FILE *fplog,t_commrec *cr,
 +                        int nstglobalcomm,t_inputrec *ir)
 +{
 +    char buf[STRLEN];
 +
 +    if (!EI_DYNAMICS(ir->eI))
 +    {
 +        nstglobalcomm = 1;
 +    }
 +
 +    if (nstglobalcomm == -1)
 +    {
 +        if (!(ir->nstcalcenergy > 0 ||
 +              ir->nstlist > 0 ||
 +              ir->etc != etcNO ||
 +              ir->epc != epcNO))
 +        {
 +            nstglobalcomm = 10;
 +            if (ir->nstenergy > 0 && ir->nstenergy < nstglobalcomm)
 +            {
 +                nstglobalcomm = ir->nstenergy;
 +            }
 +        }
 +        else
 +        {
 +            /* Ensure that we do timely global communication for
 +             * (possibly) each of the four following options.
 +             */
 +            nstglobalcomm = lcd4(ir->nstcalcenergy,
 +                                 ir->nstlist,
 +                                 ir->etc != etcNO ? ir->nsttcouple : 0,
 +                                 ir->epc != epcNO ? ir->nstpcouple : 0);
 +        }
 +    }
 +    else
 +    {
 +        if (ir->nstlist > 0 &&
 +            nstglobalcomm > ir->nstlist && nstglobalcomm % ir->nstlist != 0)
 +        {
 +            nstglobalcomm = (nstglobalcomm / ir->nstlist)*ir->nstlist;
 +            sprintf(buf,"WARNING: nstglobalcomm is larger than nstlist, but not a multiple, setting it to %d\n",nstglobalcomm);
 +            md_print_warning(cr,fplog,buf);
 +        }
 +        if (ir->nstcalcenergy > 0)
 +        {
 +            check_nst_param(fplog,cr,"-gcom",nstglobalcomm,
 +                            "nstcalcenergy",&ir->nstcalcenergy);
 +        }
 +        if (ir->etc != etcNO && ir->nsttcouple > 0)
 +        {
 +            check_nst_param(fplog,cr,"-gcom",nstglobalcomm,
 +                            "nsttcouple",&ir->nsttcouple);
 +        }
 +        if (ir->epc != epcNO && ir->nstpcouple > 0)
 +        {
 +            check_nst_param(fplog,cr,"-gcom",nstglobalcomm,
 +                            "nstpcouple",&ir->nstpcouple);
 +        }
 +
 +        check_nst_param(fplog,cr,"-gcom",nstglobalcomm,
 +                        "nstenergy",&ir->nstenergy);
 +
 +        check_nst_param(fplog,cr,"-gcom",nstglobalcomm,
 +                        "nstlog",&ir->nstlog);
 +    }
 +
 +    if (ir->comm_mode != ecmNO && ir->nstcomm < nstglobalcomm)
 +    {
 +        sprintf(buf,"WARNING: Changing nstcomm from %d to %d\n",
 +                ir->nstcomm,nstglobalcomm);
 +        md_print_warning(cr,fplog,buf);
 +        ir->nstcomm = nstglobalcomm;
 +    }
 +
 +    return nstglobalcomm;
 +}
 +
 +void check_ir_old_tpx_versions(t_commrec *cr,FILE *fplog,
 +                               t_inputrec *ir,gmx_mtop_t *mtop)
 +{
 +    /* Check required for old tpx files */
 +    if (IR_TWINRANGE(*ir) && ir->nstlist > 1 &&
 +        ir->nstcalcenergy % ir->nstlist != 0)
 +    {
 +        md_print_warning(cr,fplog,"Old tpr file with twin-range settings: modifying energy calculation and/or T/P-coupling frequencies");
 +
 +        if (gmx_mtop_ftype_count(mtop,F_CONSTR) +
 +            gmx_mtop_ftype_count(mtop,F_CONSTRNC) > 0 &&
 +            ir->eConstrAlg == econtSHAKE)
 +        {
 +            md_print_warning(cr,fplog,"With twin-range cut-off's and SHAKE the virial and pressure are incorrect");
 +            if (ir->epc != epcNO)
 +            {
 +                gmx_fatal(FARGS,"Can not do pressure coupling with twin-range cut-off's and SHAKE");
 +            }
 +        }
 +        check_nst_param(fplog,cr,"nstlist",ir->nstlist,
 +                        "nstcalcenergy",&ir->nstcalcenergy);
 +        if (ir->epc != epcNO)
 +        {
 +            check_nst_param(fplog,cr,"nstlist",ir->nstlist,
 +                            "nstpcouple",&ir->nstpcouple);
 +        }
 +        check_nst_param(fplog,cr,"nstcalcenergy",ir->nstcalcenergy,
 +                        "nstenergy",&ir->nstenergy);
 +        check_nst_param(fplog,cr,"nstcalcenergy",ir->nstcalcenergy,
 +                        "nstlog",&ir->nstlog);
 +        if (ir->efep != efepNO)
 +        {
 +            check_nst_param(fplog,cr,"nstcalcenergy",ir->nstcalcenergy,
 +                            "nstdhdl",&ir->nstdhdl);
 +        }
 +    }
 +}
 +
 +void rerun_parallel_comm(t_commrec *cr,t_trxframe *fr,
 +                         gmx_bool *bNotLastFrame)
 +{
 +    gmx_bool bAlloc;
 +    rvec *xp,*vp;
 +
 +    bAlloc = (fr->natoms == 0);
 +
 +    if (MASTER(cr) && !*bNotLastFrame)
 +    {
 +        fr->natoms = -1;
 +    }
 +    xp = fr->x;
 +    vp = fr->v;
 +    gmx_bcast(sizeof(*fr),fr,cr);
 +    fr->x = xp;
 +    fr->v = vp;
 +
 +    *bNotLastFrame = (fr->natoms >= 0);
 +
 +    if (*bNotLastFrame && PARTDECOMP(cr))
 +    {
 +        /* x and v are the only variable size quantities stored in trr
 +         * that are required for rerun (f is not needed).
 +         */
 +        if (bAlloc)
 +        {
 +            snew(fr->x,fr->natoms);
 +            snew(fr->v,fr->natoms);
 +        }
 +        if (fr->bX)
 +        {
 +            gmx_bcast(fr->natoms*sizeof(fr->x[0]),fr->x[0],cr);
 +        }
 +        if (fr->bV)
 +        {
 +            gmx_bcast(fr->natoms*sizeof(fr->v[0]),fr->v[0],cr);
 +        }
 +    }
 +}
 +
 +void md_print_warning(const t_commrec *cr,FILE *fplog,const char *buf)
 +{
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr,"\n%s\n",buf);
 +    }
 +    if (fplog)
 +    {
 +        fprintf(fplog,"\n%s\n",buf);
 +    }
 +}
Simple merge
index 3ca50a3db80eade48f896b8c99c7666dfdb8b4b8,0000000000000000000000000000000000000000..21c34c6750321d428bc7ba78a15b1e83742987b3
mode 100644,000000..100644
--- /dev/null
@@@ -1,2472 -1,0 +1,2469 @@@
-   /* Calculate long range corrections to pressure and energy */
-   calc_dispcorr(fplog,inputrec,fr,count,top_global->natoms,ems->s.box,ems->s.lambda,
-                 pres,force_vir,&prescorr,&enercorr,&dvdlcorr);
-   /* don't think these next 4 lines  can be moved in for now, because we 
-      don't always want to write it -- figure out how to clean this up MRS 8/4/2009 */
-   enerd->term[F_DISPCORR] = enercorr;
-   enerd->term[F_EPOT] += enercorr;
-   enerd->term[F_PRES] += prescorr;
-   enerd->term[F_DVDL] += dvdlcorr;
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <string.h>
 +#include <time.h>
 +#include <math.h>
 +#include "sysstuff.h"
 +#include "string2.h"
 +#include "network.h"
 +#include "confio.h"
 +#include "copyrite.h"
 +#include "smalloc.h"
 +#include "nrnb.h"
 +#include "main.h"
 +#include "force.h"
 +#include "macros.h"
 +#include "random.h"
 +#include "names.h"
 +#include "gmx_fatal.h"
 +#include "txtdump.h"
 +#include "typedefs.h"
 +#include "update.h"
 +#include "constr.h"
 +#include "vec.h"
 +#include "statutil.h"
 +#include "tgroup.h"
 +#include "mdebin.h"
 +#include "vsite.h"
 +#include "force.h"
 +#include "mdrun.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "trnio.h"
 +#include "sparsematrix.h"
 +#include "mtxio.h"
 +#include "mdatoms.h"
 +#include "ns.h"
 +#include "gmx_wallcycle.h"
 +#include "mtop_util.h"
 +#include "gmxfio.h"
 +#include "pme.h"
 +#include "membed.h"
 +
 +typedef struct {
 +  t_state s;
 +  rvec    *f;
 +  real    epot;
 +  real    fnorm;
 +  real    fmax;
 +  int     a_fmax;
 +} em_state_t;
 +
 +static em_state_t *init_em_state()
 +{
 +  em_state_t *ems;
 +  
 +  snew(ems,1);
 +
 +  return ems;
 +}
 +
 +static void print_em_start(FILE *fplog,t_commrec *cr,gmx_runtime_t *runtime,
 +                           gmx_wallcycle_t wcycle,
 +                           const char *name)
 +{
 +    char buf[STRLEN];
 +
 +    runtime_start(runtime);
 +
 +    sprintf(buf,"Started %s",name);
 +    print_date_and_time(fplog,cr->nodeid,buf,NULL);
 +
 +    wallcycle_start(wcycle,ewcRUN);
 +}
 +static void em_time_end(FILE *fplog,t_commrec *cr,gmx_runtime_t *runtime,
 +                        gmx_wallcycle_t wcycle)
 +{
 +    wallcycle_stop(wcycle,ewcRUN);
 +
 +    runtime_end(runtime);
 +}
 +
 +static void sp_header(FILE *out,const char *minimizer,real ftol,int nsteps)
 +{
 +    fprintf(out,"\n");
 +    fprintf(out,"%s:\n",minimizer);
 +    fprintf(out,"   Tolerance (Fmax)   = %12.5e\n",ftol);
 +    fprintf(out,"   Number of steps    = %12d\n",nsteps);
 +}
 +
 +static void warn_step(FILE *fp,real ftol,gmx_bool bLastStep,gmx_bool bConstrain)
 +{
 +    if (bLastStep)
 +    {
 +        fprintf(fp,"\nReached the maximum number of steps before reaching Fmax < %g\n",ftol);
 +    }
 +    else
 +    {
 +        fprintf(fp,"\nStepsize too small, or no change in energy.\n"
 +                "Converged to machine precision,\n"
 +                "but not to the requested precision Fmax < %g\n",
 +                ftol);
 +        if (sizeof(real)<sizeof(double))
 +        {
 +            fprintf(fp,"\nDouble precision normally gives you higher accuracy.\n");
 +        }
 +        if (bConstrain)
 +        {
 +            fprintf(fp,"You might need to increase your constraint accuracy, or turn\n"
 +                    "off constraints alltogether (set constraints = none in mdp file)\n");
 +        }
 +    }
 +}
 +
 +
 +
 +static void print_converged(FILE *fp,const char *alg,real ftol,
 +                          gmx_large_int_t count,gmx_bool bDone,gmx_large_int_t nsteps,
 +                          real epot,real fmax, int nfmax, real fnorm)
 +{
 +  char buf[STEPSTRSIZE];
 +
 +  if (bDone)
 +    fprintf(fp,"\n%s converged to Fmax < %g in %s steps\n",
 +          alg,ftol,gmx_step_str(count,buf)); 
 +  else if(count<nsteps)
 +    fprintf(fp,"\n%s converged to machine precision in %s steps,\n"
 +               "but did not reach the requested Fmax < %g.\n",
 +          alg,gmx_step_str(count,buf),ftol);
 +  else 
 +    fprintf(fp,"\n%s did not converge to Fmax < %g in %s steps.\n",
 +          alg,ftol,gmx_step_str(count,buf));
 +
 +#ifdef GMX_DOUBLE
 +  fprintf(fp,"Potential Energy  = %21.14e\n",epot); 
 +  fprintf(fp,"Maximum force     = %21.14e on atom %d\n",fmax,nfmax+1); 
 +  fprintf(fp,"Norm of force     = %21.14e\n",fnorm); 
 +#else
 +  fprintf(fp,"Potential Energy  = %14.7e\n",epot); 
 +  fprintf(fp,"Maximum force     = %14.7e on atom %d\n",fmax,nfmax+1); 
 +  fprintf(fp,"Norm of force     = %14.7e\n",fnorm); 
 +#endif
 +}
 +
 +static void get_f_norm_max(t_commrec *cr,
 +                         t_grpopts *opts,t_mdatoms *mdatoms,rvec *f,
 +                         real *fnorm,real *fmax,int *a_fmax)
 +{
 +  double fnorm2,*sum;
 +  real fmax2,fmax2_0,fam;
 +  int  la_max,a_max,start,end,i,m,gf;
 +
 +  /* This routine finds the largest force and returns it.
 +   * On parallel machines the global max is taken.
 +   */
 +  fnorm2 = 0;
 +  fmax2 = 0;
 +  la_max = -1;
 +  gf = 0;
 +  start = mdatoms->start;
 +  end   = mdatoms->homenr + start;
 +  if (mdatoms->cFREEZE) {
 +    for(i=start; i<end; i++) {
 +      gf = mdatoms->cFREEZE[i];
 +      fam = 0;
 +      for(m=0; m<DIM; m++)
 +      if (!opts->nFreeze[gf][m])
 +        fam += sqr(f[i][m]);
 +      fnorm2 += fam;
 +      if (fam > fmax2) {
 +      fmax2  = fam;
 +      la_max = i;
 +      }
 +    }
 +  } else {
 +    for(i=start; i<end; i++) {
 +      fam = norm2(f[i]);
 +      fnorm2 += fam;
 +      if (fam > fmax2) {
 +      fmax2  = fam;
 +      la_max = i;
 +      }
 +    }
 +  }
 +
 +  if (la_max >= 0 && DOMAINDECOMP(cr)) {
 +    a_max = cr->dd->gatindex[la_max];
 +  } else {
 +    a_max = la_max;
 +  }
 +  if (PAR(cr)) {
 +    snew(sum,2*cr->nnodes+1);
 +    sum[2*cr->nodeid]   = fmax2;
 +    sum[2*cr->nodeid+1] = a_max;
 +    sum[2*cr->nnodes]   = fnorm2;
 +    gmx_sumd(2*cr->nnodes+1,sum,cr);
 +    fnorm2 = sum[2*cr->nnodes];
 +    /* Determine the global maximum */
 +    for(i=0; i<cr->nnodes; i++) {
 +      if (sum[2*i] > fmax2) {
 +      fmax2 = sum[2*i];
 +      a_max = (int)(sum[2*i+1] + 0.5);
 +      }
 +    }
 +    sfree(sum);
 +  }
 +
 +  if (fnorm)
 +    *fnorm = sqrt(fnorm2);
 +  if (fmax)
 +    *fmax  = sqrt(fmax2);
 +  if (a_fmax)
 +    *a_fmax = a_max;
 +}
 +
 +static void get_state_f_norm_max(t_commrec *cr,
 +                         t_grpopts *opts,t_mdatoms *mdatoms,
 +                         em_state_t *ems)
 +{
 +  get_f_norm_max(cr,opts,mdatoms,ems->f,&ems->fnorm,&ems->fmax,&ems->a_fmax);
 +}
 +
 +void init_em(FILE *fplog,const char *title,
 +             t_commrec *cr,t_inputrec *ir,
 +             t_state *state_global,gmx_mtop_t *top_global,
 +             em_state_t *ems,gmx_localtop_t **top,
 +             rvec **f,rvec **f_global,
 +             t_nrnb *nrnb,rvec mu_tot,
 +             t_forcerec *fr,gmx_enerdata_t **enerd,
 +             t_graph **graph,t_mdatoms *mdatoms,gmx_global_stat_t *gstat,
 +             gmx_vsite_t *vsite,gmx_constr_t constr,
 +             int nfile,const t_filenm fnm[],
 +             gmx_mdoutf_t **outf,t_mdebin **mdebin)
 +{
 +    int  start,homenr,i;
 +    real dvdlambda;
 +    
 +    if (fplog)
 +    {
 +        fprintf(fplog,"Initiating %s\n",title);
 +    }
 +    
 +    state_global->ngtc = 0;
 +    
 +    /* Initiate some variables */
 +    if (ir->efep != efepNO)
 +    {
 +        state_global->lambda = ir->init_lambda;
 +    }
 +    else 
 +    {
 +        state_global->lambda = 0.0;
 +    }
 +    
 +    init_nrnb(nrnb);
 +    
 +    if (DOMAINDECOMP(cr))
 +    {
 +        *top = dd_init_local_top(top_global);
 +        
 +        dd_init_local_state(cr->dd,state_global,&ems->s);
 +
 +        *f = NULL;
 +        
 +        /* Distribute the charge groups over the nodes from the master node */
 +        dd_partition_system(fplog,ir->init_step,cr,TRUE,1,
 +                            state_global,top_global,ir,
 +                            &ems->s,&ems->f,mdatoms,*top,
 +                            fr,vsite,NULL,constr,
 +                            nrnb,NULL,FALSE);
 +        dd_store_state(cr->dd,&ems->s);
 +        
 +        if (ir->nstfout)
 +        {
 +            snew(*f_global,top_global->natoms);
 +        }
 +        else
 +        {
 +            *f_global = NULL;
 +        }
 +        *graph = NULL;
 +    }
 +    else
 +    {
 +        snew(*f,top_global->natoms);
 +
 +        /* Just copy the state */
 +        ems->s = *state_global;
 +        snew(ems->s.x,ems->s.nalloc);
 +        snew(ems->f,ems->s.nalloc);
 +        for(i=0; i<state_global->natoms; i++)
 +        {
 +            copy_rvec(state_global->x[i],ems->s.x[i]);
 +        }
 +        copy_mat(state_global->box,ems->s.box);
 +        
 +        if (PAR(cr) && ir->eI != eiNM)
 +        {
 +            /* Initialize the particle decomposition and split the topology */
 +            *top = split_system(fplog,top_global,ir,cr);
 +            
 +            pd_cg_range(cr,&fr->cg0,&fr->hcg);
 +        }
 +        else
 +        {
 +            *top = gmx_mtop_generate_local_top(top_global,ir);
 +        }
 +        *f_global = *f;
 +        
 +        if (ir->ePBC != epbcNONE && !ir->bPeriodicMols)
 +        {
 +            *graph = mk_graph(fplog,&((*top)->idef),0,top_global->natoms,FALSE,FALSE);
 +        }
 +        else
 +        {
 +            *graph = NULL;
 +        }
 +
 +        if (PARTDECOMP(cr))
 +        {
 +            pd_at_range(cr,&start,&homenr);
 +            homenr -= start;
 +        }
 +        else
 +        {
 +            start  = 0;
 +            homenr = top_global->natoms;
 +        }
 +        atoms2md(top_global,ir,0,NULL,start,homenr,mdatoms);
 +        update_mdatoms(mdatoms,state_global->lambda);
 +    
 +        if (vsite)
 +        {
 +            set_vsite_top(vsite,*top,mdatoms,cr);
 +        }
 +    }
 +    
 +    if (constr)
 +    {
 +        if (ir->eConstrAlg == econtSHAKE &&
 +            gmx_mtop_ftype_count(top_global,F_CONSTR) > 0)
 +        {
 +            gmx_fatal(FARGS,"Can not do energy minimization with %s, use %s\n",
 +                      econstr_names[econtSHAKE],econstr_names[econtLINCS]);
 +        }
 +        
 +        if (!DOMAINDECOMP(cr))
 +        {
 +            set_constraints(constr,*top,ir,mdatoms,cr);
 +        }
 +
 +        if (!ir->bContinuation)
 +        {
 +            /* Constrain the starting coordinates */
 +            dvdlambda=0;
 +            constrain(PAR(cr) ? NULL : fplog,TRUE,TRUE,constr,&(*top)->idef,
 +                      ir,NULL,cr,-1,0,mdatoms,
 +                      ems->s.x,ems->s.x,NULL,ems->s.box,
 +                      ems->s.lambda,&dvdlambda,
 +                      NULL,NULL,nrnb,econqCoord,FALSE,0,0);
 +        }
 +    }
 +    
 +    if (PAR(cr))
 +    {
 +        *gstat = global_stat_init(ir);
 +    }
 +    
 +    *outf = init_mdoutf(nfile,fnm,0,cr,ir,NULL);
 +
 +    snew(*enerd,1);
 +    init_enerdata(top_global->groups.grps[egcENER].nr,ir->n_flambda,*enerd);
 +
 +    if (mdebin != NULL)
 +    {
 +        /* Init bin for energy stuff */
 +        *mdebin = init_mdebin((*outf)->fp_ene,top_global,ir,NULL); 
 +    }
 +
 +    clear_rvec(mu_tot);
 +    calc_shifts(ems->s.box,fr->shift_vec);
 +}
 +
 +static void finish_em(FILE *fplog,t_commrec *cr,gmx_mdoutf_t *outf,
 +                      gmx_runtime_t *runtime,gmx_wallcycle_t wcycle)
 +{
 +  if (!(cr->duty & DUTY_PME)) {
 +    /* Tell the PME only node to finish */
 +    gmx_pme_finish(cr);
 +  }
 +
 +  done_mdoutf(outf);
 +
 +  em_time_end(fplog,cr,runtime,wcycle);
 +}
 +
 +static void swap_em_state(em_state_t *ems1,em_state_t *ems2)
 +{
 +  em_state_t tmp;
 +
 +  tmp   = *ems1;
 +  *ems1 = *ems2;
 +  *ems2 = tmp;
 +}
 +
 +static void copy_em_coords(em_state_t *ems,t_state *state)
 +{
 +    int i;
 +
 +    for(i=0; (i<state->natoms); i++)
 +    {
 +        copy_rvec(ems->s.x[i],state->x[i]);
 +    }
 +}
 +
 +static void write_em_traj(FILE *fplog,t_commrec *cr,
 +                          gmx_mdoutf_t *outf,
 +                          gmx_bool bX,gmx_bool bF,const char *confout,
 +                          gmx_mtop_t *top_global,
 +                          t_inputrec *ir,gmx_large_int_t step,
 +                          em_state_t *state,
 +                          t_state *state_global,rvec *f_global)
 +{
 +    int mdof_flags;
 +
 +    if ((bX || bF || confout != NULL) && !DOMAINDECOMP(cr))
 +    {
 +        copy_em_coords(state,state_global);
 +        f_global = state->f;
 +    }
 +    
 +    mdof_flags = 0;
 +    if (bX) { mdof_flags |= MDOF_X; }
 +    if (bF) { mdof_flags |= MDOF_F; }
 +    write_traj(fplog,cr,outf,mdof_flags,
 +               top_global,step,(double)step,
 +               &state->s,state_global,state->f,f_global,NULL,NULL);
 +    
 +    if (confout != NULL && MASTER(cr))
 +    {
 +        if (ir->ePBC != epbcNONE && !ir->bPeriodicMols && DOMAINDECOMP(cr))
 +        {
 +            /* Make molecules whole only for confout writing */
 +            do_pbc_mtop(fplog,ir->ePBC,state_global->box,top_global,
 +                        state_global->x);
 +        }
 +
 +        write_sto_conf_mtop(confout,
 +                            *top_global->name,top_global,
 +                            state_global->x,NULL,ir->ePBC,state_global->box);
 +    }
 +}
 +
 +static void do_em_step(t_commrec *cr,t_inputrec *ir,t_mdatoms *md,
 +                     em_state_t *ems1,real a,rvec *f,em_state_t *ems2,
 +                     gmx_constr_t constr,gmx_localtop_t *top,
 +                     t_nrnb *nrnb,gmx_wallcycle_t wcycle,
 +                     gmx_large_int_t count)
 +
 +{
 +  t_state *s1,*s2;
 +  int  start,end,gf,i,m;
 +  rvec *x1,*x2;
 +  real dvdlambda;
 +
 +  s1 = &ems1->s;
 +  s2 = &ems2->s;
 +
 +  if (DOMAINDECOMP(cr) && s1->ddp_count != cr->dd->ddp_count)
 +    gmx_incons("state mismatch in do_em_step");
 +
 +  s2->flags = s1->flags;
 +
 +  if (s2->nalloc != s1->nalloc) {
 +    s2->nalloc = s1->nalloc;
 +    srenew(s2->x,s1->nalloc);
 +    srenew(ems2->f,  s1->nalloc);
 +    if (s2->flags & (1<<estCGP))
 +      srenew(s2->cg_p,  s1->nalloc);
 +  }
 +  
 +  s2->natoms = s1->natoms;
 +  s2->lambda = s1->lambda;
 +  copy_mat(s1->box,s2->box);
 +
 +  start = md->start;
 +  end   = md->start + md->homenr;
 +
 +  x1 = s1->x;
 +  x2 = s2->x;
 +  gf = 0;
 +  for(i=start; i<end; i++) {
 +    if (md->cFREEZE)
 +      gf = md->cFREEZE[i];
 +    for(m=0; m<DIM; m++) {
 +      if (ir->opts.nFreeze[gf][m])
 +      x2[i][m] = x1[i][m];
 +      else
 +      x2[i][m] = x1[i][m] + a*f[i][m];
 +    }
 +  }
 +
 +  if (s2->flags & (1<<estCGP)) {
 +    /* Copy the CG p vector */
 +    x1 = s1->cg_p;
 +    x2 = s2->cg_p;
 +    for(i=start; i<end; i++)
 +      copy_rvec(x1[i],x2[i]);
 +  }
 +
 +  if (DOMAINDECOMP(cr)) {
 +    s2->ddp_count = s1->ddp_count;
 +    if (s2->cg_gl_nalloc < s1->cg_gl_nalloc) {
 +      s2->cg_gl_nalloc = s1->cg_gl_nalloc;
 +      srenew(s2->cg_gl,s2->cg_gl_nalloc);
 +    }
 +    s2->ncg_gl = s1->ncg_gl;
 +    for(i=0; i<s2->ncg_gl; i++)
 +      s2->cg_gl[i] = s1->cg_gl[i];
 +    s2->ddp_count_cg_gl = s1->ddp_count_cg_gl;
 +  }
 +
 +  if (constr) {
 +    wallcycle_start(wcycle,ewcCONSTR);
 +    dvdlambda = 0;
 +    constrain(NULL,TRUE,TRUE,constr,&top->idef,       
 +              ir,NULL,cr,count,0,md,
 +              s1->x,s2->x,NULL,s2->box,s2->lambda,
 +              &dvdlambda,NULL,NULL,nrnb,econqCoord,FALSE,0,0);
 +    wallcycle_stop(wcycle,ewcCONSTR);
 +  }
 +}
 +
 +static void em_dd_partition_system(FILE *fplog,int step,t_commrec *cr,
 +                                   gmx_mtop_t *top_global,t_inputrec *ir,
 +                                   em_state_t *ems,gmx_localtop_t *top,
 +                                   t_mdatoms *mdatoms,t_forcerec *fr,
 +                                   gmx_vsite_t *vsite,gmx_constr_t constr,
 +                                   t_nrnb *nrnb,gmx_wallcycle_t wcycle)
 +{
 +    /* Repartition the domain decomposition */
 +    wallcycle_start(wcycle,ewcDOMDEC);
 +    dd_partition_system(fplog,step,cr,FALSE,1,
 +                        NULL,top_global,ir,
 +                        &ems->s,&ems->f,
 +                        mdatoms,top,fr,vsite,NULL,constr,
 +                        nrnb,wcycle,FALSE);
 +    dd_store_state(cr->dd,&ems->s);
 +    wallcycle_stop(wcycle,ewcDOMDEC);
 +}
 +    
 +static void evaluate_energy(FILE *fplog,gmx_bool bVerbose,t_commrec *cr,
 +                            t_state *state_global,gmx_mtop_t *top_global,
 +                            em_state_t *ems,gmx_localtop_t *top,
 +                            t_inputrec *inputrec,
 +                            t_nrnb *nrnb,gmx_wallcycle_t wcycle,
 +                            gmx_global_stat_t gstat,
 +                            gmx_vsite_t *vsite,gmx_constr_t constr,
 +                            t_fcdata *fcd,
 +                            t_graph *graph,t_mdatoms *mdatoms,
 +                            t_forcerec *fr,rvec mu_tot,
 +                            gmx_enerdata_t *enerd,tensor vir,tensor pres,
 +                            gmx_large_int_t count,gmx_bool bFirst)
 +{
 +  real t;
 +  gmx_bool bNS;
 +  int  nabnsb;
 +  tensor force_vir,shake_vir,ekin;
 +  real dvdl,prescorr,enercorr,dvdlcorr;
 +  real terminate=0;
 +  
 +  /* Set the time to the initial time, the time does not change during EM */
 +  t = inputrec->init_t;
 +
 +  if (bFirst ||
 +      (DOMAINDECOMP(cr) && ems->s.ddp_count < cr->dd->ddp_count)) {
 +    /* This the first state or an old state used before the last ns */
 +    bNS = TRUE;
 +  } else {
 +    bNS = FALSE;
 +    if (inputrec->nstlist > 0) {
 +      bNS = TRUE;
 +    } else if (inputrec->nstlist == -1) {
 +      nabnsb = natoms_beyond_ns_buffer(inputrec,fr,&top->cgs,NULL,ems->s.x);
 +      if (PAR(cr))
 +      gmx_sumi(1,&nabnsb,cr);
 +      bNS = (nabnsb > 0);
 +    }
 +  }
 +
 +  if (vsite)
 +    construct_vsites(fplog,vsite,ems->s.x,nrnb,1,NULL,
 +                   top->idef.iparams,top->idef.il,
 +                   fr->ePBC,fr->bMolPBC,graph,cr,ems->s.box);
 +
 +  if (DOMAINDECOMP(cr)) {
 +    if (bNS) {
 +      /* Repartition the domain decomposition */
 +      em_dd_partition_system(fplog,count,cr,top_global,inputrec,
 +                           ems,top,mdatoms,fr,vsite,constr,
 +                           nrnb,wcycle);
 +    }
 +  }
 +      
 +    /* Calc force & energy on new trial position  */
 +    /* do_force always puts the charge groups in the box and shifts again
 +     * We do not unshift, so molecules are always whole in congrad.c
 +     */
 +    do_force(fplog,cr,inputrec,
 +             count,nrnb,wcycle,top,top_global,&top_global->groups,
 +             ems->s.box,ems->s.x,&ems->s.hist,
 +             ems->f,force_vir,mdatoms,enerd,fcd,
 +             ems->s.lambda,graph,fr,vsite,mu_tot,t,NULL,NULL,TRUE,
 +             GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES | GMX_FORCE_VIRIAL |
 +             (bNS ? GMX_FORCE_NS | GMX_FORCE_DOLR : 0));
 +      
 +  /* Clear the unused shake virial and pressure */
 +  clear_mat(shake_vir);
 +  clear_mat(pres);
 +
-     calc_pres(fr->ePBC,inputrec->nwall,ems->s.box,ekin,vir,pres,
-             (fr->eeltype==eelPPPM)?enerd->term[F_COUL_RECIP]:0.0);
 +    /* Communicate stuff when parallel */
 +    if (PAR(cr) && inputrec->eI != eiNM)
 +    {
 +        wallcycle_start(wcycle,ewcMoveE);
 +
 +        global_stat(fplog,gstat,cr,enerd,force_vir,shake_vir,mu_tot,
 +                    inputrec,NULL,NULL,NULL,1,&terminate,
 +                    top_global,&ems->s,FALSE,
 +                    CGLO_ENERGY | 
 +                    CGLO_PRESSURE | 
 +                    CGLO_CONSTRAINT | 
 +                    CGLO_FIRSTITERATE);
 +
 +        wallcycle_stop(wcycle,ewcMoveE);
 +    }
 +
++    /* Calculate long range corrections to pressure and energy */
++    calc_dispcorr(fplog,inputrec,fr,count,top_global->natoms,ems->s.box,ems->s.lambda,
++                  pres,force_vir,&prescorr,&enercorr,&dvdlcorr);
++    enerd->term[F_DISPCORR] = enercorr;
++    enerd->term[F_EPOT] += enercorr;
++    enerd->term[F_PRES] += prescorr;
++    enerd->term[F_DVDL] += dvdlcorr;
++
 +  ems->epot = enerd->term[F_EPOT];
 +  
 +  if (constr) {
 +    /* Project out the constraint components of the force */
 +    wallcycle_start(wcycle,ewcCONSTR);
 +    dvdl = 0;
 +    constrain(NULL,FALSE,FALSE,constr,&top->idef,
 +              inputrec,NULL,cr,count,0,mdatoms,
 +              ems->s.x,ems->f,ems->f,ems->s.box,ems->s.lambda,&dvdl,
 +              NULL,&shake_vir,nrnb,econqForceDispl,FALSE,0,0);
 +    if (fr->bSepDVDL && fplog)
 +      fprintf(fplog,sepdvdlformat,"Constraints",t,dvdl);
 +    enerd->term[F_DHDL_CON] += dvdl;
 +    m_add(force_vir,shake_vir,vir);
 +    wallcycle_stop(wcycle,ewcCONSTR);
 +  } else {
 +    copy_mat(force_vir,vir);
 +  }
 +
 +  clear_mat(ekin);
 +  enerd->term[F_PRES] =
++    calc_pres(fr->ePBC,inputrec->nwall,ems->s.box,ekin,vir,pres);
 +
 +  sum_dhdl(enerd,ems->s.lambda,inputrec);
 +
 +    if (EI_ENERGY_MINIMIZATION(inputrec->eI))
 +    {
 +        get_state_f_norm_max(cr,&(inputrec->opts),mdatoms,ems);
 +    }
 +}
 +
 +static double reorder_partsum(t_commrec *cr,t_grpopts *opts,t_mdatoms *mdatoms,
 +                            gmx_mtop_t *mtop,
 +                            em_state_t *s_min,em_state_t *s_b)
 +{
 +  rvec *fm,*fb,*fmg;
 +  t_block *cgs_gl;
 +  int ncg,*cg_gl,*index,c,cg,i,a0,a1,a,gf,m;
 +  double partsum;
 +  unsigned char *grpnrFREEZE;
 +
 +  if (debug)
 +    fprintf(debug,"Doing reorder_partsum\n");
 +
 +  fm = s_min->f;
 +  fb = s_b->f;
 +
 +  cgs_gl = dd_charge_groups_global(cr->dd);
 +  index = cgs_gl->index;
 +
 +  /* Collect fm in a global vector fmg.
 +   * This conflicts with the spirit of domain decomposition,
 +   * but to fully optimize this a much more complicated algorithm is required.
 +   */
 +  snew(fmg,mtop->natoms);
 +  
 +  ncg   = s_min->s.ncg_gl;
 +  cg_gl = s_min->s.cg_gl;
 +  i = 0;
 +  for(c=0; c<ncg; c++) {
 +    cg = cg_gl[c];
 +    a0 = index[cg];
 +    a1 = index[cg+1];
 +    for(a=a0; a<a1; a++) {
 +      copy_rvec(fm[i],fmg[a]);
 +      i++;
 +    }
 +  }
 +  gmx_sum(mtop->natoms*3,fmg[0],cr);
 +
 +  /* Now we will determine the part of the sum for the cgs in state s_b */
 +  ncg   = s_b->s.ncg_gl;
 +  cg_gl = s_b->s.cg_gl;
 +  partsum = 0;
 +  i = 0;
 +  gf = 0;
 +  grpnrFREEZE = mtop->groups.grpnr[egcFREEZE];
 +  for(c=0; c<ncg; c++) {
 +    cg = cg_gl[c];
 +    a0 = index[cg];
 +    a1 = index[cg+1];
 +    for(a=a0; a<a1; a++) {
 +      if (mdatoms->cFREEZE && grpnrFREEZE) {
 +      gf = grpnrFREEZE[i];
 +      }
 +      for(m=0; m<DIM; m++) {
 +      if (!opts->nFreeze[gf][m]) {
 +        partsum += (fb[i][m] - fmg[a][m])*fb[i][m];
 +      }
 +      }
 +      i++;
 +    }
 +  }
 +  
 +  sfree(fmg);
 +
 +  return partsum;
 +}
 +
 +static real pr_beta(t_commrec *cr,t_grpopts *opts,t_mdatoms *mdatoms,
 +                  gmx_mtop_t *mtop,
 +                  em_state_t *s_min,em_state_t *s_b)
 +{
 +  rvec *fm,*fb;
 +  double sum;
 +  int  gf,i,m;
 +
 +  /* This is just the classical Polak-Ribiere calculation of beta;
 +   * it looks a bit complicated since we take freeze groups into account,
 +   * and might have to sum it in parallel runs.
 +   */
 +  
 +  if (!DOMAINDECOMP(cr) ||
 +      (s_min->s.ddp_count == cr->dd->ddp_count &&
 +       s_b->s.ddp_count   == cr->dd->ddp_count)) {
 +    fm = s_min->f;
 +    fb = s_b->f;
 +    sum = 0;
 +    gf = 0;
 +    /* This part of code can be incorrect with DD,
 +     * since the atom ordering in s_b and s_min might differ.
 +     */
 +    for(i=mdatoms->start; i<mdatoms->start+mdatoms->homenr; i++) {
 +      if (mdatoms->cFREEZE)
 +      gf = mdatoms->cFREEZE[i];
 +      for(m=0; m<DIM; m++)
 +      if (!opts->nFreeze[gf][m]) {
 +        sum += (fb[i][m] - fm[i][m])*fb[i][m];
 +      } 
 +    }
 +  } else {
 +    /* We need to reorder cgs while summing */
 +    sum = reorder_partsum(cr,opts,mdatoms,mtop,s_min,s_b);
 +  }
 +  if (PAR(cr))
 +    gmx_sumd(1,&sum,cr);
 +
 +  return sum/sqr(s_min->fnorm);
 +}
 +
 +double do_cg(FILE *fplog,t_commrec *cr,
 +             int nfile,const t_filenm fnm[],
 +             const output_env_t oenv, gmx_bool bVerbose,gmx_bool bCompact,
 +             int nstglobalcomm,
 +             gmx_vsite_t *vsite,gmx_constr_t constr,
 +             int stepout,
 +             t_inputrec *inputrec,
 +             gmx_mtop_t *top_global,t_fcdata *fcd,
 +             t_state *state_global,
 +             t_mdatoms *mdatoms,
 +             t_nrnb *nrnb,gmx_wallcycle_t wcycle,
 +             gmx_edsam_t ed,
 +             t_forcerec *fr,
 +             int repl_ex_nst,int repl_ex_seed,
 +             gmx_membed_t *membed,
 +             real cpt_period,real max_hours,
 +             const char *deviceOptions,
 +             unsigned long Flags,
 +             gmx_runtime_t *runtime)
 +{
 +  const char *CG="Polak-Ribiere Conjugate Gradients";
 +
 +  em_state_t *s_min,*s_a,*s_b,*s_c;
 +  gmx_localtop_t *top;
 +  gmx_enerdata_t *enerd;
 +  rvec   *f;
 +  gmx_global_stat_t gstat;
 +  t_graph    *graph;
 +  rvec   *f_global,*p,*sf,*sfm;
 +  double gpa,gpb,gpc,tmp,sum[2],minstep;
 +  real   fnormn;
 +  real   stepsize;    
 +  real   a,b,c,beta=0.0;
 +  real   epot_repl=0;
 +  real   pnorm;
 +  t_mdebin   *mdebin;
 +  gmx_bool   converged,foundlower;
 +  rvec   mu_tot;
 +  gmx_bool   do_log=FALSE,do_ene=FALSE,do_x,do_f;
 +  tensor vir,pres;
 +  int    number_steps,neval=0,nstcg=inputrec->nstcgsteep;
 +  gmx_mdoutf_t *outf;
 +  int    i,m,gf,step,nminstep;
 +  real   terminate=0;  
 +
 +  step=0;
 +
 +  s_min = init_em_state();
 +  s_a   = init_em_state();
 +  s_b   = init_em_state();
 +  s_c   = init_em_state();
 +
 +  /* Init em and store the local state in s_min */
 +  init_em(fplog,CG,cr,inputrec,
 +          state_global,top_global,s_min,&top,&f,&f_global,
 +          nrnb,mu_tot,fr,&enerd,&graph,mdatoms,&gstat,vsite,constr,
 +          nfile,fnm,&outf,&mdebin);
 +  
 +  /* Print to log file */
 +  print_em_start(fplog,cr,runtime,wcycle,CG);
 +  
 +  /* Max number of steps */
 +  number_steps=inputrec->nsteps;
 +
 +  if (MASTER(cr))
 +    sp_header(stderr,CG,inputrec->em_tol,number_steps);
 +  if (fplog)
 +    sp_header(fplog,CG,inputrec->em_tol,number_steps);
 +
 +  /* Call the force routine and some auxiliary (neighboursearching etc.) */
 +  /* do_force always puts the charge groups in the box and shifts again
 +   * We do not unshift, so molecules are always whole in congrad.c
 +   */
 +  evaluate_energy(fplog,bVerbose,cr,
 +                state_global,top_global,s_min,top,
 +                inputrec,nrnb,wcycle,gstat,
 +                vsite,constr,fcd,graph,mdatoms,fr,
 +                mu_tot,enerd,vir,pres,-1,TRUE);
 +  where();
 +
 +  if (MASTER(cr)) {
 +    /* Copy stuff to the energy bin for easy printing etc. */
 +    upd_mdebin(mdebin,FALSE,FALSE,(double)step,
 +             mdatoms->tmass,enerd,&s_min->s,s_min->s.box,
 +             NULL,NULL,vir,pres,NULL,mu_tot,constr);
 +    
 +    print_ebin_header(fplog,step,step,s_min->s.lambda);
 +    print_ebin(outf->fp_ene,TRUE,FALSE,FALSE,fplog,step,step,eprNORMAL,
 +               TRUE,mdebin,fcd,&(top_global->groups),&(inputrec->opts));
 +  }
 +  where();
 +
 +  /* Estimate/guess the initial stepsize */
 +  stepsize = inputrec->em_stepsize/s_min->fnorm;
 + 
 +  if (MASTER(cr)) {
 +    fprintf(stderr,"   F-max             = %12.5e on atom %d\n",
 +          s_min->fmax,s_min->a_fmax+1);
 +    fprintf(stderr,"   F-Norm            = %12.5e\n",
 +          s_min->fnorm/sqrt(state_global->natoms));
 +    fprintf(stderr,"\n");
 +    /* and copy to the log file too... */
 +    fprintf(fplog,"   F-max             = %12.5e on atom %d\n",
 +          s_min->fmax,s_min->a_fmax+1);
 +    fprintf(fplog,"   F-Norm            = %12.5e\n",
 +          s_min->fnorm/sqrt(state_global->natoms));
 +    fprintf(fplog,"\n");
 +  }  
 +  /* Start the loop over CG steps.            
 +   * Each successful step is counted, and we continue until
 +   * we either converge or reach the max number of steps.
 +   */
 +  converged = FALSE;
 +  for(step=0; (number_steps<0 || (number_steps>=0 && step<=number_steps)) && !converged;step++) {
 +    
 +    /* start taking steps in a new direction 
 +     * First time we enter the routine, beta=0, and the direction is 
 +     * simply the negative gradient.
 +     */
 +
 +    /* Calculate the new direction in p, and the gradient in this direction, gpa */
 +    p  = s_min->s.cg_p;
 +    sf = s_min->f;
 +    gpa = 0;
 +    gf = 0;
 +    for(i=mdatoms->start; i<mdatoms->start+mdatoms->homenr; i++) {
 +      if (mdatoms->cFREEZE) 
 +      gf = mdatoms->cFREEZE[i];
 +      for(m=0; m<DIM; m++) {
 +      if (!inputrec->opts.nFreeze[gf][m]) {
 +        p[i][m] = sf[i][m] + beta*p[i][m];
 +        gpa -= p[i][m]*sf[i][m];
 +        /* f is negative gradient, thus the sign */
 +      } else {
 +          p[i][m] = 0;
 +      }
 +      }
 +    }
 +    
 +    /* Sum the gradient along the line across CPUs */
 +    if (PAR(cr))
 +      gmx_sumd(1,&gpa,cr);
 +
 +    /* Calculate the norm of the search vector */
 +    get_f_norm_max(cr,&(inputrec->opts),mdatoms,p,&pnorm,NULL,NULL);
 +    
 +    /* Just in case stepsize reaches zero due to numerical precision... */
 +    if(stepsize<=0)     
 +      stepsize = inputrec->em_stepsize/pnorm;
 +    
 +    /* 
 +     * Double check the value of the derivative in the search direction.
 +     * If it is positive it must be due to the old information in the
 +     * CG formula, so just remove that and start over with beta=0.
 +     * This corresponds to a steepest descent step.
 +     */
 +    if(gpa>0) {
 +      beta = 0;
 +      step--; /* Don't count this step since we are restarting */
 +      continue; /* Go back to the beginning of the big for-loop */
 +    }
 +
 +    /* Calculate minimum allowed stepsize, before the average (norm)
 +     * relative change in coordinate is smaller than precision
 +     */
 +    minstep=0;
 +    for (i=mdatoms->start; i<mdatoms->start+mdatoms->homenr; i++) {
 +      for(m=0; m<DIM; m++) {
 +      tmp = fabs(s_min->s.x[i][m]);
 +      if(tmp < 1.0)
 +        tmp = 1.0;
 +      tmp = p[i][m]/tmp;
 +      minstep += tmp*tmp;
 +      }
 +    }
 +    /* Add up from all CPUs */
 +    if(PAR(cr))
 +      gmx_sumd(1,&minstep,cr);
 +
 +    minstep = GMX_REAL_EPS/sqrt(minstep/(3*state_global->natoms));
 +
 +    if(stepsize<minstep) {
 +      converged=TRUE;
 +      break;
 +    }
 +    
 +    /* Write coordinates if necessary */
 +    do_x = do_per_step(step,inputrec->nstxout);
 +    do_f = do_per_step(step,inputrec->nstfout);
 +    
 +    write_em_traj(fplog,cr,outf,do_x,do_f,NULL,
 +                  top_global,inputrec,step,
 +                  s_min,state_global,f_global);
 +    
 +    /* Take a step downhill.
 +     * In theory, we should minimize the function along this direction.
 +     * That is quite possible, but it turns out to take 5-10 function evaluations
 +     * for each line. However, we dont really need to find the exact minimum -
 +     * it is much better to start a new CG step in a modified direction as soon
 +     * as we are close to it. This will save a lot of energy evaluations.
 +     *
 +     * In practice, we just try to take a single step.
 +     * If it worked (i.e. lowered the energy), we increase the stepsize but
 +     * the continue straight to the next CG step without trying to find any minimum.
 +     * If it didn't work (higher energy), there must be a minimum somewhere between
 +     * the old position and the new one.
 +     * 
 +     * Due to the finite numerical accuracy, it turns out that it is a good idea
 +     * to even accept a SMALL increase in energy, if the derivative is still downhill.
 +     * This leads to lower final energies in the tests I've done. / Erik 
 +     */
 +    s_a->epot = s_min->epot;
 +    a = 0.0;
 +    c = a + stepsize; /* reference position along line is zero */
 +    
 +    if (DOMAINDECOMP(cr) && s_min->s.ddp_count < cr->dd->ddp_count) {
 +      em_dd_partition_system(fplog,step,cr,top_global,inputrec,
 +                           s_min,top,mdatoms,fr,vsite,constr,
 +                           nrnb,wcycle);
 +    }
 +
 +    /* Take a trial step (new coords in s_c) */
 +    do_em_step(cr,inputrec,mdatoms,s_min,c,s_min->s.cg_p,s_c,
 +             constr,top,nrnb,wcycle,-1);
 +    
 +    neval++;
 +    /* Calculate energy for the trial step */
 +    evaluate_energy(fplog,bVerbose,cr,
 +                  state_global,top_global,s_c,top,
 +                  inputrec,nrnb,wcycle,gstat,
 +                  vsite,constr,fcd,graph,mdatoms,fr,
 +                  mu_tot,enerd,vir,pres,-1,FALSE);
 +    
 +    /* Calc derivative along line */
 +    p  = s_c->s.cg_p;
 +    sf = s_c->f;
 +    gpc=0;
 +    for(i=mdatoms->start; i<mdatoms->start+mdatoms->homenr; i++) {
 +      for(m=0; m<DIM; m++) 
 +        gpc -= p[i][m]*sf[i][m];  /* f is negative gradient, thus the sign */
 +    }
 +    /* Sum the gradient along the line across CPUs */
 +    if (PAR(cr))
 +      gmx_sumd(1,&gpc,cr);
 +
 +    /* This is the max amount of increase in energy we tolerate */
 +    tmp=sqrt(GMX_REAL_EPS)*fabs(s_a->epot);
 +
 +    /* Accept the step if the energy is lower, or if it is not significantly higher
 +     * and the line derivative is still negative.
 +     */
 +    if (s_c->epot < s_a->epot || (gpc < 0 && s_c->epot < (s_a->epot + tmp))) {
 +      foundlower = TRUE;
 +      /* Great, we found a better energy. Increase step for next iteration
 +       * if we are still going down, decrease it otherwise
 +       */
 +      if(gpc<0)
 +      stepsize *= 1.618034;  /* The golden section */
 +      else
 +      stepsize *= 0.618034;  /* 1/golden section */
 +    } else {
 +      /* New energy is the same or higher. We will have to do some work
 +       * to find a smaller value in the interval. Take smaller step next time!
 +       */
 +      foundlower = FALSE;
 +      stepsize *= 0.618034;
 +    }    
 +
 +
 +
 +    
 +    /* OK, if we didn't find a lower value we will have to locate one now - there must
 +     * be one in the interval [a=0,c].
 +     * The same thing is valid here, though: Don't spend dozens of iterations to find
 +     * the line minimum. We try to interpolate based on the derivative at the endpoints,
 +     * and only continue until we find a lower value. In most cases this means 1-2 iterations.
 +     *
 +     * I also have a safeguard for potentially really patological functions so we never
 +     * take more than 20 steps before we give up ...
 +     *
 +     * If we already found a lower value we just skip this step and continue to the update.
 +     */
 +    if (!foundlower) {
 +      nminstep=0;
 +
 +      do {
 +      /* Select a new trial point.
 +       * If the derivatives at points a & c have different sign we interpolate to zero,
 +       * otherwise just do a bisection.
 +       */
 +      if(gpa<0 && gpc>0)
 +        b = a + gpa*(a-c)/(gpc-gpa);
 +      else
 +        b = 0.5*(a+c);                
 +      
 +      /* safeguard if interpolation close to machine accuracy causes errors:
 +       * never go outside the interval
 +       */
 +      if(b<=a || b>=c)
 +        b = 0.5*(a+c);
 +      
 +      if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count) {
 +        /* Reload the old state */
 +        em_dd_partition_system(fplog,-1,cr,top_global,inputrec,
 +                               s_min,top,mdatoms,fr,vsite,constr,
 +                               nrnb,wcycle);
 +      }
 +
 +      /* Take a trial step to this new point - new coords in s_b */
 +      do_em_step(cr,inputrec,mdatoms,s_min,b,s_min->s.cg_p,s_b,
 +                 constr,top,nrnb,wcycle,-1);
 +      
 +      neval++;
 +      /* Calculate energy for the trial step */
 +      evaluate_energy(fplog,bVerbose,cr,
 +                      state_global,top_global,s_b,top,
 +                      inputrec,nrnb,wcycle,gstat,
 +                      vsite,constr,fcd,graph,mdatoms,fr,
 +                      mu_tot,enerd,vir,pres,-1,FALSE);
 +      
 +      /* p does not change within a step, but since the domain decomposition
 +       * might change, we have to use cg_p of s_b here.
 +       */
 +      p  = s_b->s.cg_p;
 +      sf = s_b->f;
 +      gpb=0;
 +      for(i=mdatoms->start; i<mdatoms->start+mdatoms->homenr; i++) {
 +        for(m=0; m<DIM; m++)
 +            gpb -= p[i][m]*sf[i][m];   /* f is negative gradient, thus the sign */
 +      }
 +      /* Sum the gradient along the line across CPUs */
 +      if (PAR(cr))
 +        gmx_sumd(1,&gpb,cr);
 +      
 +      if (debug)
 +        fprintf(debug,"CGE: EpotA %f EpotB %f EpotC %f gpb %f\n",
 +                s_a->epot,s_b->epot,s_c->epot,gpb);
 +
 +      epot_repl = s_b->epot;
 +      
 +      /* Keep one of the intervals based on the value of the derivative at the new point */
 +      if (gpb > 0) {
 +        /* Replace c endpoint with b */
 +        swap_em_state(s_b,s_c);
 +        c = b;
 +        gpc = gpb;
 +      } else {
 +        /* Replace a endpoint with b */
 +        swap_em_state(s_b,s_a);
 +        a = b;
 +        gpa = gpb;
 +      }
 +      
 +      /* 
 +       * Stop search as soon as we find a value smaller than the endpoints.
 +       * Never run more than 20 steps, no matter what.
 +       */
 +      nminstep++;
 +      } while ((epot_repl > s_a->epot || epot_repl > s_c->epot) &&
 +             (nminstep < 20));     
 +      
 +      if (fabs(epot_repl - s_min->epot) < fabs(s_min->epot)*GMX_REAL_EPS ||
 +        nminstep >= 20) {
 +      /* OK. We couldn't find a significantly lower energy.
 +       * If beta==0 this was steepest descent, and then we give up.
 +       * If not, set beta=0 and restart with steepest descent before quitting.
 +         */
 +      if (beta == 0.0) {
 +        /* Converged */
 +        converged = TRUE;
 +        break;
 +      } else {
 +        /* Reset memory before giving up */
 +        beta = 0.0;
 +        continue;
 +      }
 +      }
 +      
 +      /* Select min energy state of A & C, put the best in B.
 +       */
 +      if (s_c->epot < s_a->epot) {
 +      if (debug)
 +        fprintf(debug,"CGE: C (%f) is lower than A (%f), moving C to B\n",
 +                s_c->epot,s_a->epot);
 +      swap_em_state(s_b,s_c);
 +      gpb = gpc;
 +      b = c;
 +      } else {
 +      if (debug)
 +        fprintf(debug,"CGE: A (%f) is lower than C (%f), moving A to B\n",
 +                s_a->epot,s_c->epot);
 +      swap_em_state(s_b,s_a);
 +      gpb = gpa;
 +      b = a;
 +      }
 +      
 +    } else {
 +      if (debug)
 +      fprintf(debug,"CGE: Found a lower energy %f, moving C to B\n",
 +              s_c->epot);
 +      swap_em_state(s_b,s_c);
 +      gpb = gpc;
 +      b = c;
 +    }
 +    
 +    /* new search direction */
 +    /* beta = 0 means forget all memory and restart with steepest descents. */
 +    if (nstcg && ((step % nstcg)==0)) 
 +      beta = 0.0;
 +    else {
 +      /* s_min->fnorm cannot be zero, because then we would have converged
 +       * and broken out.
 +       */
 +
 +      /* Polak-Ribiere update.
 +       * Change to fnorm2/fnorm2_old for Fletcher-Reeves
 +       */
 +      beta = pr_beta(cr,&inputrec->opts,mdatoms,top_global,s_min,s_b);
 +    }
 +    /* Limit beta to prevent oscillations */
 +    if (fabs(beta) > 5.0)
 +      beta = 0.0;
 +    
 +    
 +    /* update positions */
 +    swap_em_state(s_min,s_b);
 +    gpa = gpb;
 +    
 +    /* Print it if necessary */
 +    if (MASTER(cr)) {
 +      if(bVerbose)
 +      fprintf(stderr,"\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n",
 +              step,s_min->epot,s_min->fnorm/sqrt(state_global->natoms),
 +              s_min->fmax,s_min->a_fmax+1);
 +      /* Store the new (lower) energies */
 +      upd_mdebin(mdebin,FALSE,FALSE,(double)step,
 +               mdatoms->tmass,enerd,&s_min->s,s_min->s.box,
 +               NULL,NULL,vir,pres,NULL,mu_tot,constr);
 +      do_log = do_per_step(step,inputrec->nstlog);
 +      do_ene = do_per_step(step,inputrec->nstenergy);
 +      if(do_log)
 +      print_ebin_header(fplog,step,step,s_min->s.lambda);
 +      print_ebin(outf->fp_ene,do_ene,FALSE,FALSE,
 +               do_log ? fplog : NULL,step,step,eprNORMAL,
 +               TRUE,mdebin,fcd,&(top_global->groups),&(inputrec->opts));
 +    }
 +    
 +    /* Stop when the maximum force lies below tolerance.
 +     * If we have reached machine precision, converged is already set to true.
 +     */       
 +    converged = converged || (s_min->fmax < inputrec->em_tol);
 +    
 +  } /* End of the loop */
 +  
 +  if (converged)      
 +    step--; /* we never took that last step in this case */
 +  
 +    if (s_min->fmax > inputrec->em_tol)
 +    {
 +        if (MASTER(cr))
 +        {
 +            warn_step(stderr,inputrec->em_tol,step-1==number_steps,FALSE);
 +            warn_step(fplog ,inputrec->em_tol,step-1==number_steps,FALSE);
 +        }
 +        converged = FALSE; 
 +    }
 +  
 +  if (MASTER(cr)) {
 +    /* If we printed energy and/or logfile last step (which was the last step)
 +     * we don't have to do it again, but otherwise print the final values.
 +     */
 +    if(!do_log) {
 +      /* Write final value to log since we didn't do anything the last step */
 +      print_ebin_header(fplog,step,step,s_min->s.lambda);
 +    }
 +    if (!do_ene || !do_log) {
 +      /* Write final energy file entries */
 +      print_ebin(outf->fp_ene,!do_ene,FALSE,FALSE,
 +               !do_log ? fplog : NULL,step,step,eprNORMAL,
 +               TRUE,mdebin,fcd,&(top_global->groups),&(inputrec->opts));
 +    }
 +  }
 +
 +  /* Print some stuff... */
 +  if (MASTER(cr))
 +    fprintf(stderr,"\nwriting lowest energy coordinates.\n");
 +  
 +  /* IMPORTANT!
 +   * For accurate normal mode calculation it is imperative that we
 +   * store the last conformation into the full precision binary trajectory.
 +   *
 +   * However, we should only do it if we did NOT already write this step
 +   * above (which we did if do_x or do_f was true).
 +   */  
 +  do_x = !do_per_step(step,inputrec->nstxout);
 +  do_f = (inputrec->nstfout > 0 && !do_per_step(step,inputrec->nstfout));
 +  
 +  write_em_traj(fplog,cr,outf,do_x,do_f,ftp2fn(efSTO,nfile,fnm),
 +                top_global,inputrec,step,
 +                s_min,state_global,f_global);
 +  
 +  fnormn = s_min->fnorm/sqrt(state_global->natoms);
 +  
 +  if (MASTER(cr)) {
 +    print_converged(stderr,CG,inputrec->em_tol,step,converged,number_steps,
 +                  s_min->epot,s_min->fmax,s_min->a_fmax,fnormn);
 +    print_converged(fplog,CG,inputrec->em_tol,step,converged,number_steps,
 +                  s_min->epot,s_min->fmax,s_min->a_fmax,fnormn);
 +    
 +    fprintf(fplog,"\nPerformed %d energy evaluations in total.\n",neval);
 +  }
 +  
 +  finish_em(fplog,cr,outf,runtime,wcycle);
 +  
 +  /* To print the actual number of steps we needed somewhere */
 +  runtime->nsteps_done = step;
 +
 +  return 0;
 +} /* That's all folks */
 +
 +
 +double do_lbfgs(FILE *fplog,t_commrec *cr,
 +                int nfile,const t_filenm fnm[],
 +                const output_env_t oenv, gmx_bool bVerbose,gmx_bool bCompact,
 +                int nstglobalcomm,
 +                gmx_vsite_t *vsite,gmx_constr_t constr,
 +                int stepout,
 +                t_inputrec *inputrec,
 +                gmx_mtop_t *top_global,t_fcdata *fcd,
 +                t_state *state,
 +                t_mdatoms *mdatoms,
 +                t_nrnb *nrnb,gmx_wallcycle_t wcycle,
 +                gmx_edsam_t ed,
 +                t_forcerec *fr,
 +                int repl_ex_nst,int repl_ex_seed,
 +                gmx_membed_t *membed,
 +                real cpt_period,real max_hours,
 +                const char *deviceOptions,
 +                unsigned long Flags,
 +                gmx_runtime_t *runtime)
 +{
 +  static const char *LBFGS="Low-Memory BFGS Minimizer";
 +  em_state_t ems;
 +  gmx_localtop_t *top;
 +  gmx_enerdata_t *enerd;
 +  rvec   *f;
 +  gmx_global_stat_t gstat;
 +  t_graph    *graph;
 +  rvec   *f_global;
 +  int    ncorr,nmaxcorr,point,cp,neval,nminstep;
 +  double stepsize,gpa,gpb,gpc,tmp,minstep;
 +  real   *rho,*alpha,*ff,*xx,*p,*s,*lastx,*lastf,**dx,**dg;   
 +  real   *xa,*xb,*xc,*fa,*fb,*fc,*xtmp,*ftmp;
 +  real   a,b,c,maxdelta,delta;
 +  real   diag,Epot0,Epot,EpotA,EpotB,EpotC;
 +  real   dgdx,dgdg,sq,yr,beta;
 +  t_mdebin   *mdebin;
 +  gmx_bool   converged,first;
 +  rvec   mu_tot;
 +  real   fnorm,fmax;
 +  gmx_bool   do_log,do_ene,do_x,do_f,foundlower,*frozen;
 +  tensor vir,pres;
 +  int    start,end,number_steps;
 +  gmx_mdoutf_t *outf;
 +  int    i,k,m,n,nfmax,gf,step;
 +  /* not used */
 +  real   terminate;
 +
 +  if (PAR(cr))
 +    gmx_fatal(FARGS,"Cannot do parallel L-BFGS Minimization - yet.\n");
 +  
 +  n = 3*state->natoms;
 +  nmaxcorr = inputrec->nbfgscorr;
 +  
 +  /* Allocate memory */
 +  /* Use pointers to real so we dont have to loop over both atoms and
 +   * dimensions all the time...
 +   * x/f are allocated as rvec *, so make new x0/f0 pointers-to-real
 +   * that point to the same memory.
 +   */
 +  snew(xa,n);
 +  snew(xb,n);
 +  snew(xc,n);
 +  snew(fa,n);
 +  snew(fb,n);
 +  snew(fc,n);
 +  snew(frozen,n);
 +
 +  snew(p,n); 
 +  snew(lastx,n); 
 +  snew(lastf,n); 
 +  snew(rho,nmaxcorr);
 +  snew(alpha,nmaxcorr);
 +  
 +  snew(dx,nmaxcorr);
 +  for(i=0;i<nmaxcorr;i++)
 +    snew(dx[i],n);
 +  
 +  snew(dg,nmaxcorr);
 +  for(i=0;i<nmaxcorr;i++)
 +    snew(dg[i],n);
 +
 +  step = 0;
 +  neval = 0; 
 +
 +  /* Init em */
 +  init_em(fplog,LBFGS,cr,inputrec,
 +          state,top_global,&ems,&top,&f,&f_global,
 +          nrnb,mu_tot,fr,&enerd,&graph,mdatoms,&gstat,vsite,constr,
 +          nfile,fnm,&outf,&mdebin);
 +  /* Do_lbfgs is not completely updated like do_steep and do_cg,
 +   * so we free some memory again.
 +   */
 +  sfree(ems.s.x);
 +  sfree(ems.f);
 +
 +  xx = (real *)state->x;
 +  ff = (real *)f;
 +
 +  start = mdatoms->start;
 +  end   = mdatoms->homenr + start;
 +    
 +  /* Print to log file */
 +  print_em_start(fplog,cr,runtime,wcycle,LBFGS);
 +  
 +  do_log = do_ene = do_x = do_f = TRUE;
 +  
 +  /* Max number of steps */
 +  number_steps=inputrec->nsteps;
 +
 +  /* Create a 3*natoms index to tell whether each degree of freedom is frozen */
 +  gf = 0;
 +  for(i=start; i<end; i++) {
 +    if (mdatoms->cFREEZE)
 +      gf = mdatoms->cFREEZE[i];
 +     for(m=0; m<DIM; m++) 
 +       frozen[3*i+m]=inputrec->opts.nFreeze[gf][m];  
 +  }
 +  if (MASTER(cr))
 +    sp_header(stderr,LBFGS,inputrec->em_tol,number_steps);
 +  if (fplog)
 +    sp_header(fplog,LBFGS,inputrec->em_tol,number_steps);
 +  
 +  if (vsite)
 +    construct_vsites(fplog,vsite,state->x,nrnb,1,NULL,
 +                   top->idef.iparams,top->idef.il,
 +                   fr->ePBC,fr->bMolPBC,graph,cr,state->box);
 +  
 +  /* Call the force routine and some auxiliary (neighboursearching etc.) */
 +  /* do_force always puts the charge groups in the box and shifts again
 +   * We do not unshift, so molecules are always whole
 +   */
 +  neval++;
 +  ems.s.x = state->x;
 +  ems.f = f;
 +  evaluate_energy(fplog,bVerbose,cr,
 +                state,top_global,&ems,top,
 +                inputrec,nrnb,wcycle,gstat,
 +                vsite,constr,fcd,graph,mdatoms,fr,
 +                mu_tot,enerd,vir,pres,-1,TRUE);
 +  where();
 +      
 +  if (MASTER(cr)) {
 +    /* Copy stuff to the energy bin for easy printing etc. */
 +    upd_mdebin(mdebin,FALSE,FALSE,(double)step,
 +             mdatoms->tmass,enerd,state,state->box,
 +             NULL,NULL,vir,pres,NULL,mu_tot,constr);
 +    
 +    print_ebin_header(fplog,step,step,state->lambda);
 +    print_ebin(outf->fp_ene,TRUE,FALSE,FALSE,fplog,step,step,eprNORMAL,
 +               TRUE,mdebin,fcd,&(top_global->groups),&(inputrec->opts));
 +  }
 +  where();
 +  
 +  /* This is the starting energy */
 +  Epot = enerd->term[F_EPOT];
 +  
 +  fnorm = ems.fnorm;
 +  fmax  = ems.fmax;
 +  nfmax = ems.a_fmax;
 +  
 +  /* Set the initial step.
 +   * since it will be multiplied by the non-normalized search direction 
 +   * vector (force vector the first time), we scale it by the
 +   * norm of the force.
 +   */
 +  
 +  if (MASTER(cr)) {
 +    fprintf(stderr,"Using %d BFGS correction steps.\n\n",nmaxcorr);
 +    fprintf(stderr,"   F-max             = %12.5e on atom %d\n",fmax,nfmax+1);
 +    fprintf(stderr,"   F-Norm            = %12.5e\n",fnorm/sqrt(state->natoms));
 +    fprintf(stderr,"\n");
 +    /* and copy to the log file too... */
 +    fprintf(fplog,"Using %d BFGS correction steps.\n\n",nmaxcorr);
 +    fprintf(fplog,"   F-max             = %12.5e on atom %d\n",fmax,nfmax+1);
 +    fprintf(fplog,"   F-Norm            = %12.5e\n",fnorm/sqrt(state->natoms));
 +    fprintf(fplog,"\n");
 +  }   
 +  
 +  point=0;
 +  for(i=0;i<n;i++)
 +    if(!frozen[i])
 +      dx[point][i] = ff[i];  /* Initial search direction */
 +    else
 +      dx[point][i] = 0;
 +
 +  stepsize = 1.0/fnorm;
 +  converged = FALSE;
 +  
 +  /* Start the loop over BFGS steps.          
 +   * Each successful step is counted, and we continue until
 +   * we either converge or reach the max number of steps.
 +   */
 +  
 +  ncorr=0;
 +
 +  /* Set the gradient from the force */
 +  converged = FALSE;
 +  for(step=0; (number_steps<0 || (number_steps>=0 && step<=number_steps)) && !converged; step++) {
 +    
 +    /* Write coordinates if necessary */
 +    do_x = do_per_step(step,inputrec->nstxout);
 +    do_f = do_per_step(step,inputrec->nstfout);
 +    
 +    write_traj(fplog,cr,outf,MDOF_X | MDOF_F,
 +               top_global,step,(real)step,state,state,f,f,NULL,NULL);
 +
 +    /* Do the linesearching in the direction dx[point][0..(n-1)] */
 +    
 +    /* pointer to current direction - point=0 first time here */
 +    s=dx[point];
 +    
 +    /* calculate line gradient */
 +    for(gpa=0,i=0;i<n;i++) 
 +      gpa-=s[i]*ff[i];
 +
 +    /* Calculate minimum allowed stepsize, before the average (norm) 
 +     * relative change in coordinate is smaller than precision 
 +     */
 +    for(minstep=0,i=0;i<n;i++) {
 +      tmp=fabs(xx[i]);
 +      if(tmp<1.0)
 +      tmp=1.0;
 +      tmp = s[i]/tmp;
 +      minstep += tmp*tmp;
 +    }
 +    minstep = GMX_REAL_EPS/sqrt(minstep/n);
 +    
 +    if(stepsize<minstep) {
 +      converged=TRUE;
 +      break;
 +    }
 +    
 +    /* Store old forces and coordinates */
 +    for(i=0;i<n;i++) {
 +      lastx[i]=xx[i];
 +      lastf[i]=ff[i];
 +    }
 +    Epot0=Epot;
 +    
 +    first=TRUE;
 +    
 +    for(i=0;i<n;i++)
 +      xa[i]=xx[i];
 +    
 +    /* Take a step downhill.
 +     * In theory, we should minimize the function along this direction.
 +     * That is quite possible, but it turns out to take 5-10 function evaluations
 +     * for each line. However, we dont really need to find the exact minimum -
 +     * it is much better to start a new BFGS step in a modified direction as soon
 +     * as we are close to it. This will save a lot of energy evaluations.
 +     *
 +     * In practice, we just try to take a single step.
 +     * If it worked (i.e. lowered the energy), we increase the stepsize but
 +     * the continue straight to the next BFGS step without trying to find any minimum.
 +     * If it didn't work (higher energy), there must be a minimum somewhere between
 +     * the old position and the new one.
 +     * 
 +     * Due to the finite numerical accuracy, it turns out that it is a good idea
 +     * to even accept a SMALL increase in energy, if the derivative is still downhill.
 +     * This leads to lower final energies in the tests I've done. / Erik 
 +     */
 +    foundlower=FALSE;
 +    EpotA = Epot0;
 +    a = 0.0;
 +    c = a + stepsize; /* reference position along line is zero */
 +
 +    /* Check stepsize first. We do not allow displacements 
 +     * larger than emstep.
 +     */
 +    do {
 +      c = a + stepsize;
 +      maxdelta=0;
 +      for(i=0;i<n;i++) {
 +      delta=c*s[i];
 +      if(delta>maxdelta)
 +        maxdelta=delta;
 +      }
 +      if(maxdelta>inputrec->em_stepsize)
 +      stepsize*=0.1;
 +    } while(maxdelta>inputrec->em_stepsize);
 +
 +    /* Take a trial step */
 +    for (i=0; i<n; i++)
 +      xc[i] = lastx[i] + c*s[i];
 +    
 +    neval++;
 +    /* Calculate energy for the trial step */
 +    ems.s.x = (rvec *)xc;
 +    ems.f   = (rvec *)fc;
 +    evaluate_energy(fplog,bVerbose,cr,
 +                  state,top_global,&ems,top,
 +                  inputrec,nrnb,wcycle,gstat,
 +                  vsite,constr,fcd,graph,mdatoms,fr,
 +                  mu_tot,enerd,vir,pres,step,FALSE);
 +    EpotC = ems.epot;
 +    
 +    /* Calc derivative along line */
 +    for(gpc=0,i=0; i<n; i++) {
 +      gpc -= s[i]*fc[i];   /* f is negative gradient, thus the sign */
 +    }
 +    /* Sum the gradient along the line across CPUs */
 +    if (PAR(cr))
 +      gmx_sumd(1,&gpc,cr);
 +    
 +     /* This is the max amount of increase in energy we tolerate */
 +   tmp=sqrt(GMX_REAL_EPS)*fabs(EpotA);
 +    
 +    /* Accept the step if the energy is lower, or if it is not significantly higher
 +     * and the line derivative is still negative.
 +     */
 +    if(EpotC<EpotA || (gpc<0 && EpotC<(EpotA+tmp))) {
 +      foundlower = TRUE;
 +      /* Great, we found a better energy. Increase step for next iteration
 +       * if we are still going down, decrease it otherwise
 +       */
 +      if(gpc<0)
 +      stepsize *= 1.618034;  /* The golden section */
 +      else
 +      stepsize *= 0.618034;  /* 1/golden section */
 +    } else {
 +      /* New energy is the same or higher. We will have to do some work
 +       * to find a smaller value in the interval. Take smaller step next time!
 +       */
 +      foundlower = FALSE;
 +      stepsize *= 0.618034;
 +    }    
 +    
 +    /* OK, if we didn't find a lower value we will have to locate one now - there must
 +     * be one in the interval [a=0,c].
 +     * The same thing is valid here, though: Don't spend dozens of iterations to find
 +     * the line minimum. We try to interpolate based on the derivative at the endpoints,
 +     * and only continue until we find a lower value. In most cases this means 1-2 iterations.
 +     *
 +     * I also have a safeguard for potentially really patological functions so we never
 +     * take more than 20 steps before we give up ...
 +     *
 +     * If we already found a lower value we just skip this step and continue to the update.
 +     */
 +
 +    if(!foundlower) {
 +     
 +      nminstep=0;
 +      do {
 +      /* Select a new trial point.
 +       * If the derivatives at points a & c have different sign we interpolate to zero,
 +       * otherwise just do a bisection.
 +       */
 +      
 +      if(gpa<0 && gpc>0)
 +        b = a + gpa*(a-c)/(gpc-gpa);
 +      else
 +        b = 0.5*(a+c);                
 +      
 +      /* safeguard if interpolation close to machine accuracy causes errors:
 +       * never go outside the interval
 +       */
 +      if(b<=a || b>=c)
 +        b = 0.5*(a+c);
 +      
 +      /* Take a trial step */
 +      for (i=0; i<n; i++) 
 +        xb[i] = lastx[i] + b*s[i];
 +      
 +      neval++;
 +      /* Calculate energy for the trial step */
 +      ems.s.x = (rvec *)xb;
 +      ems.f   = (rvec *)fb;
 +      evaluate_energy(fplog,bVerbose,cr,
 +                      state,top_global,&ems,top,
 +                      inputrec,nrnb,wcycle,gstat,
 +                      vsite,constr,fcd,graph,mdatoms,fr,
 +                      mu_tot,enerd,vir,pres,step,FALSE);
 +      EpotB = ems.epot;
 +      
 +      fnorm = ems.fnorm;
 +      
 +      for(gpb=0,i=0; i<n; i++) 
 +        gpb -= s[i]*fb[i];   /* f is negative gradient, thus the sign */
 +      
 +      /* Sum the gradient along the line across CPUs */
 +      if (PAR(cr))
 +        gmx_sumd(1,&gpb,cr);
 +      
 +      /* Keep one of the intervals based on the value of the derivative at the new point */
 +      if(gpb>0) {
 +        /* Replace c endpoint with b */
 +        EpotC = EpotB;
 +        c = b;
 +        gpc = gpb;
 +        /* swap coord pointers b/c */
 +        xtmp = xb; 
 +        ftmp = fb;
 +        xb = xc; 
 +        fb = fc;
 +        xc = xtmp;
 +        fc = ftmp;
 +      } else {
 +        /* Replace a endpoint with b */
 +        EpotA = EpotB;
 +        a = b;
 +        gpa = gpb;
 +        /* swap coord pointers a/b */
 +        xtmp = xb; 
 +        ftmp = fb;
 +        xb = xa; 
 +        fb = fa;
 +        xa = xtmp; 
 +        fa = ftmp;
 +      }
 +      
 +      /* 
 +       * Stop search as soon as we find a value smaller than the endpoints,
 +       * or if the tolerance is below machine precision.
 +       * Never run more than 20 steps, no matter what.
 +       */
 +      nminstep++; 
 +      } while((EpotB>EpotA || EpotB>EpotC) && (nminstep<20));
 +
 +      if(fabs(EpotB-Epot0)<GMX_REAL_EPS || nminstep>=20) {
 +      /* OK. We couldn't find a significantly lower energy.
 +       * If ncorr==0 this was steepest descent, and then we give up.
 +       * If not, reset memory to restart as steepest descent before quitting.
 +         */
 +      if(ncorr==0) {
 +      /* Converged */
 +        converged=TRUE;
 +        break;
 +      } else {
 +        /* Reset memory */
 +        ncorr=0;
 +        /* Search in gradient direction */
 +        for(i=0;i<n;i++)
 +          dx[point][i]=ff[i];
 +        /* Reset stepsize */
 +        stepsize = 1.0/fnorm;
 +        continue;
 +      }
 +      }
 +      
 +      /* Select min energy state of A & C, put the best in xx/ff/Epot
 +       */
 +      if(EpotC<EpotA) {
 +      Epot = EpotC;
 +      /* Use state C */
 +      for(i=0;i<n;i++) {
 +        xx[i]=xc[i];
 +        ff[i]=fc[i];
 +      }
 +      stepsize=c;
 +      } else {
 +      Epot = EpotA;
 +      /* Use state A */
 +      for(i=0;i<n;i++) {
 +        xx[i]=xa[i];
 +        ff[i]=fa[i];
 +      }
 +      stepsize=a;
 +      }
 +      
 +    } else {
 +      /* found lower */
 +      Epot = EpotC;
 +      /* Use state C */
 +      for(i=0;i<n;i++) {
 +      xx[i]=xc[i];
 +      ff[i]=fc[i];
 +      }
 +      stepsize=c;
 +    }
 +
 +    /* Update the memory information, and calculate a new 
 +     * approximation of the inverse hessian 
 +     */
 +    
 +    /* Have new data in Epot, xx, ff */       
 +    if(ncorr<nmaxcorr)
 +      ncorr++;
 +
 +    for(i=0;i<n;i++) {
 +      dg[point][i]=lastf[i]-ff[i];
 +      dx[point][i]*=stepsize;
 +    }
 +    
 +    dgdg=0;
 +    dgdx=0;   
 +    for(i=0;i<n;i++) {
 +      dgdg+=dg[point][i]*dg[point][i];
 +      dgdx+=dg[point][i]*dx[point][i];
 +    }
 +    
 +    diag=dgdx/dgdg;
 +    
 +    rho[point]=1.0/dgdx;
 +    point++;
 +    
 +    if(point>=nmaxcorr)
 +      point=0;
 +    
 +    /* Update */
 +    for(i=0;i<n;i++)
 +      p[i]=ff[i];
 +    
 +    cp=point;
 +    
 +    /* Recursive update. First go back over the memory points */
 +    for(k=0;k<ncorr;k++) {
 +      cp--;
 +      if(cp<0) 
 +      cp=ncorr-1;
 +      
 +      sq=0;
 +      for(i=0;i<n;i++)
 +      sq+=dx[cp][i]*p[i];
 +      
 +      alpha[cp]=rho[cp]*sq;
 +      
 +      for(i=0;i<n;i++)
 +      p[i] -= alpha[cp]*dg[cp][i];            
 +    }
 +    
 +    for(i=0;i<n;i++)
 +      p[i] *= diag;
 +    
 +    /* And then go forward again */
 +    for(k=0;k<ncorr;k++) {
 +      yr = 0;
 +      for(i=0;i<n;i++)
 +      yr += p[i]*dg[cp][i];
 +      
 +      beta = rho[cp]*yr;          
 +      beta = alpha[cp]-beta;
 +      
 +      for(i=0;i<n;i++)
 +      p[i] += beta*dx[cp][i];
 +      
 +      cp++;   
 +      if(cp>=ncorr)
 +      cp=0;
 +    }
 +    
 +    for(i=0;i<n;i++)
 +      if(!frozen[i])
 +      dx[point][i] = p[i];
 +      else
 +      dx[point][i] = 0;
 +
 +    stepsize=1.0;
 +    
 +    /* Test whether the convergence criterion is met */
 +    get_f_norm_max(cr,&(inputrec->opts),mdatoms,f,&fnorm,&fmax,&nfmax);
 +    
 +    /* Print it if necessary */
 +    if (MASTER(cr)) {
 +      if(bVerbose)
 +      fprintf(stderr,"\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n",
 +              step,Epot,fnorm/sqrt(state->natoms),fmax,nfmax+1);
 +      /* Store the new (lower) energies */
 +      upd_mdebin(mdebin,FALSE,FALSE,(double)step,
 +               mdatoms->tmass,enerd,state,state->box,
 +               NULL,NULL,vir,pres,NULL,mu_tot,constr);
 +      do_log = do_per_step(step,inputrec->nstlog);
 +      do_ene = do_per_step(step,inputrec->nstenergy);
 +      if(do_log)
 +      print_ebin_header(fplog,step,step,state->lambda);
 +      print_ebin(outf->fp_ene,do_ene,FALSE,FALSE,
 +               do_log ? fplog : NULL,step,step,eprNORMAL,
 +               TRUE,mdebin,fcd,&(top_global->groups),&(inputrec->opts));
 +    }
 +    
 +    /* Stop when the maximum force lies below tolerance.
 +     * If we have reached machine precision, converged is already set to true.
 +     */
 +    
 +    converged = converged || (fmax < inputrec->em_tol);
 +    
 +  } /* End of the loop */
 +  
 +  if(converged)       
 +    step--; /* we never took that last step in this case */
 +  
 +    if(fmax>inputrec->em_tol)
 +    {
 +        if (MASTER(cr))
 +        {
 +            warn_step(stderr,inputrec->em_tol,step-1==number_steps,FALSE);
 +            warn_step(fplog ,inputrec->em_tol,step-1==number_steps,FALSE);
 +        }
 +        converged = FALSE; 
 +    }
 +  
 +  /* If we printed energy and/or logfile last step (which was the last step)
 +   * we don't have to do it again, but otherwise print the final values.
 +   */
 +  if(!do_log) /* Write final value to log since we didn't do anythin last step */
 +    print_ebin_header(fplog,step,step,state->lambda);
 +  if(!do_ene || !do_log) /* Write final energy file entries */
 +    print_ebin(outf->fp_ene,!do_ene,FALSE,FALSE,
 +             !do_log ? fplog : NULL,step,step,eprNORMAL,
 +             TRUE,mdebin,fcd,&(top_global->groups),&(inputrec->opts));
 +  
 +  /* Print some stuff... */
 +  if (MASTER(cr))
 +    fprintf(stderr,"\nwriting lowest energy coordinates.\n");
 +  
 +  /* IMPORTANT!
 +   * For accurate normal mode calculation it is imperative that we
 +   * store the last conformation into the full precision binary trajectory.
 +   *
 +   * However, we should only do it if we did NOT already write this step
 +   * above (which we did if do_x or do_f was true).
 +   */  
 +  do_x = !do_per_step(step,inputrec->nstxout);
 +  do_f = !do_per_step(step,inputrec->nstfout);
 +  write_em_traj(fplog,cr,outf,do_x,do_f,ftp2fn(efSTO,nfile,fnm),
 +                top_global,inputrec,step,
 +                &ems,state,f);
 +  
 +  if (MASTER(cr)) {
 +    print_converged(stderr,LBFGS,inputrec->em_tol,step,converged,
 +                  number_steps,Epot,fmax,nfmax,fnorm/sqrt(state->natoms));
 +    print_converged(fplog,LBFGS,inputrec->em_tol,step,converged,
 +                  number_steps,Epot,fmax,nfmax,fnorm/sqrt(state->natoms));
 +    
 +    fprintf(fplog,"\nPerformed %d energy evaluations in total.\n",neval);
 +  }
 +  
 +  finish_em(fplog,cr,outf,runtime,wcycle);
 +
 +  /* To print the actual number of steps we needed somewhere */
 +  runtime->nsteps_done = step;
 +
 +  return 0;
 +} /* That's all folks */
 +
 +
 +double do_steep(FILE *fplog,t_commrec *cr,
 +                int nfile, const t_filenm fnm[],
 +                const output_env_t oenv, gmx_bool bVerbose,gmx_bool bCompact,
 +                int nstglobalcomm,
 +                gmx_vsite_t *vsite,gmx_constr_t constr,
 +                int stepout,
 +                t_inputrec *inputrec,
 +                gmx_mtop_t *top_global,t_fcdata *fcd,
 +                t_state *state_global,
 +                t_mdatoms *mdatoms,
 +                t_nrnb *nrnb,gmx_wallcycle_t wcycle,
 +                gmx_edsam_t ed,
 +                t_forcerec *fr,
 +                int repl_ex_nst,int repl_ex_seed,
 +                gmx_membed_t *membed,
 +                real cpt_period,real max_hours,
 +                const char *deviceOptions,
 +                unsigned long Flags,
 +                gmx_runtime_t *runtime)
 +{ 
 +  const char *SD="Steepest Descents";
 +  em_state_t *s_min,*s_try;
 +  rvec       *f_global;
 +  gmx_localtop_t *top;
 +  gmx_enerdata_t *enerd;
 +  rvec   *f;
 +  gmx_global_stat_t gstat;
 +  t_graph    *graph;
 +  real   stepsize,constepsize;
 +  real   ustep,dvdlambda,fnormn;
 +  gmx_mdoutf_t *outf;
 +  t_mdebin   *mdebin; 
 +  gmx_bool   bDone,bAbort,do_x,do_f; 
 +  tensor vir,pres; 
 +  rvec   mu_tot;
 +  int    nsteps;
 +  int    count=0; 
 +  int    steps_accepted=0; 
 +  /* not used */
 +  real   terminate=0;
 +
 +  s_min = init_em_state();
 +  s_try = init_em_state();
 +
 +  /* Init em and store the local state in s_try */
 +  init_em(fplog,SD,cr,inputrec,
 +          state_global,top_global,s_try,&top,&f,&f_global,
 +          nrnb,mu_tot,fr,&enerd,&graph,mdatoms,&gstat,vsite,constr,
 +          nfile,fnm,&outf,&mdebin);
 +      
 +  /* Print to log file  */
 +  print_em_start(fplog,cr,runtime,wcycle,SD);
 +    
 +  /* Set variables for stepsize (in nm). This is the largest  
 +   * step that we are going to make in any direction. 
 +   */
 +  ustep = inputrec->em_stepsize; 
 +  stepsize = 0;
 +  
 +  /* Max number of steps  */
 +  nsteps = inputrec->nsteps; 
 +  
 +  if (MASTER(cr)) 
 +    /* Print to the screen  */
 +    sp_header(stderr,SD,inputrec->em_tol,nsteps);
 +  if (fplog)
 +    sp_header(fplog,SD,inputrec->em_tol,nsteps);
 +    
 +  /**** HERE STARTS THE LOOP ****
 +   * count is the counter for the number of steps 
 +   * bDone will be TRUE when the minimization has converged
 +   * bAbort will be TRUE when nsteps steps have been performed or when
 +   * the stepsize becomes smaller than is reasonable for machine precision
 +   */
 +  count  = 0;
 +  bDone  = FALSE;
 +  bAbort = FALSE;
 +  while( !bDone && !bAbort ) {
 +    bAbort = (nsteps >= 0) && (count == nsteps);
 +    
 +    /* set new coordinates, except for first step */
 +    if (count > 0) {
 +      do_em_step(cr,inputrec,mdatoms,s_min,stepsize,s_min->f,s_try,
 +               constr,top,nrnb,wcycle,count);
 +    }
 +    
 +    evaluate_energy(fplog,bVerbose,cr,
 +                  state_global,top_global,s_try,top,
 +                  inputrec,nrnb,wcycle,gstat,
 +                  vsite,constr,fcd,graph,mdatoms,fr,
 +                  mu_tot,enerd,vir,pres,count,count==0);
 +       
 +    if (MASTER(cr))
 +      print_ebin_header(fplog,count,count,s_try->s.lambda);
 +
 +    if (count == 0)
 +      s_min->epot = s_try->epot + 1;
 +    
 +    /* Print it if necessary  */
 +    if (MASTER(cr)) {
 +      if (bVerbose) {
 +      fprintf(stderr,"Step=%5d, Dmax= %6.1e nm, Epot= %12.5e Fmax= %11.5e, atom= %d%c",
 +              count,ustep,s_try->epot,s_try->fmax,s_try->a_fmax+1,
 +              (s_try->epot < s_min->epot) ? '\n' : '\r');
 +      }
 +      
 +      if (s_try->epot < s_min->epot) {
 +      /* Store the new (lower) energies  */
 +      upd_mdebin(mdebin,FALSE,FALSE,(double)count,
 +                 mdatoms->tmass,enerd,&s_try->s,s_try->s.box,
 +                 NULL,NULL,vir,pres,NULL,mu_tot,constr);
 +      print_ebin(outf->fp_ene,TRUE,
 +                 do_per_step(steps_accepted,inputrec->nstdisreout),
 +                 do_per_step(steps_accepted,inputrec->nstorireout),
 +                 fplog,count,count,eprNORMAL,TRUE,
 +                 mdebin,fcd,&(top_global->groups),&(inputrec->opts));
 +      fflush(fplog);
 +      }
 +    } 
 +    
 +    /* Now if the new energy is smaller than the previous...  
 +     * or if this is the first step!
 +     * or if we did random steps! 
 +     */
 +    
 +    if ( (count==0) || (s_try->epot < s_min->epot) ) {
 +      steps_accepted++; 
 +
 +      /* Test whether the convergence criterion is met...  */
 +      bDone = (s_try->fmax < inputrec->em_tol);
 +      
 +      /* Copy the arrays for force, positions and energy  */
 +      /* The 'Min' array always holds the coords and forces of the minimal 
 +       sampled energy  */
 +      swap_em_state(s_min,s_try);
 +      if (count > 0)
 +      ustep *= 1.2;
 +
 +      /* Write to trn, if necessary */
 +      do_x = do_per_step(steps_accepted,inputrec->nstxout);
 +      do_f = do_per_step(steps_accepted,inputrec->nstfout);
 +      write_em_traj(fplog,cr,outf,do_x,do_f,NULL,
 +                    top_global,inputrec,count,
 +                    s_min,state_global,f_global);
 +    } 
 +    else {
 +      /* If energy is not smaller make the step smaller...  */
 +      ustep *= 0.5;
 +
 +      if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count) {
 +      /* Reload the old state */
 +      em_dd_partition_system(fplog,count,cr,top_global,inputrec,
 +                             s_min,top,mdatoms,fr,vsite,constr,
 +                             nrnb,wcycle);
 +      }
 +    }
 +    
 +    /* Determine new step  */
 +    stepsize = ustep/s_min->fmax;
 +    
 +    /* Check if stepsize is too small, with 1 nm as a characteristic length */
 +#ifdef GMX_DOUBLE
 +        if (count == nsteps || ustep < 1e-12)
 +#else
 +        if (count == nsteps || ustep < 1e-6)
 +#endif
 +        {
 +            if (MASTER(cr))
 +            {
 +                warn_step(stderr,inputrec->em_tol,count==nsteps,constr!=NULL);
 +                warn_step(fplog ,inputrec->em_tol,count==nsteps,constr!=NULL);
 +            }
 +            bAbort=TRUE;
 +        }
 +    
 +    count++;
 +  } /* End of the loop  */
 +  
 +    /* Print some shit...  */
 +  if (MASTER(cr)) 
 +    fprintf(stderr,"\nwriting lowest energy coordinates.\n"); 
 +  write_em_traj(fplog,cr,outf,TRUE,inputrec->nstfout,ftp2fn(efSTO,nfile,fnm),
 +              top_global,inputrec,count,
 +              s_min,state_global,f_global);
 +
 +  fnormn = s_min->fnorm/sqrt(state_global->natoms);
 +
 +  if (MASTER(cr)) {
 +    print_converged(stderr,SD,inputrec->em_tol,count,bDone,nsteps,
 +                  s_min->epot,s_min->fmax,s_min->a_fmax,fnormn);
 +    print_converged(fplog,SD,inputrec->em_tol,count,bDone,nsteps,
 +                  s_min->epot,s_min->fmax,s_min->a_fmax,fnormn);
 +  }
 +
 +  finish_em(fplog,cr,outf,runtime,wcycle);
 +  
 +  /* To print the actual number of steps we needed somewhere */
 +  inputrec->nsteps=count;
 +
 +  runtime->nsteps_done = count;
 +  
 +  return 0;
 +} /* That's all folks */
 +
 +
 +double do_nm(FILE *fplog,t_commrec *cr,
 +             int nfile,const t_filenm fnm[],
 +             const output_env_t oenv, gmx_bool bVerbose,gmx_bool bCompact,
 +             int nstglobalcomm,
 +             gmx_vsite_t *vsite,gmx_constr_t constr,
 +             int stepout,
 +             t_inputrec *inputrec,
 +             gmx_mtop_t *top_global,t_fcdata *fcd,
 +             t_state *state_global,
 +             t_mdatoms *mdatoms,
 +             t_nrnb *nrnb,gmx_wallcycle_t wcycle,
 +             gmx_edsam_t ed,
 +             t_forcerec *fr,
 +             int repl_ex_nst,int repl_ex_seed,
 +             gmx_membed_t *membed,
 +             real cpt_period,real max_hours,
 +             const char *deviceOptions,
 +             unsigned long Flags,
 +             gmx_runtime_t *runtime)
 +{
 +    const char *NM = "Normal Mode Analysis";
 +    gmx_mdoutf_t *outf;
 +    int        natoms,atom,d;
 +    int        nnodes,node;
 +    rvec       *f_global;
 +    gmx_localtop_t *top;
 +    gmx_enerdata_t *enerd;
 +    rvec       *f;
 +    gmx_global_stat_t gstat;
 +    t_graph    *graph;
 +    real       t,lambda;
 +    gmx_bool       bNS;
 +    tensor     vir,pres;
 +    rvec       mu_tot;
 +    rvec       *fneg,*dfdx;
 +    gmx_bool       bSparse; /* use sparse matrix storage format */
 +    size_t     sz;
 +    gmx_sparsematrix_t * sparse_matrix = NULL;
 +    real *     full_matrix             = NULL;
 +    em_state_t *   state_work;
 +      
 +    /* added with respect to mdrun */
 +    int        i,j,k,row,col;
 +    real       der_range=10.0*sqrt(GMX_REAL_EPS);
 +    real       x_min;
 +    real       fnorm,fmax;
 +    
 +    if (constr != NULL)
 +    {
 +        gmx_fatal(FARGS,"Constraints present with Normal Mode Analysis, this combination is not supported");
 +    }
 +
 +    state_work = init_em_state();
 +    
 +    /* Init em and store the local state in state_minimum */
 +    init_em(fplog,NM,cr,inputrec,
 +            state_global,top_global,state_work,&top,
 +            &f,&f_global,
 +            nrnb,mu_tot,fr,&enerd,&graph,mdatoms,&gstat,vsite,constr,
 +            nfile,fnm,&outf,NULL);
 +    
 +    natoms = top_global->natoms;
 +    snew(fneg,natoms);
 +    snew(dfdx,natoms);
 +    
 +#ifndef GMX_DOUBLE
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr,
 +                "NOTE: This version of Gromacs has been compiled in single precision,\n"
 +                "      which MIGHT not be accurate enough for normal mode analysis.\n"
 +                "      Gromacs now uses sparse matrix storage, so the memory requirements\n"
 +                "      are fairly modest even if you recompile in double precision.\n\n");
 +    }
 +#endif
 +    
 +    /* Check if we can/should use sparse storage format.
 +     *
 +     * Sparse format is only useful when the Hessian itself is sparse, which it
 +      * will be when we use a cutoff.    
 +      * For small systems (n<1000) it is easier to always use full matrix format, though.
 +      */
 +    if(EEL_FULL(fr->eeltype) || fr->rlist==0.0)
 +    {
 +        fprintf(stderr,"Non-cutoff electrostatics used, forcing full Hessian format.\n");
 +        bSparse = FALSE;
 +    }
 +    else if(top_global->natoms < 1000)
 +    {
 +        fprintf(stderr,"Small system size (N=%d), using full Hessian format.\n",top_global->natoms);
 +        bSparse = FALSE;
 +    }
 +    else
 +    {
 +        fprintf(stderr,"Using compressed symmetric sparse Hessian format.\n");
 +        bSparse = TRUE;
 +    }
 +    
 +    sz = DIM*top_global->natoms;
 +    
 +    fprintf(stderr,"Allocating Hessian memory...\n\n");
 +
 +    if(bSparse)
 +    {
 +        sparse_matrix=gmx_sparsematrix_init(sz);
 +        sparse_matrix->compressed_symmetric = TRUE;
 +    }
 +    else
 +    {
 +        snew(full_matrix,sz*sz);
 +    }
 +    
 +    /* Initial values */
 +    t      = inputrec->init_t;
 +    lambda = inputrec->init_lambda;
 +    
 +    init_nrnb(nrnb);
 +    
 +    where();
 +    
 +    /* Write start time and temperature */
 +    print_em_start(fplog,cr,runtime,wcycle,NM);
 +
 +    /* fudge nr of steps to nr of atoms */
 +    inputrec->nsteps = natoms*2;
 +
 +    if (MASTER(cr)) 
 +    {
 +        fprintf(stderr,"starting normal mode calculation '%s'\n%d steps.\n\n",
 +                *(top_global->name),(int)inputrec->nsteps);
 +    }
 +
 +    nnodes = cr->nnodes;
 +   
 +    /* Make evaluate_energy do a single node force calculation */
 +    cr->nnodes = 1;
 +    evaluate_energy(fplog,bVerbose,cr,
 +                    state_global,top_global,state_work,top,
 +                    inputrec,nrnb,wcycle,gstat,
 +                    vsite,constr,fcd,graph,mdatoms,fr,
 +                    mu_tot,enerd,vir,pres,-1,TRUE);
 +    cr->nnodes = nnodes;
 +
 +    /* if forces are not small, warn user */
 +    get_state_f_norm_max(cr,&(inputrec->opts),mdatoms,state_work);
 +
 +    if (MASTER(cr))
 +    {
 +        fprintf(stderr,"Maximum force:%12.5e\n",state_work->fmax);
 +        if (state_work->fmax > 1.0e-3) 
 +        {
 +            fprintf(stderr,"Maximum force probably not small enough to");
 +            fprintf(stderr," ensure that you are in an \nenergy well. ");
 +            fprintf(stderr,"Be aware that negative eigenvalues may occur");
 +            fprintf(stderr," when the\nresulting matrix is diagonalized.\n");
 +        }
 +    }
 +    
 +    /***********************************************************
 +     *
 +     *      Loop over all pairs in matrix 
 +     * 
 +     *      do_force called twice. Once with positive and 
 +     *      once with negative displacement 
 +     *
 +     ************************************************************/
 +
 +    /* Steps are divided one by one over the nodes */
 +    for(atom=cr->nodeid; atom<natoms; atom+=nnodes) 
 +    {
 +        
 +        for (d=0; d<DIM; d++) 
 +        {
 +            x_min = state_work->s.x[atom][d];
 +
 +            state_work->s.x[atom][d] = x_min - der_range;
 +          
 +            /* Make evaluate_energy do a single node force calculation */
 +            cr->nnodes = 1;
 +            evaluate_energy(fplog,bVerbose,cr,
 +                            state_global,top_global,state_work,top,
 +                            inputrec,nrnb,wcycle,gstat,
 +                            vsite,constr,fcd,graph,mdatoms,fr,
 +                            mu_tot,enerd,vir,pres,atom*2,FALSE);
 +                      
 +            for(i=0; i<natoms; i++)
 +            {
 +                copy_rvec(state_work->f[i], fneg[i]);
 +            }
 +            
 +            state_work->s.x[atom][d] = x_min + der_range;
 +            
 +            evaluate_energy(fplog,bVerbose,cr,
 +                            state_global,top_global,state_work,top,
 +                            inputrec,nrnb,wcycle,gstat,
 +                            vsite,constr,fcd,graph,mdatoms,fr,
 +                            mu_tot,enerd,vir,pres,atom*2+1,FALSE);
 +            cr->nnodes = nnodes;
 +
 +            /* x is restored to original */
 +            state_work->s.x[atom][d] = x_min;
 +
 +            for(j=0; j<natoms; j++) 
 +            {
 +                for (k=0; (k<DIM); k++) 
 +                {
 +                    dfdx[j][k] =
 +                        -(state_work->f[j][k] - fneg[j][k])/(2*der_range);
 +                }
 +            }
 +
 +            if (!MASTER(cr))
 +            {
 +#ifdef GMX_MPI
 +#ifdef GMX_DOUBLE
 +#define mpi_type MPI_DOUBLE
 +#else
 +#define mpi_type MPI_FLOAT
 +#endif
 +                MPI_Send(dfdx[0],natoms*DIM,mpi_type,MASTERNODE(cr),cr->nodeid,
 +                         cr->mpi_comm_mygroup);
 +#endif
 +            }
 +            else
 +            {
 +                for(node=0; (node<nnodes && atom+node<natoms); node++)
 +                {
 +                    if (node > 0)
 +                    {
 +#ifdef GMX_MPI
 +                        MPI_Status stat;
 +                        MPI_Recv(dfdx[0],natoms*DIM,mpi_type,node,node,
 +                                 cr->mpi_comm_mygroup,&stat);
 +#undef mpi_type
 +#endif
 +                    }
 +
 +                    row = (atom + node)*DIM + d;
 +
 +                    for(j=0; j<natoms; j++) 
 +                    {
 +                        for(k=0; k<DIM; k++) 
 +                        {
 +                            col = j*DIM + k;
 +                            
 +                            if (bSparse)
 +                            {
 +                                if (col >= row && dfdx[j][k] != 0.0)
 +                                {
 +                                    gmx_sparsematrix_increment_value(sparse_matrix,
 +                                                                     row,col,dfdx[j][k]);
 +                                }
 +                            }
 +                            else
 +                            {
 +                                full_matrix[row*sz+col] = dfdx[j][k];
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +            
 +            if (bVerbose && fplog)
 +            {
 +                fflush(fplog);            
 +            }
 +        }
 +        /* write progress */
 +        if (MASTER(cr) && bVerbose) 
 +        {
 +            fprintf(stderr,"\rFinished step %d out of %d",
 +                    min(atom+nnodes,natoms),natoms); 
 +            fflush(stderr);
 +        }
 +    }
 +    
 +    if (MASTER(cr)) 
 +    {
 +        fprintf(stderr,"\n\nWriting Hessian...\n");
 +        gmx_mtxio_write(ftp2fn(efMTX,nfile,fnm),sz,sz,full_matrix,sparse_matrix);
 +    }
 +
 +    finish_em(fplog,cr,outf,runtime,wcycle);
 +
 +    runtime->nsteps_done = natoms*2;
 +    
 +    return 0;
 +}
Simple merge
index 2f5d2bc2a6efc39c04a57d478d58f5ea12c0ebfa,0000000000000000000000000000000000000000..a80677daa9d0ce6fe2efe364e20bd152de77a875
mode 100644,000000..100644
--- /dev/null
@@@ -1,4230 -1,0 +1,4352 @@@
-         srenew(work->denom,work->nalloc);
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + *
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +/* IMPORTANT FOR DEVELOPERS:
 + *
 + * Triclinic pme stuff isn't entirely trivial, and we've experienced
 + * some bugs during development (many of them due to me). To avoid
 + * this in the future, please check the following things if you make
 + * changes in this file:
 + *
 + * 1. You should obtain identical (at least to the PME precision)
 + *    energies, forces, and virial for
 + *    a rectangular box and a triclinic one where the z (or y) axis is
 + *    tilted a whole box side. For instance you could use these boxes:
 + *
 + *    rectangular       triclinic
 + *     2  0  0           2  0  0
 + *     0  2  0           0  2  0
 + *     0  0  6           2  2  6
 + *
 + * 2. You should check the energy conservation in a triclinic box.
 + *
 + * It might seem an overkill, but better safe than sorry.
 + * /Erik 001109
 + */
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#endif
 +#ifdef GMX_THREAD_MPI
 +#include "tmpi.h"
 +#endif
 +
 +#ifdef GMX_OPENMP
 +#include <omp.h>
 +#endif
 +
 +#include <stdio.h>
 +#include <string.h>
 +#include <math.h>
++#include <assert.h>
 +#include "typedefs.h"
 +#include "txtdump.h"
 +#include "vec.h"
 +#include "gmxcomplex.h"
 +#include "smalloc.h"
 +#include "futil.h"
 +#include "coulomb.h"
 +#include "gmx_fatal.h"
 +#include "pme.h"
 +#include "network.h"
 +#include "physics.h"
 +#include "nrnb.h"
 +#include "copyrite.h"
 +#include "gmx_wallcycle.h"
 +#include "gmx_parallel_3dfft.h"
 +#include "pdbio.h"
 +#include "gmx_cyclecounter.h"
 +#include "macros.h"
 +
 +#if ( !defined(GMX_DOUBLE) && ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_X86_64_SSE2) ) )
 +#include "gmx_sse2_single.h"
 +
 +#define PME_SSE
 +/* Some old AMD processors could have problems with unaligned loads+stores */
 +#ifndef GMX_FAHCORE
 +#define PME_SSE_UNALIGNED
 +#endif
 +#endif
 +
 +#define DFT_TOL 1e-7
 +/* #define PRT_FORCE */
 +/* conditions for on the fly time-measurement */
 +/* #define TAKETIME (step > 1 && timesteps < 10) */
 +#define TAKETIME FALSE
 +
 +/* #define PME_TIME_THREADS */
 +
 +#ifdef GMX_DOUBLE
 +#define mpi_type MPI_DOUBLE
 +#else
 +#define mpi_type MPI_FLOAT
 +#endif
 +
 +/* GMX_CACHE_SEP should be a multiple of 16 to preserve alignment */
 +#define GMX_CACHE_SEP 64
 +
 +/* We only define a maximum to be able to use local arrays without allocation.
 + * An order larger than 12 should never be needed, even for test cases.
 + * If needed it can be changed here.
 + */
 +#define PME_ORDER_MAX 12
 +
 +/* Internal datastructures */
 +typedef struct {
 +    int send_index0;
 +    int send_nindex;
 +    int recv_index0;
 +    int recv_nindex;
 +} pme_grid_comm_t;
 +
 +typedef struct {
 +#ifdef GMX_MPI
 +    MPI_Comm mpi_comm;
 +#endif
 +    int  nnodes,nodeid;
 +    int  *s2g0;
 +    int  *s2g1;
 +    int  noverlap_nodes;
 +    int  *send_id,*recv_id;
 +    pme_grid_comm_t *comm_data;
 +    real *sendbuf;
 +    real *recvbuf;
 +} pme_overlap_t;
 +
 +typedef struct {
 +    int *n;     /* Cumulative counts of the number of particles per thread */
 +    int nalloc; /* Allocation size of i */
 +    int *i;     /* Particle indices ordered on thread index (n) */
 +} thread_plist_t;
 +
 +typedef struct {
 +    int  n;
 +    int  *ind;
 +    splinevec theta;
 +    splinevec dtheta;
 +} splinedata_t;
 +
 +typedef struct {
 +    int  dimind;            /* The index of the dimension, 0=x, 1=y */
 +    int  nslab;
 +    int  nodeid;
 +#ifdef GMX_MPI
 +    MPI_Comm mpi_comm;
 +#endif
 +
 +    int  *node_dest;        /* The nodes to send x and q to with DD */
 +    int  *node_src;         /* The nodes to receive x and q from with DD */
 +    int  *buf_index;        /* Index for commnode into the buffers */
 +
 +    int  maxshift;
 +
 +    int  npd;
 +    int  pd_nalloc;
 +    int  *pd;
 +    int  *count;            /* The number of atoms to send to each node */
 +    int  **count_thread;
 +    int  *rcount;           /* The number of atoms to receive */
 +
 +    int  n;
 +    int  nalloc;
 +    rvec *x;
 +    real *q;
 +    rvec *f;
 +    gmx_bool bSpread;       /* These coordinates are used for spreading */
 +    int  pme_order;
 +    ivec *idx;
 +    rvec *fractx;            /* Fractional coordinate relative to the
 +                              * lower cell boundary
 +                              */
 +    int  nthread;
 +    int  *thread_idx;        /* Which thread should spread which charge */
 +    thread_plist_t *thread_plist;
 +    splinedata_t *spline;
 +} pme_atomcomm_t;
 +
 +#define FLBS  3
 +#define FLBSZ 4
 +
 +typedef struct {
 +    ivec ci;     /* The spatial location of this grid       */
 +    ivec n;      /* The size of *grid, including order-1    */
 +    ivec offset; /* The grid offset from the full node grid */
 +    int  order;  /* PME spreading order                     */
 +    real *grid;  /* The grid local thread, size n           */
 +} pmegrid_t;
 +
 +typedef struct {
 +    pmegrid_t grid;     /* The full node grid (non thread-local)            */
 +    int  nthread;       /* The number of threads operating on this grid     */
 +    ivec nc;            /* The local spatial decomposition over the threads */
 +    pmegrid_t *grid_th; /* Array of grids for each thread                   */
 +    int  **g2t;         /* The grid to thread index                         */
 +    ivec nthread_comm;  /* The number of threads to communicate with        */
 +} pmegrids_t;
 +
 +
 +typedef struct {
 +#ifdef PME_SSE
 +    /* Masks for SSE aligned spreading and gathering */
 +    __m128 mask_SSE0[6],mask_SSE1[6];
 +#else
 +    int dummy; /* C89 requires that struct has at least one member */
 +#endif
 +} pme_spline_work_t;
 +
 +typedef struct {
 +    /* work data for solve_pme */
 +    int      nalloc;
 +    real *   mhx;
 +    real *   mhy;
 +    real *   mhz;
 +    real *   m2;
 +    real *   denom;
 +    real *   tmp1_alloc;
 +    real *   tmp1;
 +    real *   eterm;
 +    real *   m2inv;
 +
 +    real     energy;
 +    matrix   vir;
 +} pme_work_t;
 +
 +typedef struct gmx_pme {
 +    int  ndecompdim;         /* The number of decomposition dimensions */
 +    int  nodeid;             /* Our nodeid in mpi->mpi_comm */
 +    int  nodeid_major;
 +    int  nodeid_minor;
 +    int  nnodes;             /* The number of nodes doing PME */
 +    int  nnodes_major;
 +    int  nnodes_minor;
 +
 +    MPI_Comm mpi_comm;
 +    MPI_Comm mpi_comm_d[2];  /* Indexed on dimension, 0=x, 1=y */
 +#ifdef GMX_MPI
 +    MPI_Datatype  rvec_mpi;  /* the pme vector's MPI type */
 +#endif
 +
 +    int  nthread;            /* The number of threads doing PME */
 +
 +    gmx_bool bPPnode;        /* Node also does particle-particle forces */
 +    gmx_bool bFEP;           /* Compute Free energy contribution */
 +    int nkx,nky,nkz;         /* Grid dimensions */
++    gmx_bool bP3M;           /* Do P3M: optimize the influence function */
 +    int pme_order;
 +    real epsilon_r;
 +
 +    pmegrids_t pmegridA;  /* Grids on which we do spreading/interpolation, includes overlap */
 +    pmegrids_t pmegridB;
 +    /* The PME charge spreading grid sizes/strides, includes pme_order-1 */
 +    int     pmegrid_nx,pmegrid_ny,pmegrid_nz;
 +    /* pmegrid_nz might be larger than strictly necessary to ensure
 +     * memory alignment, pmegrid_nz_base gives the real base size.
 +     */
 +    int     pmegrid_nz_base;
 +    /* The local PME grid starting indices */
 +    int     pmegrid_start_ix,pmegrid_start_iy,pmegrid_start_iz;
 +
 +    /* Work data for spreading and gathering */
 +    pme_spline_work_t spline_work;
 +
 +    real *fftgridA;             /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
 +    real *fftgridB;             /* inside the interpolation grid, but separate for 2D PME decomp. */
 +    int   fftgrid_nx,fftgrid_ny,fftgrid_nz;
 +
 +    t_complex *cfftgridA;             /* Grids for complex FFT data */
 +    t_complex *cfftgridB;
 +    int   cfftgrid_nx,cfftgrid_ny,cfftgrid_nz;
 +
 +    gmx_parallel_3dfft_t  pfft_setupA;
 +    gmx_parallel_3dfft_t  pfft_setupB;
 +
 +    int  *nnx,*nny,*nnz;
 +    real *fshx,*fshy,*fshz;
 +
 +    pme_atomcomm_t atc[2];  /* Indexed on decomposition index */
 +    matrix    recipbox;
 +    splinevec bsp_mod;
 +
 +    pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */
 +
 +    pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */
 +
 +    rvec *bufv;             /* Communication buffer */
 +    real *bufr;             /* Communication buffer */
 +    int  buf_nalloc;        /* The communication buffer size */
 +
 +    /* thread local work data for solve_pme */
 +    pme_work_t *work;
 +
 +    /* Work data for PME_redist */
 +    gmx_bool redist_init;
 +    int *    scounts;
 +    int *    rcounts;
 +    int *    sdispls;
 +    int *    rdispls;
 +    int *    sidx;
 +    int *    idxa;
 +    real *   redist_buf;
 +    int      redist_buf_nalloc;
 +
 +    /* Work data for sum_qgrid */
 +    real *   sum_qgrid_tmp;
 +    real *   sum_qgrid_dd_tmp;
 +} t_gmx_pme;
 +
 +
 +static void calc_interpolation_idx(gmx_pme_t pme,pme_atomcomm_t *atc,
 +                                   int start,int end,int thread)
 +{
 +    int  i;
 +    int  *idxptr,tix,tiy,tiz;
 +    real *xptr,*fptr,tx,ty,tz;
 +    real rxx,ryx,ryy,rzx,rzy,rzz;
 +    int  nx,ny,nz;
 +    int  start_ix,start_iy,start_iz;
 +    int  *g2tx,*g2ty,*g2tz;
 +    gmx_bool bThreads;
 +    int  *thread_idx=NULL;
 +    thread_plist_t *tpl=NULL;
 +    int  *tpl_n=NULL;
 +    int  thread_i;
 +
 +    nx  = pme->nkx;
 +    ny  = pme->nky;
 +    nz  = pme->nkz;
 +
 +    start_ix = pme->pmegrid_start_ix;
 +    start_iy = pme->pmegrid_start_iy;
 +    start_iz = pme->pmegrid_start_iz;
 +
 +    rxx = pme->recipbox[XX][XX];
 +    ryx = pme->recipbox[YY][XX];
 +    ryy = pme->recipbox[YY][YY];
 +    rzx = pme->recipbox[ZZ][XX];
 +    rzy = pme->recipbox[ZZ][YY];
 +    rzz = pme->recipbox[ZZ][ZZ];
 +
 +    g2tx = pme->pmegridA.g2t[XX];
 +    g2ty = pme->pmegridA.g2t[YY];
 +    g2tz = pme->pmegridA.g2t[ZZ];
 +
 +    bThreads = (atc->nthread > 1);
 +    if (bThreads)
 +    {
 +        thread_idx = atc->thread_idx;
 +
 +        tpl   = &atc->thread_plist[thread];
 +        tpl_n = tpl->n;
 +        for(i=0; i<atc->nthread; i++)
 +        {
 +            tpl_n[i] = 0;
 +        }
 +    }
 +
 +    for(i=start; i<end; i++) {
 +        xptr   = atc->x[i];
 +        idxptr = atc->idx[i];
 +        fptr   = atc->fractx[i];
 +
 +        /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
 +        tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
 +        ty = ny * (                  xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
 +        tz = nz * (                                   xptr[ZZ] * rzz + 2.0 );
 +
 +        tix = (int)(tx);
 +        tiy = (int)(ty);
 +        tiz = (int)(tz);
 +
 +        /* Because decomposition only occurs in x and y,
 +         * we never have a fraction correction in z.
 +         */
 +        fptr[XX] = tx - tix + pme->fshx[tix];
 +        fptr[YY] = ty - tiy + pme->fshy[tiy];
 +        fptr[ZZ] = tz - tiz;
 +
 +        idxptr[XX] = pme->nnx[tix];
 +        idxptr[YY] = pme->nny[tiy];
 +        idxptr[ZZ] = pme->nnz[tiz];
 +
 +#ifdef DEBUG
 +        range_check(idxptr[XX],0,pme->pmegrid_nx);
 +        range_check(idxptr[YY],0,pme->pmegrid_ny);
 +        range_check(idxptr[ZZ],0,pme->pmegrid_nz);
 +#endif
 +
 +        if (bThreads)
 +        {
 +            thread_i = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
 +            thread_idx[i] = thread_i;
 +            tpl_n[thread_i]++;
 +        }
 +    }
 +
 +    if (bThreads)
 +    {
 +        /* Make a list of particle indices sorted on thread */
 +
 +        /* Get the cumulative count */
 +        for(i=1; i<atc->nthread; i++)
 +        {
 +            tpl_n[i] += tpl_n[i-1];
 +        }
 +        /* The current implementation distributes particles equally
 +         * over the threads, so we could actually allocate for that
 +         * in pme_realloc_atomcomm_things.
 +         */
 +        if (tpl_n[atc->nthread-1] > tpl->nalloc)
 +        {
 +            tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]);
 +            srenew(tpl->i,tpl->nalloc);
 +        }
 +        /* Set tpl_n to the cumulative start */
 +        for(i=atc->nthread-1; i>=1; i--)
 +        {
 +            tpl_n[i] = tpl_n[i-1];
 +        }
 +        tpl_n[0] = 0;
 +
 +        /* Fill our thread local array with indices sorted on thread */
 +        for(i=start; i<end; i++)
 +        {
 +            tpl->i[tpl_n[atc->thread_idx[i]]++] = i;
 +        }
 +        /* Now tpl_n contains the cummulative count again */
 +    }
 +}
 +
 +static void make_thread_local_ind(pme_atomcomm_t *atc,
 +                                  int thread,splinedata_t *spline)
 +{
 +    int  n,t,i,start,end;
 +    thread_plist_t *tpl;
 +
 +    /* Combine the indices made by each thread into one index */
 +
 +    n = 0;
 +    start = 0;
 +    for(t=0; t<atc->nthread; t++)
 +    {
 +        tpl = &atc->thread_plist[t];
 +        /* Copy our part (start - end) from the list of thread t */
 +        if (thread > 0)
 +        {
 +            start = tpl->n[thread-1];
 +        }
 +        end = tpl->n[thread];
 +        for(i=start; i<end; i++)
 +        {
 +            spline->ind[n++] = tpl->i[i];
 +        }
 +    }
 +
 +    spline->n = n;
 +}
 +
 +
 +static void pme_calc_pidx(int start, int end,
 +                          matrix recipbox, rvec x[],
 +                          pme_atomcomm_t *atc, int *count)
 +{
 +    int  nslab,i;
 +    int  si;
 +    real *xptr,s;
 +    real rxx,ryx,rzx,ryy,rzy;
 +    int *pd;
 +
 +    /* Calculate PME task index (pidx) for each grid index.
 +     * Here we always assign equally sized slabs to each node
 +     * for load balancing reasons (the PME grid spacing is not used).
 +     */
 +
 +    nslab = atc->nslab;
 +    pd    = atc->pd;
 +
 +    /* Reset the count */
 +    for(i=0; i<nslab; i++)
 +    {
 +        count[i] = 0;
 +    }
 +
 +    if (atc->dimind == 0)
 +    {
 +        rxx = recipbox[XX][XX];
 +        ryx = recipbox[YY][XX];
 +        rzx = recipbox[ZZ][XX];
 +        /* Calculate the node index in x-dimension */
 +        for(i=start; i<end; i++)
 +        {
 +            xptr   = x[i];
 +            /* Fractional coordinates along box vectors */
 +            s = nslab*(xptr[XX]*rxx + xptr[YY]*ryx + xptr[ZZ]*rzx);
 +            si = (int)(s + 2*nslab) % nslab;
 +            pd[i] = si;
 +            count[si]++;
 +        }
 +    }
 +    else
 +    {
 +        ryy = recipbox[YY][YY];
 +        rzy = recipbox[ZZ][YY];
 +        /* Calculate the node index in y-dimension */
 +        for(i=start; i<end; i++)
 +        {
 +            xptr   = x[i];
 +            /* Fractional coordinates along box vectors */
 +            s = nslab*(xptr[YY]*ryy + xptr[ZZ]*rzy);
 +            si = (int)(s + 2*nslab) % nslab;
 +            pd[i] = si;
 +            count[si]++;
 +        }
 +    }
 +}
 +
 +static void pme_calc_pidx_wrapper(int natoms, matrix recipbox, rvec x[],
 +                                  pme_atomcomm_t *atc)
 +{
 +    int nthread,thread,slab;
 +
 +    nthread = atc->nthread;
 +
 +#pragma omp parallel for num_threads(nthread) schedule(static)
 +    for(thread=0; thread<nthread; thread++)
 +    {
 +        pme_calc_pidx(natoms* thread   /nthread,
 +                      natoms*(thread+1)/nthread,
 +                      recipbox,x,atc,atc->count_thread[thread]);
 +    }
 +    /* Non-parallel reduction, since nslab is small */
 +
 +    for(thread=1; thread<nthread; thread++)
 +    {
 +        for(slab=0; slab<atc->nslab; slab++)
 +        {
 +            atc->count_thread[0][slab] += atc->count_thread[thread][slab];
 +        }
 +    }
 +}
 +
 +static void pme_realloc_splinedata(splinedata_t *spline, pme_atomcomm_t *atc)
 +{
 +    int i,d;
 +
 +    srenew(spline->ind,atc->nalloc);
 +    /* Initialize the index to identity so it works without threads */
 +    for(i=0; i<atc->nalloc; i++)
 +    {
 +        spline->ind[i] = i;
 +    }
 +
 +    for(d=0;d<DIM;d++)
 +    {
 +        srenew(spline->theta[d] ,atc->pme_order*atc->nalloc);
 +        srenew(spline->dtheta[d],atc->pme_order*atc->nalloc);
 +    }
 +}
 +
 +static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
 +{
 +    int nalloc_old,i,j,nalloc_tpl;
 +
 +    /* We have to avoid a NULL pointer for atc->x to avoid
 +     * possible fatal errors in MPI routines.
 +     */
 +    if (atc->n > atc->nalloc || atc->nalloc == 0)
 +    {
 +        nalloc_old = atc->nalloc;
 +        atc->nalloc = over_alloc_dd(max(atc->n,1));
 +
 +        if (atc->nslab > 1) {
 +            srenew(atc->x,atc->nalloc);
 +            srenew(atc->q,atc->nalloc);
 +            srenew(atc->f,atc->nalloc);
 +            for(i=nalloc_old; i<atc->nalloc; i++)
 +            {
 +                clear_rvec(atc->f[i]);
 +            }
 +        }
 +        if (atc->bSpread) {
 +            srenew(atc->fractx,atc->nalloc);
 +            srenew(atc->idx   ,atc->nalloc);
 +
 +            if (atc->nthread > 1)
 +            {
 +                srenew(atc->thread_idx,atc->nalloc);
 +            }
 +
 +            for(i=0; i<atc->nthread; i++)
 +            {
 +                pme_realloc_splinedata(&atc->spline[i],atc);
 +            }
 +        }
 +    }
 +}
 +
 +static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
 +                         int n, gmx_bool bXF, rvec *x_f, real *charge,
 +                         pme_atomcomm_t *atc)
 +/* Redistribute particle data for PME calculation */
 +/* domain decomposition by x coordinate           */
 +{
 +    int *idxa;
 +    int i, ii;
 +
 +    if(FALSE == pme->redist_init) {
 +        snew(pme->scounts,atc->nslab);
 +        snew(pme->rcounts,atc->nslab);
 +        snew(pme->sdispls,atc->nslab);
 +        snew(pme->rdispls,atc->nslab);
 +        snew(pme->sidx,atc->nslab);
 +        pme->redist_init = TRUE;
 +    }
 +    if (n > pme->redist_buf_nalloc) {
 +        pme->redist_buf_nalloc = over_alloc_dd(n);
 +        srenew(pme->redist_buf,pme->redist_buf_nalloc*DIM);
 +    }
 +
 +    pme->idxa = atc->pd;
 +
 +#ifdef GMX_MPI
 +    if (forw && bXF) {
 +        /* forward, redistribution from pp to pme */
 +
 +        /* Calculate send counts and exchange them with other nodes */
 +        for(i=0; (i<atc->nslab); i++) pme->scounts[i]=0;
 +        for(i=0; (i<n); i++) pme->scounts[pme->idxa[i]]++;
 +        MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);
 +
 +        /* Calculate send and receive displacements and index into send
 +           buffer */
 +        pme->sdispls[0]=0;
 +        pme->rdispls[0]=0;
 +        pme->sidx[0]=0;
 +        for(i=1; i<atc->nslab; i++) {
 +            pme->sdispls[i]=pme->sdispls[i-1]+pme->scounts[i-1];
 +            pme->rdispls[i]=pme->rdispls[i-1]+pme->rcounts[i-1];
 +            pme->sidx[i]=pme->sdispls[i];
 +        }
 +        /* Total # of particles to be received */
 +        atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];
 +
 +        pme_realloc_atomcomm_things(atc);
 +
 +        /* Copy particle coordinates into send buffer and exchange*/
 +        for(i=0; (i<n); i++) {
 +            ii=DIM*pme->sidx[pme->idxa[i]];
 +            pme->sidx[pme->idxa[i]]++;
 +            pme->redist_buf[ii+XX]=x_f[i][XX];
 +            pme->redist_buf[ii+YY]=x_f[i][YY];
 +            pme->redist_buf[ii+ZZ]=x_f[i][ZZ];
 +        }
 +        MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
 +                      pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
 +                      pme->rvec_mpi, atc->mpi_comm);
 +    }
 +    if (forw) {
 +        /* Copy charge into send buffer and exchange*/
 +        for(i=0; i<atc->nslab; i++) pme->sidx[i]=pme->sdispls[i];
 +        for(i=0; (i<n); i++) {
 +            ii=pme->sidx[pme->idxa[i]];
 +            pme->sidx[pme->idxa[i]]++;
 +            pme->redist_buf[ii]=charge[i];
 +        }
 +        MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, mpi_type,
 +                      atc->q, pme->rcounts, pme->rdispls, mpi_type,
 +                      atc->mpi_comm);
 +    }
 +    else { /* backward, redistribution from pme to pp */
 +        MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
 +                      pme->redist_buf, pme->scounts, pme->sdispls,
 +                      pme->rvec_mpi, atc->mpi_comm);
 +
 +        /* Copy data from receive buffer */
 +        for(i=0; i<atc->nslab; i++)
 +            pme->sidx[i] = pme->sdispls[i];
 +        for(i=0; (i<n); i++) {
 +            ii = DIM*pme->sidx[pme->idxa[i]];
 +            x_f[i][XX] += pme->redist_buf[ii+XX];
 +            x_f[i][YY] += pme->redist_buf[ii+YY];
 +            x_f[i][ZZ] += pme->redist_buf[ii+ZZ];
 +            pme->sidx[pme->idxa[i]]++;
 +        }
 +    }
 +#endif
 +}
 +
 +static void pme_dd_sendrecv(pme_atomcomm_t *atc,
 +                            gmx_bool bBackward,int shift,
 +                            void *buf_s,int nbyte_s,
 +                            void *buf_r,int nbyte_r)
 +{
 +#ifdef GMX_MPI
 +    int dest,src;
 +    MPI_Status stat;
 +
 +    if (bBackward == FALSE) {
 +        dest = atc->node_dest[shift];
 +        src  = atc->node_src[shift];
 +    } else {
 +        dest = atc->node_src[shift];
 +        src  = atc->node_dest[shift];
 +    }
 +
 +    if (nbyte_s > 0 && nbyte_r > 0) {
 +        MPI_Sendrecv(buf_s,nbyte_s,MPI_BYTE,
 +                     dest,shift,
 +                     buf_r,nbyte_r,MPI_BYTE,
 +                     src,shift,
 +                     atc->mpi_comm,&stat);
 +    } else if (nbyte_s > 0) {
 +        MPI_Send(buf_s,nbyte_s,MPI_BYTE,
 +                 dest,shift,
 +                 atc->mpi_comm);
 +    } else if (nbyte_r > 0) {
 +        MPI_Recv(buf_r,nbyte_r,MPI_BYTE,
 +                 src,shift,
 +                 atc->mpi_comm,&stat);
 +    }
 +#endif
 +}
 +
 +static void dd_pmeredist_x_q(gmx_pme_t pme,
 +                             int n, gmx_bool bX, rvec *x, real *charge,
 +                             pme_atomcomm_t *atc)
 +{
 +    int *commnode,*buf_index;
 +    int nnodes_comm,i,nsend,local_pos,buf_pos,node,scount,rcount;
 +
 +    commnode  = atc->node_dest;
 +    buf_index = atc->buf_index;
 +
 +    nnodes_comm = min(2*atc->maxshift,atc->nslab-1);
 +
 +    nsend = 0;
 +    for(i=0; i<nnodes_comm; i++) {
 +        buf_index[commnode[i]] = nsend;
 +        nsend += atc->count[commnode[i]];
 +    }
 +    if (bX) {
 +        if (atc->count[atc->nodeid] + nsend != n)
 +            gmx_fatal(FARGS,"%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
 +                      "This usually means that your system is not well equilibrated.",
 +                      n - (atc->count[atc->nodeid] + nsend),
 +                      pme->nodeid,'x'+atc->dimind);
 +
 +        if (nsend > pme->buf_nalloc) {
 +            pme->buf_nalloc = over_alloc_dd(nsend);
 +            srenew(pme->bufv,pme->buf_nalloc);
 +            srenew(pme->bufr,pme->buf_nalloc);
 +        }
 +
 +        atc->n = atc->count[atc->nodeid];
 +        for(i=0; i<nnodes_comm; i++) {
 +            scount = atc->count[commnode[i]];
 +            /* Communicate the count */
 +            if (debug)
 +                fprintf(debug,"dimind %d PME node %d send to node %d: %d\n",
 +                        atc->dimind,atc->nodeid,commnode[i],scount);
 +            pme_dd_sendrecv(atc,FALSE,i,
 +                            &scount,sizeof(int),
 +                            &atc->rcount[i],sizeof(int));
 +            atc->n += atc->rcount[i];
 +        }
 +
 +        pme_realloc_atomcomm_things(atc);
 +    }
 +
 +    local_pos = 0;
 +    for(i=0; i<n; i++) {
 +        node = atc->pd[i];
 +        if (node == atc->nodeid) {
 +            /* Copy direct to the receive buffer */
 +            if (bX) {
 +                copy_rvec(x[i],atc->x[local_pos]);
 +            }
 +            atc->q[local_pos] = charge[i];
 +            local_pos++;
 +        } else {
 +            /* Copy to the send buffer */
 +            if (bX) {
 +                copy_rvec(x[i],pme->bufv[buf_index[node]]);
 +            }
 +            pme->bufr[buf_index[node]] = charge[i];
 +            buf_index[node]++;
 +        }
 +    }
 +
 +    buf_pos = 0;
 +    for(i=0; i<nnodes_comm; i++) {
 +        scount = atc->count[commnode[i]];
 +        rcount = atc->rcount[i];
 +        if (scount > 0 || rcount > 0) {
 +            if (bX) {
 +                /* Communicate the coordinates */
 +                pme_dd_sendrecv(atc,FALSE,i,
 +                                pme->bufv[buf_pos],scount*sizeof(rvec),
 +                                atc->x[local_pos],rcount*sizeof(rvec));
 +            }
 +            /* Communicate the charges */
 +            pme_dd_sendrecv(atc,FALSE,i,
 +                            pme->bufr+buf_pos,scount*sizeof(real),
 +                            atc->q+local_pos,rcount*sizeof(real));
 +            buf_pos   += scount;
 +            local_pos += atc->rcount[i];
 +        }
 +    }
 +}
 +
 +static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
 +                           int n, rvec *f,
 +                           gmx_bool bAddF)
 +{
 +  int *commnode,*buf_index;
 +  int nnodes_comm,local_pos,buf_pos,i,scount,rcount,node;
 +
 +  commnode  = atc->node_dest;
 +  buf_index = atc->buf_index;
 +
 +  nnodes_comm = min(2*atc->maxshift,atc->nslab-1);
 +
 +  local_pos = atc->count[atc->nodeid];
 +  buf_pos = 0;
 +  for(i=0; i<nnodes_comm; i++) {
 +    scount = atc->rcount[i];
 +    rcount = atc->count[commnode[i]];
 +    if (scount > 0 || rcount > 0) {
 +      /* Communicate the forces */
 +      pme_dd_sendrecv(atc,TRUE,i,
 +                      atc->f[local_pos],scount*sizeof(rvec),
 +                      pme->bufv[buf_pos],rcount*sizeof(rvec));
 +      local_pos += scount;
 +    }
 +    buf_index[commnode[i]] = buf_pos;
 +    buf_pos   += rcount;
 +  }
 +
 +    local_pos = 0;
 +    if (bAddF)
 +    {
 +        for(i=0; i<n; i++)
 +        {
 +            node = atc->pd[i];
 +            if (node == atc->nodeid)
 +            {
 +                /* Add from the local force array */
 +                rvec_inc(f[i],atc->f[local_pos]);
 +                local_pos++;
 +            }
 +            else
 +            {
 +                /* Add from the receive buffer */
 +                rvec_inc(f[i],pme->bufv[buf_index[node]]);
 +                buf_index[node]++;
 +            }
 +        }
 +    }
 +    else
 +    {
 +        for(i=0; i<n; i++)
 +        {
 +            node = atc->pd[i];
 +            if (node == atc->nodeid)
 +            {
 +                /* Copy from the local force array */
 +                copy_rvec(atc->f[local_pos],f[i]);
 +                local_pos++;
 +            }
 +            else
 +            {
 +                /* Copy from the receive buffer */
 +                copy_rvec(pme->bufv[buf_index[node]],f[i]);
 +                buf_index[node]++;
 +            }
 +        }
 +    }
 +}
 +
 +#ifdef GMX_MPI
 +static void
 +gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
 +{
 +    pme_overlap_t *overlap;
 +    int send_index0,send_nindex;
 +    int recv_index0,recv_nindex;
 +    MPI_Status stat;
 +    int i,j,k,ix,iy,iz,icnt;
 +    int ipulse,send_id,recv_id,datasize;
 +    real *p;
 +    real *sendptr,*recvptr;
 +
 +    /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
 +    overlap = &pme->overlap[1];
 +
 +    for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
 +    {
 +        /* Since we have already (un)wrapped the overlap in the z-dimension,
 +         * we only have to communicate 0 to nkz (not pmegrid_nz).
 +         */
 +        if (direction==GMX_SUM_QGRID_FORWARD)
 +        {
 +            send_id = overlap->send_id[ipulse];
 +            recv_id = overlap->recv_id[ipulse];
 +            send_index0   = overlap->comm_data[ipulse].send_index0;
 +            send_nindex   = overlap->comm_data[ipulse].send_nindex;
 +            recv_index0   = overlap->comm_data[ipulse].recv_index0;
 +            recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
 +        }
 +        else
 +        {
 +            send_id = overlap->recv_id[ipulse];
 +            recv_id = overlap->send_id[ipulse];
 +            send_index0   = overlap->comm_data[ipulse].recv_index0;
 +            send_nindex   = overlap->comm_data[ipulse].recv_nindex;
 +            recv_index0   = overlap->comm_data[ipulse].send_index0;
 +            recv_nindex   = overlap->comm_data[ipulse].send_nindex;
 +        }
 +
 +        /* Copy data to contiguous send buffer */
 +        if (debug)
 +        {
 +            fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
 +                    pme->nodeid,overlap->nodeid,send_id,
 +                    pme->pmegrid_start_iy,
 +                    send_index0-pme->pmegrid_start_iy,
 +                    send_index0-pme->pmegrid_start_iy+send_nindex);
 +        }
 +        icnt = 0;
 +        for(i=0;i<pme->pmegrid_nx;i++)
 +        {
 +            ix = i;
 +            for(j=0;j<send_nindex;j++)
 +            {
 +                iy = j + send_index0 - pme->pmegrid_start_iy;
 +                for(k=0;k<pme->nkz;k++)
 +                {
 +                    iz = k;
 +                    overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
 +                }
 +            }
 +        }
 +
 +        datasize      = pme->pmegrid_nx * pme->nkz;
 +
 +        MPI_Sendrecv(overlap->sendbuf,send_nindex*datasize,GMX_MPI_REAL,
 +                     send_id,ipulse,
 +                     overlap->recvbuf,recv_nindex*datasize,GMX_MPI_REAL,
 +                     recv_id,ipulse,
 +                     overlap->mpi_comm,&stat);
 +
 +        /* Get data from contiguous recv buffer */
 +        if (debug)
 +        {
 +            fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
 +                    pme->nodeid,overlap->nodeid,recv_id,
 +                    pme->pmegrid_start_iy,
 +                    recv_index0-pme->pmegrid_start_iy,
 +                    recv_index0-pme->pmegrid_start_iy+recv_nindex);
 +        }
 +        icnt = 0;
 +        for(i=0;i<pme->pmegrid_nx;i++)
 +        {
 +            ix = i;
 +            for(j=0;j<recv_nindex;j++)
 +            {
 +                iy = j + recv_index0 - pme->pmegrid_start_iy;
 +                for(k=0;k<pme->nkz;k++)
 +                {
 +                    iz = k;
 +                    if(direction==GMX_SUM_QGRID_FORWARD)
 +                    {
 +                        grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
 +                    }
 +                    else
 +                    {
 +                        grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz]  = overlap->recvbuf[icnt++];
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    /* Major dimension is easier, no copying required,
 +     * but we might have to sum to separate array.
 +     * Since we don't copy, we have to communicate up to pmegrid_nz,
 +     * not nkz as for the minor direction.
 +     */
 +    overlap = &pme->overlap[0];
 +
 +    for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
 +    {
 +        if(direction==GMX_SUM_QGRID_FORWARD)
 +        {
 +            send_id = overlap->send_id[ipulse];
 +            recv_id = overlap->recv_id[ipulse];
 +            send_index0   = overlap->comm_data[ipulse].send_index0;
 +            send_nindex   = overlap->comm_data[ipulse].send_nindex;
 +            recv_index0   = overlap->comm_data[ipulse].recv_index0;
 +            recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
 +            recvptr   = overlap->recvbuf;
 +        }
 +        else
 +        {
 +            send_id = overlap->recv_id[ipulse];
 +            recv_id = overlap->send_id[ipulse];
 +            send_index0   = overlap->comm_data[ipulse].recv_index0;
 +            send_nindex   = overlap->comm_data[ipulse].recv_nindex;
 +            recv_index0   = overlap->comm_data[ipulse].send_index0;
 +            recv_nindex   = overlap->comm_data[ipulse].send_nindex;
 +            recvptr   = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
 +        }
 +
 +        sendptr       = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
 +        datasize      = pme->pmegrid_ny * pme->pmegrid_nz;
 +
 +        if (debug)
 +        {
 +            fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
 +                    pme->nodeid,overlap->nodeid,send_id,
 +                    pme->pmegrid_start_ix,
 +                    send_index0-pme->pmegrid_start_ix,
 +                    send_index0-pme->pmegrid_start_ix+send_nindex);
 +            fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
 +                    pme->nodeid,overlap->nodeid,recv_id,
 +                    pme->pmegrid_start_ix,
 +                    recv_index0-pme->pmegrid_start_ix,
 +                    recv_index0-pme->pmegrid_start_ix+recv_nindex);
 +        }
 +
 +        MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
 +                     send_id,ipulse,
 +                     recvptr,recv_nindex*datasize,GMX_MPI_REAL,
 +                     recv_id,ipulse,
 +                     overlap->mpi_comm,&stat);
 +
 +        /* ADD data from contiguous recv buffer */
 +        if(direction==GMX_SUM_QGRID_FORWARD)
 +        {
 +            p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
 +            for(i=0;i<recv_nindex*datasize;i++)
 +            {
 +                p[i] += overlap->recvbuf[i];
 +            }
 +        }
 +    }
 +}
 +#endif
 +
 +
 +static int
 +copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
 +{
 +    ivec    local_fft_ndata,local_fft_offset,local_fft_size;
 +    ivec    local_pme_size;
 +    int     i,ix,iy,iz;
 +    int     pmeidx,fftidx;
 +
 +    /* Dimensions should be identical for A/B grid, so we just use A here */
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
 +                                   local_fft_ndata,
 +                                   local_fft_offset,
 +                                   local_fft_size);
 +
 +    local_pme_size[0] = pme->pmegrid_nx;
 +    local_pme_size[1] = pme->pmegrid_ny;
 +    local_pme_size[2] = pme->pmegrid_nz;
 +
 +    /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
 +     the offset is identical, and the PME grid always has more data (due to overlap)
 +     */
 +    {
 +#ifdef DEBUG_PME
 +        FILE *fp,*fp2;
 +        char fn[STRLEN],format[STRLEN];
 +        real val;
 +        sprintf(fn,"pmegrid%d.pdb",pme->nodeid);
 +        fp = ffopen(fn,"w");
 +        sprintf(fn,"pmegrid%d.txt",pme->nodeid);
 +        fp2 = ffopen(fn,"w");
 +     sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
 +#endif
 +
 +    for(ix=0;ix<local_fft_ndata[XX];ix++)
 +    {
 +        for(iy=0;iy<local_fft_ndata[YY];iy++)
 +        {
 +            for(iz=0;iz<local_fft_ndata[ZZ];iz++)
 +            {
 +                pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
 +                fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
 +                fftgrid[fftidx] = pmegrid[pmeidx];
 +#ifdef DEBUG_PME
 +                val = 100*pmegrid[pmeidx];
 +                if (pmegrid[pmeidx] != 0)
 +                fprintf(fp,format,"ATOM",pmeidx,"CA","GLY",' ',pmeidx,' ',
 +                        5.0*ix,5.0*iy,5.0*iz,1.0,val);
 +                if (pmegrid[pmeidx] != 0)
 +                    fprintf(fp2,"%-12s  %5d  %5d  %5d  %12.5e\n",
 +                            "qgrid",
 +                            pme->pmegrid_start_ix + ix,
 +                            pme->pmegrid_start_iy + iy,
 +                            pme->pmegrid_start_iz + iz,
 +                            pmegrid[pmeidx]);
 +#endif
 +            }
 +        }
 +    }
 +#ifdef DEBUG_PME
 +    ffclose(fp);
 +    ffclose(fp2);
 +#endif
 +    }
 +    return 0;
 +}
 +
 +
 +static gmx_cycles_t omp_cyc_start()
 +{
 +    return gmx_cycles_read();
 +}
 +
 +static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
 +{
 +    return gmx_cycles_read() - c;
 +}
 +
 +
 +static int
 +copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid,
 +                        int nthread,int thread)
 +{
 +    ivec    local_fft_ndata,local_fft_offset,local_fft_size;
 +    ivec    local_pme_size;
 +    int     ixy0,ixy1,ixy,ix,iy,iz;
 +    int     pmeidx,fftidx;
 +#ifdef PME_TIME_THREADS
 +    gmx_cycles_t c1;
 +    static double cs1=0;
 +    static int cnt=0;
 +#endif
 +
 +#ifdef PME_TIME_THREADS
 +    c1 = omp_cyc_start();
 +#endif
 +    /* Dimensions should be identical for A/B grid, so we just use A here */
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
 +                                   local_fft_ndata,
 +                                   local_fft_offset,
 +                                   local_fft_size);
 +
 +    local_pme_size[0] = pme->pmegrid_nx;
 +    local_pme_size[1] = pme->pmegrid_ny;
 +    local_pme_size[2] = pme->pmegrid_nz;
 +
 +    /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
 +     the offset is identical, and the PME grid always has more data (due to overlap)
 +     */
 +    ixy0 = ((thread  )*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
 +    ixy1 = ((thread+1)*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
 +
 +    for(ixy=ixy0;ixy<ixy1;ixy++)
 +    {
 +        ix = ixy/local_fft_ndata[YY];
 +        iy = ixy - ix*local_fft_ndata[YY];
 +
 +        pmeidx = (ix*local_pme_size[YY] + iy)*local_pme_size[ZZ];
 +        fftidx = (ix*local_fft_size[YY] + iy)*local_fft_size[ZZ];
 +        for(iz=0;iz<local_fft_ndata[ZZ];iz++)
 +        {
 +            pmegrid[pmeidx+iz] = fftgrid[fftidx+iz];
 +        }
 +    }
 +
 +#ifdef PME_TIME_THREADS
 +    c1 = omp_cyc_end(c1);
 +    cs1 += (double)c1;
 +    cnt++;
 +    if (cnt % 20 == 0)
 +    {
 +        printf("copy %.2f\n",cs1*1e-9);
 +    }
 +#endif
 +
 +    return 0;
 +}
 +
 +
 +static void
 +wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
 +{
 +    int     nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix,iy,iz;
 +
 +    nx = pme->nkx;
 +    ny = pme->nky;
 +    nz = pme->nkz;
 +
 +    pnx = pme->pmegrid_nx;
 +    pny = pme->pmegrid_ny;
 +    pnz = pme->pmegrid_nz;
 +
 +    overlap = pme->pme_order - 1;
 +
 +    /* Add periodic overlap in z */
 +    for(ix=0; ix<pme->pmegrid_nx; ix++)
 +    {
 +        for(iy=0; iy<pme->pmegrid_ny; iy++)
 +        {
 +            for(iz=0; iz<overlap; iz++)
 +            {
 +                pmegrid[(ix*pny+iy)*pnz+iz] +=
 +                    pmegrid[(ix*pny+iy)*pnz+nz+iz];
 +            }
 +        }
 +    }
 +
 +    if (pme->nnodes_minor == 1)
 +    {
 +       for(ix=0; ix<pme->pmegrid_nx; ix++)
 +       {
 +           for(iy=0; iy<overlap; iy++)
 +           {
 +               for(iz=0; iz<nz; iz++)
 +               {
 +                   pmegrid[(ix*pny+iy)*pnz+iz] +=
 +                       pmegrid[(ix*pny+ny+iy)*pnz+iz];
 +               }
 +           }
 +       }
 +    }
 +
 +    if (pme->nnodes_major == 1)
 +    {
 +        ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
 +
 +        for(ix=0; ix<overlap; ix++)
 +        {
 +            for(iy=0; iy<ny_x; iy++)
 +            {
 +                for(iz=0; iz<nz; iz++)
 +                {
 +                    pmegrid[(ix*pny+iy)*pnz+iz] +=
 +                        pmegrid[((nx+ix)*pny+iy)*pnz+iz];
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +
 +static void
 +unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
 +{
 +    int     nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix;
 +
 +    nx = pme->nkx;
 +    ny = pme->nky;
 +    nz = pme->nkz;
 +
 +    pnx = pme->pmegrid_nx;
 +    pny = pme->pmegrid_ny;
 +    pnz = pme->pmegrid_nz;
 +
 +    overlap = pme->pme_order - 1;
 +
 +    if (pme->nnodes_major == 1)
 +    {
 +        ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
 +
 +        for(ix=0; ix<overlap; ix++)
 +        {
 +            int iy,iz;
 +
 +            for(iy=0; iy<ny_x; iy++)
 +            {
 +                for(iz=0; iz<nz; iz++)
 +                {
 +                    pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
 +                        pmegrid[(ix*pny+iy)*pnz+iz];
 +                }
 +            }
 +        }
 +    }
 +
 +    if (pme->nnodes_minor == 1)
 +    {
 +#pragma omp parallel for num_threads(pme->nthread) schedule(static)
 +       for(ix=0; ix<pme->pmegrid_nx; ix++)
 +       {
 +           int iy,iz;
 +
 +           for(iy=0; iy<overlap; iy++)
 +           {
 +               for(iz=0; iz<nz; iz++)
 +               {
 +                   pmegrid[(ix*pny+ny+iy)*pnz+iz] =
 +                       pmegrid[(ix*pny+iy)*pnz+iz];
 +               }
 +           }
 +       }
 +    }
 +
 +    /* Copy periodic overlap in z */
 +#pragma omp parallel for num_threads(pme->nthread) schedule(static)
 +    for(ix=0; ix<pme->pmegrid_nx; ix++)
 +    {
 +        int iy,iz;
 +
 +        for(iy=0; iy<pme->pmegrid_ny; iy++)
 +        {
 +            for(iz=0; iz<overlap; iz++)
 +            {
 +                pmegrid[(ix*pny+iy)*pnz+nz+iz] =
 +                    pmegrid[(ix*pny+iy)*pnz+iz];
 +            }
 +        }
 +    }
 +}
 +
 +static void clear_grid(int nx,int ny,int nz,real *grid,
 +                       ivec fs,int *flag,
 +                       int fx,int fy,int fz,
 +                       int order)
 +{
 +    int nc,ncz;
 +    int fsx,fsy,fsz,gx,gy,gz,g0x,g0y,x,y,z;
 +    int flind;
 +
 +    nc  = 2 + (order - 2)/FLBS;
 +    ncz = 2 + (order - 2)/FLBSZ;
 +
 +    for(fsx=fx; fsx<fx+nc; fsx++)
 +    {
 +        for(fsy=fy; fsy<fy+nc; fsy++)
 +        {
 +            for(fsz=fz; fsz<fz+ncz; fsz++)
 +            {
 +                flind = (fsx*fs[YY] + fsy)*fs[ZZ] + fsz;
 +                if (flag[flind] == 0)
 +                {
 +                    gx = fsx*FLBS;
 +                    gy = fsy*FLBS;
 +                    gz = fsz*FLBSZ;
 +                    g0x = (gx*ny + gy)*nz + gz;
 +                    for(x=0; x<FLBS; x++)
 +                    {
 +                        g0y = g0x;
 +                        for(y=0; y<FLBS; y++)
 +                        {
 +                            for(z=0; z<FLBSZ; z++)
 +                            {
 +                                grid[g0y+z] = 0;
 +                            }
 +                            g0y += nz;
 +                        }
 +                        g0x += ny*nz;
 +                    }
 +
 +                    flag[flind] = 1;
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +/* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
 +#define DO_BSPLINE(order)                            \
 +for(ithx=0; (ithx<order); ithx++)                    \
 +{                                                    \
 +    index_x = (i0+ithx)*pny*pnz;                     \
 +    valx    = qn*thx[ithx];                          \
 +                                                     \
 +    for(ithy=0; (ithy<order); ithy++)                \
 +    {                                                \
 +        valxy    = valx*thy[ithy];                   \
 +        index_xy = index_x+(j0+ithy)*pnz;            \
 +                                                     \
 +        for(ithz=0; (ithz<order); ithz++)            \
 +        {                                            \
 +            index_xyz        = index_xy+(k0+ithz);   \
 +            grid[index_xyz] += valxy*thz[ithz];      \
 +        }                                            \
 +    }                                                \
 +}
 +
 +
 +static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
 +                                     pme_atomcomm_t *atc, splinedata_t *spline,
 +                                     pme_spline_work_t *work)
 +{
 +
 +    /* spread charges from home atoms to local grid */
 +    real     *grid;
 +    pme_overlap_t *ol;
 +    int      b,i,nn,n,ithx,ithy,ithz,i0,j0,k0;
 +    int *    idxptr;
 +    int      order,norder,index_x,index_xy,index_xyz;
 +    real     valx,valxy,qn;
 +    real     *thx,*thy,*thz;
 +    int      localsize, bndsize;
 +    int      pnx,pny,pnz,ndatatot;
 +    int      offx,offy,offz;
 +
 +    pnx = pmegrid->n[XX];
 +    pny = pmegrid->n[YY];
 +    pnz = pmegrid->n[ZZ];
 +
 +    offx = pmegrid->offset[XX];
 +    offy = pmegrid->offset[YY];
 +    offz = pmegrid->offset[ZZ];
 +
 +    ndatatot = pnx*pny*pnz;
 +    grid = pmegrid->grid;
 +    for(i=0;i<ndatatot;i++)
 +    {
 +        grid[i] = 0;
 +    }
 +
 +    order = pmegrid->order;
 +
 +    for(nn=0; nn<spline->n; nn++)
 +    {
 +        n  = spline->ind[nn];
 +        qn = atc->q[n];
 +
 +        if (qn != 0)
 +        {
 +            idxptr = atc->idx[n];
 +            norder = nn*order;
 +
 +            i0   = idxptr[XX] - offx;
 +            j0   = idxptr[YY] - offy;
 +            k0   = idxptr[ZZ] - offz;
 +
 +            thx = spline->theta[XX] + norder;
 +            thy = spline->theta[YY] + norder;
 +            thz = spline->theta[ZZ] + norder;
 +
 +            switch (order) {
 +            case 4:
 +#ifdef PME_SSE
 +#ifdef PME_SSE_UNALIGNED
 +#define PME_SPREAD_SSE_ORDER4
 +#else
 +#define PME_SPREAD_SSE_ALIGNED
 +#define PME_ORDER 4
 +#endif
 +#include "pme_sse_single.h"
 +#else
 +                DO_BSPLINE(4);
 +#endif
 +                break;
 +            case 5:
 +#ifdef PME_SSE
 +#define PME_SPREAD_SSE_ALIGNED
 +#define PME_ORDER 5
 +#include "pme_sse_single.h"
 +#else
 +                DO_BSPLINE(5);
 +#endif
 +                break;
 +            default:
 +                DO_BSPLINE(order);
 +                break;
 +            }
 +        }
 +    }
 +}
 +
 +static void set_grid_alignment(int *pmegrid_nz,int pme_order)
 +{
 +#ifdef PME_SSE
 +    if (pme_order == 5
 +#ifndef PME_SSE_UNALIGNED
 +        || pme_order == 4
 +#endif
 +        )
 +    {
 +        /* Round nz up to a multiple of 4 to ensure alignment */
 +        *pmegrid_nz = ((*pmegrid_nz + 3) & ~3);
 +    }
 +#endif
 +}
 +
 +static void set_gridsize_alignment(int *gridsize,int pme_order)
 +{
 +#ifdef PME_SSE
 +#ifndef PME_SSE_UNALIGNED
 +    if (pme_order == 4)
 +    {
 +        /* Add extra elements to ensured aligned operations do not go
 +         * beyond the allocated grid size.
 +         * Note that for pme_order=5, the pme grid z-size alignment
 +         * ensures that we will not go beyond the grid size.
 +         */
 +         *gridsize += 4;
 +    }
 +#endif
 +#endif
 +}
 +
 +static void pmegrid_init(pmegrid_t *grid,
 +                         int cx, int cy, int cz,
 +                         int x0, int y0, int z0,
 +                         int x1, int y1, int z1,
 +                         gmx_bool set_alignment,
 +                         int pme_order,
 +                         real *ptr)
 +{
 +    int nz,gridsize;
 +
 +    grid->ci[XX] = cx;
 +    grid->ci[YY] = cy;
 +    grid->ci[ZZ] = cz;
 +    grid->offset[XX] = x0;
 +    grid->offset[YY] = y0;
 +    grid->offset[ZZ] = z0;
 +    grid->n[XX]      = x1 - x0 + pme_order - 1;
 +    grid->n[YY]      = y1 - y0 + pme_order - 1;
 +    grid->n[ZZ]      = z1 - z0 + pme_order - 1;
 +
 +    nz = grid->n[ZZ];
 +    set_grid_alignment(&nz,pme_order);
 +    if (set_alignment)
 +    {
 +        grid->n[ZZ] = nz;
 +    }
 +    else if (nz != grid->n[ZZ])
 +    {
 +        gmx_incons("pmegrid_init call with an unaligned z size");
 +    }
 +
 +    grid->order = pme_order;
 +    if (ptr == NULL)
 +    {
 +        gridsize = grid->n[XX]*grid->n[YY]*grid->n[ZZ];
 +        set_gridsize_alignment(&gridsize,pme_order);
 +        snew_aligned(grid->grid,gridsize,16);
 +    }
 +    else
 +    {
 +        grid->grid = ptr;
 +    }
 +}
 +
 +static int div_round_up(int enumerator,int denominator)
 +{
 +    return (enumerator + denominator - 1)/denominator;
 +}
 +
 +static void make_subgrid_division(const ivec n,int ovl,int nthread,
 +                                  ivec nsub)
 +{
 +    int gsize_opt,gsize;
 +    int nsx,nsy,nsz;
 +    char *env;
 +
 +    gsize_opt = -1;
 +    for(nsx=1; nsx<=nthread; nsx++)
 +    {
 +        if (nthread % nsx == 0)
 +        {
 +            for(nsy=1; nsy<=nthread; nsy++)
 +            {
 +                if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0)
 +                {
 +                    nsz = nthread/(nsx*nsy);
 +
 +                    /* Determine the number of grid points per thread */
 +                    gsize =
 +                        (div_round_up(n[XX],nsx) + ovl)*
 +                        (div_round_up(n[YY],nsy) + ovl)*
 +                        (div_round_up(n[ZZ],nsz) + ovl);
 +
 +                    /* Minimize the number of grids points per thread
 +                     * and, secondarily, the number of cuts in minor dimensions.
 +                     */
 +                    if (gsize_opt == -1 ||
 +                        gsize < gsize_opt ||
 +                        (gsize == gsize_opt &&
 +                         (nsz < nsub[ZZ] || (nsz == nsub[ZZ] && nsy < nsub[YY]))))
 +                    {
 +                        nsub[XX] = nsx;
 +                        nsub[YY] = nsy;
 +                        nsub[ZZ] = nsz;
 +                        gsize_opt = gsize;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    env = getenv("GMX_PME_THREAD_DIVISION");
 +    if (env != NULL)
 +    {
 +        sscanf(env,"%d %d %d",&nsub[XX],&nsub[YY],&nsub[ZZ]);
 +    }
 +
 +    if (nsub[XX]*nsub[YY]*nsub[ZZ] != nthread)
 +    {
 +        gmx_fatal(FARGS,"PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)",nsub[XX],nsub[YY],nsub[ZZ],nthread);
 +    }
 +}
 +
 +static void pmegrids_init(pmegrids_t *grids,
 +                          int nx,int ny,int nz,int nz_base,
 +                          int pme_order,
 +                          int nthread,
 +                          int overlap_x,
 +                          int overlap_y)
 +{
 +    ivec n,n_base,g0,g1;
 +    int t,x,y,z,d,i,tfac;
 +    int max_comm_lines;
 +
 +    n[XX] = nx - (pme_order - 1);
 +    n[YY] = ny - (pme_order - 1);
 +    n[ZZ] = nz - (pme_order - 1);
 +
 +    copy_ivec(n,n_base);
 +    n_base[ZZ] = nz_base;
 +
 +    pmegrid_init(&grids->grid,0,0,0,0,0,0,n[XX],n[YY],n[ZZ],FALSE,pme_order,
 +                 NULL);
 +
 +    grids->nthread = nthread;
 +
 +    make_subgrid_division(n_base,pme_order-1,grids->nthread,grids->nc);
 +
 +    if (grids->nthread > 1)
 +    {
 +        ivec nst;
 +        int gridsize;
 +        real *grid_all;
 +
 +        for(d=0; d<DIM; d++)
 +        {
 +            nst[d] = div_round_up(n[d],grids->nc[d]) + pme_order - 1;
 +        }
 +        set_grid_alignment(&nst[ZZ],pme_order);
 +
 +        if (debug)
 +        {
 +            fprintf(debug,"pmegrid thread local division: %d x %d x %d\n",
 +                    grids->nc[XX],grids->nc[YY],grids->nc[ZZ]);
 +            fprintf(debug,"pmegrid %d %d %d max thread pmegrid %d %d %d\n",
 +                    nx,ny,nz,
 +                    nst[XX],nst[YY],nst[ZZ]);
 +        }
 +
 +        snew(grids->grid_th,grids->nthread);
 +        t = 0;
 +        gridsize = nst[XX]*nst[YY]*nst[ZZ];
 +        set_gridsize_alignment(&gridsize,pme_order);
 +        snew_aligned(grid_all,
 +                     grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
 +                     16);
 +
 +        for(x=0; x<grids->nc[XX]; x++)
 +        {
 +            for(y=0; y<grids->nc[YY]; y++)
 +            {
 +                for(z=0; z<grids->nc[ZZ]; z++)
 +                {
 +                    pmegrid_init(&grids->grid_th[t],
 +                                 x,y,z,
 +                                 (n[XX]*(x  ))/grids->nc[XX],
 +                                 (n[YY]*(y  ))/grids->nc[YY],
 +                                 (n[ZZ]*(z  ))/grids->nc[ZZ],
 +                                 (n[XX]*(x+1))/grids->nc[XX],
 +                                 (n[YY]*(y+1))/grids->nc[YY],
 +                                 (n[ZZ]*(z+1))/grids->nc[ZZ],
 +                                 TRUE,
 +                                 pme_order,
 +                                 grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP));
 +                    t++;
 +                }
 +            }
 +        }
 +    }
 +
 +    snew(grids->g2t,DIM);
 +    tfac = 1;
 +    for(d=DIM-1; d>=0; d--)
 +    {
 +        snew(grids->g2t[d],n[d]);
 +        t = 0;
 +        for(i=0; i<n[d]; i++)
 +        {
 +            /* The second check should match the parameters
 +             * of the pmegrid_init call above.
 +             */
 +            while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d])
 +            {
 +                t++;
 +            }
 +            grids->g2t[d][i] = t*tfac;
 +        }
 +
 +        tfac *= grids->nc[d];
 +
 +        switch (d)
 +        {
 +        case XX: max_comm_lines = overlap_x;     break;
 +        case YY: max_comm_lines = overlap_y;     break;
 +        case ZZ: max_comm_lines = pme_order - 1; break;
 +        }
 +        grids->nthread_comm[d] = 0;
 +        while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines)
 +        {
 +            grids->nthread_comm[d]++;
 +        }
 +        if (debug != NULL)
 +        {
 +            fprintf(debug,"pmegrid thread grid communication range in %c: %d\n",
 +                    'x'+d,grids->nthread_comm[d]);
 +        }
 +        /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
 +         * work, but this is not a problematic restriction.
 +         */
 +        if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
 +        {
 +            gmx_fatal(FARGS,"Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME",grids->nthread);
 +        }
 +    }
 +}
 +
 +
 +static void pmegrids_destroy(pmegrids_t *grids)
 +{
 +    int t;
 +
 +    if (grids->grid.grid != NULL)
 +    {
 +        sfree(grids->grid.grid);
 +
 +        if (grids->nthread > 0)
 +        {
 +            for(t=0; t<grids->nthread; t++)
 +            {
 +                sfree(grids->grid_th[t].grid);
 +            }
 +            sfree(grids->grid_th);
 +        }
 +    }
 +}
 +
 +
 +static void realloc_work(pme_work_t *work,int nkx)
 +{
 +    if (nkx > work->nalloc)
 +    {
 +        work->nalloc = nkx;
 +        srenew(work->mhx  ,work->nalloc);
 +        srenew(work->mhy  ,work->nalloc);
 +        srenew(work->mhz  ,work->nalloc);
 +        srenew(work->m2   ,work->nalloc);
- void gather_f_bsplines(gmx_pme_t pme,real *grid,
-                        gmx_bool bClearF,pme_atomcomm_t *atc,
-                        splinedata_t *spline,
-                        real scale)
 +        /* Allocate an aligned pointer for SSE operations, including 3 extra
 +         * elements at the end since SSE operates on 4 elements at a time.
 +         */
 +        sfree_aligned(work->denom);
 +        sfree_aligned(work->tmp1);
 +        sfree_aligned(work->eterm);
 +        snew_aligned(work->denom,work->nalloc+3,16);
 +        snew_aligned(work->tmp1 ,work->nalloc+3,16);
 +        snew_aligned(work->eterm,work->nalloc+3,16);
 +        srenew(work->m2inv,work->nalloc);
 +    }
 +}
 +
 +
 +static void free_work(pme_work_t *work)
 +{
 +    sfree(work->mhx);
 +    sfree(work->mhy);
 +    sfree(work->mhz);
 +    sfree(work->m2);
 +    sfree_aligned(work->denom);
 +    sfree_aligned(work->tmp1);
 +    sfree_aligned(work->eterm);
 +    sfree(work->m2inv);
 +}
 +
 +
 +#ifdef PME_SSE
 +    /* Calculate exponentials through SSE in float precision */
 +inline static void calc_exponentials(int start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
 +{
 +    {
 +        const __m128 two = _mm_set_ps(2.0f,2.0f,2.0f,2.0f);
 +        __m128 f_sse;
 +        __m128 lu;
 +        __m128 tmp_d1,d_inv,tmp_r,tmp_e;
 +        int kx;
 +        f_sse = _mm_load1_ps(&f);
 +        for(kx=0; kx<end; kx+=4)
 +        {
 +            tmp_d1   = _mm_load_ps(d_aligned+kx);
 +            lu       = _mm_rcp_ps(tmp_d1);
 +            d_inv    = _mm_mul_ps(lu,_mm_sub_ps(two,_mm_mul_ps(lu,tmp_d1)));
 +            tmp_r    = _mm_load_ps(r_aligned+kx);
 +            tmp_r    = gmx_mm_exp_ps(tmp_r);
 +            tmp_e    = _mm_mul_ps(f_sse,d_inv);
 +            tmp_e    = _mm_mul_ps(tmp_e,tmp_r);
 +            _mm_store_ps(e_aligned+kx,tmp_e);
 +        }
 +    }
 +}
 +#else
 +inline static void calc_exponentials(int start, int end, real f, real *d, real *r, real *e)
 +{
 +    int kx;
 +    for(kx=start; kx<end; kx++)
 +    {
 +        d[kx] = 1.0/d[kx];
 +    }
 +    for(kx=start; kx<end; kx++)
 +    {
 +        r[kx] = exp(r[kx]);
 +    }
 +    for(kx=start; kx<end; kx++)
 +    {
 +        e[kx] = f*r[kx]*d[kx];
 +    }
 +}
 +#endif
 +
 +
 +static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
 +                         real ewaldcoeff,real vol,
 +                         gmx_bool bEnerVir,
 +                         int nthread,int thread)
 +{
 +    /* do recip sum over local cells in grid */
 +    /* y major, z middle, x minor or continuous */
 +    t_complex *p0;
 +    int     kx,ky,kz,maxkx,maxky,maxkz;
 +    int     nx,ny,nz,iyz0,iyz1,iyz,iy,iz,kxstart,kxend;
 +    real    mx,my,mz;
 +    real    factor=M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
 +    real    ets2,struct2,vfactor,ets2vf;
 +    real    d1,d2,energy=0;
 +    real    by,bz;
 +    real    virxx=0,virxy=0,virxz=0,viryy=0,viryz=0,virzz=0;
 +    real    rxx,ryx,ryy,rzx,rzy,rzz;
 +    pme_work_t *work;
 +    real    *mhx,*mhy,*mhz,*m2,*denom,*tmp1,*eterm,*m2inv;
 +    real    mhxk,mhyk,mhzk,m2k;
 +    real    corner_fac;
 +    ivec    complex_order;
 +    ivec    local_ndata,local_offset,local_size;
 +    real    elfac;
 +
 +    elfac = ONE_4PI_EPS0/pme->epsilon_r;
 +
 +    nx = pme->nkx;
 +    ny = pme->nky;
 +    nz = pme->nkz;
 +
 +    /* Dimensions should be identical for A/B grid, so we just use A here */
 +    gmx_parallel_3dfft_complex_limits(pme->pfft_setupA,
 +                                      complex_order,
 +                                      local_ndata,
 +                                      local_offset,
 +                                      local_size);
 +
 +    rxx = pme->recipbox[XX][XX];
 +    ryx = pme->recipbox[YY][XX];
 +    ryy = pme->recipbox[YY][YY];
 +    rzx = pme->recipbox[ZZ][XX];
 +    rzy = pme->recipbox[ZZ][YY];
 +    rzz = pme->recipbox[ZZ][ZZ];
 +
 +    maxkx = (nx+1)/2;
 +    maxky = (ny+1)/2;
 +    maxkz = nz/2+1;
 +
 +    work = &pme->work[thread];
 +    mhx   = work->mhx;
 +    mhy   = work->mhy;
 +    mhz   = work->mhz;
 +    m2    = work->m2;
 +    denom = work->denom;
 +    tmp1  = work->tmp1;
 +    eterm = work->eterm;
 +    m2inv = work->m2inv;
 +
 +    iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread   /nthread;
 +    iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
 +
 +    for(iyz=iyz0; iyz<iyz1; iyz++)
 +    {
 +        iy = iyz/local_ndata[ZZ];
 +        iz = iyz - iy*local_ndata[ZZ];
 +
 +        ky = iy + local_offset[YY];
 +
 +        if (ky < maxky)
 +        {
 +            my = ky;
 +        }
 +        else
 +        {
 +            my = (ky - ny);
 +        }
 +
 +        by = M_PI*vol*pme->bsp_mod[YY][ky];
 +
 +        kz = iz + local_offset[ZZ];
 +
 +        mz = kz;
 +
 +        bz = pme->bsp_mod[ZZ][kz];
 +
 +        /* 0.5 correction for corner points */
 +        corner_fac = 1;
 +        if (kz == 0 || kz == (nz+1)/2)
 +        {
 +            corner_fac = 0.5;
 +        }
 +
 +        p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
 +
 +        /* We should skip the k-space point (0,0,0) */
 +        if (local_offset[XX] > 0 || ky > 0 || kz > 0)
 +        {
 +            kxstart = local_offset[XX];
 +        }
 +        else
 +        {
 +            kxstart = local_offset[XX] + 1;
 +            p0++;
 +        }
 +        kxend = local_offset[XX] + local_ndata[XX];
 +
 +        if (bEnerVir)
 +        {
 +            /* More expensive inner loop, especially because of the storage
 +             * of the mh elements in array's.
 +             * Because x is the minor grid index, all mh elements
 +             * depend on kx for triclinic unit cells.
 +             */
 +
 +                /* Two explicit loops to avoid a conditional inside the loop */
 +            for(kx=kxstart; kx<maxkx; kx++)
 +            {
 +                mx = kx;
 +
 +                mhxk      = mx * rxx;
 +                mhyk      = mx * ryx + my * ryy;
 +                mhzk      = mx * rzx + my * rzy + mz * rzz;
 +                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
 +                mhx[kx]   = mhxk;
 +                mhy[kx]   = mhyk;
 +                mhz[kx]   = mhzk;
 +                m2[kx]    = m2k;
 +                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
 +                tmp1[kx]  = -factor*m2k;
 +            }
 +
 +            for(kx=maxkx; kx<kxend; kx++)
 +            {
 +                mx = (kx - nx);
 +
 +                mhxk      = mx * rxx;
 +                mhyk      = mx * ryx + my * ryy;
 +                mhzk      = mx * rzx + my * rzy + mz * rzz;
 +                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
 +                mhx[kx]   = mhxk;
 +                mhy[kx]   = mhyk;
 +                mhz[kx]   = mhzk;
 +                m2[kx]    = m2k;
 +                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
 +                tmp1[kx]  = -factor*m2k;
 +            }
 +
 +            for(kx=kxstart; kx<kxend; kx++)
 +            {
 +                m2inv[kx] = 1.0/m2[kx];
 +            }
 +
 +            calc_exponentials(kxstart,kxend,elfac,denom,tmp1,eterm);
 +
 +            for(kx=kxstart; kx<kxend; kx++,p0++)
 +            {
 +                d1      = p0->re;
 +                d2      = p0->im;
 +
 +                p0->re  = d1*eterm[kx];
 +                p0->im  = d2*eterm[kx];
 +
 +                struct2 = 2.0*(d1*d1+d2*d2);
 +
 +                tmp1[kx] = eterm[kx]*struct2;
 +            }
 +
 +            for(kx=kxstart; kx<kxend; kx++)
 +            {
 +                ets2     = corner_fac*tmp1[kx];
 +                vfactor  = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
 +                energy  += ets2;
 +
 +                ets2vf   = ets2*vfactor;
 +                virxx   += ets2vf*mhx[kx]*mhx[kx] - ets2;
 +                virxy   += ets2vf*mhx[kx]*mhy[kx];
 +                virxz   += ets2vf*mhx[kx]*mhz[kx];
 +                viryy   += ets2vf*mhy[kx]*mhy[kx] - ets2;
 +                viryz   += ets2vf*mhy[kx]*mhz[kx];
 +                virzz   += ets2vf*mhz[kx]*mhz[kx] - ets2;
 +            }
 +        }
 +        else
 +        {
 +            /* We don't need to calculate the energy and the virial.
 +             * In this case the triclinic overhead is small.
 +             */
 +
 +            /* Two explicit loops to avoid a conditional inside the loop */
 +
 +            for(kx=kxstart; kx<maxkx; kx++)
 +            {
 +                mx = kx;
 +
 +                mhxk      = mx * rxx;
 +                mhyk      = mx * ryx + my * ryy;
 +                mhzk      = mx * rzx + my * rzy + mz * rzz;
 +                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
 +                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
 +                tmp1[kx]  = -factor*m2k;
 +            }
 +
 +            for(kx=maxkx; kx<kxend; kx++)
 +            {
 +                mx = (kx - nx);
 +
 +                mhxk      = mx * rxx;
 +                mhyk      = mx * ryx + my * ryy;
 +                mhzk      = mx * rzx + my * rzy + mz * rzz;
 +                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
 +                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
 +                tmp1[kx]  = -factor*m2k;
 +            }
 +
 +            calc_exponentials(kxstart,kxend,elfac,denom,tmp1,eterm);
 +
 +            for(kx=kxstart; kx<kxend; kx++,p0++)
 +            {
 +                d1      = p0->re;
 +                d2      = p0->im;
 +
 +                p0->re  = d1*eterm[kx];
 +                p0->im  = d2*eterm[kx];
 +            }
 +        }
 +    }
 +
 +    if (bEnerVir)
 +    {
 +        /* Update virial with local values.
 +         * The virial is symmetric by definition.
 +         * this virial seems ok for isotropic scaling, but I'm
 +         * experiencing problems on semiisotropic membranes.
 +         * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
 +         */
 +        work->vir[XX][XX] = 0.25*virxx;
 +        work->vir[YY][YY] = 0.25*viryy;
 +        work->vir[ZZ][ZZ] = 0.25*virzz;
 +        work->vir[XX][YY] = work->vir[YY][XX] = 0.25*virxy;
 +        work->vir[XX][ZZ] = work->vir[ZZ][XX] = 0.25*virxz;
 +        work->vir[YY][ZZ] = work->vir[ZZ][YY] = 0.25*viryz;
 +
 +        /* This energy should be corrected for a charged system */
 +        work->energy = 0.5*energy;
 +    }
 +
 +    /* Return the loop count */
 +    return local_ndata[YY]*local_ndata[XX];
 +}
 +
 +static void get_pme_ener_vir(const gmx_pme_t pme,int nthread,
 +                             real *mesh_energy,matrix vir)
 +{
 +    /* This function sums output over threads
 +     * and should therefore only be called after thread synchronization.
 +     */
 +    int thread;
 +
 +    *mesh_energy = pme->work[0].energy;
 +    copy_mat(pme->work[0].vir,vir);
 +
 +    for(thread=1; thread<nthread; thread++)
 +    {
 +        *mesh_energy += pme->work[thread].energy;
 +        m_add(vir,pme->work[thread].vir,vir);
 +    }
 +}
 +
 +#define DO_FSPLINE(order)                      \
 +for(ithx=0; (ithx<order); ithx++)              \
 +{                                              \
 +    index_x = (i0+ithx)*pny*pnz;               \
 +    tx      = thx[ithx];                       \
 +    dx      = dthx[ithx];                      \
 +                                               \
 +    for(ithy=0; (ithy<order); ithy++)          \
 +    {                                          \
 +        index_xy = index_x+(j0+ithy)*pnz;      \
 +        ty       = thy[ithy];                  \
 +        dy       = dthy[ithy];                 \
 +        fxy1     = fz1 = 0;                    \
 +                                               \
 +        for(ithz=0; (ithz<order); ithz++)      \
 +        {                                      \
 +            gval  = grid[index_xy+(k0+ithz)];  \
 +            fxy1 += thz[ithz]*gval;            \
 +            fz1  += dthz[ithz]*gval;           \
 +        }                                      \
 +        fx += dx*ty*fxy1;                      \
 +        fy += tx*dy*fxy1;                      \
 +        fz += tx*ty*fz1;                       \
 +    }                                          \
 +}
 +
 +
-         qn = atc->q[n];
++static void gather_f_bsplines(gmx_pme_t pme,real *grid,
++                              gmx_bool bClearF,pme_atomcomm_t *atc,
++                              splinedata_t *spline,
++                              real scale)
 +{
 +    /* sum forces for local particles */
 +    int     nn,n,ithx,ithy,ithz,i0,j0,k0;
 +    int     index_x,index_xy;
 +    int     nx,ny,nz,pnx,pny,pnz;
 +    int *   idxptr;
 +    real    tx,ty,dx,dy,qn;
 +    real    fx,fy,fz,gval;
 +    real    fxy1,fz1;
 +    real    *thx,*thy,*thz,*dthx,*dthy,*dthz;
 +    int     norder;
 +    real    rxx,ryx,ryy,rzx,rzy,rzz;
 +    int     order;
 +
 +    pme_spline_work_t *work;
 +
 +    work = &pme->spline_work;
 +
 +    order = pme->pme_order;
 +    thx   = spline->theta[XX];
 +    thy   = spline->theta[YY];
 +    thz   = spline->theta[ZZ];
 +    dthx  = spline->dtheta[XX];
 +    dthy  = spline->dtheta[YY];
 +    dthz  = spline->dtheta[ZZ];
 +    nx    = pme->nkx;
 +    ny    = pme->nky;
 +    nz    = pme->nkz;
 +    pnx   = pme->pmegrid_nx;
 +    pny   = pme->pmegrid_ny;
 +    pnz   = pme->pmegrid_nz;
 +
 +    rxx   = pme->recipbox[XX][XX];
 +    ryx   = pme->recipbox[YY][XX];
 +    ryy   = pme->recipbox[YY][YY];
 +    rzx   = pme->recipbox[ZZ][XX];
 +    rzy   = pme->recipbox[ZZ][YY];
 +    rzz   = pme->recipbox[ZZ][ZZ];
 +
 +    for(nn=0; nn<spline->n; nn++)
 +    {
 +        n  = spline->ind[nn];
- void make_bspline_moduli(splinevec bsp_mod,int nx,int ny,int nz,int order)
++        qn = scale*atc->q[n];
 +
 +        if (bClearF)
 +        {
 +            atc->f[n][XX] = 0;
 +            atc->f[n][YY] = 0;
 +            atc->f[n][ZZ] = 0;
 +        }
 +        if (qn != 0)
 +        {
 +            fx     = 0;
 +            fy     = 0;
 +            fz     = 0;
 +            idxptr = atc->idx[n];
 +            norder = nn*order;
 +
 +            i0   = idxptr[XX];
 +            j0   = idxptr[YY];
 +            k0   = idxptr[ZZ];
 +
 +            /* Pointer arithmetic alert, next six statements */
 +            thx  = spline->theta[XX] + norder;
 +            thy  = spline->theta[YY] + norder;
 +            thz  = spline->theta[ZZ] + norder;
 +            dthx = spline->dtheta[XX] + norder;
 +            dthy = spline->dtheta[YY] + norder;
 +            dthz = spline->dtheta[ZZ] + norder;
 +
 +            switch (order) {
 +            case 4:
 +#ifdef PME_SSE
 +#ifdef PME_SSE_UNALIGNED
 +#define PME_GATHER_F_SSE_ORDER4
 +#else
 +#define PME_GATHER_F_SSE_ALIGNED
 +#define PME_ORDER 4
 +#endif
 +#include "pme_sse_single.h"
 +#else
 +                DO_FSPLINE(4);
 +#endif
 +                break;
 +            case 5:
 +#ifdef PME_SSE
 +#define PME_GATHER_F_SSE_ALIGNED
 +#define PME_ORDER 5
 +#include "pme_sse_single.h"
 +#else
 +                DO_FSPLINE(5);
 +#endif
 +                break;
 +            default:
 +                DO_FSPLINE(order);
 +                break;
 +            }
 +
 +            atc->f[n][XX] += -qn*( fx*nx*rxx );
 +            atc->f[n][YY] += -qn*( fx*nx*ryx + fy*ny*ryy );
 +            atc->f[n][ZZ] += -qn*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
 +        }
 +    }
 +    /* Since the energy and not forces are interpolated
 +     * the net force might not be exactly zero.
 +     * This can be solved by also interpolating F, but
 +     * that comes at a cost.
 +     * A better hack is to remove the net force every
 +     * step, but that must be done at a higher level
 +     * since this routine doesn't see all atoms if running
 +     * in parallel. Don't know how important it is?  EL 990726
 +     */
 +}
 +
 +
 +static real gather_energy_bsplines(gmx_pme_t pme,real *grid,
 +                                   pme_atomcomm_t *atc)
 +{
 +    splinedata_t *spline;
 +    int     n,ithx,ithy,ithz,i0,j0,k0;
 +    int     index_x,index_xy;
 +    int *   idxptr;
 +    real    energy,pot,tx,ty,qn,gval;
 +    real    *thx,*thy,*thz;
 +    int     norder;
 +    int     order;
 +
 +    spline = &atc->spline[0];
 +
 +    order = pme->pme_order;
 +
 +    energy = 0;
 +    for(n=0; (n<atc->n); n++) {
 +        qn      = atc->q[n];
 +
 +        if (qn != 0) {
 +            idxptr = atc->idx[n];
 +            norder = n*order;
 +
 +            i0   = idxptr[XX];
 +            j0   = idxptr[YY];
 +            k0   = idxptr[ZZ];
 +
 +            /* Pointer arithmetic alert, next three statements */
 +            thx  = spline->theta[XX] + norder;
 +            thy  = spline->theta[YY] + norder;
 +            thz  = spline->theta[ZZ] + norder;
 +
 +            pot = 0;
 +            for(ithx=0; (ithx<order); ithx++)
 +            {
 +                index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz;
 +                tx      = thx[ithx];
 +
 +                for(ithy=0; (ithy<order); ithy++)
 +                {
 +                    index_xy = index_x+(j0+ithy)*pme->pmegrid_nz;
 +                    ty       = thy[ithy];
 +
 +                    for(ithz=0; (ithz<order); ithz++)
 +                    {
 +                        gval  = grid[index_xy+(k0+ithz)];
 +                        pot  += tx*ty*thz[ithz]*gval;
 +                    }
 +
 +                }
 +            }
 +
 +            energy += pot*qn;
 +        }
 +    }
 +
 +    return energy;
 +}
 +
 +/* Macro to force loop unrolling by fixing order.
 + * This gives a significant performance gain.
 + */
 +#define CALC_SPLINE(order)                     \
 +{                                              \
 +    int j,k,l;                                 \
 +    real dr,div;                               \
 +    real data[PME_ORDER_MAX];                  \
 +    real ddata[PME_ORDER_MAX];                 \
 +                                               \
 +    for(j=0; (j<DIM); j++)                     \
 +    {                                          \
 +        dr  = xptr[j];                         \
 +                                               \
 +        /* dr is relative offset from lower cell limit */ \
 +        data[order-1] = 0;                     \
 +        data[1] = dr;                          \
 +        data[0] = 1 - dr;                      \
 +                                               \
 +        for(k=3; (k<order); k++)               \
 +        {                                      \
 +            div = 1.0/(k - 1.0);               \
 +            data[k-1] = div*dr*data[k-2];      \
 +            for(l=1; (l<(k-1)); l++)           \
 +            {                                  \
 +                data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
 +                                   data[k-l-1]);                \
 +            }                                  \
 +            data[0] = div*(1-dr)*data[0];      \
 +        }                                      \
 +        /* differentiate */                    \
 +        ddata[0] = -data[0];                   \
 +        for(k=1; (k<order); k++)               \
 +        {                                      \
 +            ddata[k] = data[k-1] - data[k];    \
 +        }                                      \
 +                                               \
 +        div = 1.0/(order - 1);                 \
 +        data[order-1] = div*dr*data[order-2];  \
 +        for(l=1; (l<(order-1)); l++)           \
 +        {                                      \
 +            data[order-l-1] = div*((dr+l)*data[order-l-2]+    \
 +                               (order-l-dr)*data[order-l-1]); \
 +        }                                      \
 +        data[0] = div*(1 - dr)*data[0];        \
 +                                               \
 +        for(k=0; k<order; k++)                 \
 +        {                                      \
 +            theta[j][i*order+k]  = data[k];    \
 +            dtheta[j][i*order+k] = ddata[k];   \
 +        }                                      \
 +    }                                          \
 +}
 +
 +void make_bsplines(splinevec theta,splinevec dtheta,int order,
 +                   rvec fractx[],int nr,int ind[],real charge[],
 +                   gmx_bool bFreeEnergy)
 +{
 +    /* construct splines for local atoms */
 +    int  i,ii;
 +    real *xptr;
 +
 +    for(i=0; i<nr; i++)
 +    {
 +        /* With free energy we do not use the charge check.
 +         * In most cases this will be more efficient than calling make_bsplines
 +         * twice, since usually more than half the particles have charges.
 +         */
 +        ii = ind[i];
 +        if (bFreeEnergy || charge[ii] != 0.0) {
 +            xptr = fractx[ii];
 +            switch(order) {
 +            case 4:  CALC_SPLINE(4);     break;
 +            case 5:  CALC_SPLINE(5);     break;
 +            default: CALC_SPLINE(order); break;
 +            }
 +        }
 +    }
 +}
 +
 +
 +void make_dft_mod(real *mod,real *data,int ndata)
 +{
 +  int i,j;
 +  real sc,ss,arg;
 +
 +  for(i=0;i<ndata;i++) {
 +    sc=ss=0;
 +    for(j=0;j<ndata;j++) {
 +      arg=(2.0*M_PI*i*j)/ndata;
 +      sc+=data[j]*cos(arg);
 +      ss+=data[j]*sin(arg);
 +    }
 +    mod[i]=sc*sc+ss*ss;
 +  }
 +  for(i=0;i<ndata;i++)
 +    if(mod[i]<1e-7)
 +      mod[i]=(mod[i-1]+mod[i+1])*0.5;
 +}
 +
 +
-     if (PAR(cr))
++static void make_bspline_moduli(splinevec bsp_mod,
++                                int nx,int ny,int nz,int order)
 +{
 +  int nmax=max(nx,max(ny,nz));
 +  real *data,*ddata,*bsp_data;
 +  int i,k,l;
 +  real div;
 +
 +  snew(data,order);
 +  snew(ddata,order);
 +  snew(bsp_data,nmax);
 +
 +  data[order-1]=0;
 +  data[1]=0;
 +  data[0]=1;
 +
 +  for(k=3;k<order;k++) {
 +    div=1.0/(k-1.0);
 +    data[k-1]=0;
 +    for(l=1;l<(k-1);l++)
 +      data[k-l-1]=div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
 +    data[0]=div*data[0];
 +  }
 +  /* differentiate */
 +  ddata[0]=-data[0];
 +  for(k=1;k<order;k++)
 +    ddata[k]=data[k-1]-data[k];
 +  div=1.0/(order-1);
 +  data[order-1]=0;
 +  for(l=1;l<(order-1);l++)
 +    data[order-l-1]=div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
 +  data[0]=div*data[0];
 +
 +  for(i=0;i<nmax;i++)
 +    bsp_data[i]=0;
 +  for(i=1;i<=order;i++)
 +    bsp_data[i]=data[i-1];
 +
 +  make_dft_mod(bsp_mod[XX],bsp_data,nx);
 +  make_dft_mod(bsp_mod[YY],bsp_data,ny);
 +  make_dft_mod(bsp_mod[ZZ],bsp_data,nz);
 +
 +  sfree(data);
 +  sfree(ddata);
 +  sfree(bsp_data);
 +}
 +
++
++/* Return the P3M optimal influence function */
++static double do_p3m_influence(double z, int order)
++{
++    double z2,z4;
++
++    z2 = z*z;
++    z4 = z2*z2;
++
++    /* The formula and most constants can be found in:
++     * Ballenegger et al., JCTC 8, 936 (2012)
++     */
++    switch(order)
++    {
++    case 2:
++        return 1.0 - 2.0*z2/3.0;
++        break;
++    case 3:
++        return 1.0 - z2 + 2.0*z4/15.0;
++        break;
++    case 4:
++        return 1.0 - 4.0*z2/3.0 + 2.0*z4/5.0 + 4.0*z2*z4/315.0;
++        break;
++    case 5:
++        return 1.0 - 5.0*z2/3.0 + 7.0*z4/9.0 - 17.0*z2*z4/189.0 + 2.0*z4*z4/2835.0;
++        break;
++    case 6:
++        return 1.0 - 2.0*z2 + 19.0*z4/15.0 - 256.0*z2*z4/945.0 + 62.0*z4*z4/4725.0 + 4.0*z2*z4*z4/155925.0;
++        break;
++    case 7:
++        return 1.0 - 7.0*z2/3.0 + 28.0*z4/15.0 - 16.0*z2*z4/27.0 + 26.0*z4*z4/405.0 - 2.0*z2*z4*z4/1485.0 + 4.0*z4*z4*z4/6081075.0;
++    case 8:
++        return 1.0 - 8.0*z2/3.0 + 116.0*z4/45.0 - 344.0*z2*z4/315.0 + 914.0*z4*z4/4725.0 - 248.0*z4*z4*z2/22275.0 + 21844.0*z4*z4*z4/212837625.0 - 8.0*z4*z4*z4*z2/638512875.0;
++        break;
++    }
++
++    return 0.0;
++}
++
++/* Calculate the P3M B-spline moduli for one dimension */
++static void make_p3m_bspline_moduli_dim(real *bsp_mod,int n,int order)
++{
++    double zarg,zai,sinzai,infl;
++    int    maxk,i;
++
++    if (order > 8)
++    {
++        gmx_fatal(FARGS,"The current P3M code only supports orders up to 8");
++    }
++
++    zarg = M_PI/n;
++
++    maxk = (n + 1)/2;
++
++    for(i=-maxk; i<0; i++)
++    {
++        zai    = zarg*i;
++        sinzai = sin(zai);
++        infl   = do_p3m_influence(sinzai,order);
++        bsp_mod[n+i] = infl*infl*pow(sinzai/zai,-2.0*order);
++    }
++    bsp_mod[0] = 1.0;
++    for(i=1; i<maxk; i++)
++    {
++        zai    = zarg*i;
++        sinzai = sin(zai);
++        infl   = do_p3m_influence(sinzai,order);
++        bsp_mod[i] = infl*infl*pow(sinzai/zai,-2.0*order);
++    }
++}
++
++/* Calculate the P3M B-spline moduli */
++static void make_p3m_bspline_moduli(splinevec bsp_mod,
++                                    int nx,int ny,int nz,int order)
++{
++    make_p3m_bspline_moduli_dim(bsp_mod[XX],nx,order);
++    make_p3m_bspline_moduli_dim(bsp_mod[YY],ny,order);
++    make_p3m_bspline_moduli_dim(bsp_mod[ZZ],nz,order);
++}
++
++
 +static void setup_coordinate_communication(pme_atomcomm_t *atc)
 +{
 +  int nslab,n,i;
 +  int fw,bw;
 +
 +  nslab = atc->nslab;
 +
 +  n = 0;
 +  for(i=1; i<=nslab/2; i++) {
 +    fw = (atc->nodeid + i) % nslab;
 +    bw = (atc->nodeid - i + nslab) % nslab;
 +    if (n < nslab - 1) {
 +      atc->node_dest[n] = fw;
 +      atc->node_src[n]  = bw;
 +      n++;
 +    }
 +    if (n < nslab - 1) {
 +      atc->node_dest[n] = bw;
 +      atc->node_src[n]  = fw;
 +      n++;
 +    }
 +  }
 +}
 +
 +int gmx_pme_destroy(FILE *log,gmx_pme_t *pmedata)
 +{
 +    int thread;
 +
 +    if(NULL != log)
 +    {
 +        fprintf(log,"Destroying PME data structures.\n");
 +    }
 +
 +    sfree((*pmedata)->nnx);
 +    sfree((*pmedata)->nny);
 +    sfree((*pmedata)->nnz);
 +
 +    pmegrids_destroy(&(*pmedata)->pmegridA);
 +
 +    sfree((*pmedata)->fftgridA);
 +    sfree((*pmedata)->cfftgridA);
 +    gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupA);
 +
 +    if ((*pmedata)->pmegridB.grid.grid != NULL)
 +    {
 +        pmegrids_destroy(&(*pmedata)->pmegridB);
 +        sfree((*pmedata)->fftgridB);
 +        sfree((*pmedata)->cfftgridB);
 +        gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB);
 +    }
 +    for(thread=0; thread<(*pmedata)->nthread; thread++)
 +    {
 +        free_work(&(*pmedata)->work[thread]);
 +    }
 +    sfree((*pmedata)->work);
 +
 +    sfree(*pmedata);
 +    *pmedata = NULL;
 +
 +  return 0;
 +}
 +
 +static int mult_up(int n,int f)
 +{
 +    return ((n + f - 1)/f)*f;
 +}
 +
 +
 +static double pme_load_imbalance(gmx_pme_t pme)
 +{
 +    int    nma,nmi;
 +    double n1,n2,n3;
 +
 +    nma = pme->nnodes_major;
 +    nmi = pme->nnodes_minor;
 +
 +    n1 = mult_up(pme->nkx,nma)*mult_up(pme->nky,nmi)*pme->nkz;
 +    n2 = mult_up(pme->nkx,nma)*mult_up(pme->nkz,nmi)*pme->nky;
 +    n3 = mult_up(pme->nky,nma)*mult_up(pme->nkz,nmi)*pme->nkx;
 +
 +    /* pme_solve is roughly double the cost of an fft */
 +
 +    return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
 +}
 +
 +static void init_atomcomm(gmx_pme_t pme,pme_atomcomm_t *atc, t_commrec *cr,
 +                          int dimind,gmx_bool bSpread)
 +{
 +    int nk,k,s,thread;
 +
 +    atc->dimind = dimind;
 +    atc->nslab  = 1;
 +    atc->nodeid = 0;
 +    atc->pd_nalloc = 0;
 +#ifdef GMX_MPI
 +    if (pme->nnodes > 1)
 +    {
 +        atc->mpi_comm = pme->mpi_comm_d[dimind];
 +        MPI_Comm_size(atc->mpi_comm,&atc->nslab);
 +        MPI_Comm_rank(atc->mpi_comm,&atc->nodeid);
 +    }
 +    if (debug)
 +    {
 +        fprintf(debug,"For PME atom communication in dimind %d: nslab %d rank %d\n",atc->dimind,atc->nslab,atc->nodeid);
 +    }
 +#endif
 +
 +    atc->bSpread   = bSpread;
 +    atc->pme_order = pme->pme_order;
 +
 +    if (atc->nslab > 1)
 +    {
 +        /* These three allocations are not required for particle decomp. */
 +        snew(atc->node_dest,atc->nslab);
 +        snew(atc->node_src,atc->nslab);
 +        setup_coordinate_communication(atc);
 +
 +        snew(atc->count_thread,pme->nthread);
 +        for(thread=0; thread<pme->nthread; thread++)
 +        {
 +            snew(atc->count_thread[thread],atc->nslab);
 +        }
 +        atc->count = atc->count_thread[0];
 +        snew(atc->rcount,atc->nslab);
 +        snew(atc->buf_index,atc->nslab);
 +    }
 +
 +    atc->nthread = pme->nthread;
 +    if (atc->nthread > 1)
 +    {
 +        snew(atc->thread_plist,atc->nthread);
 +    }
 +    snew(atc->spline,atc->nthread);
 +    for(thread=0; thread<atc->nthread; thread++)
 +    {
 +        if (atc->nthread > 1)
 +        {
 +            snew(atc->thread_plist[thread].n,atc->nthread+2*GMX_CACHE_SEP);
 +            atc->thread_plist[thread].n += GMX_CACHE_SEP;
 +        }
 +    }
 +}
 +
 +static void
 +init_overlap_comm(pme_overlap_t *  ol,
 +                  int              norder,
 +#ifdef GMX_MPI
 +                  MPI_Comm         comm,
 +#endif
 +                  int              nnodes,
 +                  int              nodeid,
 +                  int              ndata,
 +                  int              commplainsize)
 +{
 +    int lbnd,rbnd,maxlr,b,i;
 +    int exten;
 +    int nn,nk;
 +    pme_grid_comm_t *pgc;
 +    gmx_bool bCont;
 +    int fft_start,fft_end,send_index1,recv_index1;
 +
 +#ifdef GMX_MPI
 +    ol->mpi_comm = comm;
 +#endif
 +
 +    ol->nnodes = nnodes;
 +    ol->nodeid = nodeid;
 +
 +    /* Linear translation of the PME grid wo'nt affect reciprocal space
 +     * calculations, so to optimize we only interpolate "upwards",
 +     * which also means we only have to consider overlap in one direction.
 +     * I.e., particles on this node might also be spread to grid indices
 +     * that belong to higher nodes (modulo nnodes)
 +     */
 +
 +    snew(ol->s2g0,ol->nnodes+1);
 +    snew(ol->s2g1,ol->nnodes);
 +    if (debug) { fprintf(debug,"PME slab boundaries:"); }
 +    for(i=0; i<nnodes; i++)
 +    {
 +        /* s2g0 the local interpolation grid start.
 +         * s2g1 the local interpolation grid end.
 +         * Because grid overlap communication only goes forward,
 +         * the grid the slabs for fft's should be rounded down.
 +         */
 +        ol->s2g0[i] = ( i   *ndata + 0       )/nnodes;
 +        ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
 +
 +        if (debug)
 +        {
 +            fprintf(debug,"  %3d %3d",ol->s2g0[i],ol->s2g1[i]);
 +        }
 +    }
 +    ol->s2g0[nnodes] = ndata;
 +    if (debug) { fprintf(debug,"\n"); }
 +
 +    /* Determine with how many nodes we need to communicate the grid overlap */
 +    b = 0;
 +    do
 +    {
 +        b++;
 +        bCont = FALSE;
 +        for(i=0; i<nnodes; i++)
 +        {
 +            if ((i+b <  nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
 +                (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
 +            {
 +                bCont = TRUE;
 +            }
 +        }
 +    }
 +    while (bCont && b < nnodes);
 +    ol->noverlap_nodes = b - 1;
 +
 +    snew(ol->send_id,ol->noverlap_nodes);
 +    snew(ol->recv_id,ol->noverlap_nodes);
 +    for(b=0; b<ol->noverlap_nodes; b++)
 +    {
 +        ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
 +        ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
 +    }
 +    snew(ol->comm_data, ol->noverlap_nodes);
 +
 +    for(b=0; b<ol->noverlap_nodes; b++)
 +    {
 +        pgc = &ol->comm_data[b];
 +        /* Send */
 +        fft_start        = ol->s2g0[ol->send_id[b]];
 +        fft_end          = ol->s2g0[ol->send_id[b]+1];
 +        if (ol->send_id[b] < nodeid)
 +        {
 +            fft_start += ndata;
 +            fft_end   += ndata;
 +        }
 +        send_index1      = ol->s2g1[nodeid];
 +        send_index1      = min(send_index1,fft_end);
 +        pgc->send_index0 = fft_start;
 +        pgc->send_nindex = max(0,send_index1 - pgc->send_index0);
 +
 +        /* We always start receiving to the first index of our slab */
 +        fft_start        = ol->s2g0[ol->nodeid];
 +        fft_end          = ol->s2g0[ol->nodeid+1];
 +        recv_index1      = ol->s2g1[ol->recv_id[b]];
 +        if (ol->recv_id[b] > nodeid)
 +        {
 +            recv_index1 -= ndata;
 +        }
 +        recv_index1      = min(recv_index1,fft_end);
 +        pgc->recv_index0 = fft_start;
 +        pgc->recv_nindex = max(0,recv_index1 - pgc->recv_index0);
 +    }
 +
 +    /* For non-divisible grid we need pme_order iso pme_order-1 */
 +    snew(ol->sendbuf,norder*commplainsize);
 +    snew(ol->recvbuf,norder*commplainsize);
 +}
 +
 +static void
 +make_gridindex5_to_localindex(int n,int local_start,int local_range,
 +                              int **global_to_local,
 +                              real **fraction_shift)
 +{
 +    int i;
 +    int * gtl;
 +    real * fsh;
 +
 +    snew(gtl,5*n);
 +    snew(fsh,5*n);
 +    for(i=0; (i<5*n); i++)
 +    {
 +        /* Determine the global to local grid index */
 +        gtl[i] = (i - local_start + n) % n;
 +        /* For coordinates that fall within the local grid the fraction
 +         * is correct, we don't need to shift it.
 +         */
 +        fsh[i] = 0;
 +        if (local_range < n)
 +        {
 +            /* Due to rounding issues i could be 1 beyond the lower or
 +             * upper boundary of the local grid. Correct the index for this.
 +             * If we shift the index, we need to shift the fraction by
 +             * the same amount in the other direction to not affect
 +             * the weights.
 +             * Note that due to this shifting the weights at the end of
 +             * the spline might change, but that will only involve values
 +             * between zero and values close to the precision of a real,
 +             * which is anyhow the accuracy of the whole mesh calculation.
 +             */
 +            /* With local_range=0 we should not change i=local_start */
 +            if (i % n != local_start)
 +            {
 +                if (gtl[i] == n-1)
 +                {
 +                    gtl[i] = 0;
 +                    fsh[i] = -1;
 +                }
 +                else if (gtl[i] == local_range)
 +                {
 +                    gtl[i] = local_range - 1;
 +                    fsh[i] = 1;
 +                }
 +            }
 +        }
 +    }
 +
 +    *global_to_local = gtl;
 +    *fraction_shift  = fsh;
 +}
 +
 +static void sse_mask_init(pme_spline_work_t *work,int order)
 +{
 +#ifdef PME_SSE
 +    float  tmp[8];
 +    __m128 zero_SSE;
 +    int    of,i;
 +
 +    zero_SSE = _mm_setzero_ps();
 +
 +    for(of=0; of<8-(order-1); of++)
 +    {
 +        for(i=0; i<8; i++)
 +        {
 +            tmp[i] = (i >= of && i < of+order ? 1 : 0);
 +        }
 +        work->mask_SSE0[of] = _mm_loadu_ps(tmp);
 +        work->mask_SSE1[of] = _mm_loadu_ps(tmp+4);
 +        work->mask_SSE0[of] = _mm_cmpgt_ps(work->mask_SSE0[of],zero_SSE);
 +        work->mask_SSE1[of] = _mm_cmpgt_ps(work->mask_SSE1[of],zero_SSE);
 +    }
 +#endif
 +}
 +
 +static void
 +gmx_pme_check_grid_restrictions(FILE *fplog,char dim,int nnodes,int *nk)
 +{
 +    int nk_new;
 +
 +    if (*nk % nnodes != 0)
 +    {
 +        nk_new = nnodes*(*nk/nnodes + 1);
 +
 +        if (2*nk_new >= 3*(*nk))
 +        {
 +            gmx_fatal(FARGS,"The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). The grid size would have to be increased by more than 50%% to make the grid divisible. Change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).",
 +                      dim,*nk,dim,nnodes,dim);
 +        }
 +
 +        if (fplog != NULL)
 +        {
 +            fprintf(fplog,"\nNOTE: The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). Increasing the PME grid size in dim %c to %d. This will increase the accuracy and will not decrease the performance significantly on this number of nodes. For optimal performance change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).\n\n",
 +                    dim,*nk,dim,nnodes,dim,nk_new,dim);
 +        }
 +
 +        *nk = nk_new;
 +    }
 +}
 +
 +int gmx_pme_init(gmx_pme_t *         pmedata,
 +                 t_commrec *         cr,
 +                 int                 nnodes_major,
 +                 int                 nnodes_minor,
 +                 t_inputrec *        ir,
 +                 int                 homenr,
 +                 gmx_bool            bFreeEnergy,
 +                 gmx_bool            bReproducible,
 +                 int                 nthread)
 +{
 +    gmx_pme_t pme=NULL;
 +
 +    pme_atomcomm_t *atc;
 +    ivec ndata;
 +
 +    if (debug)
 +        fprintf(debug,"Creating PME data structures.\n");
 +    snew(pme,1);
 +
 +    pme->redist_init         = FALSE;
 +    pme->sum_qgrid_tmp       = NULL;
 +    pme->sum_qgrid_dd_tmp    = NULL;
 +    pme->buf_nalloc          = 0;
 +    pme->redist_buf_nalloc   = 0;
 +
 +    pme->nnodes              = 1;
 +    pme->bPPnode             = TRUE;
 +
 +    pme->nnodes_major        = nnodes_major;
 +    pme->nnodes_minor        = nnodes_minor;
 +
 +#ifdef GMX_MPI
-         pme->mpi_comm        = cr->mpi_comm_mygroup;
++    if (nnodes_major*nnodes_minor > 1)
 +    {
-     make_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
++        pme->mpi_comm = cr->mpi_comm_mygroup;
 +
 +        MPI_Comm_rank(pme->mpi_comm,&pme->nodeid);
 +        MPI_Comm_size(pme->mpi_comm,&pme->nnodes);
++        if (pme->nnodes != nnodes_major*nnodes_minor)
++        {
++            gmx_incons("PME node count mismatch");
++        }
++    }
++    else
++    {
++        pme->mpi_comm = MPI_COMM_NULL;
 +    }
 +#endif
 +
 +    if (pme->nnodes == 1)
 +    {
 +        pme->ndecompdim = 0;
 +        pme->nodeid_major = 0;
 +        pme->nodeid_minor = 0;
 +#ifdef GMX_MPI
 +        pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL;
 +#endif
 +    }
 +    else
 +    {
 +        if (nnodes_minor == 1)
 +        {
 +#ifdef GMX_MPI
 +            pme->mpi_comm_d[0] = pme->mpi_comm;
 +            pme->mpi_comm_d[1] = MPI_COMM_NULL;
 +#endif
 +            pme->ndecompdim = 1;
 +            pme->nodeid_major = pme->nodeid;
 +            pme->nodeid_minor = 0;
 +
 +        }
 +        else if (nnodes_major == 1)
 +        {
 +#ifdef GMX_MPI
 +            pme->mpi_comm_d[0] = MPI_COMM_NULL;
 +            pme->mpi_comm_d[1] = pme->mpi_comm;
 +#endif
 +            pme->ndecompdim = 1;
 +            pme->nodeid_major = 0;
 +            pme->nodeid_minor = pme->nodeid;
 +        }
 +        else
 +        {
 +            if (pme->nnodes % nnodes_major != 0)
 +            {
 +                gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
 +            }
 +            pme->ndecompdim = 2;
 +
 +#ifdef GMX_MPI
 +            MPI_Comm_split(pme->mpi_comm,pme->nodeid % nnodes_minor,
 +                           pme->nodeid,&pme->mpi_comm_d[0]);  /* My communicator along major dimension */
 +            MPI_Comm_split(pme->mpi_comm,pme->nodeid/nnodes_minor,
 +                           pme->nodeid,&pme->mpi_comm_d[1]);  /* My communicator along minor dimension */
 +
 +            MPI_Comm_rank(pme->mpi_comm_d[0],&pme->nodeid_major);
 +            MPI_Comm_size(pme->mpi_comm_d[0],&pme->nnodes_major);
 +            MPI_Comm_rank(pme->mpi_comm_d[1],&pme->nodeid_minor);
 +            MPI_Comm_size(pme->mpi_comm_d[1],&pme->nnodes_minor);
 +#endif
 +        }
 +        pme->bPPnode = (cr->duty & DUTY_PP);
 +    }
 +
 +    pme->nthread = nthread;
 +
 +    if (ir->ePBC == epbcSCREW)
 +    {
 +        gmx_fatal(FARGS,"pme does not (yet) work with pbc = screw");
 +    }
 +
 +    pme->bFEP        = ((ir->efep != efepNO) && bFreeEnergy);
 +    pme->nkx         = ir->nkx;
 +    pme->nky         = ir->nky;
 +    pme->nkz         = ir->nkz;
++    pme->bP3M        = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL);
 +    pme->pme_order   = ir->pme_order;
 +    pme->epsilon_r   = ir->epsilon_r;
 +
 +    if (pme->pme_order > PME_ORDER_MAX)
 +    {
 +        gmx_fatal(FARGS,"pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
 +                  pme->pme_order,PME_ORDER_MAX);
 +    }
 +
 +    /* Currently pme.c supports only the fft5d FFT code.
 +     * Therefore the grid always needs to be divisible by nnodes.
 +     * When the old 1D code is also supported again, change this check.
 +     *
 +     * This check should be done before calling gmx_pme_init
 +     * and fplog should be passed iso stderr.
 +     *
 +    if (pme->ndecompdim >= 2)
 +    */
 +    if (pme->ndecompdim >= 1)
 +    {
 +        /*
 +        gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
 +                                        'x',nnodes_major,&pme->nkx);
 +        gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
 +                                        'y',nnodes_minor,&pme->nky);
 +        */
 +    }
 +
 +    if (pme->nkx <= pme->pme_order*(pme->nnodes_major > 1 ? 2 : 1) ||
 +        pme->nky <= pme->pme_order*(pme->nnodes_minor > 1 ? 2 : 1) ||
 +        pme->nkz <= pme->pme_order)
 +    {
 +        gmx_fatal(FARGS,"The pme grid dimensions need to be larger than pme_order (%d) and in parallel larger than 2*pme_ordern for x and/or y",pme->pme_order);
 +    }
 +
 +    if (pme->nnodes > 1) {
 +        double imbal;
 +
 +#ifdef GMX_MPI
 +        MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
 +        MPI_Type_commit(&(pme->rvec_mpi));
 +#endif
 +
 +        /* Note that the charge spreading and force gathering, which usually
 +         * takes about the same amount of time as FFT+solve_pme,
 +         * is always fully load balanced
 +         * (unless the charge distribution is inhomogeneous).
 +         */
 +
 +        imbal = pme_load_imbalance(pme);
 +        if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
 +        {
 +            fprintf(stderr,
 +                    "\n"
 +                    "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
 +                    "      For optimal PME load balancing\n"
 +                    "      PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
 +                    "      and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
 +                    "\n",
 +                    (int)((imbal-1)*100 + 0.5),
 +                    pme->nkx,pme->nky,pme->nnodes_major,
 +                    pme->nky,pme->nkz,pme->nnodes_minor);
 +        }
 +    }
 +
 +    /* For non-divisible grid we need pme_order iso pme_order-1 */
 +    /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
 +     * y is always copied through a buffer: we don't need padding in z,
 +     * but we do need the overlap in x because of the communication order.
 +     */
 +    init_overlap_comm(&pme->overlap[0],pme->pme_order,
 +#ifdef GMX_MPI
 +                      pme->mpi_comm_d[0],
 +#endif
 +                      pme->nnodes_major,pme->nodeid_major,
 +                      pme->nkx,
 +                      (div_round_up(pme->nky,pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
 +
 +    init_overlap_comm(&pme->overlap[1],pme->pme_order,
 +#ifdef GMX_MPI
 +                      pme->mpi_comm_d[1],
 +#endif
 +                      pme->nnodes_minor,pme->nodeid_minor,
 +                      pme->nky,
 +                      (div_round_up(pme->nkx,pme->nnodes_major)+pme->pme_order)*pme->nkz);
 +
 +    /* Check for a limitation of the (current) sum_fftgrid_dd code */
 +    if (pme->nthread > 1 &&
 +        (pme->overlap[0].noverlap_nodes > 1 ||
 +         pme->overlap[1].noverlap_nodes > 1))
 +    {
 +        gmx_fatal(FARGS,"With threads the number of grid lines per node along x and or y should be pme_order (%d) or more or exactly pme_order-1",pme->pme_order);
 +    }
 +
 +    snew(pme->bsp_mod[XX],pme->nkx);
 +    snew(pme->bsp_mod[YY],pme->nky);
 +    snew(pme->bsp_mod[ZZ],pme->nkz);
 +
 +    /* The required size of the interpolation grid, including overlap.
 +     * The allocated size (pmegrid_n?) might be slightly larger.
 +     */
 +    pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
 +                      pme->overlap[0].s2g0[pme->nodeid_major];
 +    pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
 +                      pme->overlap[1].s2g0[pme->nodeid_minor];
 +    pme->pmegrid_nz_base = pme->nkz;
 +    pme->pmegrid_nz = pme->pmegrid_nz_base + pme->pme_order - 1;
 +    set_grid_alignment(&pme->pmegrid_nz,pme->pme_order);
 +
 +    pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
 +    pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
 +    pme->pmegrid_start_iz = 0;
 +
 +    make_gridindex5_to_localindex(pme->nkx,
 +                                  pme->pmegrid_start_ix,
 +                                  pme->pmegrid_nx - (pme->pme_order-1),
 +                                  &pme->nnx,&pme->fshx);
 +    make_gridindex5_to_localindex(pme->nky,
 +                                  pme->pmegrid_start_iy,
 +                                  pme->pmegrid_ny - (pme->pme_order-1),
 +                                  &pme->nny,&pme->fshy);
 +    make_gridindex5_to_localindex(pme->nkz,
 +                                  pme->pmegrid_start_iz,
 +                                  pme->pmegrid_nz_base,
 +                                  &pme->nnz,&pme->fshz);
 +
 +    pmegrids_init(&pme->pmegridA,
 +                  pme->pmegrid_nx,pme->pmegrid_ny,pme->pmegrid_nz,
 +                  pme->pmegrid_nz_base,
 +                  pme->pme_order,
 +                  pme->nthread,
 +                  pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
 +                  pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
 +
 +    sse_mask_init(&pme->spline_work,pme->pme_order);
 +
 +    ndata[0] = pme->nkx;
 +    ndata[1] = pme->nky;
 +    ndata[2] = pme->nkz;
 +
 +    /* This routine will allocate the grid data to fit the FFTs */
 +    gmx_parallel_3dfft_init(&pme->pfft_setupA,ndata,
 +                            &pme->fftgridA,&pme->cfftgridA,
 +                            pme->mpi_comm_d,
 +                            pme->overlap[0].s2g0,pme->overlap[1].s2g0,
 +                            bReproducible,pme->nthread);
 +
 +    if (bFreeEnergy)
 +    {
 +        pmegrids_init(&pme->pmegridB,
 +                      pme->pmegrid_nx,pme->pmegrid_ny,pme->pmegrid_nz,
 +                      pme->pmegrid_nz_base,
 +                      pme->pme_order,
 +                      pme->nthread,
 +                      pme->nkx % pme->nnodes_major != 0,
 +                      pme->nky % pme->nnodes_minor != 0);
 +
 +        gmx_parallel_3dfft_init(&pme->pfft_setupB,ndata,
 +                                &pme->fftgridB,&pme->cfftgridB,
 +                                pme->mpi_comm_d,
 +                                pme->overlap[0].s2g0,pme->overlap[1].s2g0,
 +                                bReproducible,pme->nthread);
 +    }
 +    else
 +    {
 +        pme->pmegridB.grid.grid = NULL;
 +        pme->fftgridB           = NULL;
 +        pme->cfftgridB          = NULL;
 +    }
 +
-         if (grids->nthread == 1)
++    if (!pme->bP3M)
++    {
++        /* Use plain SPME B-spline interpolation */
++        make_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
++    }
++    else
++    {
++        /* Use the P3M grid-optimized influence function */
++        make_p3m_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
++    }
 +
 +    /* Use atc[0] for spreading */
 +    init_atomcomm(pme,&pme->atc[0],cr,nnodes_major > 1 ? 0 : 1,TRUE);
 +    if (pme->ndecompdim >= 2)
 +    {
 +        init_atomcomm(pme,&pme->atc[1],cr,1,FALSE);
 +    }
 +
 +    if (pme->nnodes == 1) {
 +        pme->atc[0].n = homenr;
 +        pme_realloc_atomcomm_things(&pme->atc[0]);
 +    }
 +
 +    {
 +        int thread;
 +
 +        /* Use fft5d, order after FFT is y major, z, x minor */
 +
 +        snew(pme->work,pme->nthread);
 +        for(thread=0; thread<pme->nthread; thread++)
 +        {
 +            realloc_work(&pme->work[thread],pme->nkx);
 +        }
 +    }
 +
 +    *pmedata = pme;
 +
 +    return 0;
 +}
 +
 +
 +static void copy_local_grid(gmx_pme_t pme,
 +                            pmegrids_t *pmegrids,int thread,real *fftgrid)
 +{
 +    ivec local_fft_ndata,local_fft_offset,local_fft_size;
 +    int  fft_my,fft_mz;
 +    int  nsx,nsy,nsz;
 +    ivec nf;
 +    int  offx,offy,offz,x,y,z,i0,i0t;
 +    int  d;
 +    pmegrid_t *pmegrid;
 +    real *grid_th;
 +
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
 +                                   local_fft_ndata,
 +                                   local_fft_offset,
 +                                   local_fft_size);
 +    fft_my = local_fft_size[YY];
 +    fft_mz = local_fft_size[ZZ];
 +
 +    pmegrid = &pmegrids->grid_th[thread];
 +
 +    nsx = pmegrid->n[XX];
 +    nsy = pmegrid->n[YY];
 +    nsz = pmegrid->n[ZZ];
 +
 +    for(d=0; d<DIM; d++)
 +    {
 +        nf[d] = min(pmegrid->n[d] - (pmegrid->order - 1),
 +                    local_fft_ndata[d] - pmegrid->offset[d]);
 +    }
 +
 +    offx = pmegrid->offset[XX];
 +    offy = pmegrid->offset[YY];
 +    offz = pmegrid->offset[ZZ];
 +
 +    /* Directly copy the non-overlapping parts of the local grids.
 +     * This also initializes the full grid.
 +     */
 +    grid_th = pmegrid->grid;
 +    for(x=0; x<nf[XX]; x++)
 +    {
 +        for(y=0; y<nf[YY]; y++)
 +        {
 +            i0  = ((offx + x)*fft_my + (offy + y))*fft_mz + offz;
 +            i0t = (x*nsy + y)*nsz;
 +            for(z=0; z<nf[ZZ]; z++)
 +            {
 +                fftgrid[i0+z] = grid_th[i0t+z];
 +            }
 +        }
 +    }
 +}
 +
 +static void print_sendbuf(gmx_pme_t pme,real *sendbuf)
 +{
 +    ivec local_fft_ndata,local_fft_offset,local_fft_size;
 +    pme_overlap_t *overlap;
 +    int datasize,nind;
 +    int i,x,y,z,n;
 +
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
 +                                   local_fft_ndata,
 +                                   local_fft_offset,
 +                                   local_fft_size);
 +    /* Major dimension */
 +    overlap = &pme->overlap[0];
 +
 +    nind   = overlap->comm_data[0].send_nindex;
 +
 +    for(y=0; y<local_fft_ndata[YY]; y++) {
 +         printf(" %2d",y);
 +    }
 +    printf("\n");
 +
 +    i = 0;
 +    for(x=0; x<nind; x++) {
 +        for(y=0; y<local_fft_ndata[YY]; y++) {
 +            n = 0;
 +            for(z=0; z<local_fft_ndata[ZZ]; z++) {
 +                if (sendbuf[i] != 0) n++;
 +                i++;
 +            }
 +            printf(" %2d",n);
 +        }
 +        printf("\n");
 +    }
 +}
 +
 +static void
 +reduce_threadgrid_overlap(gmx_pme_t pme,
 +                          const pmegrids_t *pmegrids,int thread,
 +                          real *fftgrid,real *commbuf_x,real *commbuf_y)
 +{
 +    ivec local_fft_ndata,local_fft_offset,local_fft_size;
 +    int  fft_nx,fft_ny,fft_nz;
 +    int  fft_my,fft_mz;
 +    int  buf_my=-1;
 +    int  nsx,nsy,nsz;
 +    ivec ne;
 +    int  offx,offy,offz,x,y,z,i0,i0t;
 +    int  sx,sy,sz,fx,fy,fz,tx1,ty1,tz1,ox,oy,oz;
 +    gmx_bool bClearBufX,bClearBufY,bClearBufXY,bClearBuf;
 +    gmx_bool bCommX,bCommY;
 +    int  d;
 +    int  thread_f;
 +    const pmegrid_t *pmegrid,*pmegrid_g,*pmegrid_f;
 +    const real *grid_th;
 +    real *commbuf=NULL;
 +
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
 +                                   local_fft_ndata,
 +                                   local_fft_offset,
 +                                   local_fft_size);
 +    fft_nx = local_fft_ndata[XX];
 +    fft_ny = local_fft_ndata[YY];
 +    fft_nz = local_fft_ndata[ZZ];
 +
 +    fft_my = local_fft_size[YY];
 +    fft_mz = local_fft_size[ZZ];
 +
 +    /* This routine is called when all thread have finished spreading.
 +     * Here each thread sums grid contributions calculated by other threads
 +     * to the thread local grid volume.
 +     * To minimize the number of grid copying operations,
 +     * this routines sums immediately from the pmegrid to the fftgrid.
 +     */
 +
 +    /* Determine which part of the full node grid we should operate on,
 +     * this is our thread local part of the full grid.
 +     */
 +    pmegrid = &pmegrids->grid_th[thread];
 +
 +    for(d=0; d<DIM; d++)
 +    {
 +        ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
 +                    local_fft_ndata[d]);
 +    }
 +
 +    offx = pmegrid->offset[XX];
 +    offy = pmegrid->offset[YY];
 +    offz = pmegrid->offset[ZZ];
 +
 +
 +    bClearBufX  = TRUE;
 +    bClearBufY  = TRUE;
 +    bClearBufXY = TRUE;
 +
 +    /* Now loop over all the thread data blocks that contribute
 +     * to the grid region we (our thread) are operating on.
 +     */
 +    /* Note that ffy_nx/y is equal to the number of grid points
 +     * between the first point of our node grid and the one of the next node.
 +     */
 +    for(sx=0; sx>=-pmegrids->nthread_comm[XX]; sx--)
 +    {
 +        fx = pmegrid->ci[XX] + sx;
 +        ox = 0;
 +        bCommX = FALSE;
 +        if (fx < 0) {
 +            fx += pmegrids->nc[XX];
 +            ox -= fft_nx;
 +            bCommX = (pme->nnodes_major > 1);
 +        }
 +        pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
 +        ox += pmegrid_g->offset[XX];
 +        if (!bCommX)
 +        {
 +            tx1 = min(ox + pmegrid_g->n[XX],ne[XX]);
 +        }
 +        else
 +        {
 +            tx1 = min(ox + pmegrid_g->n[XX],pme->pme_order);
 +        }
 +
 +        for(sy=0; sy>=-pmegrids->nthread_comm[YY]; sy--)
 +        {
 +            fy = pmegrid->ci[YY] + sy;
 +            oy = 0;
 +            bCommY = FALSE;
 +            if (fy < 0) {
 +                fy += pmegrids->nc[YY];
 +                oy -= fft_ny;
 +                bCommY = (pme->nnodes_minor > 1);
 +            }
 +            pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
 +            oy += pmegrid_g->offset[YY];
 +            if (!bCommY)
 +            {
 +                ty1 = min(oy + pmegrid_g->n[YY],ne[YY]);
 +            }
 +            else
 +            {
 +                ty1 = min(oy + pmegrid_g->n[YY],pme->pme_order);
 +            }
 +
 +            for(sz=0; sz>=-pmegrids->nthread_comm[ZZ]; sz--)
 +            {
 +                fz = pmegrid->ci[ZZ] + sz;
 +                oz = 0;
 +                if (fz < 0)
 +                {
 +                    fz += pmegrids->nc[ZZ];
 +                    oz -= fft_nz;
 +                }
 +                pmegrid_g = &pmegrids->grid_th[fz];
 +                oz += pmegrid_g->offset[ZZ];
 +                tz1 = min(oz + pmegrid_g->n[ZZ],ne[ZZ]);
 +
 +                if (sx == 0 && sy == 0 && sz == 0)
 +                {
 +                    /* We have already added our local contribution
 +                     * before calling this routine, so skip it here.
 +                     */
 +                    continue;
 +                }
 +
 +                thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz;
 +
 +                pmegrid_f = &pmegrids->grid_th[thread_f];
 +
 +                grid_th = pmegrid_f->grid;
 +
 +                nsx = pmegrid_f->n[XX];
 +                nsy = pmegrid_f->n[YY];
 +                nsz = pmegrid_f->n[ZZ];
 +
 +#ifdef DEBUG_PME_REDUCE
 +                printf("n%d t%d add %d  %2d %2d %2d  %2d %2d %2d  %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
 +                       pme->nodeid,thread,thread_f,
 +                       pme->pmegrid_start_ix,
 +                       pme->pmegrid_start_iy,
 +                       pme->pmegrid_start_iz,
 +                       sx,sy,sz,
 +                       offx-ox,tx1-ox,offx,tx1,
 +                       offy-oy,ty1-oy,offy,ty1,
 +                       offz-oz,tz1-oz,offz,tz1);
 +#endif
 +
 +                if (!(bCommX || bCommY))
 +                {
 +                    /* Copy from the thread local grid to the node grid */
 +                    for(x=offx; x<tx1; x++)
 +                    {
 +                        for(y=offy; y<ty1; y++)
 +                        {
 +                            i0  = (x*fft_my + y)*fft_mz;
 +                            i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
 +                            for(z=offz; z<tz1; z++)
 +                            {
 +                                fftgrid[i0+z] += grid_th[i0t+z];
 +                            }
 +                        }
 +                    }
 +                }
 +                else
 +                {
 +                    /* The order of this conditional decides
 +                     * where the corner volume gets stored with x+y decomp.
 +                     */
 +                    if (bCommY)
 +                    {
 +                        commbuf = commbuf_y;
 +                        buf_my  = ty1 - offy;
 +                        if (bCommX)
 +                        {
 +                            /* We index commbuf modulo the local grid size */
 +                            commbuf += buf_my*fft_nx*fft_nz;
 +
 +                            bClearBuf  = bClearBufXY;
 +                            bClearBufXY = FALSE;
 +                        }
 +                        else
 +                        {
 +                            bClearBuf  = bClearBufY;
 +                            bClearBufY = FALSE;
 +                        }
 +                    }
 +                    else
 +                    {
 +                        commbuf = commbuf_x;
 +                        buf_my  = fft_ny;
 +                        bClearBuf  = bClearBufX;
 +                        bClearBufX = FALSE;
 +                    }
 +
 +                    /* Copy to the communication buffer */
 +                    for(x=offx; x<tx1; x++)
 +                    {
 +                        for(y=offy; y<ty1; y++)
 +                        {
 +                            i0  = (x*buf_my + y)*fft_nz;
 +                            i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
 +
 +                            if (bClearBuf)
 +                            {
 +                                /* First access of commbuf, initialize it */
 +                                for(z=offz; z<tz1; z++)
 +                                {
 +                                    commbuf[i0+z]  = grid_th[i0t+z];
 +                                }
 +                            }
 +                            else
 +                            {
 +                                for(z=offz; z<tz1; z++)
 +                                {
 +                                    commbuf[i0+z] += grid_th[i0t+z];
 +                                }
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +
 +static void sum_fftgrid_dd(gmx_pme_t pme,real *fftgrid)
 +{
 +    ivec local_fft_ndata,local_fft_offset,local_fft_size;
 +    pme_overlap_t *overlap;
 +    int  send_nindex;
 +    int  recv_index0,recv_nindex;
 +#ifdef GMX_MPI
 +    MPI_Status stat;
 +#endif
 +    int  ipulse,send_id,recv_id,datasize,gridsize,size_yx;
 +    real *sendptr,*recvptr;
 +    int  x,y,z,indg,indb;
 +
 +    /* Note that this routine is only used for forward communication.
 +     * Since the force gathering, unlike the charge spreading,
 +     * can be trivially parallelized over the particles,
 +     * the backwards process is much simpler and can use the "old"
 +     * communication setup.
 +     */
 +
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
 +                                   local_fft_ndata,
 +                                   local_fft_offset,
 +                                   local_fft_size);
 +
 +    /* Currently supports only a single communication pulse */
 +
 +/* for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++) */
 +    if (pme->nnodes_minor > 1)
 +    {
 +        /* Major dimension */
 +        overlap = &pme->overlap[1];
 +
 +        if (pme->nnodes_major > 1)
 +        {
 +             size_yx = pme->overlap[0].comm_data[0].send_nindex;
 +        }
 +        else
 +        {
 +            size_yx = 0;
 +        }
 +        datasize = (local_fft_ndata[XX]+size_yx)*local_fft_ndata[ZZ];
 +
 +        ipulse = 0;
 +
 +        send_id = overlap->send_id[ipulse];
 +        recv_id = overlap->recv_id[ipulse];
 +        send_nindex   = overlap->comm_data[ipulse].send_nindex;
 +        /* recv_index0   = overlap->comm_data[ipulse].recv_index0; */
 +        recv_index0 = 0;
 +        recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
 +
 +        sendptr = overlap->sendbuf;
 +        recvptr = overlap->recvbuf;
 +
 +        /*
 +        printf("node %d comm %2d x %2d x %2d\n",pme->nodeid,
 +               local_fft_ndata[XX]+size_yx,send_nindex,local_fft_ndata[ZZ]);
 +        printf("node %d send %f, %f\n",pme->nodeid,
 +               sendptr[0],sendptr[send_nindex*datasize-1]);
 +        */
 +
 +#ifdef GMX_MPI
 +        MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
 +                     send_id,ipulse,
 +                     recvptr,recv_nindex*datasize,GMX_MPI_REAL,
 +                     recv_id,ipulse,
 +                     overlap->mpi_comm,&stat);
 +#endif
 +
 +        for(x=0; x<local_fft_ndata[XX]; x++)
 +        {
 +            for(y=0; y<recv_nindex; y++)
 +            {
 +                indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
 +                indb = (x*recv_nindex        + y)*local_fft_ndata[ZZ];
 +                for(z=0; z<local_fft_ndata[ZZ]; z++)
 +                {
 +                    fftgrid[indg+z] += recvptr[indb+z];
 +                }
 +            }
 +        }
 +        if (pme->nnodes_major > 1)
 +        {
 +            sendptr = pme->overlap[0].sendbuf;
 +            for(x=0; x<size_yx; x++)
 +            {
 +                for(y=0; y<recv_nindex; y++)
 +                {
 +                    indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
 +                    indb = ((local_fft_ndata[XX] + x)*recv_nindex +y)*local_fft_ndata[ZZ];
 +                    for(z=0; z<local_fft_ndata[ZZ]; z++)
 +                    {
 +                        sendptr[indg+z] += recvptr[indb+z];
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    /* for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++) */
 +    if (pme->nnodes_major > 1)
 +    {
 +        /* Major dimension */
 +        overlap = &pme->overlap[0];
 +
 +        datasize = local_fft_ndata[YY]*local_fft_ndata[ZZ];
 +        gridsize = local_fft_size[YY] *local_fft_size[ZZ];
 +
 +        ipulse = 0;
 +
 +        send_id = overlap->send_id[ipulse];
 +        recv_id = overlap->recv_id[ipulse];
 +        send_nindex   = overlap->comm_data[ipulse].send_nindex;
 +        /* recv_index0   = overlap->comm_data[ipulse].recv_index0; */
 +        recv_index0 = 0;
 +        recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
 +
 +        sendptr = overlap->sendbuf;
 +        recvptr = overlap->recvbuf;
 +
 +        if (debug != NULL)
 +        {
 +            fprintf(debug,"PME fftgrid comm %2d x %2d x %2d\n",
 +                   send_nindex,local_fft_ndata[YY],local_fft_ndata[ZZ]);
 +        }
 +
 +#ifdef GMX_MPI
 +        MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
 +                     send_id,ipulse,
 +                     recvptr,recv_nindex*datasize,GMX_MPI_REAL,
 +                     recv_id,ipulse,
 +                     overlap->mpi_comm,&stat);
 +#endif
 +
 +        for(x=0; x<recv_nindex; x++)
 +        {
 +            for(y=0; y<local_fft_ndata[YY]; y++)
 +            {
 +                indg = (x*local_fft_size[YY]  + y)*local_fft_size[ZZ];
 +                indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
 +                for(z=0; z<local_fft_ndata[ZZ]; z++)
 +                {
 +                    fftgrid[indg+z] += recvptr[indb+z];
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +
 +static void spread_on_grid(gmx_pme_t pme,
 +                           pme_atomcomm_t *atc,pmegrids_t *grids,
 +                           gmx_bool bCalcSplines,gmx_bool bSpread,
 +                           real *fftgrid)
 +{
 +    int nthread,thread;
 +#ifdef PME_TIME_THREADS
 +    gmx_cycles_t c1,c2,c3,ct1a,ct1b,ct1c;
 +    static double cs1=0,cs2=0,cs3=0;
 +    static double cs1a[6]={0,0,0,0,0,0};
 +    static int cnt=0;
 +#endif
 +
 +    nthread = pme->nthread;
 +
 +#ifdef PME_TIME_THREADS
 +    c1 = omp_cyc_start();
 +#endif
 +    if (bCalcSplines)
 +    {
 +#pragma omp parallel for num_threads(nthread) schedule(static)
 +        for(thread=0; thread<nthread; thread++)
 +        {
 +            int start,end;
 +
 +            start = atc->n* thread   /nthread;
 +            end   = atc->n*(thread+1)/nthread;
 +
 +            /* Compute fftgrid index for all atoms,
 +             * with help of some extra variables.
 +             */
 +            calc_interpolation_idx(pme,atc,start,end,thread);
 +        }
 +    }
 +#ifdef PME_TIME_THREADS
 +    c1 = omp_cyc_end(c1);
 +    cs1 += (double)c1;
 +#endif
 +
 +#ifdef PME_TIME_THREADS
 +    c2 = omp_cyc_start();
 +#endif
 +#pragma omp parallel for num_threads(nthread) schedule(static)
 +    for(thread=0; thread<nthread; thread++)
 +    {
 +        splinedata_t *spline;
 +        pmegrid_t *grid;
 +
 +        /* make local bsplines  */
-     if (grids->nthread > 1)
++        if (grids == NULL || grids->nthread == 1)
 +        {
 +            spline = &atc->spline[0];
 +
 +            spline->n = atc->n;
 +
 +            grid = &grids->grid;
 +        }
 +        else
 +        {
 +            spline = &atc->spline[thread];
 +
 +            make_thread_local_ind(atc,thread,spline);
 +
 +            grid = &grids->grid_th[thread];
 +        }
 +
 +        if (bCalcSplines)
 +        {
 +            make_bsplines(spline->theta,spline->dtheta,pme->pme_order,
 +                          atc->fractx,spline->n,spline->ind,atc->q,pme->bFEP);
 +        }
 +
 +        if (bSpread)
 +        {
 +            /* put local atoms on grid. */
 +#ifdef PME_TIME_SPREAD
 +            ct1a = omp_cyc_start();
 +#endif
 +            spread_q_bsplines_thread(grid,atc,spline,&pme->spline_work);
 +
 +            if (grids->nthread > 1)
 +            {
 +                copy_local_grid(pme,grids,thread,fftgrid);
 +            }
 +#ifdef PME_TIME_SPREAD
 +            ct1a = omp_cyc_end(ct1a);
 +            cs1a[thread] += (double)ct1a;
 +#endif
 +        }
 +    }
 +#ifdef PME_TIME_THREADS
 +    c2 = omp_cyc_end(c2);
 +    cs2 += (double)c2;
 +#endif
 +
-                                   flags & GMX_PME_CALC_ENER_VIR,
++    if (bSpread && grids->nthread > 1)
 +    {
 +#ifdef PME_TIME_THREADS
 +        c3 = omp_cyc_start();
 +#endif
 +#pragma omp parallel for num_threads(grids->nthread) schedule(static)
 +        for(thread=0; thread<grids->nthread; thread++)
 +        {
 +            reduce_threadgrid_overlap(pme,grids,thread,
 +                                      fftgrid,
 +                                      pme->overlap[0].sendbuf,
 +                                      pme->overlap[1].sendbuf);
 +#ifdef PRINT_PME_SENDBUF
 +            print_sendbuf(pme,pme->overlap[0].sendbuf);
 +#endif
 +        }
 +#ifdef PME_TIME_THREADS
 +        c3 = omp_cyc_end(c3);
 +        cs3 += (double)c3;
 +#endif
 +
 +        if (pme->nnodes > 1)
 +        {
 +            /* Communicate the overlapping part of the fftgrid */
 +            sum_fftgrid_dd(pme,fftgrid);
 +        }
 +    }
 +
 +#ifdef PME_TIME_THREADS
 +    cnt++;
 +    if (cnt % 20 == 0)
 +    {
 +        printf("idx %.2f spread %.2f red %.2f",
 +               cs1*1e-9,cs2*1e-9,cs3*1e-9);
 +#ifdef PME_TIME_SPREAD
 +        for(thread=0; thread<nthread; thread++)
 +            printf(" %.2f",cs1a[thread]*1e-9);
 +#endif
 +        printf("\n");
 +    }
 +#endif
 +}
 +
 +
 +static void dump_grid(FILE *fp,
 +                      int sx,int sy,int sz,int nx,int ny,int nz,
 +                      int my,int mz,const real *g)
 +{
 +    int x,y,z;
 +
 +    for(x=0; x<nx; x++)
 +    {
 +        for(y=0; y<ny; y++)
 +        {
 +            for(z=0; z<nz; z++)
 +            {
 +                fprintf(fp,"%2d %2d %2d %6.3f\n",
 +                        sx+x,sy+y,sz+z,g[(x*my + y)*mz + z]);
 +            }
 +        }
 +    }
 +}
 +
 +static void dump_local_fftgrid(gmx_pme_t pme,const real *fftgrid)
 +{
 +    ivec local_fft_ndata,local_fft_offset,local_fft_size;
 +
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
 +                                   local_fft_ndata,
 +                                   local_fft_offset,
 +                                   local_fft_size);
 +
 +    dump_grid(stderr,
 +              pme->pmegrid_start_ix,
 +              pme->pmegrid_start_iy,
 +              pme->pmegrid_start_iz,
 +              pme->pmegrid_nx-pme->pme_order+1,
 +              pme->pmegrid_ny-pme->pme_order+1,
 +              pme->pmegrid_nz-pme->pme_order+1,
 +              local_fft_size[YY],
 +              local_fft_size[ZZ],
 +              fftgrid);
 +}
 +
 +
 +void gmx_pme_calc_energy(gmx_pme_t pme,int n,rvec *x,real *q,real *V)
 +{
 +    pme_atomcomm_t *atc;
 +    pmegrids_t *grid;
 +
 +    if (pme->nnodes > 1)
 +    {
 +        gmx_incons("gmx_pme_calc_energy called in parallel");
 +    }
 +    if (pme->bFEP > 1)
 +    {
 +        gmx_incons("gmx_pme_calc_energy with free energy");
 +    }
 +
 +    atc = &pme->atc_energy;
++    atc->nthread   = 1;
++    if (atc->spline == NULL)
++    {
++        snew(atc->spline,atc->nthread);
++    }
 +    atc->nslab     = 1;
 +    atc->bSpread   = TRUE;
 +    atc->pme_order = pme->pme_order;
 +    atc->n         = n;
 +    pme_realloc_atomcomm_things(atc);
 +    atc->x         = x;
 +    atc->q         = q;
 +
 +    /* We only use the A-charges grid */
 +    grid = &pme->pmegridA;
 +
 +    spread_on_grid(pme,atc,NULL,TRUE,FALSE,pme->fftgridA);
 +
 +    *V = gather_energy_bsplines(pme,grid->grid.grid,atc);
 +}
 +
 +
 +static void reset_pmeonly_counters(t_commrec *cr,gmx_wallcycle_t wcycle,
 +        t_nrnb *nrnb,t_inputrec *ir, gmx_large_int_t step_rel)
 +{
 +    /* Reset all the counters related to performance over the run */
 +    wallcycle_stop(wcycle,ewcRUN);
 +    wallcycle_reset_all(wcycle);
 +    init_nrnb(nrnb);
 +    ir->init_step += step_rel;
 +    ir->nsteps    -= step_rel;
 +    wallcycle_start(wcycle,ewcRUN);
 +}
 +
 +
 +int gmx_pmeonly(gmx_pme_t pme,
 +                t_commrec *cr,    t_nrnb *nrnb,
 +                gmx_wallcycle_t wcycle,
 +                real ewaldcoeff,  gmx_bool bGatherOnly,
 +                t_inputrec *ir)
 +{
 +    gmx_pme_pp_t pme_pp;
 +    int  natoms;
 +    matrix box;
 +    rvec *x_pp=NULL,*f_pp=NULL;
 +    real *chargeA=NULL,*chargeB=NULL;
 +    real lambda=0;
 +    int  maxshift_x=0,maxshift_y=0;
 +    real energy,dvdlambda;
 +    matrix vir;
 +    float cycles;
 +    int  count;
 +    gmx_bool bEnerVir;
 +    gmx_large_int_t step,step_rel;
 +
 +
 +    pme_pp = gmx_pme_pp_init(cr);
 +
 +    init_nrnb(nrnb);
 +
 +    count = 0;
 +    do /****** this is a quasi-loop over time steps! */
 +    {
 +        /* Domain decomposition */
 +        natoms = gmx_pme_recv_q_x(pme_pp,
 +                                  &chargeA,&chargeB,box,&x_pp,&f_pp,
 +                                  &maxshift_x,&maxshift_y,
 +                                  &pme->bFEP,&lambda,
 +                                  &bEnerVir,
 +                                  &step);
 +
 +        if (natoms == -1) {
 +            /* We should stop: break out of the loop */
 +            break;
 +        }
 +
 +        step_rel = step - ir->init_step;
 +
 +        if (count == 0)
 +            wallcycle_start(wcycle,ewcRUN);
 +
 +        wallcycle_start(wcycle,ewcPMEMESH);
 +
 +        dvdlambda = 0;
 +        clear_mat(vir);
 +        gmx_pme_do(pme,0,natoms,x_pp,f_pp,chargeA,chargeB,box,
 +                   cr,maxshift_x,maxshift_y,nrnb,wcycle,vir,ewaldcoeff,
 +                   &energy,lambda,&dvdlambda,
 +                   GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
 +
 +        cycles = wallcycle_stop(wcycle,ewcPMEMESH);
 +
 +        gmx_pme_send_force_vir_ener(pme_pp,
 +                                    f_pp,vir,energy,dvdlambda,
 +                                    cycles);
 +
 +        count++;
 +
 +        if (step_rel == wcycle_get_reset_counters(wcycle))
 +        {
 +            /* Reset all the counters related to performance over the run */
 +            reset_pmeonly_counters(cr,wcycle,nrnb,ir,step_rel);
 +            wcycle_set_reset_counters(wcycle, 0);
 +        }
 +
 +    } /***** end of quasi-loop, we stop with the break above */
 +    while (TRUE);
 +
 +    return 0;
 +}
 +
 +int gmx_pme_do(gmx_pme_t pme,
 +               int start,       int homenr,
 +               rvec x[],        rvec f[],
 +               real *chargeA,   real *chargeB,
 +               matrix box, t_commrec *cr,
 +               int  maxshift_x, int maxshift_y,
 +               t_nrnb *nrnb,    gmx_wallcycle_t wcycle,
 +               matrix vir,      real ewaldcoeff,
 +               real *energy,    real lambda,
 +               real *dvdlambda, int flags)
 +{
 +    int     q,d,i,j,ntot,npme;
 +    int     nx,ny,nz;
 +    int     n_d,local_ny;
 +    pme_atomcomm_t *atc=NULL;
 +    pmegrids_t *pmegrid=NULL;
 +    real    *grid=NULL;
 +    real    *ptr;
 +    rvec    *x_d,*f_d;
 +    real    *charge=NULL,*q_d;
 +    real    energy_AB[2];
 +    matrix  vir_AB[2];
 +    gmx_bool bClearF;
 +    gmx_parallel_3dfft_t pfft_setup;
 +    real *  fftgrid;
 +    t_complex * cfftgrid;
 +    int     thread;
++    gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
++    gmx_bool bCalcF = flags & GMX_PME_CALC_F;
++
++    assert(pme->nnodes > 0);
++    assert(pme->nnodes == 1 || pme->ndecompdim > 0);
 +
 +    if (pme->nnodes > 1) {
 +        atc = &pme->atc[0];
 +        atc->npd = homenr;
 +        if (atc->npd > atc->pd_nalloc) {
 +            atc->pd_nalloc = over_alloc_dd(atc->npd);
 +            srenew(atc->pd,atc->pd_nalloc);
 +        }
 +        atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
 +    }
 +    else
 +    {
 +        /* This could be necessary for TPI */
 +        pme->atc[0].n = homenr;
 +    }
 +
 +    for(q=0; q<(pme->bFEP ? 2 : 1); q++) {
 +        if (q == 0) {
 +            pmegrid = &pme->pmegridA;
 +            fftgrid = pme->fftgridA;
 +            cfftgrid = pme->cfftgridA;
 +            pfft_setup = pme->pfft_setupA;
 +            charge = chargeA+start;
 +        } else {
 +            pmegrid = &pme->pmegridB;
 +            fftgrid = pme->fftgridB;
 +            cfftgrid = pme->cfftgridB;
 +            pfft_setup = pme->pfft_setupB;
 +            charge = chargeB+start;
 +        }
 +        grid = pmegrid->grid.grid;
 +        /* Unpack structure */
 +        if (debug) {
 +            fprintf(debug,"PME: nnodes = %d, nodeid = %d\n",
 +                    cr->nnodes,cr->nodeid);
 +            fprintf(debug,"Grid = %p\n",(void*)grid);
 +            if (grid == NULL)
 +                gmx_fatal(FARGS,"No grid!");
 +        }
 +        where();
 +
 +        m_inv_ur0(box,pme->recipbox);
 +
 +        if (pme->nnodes == 1) {
 +            atc = &pme->atc[0];
 +            if (DOMAINDECOMP(cr)) {
 +                atc->n = homenr;
 +                pme_realloc_atomcomm_things(atc);
 +            }
 +            atc->x = x;
 +            atc->q = charge;
 +            atc->f = f;
 +        } else {
 +            wallcycle_start(wcycle,ewcPME_REDISTXF);
 +            for(d=pme->ndecompdim-1; d>=0; d--)
 +            {
 +                if (d == pme->ndecompdim-1)
 +                {
 +                    n_d = homenr;
 +                    x_d = x + start;
 +                    q_d = charge;
 +                }
 +                else
 +                {
 +                    n_d = pme->atc[d+1].n;
 +                    x_d = atc->x;
 +                    q_d = atc->q;
 +                }
 +                atc = &pme->atc[d];
 +                atc->npd = n_d;
 +                if (atc->npd > atc->pd_nalloc) {
 +                    atc->pd_nalloc = over_alloc_dd(atc->npd);
 +                    srenew(atc->pd,atc->pd_nalloc);
 +                }
 +                atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
 +                pme_calc_pidx_wrapper(n_d,pme->recipbox,x_d,atc);
 +                where();
 +
 +                /* Redistribute x (only once) and qA or qB */
 +                if (DOMAINDECOMP(cr)) {
 +                    dd_pmeredist_x_q(pme, n_d, q==0, x_d, q_d, atc);
 +                } else {
 +                    pmeredist_pd(pme, TRUE, n_d, q==0, x_d, q_d, atc);
 +                }
 +            }
 +            where();
 +
 +            wallcycle_stop(wcycle,ewcPME_REDISTXF);
 +        }
 +
 +        if (debug)
 +            fprintf(debug,"Node= %6d, pme local particles=%6d\n",
 +                    cr->nodeid,atc->n);
 +
 +        if (flags & GMX_PME_SPREAD_Q)
 +        {
 +            wallcycle_start(wcycle,ewcPME_SPREADGATHER);
 +
 +            /* Spread the charges on a grid */
 +            spread_on_grid(pme,&pme->atc[0],pmegrid,q==0,TRUE,fftgrid);
 +
 +            if (q == 0)
 +            {
 +                inc_nrnb(nrnb,eNR_WEIGHTS,DIM*atc->n);
 +            }
 +            inc_nrnb(nrnb,eNR_SPREADQBSP,
 +                     pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
 +
 +            if (pme->nthread == 1)
 +            {
 +                wrap_periodic_pmegrid(pme,grid);
 +
 +                /* sum contributions to local grid from other nodes */
 +#ifdef GMX_MPI
 +                if (pme->nnodes > 1)
 +                {
 +                    gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_FORWARD);
 +                    where();
 +                }
 +#endif
 +
 +                copy_pmegrid_to_fftgrid(pme,grid,fftgrid);
 +            }
 +
 +            wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
 +
 +            /*
 +            dump_local_fftgrid(pme,fftgrid);
 +            exit(0);
 +            */
 +        }
 +
 +        /* Here we start a large thread parallel region */
 +#pragma omp parallel for num_threads(pme->nthread) schedule(static)
 +        for(thread=0; thread<pme->nthread; thread++)
 +        {
 +            if (flags & GMX_PME_SOLVE)
 +            {
 +                int loop_count;
 +
 +                /* do 3d-fft */
 +                if (thread == 0)
 +                {
 +                    wallcycle_start(wcycle,ewcPME_FFT);
 +                }
 +                gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_REAL_TO_COMPLEX,
 +                                           fftgrid,cfftgrid,thread,wcycle);
 +                if (thread == 0)
 +                {
 +                    wallcycle_stop(wcycle,ewcPME_FFT);
 +                }
 +                where();
 +
 +                /* solve in k-space for our local cells */
 +                if (thread == 0)
 +                {
 +                    wallcycle_start(wcycle,ewcPME_SOLVE);
 +                }
 +                loop_count =
 +                    solve_pme_yzx(pme,cfftgrid,ewaldcoeff,
 +                                  box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
-             if (flags & GMX_PME_CALC_F)
++                                  bCalcEnerVir,
 +                                  pme->nthread,thread);
 +                if (thread == 0)
 +                {
 +                    wallcycle_stop(wcycle,ewcPME_SOLVE);
 +                    where();
 +                    inc_nrnb(nrnb,eNR_SOLVEPME,loop_count);
 +                }
 +            }
 +
-         if (flags & GMX_PME_CALC_F)
++            if (bCalcF)
 +            {
 +                /* do 3d-invfft */
 +                if (thread == 0)
 +                {
 +                    where();
 +                    wallcycle_start(wcycle,ewcPME_FFT);
 +                }
 +                gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_COMPLEX_TO_REAL,
 +                                           cfftgrid,fftgrid,thread,wcycle);
 +                if (thread == 0)
 +                {
 +                    wallcycle_stop(wcycle,ewcPME_FFT);
 +
 +                    where();
 +
 +                    if (pme->nodeid == 0)
 +                    {
 +                        ntot = pme->nkx*pme->nky*pme->nkz;
 +                        npme  = ntot*log((real)ntot)/log(2.0);
 +                        inc_nrnb(nrnb,eNR_FFT,2*npme);
 +                    }
 +
 +                    wallcycle_start(wcycle,ewcPME_SPREADGATHER);
 +                }
 +
 +                copy_fftgrid_to_pmegrid(pme,fftgrid,grid,pme->nthread,thread);
 +            }
 +        }
 +        /* End of thread parallel section.
 +         * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
 +         */
 +
-         if (flags & GMX_PME_CALC_ENER_VIR)
++        if (bCalcF)
 +        {
 +            /* distribute local grid to all nodes */
 +#ifdef GMX_MPI
 +            if (pme->nnodes > 1) {
 +                gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_BACKWARD);
 +            }
 +#endif
 +            where();
 +
 +            unwrap_periodic_pmegrid(pme,grid);
 +
 +            /* interpolate forces for our local atoms */
 +
 +            where();
 +
 +            /* If we are running without parallelization,
 +             * atc->f is the actual force array, not a buffer,
 +             * therefore we should not clear it.
 +             */
 +            bClearF = (q == 0 && PAR(cr));
 +#pragma omp parallel for num_threads(pme->nthread) schedule(static)
 +            for(thread=0; thread<pme->nthread; thread++)
 +            {
 +                gather_f_bsplines(pme,grid,bClearF,atc,
 +                                  &atc->spline[thread],
 +                                  pme->bFEP ? (q==0 ? 1.0-lambda : lambda) : 1.0);
 +            }
 +
 +            where();
 +
 +            inc_nrnb(nrnb,eNR_GATHERFBSP,
 +                     pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
 +            wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
 +        }
 +
-     if ((flags & GMX_PME_CALC_F) && pme->nnodes > 1) {
++        if (bCalcEnerVir)
 +        {
 +            /* This should only be called on the master thread
 +             * and after the threads have synchronized.
 +             */
 +            get_pme_ener_vir(pme,pme->nthread,&energy_AB[q],vir_AB[q]);
 +        }
 +    } /* of q-loop */
 +
-     if (!pme->bFEP) {
-         *energy = energy_AB[0];
-         m_add(vir,vir_AB[0],vir);
-     } else {
-         *energy = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1];
-         *dvdlambda += energy_AB[1] - energy_AB[0];
-         for(i=0; i<DIM; i++)
-             for(j=0; j<DIM; j++)
-                 vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] + lambda*vir_AB[1][i][j];
++    if (bCalcF && pme->nnodes > 1) {
 +        wallcycle_start(wcycle,ewcPME_REDISTXF);
 +        for(d=0; d<pme->ndecompdim; d++)
 +        {
 +            atc = &pme->atc[d];
 +            if (d == pme->ndecompdim - 1)
 +            {
 +                n_d = homenr;
 +                f_d = f + start;
 +            }
 +            else
 +            {
 +                n_d = pme->atc[d+1].n;
 +                f_d = pme->atc[d+1].f;
 +            }
 +            if (DOMAINDECOMP(cr)) {
 +                dd_pmeredist_f(pme,atc,n_d,f_d,
 +                               d==pme->ndecompdim-1 && pme->bPPnode);
 +            } else {
 +                pmeredist_pd(pme, FALSE, n_d, TRUE, f_d, NULL, atc);
 +            }
 +        }
 +
 +        wallcycle_stop(wcycle,ewcPME_REDISTXF);
 +    }
 +    where();
 +
++    if (bCalcEnerVir)
++    {
++        if (!pme->bFEP) {
++            *energy = energy_AB[0];
++            m_add(vir,vir_AB[0],vir);
++        } else {
++            *energy = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1];
++            *dvdlambda += energy_AB[1] - energy_AB[0];
++            for(i=0; i<DIM; i++)
++            {
++                for(j=0; j<DIM; j++)
++                {
++                    vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] + 
++                        lambda*vir_AB[1][i][j];
++                }
++            }
++        }
++    }
++    else
++    {
++        *energy = 0;
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"PME mesh energy: %g\n",*energy);
 +    }
 +
 +    return 0;
 +}
index 30d13d9150084d105bbff004bca56d242da193be,0000000000000000000000000000000000000000..ab2287dbe3491ae4d9d1353378d7c745a286fd9e
mode 100644,000000..100644
--- /dev/null
@@@ -1,1031 -1,0 +1,1050 @@@
-             if (qS != atom[aS].qB)
-               gmx_fatal(FARGS,"polarize can not be used with qA != qB");
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2008, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 + 
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <string.h>
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "gmx_fatal.h"
 +#include "vec.h"
 +#include "txtdump.h"
 +#include "mdrun.h"
 +#include "partdec.h"
 +#include "mdatoms.h"
 +#include "vsite.h"
 +#include "network.h"
 +#include "names.h"
 +#include "constr.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "physics.h"
 +#include "copyrite.h"
 +#include "shellfc.h"
 +#include "mtop_util.h"
 +#include "chargegroup.h"
 +#include "macros.h"
 +
 +
 +typedef struct {
 +  int     nnucl;
 +  atom_id shell;              /* The shell id                         */
 +  atom_id nucl1,nucl2,nucl3;  /* The nuclei connected to the shell    */
 +  /* gmx_bool    bInterCG; */       /* Coupled to nuclei outside cg?        */
 +  real    k;                  /* force constant                       */
 +  real    k_1;                        /* 1 over force constant                */
 +  rvec    xold;
 +  rvec    fold;
 +  rvec    step;
 +} t_shell;
 +
 +typedef struct gmx_shellfc {
 +  int     nshell_gl;       /* The number of shells in the system       */
 +  t_shell *shell_gl;       /* All the shells (for DD only)             */
 +  int     *shell_index_gl; /* Global shell index (for DD only)         */
 +  gmx_bool    bInterCG;        /* Are there inter charge-group shells?     */
 +  int     nshell;          /* The number of local shells               */
 +  t_shell *shell;          /* The local shells                         */
 +  int     shell_nalloc;    /* The allocation size of shell             */
 +  gmx_bool    bPredict;        /* Predict shell positions                  */
 +  gmx_bool    bForceInit;      /* Force initialization of shell positions  */
 +  int     nflexcon;        /* The number of flexible constraints       */
 +  rvec    *x[2];           /* Array for iterative minimization         */
 +  rvec    *f[2];           /* Array for iterative minimization         */
 +  int     x_nalloc;        /* The allocation size of x and f           */
 +  rvec    *acc_dir;        /* Acceleration direction for flexcon       */
 +  rvec    *x_old;          /* Old coordinates for flexcon              */
 +  int     flex_nalloc;     /* The allocation size of acc_dir and x_old */
 +  rvec    *adir_xnold;     /* Work space for init_adir                 */
 +  rvec    *adir_xnew;      /* Work space for init_adir                 */
 +  int     adir_nalloc;     /* Work space for init_adir                 */
 +} t_gmx_shellfc;
 +
 +      
 +static void pr_shell(FILE *fplog,int ns,t_shell s[])
 +{
 +  int i;
 +  
 +  fprintf(fplog,"SHELL DATA\n");
 +  fprintf(fplog,"%5s  %8s  %5s  %5s  %5s\n",
 +        "Shell","Force k","Nucl1","Nucl2","Nucl3");
 +  for(i=0; (i<ns); i++) {
 +    fprintf(fplog,"%5d  %8.3f  %5d",s[i].shell,1.0/s[i].k_1,s[i].nucl1);
 +    if (s[i].nnucl == 2)
 +      fprintf(fplog,"  %5d\n",s[i].nucl2);
 +    else if (s[i].nnucl == 3)
 +      fprintf(fplog,"  %5d  %5d\n",s[i].nucl2,s[i].nucl3);
 +    else
 +      fprintf(fplog,"\n");
 +  }
 +}
 +
 +static void predict_shells(FILE *fplog,rvec x[],rvec v[],real dt,
 +                         int ns,t_shell s[],
 +                         real mass[],gmx_mtop_t *mtop,gmx_bool bInit)
 +{
 +  int  i,m,s1,n1,n2,n3;
 +  real dt_1,dt_2,dt_3,fudge,tm,m1,m2,m3;
 +  rvec *ptr;
 +  t_atom *atom;
 +  
 +  /* We introduce a fudge factor for performance reasons: with this choice
 +   * the initial force on the shells is about a factor of two lower than 
 +   * without
 +   */
 +  fudge = 1.0;
 +    
 +  if (bInit) {
 +    if (fplog)
 +      fprintf(fplog,"RELAX: Using prediction for initial shell placement\n");
 +    ptr  = x;
 +    dt_1 = 1;
 +  }
 +  else {
 +    ptr  = v;
 +    dt_1 = fudge*dt;
 +  }
 +    
 +  for(i=0; (i<ns); i++) {
 +    s1 = s[i].shell;
 +    if (bInit)
 +      clear_rvec(x[s1]);
 +    switch (s[i].nnucl) {
 +    case 1:
 +      n1 = s[i].nucl1;
 +      for(m=0; (m<DIM); m++)
 +      x[s1][m]+=ptr[n1][m]*dt_1;
 +      break;
 +    case 2:
 +      n1 = s[i].nucl1;
 +      n2 = s[i].nucl2;
 +      if (mass) {
 +      m1 = mass[n1];
 +      m2 = mass[n2];
 +      } else {
 +      /* Not the correct masses with FE, but it is just a prediction... */
 +      m1 = atom[n1].m;
 +      m2 = atom[n2].m;
 +      }
 +      tm = dt_1/(m1+m2);
 +      for(m=0; (m<DIM); m++)
 +      x[s1][m]+=(m1*ptr[n1][m]+m2*ptr[n2][m])*tm;
 +      break;
 +    case 3:
 +      n1 = s[i].nucl1;
 +      n2 = s[i].nucl2;
 +      n3 = s[i].nucl3;
 +      if (mass) {
 +      m1 = mass[n1];
 +      m2 = mass[n2];
 +      m3 = mass[n3];
 +      } else {
 +      /* Not the correct masses with FE, but it is just a prediction... */
 +      gmx_mtop_atomnr_to_atom(mtop,n1,&atom);
 +      m1 = atom->m;
 +      gmx_mtop_atomnr_to_atom(mtop,n2,&atom);
 +      m2 = atom->m;
 +      gmx_mtop_atomnr_to_atom(mtop,n3,&atom);
 +      m3 = atom->m;
 +      }
 +      tm = dt_1/(m1+m2+m3);
 +      for(m=0; (m<DIM); m++)
 +      x[s1][m]+=(m1*ptr[n1][m]+m2*ptr[n2][m]+m3*ptr[n3][m])*tm;
 +      break;
 +    default:
 +      gmx_fatal(FARGS,"Shell %d has %d nuclei!",i,s[i].nnucl);
 +    }
 +  }
 +}
 +
 +gmx_shellfc_t init_shell_flexcon(FILE *fplog,
 +                               gmx_mtop_t *mtop,int nflexcon,
 +                               rvec *x)
 +{
 +  struct gmx_shellfc *shfc;
 +  t_shell     *shell;
 +  int         *shell_index=NULL,*at2cg;
 +  t_atom      *atom;
 +  int         n[eptNR],ns,nshell,nsi;
 +  int         i,j,nmol,type,mb,mt,a_offset,cg,mol,ftype,nra;
 +  real        qS,alpha;
 +  int         aS,aN=0; /* Shell and nucleus */
 +  int         bondtypes[] = { F_BONDS, F_HARMONIC, F_CUBICBONDS, F_POLARIZATION, F_ANHARM_POL, F_WATER_POL };
 +#define NBT asize(bondtypes)
 +  t_iatom     *ia;
 +  gmx_mtop_atomloop_block_t aloopb;
 +  gmx_mtop_atomloop_all_t aloop;
 +  gmx_ffparams_t *ffparams;
 +  gmx_molblock_t *molb;
 +  gmx_moltype_t *molt;
 +  t_block     *cgs;
 +
 +  /* Count number of shells, and find their indices */
 +  for(i=0; (i<eptNR); i++) {
 +    n[i] = 0;
 +  }
 +
 +  aloopb = gmx_mtop_atomloop_block_init(mtop);
 +  while (gmx_mtop_atomloop_block_next(aloopb,&atom,&nmol)) {
 +    n[atom->ptype] += nmol;
 +  }
 +
 +  if (fplog) {
 +    /* Print the number of each particle type */  
 +    for(i=0; (i<eptNR); i++) {
 +      if (n[i] != 0) {
 +      fprintf(fplog,"There are: %d %ss\n",n[i],ptype_str[i]);
 +      }
 +    }
 +  }
 +
 +  nshell = n[eptShell];
 +  
 +  if (nshell == 0 && nflexcon == 0) {
 +    return NULL;
 +  }
 +
 +  snew(shfc,1);
 +  shfc->nflexcon = nflexcon;
 +
 +  if (nshell == 0) {
 +    return shfc;
 +  }
 +
 +  /* We have shells: fill the shell data structure */
 +
 +  /* Global system sized array, this should be avoided */
 +  snew(shell_index,mtop->natoms);
 +
 +  aloop = gmx_mtop_atomloop_all_init(mtop);
 +  nshell = 0;
 +  while (gmx_mtop_atomloop_all_next(aloop,&i,&atom)) {
 +    if (atom->ptype == eptShell) {
 +      shell_index[i] = nshell++;
 +    }
 +  }
 +
 +  snew(shell,nshell);
 +  
 +  /* Initiate the shell structures */    
 +  for(i=0; (i<nshell); i++) {
 +    shell[i].shell = NO_ATID;
 +    shell[i].nnucl = 0;
 +    shell[i].nucl1 = NO_ATID;
 +    shell[i].nucl2 = NO_ATID;
 +    shell[i].nucl3 = NO_ATID;
 +    /* shell[i].bInterCG=FALSE; */
 +    shell[i].k_1   = 0;
 +    shell[i].k     = 0;
 +  }
 +
 +  ffparams = &mtop->ffparams;
 +
 +  /* Now fill the structures */
 +  shfc->bInterCG = FALSE;
 +  ns = 0;
 +  a_offset = 0;
 +  for(mb=0; mb<mtop->nmolblock; mb++) {
 +    molb = &mtop->molblock[mb];
 +    molt = &mtop->moltype[molb->type];
 +
 +    cgs = &molt->cgs;
 +    snew(at2cg,molt->atoms.nr);
 +    for(cg=0; cg<cgs->nr; cg++) {
 +      for(i=cgs->index[cg]; i<cgs->index[cg+1]; i++) {
 +      at2cg[i] = cg;
 +      }
 +    }
 +
 +    atom = molt->atoms.atom;
 +    for(mol=0; mol<molb->nmol; mol++) {
 +      for(j=0; (j<NBT); j++) {
 +      ia = molt->ilist[bondtypes[j]].iatoms;
 +      for(i=0; (i<molt->ilist[bondtypes[j]].nr); ) {
 +        type  = ia[0];
 +        ftype = ffparams->functype[type];
 +        nra   = interaction_function[ftype].nratoms;
 +        
 +        /* Check whether we have a bond with a shell */
 +        aS = NO_ATID;
 +        
 +        switch (bondtypes[j]) {
 +        case F_BONDS:
 +        case F_HARMONIC:
 +        case F_CUBICBONDS:
 +        case F_POLARIZATION:
 +        case F_ANHARM_POL:
 +          if (atom[ia[1]].ptype == eptShell) {
 +            aS = ia[1];
 +            aN = ia[2];
 +          }
 +          else if (atom[ia[2]].ptype == eptShell) {
 +            aS = ia[2];
 +            aN = ia[1];
 +          }
 +          break;
 +        case F_WATER_POL:
 +          aN    = ia[4];  /* Dummy */
 +          aS    = ia[5];  /* Shell */
 +          break;
 +        default:
 +          gmx_fatal(FARGS,"Death Horror: %s, %d",__FILE__,__LINE__);
 +        }
 +        
 +        if (aS != NO_ATID) {    
 +          qS = atom[aS].q;
 +          
 +          /* Check whether one of the particles is a shell... */
 +          nsi = shell_index[a_offset+aS];
 +          if ((nsi < 0) || (nsi >= nshell))
 +            gmx_fatal(FARGS,"nsi is %d should be within 0 - %d. aS = %d",
 +                      nsi,nshell,aS);
 +          if (shell[nsi].shell == NO_ATID) {
 +            shell[nsi].shell = a_offset + aS;
 +            ns ++;
 +          }
 +          else if (shell[nsi].shell != a_offset+aS)
 +            gmx_fatal(FARGS,"Weird stuff in %s, %d",__FILE__,__LINE__);
 +          
 +          if      (shell[nsi].nucl1 == NO_ATID) {
 +            shell[nsi].nucl1 = a_offset + aN;
 +          } else if (shell[nsi].nucl2 == NO_ATID) {
 +            shell[nsi].nucl2 = a_offset + aN;
 +          } else if (shell[nsi].nucl3 == NO_ATID) {
 +            shell[nsi].nucl3 = a_offset + aN;
 +          } else {
 +            if (fplog)
 +              pr_shell(fplog,ns,shell);
 +            gmx_fatal(FARGS,"Can not handle more than three bonds per shell\n");
 +          }
 +          if (at2cg[aS] != at2cg[aN]) {
 +            /* shell[nsi].bInterCG = TRUE; */
 +            shfc->bInterCG = TRUE;
 +          }
 +          
 +          switch (bondtypes[j]) {
 +          case F_BONDS:
 +          case F_HARMONIC:
 +            shell[nsi].k    += ffparams->iparams[type].harmonic.krA;
 +            break;
 +          case F_CUBICBONDS:
 +            shell[nsi].k    += ffparams->iparams[type].cubic.kb;
 +            break;
 +          case F_POLARIZATION:
 +          case F_ANHARM_POL:
-             if (qS != atom[aS].qB)
-               gmx_fatal(FARGS,"water_pol can not be used with qA != qB");
++            if (!gmx_within_tol(qS, atom[aS].qB, GMX_REAL_EPS*10))
++              gmx_fatal(FARGS,"polarize can not be used with qA(%e) != qB(%e) for atom %d of molecule block %d", qS, atom[aS].qB, aS+1, mb+1);
 +            shell[nsi].k    += sqr(qS)*ONE_4PI_EPS0/
 +              ffparams->iparams[type].polarize.alpha;
 +            break;
 +          case F_WATER_POL:
-       if (dx != 0 && df != 0) {
-         k_est = -dx/df;
-         if (k_est >= 2*s[i].step[d]) {
-           s[i].step[d] *= 1.2;
-         } else if (k_est <= 0) {
-           s[i].step[d] *= 0.8;
-         } else {
-           s[i].step[d] = 0.8*s[i].step[d] + 0.2*k_est;
-         }
-       } else if (dx != 0) {
-         s[i].step[d] *= 1.2;
-       }
++            if (!gmx_within_tol(qS, atom[aS].qB, GMX_REAL_EPS*10))
++              gmx_fatal(FARGS,"water_pol can not be used with qA(%e) != qB(%e) for atom %d of molecule block %d", qS, atom[aS].qB, aS+1, mb+1);
 +            alpha          = (ffparams->iparams[type].wpol.al_x+
 +                              ffparams->iparams[type].wpol.al_y+
 +                              ffparams->iparams[type].wpol.al_z)/3.0;
 +            shell[nsi].k  += sqr(qS)*ONE_4PI_EPS0/alpha;
 +            break;
 +          default:
 +            gmx_fatal(FARGS,"Death Horror: %s, %d",__FILE__,__LINE__);
 +          }
 +          shell[nsi].nnucl++;
 +        }
 +        ia += nra+1;
 +        i  += nra+1;
 +      }
 +      }
 +      a_offset += molt->atoms.nr;
 +    }
 +    /* Done with this molecule type */
 +    sfree(at2cg);
 +  }
 +  
 +  /* Verify whether it's all correct */
 +  if (ns != nshell)
 +    gmx_fatal(FARGS,"Something weird with shells. They may not be bonded to something");
 +  
 +  for(i=0; (i<ns); i++)
 +    shell[i].k_1 = 1.0/shell[i].k;
 +  
 +  if (debug)
 +    pr_shell(debug,ns,shell);
 +
 +  
 +  shfc->nshell_gl      = ns;
 +  shfc->shell_gl       = shell;
 +  shfc->shell_index_gl = shell_index;
 +
 +  shfc->bPredict   = (getenv("GMX_NOPREDICT") == NULL);
 +  shfc->bForceInit = FALSE;
 +  if (!shfc->bPredict) {
 +    if (fplog)
 +      fprintf(fplog,"\nWill never predict shell positions\n");
 +  } else {
 +    shfc->bForceInit = (getenv("GMX_FORCEINIT") != NULL);
 +    if (shfc->bForceInit && fplog)
 +      fprintf(fplog,"\nWill always initiate shell positions\n");
 +  }
 +
 +  if (shfc->bPredict) {
 +    if (x) {
 +      predict_shells(fplog,x,NULL,0,shfc->nshell_gl,shfc->shell_gl,
 +                   NULL,mtop,TRUE);
 +    }
 +
 +    if (shfc->bInterCG) {
 +      if (fplog)
 +      fprintf(fplog,"\nNOTE: there all shells that are connected to particles outside thier own charge group, will not predict shells positions during the run\n\n");
 +      shfc->bPredict = FALSE;
 +    }
 +  }
 +
 +  return shfc;
 +}
 +
 +void make_local_shells(t_commrec *cr,t_mdatoms *md,
 +                     struct gmx_shellfc *shfc)
 +{
 +  t_shell *shell;
 +  int a0,a1,*ind,nshell,i;
 +  gmx_domdec_t *dd=NULL;
 +
 +  if (PAR(cr)) {
 +    if (DOMAINDECOMP(cr)) {
 +      dd = cr->dd;
 +      a0 = 0;
 +      a1 = dd->nat_home;
 +    } else {
 +      pd_at_range(cr,&a0,&a1);
 +    }
 +  } else {
 +    /* Single node: we need all shells, just copy the pointer */
 +    shfc->nshell = shfc->nshell_gl;
 +    shfc->shell  = shfc->shell_gl;
 +    
 +    return;
 +  }
 +
 +  ind = shfc->shell_index_gl;
 +
 +  nshell = 0;
 +  shell  = shfc->shell; 
 +  for(i=a0; i<a1; i++) {
 +    if (md->ptype[i] == eptShell) {
 +      if (nshell+1 > shfc->shell_nalloc) {
 +      shfc->shell_nalloc = over_alloc_dd(nshell+1);
 +      srenew(shell,shfc->shell_nalloc);
 +      }
 +      if (dd) {
 +      shell[nshell] = shfc->shell_gl[ind[dd->gatindex[i]]];
 +      } else {
 +      shell[nshell] = shfc->shell_gl[ind[i]];
 +      }
 +      /* With inter-cg shells we can no do shell prediction,
 +       * so we do not need the nuclei numbers.
 +       */
 +      if (!shfc->bInterCG) {
 +      shell[nshell].nucl1   = i + shell[nshell].nucl1 - shell[nshell].shell;
 +      if (shell[nshell].nnucl > 1)
 +        shell[nshell].nucl2 = i + shell[nshell].nucl2 - shell[nshell].shell;
 +      if (shell[nshell].nnucl > 2)
 +        shell[nshell].nucl3 = i + shell[nshell].nucl3 - shell[nshell].shell;
 +      }
 +      shell[nshell].shell = i;
 +      nshell++;
 +    }
 +  }
 +
 +  shfc->nshell = nshell;
 +  shfc->shell  = shell;
 +}
 +
 +static void do_1pos(rvec xnew,rvec xold,rvec f,real step)
 +{
 +  real xo,yo,zo;
 +  real dx,dy,dz;
 +  
 +  xo=xold[XX];
 +  yo=xold[YY];
 +  zo=xold[ZZ];
 +
 +  dx=f[XX]*step;
 +  dy=f[YY]*step;
 +  dz=f[ZZ]*step;
 +
 +  xnew[XX]=xo+dx;
 +  xnew[YY]=yo+dy;
 +  xnew[ZZ]=zo+dz;
 +}
 +
 +static void do_1pos3(rvec xnew,rvec xold,rvec f,rvec step)
 +{
 +  real xo,yo,zo;
 +  real dx,dy,dz;
 +  
 +  xo=xold[XX];
 +  yo=xold[YY];
 +  zo=xold[ZZ];
 +
 +  dx=f[XX]*step[XX];
 +  dy=f[YY]*step[YY];
 +  dz=f[ZZ]*step[ZZ];
 +
 +  xnew[XX]=xo+dx;
 +  xnew[YY]=yo+dy;
 +  xnew[ZZ]=zo+dz;
 +}
 +
 +static void directional_sd(FILE *log,rvec xold[],rvec xnew[],rvec acc_dir[],
 +                         int start,int homenr,real step)
 +{
 +  int  i;
 +
 +  for(i=start; i<homenr; i++)
 +    do_1pos(xnew[i],xold[i],acc_dir[i],step);
 +}
 +
 +static void shell_pos_sd(FILE *log,rvec xcur[],rvec xnew[],rvec f[],
 +                       int ns,t_shell s[],int count)
 +{
++    const real step_scale_min = 0.8,
++        step_scale_increment = 0.2,
++        step_scale_max = 1.2,
++        step_scale_multiple = (step_scale_max - step_scale_min) / step_scale_increment;
 +  int  i,shell,d;
 +  real dx,df,k_est;
 +#ifdef PRINT_STEP  
 +  real step_min,step_max;
 +
 +  step_min = 1e30;
 +  step_max = 0;
 +#endif
 +  for(i=0; (i<ns); i++) {
 +    shell = s[i].shell;
 +    if (count == 1) {
 +      for(d=0; d<DIM; d++) {
 +      s[i].step[d] = s[i].k_1;
 +#ifdef PRINT_STEP
 +      step_min = min(step_min,s[i].step[d]);
 +      step_max = max(step_max,s[i].step[d]);
 +#endif
 +      }
 +    } else {
 +      for(d=0; d<DIM; d++) {
 +      dx = xcur[shell][d] - s[i].xold[d];
 +      df =    f[shell][d] - s[i].fold[d];
++    /* -dx/df gets used to generate an interpolated value, but would
++     * cause a NaN if df were binary-equal to zero. Values close to
++     * zero won't cause problems (because of the min() and max()), so
++     * just testing for binary inequality is OK. */
++    if (0.0 != df)
++    {
++        k_est = -dx/df;
++        /* Scale the step size by a factor interpolated from
++         * step_scale_min to step_scale_max, as k_est goes from 0 to
++         * step_scale_multiple * s[i].step[d] */
++        s[i].step[d] =
++            step_scale_min * s[i].step[d] +
++            step_scale_increment * min(step_scale_multiple * s[i].step[d], max(k_est, 0));
++    }
++    else
++    {
++        /* Here 0 == df */
++        if (gmx_numzero(dx)) /* 0 == dx */
++        {
++            /* Likely this will never happen, but if it does just
++             * don't scale the step. */
++        }
++        else /* 0 != dx */
++        {
++            s[i].step[d] *= step_scale_max;
++        }
++    }
 +#ifdef PRINT_STEP
 +      step_min = min(step_min,s[i].step[d]);
 +      step_max = max(step_max,s[i].step[d]);
 +#endif
 +      }
 +    }
 +    copy_rvec(xcur[shell],s[i].xold);
 +    copy_rvec(f[shell],   s[i].fold);
 +
 +    do_1pos3(xnew[shell],xcur[shell],f[shell],s[i].step);
 +
 +    if (gmx_debug_at) {
 +      fprintf(debug,"shell[%d] = %d\n",i,shell);
 +      pr_rvec(debug,0,"fshell",f[shell],DIM,TRUE);
 +      pr_rvec(debug,0,"xold",xcur[shell],DIM,TRUE);
 +      pr_rvec(debug,0,"step",s[i].step,DIM,TRUE);
 +      pr_rvec(debug,0,"xnew",xnew[shell],DIM,TRUE);
 +    }
 +  }
 +#ifdef PRINT_STEP
 +  printf("step %.3e %.3e\n",step_min,step_max);
 +#endif
 +}
 +
 +static void decrease_step_size(int nshell,t_shell s[])
 +{
 +  int i;
 +  
 +  for(i=0; i<nshell; i++)
 +    svmul(0.8,s[i].step,s[i].step);
 +}
 +
 +static void print_epot(FILE *fp,gmx_large_int_t mdstep,int count,real epot,real df,
 +                     int ndir,real sf_dir)
 +{
 +  char buf[22];
 +
 +  fprintf(fp,"MDStep=%5s/%2d EPot: %12.8e, rmsF: %6.2e",
 +        gmx_step_str(mdstep,buf),count,epot,df);
 +  if (ndir)
 +    fprintf(fp,", dir. rmsF: %6.2e\n",sqrt(sf_dir/ndir));
 +  else
 +    fprintf(fp,"\n");
 +}
 +
 +
 +static real rms_force(t_commrec *cr,rvec f[],int ns,t_shell s[],
 +                    int ndir,real *sf_dir,real *Epot)
 +{
 +  int  i,shell,ntot;
 +  double buf[4];
 +
 +  buf[0] = *sf_dir;
 +  for(i=0; i<ns; i++) {
 +    shell = s[i].shell;
 +    buf[0]  += norm2(f[shell]);
 +  }
 +  ntot = ns;
 +
 +  if (PAR(cr)) {
 +    buf[1] = ntot;
 +    buf[2] = *sf_dir;
 +    buf[3] = *Epot;
 +    gmx_sumd(4,buf,cr);
 +    ntot = (int)(buf[1] + 0.5);
 +    *sf_dir = buf[2];
 +    *Epot   = buf[3];
 +  }
 +  ntot += ndir;
 +
 +  return (ntot ? sqrt(buf[0]/ntot) : 0);
 +}
 +
 +static void check_pbc(FILE *fp,rvec x[],int shell)
 +{
 +  int m,now;
 +  
 +  now = shell-4;
 +  for(m=0; (m<DIM); m++)
 +    if (fabs(x[shell][m]-x[now][m]) > 0.3) {
 +      pr_rvecs(fp,0,"SHELL-X",x+now,5);
 +      break;
 +    }
 +}
 +
 +static void dump_shells(FILE *fp,rvec x[],rvec f[],real ftol,int ns,t_shell s[])
 +{
 +  int  i,shell;
 +  real ft2,ff2;
 +  
 +  ft2 = sqr(ftol);
 +  
 +  for(i=0; (i<ns); i++) {
 +    shell = s[i].shell;
 +    ff2   = iprod(f[shell],f[shell]);
 +    if (ff2 > ft2)
 +      fprintf(fp,"SHELL %5d, force %10.5f  %10.5f  %10.5f, |f| %10.5f\n",
 +            shell,f[shell][XX],f[shell][YY],f[shell][ZZ],sqrt(ff2));
 +    check_pbc(fp,x,shell);
 +  }
 +}
 +
 +static void init_adir(FILE *log,gmx_shellfc_t shfc,
 +                    gmx_constr_t constr,t_idef *idef,t_inputrec *ir,
 +                    t_commrec *cr,int dd_ac1,
 +                    gmx_large_int_t step,t_mdatoms *md,int start,int end,
 +                    rvec *x_old,rvec *x_init,rvec *x,
 +                    rvec *f,rvec *acc_dir,matrix box,
 +                    real lambda,real *dvdlambda,t_nrnb *nrnb)
 +{
 +  rvec   *xnold,*xnew;
 +  double w_dt;
 +  int    gf,ga,gt;
 +  real   dt,scale;
 +  int    n,d; 
 +  unsigned short *ptype;
 +  rvec   p,dx;
 +  
 +  if (DOMAINDECOMP(cr))
 +    n = dd_ac1;
 +  else
 +    n = end - start;
 +  if (n > shfc->adir_nalloc) {
 +    shfc->adir_nalloc = over_alloc_dd(n);
 +    srenew(shfc->adir_xnold,shfc->adir_nalloc);
 +    srenew(shfc->adir_xnew ,shfc->adir_nalloc);
 +  }
 +  xnold = shfc->adir_xnold;
 +  xnew  = shfc->adir_xnew;
 +    
 +  ptype = md->ptype;
 +
 +  dt = ir->delta_t;
 +
 +  /* Does NOT work with freeze or acceleration groups (yet) */
 +  for (n=start; n<end; n++) {  
 +    w_dt = md->invmass[n]*dt;
 +    
 +    for (d=0; d<DIM; d++) {
 +      if ((ptype[n] != eptVSite) && (ptype[n] != eptShell)) {
 +      xnold[n-start][d] = x[n][d] - (x_init[n][d] - x_old[n][d]);
 +      xnew[n-start][d] = 2*x[n][d] - x_old[n][d] + f[n][d]*w_dt*dt;
 +      } else {
 +      xnold[n-start][d] = x[n][d];
 +      xnew[n-start][d] = x[n][d];
 +      }
 +    }
 +  }
 +  constrain(log,FALSE,FALSE,constr,idef,ir,NULL,cr,step,0,md,
 +          x,xnold-start,NULL,box,
 +          lambda,dvdlambda,NULL,NULL,nrnb,econqCoord,FALSE,0,0);
 +  constrain(log,FALSE,FALSE,constr,idef,ir,NULL,cr,step,0,md,
 +          x,xnew-start,NULL,box,
 +          lambda,dvdlambda,NULL,NULL,nrnb,econqCoord,FALSE,0,0);
 +
 +  /* Set xnew to minus the acceleration */
 +  for (n=start; n<end; n++) {
 +    for(d=0; d<DIM; d++)
 +      xnew[n-start][d] =
 +      -(2*x[n][d]-xnold[n-start][d]-xnew[n-start][d])/sqr(dt)
 +      - f[n][d]*md->invmass[n];
 +    clear_rvec(acc_dir[n]);
 +  }
 +
 +  /* Project the acceleration on the old bond directions */
 +  constrain(log,FALSE,FALSE,constr,idef,ir,NULL,cr,step,0,md,
 +          x_old,xnew-start,acc_dir,box,
 +          lambda,dvdlambda,NULL,NULL,nrnb,econqDeriv_FlexCon,FALSE,0,0); 
 +}
 +
 +int relax_shell_flexcon(FILE *fplog,t_commrec *cr,gmx_bool bVerbose,
 +                      gmx_large_int_t mdstep,t_inputrec *inputrec,
 +                      gmx_bool bDoNS,int force_flags,
 +                      gmx_bool bStopCM,
 +                      gmx_localtop_t *top,
 +                      gmx_mtop_t* mtop,
 +                      gmx_constr_t constr,
 +                      gmx_enerdata_t *enerd,t_fcdata *fcd,
 +                      t_state *state,rvec f[],
 +                      tensor force_vir,
 +                      t_mdatoms *md,
 +                      t_nrnb *nrnb,gmx_wallcycle_t wcycle,
 +                      t_graph *graph,
 +                      gmx_groups_t *groups,
 +                      struct gmx_shellfc *shfc,
 +                      t_forcerec *fr,
 +                      gmx_bool bBornRadii,
 +                      double t,rvec mu_tot,
 +                      int natoms,gmx_bool *bConverged,
 +                      gmx_vsite_t *vsite,
 +                      FILE *fp_field)
 +{
 +  int    nshell;
 +  t_shell *shell;
 +  t_idef *idef;
 +  rvec   *pos[2],*force[2],*acc_dir=NULL,*x_old=NULL;
 +  real   Epot[2],df[2];
 +  rvec   dx;
 +  real   sf_dir,invdt;
 +  real   ftol,xiH,xiS,dum=0;
 +  char   sbuf[22];
 +  gmx_bool   bCont,bInit;
 +  int    nat,dd_ac0,dd_ac1=0,i;
 +  int    start=md->start,homenr=md->homenr,end=start+homenr,cg0,cg1;
 +  int    nflexcon,g,number_steps,d,Min=0,count=0;
 +#define  Try (1-Min)             /* At start Try = 1 */
 +
 +  bCont        = (mdstep == inputrec->init_step) && inputrec->bContinuation;
 +  bInit        = (mdstep == inputrec->init_step) || shfc->bForceInit;
 +  ftol         = inputrec->em_tol;
 +  number_steps = inputrec->niter;
 +  nshell       = shfc->nshell;
 +  shell        = shfc->shell;
 +  nflexcon     = shfc->nflexcon;
 +
 +  idef = &top->idef;
 +
 +  if (DOMAINDECOMP(cr)) {
 +    nat = dd_natoms_vsite(cr->dd);
 +    if (nflexcon > 0) {
 +      dd_get_constraint_range(cr->dd,&dd_ac0,&dd_ac1);
 +      nat = max(nat,dd_ac1);
 +    }
 +  } else {
 +    nat = state->natoms;
 +  }
 +
 +  if (nat > shfc->x_nalloc) {
 +    /* Allocate local arrays */
 +    shfc->x_nalloc = over_alloc_dd(nat);
 +    for(i=0; (i<2); i++) {
 +      srenew(shfc->x[i],shfc->x_nalloc);
 +      srenew(shfc->f[i],shfc->x_nalloc);
 +    }
 +  }
 +  for(i=0; (i<2); i++) {
 +    pos[i]   = shfc->x[i];
 +    force[i] = shfc->f[i];
 +  }
 +     
 +  /* With particle decomposition this code only works
 +   * when all particles involved with each shell are in the same cg.
 +   */
 +
 +  if (bDoNS && inputrec->ePBC != epbcNONE && !DOMAINDECOMP(cr)) {
 +    /* This is the only time where the coordinates are used
 +     * before do_force is called, which normally puts all
 +     * charge groups in the box.
 +     */
 +    if (PARTDECOMP(cr)) {
 +      pd_cg_range(cr,&cg0,&cg1);
 +    } else {
 +      cg0 = 0;
 +      cg1 = top->cgs.nr;
 +    }
 +    put_charge_groups_in_box(fplog,cg0,cg1,fr->ePBC,state->box,
 +                           &(top->cgs),state->x,fr->cg_cm);
 +    if (graph)
 +      mk_mshift(fplog,graph,fr->ePBC,state->box,state->x);
 +  }
 +
 +  /* After this all coordinate arrays will contain whole molecules */
 +  if (graph)
 +    shift_self(graph,state->box,state->x);
 +
 +  if (nflexcon) {
 +    if (nat > shfc->flex_nalloc) {
 +      shfc->flex_nalloc = over_alloc_dd(nat);
 +      srenew(shfc->acc_dir,shfc->flex_nalloc);
 +      srenew(shfc->x_old,shfc->flex_nalloc);
 +    }
 +    acc_dir = shfc->acc_dir;
 +    x_old   = shfc->x_old;
 +    for(i=0; i<homenr; i++) {
 +      for(d=0; d<DIM; d++)
 +        shfc->x_old[i][d] =
 +        state->x[start+i][d] - state->v[start+i][d]*inputrec->delta_t;
 +    }
 +  }
 +
 +  /* Do a prediction of the shell positions */
 +  if (shfc->bPredict && !bCont) {
 +    predict_shells(fplog,state->x,state->v,inputrec->delta_t,nshell,shell,
 +                 md->massT,NULL,bInit);
 +  }
 +
 +  /* do_force expected the charge groups to be in the box */
 +  if (graph)
 +    unshift_self(graph,state->box,state->x);
 +
 +  /* Calculate the forces first time around */
 +  if (gmx_debug_at) {
 +    pr_rvecs(debug,0,"x b4 do_force",state->x + start,homenr);
 +  }
 +  do_force(fplog,cr,inputrec,mdstep,nrnb,wcycle,top,mtop,groups,
 +         state->box,state->x,&state->hist,
 +         force[Min],force_vir,md,enerd,fcd,
 +         state->lambda,graph,
 +         fr,vsite,mu_tot,t,fp_field,NULL,bBornRadii,
 +         (bDoNS ? GMX_FORCE_NS : 0) | force_flags);
 +
 +  sf_dir = 0;
 +  if (nflexcon) {
 +    init_adir(fplog,shfc,
 +            constr,idef,inputrec,cr,dd_ac1,mdstep,md,start,end,
 +            shfc->x_old-start,state->x,state->x,force[Min],
 +            shfc->acc_dir-start,state->box,state->lambda,&dum,nrnb);
 +
 +    for(i=start; i<end; i++)
 +      sf_dir += md->massT[i]*norm2(shfc->acc_dir[i-start]);
 +  }
 +
 +  Epot[Min] = enerd->term[F_EPOT];
 +
 +  df[Min]=rms_force(cr,shfc->f[Min],nshell,shell,nflexcon,&sf_dir,&Epot[Min]);
 +  df[Try]=0;
 +  if (debug) {
 +    fprintf(debug,"df = %g  %g\n",df[Min],df[Try]);
 +  }
 +
 +  if (gmx_debug_at) {
 +    pr_rvecs(debug,0,"force0",force[Min],md->nr);
 +  }
 +
 +  if (nshell+nflexcon > 0) {
 +    /* Copy x to pos[Min] & pos[Try]: during minimization only the
 +     * shell positions are updated, therefore the other particles must
 +     * be set here.
 +     */
 +    memcpy(pos[Min],state->x,nat*sizeof(state->x[0]));
 +    memcpy(pos[Try],state->x,nat*sizeof(state->x[0]));
 +  }
 +  
 +  if (bVerbose && MASTER(cr))
 +    print_epot(stdout,mdstep,0,Epot[Min],df[Min],nflexcon,sf_dir);
 +
 +  if (debug) {
 +    fprintf(debug,"%17s: %14.10e\n",
 +          interaction_function[F_EKIN].longname,enerd->term[F_EKIN]);
 +    fprintf(debug,"%17s: %14.10e\n",
 +          interaction_function[F_EPOT].longname,enerd->term[F_EPOT]);
 +    fprintf(debug,"%17s: %14.10e\n",
 +          interaction_function[F_ETOT].longname,enerd->term[F_ETOT]);
 +    fprintf(debug,"SHELLSTEP %s\n",gmx_step_str(mdstep,sbuf));
 +  }
 +  
 +  /* First check whether we should do shells, or whether the force is 
 +   * low enough even without minimization.
 +   */
 +  *bConverged = (df[Min] < ftol);
 +  
 +  for(count=1; (!(*bConverged) && (count < number_steps)); count++) {
 +    if (vsite)
 +      construct_vsites(fplog,vsite,pos[Min],nrnb,inputrec->delta_t,state->v,
 +                     idef->iparams,idef->il,
 +                     fr->ePBC,fr->bMolPBC,graph,cr,state->box);
 +     
 +    if (nflexcon) {
 +      init_adir(fplog,shfc,
 +              constr,idef,inputrec,cr,dd_ac1,mdstep,md,start,end,
 +              x_old-start,state->x,pos[Min],force[Min],acc_dir-start,
 +              state->box,state->lambda,&dum,nrnb);
 +      
 +      directional_sd(fplog,pos[Min],pos[Try],acc_dir-start,start,end,
 +                   fr->fc_stepsize);
 +    }
 +    
 +    /* New positions, Steepest descent */
 +    shell_pos_sd(fplog,pos[Min],pos[Try],force[Min],nshell,shell,count); 
 +
 +    /* do_force expected the charge groups to be in the box */
 +    if (graph)
 +      unshift_self(graph,state->box,pos[Try]);
 +
 +    if (gmx_debug_at) {
 +      pr_rvecs(debug,0,"RELAX: pos[Min]  ",pos[Min] + start,homenr);
 +      pr_rvecs(debug,0,"RELAX: pos[Try]  ",pos[Try] + start,homenr);
 +    }
 +    /* Try the new positions */
 +    do_force(fplog,cr,inputrec,1,nrnb,wcycle,
 +           top,mtop,groups,state->box,pos[Try],&state->hist,
 +           force[Try],force_vir,
 +           md,enerd,fcd,state->lambda,graph,
 +           fr,vsite,mu_tot,t,fp_field,NULL,bBornRadii,
 +           force_flags);
 +    
 +    if (gmx_debug_at) {
 +      pr_rvecs(debug,0,"RELAX: force[Min]",force[Min] + start,homenr);
 +      pr_rvecs(debug,0,"RELAX: force[Try]",force[Try] + start,homenr);
 +    }
 +    sf_dir = 0;
 +    if (nflexcon) {
 +      init_adir(fplog,shfc,
 +              constr,idef,inputrec,cr,dd_ac1,mdstep,md,start,end,
 +              x_old-start,state->x,pos[Try],force[Try],acc_dir-start,
 +              state->box,state->lambda,&dum,nrnb);
 +
 +      for(i=start; i<end; i++)
 +      sf_dir += md->massT[i]*norm2(acc_dir[i-start]);
 +    }
 +
 +    Epot[Try] = enerd->term[F_EPOT]; 
 +    
 +    df[Try]=rms_force(cr,force[Try],nshell,shell,nflexcon,&sf_dir,&Epot[Try]);
 +
 +    if (debug)
 +      fprintf(debug,"df = %g  %g\n",df[Min],df[Try]);
 +
 +    if (debug) {
 +      if (gmx_debug_at)
 +      pr_rvecs(debug,0,"F na do_force",force[Try] + start,homenr);
 +      if (gmx_debug_at) {
 +      fprintf(debug,"SHELL ITER %d\n",count);
 +      dump_shells(debug,pos[Try],force[Try],ftol,nshell,shell);
 +      }
 +    }
 +
 +    if (bVerbose && MASTER(cr))
 +      print_epot(stdout,mdstep,count,Epot[Try],df[Try],nflexcon,sf_dir);
 +      
 +    *bConverged = (df[Try] < ftol);
 +    
 +    if ((df[Try] < df[Min])) {
 +      if (debug)
 +      fprintf(debug,"Swapping Min and Try\n");
 +      if (nflexcon) {
 +      /* Correct the velocities for the flexible constraints */
 +      invdt = 1/inputrec->delta_t;
 +      for(i=start; i<end; i++) {
 +        for(d=0; d<DIM; d++)
 +          state->v[i][d] += (pos[Try][i][d] - pos[Min][i][d])*invdt;
 +      }
 +      }
 +      Min  = Try;
 +    } else {
 +      decrease_step_size(nshell,shell);
 +    }
 +  }
 +  if (MASTER(cr) && !(*bConverged)) {
 +    /* Note that the energies and virial are incorrect when not converged */
 +    if (fplog)
 +      fprintf(fplog,
 +            "step %s: EM did not converge in %d iterations, RMS force %.3f\n",
 +            gmx_step_str(mdstep,sbuf),number_steps,df[Min]);
 +    fprintf(stderr,
 +          "step %s: EM did not converge in %d iterations, RMS force %.3f\n",
 +          gmx_step_str(mdstep,sbuf),number_steps,df[Min]);
 +  }
 +
 +  /* Copy back the coordinates and the forces */
 +  memcpy(state->x,pos[Min],nat*sizeof(state->x[0]));
 +  memcpy(f,force[Min],nat*sizeof(f[0]));
 +
 +  return count; 
 +}
 +
index 859c6bd9d97fdd733c257723e73e371d96a9b264,0000000000000000000000000000000000000000..4285ffd58497b649e5a2401214c1abba2ab89f4d
mode 100644,000000..100644
--- /dev/null
@@@ -1,1593 -1,0 +1,1559 @@@
- #include "pppm.h"
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * GROwing Monsters And Cloning Shrimps
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#ifdef GMX_CRAY_XT3
 +#include<catamount/dclock.h>
 +#endif
 +
 +
 +#include <stdio.h>
 +#include <time.h>
 +#ifdef HAVE_SYS_TIME_H
 +#include <sys/time.h>
 +#endif
 +#include <math.h>
 +#include "typedefs.h"
 +#include "string2.h"
 +#include "gmxfio.h"
 +#include "smalloc.h"
 +#include "names.h"
 +#include "confio.h"
 +#include "mvdata.h"
 +#include "txtdump.h"
 +#include "pbc.h"
 +#include "chargegroup.h"
 +#include "vec.h"
 +#include <time.h>
 +#include "nrnb.h"
 +#include "mshift.h"
 +#include "mdrun.h"
 +#include "update.h"
 +#include "physics.h"
 +#include "main.h"
 +#include "mdatoms.h"
 +#include "force.h"
 +#include "bondf.h"
 +#include "pme.h"
-     double mass,tmass,vcm[4];
 +#include "disre.h"
 +#include "orires.h"
 +#include "network.h"
 +#include "calcmu.h"
 +#include "constr.h"
 +#include "xvgr.h"
 +#include "trnio.h"
 +#include "xtcio.h"
 +#include "copyrite.h"
 +#include "pull_rotation.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "gmx_wallcycle.h"
 +#include "genborn.h"
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#endif
 +#ifdef GMX_THREAD_MPI
 +#include "tmpi.h"
 +#endif
 +
 +#include "adress.h"
 +#include "qmmm.h"
 +
 +#if 0
 +typedef struct gmx_timeprint {
 +    
 +} t_gmx_timeprint;
 +#endif
 +
 +/* Portable version of ctime_r implemented in src/gmxlib/string2.c, but we do not want it declared in public installed headers */
 +char *
 +gmx_ctime_r(const time_t *clock,char *buf, int n);
 +
 +
 +double
 +gmx_gettime()
 +{
 +#ifdef HAVE_GETTIMEOFDAY
 +      struct timeval t;
 +      double seconds;
 +      
 +      gettimeofday(&t,NULL);
 +      
 +      seconds = (double) t.tv_sec + 1e-6*(double)t.tv_usec;
 +      
 +      return seconds;
 +#else
 +      double  seconds;
 +      
 +      seconds = time(NULL);
 +      
 +      return seconds;
 +#endif
 +}
 +
 +
 +#define difftime(end,start) ((double)(end)-(double)(start))
 +
 +void print_time(FILE *out,gmx_runtime_t *runtime,gmx_large_int_t step,   
 +                t_inputrec *ir, t_commrec *cr)
 +{
 +    time_t finish;
 +    char   timebuf[STRLEN];
 +    double dt;
 +    char buf[48];
 +    
 +#ifndef GMX_THREAD_MPI
 +    if (!PAR(cr))
 +#endif
 +    {
 +        fprintf(out,"\r");
 +    }
 +    fprintf(out,"step %s",gmx_step_str(step,buf));
 +    if ((step >= ir->nstlist))
 +    {
 +        if ((ir->nstlist == 0) || ((step % ir->nstlist) == 0))
 +        {
 +            /* We have done a full cycle let's update time_per_step */
 +            runtime->last = gmx_gettime();
 +            dt = difftime(runtime->last,runtime->real);
 +            runtime->time_per_step = dt/(step - ir->init_step + 1);
 +        }
 +        dt = (ir->nsteps + ir->init_step - step)*runtime->time_per_step;
 +        
 +        if (ir->nsteps >= 0)
 +        {
 +            if (dt >= 300)
 +            {    
 +                finish = (time_t) (runtime->last + dt);
 +                gmx_ctime_r(&finish,timebuf,STRLEN);
 +                sprintf(buf,"%s",timebuf);
 +                buf[strlen(buf)-1]='\0';
 +                fprintf(out,", will finish %s",buf);
 +            }
 +            else
 +                fprintf(out,", remaining runtime: %5d s          ",(int)dt);
 +        }
 +        else
 +        {
 +            fprintf(out," performance: %.1f ns/day    ",
 +                    ir->delta_t/1000*24*60*60/runtime->time_per_step);
 +        }
 +    }
 +#ifndef GMX_THREAD_MPI
 +    if (PAR(cr))
 +    {
 +        fprintf(out,"\n");
 +    }
 +#endif
 +
 +    fflush(out);
 +}
 +
 +#ifdef NO_CLOCK 
 +#define clock() -1
 +#endif
 +
 +static double set_proctime(gmx_runtime_t *runtime)
 +{
 +    double diff;
 +#ifdef GMX_CRAY_XT3
 +    double prev;
 +
 +    prev = runtime->proc;
 +    runtime->proc = dclock();
 +    
 +    diff = runtime->proc - prev;
 +#else
 +    clock_t prev;
 +
 +    prev = runtime->proc;
 +    runtime->proc = clock();
 +
 +    diff = (double)(runtime->proc - prev)/(double)CLOCKS_PER_SEC;
 +#endif
 +    if (diff < 0)
 +    {
 +        /* The counter has probably looped, ignore this data */
 +        diff = 0;
 +    }
 +
 +    return diff;
 +}
 +
 +void runtime_start(gmx_runtime_t *runtime)
 +{
 +    runtime->real = gmx_gettime();
 +    runtime->proc          = 0;
 +    set_proctime(runtime);
 +    runtime->realtime      = 0;
 +    runtime->proctime      = 0;
 +    runtime->last          = 0;
 +    runtime->time_per_step = 0;
 +}
 +
 +void runtime_end(gmx_runtime_t *runtime)
 +{
 +    double now;
 +    
 +    now = gmx_gettime();
 +    
 +    runtime->proctime += set_proctime(runtime);
 +    runtime->realtime  = now - runtime->real;
 +    runtime->real      = now;
 +}
 +
 +void runtime_upd_proc(gmx_runtime_t *runtime)
 +{
 +    runtime->proctime += set_proctime(runtime);
 +}
 +
 +void print_date_and_time(FILE *fplog,int nodeid,const char *title,
 +                         const gmx_runtime_t *runtime)
 +{
 +    int i;
 +    char timebuf[STRLEN];
 +    char time_string[STRLEN];
 +    time_t tmptime;
 +
 +    if (fplog)
 +    {
 +        if (runtime != NULL)
 +        {
 +            tmptime = (time_t) runtime->real;
 +            gmx_ctime_r(&tmptime,timebuf,STRLEN);
 +        }
 +        else
 +        {
 +            tmptime = (time_t) gmx_gettime();
 +            gmx_ctime_r(&tmptime,timebuf,STRLEN);
 +        }
 +        for(i=0; timebuf[i]>=' '; i++)
 +        {
 +            time_string[i]=timebuf[i];
 +        }
 +        time_string[i]='\0';
 +
 +        fprintf(fplog,"%s on node %d %s\n",title,nodeid,time_string);
 +    }
 +}
 +
 +static void sum_forces(int start,int end,rvec f[],rvec flr[])
 +{
 +  int i;
 +  
 +  if (gmx_debug_at) {
 +    pr_rvecs(debug,0,"fsr",f+start,end-start);
 +    pr_rvecs(debug,0,"flr",flr+start,end-start);
 +  }
 +  for(i=start; (i<end); i++)
 +    rvec_inc(f[i],flr[i]);
 +}
 +
 +/* 
 + * calc_f_el calculates forces due to an electric field.
 + *
 + * force is kJ mol^-1 nm^-1 = e * kJ mol^-1 nm^-1 / e 
 + *
 + * Et[] contains the parameters for the time dependent 
 + * part of the field (not yet used). 
 + * Ex[] contains the parameters for
 + * the spatial dependent part of the field. You can have cool periodic
 + * fields in principle, but only a constant field is supported
 + * now. 
 + * The function should return the energy due to the electric field
 + * (if any) but for now returns 0.
 + *
 + * WARNING:
 + * There can be problems with the virial.
 + * Since the field is not self-consistent this is unavoidable.
 + * For neutral molecules the virial is correct within this approximation.
 + * For neutral systems with many charged molecules the error is small.
 + * But for systems with a net charge or a few charged molecules
 + * the error can be significant when the field is high.
 + * Solution: implement a self-consitent electric field into PME.
 + */
 +static void calc_f_el(FILE *fp,int  start,int homenr,
 +                      real charge[],rvec x[],rvec f[],
 +                      t_cosines Ex[],t_cosines Et[],double t)
 +{
 +    rvec Ext;
 +    real t0;
 +    int  i,m;
 +    
 +    for(m=0; (m<DIM); m++)
 +    {
 +        if (Et[m].n > 0)
 +        {
 +            if (Et[m].n == 3)
 +            {
 +                t0 = Et[m].a[1];
 +                Ext[m] = cos(Et[m].a[0]*(t-t0))*exp(-sqr(t-t0)/(2.0*sqr(Et[m].a[2])));
 +            }
 +            else
 +            {
 +                Ext[m] = cos(Et[m].a[0]*t);
 +            }
 +        }
 +        else
 +        {
 +            Ext[m] = 1.0;
 +        }
 +        if (Ex[m].n > 0)
 +        {
 +            /* Convert the field strength from V/nm to MD-units */
 +            Ext[m] *= Ex[m].a[0]*FIELDFAC;
 +            for(i=start; (i<start+homenr); i++)
 +                f[i][m] += charge[i]*Ext[m];
 +        }
 +        else
 +        {
 +            Ext[m] = 0;
 +        }
 +    }
 +    if (fp != NULL)
 +    {
 +        fprintf(fp,"%10g  %10g  %10g  %10g #FIELD\n",t,
 +                Ext[XX]/FIELDFAC,Ext[YY]/FIELDFAC,Ext[ZZ]/FIELDFAC);
 +    }
 +}
 +
 +static void calc_virial(FILE *fplog,int start,int homenr,rvec x[],rvec f[],
 +                      tensor vir_part,t_graph *graph,matrix box,
 +                      t_nrnb *nrnb,const t_forcerec *fr,int ePBC)
 +{
 +  int i,j;
 +  tensor virtest;
 +
 +  /* The short-range virial from surrounding boxes */
 +  clear_mat(vir_part);
 +  calc_vir(fplog,SHIFTS,fr->shift_vec,fr->fshift,vir_part,ePBC==epbcSCREW,box);
 +  inc_nrnb(nrnb,eNR_VIRIAL,SHIFTS);
 +  
 +  /* Calculate partial virial, for local atoms only, based on short range. 
 +   * Total virial is computed in global_stat, called from do_md 
 +   */
 +  f_calc_vir(fplog,start,start+homenr,x,f,vir_part,graph,box);
 +  inc_nrnb(nrnb,eNR_VIRIAL,homenr);
 +
 +  /* Add position restraint contribution */
 +  for(i=0; i<DIM; i++) {
 +    vir_part[i][i] += fr->vir_diag_posres[i];
 +  }
 +
 +  /* Add wall contribution */
 +  for(i=0; i<DIM; i++) {
 +    vir_part[i][ZZ] += fr->vir_wall_z[i];
 +  }
 +
 +  if (debug)
 +    pr_rvecs(debug,0,"vir_part",vir_part,DIM);
 +}
 +
 +static void print_large_forces(FILE *fp,t_mdatoms *md,t_commrec *cr,
 +                             gmx_large_int_t step,real pforce,rvec *x,rvec *f)
 +{
 +  int  i;
 +  real pf2,fn2;
 +  char buf[STEPSTRSIZE];
 +
 +  pf2 = sqr(pforce);
 +  for(i=md->start; i<md->start+md->homenr; i++) {
 +    fn2 = norm2(f[i]);
 +    /* We also catch NAN, if the compiler does not optimize this away. */
 +    if (fn2 >= pf2 || fn2 != fn2) {
 +      fprintf(fp,"step %s  atom %6d  x %8.3f %8.3f %8.3f  force %12.5e\n",
 +            gmx_step_str(step,buf),
 +            ddglatnr(cr->dd,i),x[i][XX],x[i][YY],x[i][ZZ],sqrt(fn2));
 +    }
 +  }
 +}
 +
 +void do_force(FILE *fplog,t_commrec *cr,
 +              t_inputrec *inputrec,
 +              gmx_large_int_t step,t_nrnb *nrnb,gmx_wallcycle_t wcycle,
 +              gmx_localtop_t *top,
 +              gmx_mtop_t *mtop,
 +              gmx_groups_t *groups,
 +              matrix box,rvec x[],history_t *hist,
 +              rvec f[],
 +              tensor vir_force,
 +              t_mdatoms *mdatoms,
 +              gmx_enerdata_t *enerd,t_fcdata *fcd,
 +              real lambda,t_graph *graph,
 +              t_forcerec *fr,gmx_vsite_t *vsite,rvec mu_tot,
 +              double t,FILE *field,gmx_edsam_t ed,
 +              gmx_bool bBornRadii,
 +              int flags)
 +{
 +    int    cg0,cg1,i,j;
 +    int    start,homenr;
 +    double mu[2*DIM]; 
 +    gmx_bool   bSepDVDL,bStateChanged,bNS,bFillGrid,bCalcCGCM,bBS;
 +    gmx_bool   bDoLongRange,bDoForces,bSepLRF;
 +    gmx_bool   bDoAdressWF;
 +    matrix boxs;
 +    real   e,v,dvdl;
 +    t_pbc  pbc;
 +    float  cycles_ppdpme,cycles_pme,cycles_seppme,cycles_force;
 +  
 +    start  = mdatoms->start;
 +    homenr = mdatoms->homenr;
 +
 +    bSepDVDL = (fr->bSepDVDL && do_per_step(step,inputrec->nstlog));
 +
 +    clear_mat(vir_force);
 +
 +    if (PARTDECOMP(cr))
 +    {
 +        pd_cg_range(cr,&cg0,&cg1);
 +    }
 +    else
 +    {
 +        cg0 = 0;
 +        if (DOMAINDECOMP(cr))
 +        {
 +            cg1 = cr->dd->ncg_tot;
 +        }
 +        else
 +        {
 +            cg1 = top->cgs.nr;
 +        }
 +        if (fr->n_tpi > 0)
 +        {
 +            cg1--;
 +        }
 +    }
 +
 +    bStateChanged = (flags & GMX_FORCE_STATECHANGED);
 +    bNS           = (flags & GMX_FORCE_NS) && (fr->bAllvsAll==FALSE); 
 +    bFillGrid     = (bNS && bStateChanged);
 +    bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
 +    bDoLongRange  = (fr->bTwinRange && bNS && (flags & GMX_FORCE_DOLR));
 +    bDoForces     = (flags & GMX_FORCE_FORCES);
 +    bSepLRF       = (bDoLongRange && bDoForces && (flags & GMX_FORCE_SEPLRF));
 +    /* should probably move this to the forcerec since it doesn't change */
 +    bDoAdressWF   = ((fr->adress_type!=eAdressOff));
 +
 +    if (bStateChanged)
 +    {
 +        update_forcerec(fplog,fr,box);
 +        
 +        /* Calculate total (local) dipole moment in a temporary common array. 
 +         * This makes it possible to sum them over nodes faster.
 +         */
 +        calc_mu(start,homenr,
 +                x,mdatoms->chargeA,mdatoms->chargeB,mdatoms->nChargePerturbed,
 +                mu,mu+DIM);
 +    }
 +  
 +  if (fr->ePBC != epbcNONE) { 
 +    /* Compute shift vectors every step,
 +     * because of pressure coupling or box deformation!
 +     */
 +    if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
 +      calc_shifts(box,fr->shift_vec);
 +    
 +    if (bCalcCGCM) { 
 +      put_charge_groups_in_box(fplog,cg0,cg1,fr->ePBC,box,
 +                             &(top->cgs),x,fr->cg_cm);
 +      inc_nrnb(nrnb,eNR_CGCM,homenr);
 +      inc_nrnb(nrnb,eNR_RESETX,cg1-cg0);
 +    } 
 +    else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph) {
 +      unshift_self(graph,box,x);
 +    }
 +  } 
 +  else if (bCalcCGCM) {
 +    calc_cgcm(fplog,cg0,cg1,&(top->cgs),x,fr->cg_cm);
 +    inc_nrnb(nrnb,eNR_CGCM,homenr);
 +  }
 +  
 +  if (bCalcCGCM) {
 +    if (PAR(cr)) {
 +      move_cgcm(fplog,cr,fr->cg_cm);
 +    }
 +    if (gmx_debug_at)
 +      pr_rvecs(debug,0,"cgcm",fr->cg_cm,top->cgs.nr);
 +  }
 +
 +#ifdef GMX_MPI
 +  if (!(cr->duty & DUTY_PME)) {
 +    /* Send particle coordinates to the pme nodes.
 +     * Since this is only implemented for domain decomposition
 +     * and domain decomposition does not use the graph,
 +     * we do not need to worry about shifting.
 +     */    
 +
 +    wallcycle_start(wcycle,ewcPP_PMESENDX);
 +
 +    bBS = (inputrec->nwall == 2);
 +    if (bBS) {
 +      copy_mat(box,boxs);
 +      svmul(inputrec->wall_ewald_zfac,boxs[ZZ],boxs[ZZ]);
 +    }
 +
 +    gmx_pme_send_x(cr,bBS ? boxs : box,x,
 +                   mdatoms->nChargePerturbed,lambda,
 +                   ( flags & GMX_FORCE_VIRIAL),step);
 +
 +    wallcycle_stop(wcycle,ewcPP_PMESENDX);
 +  }
 +#endif /* GMX_MPI */
 +
 +    /* Communicate coordinates and sum dipole if necessary */
 +    if (PAR(cr))
 +    {
 +        wallcycle_start(wcycle,ewcMOVEX);
 +        if (DOMAINDECOMP(cr))
 +        {
 +            dd_move_x(cr->dd,box,x);
 +        }
 +        else
 +        {
 +            move_x(fplog,cr,GMX_LEFT,GMX_RIGHT,x,nrnb);
 +        }
 +        /* When we don't need the total dipole we sum it in global_stat */
 +        if (bStateChanged && NEED_MUTOT(*inputrec))
 +        {
 +            gmx_sumd(2*DIM,mu,cr);
 +        }
 +        wallcycle_stop(wcycle,ewcMOVEX);
 +    }
 +    if (bStateChanged)
 +    {
 +
 +        /* update adress weight beforehand */
 +        if(bDoAdressWF)
 +        {
 +            /* need pbc for adress weight calculation with pbc_dx */
 +            set_pbc(&pbc,inputrec->ePBC,box);
 +            if(fr->adress_site == eAdressSITEcog)
 +            {
 +                update_adress_weights_cog(top->idef.iparams,top->idef.il,x,fr,mdatoms,
 +                                          inputrec->ePBC==epbcNONE ? NULL : &pbc);
 +            }
 +            else if (fr->adress_site == eAdressSITEcom)
 +            {
 +                update_adress_weights_com(fplog,cg0,cg1,&(top->cgs),x,fr,mdatoms,
 +                                          inputrec->ePBC==epbcNONE ? NULL : &pbc);
 +            }
 +            else if (fr->adress_site == eAdressSITEatomatom){
 +                update_adress_weights_atom_per_atom(cg0,cg1,&(top->cgs),x,fr,mdatoms,
 +                                          inputrec->ePBC==epbcNONE ? NULL : &pbc);
 +            }
 +            else
 +            {
 +                update_adress_weights_atom(cg0,cg1,&(top->cgs),x,fr,mdatoms,
 +                                           inputrec->ePBC==epbcNONE ? NULL : &pbc);
 +            }
 +        }
 +
 +        for(i=0; i<2; i++)
 +        {
 +            for(j=0;j<DIM;j++)
 +            {
 +                fr->mu_tot[i][j] = mu[i*DIM + j];
 +            }
 +        }
 +    }
 +    if (fr->efep == efepNO)
 +    {
 +        copy_rvec(fr->mu_tot[0],mu_tot);
 +    }
 +    else
 +    {
 +        for(j=0; j<DIM; j++)
 +        {
 +            mu_tot[j] =
 +                (1.0 - lambda)*fr->mu_tot[0][j] + lambda*fr->mu_tot[1][j];
 +        }
 +    }
 +
 +    /* Reset energies */
 +    reset_enerdata(&(inputrec->opts),fr,bNS,enerd,MASTER(cr));
 +    clear_rvecs(SHIFTS,fr->fshift);
 +
 +    if (bNS)
 +    {
 +        wallcycle_start(wcycle,ewcNS);
 +        
 +        if (graph && bStateChanged)
 +        {
 +            /* Calculate intramolecular shift vectors to make molecules whole */
 +            mk_mshift(fplog,graph,fr->ePBC,box,x);
 +        }
 +
 +        /* Reset long range forces if necessary */
 +        if (fr->bTwinRange)
 +        {
 +            /* Reset the (long-range) forces if necessary */
 +            clear_rvecs(fr->natoms_force_constr,bSepLRF ? fr->f_twin : f);
 +        }
 +
 +        /* Do the actual neighbour searching and if twin range electrostatics
 +         * also do the calculation of long range forces and energies.
 +         */
 +        dvdl = 0; 
 +        ns(fplog,fr,x,box,
 +           groups,&(inputrec->opts),top,mdatoms,
 +           cr,nrnb,lambda,&dvdl,&enerd->grpp,bFillGrid,
 +           bDoLongRange,bDoForces,bSepLRF ? fr->f_twin : f);
 +        if (bSepDVDL)
 +        {
 +            fprintf(fplog,sepdvdlformat,"LR non-bonded",0.0,dvdl);
 +        }
 +        enerd->dvdl_lin += dvdl;
 +        
 +        wallcycle_stop(wcycle,ewcNS);
 +    }
 +      
 +    if (inputrec->implicit_solvent && bNS) 
 +    {
 +        make_gb_nblist(cr,inputrec->gb_algorithm,inputrec->rlist,
 +                       x,box,fr,&top->idef,graph,fr->born);
 +    }
 +      
 +    if (DOMAINDECOMP(cr))
 +    {
 +        if (!(cr->duty & DUTY_PME))
 +        {
 +            wallcycle_start(wcycle,ewcPPDURINGPME);
 +            dd_force_flop_start(cr->dd,nrnb);
 +        }
 +    }
 +    
 +    if (inputrec->bRot)
 +    {
 +        /* Enforced rotation has its own cycle counter that starts after the collective
 +         * coordinates have been communicated. It is added to ddCyclF to allow
 +         * for proper load-balancing */
 +        wallcycle_start(wcycle,ewcROT);
 +        do_rotation(cr,inputrec,box,x,t,step,wcycle,bNS);
 +        wallcycle_stop(wcycle,ewcROT);
 +    }
 +
 +    /* Start the force cycle counter.
 +     * This counter is stopped in do_forcelow_level.
 +     * No parallel communication should occur while this counter is running,
 +     * since that will interfere with the dynamic load balancing.
 +     */
 +    wallcycle_start(wcycle,ewcFORCE);
 +
 +    if (bDoForces)
 +    {
 +        /* Reset forces for which the virial is calculated separately:
 +         * PME/Ewald forces if necessary */
 +        if (fr->bF_NoVirSum) 
 +        {
 +            if (flags & GMX_FORCE_VIRIAL)
 +            {
 +                fr->f_novirsum = fr->f_novirsum_alloc;
 +                if (fr->bDomDec)
 +                {
 +                    clear_rvecs(fr->f_novirsum_n,fr->f_novirsum);
 +                }
 +                else
 +                {
 +                    clear_rvecs(homenr,fr->f_novirsum+start);
 +                }
 +            }
 +            else
 +            {
 +                /* We are not calculating the pressure so we do not need
 +                 * a separate array for forces that do not contribute
 +                 * to the pressure.
 +                 */
 +                fr->f_novirsum = f;
 +            }
 +        }
 +
 +        if (bSepLRF)
 +        {
 +            /* Add the long range forces to the short range forces */
 +            for(i=0; i<fr->natoms_force_constr; i++)
 +            {
 +                copy_rvec(fr->f_twin[i],f[i]);
 +            }
 +        }
 +        else if (!(fr->bTwinRange && bNS))
 +        {
 +            /* Clear the short-range forces */
 +            clear_rvecs(fr->natoms_force_constr,f);
 +        }
 +
 +        clear_rvec(fr->vir_diag_posres);
 +    }
 +    if (inputrec->ePull == epullCONSTRAINT)
 +    {
 +        clear_pull_forces(inputrec->pull);
 +    }
 +
 +    /* update QMMMrec, if necessary */
 +    if(fr->bQMMM)
 +    {
 +        update_QMMMrec(cr,fr,x,mdatoms,box,top);
 +    }
 +
 +    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
 +    {
 +        /* Position restraints always require full pbc. Check if we already did it for Adress */
 +        if(!(bStateChanged && bDoAdressWF))
 +        {
 +            set_pbc(&pbc,inputrec->ePBC,box);
 +        }
 +        v = posres(top->idef.il[F_POSRES].nr,top->idef.il[F_POSRES].iatoms,
 +                   top->idef.iparams_posres,
 +                   (const rvec*)x,fr->f_novirsum,fr->vir_diag_posres,
 +                   inputrec->ePBC==epbcNONE ? NULL : &pbc,lambda,&dvdl,
 +                   fr->rc_scaling,fr->ePBC,fr->posres_com,fr->posres_comB);
 +        if (bSepDVDL)
 +        {
 +            fprintf(fplog,sepdvdlformat,
 +                    interaction_function[F_POSRES].longname,v,dvdl);
 +        }
 +        enerd->term[F_POSRES] += v;
 +        /* This linear lambda dependence assumption is only correct
 +         * when only k depends on lambda,
 +         * not when the reference position depends on lambda.
 +         * grompp checks for this.
 +         */
 +        enerd->dvdl_lin += dvdl;
 +        inc_nrnb(nrnb,eNR_POSRES,top->idef.il[F_POSRES].nr/2);
 +    }
 +
 +    /* Compute the bonded and non-bonded energies and optionally forces */    
 +    do_force_lowlevel(fplog,step,fr,inputrec,&(top->idef),
 +                      cr,nrnb,wcycle,mdatoms,&(inputrec->opts),
 +                      x,hist,f,enerd,fcd,mtop,top,fr->born,
 +                      &(top->atomtypes),bBornRadii,box,
 +                      lambda,graph,&(top->excls),fr->mu_tot,
 +                      flags,&cycles_pme);
 +    
 +    cycles_force = wallcycle_stop(wcycle,ewcFORCE);
 +    
 +    if (ed)
 +    {
 +        do_flood(fplog,cr,x,f,ed,box,step);
 +    }
 +      
 +    if (DOMAINDECOMP(cr))
 +    {
 +        dd_force_flop_stop(cr->dd,nrnb);
 +        if (wcycle)
 +        {
 +            dd_cycles_add(cr->dd,cycles_force-cycles_pme,ddCyclF);
 +        }
 +    }
 +    
 +    if (bDoForces)
 +    {
 +        if (IR_ELEC_FIELD(*inputrec))
 +        {
 +            /* Compute forces due to electric field */
 +            calc_f_el(MASTER(cr) ? field : NULL,
 +                      start,homenr,mdatoms->chargeA,x,fr->f_novirsum,
 +                      inputrec->ex,inputrec->et,t);
 +        }
 +
 +        if (bDoAdressWF && fr->adress_icor == eAdressICThermoForce)
 +        {
 +            /* Compute thermodynamic force in hybrid AdResS region */
 +            adress_thermo_force(start,homenr,&(top->cgs),x,fr->f_novirsum,fr,mdatoms,
 +                                inputrec->ePBC==epbcNONE ? NULL : &pbc);
 +        }
 +        
 +        /* Communicate the forces */
 +        if (PAR(cr))
 +        {
 +            wallcycle_start(wcycle,ewcMOVEF);
 +            if (DOMAINDECOMP(cr))
 +            {
 +                dd_move_f(cr->dd,f,fr->fshift);
 +                /* Do we need to communicate the separate force array
 +                 * for terms that do not contribute to the single sum virial?
 +                 * Position restraints and electric fields do not introduce
 +                 * inter-cg forces, only full electrostatics methods do.
 +                 * When we do not calculate the virial, fr->f_novirsum = f,
 +                 * so we have already communicated these forces.
 +                 */
 +                if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
 +                    (flags & GMX_FORCE_VIRIAL))
 +                {
 +                    dd_move_f(cr->dd,fr->f_novirsum,NULL);
 +                }
 +                if (bSepLRF)
 +                {
 +                    /* We should not update the shift forces here,
 +                     * since f_twin is already included in f.
 +                     */
 +                    dd_move_f(cr->dd,fr->f_twin,NULL);
 +                }
 +            }
 +            else
 +            {
 +                pd_move_f(cr,f,nrnb);
 +                if (bSepLRF)
 +                {
 +                    pd_move_f(cr,fr->f_twin,nrnb);
 +                }
 +            }
 +            wallcycle_stop(wcycle,ewcMOVEF);
 +        }
 +
 +        /* If we have NoVirSum forces, but we do not calculate the virial,
 +         * we sum fr->f_novirum=f later.
 +         */
 +        if (vsite && !(fr->bF_NoVirSum && !(flags & GMX_FORCE_VIRIAL)))
 +        {
 +            wallcycle_start(wcycle,ewcVSITESPREAD);
 +            spread_vsite_f(fplog,vsite,x,f,fr->fshift,FALSE,NULL,nrnb,
 +                           &top->idef,fr->ePBC,fr->bMolPBC,graph,box,cr);
 +            wallcycle_stop(wcycle,ewcVSITESPREAD);
 +
 +            if (bSepLRF)
 +            {
 +                wallcycle_start(wcycle,ewcVSITESPREAD);
 +                spread_vsite_f(fplog,vsite,x,fr->f_twin,NULL,FALSE,NULL,
 +                               nrnb,
 +                               &top->idef,fr->ePBC,fr->bMolPBC,graph,box,cr);
 +                wallcycle_stop(wcycle,ewcVSITESPREAD);
 +            }
 +        }
 +        
 +        if (flags & GMX_FORCE_VIRIAL)
 +        {
 +            /* Calculation of the virial must be done after vsites! */
 +            calc_virial(fplog,mdatoms->start,mdatoms->homenr,x,f,
 +                        vir_force,graph,box,nrnb,fr,inputrec->ePBC);
 +        }
 +    }
 +
 +    enerd->term[F_COM_PULL] = 0;
 +    if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
 +    {
 +        /* Calculate the center of mass forces, this requires communication,
 +         * which is why pull_potential is called close to other communication.
 +         * The virial contribution is calculated directly,
 +         * which is why we call pull_potential after calc_virial.
 +         */
 +        set_pbc(&pbc,inputrec->ePBC,box);
 +        dvdl = 0; 
 +        enerd->term[F_COM_PULL] +=
 +            pull_potential(inputrec->ePull,inputrec->pull,mdatoms,&pbc,
 +                           cr,t,lambda,x,f,vir_force,&dvdl);
 +        if (bSepDVDL)
 +        {
 +            fprintf(fplog,sepdvdlformat,"Com pull",enerd->term[F_COM_PULL],dvdl);
 +        }
 +        enerd->dvdl_lin += dvdl;
 +    }
 +    
 +    /* Add the forces from enforced rotation potentials (if any) */
 +    if (inputrec->bRot)
 +    {
 +        wallcycle_start(wcycle,ewcROTadd);
 +        enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr,step,t);
 +        wallcycle_stop(wcycle,ewcROTadd);
 +    }
 +
 +    if (PAR(cr) && !(cr->duty & DUTY_PME))
 +    {
 +        cycles_ppdpme = wallcycle_stop(wcycle,ewcPPDURINGPME);
 +        dd_cycles_add(cr->dd,cycles_ppdpme,ddCyclPPduringPME);
 +
 +        /* In case of node-splitting, the PP nodes receive the long-range 
 +         * forces, virial and energy from the PME nodes here.
 +         */    
 +        wallcycle_start(wcycle,ewcPP_PMEWAITRECVF);
 +        dvdl = 0;
 +        gmx_pme_receive_f(cr,fr->f_novirsum,fr->vir_el_recip,&e,&dvdl,
 +                          &cycles_seppme);
 +        if (bSepDVDL)
 +        {
 +            fprintf(fplog,sepdvdlformat,"PME mesh",e,dvdl);
 +        }
 +        enerd->term[F_COUL_RECIP] += e;
 +        enerd->dvdl_lin += dvdl;
 +        if (wcycle)
 +        {
 +            dd_cycles_add(cr->dd,cycles_seppme,ddCyclPME);
 +        }
 +        wallcycle_stop(wcycle,ewcPP_PMEWAITRECVF);
 +    }
 +
 +    if (bDoForces && fr->bF_NoVirSum)
 +    {
 +        if (vsite)
 +        {
 +            /* Spread the mesh force on virtual sites to the other particles... 
 +             * This is parallellized. MPI communication is performed
 +             * if the constructing atoms aren't local.
 +             */
 +            wallcycle_start(wcycle,ewcVSITESPREAD);
 +            spread_vsite_f(fplog,vsite,x,fr->f_novirsum,NULL,
 +                           (flags & GMX_FORCE_VIRIAL),fr->vir_el_recip,
 +                           nrnb,
 +                           &top->idef,fr->ePBC,fr->bMolPBC,graph,box,cr);
 +            wallcycle_stop(wcycle,ewcVSITESPREAD);
 +        }
 +        if (flags & GMX_FORCE_VIRIAL)
 +        {
 +            /* Now add the forces, this is local */
 +            if (fr->bDomDec)
 +            {
 +                sum_forces(0,fr->f_novirsum_n,f,fr->f_novirsum);
 +            }
 +            else
 +            {
 +                sum_forces(start,start+homenr,f,fr->f_novirsum);
 +            }
 +            if (EEL_FULL(fr->eeltype))
 +            {
 +                /* Add the mesh contribution to the virial */
 +                m_add(vir_force,fr->vir_el_recip,vir_force);
 +            }
 +            if (debug)
 +            {
 +                pr_rvecs(debug,0,"vir_force",vir_force,DIM);
 +            }
 +        }
 +    }
 +    
 +    /* Sum the potential energy terms from group contributions */
 +    sum_epot(&(inputrec->opts),enerd);
 +    
 +    if (fr->print_force >= 0 && bDoForces)
 +    {
 +        print_large_forces(stderr,mdatoms,cr,step,fr->print_force,x,f);
 +    }
 +}
 +
 +void do_constrain_first(FILE *fplog,gmx_constr_t constr,
 +                        t_inputrec *ir,t_mdatoms *md,
 +                        t_state *state,rvec *f,
 +                        t_graph *graph,t_commrec *cr,t_nrnb *nrnb,
 +                        t_forcerec *fr, gmx_localtop_t *top, tensor shake_vir)
 +{
 +    int    i,m,start,end;
 +    gmx_large_int_t step;
-     for(m=0; (m<4); m++)
-         vcm[m] = 0;
-     for(i=start; i<end; i++) {
-         mass = md->massT[i];
-         for(m=0; m<DIM; m++) {
-             vcm[m] += state->v[i][m]*mass;
-         }
-         vcm[3] += mass;
-     }
-     
-     if (ir->nstcomm != 0 || debug) {
-         /* Compute the global sum of vcm */
-         if (debug)
-             fprintf(debug,"vcm: %8.3f  %8.3f  %8.3f,"
-                     " total mass = %12.5e\n",vcm[XX],vcm[YY],vcm[ZZ],vcm[3]);
-         if (PAR(cr))
-             gmx_sumd(4,vcm,cr);
-         tmass = vcm[3];
-         for(m=0; (m<DIM); m++)
-             vcm[m] /= tmass;
-         if (debug) 
-             fprintf(debug,"vcm: %8.3f  %8.3f  %8.3f,"
-                     " total mass = %12.5e\n",vcm[XX],vcm[YY],vcm[ZZ],tmass);
-         if (ir->nstcomm != 0) {
-             /* Now we have the velocity of center of mass, let's remove it */
-             for(i=start; (i<end); i++) {
-                 for(m=0; (m<DIM); m++)
-                     state->v[i][m] -= vcm[m];
-             }
-         }
-     }
 +    real   dt=ir->delta_t;
 +    real   dvdlambda;
 +    rvec   *savex;
 +    
 +    snew(savex,state->natoms);
 +
 +    start = md->start;
 +    end   = md->homenr + start;
 +    
 +    if (debug)
 +        fprintf(debug,"vcm: start=%d, homenr=%d, end=%d\n",
 +                start,md->homenr,end);
 +    /* Do a first constrain to reset particles... */
 +    step = ir->init_step;
 +    if (fplog)
 +    {
 +        char buf[STEPSTRSIZE];
 +        fprintf(fplog,"\nConstraining the starting coordinates (step %s)\n",
 +                gmx_step_str(step,buf));
 +    }
 +    dvdlambda = 0;
 +    
 +    /* constrain the current position */
 +    constrain(NULL,TRUE,FALSE,constr,&(top->idef),
 +              ir,NULL,cr,step,0,md,
 +              state->x,state->x,NULL,
 +              state->box,state->lambda,&dvdlambda,
 +              NULL,NULL,nrnb,econqCoord,ir->epc==epcMTTK,state->veta,state->veta);
 +    if (EI_VV(ir->eI)) 
 +    {
 +        /* constrain the inital velocity, and save it */
 +        /* also may be useful if we need the ekin from the halfstep for velocity verlet */
 +        /* might not yet treat veta correctly */
 +        constrain(NULL,TRUE,FALSE,constr,&(top->idef),
 +                  ir,NULL,cr,step,0,md,
 +                  state->x,state->v,state->v,
 +                  state->box,state->lambda,&dvdlambda,
 +                  NULL,NULL,nrnb,econqVeloc,ir->epc==epcMTTK,state->veta,state->veta);
 +    }
 +    /* constrain the inital velocities at t-dt/2 */
 +    if (EI_STATE_VELOCITY(ir->eI) && ir->eI!=eiVV)
 +    {
 +        for(i=start; (i<end); i++) 
 +        {
 +            for(m=0; (m<DIM); m++) 
 +            {
 +                /* Reverse the velocity */
 +                state->v[i][m] = -state->v[i][m];
 +                /* Store the position at t-dt in buf */
 +                savex[i][m] = state->x[i][m] + dt*state->v[i][m];
 +            }
 +        }
 +    /* Shake the positions at t=-dt with the positions at t=0                        
 +     * as reference coordinates.                                                     
 +         */
 +        if (fplog)
 +        {
 +            char buf[STEPSTRSIZE];
 +            fprintf(fplog,"\nConstraining the coordinates at t0-dt (step %s)\n",
 +                    gmx_step_str(step,buf));
 +        }
 +        dvdlambda = 0;
 +        constrain(NULL,TRUE,FALSE,constr,&(top->idef),
 +                  ir,NULL,cr,step,-1,md,
 +                  state->x,savex,NULL,
 +                  state->box,state->lambda,&dvdlambda,
 +                  state->v,NULL,nrnb,econqCoord,ir->epc==epcMTTK,state->veta,state->veta);
 +        
 +        for(i=start; i<end; i++) {
 +            for(m=0; m<DIM; m++) {
 +                /* Re-reverse the velocities */
 +                state->v[i][m] = -state->v[i][m];
 +            }
 +        }
 +    }
 +    
 +    sfree(savex);
 +}
 +
 +void calc_enervirdiff(FILE *fplog,int eDispCorr,t_forcerec *fr)
 +{
 +  double eners[2],virs[2],enersum,virsum,y0,f,g,h;
 +  double r0,r1,r,rc3,rc9,ea,eb,ec,pa,pb,pc,pd;
 +  double invscale,invscale2,invscale3;
 +  int    ri0,ri1,ri,i,offstart,offset;
 +  real   scale,*vdwtab; 
 +
 +  fr->enershiftsix = 0;
 +  fr->enershifttwelve = 0;
 +  fr->enerdiffsix = 0;
 +  fr->enerdifftwelve = 0;
 +  fr->virdiffsix = 0;
 +  fr->virdifftwelve = 0;
 +
 +  if (eDispCorr != edispcNO) {
 +    for(i=0; i<2; i++) {
 +      eners[i] = 0;
 +      virs[i]  = 0;
 +    }
 +    if ((fr->vdwtype == evdwSWITCH) || (fr->vdwtype == evdwSHIFT)) {
 +      if (fr->rvdw_switch == 0)
 +      gmx_fatal(FARGS,
 +                "With dispersion correction rvdw-switch can not be zero "
 +                "for vdw-type = %s",evdw_names[fr->vdwtype]);
 +
 +      scale  = fr->nblists[0].tab.scale;
 +      vdwtab = fr->nblists[0].vdwtab;
 +
 +      /* Round the cut-offs to exact table values for precision */
 +      ri0 = floor(fr->rvdw_switch*scale);
 +      ri1 = ceil(fr->rvdw*scale);
 +      r0  = ri0/scale;
 +      r1  = ri1/scale;
 +      rc3 = r0*r0*r0;
 +      rc9  = rc3*rc3*rc3;
 +
 +      if (fr->vdwtype == evdwSHIFT) {
 +      /* Determine the constant energy shift below rvdw_switch */
 +      fr->enershiftsix    = (real)(-1.0/(rc3*rc3)) - vdwtab[8*ri0];
 +      fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - vdwtab[8*ri0 + 4];
 +      }
 +      /* Add the constant part from 0 to rvdw_switch.
 +       * This integration from 0 to rvdw_switch overcounts the number
 +       * of interactions by 1, as it also counts the self interaction.
 +       * We will correct for this later.
 +       */
 +      eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
 +      eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
 +      
 +      invscale = 1.0/(scale);  
 +      invscale2 = invscale*invscale;
 +      invscale3 = invscale*invscale2;
 +
 +      /* following summation derived from cubic spline definition,
 +      Numerical Recipies in C, second edition, p. 113-116.  Exact
 +      for the cubic spline.  We first calculate the negative of
 +      the energy from rvdw to rvdw_switch, assuming that g(r)=1,
 +      and then add the more standard, abrupt cutoff correction to
 +      that result, yielding the long-range correction for a
 +      switched function.  We perform both the pressure and energy
 +      loops at the same time for simplicity, as the computational
 +      cost is low. */
 +      
 +      for (i=0;i<2;i++) {
 +        enersum = 0.0; virsum = 0.0;
 +        if (i==0)
 +        offstart = 0;
 +      else
 +        offstart = 4;
 +      for (ri=ri0; ri<ri1; ri++) {
 +          r = ri*invscale;
 +          ea = invscale3;
 +          eb = 2.0*invscale2*r;
 +          ec = invscale*r*r;
 +          
 +          pa = invscale3;
 +          pb = 3.0*invscale2*r;
 +          pc = 3.0*invscale*r*r;
 +          pd = r*r*r;
 +          
 +          /* this "8" is from the packing in the vdwtab array - perhaps
 +          should be #define'ed? */
 +          offset = 8*ri + offstart;
 +          y0 = vdwtab[offset];
 +          f = vdwtab[offset+1];
 +          g = vdwtab[offset+2];
 +          h = vdwtab[offset+3];
 +        
 +          enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2)+
 +            g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);  
 +          virsum  +=  f*(pa/4 + pb/3 + pc/2 + pd) + 
 +            2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
 +        
 +        }
 +        enersum *= 4.0*M_PI;
 +        virsum  *= 4.0*M_PI; 
 +        eners[i] -= enersum;
 +        virs[i]  -= virsum;
 +      }
 +
 +      /* now add the correction for rvdw_switch to infinity */
 +      eners[0] += -4.0*M_PI/(3.0*rc3);
 +      eners[1] +=  4.0*M_PI/(9.0*rc9);
 +      virs[0]  +=  8.0*M_PI/rc3;
 +      virs[1]  += -16.0*M_PI/(3.0*rc9);
 +    } 
 +    else if ((fr->vdwtype == evdwCUT) || (fr->vdwtype == evdwUSER)) {
 +      if (fr->vdwtype == evdwUSER && fplog)
 +      fprintf(fplog,
 +              "WARNING: using dispersion correction with user tables\n");
 +      rc3  = fr->rvdw*fr->rvdw*fr->rvdw;
 +      rc9  = rc3*rc3*rc3;
 +      eners[0] += -4.0*M_PI/(3.0*rc3);
 +      eners[1] +=  4.0*M_PI/(9.0*rc9);
 +      virs[0]  +=  8.0*M_PI/rc3;
 +      virs[1]  += -16.0*M_PI/(3.0*rc9);
 +    } else {
 +      gmx_fatal(FARGS,
 +              "Dispersion correction is not implemented for vdw-type = %s",
 +              evdw_names[fr->vdwtype]);
 +    }
 +    fr->enerdiffsix    = eners[0];
 +    fr->enerdifftwelve = eners[1];
 +    /* The 0.5 is due to the Gromacs definition of the virial */
 +    fr->virdiffsix     = 0.5*virs[0];
 +    fr->virdifftwelve  = 0.5*virs[1];
 +  }
 +}
 +
 +void calc_dispcorr(FILE *fplog,t_inputrec *ir,t_forcerec *fr,
 +                   gmx_large_int_t step,int natoms,
 +                   matrix box,real lambda,tensor pres,tensor virial,
 +                   real *prescorr, real *enercorr, real *dvdlcorr)
 +{
 +    gmx_bool bCorrAll,bCorrPres;
 +    real dvdlambda,invvol,dens,ninter,avcsix,avctwelve,enerdiff,svir=0,spres=0;
 +    int  m;
 +    
 +    *prescorr = 0;
 +    *enercorr = 0;
 +    *dvdlcorr = 0;
 +    
 +    clear_mat(virial);
 +    clear_mat(pres);
 +    
 +    if (ir->eDispCorr != edispcNO) {
 +        bCorrAll  = (ir->eDispCorr == edispcAllEner ||
 +                     ir->eDispCorr == edispcAllEnerPres);
 +        bCorrPres = (ir->eDispCorr == edispcEnerPres ||
 +                     ir->eDispCorr == edispcAllEnerPres);
 +        
 +        invvol = 1/det(box);
 +        if (fr->n_tpi) 
 +        {
 +            /* Only correct for the interactions with the inserted molecule */
 +            dens = (natoms - fr->n_tpi)*invvol;
 +            ninter = fr->n_tpi;
 +        } 
 +        else 
 +        {
 +            dens = natoms*invvol;
 +            ninter = 0.5*natoms;
 +        }
 +        
 +        if (ir->efep == efepNO) 
 +        {
 +            avcsix    = fr->avcsix[0];
 +            avctwelve = fr->avctwelve[0];
 +        } 
 +        else 
 +        {
 +            avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
 +            avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
 +        }
 +        
 +        enerdiff = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
 +        *enercorr += avcsix*enerdiff;
 +        dvdlambda = 0.0;
 +        if (ir->efep != efepNO) 
 +        {
 +            dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
 +        }
 +        if (bCorrAll) 
 +        {
 +            enerdiff = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
 +            *enercorr += avctwelve*enerdiff;
 +            if (fr->efep != efepNO) 
 +            {
 +                dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
 +            }
 +        }
 +        
 +        if (bCorrPres) 
 +        {
 +            svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
 +            if (ir->eDispCorr == edispcAllEnerPres)
 +            {
 +                svir += ninter*dens*avctwelve*fr->virdifftwelve/3.0;
 +            }
 +            /* The factor 2 is because of the Gromacs virial definition */
 +            spres = -2.0*invvol*svir*PRESFAC;
 +            
 +            for(m=0; m<DIM; m++) {
 +                virial[m][m] += svir;
 +                pres[m][m] += spres;
 +            }
 +            *prescorr += spres;
 +        }
 +        
 +        /* Can't currently control when it prints, for now, just print when degugging */
 +        if (debug)
 +        {
 +            if (bCorrAll) {
 +                fprintf(debug,"Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
 +                        avcsix,avctwelve);
 +            }
 +            if (bCorrPres) 
 +            {
 +                fprintf(debug,
 +                        "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
 +                        *enercorr,spres,svir);
 +            }
 +            else
 +            {
 +                fprintf(debug,"Long Range LJ corr.: Epot %10g\n",*enercorr);
 +            }
 +        }
 +        
 +        if (fr->bSepDVDL && do_per_step(step,ir->nstlog))
 +        {
 +            fprintf(fplog,sepdvdlformat,"Dispersion correction",
 +                    *enercorr,dvdlambda);
 +        }
 +        if (fr->efep != efepNO) 
 +        {
 +            *dvdlcorr += dvdlambda;
 +        }
 +    }
 +}
 +
 +void do_pbc_first(FILE *fplog,matrix box,t_forcerec *fr,
 +                t_graph *graph,rvec x[])
 +{
 +  if (fplog)
 +    fprintf(fplog,"Removing pbc first time\n");
 +  calc_shifts(box,fr->shift_vec);
 +  if (graph) {
 +    mk_mshift(fplog,graph,fr->ePBC,box,x);
 +    if (gmx_debug_at)
 +      p_graph(debug,"do_pbc_first 1",graph);
 +    shift_self(graph,box,x);
 +    /* By doing an extra mk_mshift the molecules that are broken
 +     * because they were e.g. imported from another software
 +     * will be made whole again. Such are the healing powers
 +     * of GROMACS.
 +     */
 +    mk_mshift(fplog,graph,fr->ePBC,box,x);
 +    if (gmx_debug_at)
 +      p_graph(debug,"do_pbc_first 2",graph);
 +  }
 +  if (fplog)
 +    fprintf(fplog,"Done rmpbc\n");
 +}
 +
 +static void low_do_pbc_mtop(FILE *fplog,int ePBC,matrix box,
 +                          gmx_mtop_t *mtop,rvec x[],
 +                          gmx_bool bFirst)
 +{
 +  t_graph *graph;
 +  int mb,as,mol;
 +  gmx_molblock_t *molb;
 +
 +  if (bFirst && fplog)
 +    fprintf(fplog,"Removing pbc first time\n");
 +
 +  snew(graph,1);
 +  as = 0;
 +  for(mb=0; mb<mtop->nmolblock; mb++) {
 +    molb = &mtop->molblock[mb];
 +    if (molb->natoms_mol == 1 || 
 +      (!bFirst && mtop->moltype[molb->type].cgs.nr == 1)) {
 +      /* Just one atom or charge group in the molecule, no PBC required */
 +      as += molb->nmol*molb->natoms_mol;
 +    } else {
 +      /* Pass NULL iso fplog to avoid graph prints for each molecule type */
 +      mk_graph_ilist(NULL,mtop->moltype[molb->type].ilist,
 +                   0,molb->natoms_mol,FALSE,FALSE,graph);
 +      
 +      for(mol=0; mol<molb->nmol; mol++) {
 +      mk_mshift(fplog,graph,ePBC,box,x+as);
 +      
 +      shift_self(graph,box,x+as);
 +      /* The molecule is whole now.
 +       * We don't need the second mk_mshift call as in do_pbc_first,
 +       * since we no longer need this graph.
 +       */
 +      
 +      as += molb->natoms_mol;
 +      }
 +      done_graph(graph);
 +    }
 +  }
 +  sfree(graph);
 +}
 +
 +void do_pbc_first_mtop(FILE *fplog,int ePBC,matrix box,
 +                     gmx_mtop_t *mtop,rvec x[])
 +{
 +  low_do_pbc_mtop(fplog,ePBC,box,mtop,x,TRUE);
 +}
 +
 +void do_pbc_mtop(FILE *fplog,int ePBC,matrix box,
 +               gmx_mtop_t *mtop,rvec x[])
 +{
 +  low_do_pbc_mtop(fplog,ePBC,box,mtop,x,FALSE);
 +}
 +
 +void finish_run(FILE *fplog,t_commrec *cr,const char *confout,
 +                t_inputrec *inputrec,
 +                t_nrnb nrnb[],gmx_wallcycle_t wcycle,
 +                gmx_runtime_t *runtime,
 +                gmx_bool bWriteStat)
 +{
 +  int    i,j;
 +  t_nrnb *nrnb_tot=NULL;
 +  real   delta_t;
 +  double nbfs,mflop;
 +  double cycles[ewcNR];
 +
 +  wallcycle_sum(cr,wcycle,cycles);
 +
 +  if (cr->nnodes > 1) {
 +    if (SIMMASTER(cr))
 +      snew(nrnb_tot,1);
 +#ifdef GMX_MPI
 +    MPI_Reduce(nrnb->n,nrnb_tot->n,eNRNB,MPI_DOUBLE,MPI_SUM,
 +               MASTERRANK(cr),cr->mpi_comm_mysim);
 +#endif  
 +  } else {
 +    nrnb_tot = nrnb;
 +  }
 +    
 +  if (SIMMASTER(cr)) {
 +    print_flop(fplog,nrnb_tot,&nbfs,&mflop);
 +    if (cr->nnodes > 1) {
 +      sfree(nrnb_tot);
 +    }
 +  }
 +
 +  if ((cr->duty & DUTY_PP) && DOMAINDECOMP(cr)) {
 +    print_dd_statistics(cr,inputrec,fplog);
 +  }
 +
 +#ifdef GMX_MPI
 +    if (PARTDECOMP(cr))
 +    {
 +        if (MASTER(cr))
 +        {
 +            t_nrnb     *nrnb_all;
 +            int        s;
 +            MPI_Status stat;
 +
 +            snew(nrnb_all,cr->nnodes);
 +            nrnb_all[0] = *nrnb;
 +            for(s=1; s<cr->nnodes; s++)
 +            {
 +                MPI_Recv(nrnb_all[s].n,eNRNB,MPI_DOUBLE,s,0,
 +                         cr->mpi_comm_mysim,&stat);
 +            }
 +            pr_load(fplog,cr,nrnb_all);
 +            sfree(nrnb_all);
 +        }
 +        else
 +        {
 +            MPI_Send(nrnb->n,eNRNB,MPI_DOUBLE,MASTERRANK(cr),0,
 +                     cr->mpi_comm_mysim);
 +        }
 +    }
 +#endif  
 +
 +  if (SIMMASTER(cr)) {
 +    wallcycle_print(fplog,cr->nnodes,cr->npmenodes,runtime->realtime,
 +                    wcycle,cycles);
 +
 +    if (EI_DYNAMICS(inputrec->eI)) {
 +      delta_t = inputrec->delta_t;
 +    } else {
 +      delta_t = 0;
 +    }
 +    
 +    if (fplog) {
 +        print_perf(fplog,runtime->proctime,runtime->realtime,
 +                   cr->nnodes-cr->npmenodes,
 +                   runtime->nsteps_done,delta_t,nbfs,mflop);
 +    }
 +    if (bWriteStat) {
 +        print_perf(stderr,runtime->proctime,runtime->realtime,
 +                   cr->nnodes-cr->npmenodes,
 +                   runtime->nsteps_done,delta_t,nbfs,mflop);
 +    }
 +
 +    /*
 +    runtime=inputrec->nsteps*inputrec->delta_t;
 +    if (bWriteStat) {
 +      if (cr->nnodes == 1)
 +      fprintf(stderr,"\n\n");
 +      print_perf(stderr,nodetime,realtime,runtime,&ntot,
 +               cr->nnodes-cr->npmenodes,FALSE);
 +    }
 +    wallcycle_print(fplog,cr->nnodes,cr->npmenodes,realtime,wcycle,cycles);
 +    print_perf(fplog,nodetime,realtime,runtime,&ntot,cr->nnodes-cr->npmenodes,
 +             TRUE);
 +    if (PARTDECOMP(cr))
 +      pr_load(fplog,cr,nrnb_all);
 +    if (cr->nnodes > 1)
 +      sfree(nrnb_all);
 +    */
 +  }
 +}
 +
 +void init_md(FILE *fplog,
 +             t_commrec *cr,t_inputrec *ir,const output_env_t oenv,
 +             double *t,double *t0,
 +             real *lambda,double *lam0,
 +             t_nrnb *nrnb,gmx_mtop_t *mtop,
 +             gmx_update_t *upd,
 +             int nfile,const t_filenm fnm[],
 +             gmx_mdoutf_t **outf,t_mdebin **mdebin,
 +             tensor force_vir,tensor shake_vir,rvec mu_tot,
 +             gmx_bool *bSimAnn,t_vcm **vcm, t_state *state, unsigned long Flags)
 +{
 +    int  i,j,n;
 +    real tmpt,mod;
 +      
 +    /* Initial values */
 +    *t = *t0       = ir->init_t;
 +    if (ir->efep != efepNO)
 +    {
 +        *lam0 = ir->init_lambda;
 +        *lambda = *lam0 + ir->init_step*ir->delta_lambda;
 +    }
 +    else
 +    {
 +        *lambda = *lam0   = 0.0;
 +    } 
 +
 +    *bSimAnn=FALSE;
 +    for(i=0;i<ir->opts.ngtc;i++)
 +    {
 +        /* set bSimAnn if any group is being annealed */
 +        if(ir->opts.annealing[i]!=eannNO)
 +        {
 +            *bSimAnn = TRUE;
 +        }
 +    }
 +    if (*bSimAnn)
 +    {
 +        update_annealing_target_temp(&(ir->opts),ir->init_t);
 +    }
 +    
 +    if (upd)
 +    {
 +        *upd = init_update(fplog,ir);
 +    }
 +    
 +    if (vcm != NULL)
 +    {
 +        *vcm = init_vcm(fplog,&mtop->groups,ir);
 +    }
 +    
 +    if (EI_DYNAMICS(ir->eI) && !(Flags & MD_APPENDFILES))
 +    {
 +        if (ir->etc == etcBERENDSEN)
 +        {
 +            please_cite(fplog,"Berendsen84a");
 +        }
 +        if (ir->etc == etcVRESCALE)
 +        {
 +            please_cite(fplog,"Bussi2007a");
 +        }
 +    }
 +    
 +    init_nrnb(nrnb);
 +    
 +    if (nfile != -1)
 +    {
 +        *outf = init_mdoutf(nfile,fnm,Flags,cr,ir,oenv);
 +
 +        *mdebin = init_mdebin((Flags & MD_APPENDFILES) ? NULL : (*outf)->fp_ene,
 +                              mtop,ir, (*outf)->fp_dhdl);
 +    }
 +    
 +    if (ir->bAdress)
 +    {
 +      please_cite(fplog,"Fritsch12");
 +      please_cite(fplog,"Junghans10");
 +    }
 +    /* Initiate variables */  
 +    clear_mat(force_vir);
 +    clear_mat(shake_vir);
 +    clear_rvec(mu_tot);
 +    
 +    debug_gmx();
 +}
 +
Simple merge
Simple merge
Simple merge
index cb649a938558f2dbe63d908676ff1b5b4a6a9e8e,0000000000000000000000000000000000000000..c0b28372d22d5983efe0eb0d66988ee8b52c502e
mode 100644,000000..100644
--- /dev/null
@@@ -1,531 -1,0 +1,532 @@@
-         "that [TT]-om[tt] only operates on the first selection.[PAR]",
 +/*
 + *
 + *                This source code is part of
 + *
 + *                 G   R   O   M   A   C   S
 + *
 + *          GROningen MAchine for Chemical Simulations
 + *
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2009, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + *
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + *
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + *
 + * For more info, check our website at http://www.gromacs.org
 + */
 +/*! \internal \file
 + * \brief
 + * Implements gmx::analysismodules::Select.
 + *
 + * \author Teemu Murtola <teemu.murtola@cbr.su.se>
 + * \ingroup module_trajectoryanalysis
 + */
 +#include "select.h"
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <algorithm>
 +#include <cstdio>
 +#include <string>
 +#include <vector>
 +
 +#include "gmxfio.h"
 +
 +#include "gromacs/analysisdata/analysisdata.h"
 +#include "gromacs/analysisdata/dataframe.h"
 +#include "gromacs/analysisdata/datamodule.h"
 +#include "gromacs/analysisdata/modules/plot.h"
 +#include "gromacs/options/basicoptions.h"
 +#include "gromacs/options/filenameoption.h"
 +#include "gromacs/options/options.h"
 +#include "gromacs/selection/selection.h"
 +#include "gromacs/selection/selectionoption.h"
 +#include "gromacs/trajectoryanalysis/analysissettings.h"
 +#include "gromacs/utility/exceptions.h"
 +#include "gromacs/utility/format.h"
 +
 +namespace gmx
 +{
 +
 +namespace analysismodules
 +{
 +
 +namespace
 +{
 +
 +/*! \internal \brief
 + * Data module for writing index files.
 + *
 + * \ingroup module_analysisdata
 + */
 +class IndexFileWriterModule : public AnalysisDataModuleInterface
 +{
 +    public:
 +        IndexFileWriterModule();
 +        virtual ~IndexFileWriterModule();
 +
 +        //! Sets the file name to write the index file to.
 +        void setFileName(const std::string &fnm);
 +        void addGroup(const std::string &name, bool bDynamic);
 +
 +        virtual int flags() const;
 +
 +        virtual void dataStarted(AbstractAnalysisData *data);
 +        virtual void frameStarted(const AnalysisDataFrameHeader &header);
 +        virtual void pointsAdded(const AnalysisDataPointSetRef &points);
 +        virtual void frameFinished(const AnalysisDataFrameHeader &header);
 +        virtual void dataFinished();
 +
 +    private:
 +        void closeFile();
 +
 +        struct GroupInfo
 +        {
 +            GroupInfo(const std::string &name, bool bDynamic)
 +                : name(name), bDynamic(bDynamic)
 +            { }
 +
 +            std::string         name;
 +            bool                bDynamic;
 +        };
 +
 +        std::string             _fnm;
 +        std::vector<GroupInfo>  _groups;
 +        FILE                   *_fp;
 +        int                     _currentGroup;
 +        int                     _currentSize;
 +        bool                    _bAnyWritten;
 +};
 +
 +/********************************************************************
 + * IndexFileWriterModule
 + */
 +
 +IndexFileWriterModule::IndexFileWriterModule()
 +    : _fp(NULL), _currentGroup(-1), _currentSize(0), _bAnyWritten(false)
 +{
 +}
 +
 +
 +IndexFileWriterModule::~IndexFileWriterModule()
 +{
 +    closeFile();
 +}
 +
 +
 +void IndexFileWriterModule::closeFile()
 +{
 +    if (_fp != NULL)
 +    {
 +        gmx_fio_fclose(_fp);
 +        _fp = NULL;
 +    }
 +}
 +
 +
 +void IndexFileWriterModule::setFileName(const std::string &fnm)
 +{
 +    _fnm = fnm;
 +}
 +
 +
 +void IndexFileWriterModule::addGroup(const std::string &name, bool bDynamic)
 +{
 +    std::string newName(name);
 +    std::replace(newName.begin(), newName.end(), ' ', '_');
 +    _groups.push_back(GroupInfo(newName, bDynamic));
 +}
 +
 +
 +int IndexFileWriterModule::flags() const
 +{
 +    return efAllowMulticolumn | efAllowMultipoint;
 +}
 +
 +
 +void IndexFileWriterModule::dataStarted(AbstractAnalysisData * /*data*/)
 +{
 +    if (!_fnm.empty())
 +    {
 +        _fp = gmx_fio_fopen(_fnm.c_str(), "w");
 +    }
 +}
 +
 +
 +void IndexFileWriterModule::frameStarted(const AnalysisDataFrameHeader & /*header*/)
 +{
 +    _bAnyWritten = false;
 +    _currentGroup = -1;
 +}
 +
 +
 +void
 +IndexFileWriterModule::pointsAdded(const AnalysisDataPointSetRef &points)
 +{
 +    if (_fp == NULL)
 +    {
 +        return;
 +    }
 +    bool bFirstFrame = (points.frameIndex() == 0);
 +    if (points.firstColumn() == 0)
 +    {
 +        ++_currentGroup;
 +        if (bFirstFrame || _groups[_currentGroup].bDynamic)
 +        {
 +            if (!bFirstFrame || _currentGroup > 0)
 +            {
 +                std::fprintf(_fp, "\n\n");
 +            }
 +            std::string name = _groups[_currentGroup].name;
 +            if (_groups[_currentGroup].bDynamic)
 +            {
 +                name += formatString("_f%d_t%.3f", points.frameIndex(), points.x());
 +            }
 +            std::fprintf(_fp, "[ %s ]", name.c_str());
 +            _bAnyWritten = true;
 +            _currentSize = 0;
 +        }
 +    }
 +    else
 +    {
 +        if (bFirstFrame || _groups[_currentGroup].bDynamic)
 +        {
 +            if (_currentSize % 15 == 0)
 +            {
 +                std::fprintf(_fp, "\n");
 +            }
 +            std::fprintf(_fp, "%4d ", static_cast<int>(points.y(0)));
 +            ++_currentSize;
 +        }
 +    }
 +}
 +
 +
 +void IndexFileWriterModule::frameFinished(const AnalysisDataFrameHeader & /*header*/)
 +{
 +}
 +
 +
 +void IndexFileWriterModule::dataFinished()
 +{
 +    if (_fp != NULL)
 +    {
 +        std::fprintf(_fp, "\n");
 +    }
 +    closeFile();
 +}
 +
 +} // namespace
 +
 +
 +/********************************************************************
 + * Select
 + */
 +
 +Select::Select()
 +    : _options("select", "Selection information"),
 +      _bDump(false), _bTotNorm(false), _bFracNorm(false), _bResInd(false),
 +      _top(NULL)
 +{
 +}
 +
 +
 +Select::~Select()
 +{
 +}
 +
 +
 +Options &
 +Select::initOptions(TrajectoryAnalysisSettings *settings)
 +{
 +    static const char *const desc[] = {
 +        "[TT]g_select[tt] writes out basic data about dynamic selections.",
 +        "It can be used for some simple analyses, or the output can",
 +        "be combined with output from other programs and/or external",
 +        "analysis programs to calculate more complex things.",
 +        "Any combination of the output options is possible, but note",
++        "that [TT]-om[tt] only operates on the first selection.",
++        "[TT]-os[tt] is the default output option if none is selected.[PAR]",
 +        "With [TT]-os[tt], calculates the number of positions in each",
 +        "selection for each frame. With [TT]-norm[tt], the output is",
 +        "between 0 and 1 and describes the fraction from the maximum",
 +        "number of positions (e.g., for selection 'resname RA and x < 5'",
 +        "the maximum number of positions is the number of atoms in",
 +        "RA residues). With [TT]-cfnorm[tt], the output is divided",
 +        "by the fraction covered by the selection.",
 +        "[TT]-norm[tt] and [TT]-cfnorm[tt] can be specified independently",
 +        "of one another.[PAR]",
 +        "With [TT]-oc[tt], the fraction covered by each selection is",
 +        "written out as a function of time.[PAR]",
 +        "With [TT]-oi[tt], the selected atoms/residues/molecules are",
 +        "written out as a function of time. In the output, the first",
 +        "column contains the frame time, the second contains the number",
 +        "of positions, followed by the atom/residue/molecule numbers.",
 +        "If more than one selection is specified, the size of the second",
 +        "group immediately follows the last number of the first group",
 +        "and so on. With [TT]-dump[tt], the frame time and the number",
 +        "of positions is omitted from the output. In this case, only one",
 +        "selection can be given.[PAR]",
 +        "With [TT]-on[tt], the selected atoms are written as a index file",
 +        "compatible with [TT]make_ndx[tt] and the analyzing tools. Each selection",
 +        "is written as a selection group and for dynamic selections a",
 +        "group is written for each frame.[PAR]",
 +        "For residue numbers, the output of [TT]-oi[tt] can be controlled",
 +        "with [TT]-resnr[tt]: [TT]number[tt] (default) prints the residue",
 +        "numbers as they appear in the input file, while [TT]index[tt] prints",
 +        "unique numbers assigned to the residues in the order they appear",
 +        "in the input file, starting with 1. The former is more intuitive,",
 +        "but if the input contains multiple residues with the same number,",
 +        "the output can be less useful.[PAR]",
 +        "With [TT]-om[tt], a mask is printed for the first selection",
 +        "as a function of time. Each line in the output corresponds to",
 +        "one frame, and contains either 0/1 for each atom/residue/molecule",
 +        "possibly selected. 1 stands for the atom/residue/molecule being",
 +        "selected for the current frame, 0 for not selected.",
 +        "With [TT]-dump[tt], the frame time is omitted from the output."
 +    };
 +
 +    _options.setDescription(concatenateStrings(desc));
 +
 +    _options.addOption(FileNameOption("os").filetype(eftPlot).outputFile()
 +                           .store(&_fnSize).defaultValueIfSet("size"));
 +    _options.addOption(FileNameOption("oc").filetype(eftPlot).outputFile()
 +                           .store(&_fnFrac).defaultValueIfSet("frac"));
 +    _options.addOption(FileNameOption("oi").filetype(eftGenericData).outputFile()
 +                           .store(&_fnIndex).defaultValueIfSet("index"));
 +    _options.addOption(FileNameOption("on").filetype(eftIndex).outputFile()
 +                           .store(&_fnNdx).defaultValueIfSet("index"));
 +    _options.addOption(FileNameOption("om").filetype(eftPlot).outputFile()
 +                           .store(&_fnMask).defaultValueIfSet("mask"));
 +
 +    _options.addOption(SelectionOption("select").required().multiValue()
 +                           .storeVector(&_sel));
 +
 +    _options.addOption(BooleanOption("dump").store(&_bDump)
 +        .description("Do not print the frame time (-om, -oi) or the index size (-oi)"));
 +    _options.addOption(BooleanOption("norm").store(&_bTotNorm)
 +        .description("Normalize by total number of positions with -os"));
 +    _options.addOption(BooleanOption("cfnorm").store(&_bFracNorm)
 +        .description("Normalize by covered fraction with -os"));
 +    const char *const cResNumberEnum[] = { "number", "index", NULL };
 +    _options.addOption(StringOption("resnr").store(&_resNumberType)
 +        .enumValue(cResNumberEnum).defaultEnumIndex(0)
 +        .description("Residue number output type"));
 +
 +    return _options;
 +}
 +
 +
 +void
 +Select::initAnalysis(const TrajectoryAnalysisSettings &settings,
 +                     const TopologyInformation &top)
 +{
 +    if (!_fnIndex.empty() && _bDump && _sel.size() > 1U)
 +    {
 +        GMX_THROW(InconsistentInputError("With -oi and -dump, there can be only one selection"));
 +    }
 +    _bResInd = (_resNumberType == "index");
 +
 +    for (SelectionList::iterator i = _sel.begin(); i != _sel.end(); ++i)
 +    {
 +        i->initCoveredFraction(CFRAC_SOLIDANGLE);
 +    }
 +
 +    // TODO: For large systems, a float may not have enough precision
 +    _sdata.setColumnCount(_sel.size());
 +    registerAnalysisDataset(&_sdata, "size");
 +    _totsize.reserve(_sel.size());
 +    for (size_t g = 0; g < _sel.size(); ++g)
 +    {
 +        _totsize.push_back(_sel[g].posCount());
 +    }
 +    if (!_fnSize.empty())
 +    {
 +        AnalysisDataPlotModulePointer plot(
 +            new AnalysisDataPlotModule(settings.plotSettings()));
 +        plot->setFileName(_fnSize);
 +        plot->setTitle("Selection size");
 +        plot->setXAxisIsTime();
 +        plot->setYLabel("Number");
 +        _sdata.addModule(plot);
 +    }
 +
 +    _cdata.setColumnCount(_sel.size());
 +    registerAnalysisDataset(&_cdata, "cfrac");
 +    if (!_fnFrac.empty())
 +    {
 +        AnalysisDataPlotModulePointer plot(
 +            new AnalysisDataPlotModule(settings.plotSettings()));
 +        plot->setFileName(_fnFrac);
 +        plot->setTitle("Covered fraction");
 +        plot->setXAxisIsTime();
 +        plot->setYLabel("Fraction");
 +        plot->setYFormat(6, 4);
 +        _cdata.addModule(plot);
 +    }
 +
 +    // TODO: For large systems, a float may not have enough precision
 +    _idata.setColumnCount(2);
 +    _idata.setMultipoint(true);
 +    registerAnalysisDataset(&_idata, "index");
 +    if (!_fnIndex.empty())
 +    {
 +        AnalysisDataPlotModulePointer plot(
 +            new AnalysisDataPlotModule(settings.plotSettings()));
 +        plot->setFileName(_fnIndex);
 +        plot->setPlainOutput(true);
 +        plot->setYFormat(4, 0);
 +        if (_bDump)
 +        {
 +            plot->setOmitX(_bDump);
 +            _idata.addColumnModule(1, 1, plot);
 +        }
 +        else
 +        {
 +            _idata.addModule(plot);
 +        }
 +    }
 +    if (!_fnNdx.empty())
 +    {
 +        boost::shared_ptr<IndexFileWriterModule> writer(new IndexFileWriterModule());
 +        writer->setFileName(_fnNdx);
 +        for (size_t g = 0; g < _sel.size(); ++g)
 +        {
 +            writer->addGroup(_sel[g].name(), _sel[g].isDynamic());
 +        }
 +        _idata.addModule(writer);
 +    }
 +
 +    _mdata.setColumnCount(_sel[0].posCount());
 +    registerAnalysisDataset(&_mdata, "mask");
 +    if (!_fnMask.empty())
 +    {
 +        if (_sel.size() > 1U)
 +        {
 +            fprintf(stderr, "WARNING: the mask (-om) will only be written for the first group\n");
 +        }
 +        if (!_sel[0].isDynamic())
 +        {
 +            fprintf(stderr, "WARNING: will not write the mask (-om) for a static selection\n");
 +        }
 +        else
 +        {
 +            AnalysisDataPlotModulePointer plot(
 +                new AnalysisDataPlotModule(settings.plotSettings()));
 +            plot->setFileName(_fnMask);
 +            plot->setPlainOutput(_bDump);
 +            plot->setOmitX(_bDump);
 +            plot->setTitle("Selection mask");
 +            plot->setXAxisIsTime();
 +            plot->setYLabel("Occupancy");
 +            plot->setYFormat(1, 0);
 +            _mdata.addModule(plot);
 +        }
 +    }
 +
 +    _top = top.topology();
 +}
 +
 +
 +void
 +Select::analyzeFrame(int frnr, const t_trxframe &fr, t_pbc *pbc,
 +                     TrajectoryAnalysisModuleData *pdata)
 +{
 +    AnalysisDataHandle sdh = pdata->dataHandle(_sdata);
 +    AnalysisDataHandle cdh = pdata->dataHandle(_cdata);
 +    AnalysisDataHandle idh = pdata->dataHandle(_idata);
 +    AnalysisDataHandle mdh = pdata->dataHandle(_mdata);
 +    const SelectionList &sel = pdata->parallelSelections(_sel);
 +
 +    sdh.startFrame(frnr, fr.time);
 +    for (size_t g = 0; g < sel.size(); ++g)
 +    {
 +        real normfac = _bFracNorm ? 1.0 / sel[g].coveredFraction() : 1.0;
 +        if (_bTotNorm)
 +        {
 +            normfac /= _totsize[g];
 +        }
 +        sdh.setPoint(g, sel[g].posCount() * normfac);
 +    }
 +    sdh.finishFrame();
 +
 +    cdh.startFrame(frnr, fr.time);
 +    for (size_t g = 0; g < sel.size(); ++g)
 +    {
 +        cdh.setPoint(g, sel[g].coveredFraction());
 +    }
 +    cdh.finishFrame();
 +
 +    idh.startFrame(frnr, fr.time);
 +    for (size_t g = 0; g < sel.size(); ++g)
 +    {
 +        idh.setPoint(0, sel[g].posCount());
 +        idh.finishPointSet();
 +        for (int i = 0; i < sel[g].posCount(); ++i)
 +        {
 +            const SelectionPosition &p = sel[g].position(i);
 +            if (sel[g].type() == INDEX_RES && !_bResInd)
 +            {
 +                idh.setPoint(1, _top->atoms.resinfo[p.mappedId()].nr);
 +            }
 +            else
 +            {
 +                idh.setPoint(1, p.mappedId() + 1);
 +            }
 +            idh.finishPointSet();
 +        }
 +    }
 +    idh.finishFrame();
 +
 +    mdh.startFrame(frnr, fr.time);
 +    int nextRefId = sel[0].position(0).refId();
 +    for (int b = 0, i = 0; b < _totsize[0]; ++b)
 +    {
 +        mdh.setPoint(b, b == nextRefId ? 1 : 0);
 +        if (b == nextRefId)
 +        {
 +            ++i;
 +            nextRefId = sel[0].position(i).refId();
 +        }
 +    }
 +    mdh.finishFrame();
 +}
 +
 +
 +void
 +Select::finishAnalysis(int /*nframes*/)
 +{
 +}
 +
 +
 +void
 +Select::writeOutput()
 +{
 +}
 +
 +
 +TrajectoryAnalysisModulePointer
 +Select::create()
 +{
 +    return TrajectoryAnalysisModulePointer(new Select());
 +}
 +
 +} // namespace analysismodules
 +
 +} // namespace gmx
Simple merge
index 8a951c1ea0762992f8b3ffc6388e61f2196a264d,0000000000000000000000000000000000000000..ed135f0a9159a3f3ed8c8864bcecf3af3a8a000c
mode 100644,000000..100644
--- /dev/null
@@@ -1,1866 -1,0 +1,1864 @@@
- #if ((defined WIN32 || defined _WIN32 || defined WIN64 || defined _WIN64) && !defined __CYGWIN__ && !defined __CYGWIN32__)
- /* _isnan() */
- #include <float.h>
- #endif
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
- #include "pppm.h"
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "sysstuff.h"
 +#include "vec.h"
 +#include "statutil.h"
 +#include "vcm.h"
 +#include "mdebin.h"
 +#include "nrnb.h"
 +#include "calcmu.h"
 +#include "index.h"
 +#include "vsite.h"
 +#include "update.h"
 +#include "ns.h"
 +#include "trnio.h"
 +#include "xtcio.h"
 +#include "mdrun.h"
 +#include "confio.h"
 +#include "network.h"
 +#include "pull.h"
 +#include "xvgr.h"
 +#include "physics.h"
 +#include "names.h"
 +#include "xmdrun.h"
 +#include "ionize.h"
 +#include "disre.h"
 +#include "orires.h"
 +#include "dihre.h"
-     real        temp0,mu_aver=0,dvdl;
 +#include "pme.h"
 +#include "mdatoms.h"
 +#include "repl_ex.h"
 +#include "qmmm.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "topsort.h"
 +#include "coulomb.h"
 +#include "constr.h"
 +#include "shellfc.h"
 +#include "compute_io.h"
 +#include "mvdata.h"
 +#include "checkpoint.h"
 +#include "mtop_util.h"
 +#include "sighandler.h"
 +#include "membed.h"
 +#include "string2.h"
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#endif
 +#ifdef GMX_THREAD_MPI
 +#include "tmpi.h"
 +#endif
 +
 +#ifdef GMX_FAHCORE
 +#include "corewrap.h"
 +#endif
 +
 +
 +double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[],
 +             const output_env_t oenv, gmx_bool bVerbose,gmx_bool bCompact,
 +             int nstglobalcomm,
 +             gmx_vsite_t *vsite,gmx_constr_t constr,
 +             int stepout,t_inputrec *ir,
 +             gmx_mtop_t *top_global,
 +             t_fcdata *fcd,
 +             t_state *state_global,
 +             t_mdatoms *mdatoms,
 +             t_nrnb *nrnb,gmx_wallcycle_t wcycle,
 +             gmx_edsam_t ed,t_forcerec *fr,
 +             int repl_ex_nst,int repl_ex_seed,gmx_membed_t *membed,
 +             real cpt_period,real max_hours,
 +             const char *deviceOptions,
 +             unsigned long Flags,
 +             gmx_runtime_t *runtime)
 +{
 +    gmx_mdoutf_t *outf;
 +    gmx_large_int_t step,step_rel;
 +    double     run_time;
 +    double     t,t0,lam0;
 +    gmx_bool       bGStatEveryStep,bGStat,bNstEner,bCalcEnerPres;
 +    gmx_bool       bNS,bNStList,bSimAnn,bStopCM,bRerunMD,bNotLastFrame=FALSE,
 +               bFirstStep,bStateFromTPX,bInitStep,bLastStep,
 +               bBornRadii,bStartingFromCpt;
 +    gmx_bool       bDoDHDL=FALSE;
 +    gmx_bool       do_ene,do_log,do_verbose,bRerunWarnNoV=TRUE,
 +               bForceUpdate=FALSE,bCPT;
 +    int        mdof_flags;
 +    gmx_bool       bMasterState;
 +    int        force_flags,cglo_flags;
 +    tensor     force_vir,shake_vir,total_vir,tmp_vir,pres;
 +    int        i,m;
 +    t_trxstatus *status;
 +    rvec       mu_tot;
 +    t_vcm      *vcm;
 +    t_state    *bufstate=NULL;   
 +    matrix     *scale_tot,pcoupl_mu,M,ebox;
 +    gmx_nlheur_t nlh;
 +    t_trxframe rerun_fr;
 +    gmx_repl_ex_t repl_ex=NULL;
 +    int        nchkpt=1;
 +
 +    gmx_localtop_t *top;      
 +    t_mdebin *mdebin=NULL;
 +    t_state    *state=NULL;
 +    rvec       *f_global=NULL;
 +    int        n_xtc=-1;
 +    rvec       *x_xtc=NULL;
 +    gmx_enerdata_t *enerd;
 +    rvec       *f=NULL;
 +    gmx_global_stat_t gstat;
 +    gmx_update_t upd=NULL;
 +    t_graph    *graph=NULL;
 +    globsig_t   gs;
 +
 +    gmx_bool        bFFscan;
 +    gmx_groups_t *groups;
 +    gmx_ekindata_t *ekind, *ekind_save;
 +    gmx_shellfc_t shellfc;
 +    int         count,nconverged=0;
 +    real        timestep=0;
 +    double      tcount=0;
 +    gmx_bool        bIonize=FALSE;
 +    gmx_bool        bTCR=FALSE,bConverged=TRUE,bOK,bSumEkinhOld,bExchanged;
 +    gmx_bool        bAppend;
 +    gmx_bool        bResetCountersHalfMaxH=FALSE;
 +    gmx_bool        bVV,bIterations,bFirstIterate,bTemp,bPres,bTrotter;
-                         cglo_flags &~ CGLO_PRESSURE);
++    real        mu_aver=0,dvdl;
 +    int         a0,a1,gnx=0,ii;
 +    atom_id     *grpindex=NULL;
 +    char        *grpname;
 +    t_coupl_rec *tcr=NULL;
 +    rvec        *xcopy=NULL,*vcopy=NULL,*cbuf=NULL;
 +    matrix      boxcopy={{0}},lastbox;
 +      tensor      tmpvir;
 +      real        fom,oldfom,veta_save,pcurr,scalevir,tracevir;
 +      real        vetanew = 0;
 +    double      cycles;
 +      real        saved_conserved_quantity = 0;
 +    real        last_ekin = 0;
 +      int         iter_i;
 +      t_extmass   MassQ;
 +    int         **trotter_seq; 
 +    char        sbuf[STEPSTRSIZE],sbuf2[STEPSTRSIZE];
 +    int         handled_stop_condition=gmx_stop_cond_none; /* compare to get_stop_condition*/
 +    gmx_iterate_t iterate;
 +    gmx_large_int_t multisim_nsteps=-1; /* number of steps to do  before first multisim 
 +                                          simulation stops. If equal to zero, don't
 +                                          communicate any more between multisims.*/
 +#ifdef GMX_FAHCORE
 +    /* Temporary addition for FAHCORE checkpointing */
 +    int chkpt_ret;
 +#endif
 +
 +    /* Check for special mdrun options */
 +    bRerunMD = (Flags & MD_RERUN);
 +    bIonize  = (Flags & MD_IONIZE);
 +    bFFscan  = (Flags & MD_FFSCAN);
 +    bAppend  = (Flags & MD_APPENDFILES);
 +    if (Flags & MD_RESETCOUNTERSHALFWAY)
 +    {
 +        if (ir->nsteps > 0)
 +        {
 +            /* Signal to reset the counters half the simulation steps. */
 +            wcycle_set_reset_counters(wcycle,ir->nsteps/2);
 +        }
 +        /* Signal to reset the counters halfway the simulation time. */
 +        bResetCountersHalfMaxH = (max_hours > 0);
 +    }
 +
 +    /* md-vv uses averaged full step velocities for T-control 
 +       md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
 +       md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
 +    bVV = EI_VV(ir->eI);
 +    if (bVV) /* to store the initial velocities while computing virial */
 +    {
 +        snew(cbuf,top_global->natoms);
 +    }
 +    /* all the iteratative cases - only if there are constraints */ 
 +    bIterations = ((IR_NPT_TROTTER(ir)) && (constr) && (!bRerunMD));
 +    bTrotter = (bVV && (IR_NPT_TROTTER(ir) || (IR_NVT_TROTTER(ir))));        
 +    
 +    if (bRerunMD)
 +    {
 +        /* Since we don't know if the frames read are related in any way,
 +         * rebuild the neighborlist at every step.
 +         */
 +        ir->nstlist       = 1;
 +        ir->nstcalcenergy = 1;
 +        nstglobalcomm     = 1;
 +    }
 +
 +    check_ir_old_tpx_versions(cr,fplog,ir,top_global);
 +
 +    nstglobalcomm = check_nstglobalcomm(fplog,cr,nstglobalcomm,ir);
 +    bGStatEveryStep = (nstglobalcomm == 1);
 +
 +    if (!bGStatEveryStep && ir->nstlist == -1 && fplog != NULL)
 +    {
 +        fprintf(fplog,
 +                "To reduce the energy communication with nstlist = -1\n"
 +                "the neighbor list validity should not be checked at every step,\n"
 +                "this means that exact integration is not guaranteed.\n"
 +                "The neighbor list validity is checked after:\n"
 +                "  <n.list life time> - 2*std.dev.(n.list life time)  steps.\n"
 +                "In most cases this will result in exact integration.\n"
 +                "This reduces the energy communication by a factor of 2 to 3.\n"
 +                "If you want less energy communication, set nstlist > 3.\n\n");
 +    }
 +
 +    if (bRerunMD || bFFscan)
 +    {
 +        ir->nstxtcout = 0;
 +    }
 +    groups = &top_global->groups;
 +
 +    /* Initial values */
 +    init_md(fplog,cr,ir,oenv,&t,&t0,&state_global->lambda,&lam0,
 +            nrnb,top_global,&upd,
 +            nfile,fnm,&outf,&mdebin,
 +            force_vir,shake_vir,mu_tot,&bSimAnn,&vcm,state_global,Flags);
 +
 +    clear_mat(total_vir);
 +    clear_mat(pres);
 +    /* Energy terms and groups */
 +    snew(enerd,1);
 +    init_enerdata(top_global->groups.grps[egcENER].nr,ir->n_flambda,enerd);
 +    if (DOMAINDECOMP(cr))
 +    {
 +        f = NULL;
 +    }
 +    else
 +    {
 +        snew(f,top_global->natoms);
 +    }
 +
 +    /* Kinetic energy data */
 +    snew(ekind,1);
 +    init_ekindata(fplog,top_global,&(ir->opts),ekind);
 +    /* needed for iteration of constraints */
 +    snew(ekind_save,1);
 +    init_ekindata(fplog,top_global,&(ir->opts),ekind_save);
 +    /* Copy the cos acceleration to the groups struct */    
 +    ekind->cosacc.cos_accel = ir->cos_accel;
 +
 +    gstat = global_stat_init(ir);
 +    debug_gmx();
 +
 +    /* Check for polarizable models and flexible constraints */
 +    shellfc = init_shell_flexcon(fplog,
 +                                 top_global,n_flexible_constraints(constr),
 +                                 (ir->bContinuation || 
 +                                  (DOMAINDECOMP(cr) && !MASTER(cr))) ?
 +                                 NULL : state_global->x);
 +
 +    if (DEFORM(*ir))
 +    {
 +#ifdef GMX_THREAD_MPI
 +        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
 +#endif
 +        set_deform_reference_box(upd,
 +                                 deform_init_init_step_tpx,
 +                                 deform_init_box_tpx);
 +#ifdef GMX_THREAD_MPI
 +        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
 +#endif
 +    }
 +
 +    {
 +        double io = compute_io(ir,top_global->natoms,groups,mdebin->ebin->nener,1);
 +        if ((io > 2000) && MASTER(cr))
 +            fprintf(stderr,
 +                    "\nWARNING: This run will generate roughly %.0f Mb of data\n\n",
 +                    io);
 +    }
 +
 +    if (DOMAINDECOMP(cr)) {
 +        top = dd_init_local_top(top_global);
 +
 +        snew(state,1);
 +        dd_init_local_state(cr->dd,state_global,state);
 +
 +        if (DDMASTER(cr->dd) && ir->nstfout) {
 +            snew(f_global,state_global->natoms);
 +        }
 +    } else {
 +        if (PAR(cr)) {
 +            /* Initialize the particle decomposition and split the topology */
 +            top = split_system(fplog,top_global,ir,cr);
 +
 +            pd_cg_range(cr,&fr->cg0,&fr->hcg);
 +            pd_at_range(cr,&a0,&a1);
 +        } else {
 +            top = gmx_mtop_generate_local_top(top_global,ir);
 +
 +            a0 = 0;
 +            a1 = top_global->natoms;
 +        }
 +
 +        state = partdec_init_local_state(cr,state_global);
 +        f_global = f;
 +
 +        atoms2md(top_global,ir,0,NULL,a0,a1-a0,mdatoms);
 +
 +        if (vsite) {
 +            set_vsite_top(vsite,top,mdatoms,cr);
 +        }
 +
 +        if (ir->ePBC != epbcNONE && !ir->bPeriodicMols) {
 +            graph = mk_graph(fplog,&(top->idef),0,top_global->natoms,FALSE,FALSE);
 +        }
 +
 +        if (shellfc) {
 +            make_local_shells(cr,mdatoms,shellfc);
 +        }
 +
 +        if (ir->pull && PAR(cr)) {
 +            dd_make_local_pull_groups(NULL,ir->pull,mdatoms);
 +        }
 +    }
 +
 +    if (DOMAINDECOMP(cr))
 +    {
 +        /* Distribute the charge groups over the nodes from the master node */
 +        dd_partition_system(fplog,ir->init_step,cr,TRUE,1,
 +                            state_global,top_global,ir,
 +                            state,&f,mdatoms,top,fr,
 +                            vsite,shellfc,constr,
 +                            nrnb,wcycle,FALSE);
 +    }
 +
 +    update_mdatoms(mdatoms,state->lambda);
 +
 +    if (MASTER(cr))
 +    {
 +        if (opt2bSet("-cpi",nfile,fnm))
 +        {
 +            /* Update mdebin with energy history if appending to output files */
 +            if ( Flags & MD_APPENDFILES )
 +            {
 +                restore_energyhistory_from_state(mdebin,&state_global->enerhist);
 +            }
 +            else
 +            {
 +                /* We might have read an energy history from checkpoint,
 +                 * free the allocated memory and reset the counts.
 +                 */
 +                done_energyhistory(&state_global->enerhist);
 +                init_energyhistory(&state_global->enerhist);
 +            }
 +        }
 +        /* Set the initial energy history in state by updating once */
 +        update_energyhistory(&state_global->enerhist,mdebin);
 +    } 
 +
 +    if ((state->flags & (1<<estLD_RNG)) && (Flags & MD_READ_RNG)) {
 +        /* Set the random state if we read a checkpoint file */
 +        set_stochd_state(upd,state);
 +    }
 +
 +    /* Initialize constraints */
 +    if (constr) {
 +        if (!DOMAINDECOMP(cr))
 +            set_constraints(constr,top,ir,mdatoms,cr);
 +    }
 +
 +    /* Check whether we have to GCT stuff */
 +    bTCR = ftp2bSet(efGCT,nfile,fnm);
 +    if (bTCR) {
 +        if (MASTER(cr)) {
 +            fprintf(stderr,"Will do General Coupling Theory!\n");
 +        }
 +        gnx = top_global->mols.nr;
 +        snew(grpindex,gnx);
 +        for(i=0; (i<gnx); i++) {
 +            grpindex[i] = i;
 +        }
 +    }
 +
 +    if (repl_ex_nst > 0)
 +    {
 +        /* We need to be sure replica exchange can only occur
 +         * when the energies are current */
 +        check_nst_param(fplog,cr,"nstcalcenergy",ir->nstcalcenergy,
 +                        "repl_ex_nst",&repl_ex_nst);
 +        /* This check needs to happen before inter-simulation
 +         * signals are initialized, too */
 +    }
 +    if (repl_ex_nst > 0 && MASTER(cr))
 +        repl_ex = init_replica_exchange(fplog,cr->ms,state_global,ir,
 +                                        repl_ex_nst,repl_ex_seed);
 +
 +    if (!ir->bContinuation && !bRerunMD)
 +    {
 +        if (mdatoms->cFREEZE && (state->flags & (1<<estV)))
 +        {
 +            /* Set the velocities of frozen particles to zero */
 +            for(i=mdatoms->start; i<mdatoms->start+mdatoms->homenr; i++)
 +            {
 +                for(m=0; m<DIM; m++)
 +                {
 +                    if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
 +                    {
 +                        state->v[i][m] = 0;
 +                    }
 +                }
 +            }
 +        }
 +
 +        if (constr)
 +        {
 +            /* Constrain the initial coordinates and velocities */
 +            do_constrain_first(fplog,constr,ir,mdatoms,state,f,
 +                               graph,cr,nrnb,fr,top,shake_vir);
 +        }
 +        if (vsite)
 +        {
 +            /* Construct the virtual sites for the initial configuration */
 +            construct_vsites(fplog,vsite,state->x,nrnb,ir->delta_t,NULL,
 +                             top->idef.iparams,top->idef.il,
 +                             fr->ePBC,fr->bMolPBC,graph,cr,state->box);
 +        }
 +    }
 +
 +    debug_gmx();
 +  
 +    /* I'm assuming we need global communication the first time! MRS */
 +    cglo_flags = (CGLO_TEMPERATURE | CGLO_GSTAT
++                  | ((ir->comm_mode != ecmNO) ? CGLO_STOPCM:0)
 +                  | (bVV ? CGLO_PRESSURE:0)
 +                  | (bVV ? CGLO_CONSTRAINT:0)
 +                  | (bRerunMD ? CGLO_RERUNMD:0)
 +                  | ((Flags & MD_READ_EKIN) ? CGLO_READEKIN:0));
 +    
 +    bSumEkinhOld = FALSE;
 +    compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
 +                    NULL,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
 +                    constr,NULL,FALSE,state->box,
 +                    top_global,&pcurr,top_global->natoms,&bSumEkinhOld,cglo_flags);
 +    if (ir->eI == eiVVAK) {
 +        /* a second call to get the half step temperature initialized as well */ 
 +        /* we do the same call as above, but turn the pressure off -- internally to 
 +           compute_globals, this is recognized as a velocity verlet half-step 
 +           kinetic energy calculation.  This minimized excess variables, but 
 +           perhaps loses some logic?*/
 +        
 +        compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
 +                        NULL,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
 +                        constr,NULL,FALSE,state->box,
 +                        top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
-     temp0 = enerd->term[F_TEMP];
++                        cglo_flags &~ (CGLO_STOPCM | CGLO_PRESSURE));
 +    }
 +    
 +    /* Calculate the initial half step temperature, and save the ekinh_old */
 +    if (!(Flags & MD_STARTFROMCPT)) 
 +    {
 +        for(i=0; (i<ir->opts.ngtc); i++) 
 +        {
 +            copy_mat(ekind->tcstat[i].ekinh,ekind->tcstat[i].ekinh_old);
 +        } 
 +    }
 +    if (ir->eI != eiVV) 
 +    {
 +        enerd->term[F_TEMP] *= 2; /* result of averages being done over previous and current step,
 +                                     and there is no previous step */
 +    }
-         fprintf(fplog,"Initial temperature: %g K\n",enerd->term[F_TEMP]);
 +    
 +    /* if using an iterative algorithm, we need to create a working directory for the state. */
 +    if (bIterations) 
 +    {
 +            bufstate = init_bufstate(state);
 +    }
 +    if (bFFscan) 
 +    {
 +        snew(xcopy,state->natoms);
 +        snew(vcopy,state->natoms);
 +        copy_rvecn(state->x,xcopy,0,state->natoms);
 +        copy_rvecn(state->v,vcopy,0,state->natoms);
 +        copy_mat(state->box,boxcopy);
 +    } 
 +    
 +    /* need to make an initiation call to get the Trotter variables set, as well as other constants for non-trotter
 +       temperature control */
 +    trotter_seq = init_npt_vars(ir,state,&MassQ,bTrotter);
 +    
 +    if (MASTER(cr))
 +    {
 +        if (constr && !ir->bContinuation && ir->eConstrAlg == econtLINCS)
 +        {
 +            fprintf(fplog,
 +                    "RMS relative constraint deviation after constraining: %.2e\n",
 +                    constr_rmsd(constr,FALSE));
 +        }
-                       (bStopCM ? CGLO_STOPCM : 0) |
++        if (EI_STATE_VELOCITY(ir->eI))
++        {
++            fprintf(fplog,"Initial temperature: %g K\n",enerd->term[F_TEMP]);
++        }
 +        if (bRerunMD)
 +        {
 +            fprintf(stderr,"starting md rerun '%s', reading coordinates from"
 +                    " input trajectory '%s'\n\n",
 +                    *(top_global->name),opt2fn("-rerun",nfile,fnm));
 +            if (bVerbose)
 +            {
 +                fprintf(stderr,"Calculated time to finish depends on nsteps from "
 +                        "run input file,\nwhich may not correspond to the time "
 +                        "needed to process input trajectory.\n\n");
 +            }
 +        }
 +        else
 +        {
 +            char tbuf[20];
 +            fprintf(stderr,"starting mdrun '%s'\n",
 +                    *(top_global->name));
 +            if (ir->nsteps >= 0)
 +            {
 +                sprintf(tbuf,"%8.1f",(ir->init_step+ir->nsteps)*ir->delta_t);
 +            }
 +            else
 +            {
 +                sprintf(tbuf,"%s","infinite");
 +            }
 +            if (ir->init_step > 0)
 +            {
 +                fprintf(stderr,"%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
 +                        gmx_step_str(ir->init_step+ir->nsteps,sbuf),tbuf,
 +                        gmx_step_str(ir->init_step,sbuf2),
 +                        ir->init_step*ir->delta_t);
 +            }
 +            else
 +            {
 +                fprintf(stderr,"%s steps, %s ps.\n",
 +                        gmx_step_str(ir->nsteps,sbuf),tbuf);
 +            }
 +        }
 +        fprintf(fplog,"\n");
 +    }
 +
 +    /* Set and write start time */
 +    runtime_start(runtime);
 +    print_date_and_time(fplog,cr->nodeid,"Started mdrun",runtime);
 +    wallcycle_start(wcycle,ewcRUN);
 +    if (fplog)
 +        fprintf(fplog,"\n");
 +
 +    /* safest point to do file checkpointing is here.  More general point would be immediately before integrator call */
 +#ifdef GMX_FAHCORE
 +    chkpt_ret=fcCheckPointParallel( cr->nodeid,
 +                                    NULL,0);
 +    if ( chkpt_ret == 0 ) 
 +        gmx_fatal( 3,__FILE__,__LINE__, "Checkpoint error on step %d\n", 0 );
 +#endif
 +
 +    debug_gmx();
 +    /***********************************************************
 +     *
 +     *             Loop over MD steps 
 +     *
 +     ************************************************************/
 +
 +    /* if rerunMD then read coordinates and velocities from input trajectory */
 +    if (bRerunMD)
 +    {
 +        if (getenv("GMX_FORCE_UPDATE"))
 +        {
 +            bForceUpdate = TRUE;
 +        }
 +
 +        rerun_fr.natoms = 0;
 +        if (MASTER(cr))
 +        {
 +            bNotLastFrame = read_first_frame(oenv,&status,
 +                                             opt2fn("-rerun",nfile,fnm),
 +                                             &rerun_fr,TRX_NEED_X | TRX_READ_V);
 +            if (rerun_fr.natoms != top_global->natoms)
 +            {
 +                gmx_fatal(FARGS,
 +                          "Number of atoms in trajectory (%d) does not match the "
 +                          "run input file (%d)\n",
 +                          rerun_fr.natoms,top_global->natoms);
 +            }
 +            if (ir->ePBC != epbcNONE)
 +            {
 +                if (!rerun_fr.bBox)
 +                {
 +                    gmx_fatal(FARGS,"Rerun trajectory frame step %d time %f does not contain a box, while pbc is used",rerun_fr.step,rerun_fr.time);
 +                }
 +                if (max_cutoff2(ir->ePBC,rerun_fr.box) < sqr(fr->rlistlong))
 +                {
 +                    gmx_fatal(FARGS,"Rerun trajectory frame step %d time %f has too small box dimensions",rerun_fr.step,rerun_fr.time);
 +                }
 +            }
 +        }
 +
 +        if (PAR(cr))
 +        {
 +            rerun_parallel_comm(cr,&rerun_fr,&bNotLastFrame);
 +        }
 +
 +        if (ir->ePBC != epbcNONE)
 +        {
 +            /* Set the shift vectors.
 +             * Necessary here when have a static box different from the tpr box.
 +             */
 +            calc_shifts(rerun_fr.box,fr->shift_vec);
 +        }
 +    }
 +
 +    /* loop over MD steps or if rerunMD to end of input trajectory */
 +    bFirstStep = TRUE;
 +    /* Skip the first Nose-Hoover integration when we get the state from tpx */
 +    bStateFromTPX = !opt2bSet("-cpi",nfile,fnm);
 +    bInitStep = bFirstStep && (bStateFromTPX || bVV);
 +    bStartingFromCpt = (Flags & MD_STARTFROMCPT) && bInitStep;
 +    bLastStep    = FALSE;
 +    bSumEkinhOld = FALSE;
 +    bExchanged   = FALSE;
 +
 +    init_global_signals(&gs,cr,ir,repl_ex_nst);
 +
 +    step = ir->init_step;
 +    step_rel = 0;
 +
 +    if (ir->nstlist == -1)
 +    {
 +        init_nlistheuristics(&nlh,bGStatEveryStep,step);
 +    }
 +
 +    if (MULTISIM(cr) && (repl_ex_nst <=0 ))
 +    {
 +        /* check how many steps are left in other sims */
 +        multisim_nsteps=get_multisim_nsteps(cr, ir->nsteps);
 +    }
 +
 +
 +    /* and stop now if we should */
 +    bLastStep = (bRerunMD || (ir->nsteps >= 0 && step_rel > ir->nsteps) ||
 +                 ((multisim_nsteps >= 0) && (step_rel >= multisim_nsteps )));
 +    while (!bLastStep || (bRerunMD && bNotLastFrame)) {
 +
 +        wallcycle_start(wcycle,ewcSTEP);
 +
 +        if (bRerunMD) {
 +            if (rerun_fr.bStep) {
 +                step = rerun_fr.step;
 +                step_rel = step - ir->init_step;
 +            }
 +            if (rerun_fr.bTime) {
 +                t = rerun_fr.time;
 +            }
 +            else
 +            {
 +                t = step;
 +            }
 +        } 
 +        else 
 +        {
 +            bLastStep = (step_rel == ir->nsteps);
 +            t = t0 + step*ir->delta_t;
 +        }
 +
 +        if (ir->efep != efepNO)
 +        {
 +            if (bRerunMD && rerun_fr.bLambda && (ir->delta_lambda!=0))
 +            {
 +                state_global->lambda = rerun_fr.lambda;
 +            }
 +            else
 +            {
 +                state_global->lambda = lam0 + step*ir->delta_lambda;
 +            }
 +            state->lambda = state_global->lambda;
 +            bDoDHDL = do_per_step(step,ir->nstdhdl);
 +        }
 +
 +        if (bSimAnn) 
 +        {
 +            update_annealing_target_temp(&(ir->opts),t);
 +        }
 +
 +        if (bRerunMD)
 +        {
 +            if (!(DOMAINDECOMP(cr) && !MASTER(cr)))
 +            {
 +                for(i=0; i<state_global->natoms; i++)
 +                {
 +                    copy_rvec(rerun_fr.x[i],state_global->x[i]);
 +                }
 +                if (rerun_fr.bV)
 +                {
 +                    for(i=0; i<state_global->natoms; i++)
 +                    {
 +                        copy_rvec(rerun_fr.v[i],state_global->v[i]);
 +                    }
 +                }
 +                else
 +                {
 +                    for(i=0; i<state_global->natoms; i++)
 +                    {
 +                        clear_rvec(state_global->v[i]);
 +                    }
 +                    if (bRerunWarnNoV)
 +                    {
 +                        fprintf(stderr,"\nWARNING: Some frames do not contain velocities.\n"
 +                                "         Ekin, temperature and pressure are incorrect,\n"
 +                                "         the virial will be incorrect when constraints are present.\n"
 +                                "\n");
 +                        bRerunWarnNoV = FALSE;
 +                    }
 +                }
 +            }
 +            copy_mat(rerun_fr.box,state_global->box);
 +            copy_mat(state_global->box,state->box);
 +
 +            if (vsite && (Flags & MD_RERUN_VSITE))
 +            {
 +                if (DOMAINDECOMP(cr))
 +                {
 +                    gmx_fatal(FARGS,"Vsite recalculation with -rerun is not implemented for domain decomposition, use particle decomposition");
 +                }
 +                if (graph)
 +                {
 +                    /* Following is necessary because the graph may get out of sync
 +                     * with the coordinates if we only have every N'th coordinate set
 +                     */
 +                    mk_mshift(fplog,graph,fr->ePBC,state->box,state->x);
 +                    shift_self(graph,state->box,state->x);
 +                }
 +                construct_vsites(fplog,vsite,state->x,nrnb,ir->delta_t,state->v,
 +                                 top->idef.iparams,top->idef.il,
 +                                 fr->ePBC,fr->bMolPBC,graph,cr,state->box);
 +                if (graph)
 +                {
 +                    unshift_self(graph,state->box,state->x);
 +                }
 +            }
 +        }
 +
 +        /* Stop Center of Mass motion */
 +        bStopCM = (ir->comm_mode != ecmNO && do_per_step(step,ir->nstcomm));
 +
 +        /* Copy back starting coordinates in case we're doing a forcefield scan */
 +        if (bFFscan)
 +        {
 +            for(ii=0; (ii<state->natoms); ii++)
 +            {
 +                copy_rvec(xcopy[ii],state->x[ii]);
 +                copy_rvec(vcopy[ii],state->v[ii]);
 +            }
 +            copy_mat(boxcopy,state->box);
 +        }
 +
 +        if (bRerunMD)
 +        {
 +            /* for rerun MD always do Neighbour Searching */
 +            bNS = (bFirstStep || ir->nstlist != 0);
 +            bNStList = bNS;
 +        }
 +        else
 +        {
 +            /* Determine whether or not to do Neighbour Searching and LR */
 +            bNStList = (ir->nstlist > 0  && step % ir->nstlist == 0);
 +            
 +            bNS = (bFirstStep || bExchanged || bNStList ||
 +                   (ir->nstlist == -1 && nlh.nabnsb > 0));
 +
 +            if (bNS && ir->nstlist == -1)
 +            {
 +                set_nlistheuristics(&nlh,bFirstStep || bExchanged,step);
 +            }
 +        } 
 +
 +        /* check whether we should stop because another simulation has 
 +           stopped. */
 +        if (MULTISIM(cr))
 +        {
 +            if ( (multisim_nsteps >= 0) &&  (step_rel >= multisim_nsteps)  &&  
 +                 (multisim_nsteps != ir->nsteps) )  
 +            {
 +                if (bNS)
 +                {
 +                    if (MASTER(cr))
 +                    {
 +                        fprintf(stderr, 
 +                                "Stopping simulation %d because another one has finished\n",
 +                                cr->ms->sim);
 +                    }
 +                    bLastStep=TRUE;
 +                    gs.sig[eglsCHKPT] = 1;
 +                }
 +            }
 +        }
 +
 +        /* < 0 means stop at next step, > 0 means stop at next NS step */
 +        if ( (gs.set[eglsSTOPCOND] < 0 ) ||
 +             ( (gs.set[eglsSTOPCOND] > 0 ) && ( bNS || ir->nstlist==0)) )
 +        {
 +            bLastStep = TRUE;
 +        }
 +
 +        /* Determine whether or not to update the Born radii if doing GB */
 +        bBornRadii=bFirstStep;
 +        if (ir->implicit_solvent && (step % ir->nstgbradii==0))
 +        {
 +            bBornRadii=TRUE;
 +        }
 +        
 +        do_log = do_per_step(step,ir->nstlog) || bFirstStep || bLastStep;
 +        do_verbose = bVerbose &&
 +                  (step % stepout == 0 || bFirstStep || bLastStep);
 +
 +        if (bNS && !(bFirstStep && ir->bContinuation && !bRerunMD))
 +        {
 +            if (bRerunMD)
 +            {
 +                bMasterState = TRUE;
 +            }
 +            else
 +            {
 +                bMasterState = FALSE;
 +                /* Correct the new box if it is too skewed */
 +                if (DYNAMIC_BOX(*ir))
 +                {
 +                    if (correct_box(fplog,step,state->box,graph))
 +                    {
 +                        bMasterState = TRUE;
 +                    }
 +                }
 +                if (DOMAINDECOMP(cr) && bMasterState)
 +                {
 +                    dd_collect_state(cr->dd,state,state_global);
 +                }
 +            }
 +
 +            if (DOMAINDECOMP(cr))
 +            {
 +                /* Repartition the domain decomposition */
 +                wallcycle_start(wcycle,ewcDOMDEC);
 +                dd_partition_system(fplog,step,cr,
 +                                    bMasterState,nstglobalcomm,
 +                                    state_global,top_global,ir,
 +                                    state,&f,mdatoms,top,fr,
 +                                    vsite,shellfc,constr,
 +                                    nrnb,wcycle,do_verbose);
 +                wallcycle_stop(wcycle,ewcDOMDEC);
 +                /* If using an iterative integrator, reallocate space to match the decomposition */
 +            }
 +        }
 +
 +        if (MASTER(cr) && do_log && !bFFscan)
 +        {
 +            print_ebin_header(fplog,step,t,state->lambda);
 +        }
 +
 +        if (ir->efep != efepNO)
 +        {
 +            update_mdatoms(mdatoms,state->lambda); 
 +        }
 +
 +        if (bRerunMD && rerun_fr.bV)
 +        {
 +            
 +            /* We need the kinetic energy at minus the half step for determining
 +             * the full step kinetic energy and possibly for T-coupling.*/
 +            /* This may not be quite working correctly yet . . . . */
 +            compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
 +                            wcycle,enerd,NULL,NULL,NULL,NULL,mu_tot,
 +                            constr,NULL,FALSE,state->box,
 +                            top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
 +                            CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
 +        }
 +        clear_mat(force_vir);
 +        
 +        /* Ionize the atoms if necessary */
 +        if (bIonize)
 +        {
 +            ionize(fplog,oenv,mdatoms,top_global,t,ir,state->x,state->v,
 +                   mdatoms->start,mdatoms->start+mdatoms->homenr,state->box,cr);
 +        }
 +        
 +        /* Update force field in ffscan program */
 +        if (bFFscan)
 +        {
 +            if (update_forcefield(fplog,
 +                                  nfile,fnm,fr,
 +                                  mdatoms->nr,state->x,state->box)) {
 +                if (gmx_parallel_env_initialized())
 +                {
 +                    gmx_finalize();
 +                }
 +                exit(0);
 +            }
 +        }
 +
 +        /* We write a checkpoint at this MD step when:
 +         * either at an NS step when we signalled through gs,
 +         * or at the last step (but not when we do not want confout),
 +         * but never at the first step or with rerun.
 +         */
 +        bCPT = (((gs.set[eglsCHKPT] && (bNS || ir->nstlist == 0)) ||
 +                 (bLastStep && (Flags & MD_CONFOUT))) &&
 +                step > ir->init_step && !bRerunMD);
 +        if (bCPT)
 +        {
 +            gs.set[eglsCHKPT] = 0;
 +        }
 +
 +        /* Determine the energy and pressure:
 +         * at nstcalcenergy steps and at energy output steps (set below).
 +         */
 +        bNstEner = do_per_step(step,ir->nstcalcenergy);
 +        bCalcEnerPres =
 +            (bNstEner ||
 +             (ir->epc != epcNO && do_per_step(step,ir->nstpcouple)));
 +
 +        /* Do we need global communication ? */
 +        bGStat = (bCalcEnerPres || bStopCM ||
 +                  do_per_step(step,nstglobalcomm) ||
 +                  (ir->nstlist == -1 && !bRerunMD && step >= nlh.step_nscheck));
 +
 +        do_ene = (do_per_step(step,ir->nstenergy) || bLastStep);
 +
 +        if (do_ene || do_log)
 +        {
 +            bCalcEnerPres = TRUE;
 +            bGStat        = TRUE;
 +        }
 +        
 +        /* these CGLO_ options remain the same throughout the iteration */
 +        cglo_flags = ((bRerunMD ? CGLO_RERUNMD : 0) |
 +                      (bGStat ? CGLO_GSTAT : 0)
 +            );
 +        
 +        force_flags = (GMX_FORCE_STATECHANGED |
 +                       ((DYNAMIC_BOX(*ir) || bRerunMD) ? GMX_FORCE_DYNAMICBOX : 0) |
 +                       GMX_FORCE_ALLFORCES |
 +                       (bNStList ? GMX_FORCE_DOLR : 0) |
 +                       GMX_FORCE_SEPLRF |
 +                       (bCalcEnerPres ? GMX_FORCE_VIRIAL : 0) |
 +                       (bDoDHDL ? GMX_FORCE_DHDL : 0)
 +            );
 +        
 +        if (shellfc)
 +        {
 +            /* Now is the time to relax the shells */
 +            count=relax_shell_flexcon(fplog,cr,bVerbose,bFFscan ? step+1 : step,
 +                                      ir,bNS,force_flags,
 +                                      bStopCM,top,top_global,
 +                                      constr,enerd,fcd,
 +                                      state,f,force_vir,mdatoms,
 +                                      nrnb,wcycle,graph,groups,
 +                                      shellfc,fr,bBornRadii,t,mu_tot,
 +                                      state->natoms,&bConverged,vsite,
 +                                      outf->fp_field);
 +            tcount+=count;
 +
 +            if (bConverged)
 +            {
 +                nconverged++;
 +            }
 +        }
 +        else
 +        {
 +            /* The coordinates (x) are shifted (to get whole molecules)
 +             * in do_force.
 +             * This is parallellized as well, and does communication too. 
 +             * Check comments in sim_util.c
 +             */
 +        
 +            do_force(fplog,cr,ir,step,nrnb,wcycle,top,top_global,groups,
 +                     state->box,state->x,&state->hist,
 +                     f,force_vir,mdatoms,enerd,fcd,
 +                     state->lambda,graph,
 +                     fr,vsite,mu_tot,t,outf->fp_field,ed,bBornRadii,
 +                     (bNS ? GMX_FORCE_NS : 0) | force_flags);
 +        }
 +        
 +        if (bTCR)
 +        {
 +            mu_aver = calc_mu_aver(cr,state->x,mdatoms->chargeA,
 +                                   mu_tot,&top_global->mols,mdatoms,gnx,grpindex);
 +        }
 +        
 +        if (bTCR && bFirstStep)
 +        {
 +            tcr=init_coupling(fplog,nfile,fnm,cr,fr,mdatoms,&(top->idef));
 +            fprintf(fplog,"Done init_coupling\n"); 
 +            fflush(fplog);
 +        }
 +        
 +        if (bVV && !bStartingFromCpt && !bRerunMD)
 +        /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
 +        {
 +            if (ir->eI==eiVV && bInitStep) 
 +            {
 +                /* if using velocity verlet with full time step Ekin,
 +                 * take the first half step only to compute the 
 +                 * virial for the first step. From there,
 +                 * revert back to the initial coordinates
 +                 * so that the input is actually the initial step.
 +                 */
 +                copy_rvecn(state->v,cbuf,0,state->natoms); /* should make this better for parallelizing? */
 +            } else {
 +                /* this is for NHC in the Ekin(t+dt/2) version of vv */
 +                trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ1);            
 +            }
 +
 +            update_coords(fplog,step,ir,mdatoms,state,
 +                          f,fr->bTwinRange && bNStList,fr->f_twin,fcd,
 +                          ekind,M,wcycle,upd,bInitStep,etrtVELOCITY1,
 +                          cr,nrnb,constr,&top->idef);
 +            
 +            if (bIterations)
 +            {
 +                gmx_iterate_init(&iterate,bIterations && !bInitStep);
 +            }
 +            /* for iterations, we save these vectors, as we will be self-consistently iterating
 +               the calculations */
 +
 +            /*#### UPDATE EXTENDED VARIABLES IN TROTTER FORMULATION */
 +            
 +            /* save the state */
 +            if (bIterations && iterate.bIterate) { 
 +                copy_coupling_state(state,bufstate,ekind,ekind_save,&(ir->opts));
 +            }
 +            
 +            bFirstIterate = TRUE;
 +            while (bFirstIterate || (bIterations && iterate.bIterate))
 +            {
 +                if (bIterations && iterate.bIterate) 
 +                {
 +                    copy_coupling_state(bufstate,state,ekind_save,ekind,&(ir->opts));
 +                    if (bFirstIterate && bTrotter) 
 +                    {
 +                        /* The first time through, we need a decent first estimate
 +                           of veta(t+dt) to compute the constraints.  Do
 +                           this by computing the box volume part of the
 +                           trotter integration at this time. Nothing else
 +                           should be changed by this routine here.  If
 +                           !(first time), we start with the previous value
 +                           of veta.  */
 +                        
 +                        veta_save = state->veta;
 +                        trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ0);
 +                        vetanew = state->veta;
 +                        state->veta = veta_save;
 +                    } 
 +                } 
 +                
 +                bOK = TRUE;
 +                if ( !bRerunMD || rerun_fr.bV || bForceUpdate) {  /* Why is rerun_fr.bV here?  Unclear. */
 +                    dvdl = 0;
 +                    
 +                    update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,state,graph,f,
 +                                       &top->idef,shake_vir,NULL,
 +                                       cr,nrnb,wcycle,upd,constr,
 +                                       bInitStep,TRUE,bCalcEnerPres,vetanew);
 +                    
 +                    if (!bOK && !bFFscan)
 +                    {
 +                        gmx_fatal(FARGS,"Constraint error: Shake, Lincs or Settle could not solve the constrains");
 +                    }
 +                    
 +                } 
 +                else if (graph)
 +                { /* Need to unshift here if a do_force has been
 +                     called in the previous step */
 +                    unshift_self(graph,state->box,state->x);
 +                }
 +                
 +                
 +                /* if VV, compute the pressure and constraints */
 +                /* For VV2, we strictly only need this if using pressure
 +                 * control, but we really would like to have accurate pressures
 +                 * printed out.
 +                 * Think about ways around this in the future?
 +                 * For now, keep this choice in comments.
 +                 */
 +                /*bPres = (ir->eI==eiVV || IR_NPT_TROTTER(ir)); */
 +                    /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && IR_NPT_TROTTER(ir)));*/
 +                bPres = TRUE;
 +                bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK));
 +                compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
 +                                wcycle,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
 +                                constr,NULL,FALSE,state->box,
 +                                top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
 +                                cglo_flags 
 +                                | CGLO_ENERGY 
++                                | (bStopCM ? CGLO_STOPCM : 0)
 +                                | (bTemp ? CGLO_TEMPERATURE:0) 
 +                                | (bPres ? CGLO_PRESSURE : 0) 
 +                                | (bPres ? CGLO_CONSTRAINT : 0)
 +                                | ((bIterations && iterate.bIterate) ? CGLO_ITERATE : 0)  
 +                                | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
 +                                | CGLO_SCALEEKIN 
 +                    );
 +                /* explanation of above: 
 +                   a) We compute Ekin at the full time step
 +                   if 1) we are using the AveVel Ekin, and it's not the
 +                   initial step, or 2) if we are using AveEkin, but need the full
 +                   time step kinetic energy for the pressure (always true now, since we want accurate statistics).
 +                   b) If we are using EkinAveEkin for the kinetic energy for the temperture control, we still feed in 
 +                   EkinAveVel because it's needed for the pressure */
 +                
 +                /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
 +                if (!bInitStep) 
 +                {
 +                    if (bTrotter)
 +                    {
 +                        trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ2);
 +                    } 
 +                    else 
 +                    {
 +                        update_tcouple(fplog,step,ir,state,ekind,wcycle,upd,&MassQ,mdatoms);
 +                    }
 +                }
 +                
 +                if (bIterations &&
 +                    done_iterating(cr,fplog,step,&iterate,bFirstIterate,
 +                                   state->veta,&vetanew)) 
 +                {
 +                    break;
 +                }
 +                bFirstIterate = FALSE;
 +            }
 +
 +            if (bTrotter && !bInitStep) {
 +                copy_mat(shake_vir,state->svir_prev);
 +                copy_mat(force_vir,state->fvir_prev);
 +                if (IR_NVT_TROTTER(ir) && ir->eI==eiVV) {
 +                    /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
 +                    enerd->term[F_TEMP] = sum_ekin(&(ir->opts),ekind,NULL,(ir->eI==eiVV),FALSE,FALSE);
 +                    enerd->term[F_EKIN] = trace(ekind->ekin);
 +                }
 +            }
 +            /* if it's the initial step, we performed this first step just to get the constraint virial */
 +            if (bInitStep && ir->eI==eiVV) {
 +                copy_rvecn(cbuf,state->v,0,state->natoms);
 +            }
 +            
 +            if (fr->bSepDVDL && fplog && do_log) 
 +            {
 +                fprintf(fplog,sepdvdlformat,"Constraint",0.0,dvdl);
 +            }
 +            enerd->term[F_DHDL_CON] += dvdl;
 +        }
 +    
 +        /* MRS -- now done iterating -- compute the conserved quantity */
 +        if (bVV) {
 +            saved_conserved_quantity = compute_conserved_from_auxiliary(ir,state,&MassQ);
 +            if (ir->eI==eiVV) 
 +            {
 +                last_ekin = enerd->term[F_EKIN]; /* does this get preserved through checkpointing? */
 +            }
 +            if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres)) 
 +            {
 +                saved_conserved_quantity -= enerd->term[F_DISPCORR];
 +            }
 +        }
 +        
 +        /* ########  END FIRST UPDATE STEP  ############## */
 +        /* ########  If doing VV, we now have v(dt) ###### */
 +        
 +        /* ################## START TRAJECTORY OUTPUT ################# */
 +        
 +        /* Now we have the energies and forces corresponding to the 
 +         * coordinates at time t. We must output all of this before
 +         * the update.
 +         * for RerunMD t is read from input trajectory
 +         */
 +        mdof_flags = 0;
 +        if (do_per_step(step,ir->nstxout)) { mdof_flags |= MDOF_X; }
 +        if (do_per_step(step,ir->nstvout)) { mdof_flags |= MDOF_V; }
 +        if (do_per_step(step,ir->nstfout)) { mdof_flags |= MDOF_F; }
 +        if (do_per_step(step,ir->nstxtcout)) { mdof_flags |= MDOF_XTC; }
 +        if (bCPT) { mdof_flags |= MDOF_CPT; };
 +
 +#if defined(GMX_FAHCORE) || defined(GMX_WRITELASTSTEP)
 +        if (bLastStep)
 +        {
 +            /* Enforce writing positions and velocities at end of run */
 +            mdof_flags |= (MDOF_X | MDOF_V);
 +        }
 +#endif
 +#ifdef GMX_FAHCORE
 +        if (MASTER(cr))
 +            fcReportProgress( ir->nsteps, step );
 +
 +        /* sync bCPT and fc record-keeping */
 +        if (bCPT && MASTER(cr))
 +            fcRequestCheckPoint();
 +#endif
 +        
 +        if (mdof_flags != 0)
 +        {
 +            wallcycle_start(wcycle,ewcTRAJ);
 +            if (bCPT)
 +            {
 +                if (state->flags & (1<<estLD_RNG))
 +                {
 +                    get_stochd_state(upd,state);
 +                }
 +                if (MASTER(cr))
 +                {
 +                    if (bSumEkinhOld)
 +                    {
 +                        state_global->ekinstate.bUpToDate = FALSE;
 +                    }
 +                    else
 +                    {
 +                        update_ekinstate(&state_global->ekinstate,ekind);
 +                        state_global->ekinstate.bUpToDate = TRUE;
 +                    }
 +                    update_energyhistory(&state_global->enerhist,mdebin);
 +                }
 +            }
 +            write_traj(fplog,cr,outf,mdof_flags,top_global,
 +                       step,t,state,state_global,f,f_global,&n_xtc,&x_xtc);
 +            if (bCPT)
 +            {
 +                nchkpt++;
 +                bCPT = FALSE;
 +            }
 +            debug_gmx();
 +            if (bLastStep && step_rel == ir->nsteps &&
 +                (Flags & MD_CONFOUT) && MASTER(cr) &&
 +                !bRerunMD && !bFFscan)
 +            {
 +                /* x and v have been collected in write_traj,
 +                 * because a checkpoint file will always be written
 +                 * at the last step.
 +                 */
 +                fprintf(stderr,"\nWriting final coordinates.\n");
 +                if (ir->ePBC != epbcNONE && !ir->bPeriodicMols &&
 +                    DOMAINDECOMP(cr))
 +                {
 +                    /* Make molecules whole only for confout writing */
 +                    do_pbc_mtop(fplog,ir->ePBC,state->box,top_global,state_global->x);
 +                }
 +                write_sto_conf_mtop(ftp2fn(efSTO,nfile,fnm),
 +                                    *top_global->name,top_global,
 +                                    state_global->x,state_global->v,
 +                                    ir->ePBC,state->box);
 +                debug_gmx();
 +            }
 +            wallcycle_stop(wcycle,ewcTRAJ);
 +        }
 +        
 +        /* kludge -- virial is lost with restart for NPT control. Must restart */
 +        if (bStartingFromCpt && bVV) 
 +        {
 +            copy_mat(state->svir_prev,shake_vir);
 +            copy_mat(state->fvir_prev,force_vir);
 +        }
 +        /*  ################## END TRAJECTORY OUTPUT ################ */
 +        
 +        /* Determine the wallclock run time up till now */
 +        run_time = gmx_gettime() - (double)runtime->real;
 +
 +        /* Check whether everything is still allright */    
 +        if (((int)gmx_get_stop_condition() > handled_stop_condition)
 +#ifdef GMX_THREAD_MPI
 +            && MASTER(cr)
 +#endif
 +            )
 +        {
 +            /* this is just make gs.sig compatible with the hack 
 +               of sending signals around by MPI_Reduce with together with
 +               other floats */
 +            if ( gmx_get_stop_condition() == gmx_stop_cond_next_ns )
 +                gs.sig[eglsSTOPCOND]=1;
 +            if ( gmx_get_stop_condition() == gmx_stop_cond_next )
 +                gs.sig[eglsSTOPCOND]=-1;
 +            /* < 0 means stop at next step, > 0 means stop at next NS step */
 +            if (fplog)
 +            {
 +                fprintf(fplog,
 +                        "\n\nReceived the %s signal, stopping at the next %sstep\n\n",
 +                        gmx_get_signal_name(),
 +                        gs.sig[eglsSTOPCOND]==1 ? "NS " : "");
 +                fflush(fplog);
 +            }
 +            fprintf(stderr,
 +                    "\n\nReceived the %s signal, stopping at the next %sstep\n\n",
 +                    gmx_get_signal_name(),
 +                    gs.sig[eglsSTOPCOND]==1 ? "NS " : "");
 +            fflush(stderr);
 +            handled_stop_condition=(int)gmx_get_stop_condition();
 +        }
 +        else if (MASTER(cr) && (bNS || ir->nstlist <= 0) &&
 +                 (max_hours > 0 && run_time > max_hours*60.0*60.0*0.99) &&
 +                 gs.sig[eglsSTOPCOND] == 0 && gs.set[eglsSTOPCOND] == 0)
 +        {
 +            /* Signal to terminate the run */
 +            gs.sig[eglsSTOPCOND] = 1;
 +            if (fplog)
 +            {
 +                fprintf(fplog,"\nStep %s: Run time exceeded %.3f hours, will terminate the run\n",gmx_step_str(step,sbuf),max_hours*0.99);
 +            }
 +            fprintf(stderr, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n",gmx_step_str(step,sbuf),max_hours*0.99);
 +        }
 +
 +        if (bResetCountersHalfMaxH && MASTER(cr) &&
 +            run_time > max_hours*60.0*60.0*0.495)
 +        {
 +            gs.sig[eglsRESETCOUNTERS] = 1;
 +        }
 +
 +        if (ir->nstlist == -1 && !bRerunMD)
 +        {
 +            /* When bGStatEveryStep=FALSE, global_stat is only called
 +             * when we check the atom displacements, not at NS steps.
 +             * This means that also the bonded interaction count check is not
 +             * performed immediately after NS. Therefore a few MD steps could
 +             * be performed with missing interactions.
 +             * But wrong energies are never written to file,
 +             * since energies are only written after global_stat
 +             * has been called.
 +             */
 +            if (step >= nlh.step_nscheck)
 +            {
 +                nlh.nabnsb = natoms_beyond_ns_buffer(ir,fr,&top->cgs,
 +                                                     nlh.scale_tot,state->x);
 +            }
 +            else
 +            {
 +                /* This is not necessarily true,
 +                 * but step_nscheck is determined quite conservatively.
 +                 */
 +                nlh.nabnsb = 0;
 +            }
 +        }
 +
 +        /* In parallel we only have to check for checkpointing in steps
 +         * where we do global communication,
 +         *  otherwise the other nodes don't know.
 +         */
 +        if (MASTER(cr) && ((bGStat || !PAR(cr)) &&
 +                           cpt_period >= 0 &&
 +                           (cpt_period == 0 || 
 +                            run_time >= nchkpt*cpt_period*60.0)) &&
 +            gs.set[eglsCHKPT] == 0)
 +        {
 +            gs.sig[eglsCHKPT] = 1;
 +        }
 +  
 +        if (bIterations)
 +        {
 +            gmx_iterate_init(&iterate,bIterations);
 +        }
 +    
 +        /* for iterations, we save these vectors, as we will be redoing the calculations */
 +        if (bIterations && iterate.bIterate) 
 +        {
 +            copy_coupling_state(state,bufstate,ekind,ekind_save,&(ir->opts));
 +        }
 +        bFirstIterate = TRUE;
 +        while (bFirstIterate || (bIterations && iterate.bIterate))
 +        {
 +            /* We now restore these vectors to redo the calculation with improved extended variables */    
 +            if (bIterations) 
 +            { 
 +                copy_coupling_state(bufstate,state,ekind_save,ekind,&(ir->opts));
 +            }
 +
 +            /* We make the decision to break or not -after- the calculation of Ekin and Pressure,
 +               so scroll down for that logic */
 +            
 +            /* #########   START SECOND UPDATE STEP ################# */
 +            /* Box is changed in update() when we do pressure coupling,
 +             * but we should still use the old box for energy corrections and when
 +             * writing it to the energy file, so it matches the trajectory files for
 +             * the same timestep above. Make a copy in a separate array.
 +             */
 +            copy_mat(state->box,lastbox);
 +
 +            bOK = TRUE;
 +            if (!(bRerunMD && !rerun_fr.bV && !bForceUpdate))
 +            {
 +                wallcycle_start(wcycle,ewcUPDATE);
 +                dvdl = 0;
 +                /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
 +                if (bTrotter) 
 +                {
 +                    if (bIterations && iterate.bIterate) 
 +                    {
 +                        if (bFirstIterate) 
 +                        {
 +                            scalevir = 1;
 +                        }
 +                        else 
 +                        {
 +                            /* we use a new value of scalevir to converge the iterations faster */
 +                            scalevir = tracevir/trace(shake_vir);
 +                        }
 +                        msmul(shake_vir,scalevir,shake_vir); 
 +                        m_add(force_vir,shake_vir,total_vir);
 +                        clear_mat(shake_vir);
 +                    }
 +                    trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ3);
 +                /* We can only do Berendsen coupling after we have summed
 +                 * the kinetic energy or virial. Since the happens
 +                 * in global_state after update, we should only do it at
 +                 * step % nstlist = 1 with bGStatEveryStep=FALSE.
 +                 */
 +                }
 +                else 
 +                {
 +                    update_tcouple(fplog,step,ir,state,ekind,wcycle,upd,&MassQ,mdatoms);
 +                    update_pcouple(fplog,step,ir,state,pcoupl_mu,M,wcycle,
 +                                   upd,bInitStep);
 +                }
 +
 +                if (bVV)
 +                {
 +                    /* velocity half-step update */
 +                    update_coords(fplog,step,ir,mdatoms,state,f,
 +                                  fr->bTwinRange && bNStList,fr->f_twin,fcd,
 +                                  ekind,M,wcycle,upd,FALSE,etrtVELOCITY2,
 +                                  cr,nrnb,constr,&top->idef);
 +                }
 +
 +                /* Above, initialize just copies ekinh into ekin,
 +                 * it doesn't copy position (for VV),
 +                 * and entire integrator for MD.
 +                 */
 +                
 +                if (ir->eI==eiVVAK) 
 +                {
 +                    copy_rvecn(state->x,cbuf,0,state->natoms);
 +                }
 +                
 +                update_coords(fplog,step,ir,mdatoms,state,f,fr->bTwinRange && bNStList,fr->f_twin,fcd,
 +                              ekind,M,wcycle,upd,bInitStep,etrtPOSITION,cr,nrnb,constr,&top->idef);
 +                wallcycle_stop(wcycle,ewcUPDATE);
 +
 +                update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,state,graph,f,
 +                                   &top->idef,shake_vir,force_vir,
 +                                   cr,nrnb,wcycle,upd,constr,
 +                                   bInitStep,FALSE,bCalcEnerPres,state->veta);  
 +                
 +                if (ir->eI==eiVVAK) 
 +                {
 +                    /* erase F_EKIN and F_TEMP here? */
 +                    /* just compute the kinetic energy at the half step to perform a trotter step */
 +                    compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
 +                                    wcycle,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
 +                                    constr,NULL,FALSE,lastbox,
 +                                    top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
 +                                    cglo_flags | CGLO_TEMPERATURE    
 +                        );
 +                    wallcycle_start(wcycle,ewcUPDATE);
 +                    trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ4);            
 +                    /* now we know the scaling, we can compute the positions again again */
 +                    copy_rvecn(cbuf,state->x,0,state->natoms);
 +
 +                    update_coords(fplog,step,ir,mdatoms,state,f,fr->bTwinRange && bNStList,fr->f_twin,fcd,
 +                                  ekind,M,wcycle,upd,bInitStep,etrtPOSITION,cr,nrnb,constr,&top->idef);
 +                    wallcycle_stop(wcycle,ewcUPDATE);
 +
 +                    /* do we need an extra constraint here? just need to copy out of state->v to upd->xp? */
 +                    /* are the small terms in the shake_vir here due
 +                     * to numerical errors, or are they important
 +                     * physically? I'm thinking they are just errors, but not completely sure. 
 +                     * For now, will call without actually constraining, constr=NULL*/
 +                    update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,state,graph,f,
 +                                       &top->idef,tmp_vir,force_vir,
 +                                       cr,nrnb,wcycle,upd,NULL,
 +                                       bInitStep,FALSE,bCalcEnerPres,
 +                                       state->veta);  
 +                }
 +                if (!bOK && !bFFscan) 
 +                {
 +                    gmx_fatal(FARGS,"Constraint error: Shake, Lincs or Settle could not solve the constrains");
 +                }
 +                
 +                if (fr->bSepDVDL && fplog && do_log) 
 +                {
 +                    fprintf(fplog,sepdvdlformat,"Constraint",0.0,dvdl);
 +                }
 +                enerd->term[F_DHDL_CON] += dvdl;
 +            } 
 +            else if (graph) 
 +            {
 +                /* Need to unshift here */
 +                unshift_self(graph,state->box,state->x);
 +            }
 +
 +            if (vsite != NULL) 
 +            {
 +                wallcycle_start(wcycle,ewcVSITECONSTR);
 +                if (graph != NULL) 
 +                {
 +                    shift_self(graph,state->box,state->x);
 +                }
 +                construct_vsites(fplog,vsite,state->x,nrnb,ir->delta_t,state->v,
 +                                 top->idef.iparams,top->idef.il,
 +                                 fr->ePBC,fr->bMolPBC,graph,cr,state->box);
 +                
 +                if (graph != NULL) 
 +                {
 +                    unshift_self(graph,state->box,state->x);
 +                }
 +                wallcycle_stop(wcycle,ewcVSITECONSTR);
 +            }
 +            
 +            /* ############## IF NOT VV, Calculate globals HERE, also iterate constraints ############ */
 +            if (ir->nstlist == -1 && bFirstIterate)
 +            {
 +                gs.sig[eglsNABNSB] = nlh.nabnsb;
 +            }
 +            compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
 +                            wcycle,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
 +                            constr,
 +                            bFirstIterate ? &gs : NULL, 
 +                            (step_rel % gs.nstms == 0) && 
 +                                (multisim_nsteps<0 || (step_rel<multisim_nsteps)),
 +                            lastbox,
 +                            top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
 +                            cglo_flags 
 +                            | (!EI_VV(ir->eI) ? CGLO_ENERGY : 0) 
++                            | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
 +                            | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0) 
 +                            | (!EI_VV(ir->eI) || bRerunMD ? CGLO_PRESSURE : 0) 
 +                            | (bIterations && iterate.bIterate ? CGLO_ITERATE : 0) 
 +                            | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
 +                            | CGLO_CONSTRAINT 
 +                );
 +            if (ir->nstlist == -1 && bFirstIterate)
 +            {
 +                nlh.nabnsb = gs.set[eglsNABNSB];
 +                gs.set[eglsNABNSB] = 0;
 +            }
 +            /* bIterate is set to keep it from eliminating the old ekin kinetic energy terms */
 +            /* #############  END CALC EKIN AND PRESSURE ################# */
 +        
 +            /* Note: this is OK, but there are some numerical precision issues with using the convergence of
 +               the virial that should probably be addressed eventually. state->veta has better properies,
 +               but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
 +               generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
 +
 +            if (bIterations && 
 +                done_iterating(cr,fplog,step,&iterate,bFirstIterate,
 +                               trace(shake_vir),&tracevir)) 
 +            {
 +                break;
 +            }
 +            bFirstIterate = FALSE;
 +        }
 +
 +        update_box(fplog,step,ir,mdatoms,state,graph,f,
 +                   ir->nstlist==-1 ? &nlh.scale_tot : NULL,pcoupl_mu,nrnb,wcycle,upd,bInitStep,FALSE);
 +        
 +        /* ################# END UPDATE STEP 2 ################# */
 +        /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
 +    
 +        /* The coordinates (x) were unshifted in update */
 +        if (bFFscan && (shellfc==NULL || bConverged))
 +        {
 +            if (print_forcefield(fplog,enerd->term,mdatoms->homenr,
 +                                 f,NULL,xcopy,
 +                                 &(top_global->mols),mdatoms->massT,pres))
 +            {
 +                if (gmx_parallel_env_initialized())
 +                {
 +                    gmx_finalize();
 +                }
 +                fprintf(stderr,"\n");
 +                exit(0);
 +            }
 +        }
 +        if (!bGStat)
 +        {
 +            /* We will not sum ekinh_old,                                                            
 +             * so signal that we still have to do it.                                                
 +             */
 +            bSumEkinhOld = TRUE;
 +        }
 +        
 +        if (bTCR)
 +        {
 +            /* Only do GCT when the relaxation of shells (minimization) has converged,
 +             * otherwise we might be coupling to bogus energies. 
 +             * In parallel we must always do this, because the other sims might
 +             * update the FF.
 +             */
 +
 +            /* Since this is called with the new coordinates state->x, I assume
 +             * we want the new box state->box too. / EL 20040121
 +             */
 +            do_coupling(fplog,oenv,nfile,fnm,tcr,t,step,enerd->term,fr,
 +                        ir,MASTER(cr),
 +                        mdatoms,&(top->idef),mu_aver,
 +                        top_global->mols.nr,cr,
 +                        state->box,total_vir,pres,
 +                        mu_tot,state->x,f,bConverged);
 +            debug_gmx();
 +        }
 +
 +        /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
 +        
 +        /* sum up the foreign energy and dhdl terms */
 +        sum_dhdl(enerd,state->lambda,ir);
 +
 +        /* use the directly determined last velocity, not actually the averaged half steps */
 +        if (bTrotter && ir->eI==eiVV) 
 +        {
 +            enerd->term[F_EKIN] = last_ekin;
 +        }
 +        enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
 +        
 +        if (bVV)
 +        {
 +            enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
 +        }
 +        else 
 +        {
 +            enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + compute_conserved_from_auxiliary(ir,state,&MassQ);
 +        }
 +        /* Check for excessively large energies */
 +        if (bIonize) 
 +        {
 +#ifdef GMX_DOUBLE
 +            real etot_max = 1e200;
 +#else
 +            real etot_max = 1e30;
 +#endif
 +            if (fabs(enerd->term[F_ETOT]) > etot_max) 
 +            {
 +                fprintf(stderr,"Energy too large (%g), giving up\n",
 +                        enerd->term[F_ETOT]);
 +            }
 +        }
 +        /* #########  END PREPARING EDR OUTPUT  ###########  */
 +        
 +        /* Time for performance */
 +        if (((step % stepout) == 0) || bLastStep) 
 +        {
 +            runtime_upd_proc(runtime);
 +        }
 +        
 +        /* Output stuff */
 +        if (MASTER(cr))
 +        {
 +            gmx_bool do_dr,do_or;
 +            
 +            if (!(bStartingFromCpt && (EI_VV(ir->eI)))) 
 +            {
 +                if (bNstEner)
 +                {
 +                    upd_mdebin(mdebin,bDoDHDL, TRUE,
 +                               t,mdatoms->tmass,enerd,state,lastbox,
 +                               shake_vir,force_vir,total_vir,pres,
 +                               ekind,mu_tot,constr);
 +                }
 +                else
 +                {
 +                    upd_mdebin_step(mdebin);
 +                }
 +                
 +                do_dr  = do_per_step(step,ir->nstdisreout);
 +                do_or  = do_per_step(step,ir->nstorireout);
 +                
 +                print_ebin(outf->fp_ene,do_ene,do_dr,do_or,do_log?fplog:NULL,
 +                           step,t,
 +                           eprNORMAL,bCompact,mdebin,fcd,groups,&(ir->opts));
 +            }
 +            if (ir->ePull != epullNO)
 +            {
 +                pull_print_output(ir->pull,step,t);
 +            }
 +            
 +            if (do_per_step(step,ir->nstlog))
 +            {
 +                if(fflush(fplog) != 0)
 +                {
 +                    gmx_fatal(FARGS,"Cannot flush logfile - maybe you are out of disk space?");
 +                }
 +            }
 +        }
 +
 +
 +        /* Remaining runtime */
 +        if (MULTIMASTER(cr) && (do_verbose || gmx_got_usr_signal() ))
 +        {
 +            if (shellfc) 
 +            {
 +                fprintf(stderr,"\n");
 +            }
 +            print_time(stderr,runtime,step,ir,cr);
 +        }
 +
 +        /* Replica exchange */
 +        bExchanged = FALSE;
 +        if ((repl_ex_nst > 0) && (step > 0) && !bLastStep &&
 +            do_per_step(step,repl_ex_nst)) 
 +        {
 +            bExchanged = replica_exchange(fplog,cr,repl_ex,
 +                                          state_global,enerd->term,
 +                                          state,step,t);
 +
 +            if (bExchanged && DOMAINDECOMP(cr)) 
 +            {
 +                dd_partition_system(fplog,step,cr,TRUE,1,
 +                                    state_global,top_global,ir,
 +                                    state,&f,mdatoms,top,fr,
 +                                    vsite,shellfc,constr,
 +                                    nrnb,wcycle,FALSE);
 +            }
 +        }
 +        
 +        bFirstStep = FALSE;
 +        bInitStep = FALSE;
 +        bStartingFromCpt = FALSE;
 +
 +        /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
 +        /* With all integrators, except VV, we need to retain the pressure
 +         * at the current step for coupling at the next step.
 +         */
 +        if ((state->flags & (1<<estPRES_PREV)) &&
 +            (bGStatEveryStep ||
 +             (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
 +        {
 +            /* Store the pressure in t_state for pressure coupling
 +             * at the next MD step.
 +             */
 +            copy_mat(pres,state->pres_prev);
 +        }
 +        
 +        /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
 +
 +        if ( (membed!=NULL) && (!bLastStep) )
 +            rescale_membed(step_rel,membed,state_global->x);
 +        
 +        if (bRerunMD) 
 +        {
 +            if (MASTER(cr))
 +            {
 +                /* read next frame from input trajectory */
 +                bNotLastFrame = read_next_frame(oenv,status,&rerun_fr);
 +            }
 +
 +            if (PAR(cr))
 +            {
 +                rerun_parallel_comm(cr,&rerun_fr,&bNotLastFrame);
 +            }
 +        }
 +        
 +        if (!bRerunMD || !rerun_fr.bStep)
 +        {
 +            /* increase the MD step number */
 +            step++;
 +            step_rel++;
 +        }
 +        
 +        cycles = wallcycle_stop(wcycle,ewcSTEP);
 +        if (DOMAINDECOMP(cr) && wcycle)
 +        {
 +            dd_cycles_add(cr->dd,cycles,ddCyclStep);
 +        }
 +        
 +        if (step_rel == wcycle_get_reset_counters(wcycle) ||
 +            gs.set[eglsRESETCOUNTERS] != 0)
 +        {
 +            /* Reset all the counters related to performance over the run */
 +            reset_all_counters(fplog,cr,step,&step_rel,ir,wcycle,nrnb,runtime);
 +            wcycle_set_reset_counters(wcycle,-1);
 +            /* Correct max_hours for the elapsed time */
 +            max_hours -= run_time/(60.0*60.0);
 +            bResetCountersHalfMaxH = FALSE;
 +            gs.set[eglsRESETCOUNTERS] = 0;
 +        }
 +
 +    }
 +    /* End of main MD loop */
 +    debug_gmx();
 +    
 +    /* Stop the time */
 +    runtime_end(runtime);
 +    
 +    if (bRerunMD && MASTER(cr))
 +    {
 +        close_trj(status);
 +    }
 +    
 +    if (!(cr->duty & DUTY_PME))
 +    {
 +        /* Tell the PME only node to finish */
 +        gmx_pme_finish(cr);
 +    }
 +    
 +    if (MASTER(cr))
 +    {
 +        if (ir->nstcalcenergy > 0 && !bRerunMD) 
 +        {
 +            print_ebin(outf->fp_ene,FALSE,FALSE,FALSE,fplog,step,t,
 +                       eprAVER,FALSE,mdebin,fcd,groups,&(ir->opts));
 +        }
 +    }
 +
 +    done_mdoutf(outf);
 +
 +    debug_gmx();
 +
 +    if (ir->nstlist == -1 && nlh.nns > 0 && fplog)
 +    {
 +        fprintf(fplog,"Average neighborlist lifetime: %.1f steps, std.dev.: %.1f steps\n",nlh.s1/nlh.nns,sqrt(nlh.s2/nlh.nns - sqr(nlh.s1/nlh.nns)));
 +        fprintf(fplog,"Average number of atoms that crossed the half buffer length: %.1f\n\n",nlh.ab/nlh.nns);
 +    }
 +    
 +    if (shellfc && fplog)
 +    {
 +        fprintf(fplog,"Fraction of iterations that converged:           %.2f %%\n",
 +                (nconverged*100.0)/step_rel);
 +        fprintf(fplog,"Average number of force evaluations per MD step: %.2f\n\n",
 +                tcount/step_rel);
 +    }
 +    
 +    if (repl_ex_nst > 0 && MASTER(cr))
 +    {
 +        print_replica_exchange_statistics(fplog,repl_ex);
 +    }
 +    
 +    runtime->nsteps_done = step_rel;
 +    
 +    return 0;
 +}
index 564ada37b1a59b34b7d80f2f3dcbba4202c097b1,0000000000000000000000000000000000000000..01d2e745249b45731f987766e91f7110f001b0a4
mode 100644,000000..100644
--- /dev/null
@@@ -1,578 -1,0 +1,572 @@@
- #if ((defined WIN32 || defined _WIN32 || defined WIN64 || defined _WIN64) && !defined __CYGWIN__ && !defined __CYGWIN32__)
- /* _isnan() */
- #include <float.h>
- #endif
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2010, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include <signal.h>
 +#include <stdlib.h>
 +
- #include "pppm.h"
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "sysstuff.h"
 +#include "vec.h"
 +#include "statutil.h"
 +#include "vcm.h"
 +#include "mdebin.h"
 +#include "nrnb.h"
 +#include "calcmu.h"
 +#include "index.h"
 +#include "vsite.h"
 +#include "update.h"
 +#include "ns.h"
 +#include "trnio.h"
 +#include "xtcio.h"
 +#include "mdrun.h"
 +#include "confio.h"
 +#include "network.h"
 +#include "pull.h"
 +#include "xvgr.h"
 +#include "physics.h"
 +#include "names.h"
 +#include "xmdrun.h"
 +#include "ionize.h"
 +#include "disre.h"
 +#include "orires.h"
 +#include "dihre.h"
 +#include "pme.h"
 +#include "mdatoms.h"
 +#include "qmmm.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "topsort.h"
 +#include "coulomb.h"
 +#include "constr.h"
 +#include "compute_io.h"
 +#include "mvdata.h"
 +#include "checkpoint.h"
 +#include "mtop_util.h"
 +#include "sighandler.h"
 +#include "genborn.h"
 +#include "string2.h"
 +#include "copyrite.h"
 +#include "membed.h"
 +
 +#ifdef GMX_THREAD_MPI
 +#include "tmpi.h"
 +#endif
 +
 +/* include even when OpenMM not used to force compilation of do_md_openmm */
 +#include "openmm_wrapper.h"
 +
 +double do_md_openmm(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[],
 +                    const output_env_t oenv, gmx_bool bVerbose,gmx_bool bCompact,
 +                    int nstglobalcomm,
 +                    gmx_vsite_t *vsite,gmx_constr_t constr,
 +                    int stepout,t_inputrec *ir,
 +                    gmx_mtop_t *top_global,
 +                    t_fcdata *fcd,
 +                    t_state *state_global,
 +                    t_mdatoms *mdatoms,
 +                    t_nrnb *nrnb,gmx_wallcycle_t wcycle,
 +                    gmx_edsam_t ed,t_forcerec *fr,
 +                    int repl_ex_nst,int repl_ex_seed,
 +                    gmx_membed_t *membed,
 +                    real cpt_period,real max_hours,
 +                    const char *deviceOptions,
 +                    unsigned long Flags,
 +                    gmx_runtime_t *runtime)
 +{
 +    gmx_mdoutf_t *outf;
 +    gmx_large_int_t step,step_rel;
 +    double     run_time;
 +    double     t,t0,lam0;
 +    gmx_bool       bSimAnn,
 +    bFirstStep,bStateFromTPX,bLastStep,bStartingFromCpt;
 +    gmx_bool       bInitStep=TRUE;
 +    gmx_bool       do_ene,do_log, do_verbose,
 +    bX,bV,bF,bCPT;
 +    tensor     force_vir,shake_vir,total_vir,pres;
 +    int        i,m;
 +    int        mdof_flags;
 +    rvec       mu_tot;
 +    t_vcm      *vcm;
 +    int        nchkpt=1;
 +    gmx_localtop_t *top;
 +    t_mdebin *mdebin;
 +    t_state    *state=NULL;
 +    rvec       *f_global=NULL;
 +    int        n_xtc=-1;
 +    rvec       *x_xtc=NULL;
 +    gmx_enerdata_t *enerd;
 +    rvec       *f=NULL;
 +    gmx_global_stat_t gstat;
 +    gmx_update_t upd=NULL;
 +    t_graph    *graph=NULL;
 +    globsig_t   gs;
 +
 +    gmx_groups_t *groups;
 +    gmx_ekindata_t *ekind, *ekind_save;
 +    gmx_bool        bAppend;
 +    int         a0,a1;
 +    matrix      lastbox;
 +    real        reset_counters=0,reset_counters_now=0;
 +    char        sbuf[STEPSTRSIZE],sbuf2[STEPSTRSIZE];
 +    int         handled_stop_condition=gmx_stop_cond_none; 
 +
 +    const char *ommOptions = NULL;
 +    void   *openmmData;
 +
 +#ifdef GMX_DOUBLE
 +    /* Checks in cmake should prevent the compilation in double precision
 +     * with OpenMM, but just to be sure we check here.
 +     */
 +    gmx_fatal(FARGS,"Compilation was performed in double precision, but OpenMM only supports single precision. If you want to use to OpenMM, compile in single precision.");
 +#endif
 +
 +    bAppend  = (Flags & MD_APPENDFILES);
 +    check_ir_old_tpx_versions(cr,fplog,ir,top_global);
 +
 +    groups = &top_global->groups;
 +
 +    /* Initial values */
 +    init_md(fplog,cr,ir,oenv,&t,&t0,&state_global->lambda,&lam0,
 +            nrnb,top_global,&upd,
 +            nfile,fnm,&outf,&mdebin,
 +            force_vir,shake_vir,mu_tot,&bSimAnn,&vcm,state_global,Flags);
 +
 +    clear_mat(total_vir);
 +    clear_mat(pres);
 +    /* Energy terms and groups */
 +    snew(enerd,1);
 +    init_enerdata(top_global->groups.grps[egcENER].nr,ir->n_flambda,enerd);
 +    snew(f,top_global->natoms);
 +
 +    /* Kinetic energy data */
 +    snew(ekind,1);
 +    init_ekindata(fplog,top_global,&(ir->opts),ekind);
 +    /* needed for iteration of constraints */
 +    snew(ekind_save,1);
 +    init_ekindata(fplog,top_global,&(ir->opts),ekind_save);
 +    /* Copy the cos acceleration to the groups struct */
 +    ekind->cosacc.cos_accel = ir->cos_accel;
 +
 +    gstat = global_stat_init(ir);
 +    debug_gmx();
 +
 +    {
 +        double io = compute_io(ir,top_global->natoms,groups,mdebin->ebin->nener,1);
 +        if ((io > 2000) && MASTER(cr))
 +            fprintf(stderr,
 +                    "\nWARNING: This run will generate roughly %.0f Mb of data\n\n",
 +                    io);
 +    }
 +
 +    top = gmx_mtop_generate_local_top(top_global,ir);
 +
 +    a0 = 0;
 +    a1 = top_global->natoms;
 +
 +    state = partdec_init_local_state(cr,state_global);
 +    f_global = f;
 +
 +    atoms2md(top_global,ir,0,NULL,a0,a1-a0,mdatoms);
 +
 +    if (vsite)
 +    {
 +        set_vsite_top(vsite,top,mdatoms,cr);
 +    }
 +
 +    if (ir->ePBC != epbcNONE && !ir->bPeriodicMols)
 +    {
 +        graph = mk_graph(fplog,&(top->idef),0,top_global->natoms,FALSE,FALSE);
 +    }
 +
 +    update_mdatoms(mdatoms,state->lambda);
 +
 +    if (deviceOptions[0]=='\0')
 +    {
 +        /* empty options, which should default to OpenMM in this build */
 +        ommOptions=deviceOptions;
 +    }
 +    else
 +    {
 +        if (gmx_strncasecmp(deviceOptions,"OpenMM",6)!=0)
 +        {
 +            gmx_fatal(FARGS, "This Gromacs version currently only works with OpenMM. Use -device \"OpenMM:<options>\"");
 +        }
 +        else
 +        {
 +            ommOptions=strchr(deviceOptions,':');
 +            if (NULL!=ommOptions)
 +            {
 +                /* Increase the pointer to skip the colon */
 +                ommOptions++;
 +            }
 +        }
 +    }
 +
 +    openmmData = openmm_init(fplog, ommOptions, ir, top_global, top, mdatoms, fr, state);
 +    please_cite(fplog,"Friedrichs2009");
 +
 +    if (MASTER(cr))
 +    {
 +        /* Update mdebin with energy history if appending to output files */
 +        if ( Flags & MD_APPENDFILES )
 +        {
 +            restore_energyhistory_from_state(mdebin,&state_global->enerhist);
 +        }
 +        /* Set the initial energy history in state to zero by updating once */
 +        update_energyhistory(&state_global->enerhist,mdebin);
 +    }
 +
 +    if (constr)
 +    {
 +        set_constraints(constr,top,ir,mdatoms,cr);
 +    }
 +
 +    if (!ir->bContinuation)
 +    {
 +        if (mdatoms->cFREEZE && (state->flags & (1<<estV)))
 +        {
 +            /* Set the velocities of frozen particles to zero */
 +            for (i=mdatoms->start; i<mdatoms->start+mdatoms->homenr; i++)
 +            {
 +                for (m=0; m<DIM; m++)
 +                {
 +                    if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
 +                    {
 +                        state->v[i][m] = 0;
 +                    }
 +                }
 +            }
 +        }
 +
 +        if (constr)
 +        {
 +            /* Constrain the initial coordinates and velocities */
 +            do_constrain_first(fplog,constr,ir,mdatoms,state,f,
 +                               graph,cr,nrnb,fr,top,shake_vir);
 +        }
 +        if (vsite)
 +        {
 +            /* Construct the virtual sites for the initial configuration */
 +            construct_vsites(fplog,vsite,state->x,nrnb,ir->delta_t,NULL,
 +                             top->idef.iparams,top->idef.il,
 +                             fr->ePBC,fr->bMolPBC,graph,cr,state->box);
 +        }
 +    }
 +
 +    debug_gmx();
 +
 +    if (MASTER(cr))
 +    {
 +        char tbuf[20];
 +        fprintf(stderr,"starting mdrun '%s'\n",
 +                *(top_global->name));
 +        if (ir->nsteps >= 0)
 +        {
 +            sprintf(tbuf,"%8.1f",(ir->init_step+ir->nsteps)*ir->delta_t);
 +        }
 +        else
 +        {
 +            sprintf(tbuf,"%s","infinite");
 +        }
 +        if (ir->init_step > 0)
 +        {
 +            fprintf(stderr,"%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
 +                    gmx_step_str(ir->init_step+ir->nsteps,sbuf),tbuf,
 +                    gmx_step_str(ir->init_step,sbuf2),
 +                    ir->init_step*ir->delta_t);
 +        }
 +        else
 +        {
 +            fprintf(stderr,"%s steps, %s ps.\n",
 +                    gmx_step_str(ir->nsteps,sbuf),tbuf);
 +        }
 +    }
 +
 +    fprintf(fplog,"\n");
 +
 +    /* Set and write start time */
 +    runtime_start(runtime);
 +    print_date_and_time(fplog,cr->nodeid,"Started mdrun",runtime);
 +    wallcycle_start(wcycle,ewcRUN);
 +    if (fplog)
 +        fprintf(fplog,"\n");
 +
 +    /* safest point to do file checkpointing is here.  More general point would be immediately before integrator call */
 +
 +    debug_gmx();
 +    /***********************************************************
 +     *
 +     *             Loop over MD steps
 +     *
 +     ************************************************************/
 +
 +    /* loop over MD steps or if rerunMD to end of input trajectory */
 +    bFirstStep = TRUE;
 +    /* Skip the first Nose-Hoover integration when we get the state from tpx */
 +    bStateFromTPX = !opt2bSet("-cpi",nfile,fnm);
 +    bInitStep = bFirstStep && bStateFromTPX;
 +    bStartingFromCpt = (Flags & MD_STARTFROMCPT) && bInitStep;
 +    bLastStep = FALSE;
 +
 +    init_global_signals(&gs,cr,ir,repl_ex_nst);
 +
 +    step = ir->init_step;
 +    step_rel = 0;
 +
 +    while (!bLastStep)
 +    {
 +        wallcycle_start(wcycle,ewcSTEP);
 +
 +        bLastStep = (step_rel == ir->nsteps);
 +        t = t0 + step*ir->delta_t;
 +
 +        if (gs.set[eglsSTOPCOND] != 0)
 +        {
 +            bLastStep = TRUE;
 +        }
 +
 +        do_log = do_per_step(step,ir->nstlog) || bFirstStep || bLastStep;
 +        do_verbose = bVerbose &&
 +                     (step % stepout == 0 || bFirstStep || bLastStep);
 +
 +        if (MASTER(cr) && do_log)
 +        {
 +            print_ebin_header(fplog,step,t,state->lambda);
 +        }
 +
 +        clear_mat(force_vir);
 +
 +        /* We write a checkpoint at this MD step when:
 +         * either when we signalled through gs (in OpenMM NS works different),
 +         * or at the last step (but not when we do not want confout),
 +         * but never at the first step.
 +         */
 +        bCPT = ((gs.set[eglsCHKPT] ||
 +                 (bLastStep && (Flags & MD_CONFOUT))) &&
 +                step > ir->init_step );
 +        if (bCPT)
 +        {
 +            gs.set[eglsCHKPT] = 0;
 +        }
 +
 +        /* Now we have the energies and forces corresponding to the
 +         * coordinates at time t. We must output all of this before
 +         * the update.
 +         * for RerunMD t is read from input trajectory
 +         */
 +        mdof_flags = 0;
 +        if (do_per_step(step,ir->nstxout))
 +        {
 +            mdof_flags |= MDOF_X;
 +        }
 +        if (do_per_step(step,ir->nstvout))
 +        {
 +            mdof_flags |= MDOF_V;
 +        }
 +        if (do_per_step(step,ir->nstfout))
 +        {
 +            mdof_flags |= MDOF_F;
 +        }
 +        if (do_per_step(step,ir->nstxtcout))
 +        {
 +            mdof_flags |= MDOF_XTC;
 +        }
 +        if (bCPT)
 +        {
 +            mdof_flags |= MDOF_CPT;
 +        };
 +        do_ene = (do_per_step(step,ir->nstenergy) || bLastStep);
 +
 +        if (mdof_flags != 0 || do_ene || do_log)
 +        {
 +            wallcycle_start(wcycle,ewcTRAJ);
 +            bF = (mdof_flags & MDOF_F);
 +            bX = (mdof_flags & (MDOF_X | MDOF_XTC | MDOF_CPT));
 +            bV = (mdof_flags & (MDOF_V | MDOF_CPT));
 +
 +            openmm_copy_state(openmmData, state, &t, f, enerd, bX, bV, bF, do_ene);
 +
 +            upd_mdebin(mdebin,FALSE,TRUE,
 +                       t,mdatoms->tmass,enerd,state,lastbox,
 +                       shake_vir,force_vir,total_vir,pres,
 +                       ekind,mu_tot,constr);
 +            print_ebin(outf->fp_ene,do_ene,FALSE,FALSE,do_log?fplog:NULL,
 +                       step,t,
 +                       eprNORMAL,bCompact,mdebin,fcd,groups,&(ir->opts));
 +            write_traj(fplog,cr,outf,mdof_flags,top_global,
 +                       step,t,state,state_global,f,f_global,&n_xtc,&x_xtc);
 +            if (bCPT)
 +            {
 +                nchkpt++;
 +                bCPT = FALSE;
 +            }
 +            debug_gmx();
 +            if (bLastStep && step_rel == ir->nsteps &&
 +                    (Flags & MD_CONFOUT) && MASTER(cr))
 +            {
 +                /* x and v have been collected in write_traj,
 +                 * because a checkpoint file will always be written
 +                 * at the last step.
 +                 */
 +                fprintf(stderr,"\nWriting final coordinates.\n");
 +                if (ir->ePBC != epbcNONE && !ir->bPeriodicMols)
 +                {
 +                    /* Make molecules whole only for confout writing */
 +                    do_pbc_mtop(fplog,ir->ePBC,state->box,top_global,state_global->x);
 +                }
 +                write_sto_conf_mtop(ftp2fn(efSTO,nfile,fnm),
 +                                    *top_global->name,top_global,
 +                                    state_global->x,state_global->v,
 +                                    ir->ePBC,state->box);
 +                debug_gmx();
 +            }
 +            wallcycle_stop(wcycle,ewcTRAJ);
 +        }
 +
 +        /* Determine the wallclock run time up till now */
 +        run_time = gmx_gettime() - (double)runtime->real;
 +
 +        /* Check whether everything is still allright */
 +        if (((int)gmx_get_stop_condition() > handled_stop_condition)
 +#ifdef GMX_THREAD_MPI
 +            && MASTER(cr)
 +#endif
 +            )
 +        {
 +           /* this is just make gs.sig compatible with the hack 
 +               of sending signals around by MPI_Reduce with together with
 +               other floats */
 +            /* NOTE: this only works for serial code. For code that allows
 +               MPI nodes to propagate their condition, see kernel/md.c*/
 +            if ( gmx_get_stop_condition() == gmx_stop_cond_next_ns )
 +                gs.set[eglsSTOPCOND]=1;
 +            if ( gmx_get_stop_condition() == gmx_stop_cond_next )
 +                gs.set[eglsSTOPCOND]=1;
 +            /* < 0 means stop at next step, > 0 means stop at next NS step */
 +            if (fplog)
 +            {
 +                fprintf(fplog,
 +                        "\n\nReceived the %s signal, stopping at the next %sstep\n\n",
 +                        gmx_get_signal_name(),
 +                        gs.sig[eglsSTOPCOND]==1 ? "NS " : "");
 +                fflush(fplog);
 +            }
 +            fprintf(stderr,
 +                    "\n\nReceived the %s signal, stopping at the next %sstep\n\n",
 +                    gmx_get_signal_name(),
 +                    gs.sig[eglsSTOPCOND]==1 ? "NS " : "");
 +            fflush(stderr);
 +            handled_stop_condition=(int)gmx_get_stop_condition();
 +        }
 +        else if (MASTER(cr) &&
 +                 (max_hours > 0 && run_time > max_hours*60.0*60.0*0.99) &&
 +                 gs.set[eglsSTOPCOND] == 0)
 +        {
 +            /* Signal to terminate the run */
 +            gs.set[eglsSTOPCOND] = 1;
 +            if (fplog)
 +            {
 +                fprintf(fplog,"\nStep %s: Run time exceeded %.3f hours, will terminate the run\n",gmx_step_str(step,sbuf),max_hours*0.99);
 +            }
 +            fprintf(stderr, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n",gmx_step_str(step,sbuf),max_hours*0.99);
 +        }
 +
 +        /* checkpoints */
 +        if (MASTER(cr) && (cpt_period >= 0 &&
 +                           (cpt_period == 0 ||
 +                            run_time >= nchkpt*cpt_period*60.0)) &&
 +                gs.set[eglsCHKPT] == 0)
 +        {
 +            gs.set[eglsCHKPT] = 1;
 +        }
 +
 +        /* Time for performance */
 +        if (((step % stepout) == 0) || bLastStep)
 +        {
 +            runtime_upd_proc(runtime);
 +        }
 +
 +        if (do_per_step(step,ir->nstlog))
 +        {
 +            if (fflush(fplog) != 0)
 +            {
 +                gmx_fatal(FARGS,"Cannot flush logfile - maybe you are out of disk space?");
 +            }
 +        }
 +
 +        /* Remaining runtime */
 +        if (MULTIMASTER(cr) && (do_verbose || gmx_got_usr_signal() ))
 +        {
 +            print_time(stderr,runtime,step,ir,cr);
 +        }
 +
 +        bFirstStep = FALSE;
 +        bInitStep = FALSE;
 +        bStartingFromCpt = FALSE;
 +        step++;
 +        step_rel++;
 +
 +        openmm_take_one_step(openmmData);
 +    }
 +    /* End of main MD loop */
 +    debug_gmx();
 +
 +    /* Stop the time */
 +    runtime_end(runtime);
 +
 +    if (MASTER(cr))
 +    {
 +        if (ir->nstcalcenergy > 0) 
 +        {
 +            print_ebin(outf->fp_ene,FALSE,FALSE,FALSE,fplog,step,t,
 +                       eprAVER,FALSE,mdebin,fcd,groups,&(ir->opts));
 +        }
 +    }
 +
 +    openmm_cleanup(fplog, openmmData);
 +
 +    done_mdoutf(outf);
 +
 +    debug_gmx();
 +
 +    runtime->nsteps_done = step_rel;
 +
 +    return 0;
 +}
index 552865e7427e3e5c2ae1c0a570b7bb6b832e54db,0000000000000000000000000000000000000000..e10938a998daa8c14bcccef7d888ac8505327c4a
mode 100644,000000..100644
--- /dev/null
@@@ -1,702 -1,0 +1,701 @@@
-   PCA_Flags = (PCA_KEEP_ARGS | PCA_NOEXIT_ON_ARGS | PCA_CAN_SET_DEFFNM
-              | (MASTER(cr) ? 0 : PCA_QUIET));
 +/*  -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include "typedefs.h"
 +#include "macros.h"
 +#include "copyrite.h"
 +#include "main.h"
 +#include "statutil.h"
 +#include "smalloc.h"
 +#include "futil.h"
 +#include "smalloc.h"
 +#include "edsam.h"
 +#include "mdrun.h"
 +#include "xmdrun.h"
 +#include "checkpoint.h"
 +#ifdef GMX_THREAD_MPI
 +#include "thread_mpi.h"
 +#endif
 +
 +/* afm stuf */
 +#include "pull.h"
 +
 +int main(int argc,char *argv[])
 +{
 +  const char *desc[] = {
 + #ifdef GMX_OPENMM
 +    "This is an experimental release of GROMACS for accelerated",
 +      "Molecular Dynamics simulations on GPU processors. Support is provided",
 +      "by the OpenMM library (https://simtk.org/home/openmm).[PAR]",
 +      "*Warning*[BR]",
 +      "This release is targeted at developers and advanced users and",
 +      "care should be taken before production use. The following should be",
 +      "noted before using the program:[PAR]",
 +      " * The current release runs only on modern nVidia GPU hardware with CUDA support.",
 +      "Make sure that the necessary CUDA drivers and libraries for your operating system",
 +      "are already installed. The CUDA SDK also should be installed in order to compile",
 +      "the program from source (http://www.nvidia.com/object/cuda_home.html).[PAR]",
 +      " * Multiple GPU cards are not supported.[PAR]",
 +      " * Only a small subset of the GROMACS features and options are supported on the GPUs.",
 +      "See below for a detailed list.[PAR]",
 +      " * Consumer level GPU cards are known to often have problems with faulty memory.",
 +      "It is recommended that a full memory check of the cards is done at least once",
 +      "(for example, using the memtest=full option).",
 +      "A partial memory check (for example, memtest=15) before and",
 +      "after the simulation run would help spot",
 +      "problems resulting from processor overheating.[PAR]",
 +      " * The maximum size of the simulated systems depends on the available",
 +      "GPU memory,for example, a GTX280 with 1GB memory has been tested with systems",
 +      "of up to about 100,000 atoms.[PAR]",
 +      " * In order to take a full advantage of the GPU platform features, many algorithms",
 +      "have been implemented in a very different way than they are on the CPUs.",
 +      "Therefore numercal correspondence between properties of the state of",
 +      "simulated systems should not be expected. Moreover, the values will likely vary",
 +      "when simulations are done on different GPU hardware.[PAR]",
 +      " * Frequent retrieval of system state information such as",
 +      "trajectory coordinates and energies can greatly influence the performance",
 +      "of the program due to slow CPU<->GPU memory transfer speed.[PAR]",
 +      " * MD algorithms are complex, and although the Gromacs code is highly tuned for them,",
 +      "they often do not translate very well onto the streaming architetures.",
 +      "Realistic expectations about the achievable speed-up from test with GTX280:",
 +      "For small protein systems in implicit solvent using all-vs-all kernels the acceleration",
 +      "can be as high as 20 times, but in most other setups involving cutoffs and PME the",
 +      "acceleration is usually only ~4 times relative to a 3GHz CPU.[PAR]",
 +      "Supported features:[PAR]",
 +      " * Integrators: md/md-vv/md-vv-avek, sd/sd1 and bd.\n",
 +      " * Long-range interactions (option coulombtype): Reaction-Field, Ewald, PME, and cut-off (for Implicit Solvent only)\n",
 +      " * Temperature control: Supported only with the md/md-vv/md-vv-avek, sd/sd1 and bd integrators.\n",
 +      " * Pressure control: Supported.\n",
 +      " * Implicit solvent: Supported.\n",
 +      "A detailed description can be found on the GROMACS website:\n",
 +      "http://www.gromacs.org/gpu[PAR]",
 +/* From the original mdrun documentaion */
 +    "The [TT]mdrun[tt] program reads the run input file ([TT]-s[tt])",
 +    "and distributes the topology over nodes if needed.",
 +    "[TT]mdrun[tt] produces at least four output files.",
 +    "A single log file ([TT]-g[tt]) is written, unless the option",
 +    "[TT]-seppot[tt] is used, in which case each node writes a log file.",
 +    "The trajectory file ([TT]-o[tt]), contains coordinates, velocities and",
 +    "optionally forces.",
 +    "The structure file ([TT]-c[tt]) contains the coordinates and",
 +    "velocities of the last step.",
 +    "The energy file ([TT]-e[tt]) contains energies, the temperature,",
 +    "pressure, etc, a lot of these things are also printed in the log file.",
 +    "Optionally coordinates can be written to a compressed trajectory file",
 +    "([TT]-x[tt]).[PAR]",
 +/* openmm specific information */
 +      "Usage with OpenMM:[BR]",
 +      "[TT]mdrun -device \"OpenMM:platform=Cuda,memtest=15,deviceid=0,force-device=no\"[tt][PAR]",
 +      "Options:[PAR]",
 +      "      [TT]platform[tt] = Cuda\t\t:\tThe only available value. OpenCL support will be available in future.\n",
 +      "      [TT]memtest[tt] = 15\t\t:\tRun a partial, random GPU memory test for the given amount of seconds. A full test",
 +      "(recommended!) can be run with \"memtest=full\". Memory testing can be disabled with \"memtest=off\".\n",
 +      "      [TT]deviceid[tt] = 0\t\t:\tSpecify the target device when multiple cards are present.",
 +      "Only one card can be used at any given time though.\n",
 +      "      [TT]force-device[tt] = no\t\t:\tIf set to \"yes\" [TT]mdrun[tt]  will be forced to execute on",
 +      "hardware that is not officially supported. GPU acceleration can also be achieved on older",
 +      "but Cuda capable cards, although the simulation might be too slow, and the memory limits too strict.",
 +#else
 +    "The [TT]mdrun[tt] program is the main computational chemistry engine",
 +    "within GROMACS. Obviously, it performs Molecular Dynamics simulations,",
 +    "but it can also perform Stochastic Dynamics, Energy Minimization,",
 +    "test particle insertion or (re)calculation of energies.",
 +    "Normal mode analysis is another option. In this case [TT]mdrun[tt]",
 +    "builds a Hessian matrix from single conformation.",
 +    "For usual Normal Modes-like calculations, make sure that",
 +    "the structure provided is properly energy-minimized.",
 +    "The generated matrix can be diagonalized by [TT]g_nmeig[tt].[PAR]",
 +    "The [TT]mdrun[tt] program reads the run input file ([TT]-s[tt])",
 +    "and distributes the topology over nodes if needed.",
 +    "[TT]mdrun[tt] produces at least four output files.",
 +    "A single log file ([TT]-g[tt]) is written, unless the option",
 +    "[TT]-seppot[tt] is used, in which case each node writes a log file.",
 +    "The trajectory file ([TT]-o[tt]), contains coordinates, velocities and",
 +    "optionally forces.",
 +    "The structure file ([TT]-c[tt]) contains the coordinates and",
 +    "velocities of the last step.",
 +    "The energy file ([TT]-e[tt]) contains energies, the temperature,",
 +    "pressure, etc, a lot of these things are also printed in the log file.",
 +    "Optionally coordinates can be written to a compressed trajectory file",
 +    "([TT]-x[tt]).[PAR]",
 +    "The option [TT]-dhdl[tt] is only used when free energy calculation is",
 +    "turned on.[PAR]",
 +    "When [TT]mdrun[tt] is started using MPI with more than 1 node, parallelization",
 +    "is used. By default domain decomposition is used, unless the [TT]-pd[tt]",
 +    "option is set, which selects particle decomposition.[PAR]",
 +    "With domain decomposition, the spatial decomposition can be set",
 +    "with option [TT]-dd[tt]. By default [TT]mdrun[tt] selects a good decomposition.",
 +    "The user only needs to change this when the system is very inhomogeneous.",
 +    "Dynamic load balancing is set with the option [TT]-dlb[tt],",
 +    "which can give a significant performance improvement,",
 +    "especially for inhomogeneous systems. The only disadvantage of",
 +    "dynamic load balancing is that runs are no longer binary reproducible,",
 +    "but in most cases this is not important.",
 +    "By default the dynamic load balancing is automatically turned on",
 +    "when the measured performance loss due to load imbalance is 5% or more.",
 +    "At low parallelization these are the only important options",
 +    "for domain decomposition.",
 +    "At high parallelization the options in the next two sections",
 +    "could be important for increasing the performace.",
 +    "[PAR]",
 +    "When PME is used with domain decomposition, separate nodes can",
 +    "be assigned to do only the PME mesh calculation;",
 +    "this is computationally more efficient starting at about 12 nodes.",
 +    "The number of PME nodes is set with option [TT]-npme[tt],",
 +    "this can not be more than half of the nodes.",
 +    "By default [TT]mdrun[tt] makes a guess for the number of PME",
 +    "nodes when the number of nodes is larger than 11 or performance wise",
 +    "not compatible with the PME grid x dimension.",
 +    "But the user should optimize npme. Performance statistics on this issue",
 +    "are written at the end of the log file.",
 +    "For good load balancing at high parallelization, the PME grid x and y",
 +    "dimensions should be divisible by the number of PME nodes",
 +    "(the simulation will run correctly also when this is not the case).",
 +    "[PAR]",
 +    "This section lists all options that affect the domain decomposition.",
 +    "[PAR]",
 +    "Option [TT]-rdd[tt] can be used to set the required maximum distance",
 +    "for inter charge-group bonded interactions.",
 +    "Communication for two-body bonded interactions below the non-bonded",
 +    "cut-off distance always comes for free with the non-bonded communication.",
 +    "Atoms beyond the non-bonded cut-off are only communicated when they have",
 +    "missing bonded interactions; this means that the extra cost is minor",
 +    "and nearly indepedent of the value of [TT]-rdd[tt].",
 +    "With dynamic load balancing option [TT]-rdd[tt] also sets",
 +    "the lower limit for the domain decomposition cell sizes.",
 +    "By default [TT]-rdd[tt] is determined by [TT]mdrun[tt] based on",
 +    "the initial coordinates. The chosen value will be a balance",
 +    "between interaction range and communication cost.",
 +    "[PAR]",
 +    "When inter charge-group bonded interactions are beyond",
 +    "the bonded cut-off distance, [TT]mdrun[tt] terminates with an error message.",
 +    "For pair interactions and tabulated bonds",
 +    "that do not generate exclusions, this check can be turned off",
 +    "with the option [TT]-noddcheck[tt].",
 +    "[PAR]",
 +    "When constraints are present, option [TT]-rcon[tt] influences",
 +    "the cell size limit as well.",
 +    "Atoms connected by NC constraints, where NC is the LINCS order plus 1,",
 +    "should not be beyond the smallest cell size. A error message is",
 +    "generated when this happens and the user should change the decomposition",
 +    "or decrease the LINCS order and increase the number of LINCS iterations.",
 +    "By default [TT]mdrun[tt] estimates the minimum cell size required for P-LINCS",
 +    "in a conservative fashion. For high parallelization it can be useful",
 +    "to set the distance required for P-LINCS with the option [TT]-rcon[tt].",
 +    "[PAR]",
 +    "The [TT]-dds[tt] option sets the minimum allowed x, y and/or z scaling",
 +    "of the cells with dynamic load balancing. [TT]mdrun[tt] will ensure that",
 +    "the cells can scale down by at least this factor. This option is used",
 +    "for the automated spatial decomposition (when not using [TT]-dd[tt])",
 +    "as well as for determining the number of grid pulses, which in turn",
 +    "sets the minimum allowed cell size. Under certain circumstances",
 +    "the value of [TT]-dds[tt] might need to be adjusted to account for",
 +    "high or low spatial inhomogeneity of the system.",
 +    "[PAR]",
 +    "The option [TT]-gcom[tt] can be used to only do global communication",
 +    "every n steps.",
 +    "This can improve performance for highly parallel simulations",
 +    "where this global communication step becomes the bottleneck.",
 +    "For a global thermostat and/or barostat the temperature",
 +    "and/or pressure will also only be updated every [TT]-gcom[tt] steps.",
 +    "By default it is set to the minimum of nstcalcenergy and nstlist.[PAR]",
 +    "With [TT]-rerun[tt] an input trajectory can be given for which ",
 +    "forces and energies will be (re)calculated. Neighbor searching will be",
 +    "performed for every frame, unless [TT]nstlist[tt] is zero",
 +    "(see the [TT].mdp[tt] file).[PAR]",
 +    "ED (essential dynamics) sampling is switched on by using the [TT]-ei[tt]",
 +    "flag followed by an [TT].edi[tt] file.",
 +    "The [TT].edi[tt] file can be produced using options in the essdyn",
 +    "menu of the WHAT IF program. [TT]mdrun[tt] produces a [TT].edo[tt] file that",
 +    "contains projections of positions, velocities and forces onto selected",
 +    "eigenvectors.[PAR]",
 +    "When user-defined potential functions have been selected in the",
 +    "[TT].mdp[tt] file the [TT]-table[tt] option is used to pass [TT]mdrun[tt]",
 +    "a formatted table with potential functions. The file is read from",
 +    "either the current directory or from the [TT]GMXLIB[tt] directory.",
 +    "A number of pre-formatted tables are presented in the [TT]GMXLIB[tt] dir,",
 +    "for 6-8, 6-9, 6-10, 6-11, 6-12 Lennard-Jones potentials with",
 +    "normal Coulomb.",
 +    "When pair interactions are present, a separate table for pair interaction",
 +    "functions is read using the [TT]-tablep[tt] option.[PAR]",
 +    "When tabulated bonded functions are present in the topology,",
 +    "interaction functions are read using the [TT]-tableb[tt] option.",
 +    "For each different tabulated interaction type the table file name is",
 +    "modified in a different way: before the file extension an underscore is",
 +    "appended, then a 'b' for bonds, an 'a' for angles or a 'd' for dihedrals",
 +    "and finally the table number of the interaction type.[PAR]",
 +    "The options [TT]-px[tt] and [TT]-pf[tt] are used for writing pull COM",
 +    "coordinates and forces when pulling is selected",
 +    "in the [TT].mdp[tt] file.[PAR]",
 +    "With [TT]-multi[tt] or [TT]-multidir[tt], multiple systems can be ",
 +    "simulated in parallel.",
 +    "As many input files/directories are required as the number of systems. ",
 +    "The [TT]-multidir[tt] option takes a list of directories (one for each ",
 +    "system) and runs in each of them, using the input/output file names, ",
 +    "such as specified by e.g. the [TT]-s[tt] option, relative to these ",
 +    "directories.",
 +    "With [TT]-multi[tt], the system number is appended to the run input ",
 +    "and each output filename, for instance [TT]topol.tpr[tt] becomes",
 +    "[TT]topol0.tpr[tt], [TT]topol1.tpr[tt] etc.",
 +    "The number of nodes per system is the total number of nodes",
 +    "divided by the number of systems.",
 +    "One use of this option is for NMR refinement: when distance",
 +    "or orientation restraints are present these can be ensemble averaged",
 +    "over all the systems.[PAR]",
 +    "With [TT]-replex[tt] replica exchange is attempted every given number",
 +    "of steps. The number of replicas is set with the [TT]-multi[tt] or ",
 +    "[TT]-multidir[tt] option, described above.",
 +    "All run input files should use a different coupling temperature,",
 +    "the order of the files is not important. The random seed is set with",
 +    "[TT]-reseed[tt]. The velocities are scaled and neighbor searching",
 +    "is performed after every exchange.[PAR]",
 +    "Finally some experimental algorithms can be tested when the",
 +    "appropriate options have been given. Currently under",
 +    "investigation are: polarizability and X-ray bombardments.",
 +    "[PAR]",
 +    "The option [TT]-membed[dd] does what used to be g_membed, i.e. embed",
 +    "a protein into a membrane. The data file should contain the options",
 +    "that where passed to g_membed before. The [TT]-mn[tt] and [TT]-mp[tt]",
 +    "both apply to this as well.",
 +    "[PAR]",
 +    "The option [TT]-pforce[tt] is useful when you suspect a simulation",
 +    "crashes due to too large forces. With this option coordinates and",
 +    "forces of atoms with a force larger than a certain value will",
 +    "be printed to stderr.",
 +    "[PAR]",
 +    "Checkpoints containing the complete state of the system are written",
 +    "at regular intervals (option [TT]-cpt[tt]) to the file [TT]-cpo[tt],",
 +    "unless option [TT]-cpt[tt] is set to -1.",
 +    "The previous checkpoint is backed up to [TT]state_prev.cpt[tt] to",
 +    "make sure that a recent state of the system is always available,",
 +    "even when the simulation is terminated while writing a checkpoint.",
 +    "With [TT]-cpnum[tt] all checkpoint files are kept and appended",
 +    "with the step number.",
 +    "A simulation can be continued by reading the full state from file",
 +    "with option [TT]-cpi[tt]. This option is intelligent in the way that",
 +    "if no checkpoint file is found, Gromacs just assumes a normal run and",
 +    "starts from the first step of the [TT].tpr[tt] file. By default the output",
 +    "will be appending to the existing output files. The checkpoint file",
 +    "contains checksums of all output files, such that you will never",
 +    "loose data when some output files are modified, corrupt or removed.",
 +    "There are three scenarios with [TT]-cpi[tt]:[PAR]",
 +    "[TT]*[tt] no files with matching names are present: new output files are written[PAR]",
 +    "[TT]*[tt] all files are present with names and checksums matching those stored",
 +    "in the checkpoint file: files are appended[PAR]",
 +    "[TT]*[tt] otherwise no files are modified and a fatal error is generated[PAR]",
 +    "With [TT]-noappend[tt] new output files are opened and the simulation",
 +    "part number is added to all output file names.",
 +    "Note that in all cases the checkpoint file itself is not renamed",
 +    "and will be overwritten, unless its name does not match",
 +    "the [TT]-cpo[tt] option.",
 +    "[PAR]",
 +    "With checkpointing the output is appended to previously written",
 +    "output files, unless [TT]-noappend[tt] is used or none of the previous",
 +    "output files are present (except for the checkpoint file).",
 +    "The integrity of the files to be appended is verified using checksums",
 +    "which are stored in the checkpoint file. This ensures that output can",
 +    "not be mixed up or corrupted due to file appending. When only some",
 +    "of the previous output files are present, a fatal error is generated",
 +    "and no old output files are modified and no new output files are opened.",
 +    "The result with appending will be the same as from a single run.",
 +    "The contents will be binary identical, unless you use a different number",
 +    "of nodes or dynamic load balancing or the FFT library uses optimizations",
 +    "through timing.",
 +    "[PAR]",
 +    "With option [TT]-maxh[tt] a simulation is terminated and a checkpoint",
 +    "file is written at the first neighbor search step where the run time",
 +    "exceeds [TT]-maxh[tt]*0.99 hours.",
 +    "[PAR]",
 +    "When [TT]mdrun[tt] receives a TERM signal, it will set nsteps to the current",
 +    "step plus one. When [TT]mdrun[tt] receives an INT signal (e.g. when ctrl+C is",
 +    "pressed), it will stop after the next neighbor search step ",
 +    "(with nstlist=0 at the next step).",
 +    "In both cases all the usual output will be written to file.",
 +    "When running with MPI, a signal to one of the [TT]mdrun[tt] processes",
 +    "is sufficient, this signal should not be sent to mpirun or",
 +    "the [TT]mdrun[tt] process that is the parent of the others.",
 +    "[PAR]",
 +    "When [TT]mdrun[tt] is started with MPI, it does not run niced by default."
 +#endif
 +  };
 +  t_commrec    *cr;
 +  t_filenm fnm[] = {
 +    { efTPX, NULL,      NULL,       ffREAD },
 +    { efTRN, "-o",      NULL,       ffWRITE },
 +    { efXTC, "-x",      NULL,       ffOPTWR },
 +    { efCPT, "-cpi",    NULL,       ffOPTRD },
 +    { efCPT, "-cpo",    NULL,       ffOPTWR },
 +    { efSTO, "-c",      "confout",  ffWRITE },
 +    { efEDR, "-e",      "ener",     ffWRITE },
 +    { efLOG, "-g",      "md",       ffWRITE },
 +    { efXVG, "-dhdl",   "dhdl",     ffOPTWR },
 +    { efXVG, "-field",  "field",    ffOPTWR },
 +    { efXVG, "-table",  "table",    ffOPTRD },
 +    { efXVG, "-tabletf", "tabletf",    ffOPTRD },
 +    { efXVG, "-tablep", "tablep",   ffOPTRD },
 +    { efXVG, "-tableb", "table",    ffOPTRD },
 +    { efTRX, "-rerun",  "rerun",    ffOPTRD },
 +    { efXVG, "-tpi",    "tpi",      ffOPTWR },
 +    { efXVG, "-tpid",   "tpidist",  ffOPTWR },
 +    { efEDI, "-ei",     "sam",      ffOPTRD },
 +    { efEDO, "-eo",     "sam",      ffOPTWR },
 +    { efGCT, "-j",      "wham",     ffOPTRD },
 +    { efGCT, "-jo",     "bam",      ffOPTWR },
 +    { efXVG, "-ffout",  "gct",      ffOPTWR },
 +    { efXVG, "-devout", "deviatie", ffOPTWR },
 +    { efXVG, "-runav",  "runaver",  ffOPTWR },
 +    { efXVG, "-px",     "pullx",    ffOPTWR },
 +    { efXVG, "-pf",     "pullf",    ffOPTWR },
 +    { efXVG, "-ro",     "rotation", ffOPTWR },
 +    { efLOG, "-ra",     "rotangles",ffOPTWR },
 +    { efLOG, "-rs",     "rotslabs", ffOPTWR },
 +    { efLOG, "-rt",     "rottorque",ffOPTWR },
 +    { efMTX, "-mtx",    "nm",       ffOPTWR },
 +    { efNDX, "-dn",     "dipole",   ffOPTWR },
 +    { efDAT, "-membed", "membed",   ffOPTRD },
 +    { efTOP, "-mp",     "membed",   ffOPTRD },
 +    { efNDX, "-mn",     "membed",   ffOPTRD },
 +    { efRND, "-multidir",NULL,      ffOPTRDMULT}
 +  };
 +#define NFILE asize(fnm)
 +
 +  /* Command line options ! */
 +  gmx_bool bCart        = FALSE;
 +  gmx_bool bPPPME       = FALSE;
 +  gmx_bool bPartDec     = FALSE;
 +  gmx_bool bDDBondCheck = TRUE;
 +  gmx_bool bDDBondComm  = TRUE;
 +  gmx_bool bVerbose     = FALSE;
 +  gmx_bool bCompact     = TRUE;
 +  gmx_bool bSepPot      = FALSE;
 +  gmx_bool bRerunVSite  = FALSE;
 +  gmx_bool bIonize      = FALSE;
 +  gmx_bool bConfout     = TRUE;
 +  gmx_bool bReproducible = FALSE;
 +    
 +  int  npme=-1;
 +  int  nmultisim=0;
 +  int  nstglobalcomm=-1;
 +  int  repl_ex_nst=0;
 +  int  repl_ex_seed=-1;
 +  int  nstepout=100;
 +  int  nthreads=0; /* set to determine # of threads automatically */
 +  int  resetstep=-1;
 +  
 +  rvec realddxyz={0,0,0};
 +  const char *ddno_opt[ddnoNR+1] =
 +    { NULL, "interleave", "pp_pme", "cartesian", NULL };
 +    const char *dddlb_opt[] =
 +    { NULL, "auto", "no", "yes", NULL };
 +  real rdd=0.0,rconstr=0.0,dlb_scale=0.8,pforce=-1;
 +  char *ddcsx=NULL,*ddcsy=NULL,*ddcsz=NULL;
 +  real cpt_period=15.0,max_hours=-1;
 +  gmx_bool bAppendFiles=TRUE;
 +  gmx_bool bKeepAndNumCPT=FALSE;
 +  gmx_bool bResetCountersHalfWay=FALSE;
 +  output_env_t oenv=NULL;
 +  const char *deviceOptions = "";
 +
 +  t_pargs pa[] = {
 +
 +    { "-pd",      FALSE, etBOOL,{&bPartDec},
 +      "Use particle decompostion" },
 +    { "-dd",      FALSE, etRVEC,{&realddxyz},
 +      "Domain decomposition grid, 0 is optimize" },
 +#ifdef GMX_THREAD_MPI
 +    { "-nt",      FALSE, etINT, {&nthreads},
 +      "Number of threads to start (0 is guess)" },
 +#endif
 +    { "-npme",    FALSE, etINT, {&npme},
 +      "Number of separate nodes to be used for PME, -1 is guess" },
 +    { "-ddorder", FALSE, etENUM, {ddno_opt},
 +      "DD node order" },
 +    { "-ddcheck", FALSE, etBOOL, {&bDDBondCheck},
 +      "Check for all bonded interactions with DD" },
 +    { "-ddbondcomm", FALSE, etBOOL, {&bDDBondComm},
 +      "HIDDENUse special bonded atom communication when [TT]-rdd[tt] > cut-off" },
 +    { "-rdd",     FALSE, etREAL, {&rdd},
 +      "The maximum distance for bonded interactions with DD (nm), 0 is determine from initial coordinates" },
 +    { "-rcon",    FALSE, etREAL, {&rconstr},
 +      "Maximum distance for P-LINCS (nm), 0 is estimate" },
 +    { "-dlb",     FALSE, etENUM, {dddlb_opt},
 +      "Dynamic load balancing (with DD)" },
 +    { "-dds",     FALSE, etREAL, {&dlb_scale},
 +      "Minimum allowed dlb scaling of the DD cell size" },
 +    { "-ddcsx",   FALSE, etSTR, {&ddcsx},
 +      "HIDDENThe DD cell sizes in x" },
 +    { "-ddcsy",   FALSE, etSTR, {&ddcsy},
 +      "HIDDENThe DD cell sizes in y" },
 +    { "-ddcsz",   FALSE, etSTR, {&ddcsz},
 +      "HIDDENThe DD cell sizes in z" },
 +    { "-gcom",    FALSE, etINT,{&nstglobalcomm},
 +      "Global communication frequency" },
 +    { "-v",       FALSE, etBOOL,{&bVerbose},  
 +      "Be loud and noisy" },
 +    { "-compact", FALSE, etBOOL,{&bCompact},  
 +      "Write a compact log file" },
 +    { "-seppot",  FALSE, etBOOL, {&bSepPot},
 +      "Write separate V and dVdl terms for each interaction type and node to the log file(s)" },
 +    { "-pforce",  FALSE, etREAL, {&pforce},
 +      "Print all forces larger than this (kJ/mol nm)" },
 +    { "-reprod",  FALSE, etBOOL,{&bReproducible},  
 +      "Try to avoid optimizations that affect binary reproducibility" },
 +    { "-cpt",     FALSE, etREAL, {&cpt_period},
 +      "Checkpoint interval (minutes)" },
 +    { "-cpnum",   FALSE, etBOOL, {&bKeepAndNumCPT},
 +      "Keep and number checkpoint files" },
 +    { "-append",  FALSE, etBOOL, {&bAppendFiles},
 +      "Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names" },
 +    { "-maxh",   FALSE, etREAL, {&max_hours},
 +      "Terminate after 0.99 times this time (hours)" },
 +    { "-multi",   FALSE, etINT,{&nmultisim}, 
 +      "Do multiple simulations in parallel" },
 +    { "-replex",  FALSE, etINT, {&repl_ex_nst}, 
 +      "Attempt replica exchange periodically with this period (steps)" },
 +    { "-reseed",  FALSE, etINT, {&repl_ex_seed}, 
 +      "Seed for replica exchange, -1 is generate a seed" },
 +    { "-rerunvsite", FALSE, etBOOL, {&bRerunVSite},
 +      "HIDDENRecalculate virtual site coordinates with [TT]-rerun[tt]" },
 +    { "-ionize",  FALSE, etBOOL,{&bIonize},
 +      "Do a simulation including the effect of an X-Ray bombardment on your system" },
 +    { "-confout", FALSE, etBOOL, {&bConfout},
 +      "HIDDENWrite the last configuration with [TT]-c[tt] and force checkpointing at the last step" },
 +    { "-stepout", FALSE, etINT, {&nstepout},
 +      "HIDDENFrequency of writing the remaining runtime" },
 +    { "-resetstep", FALSE, etINT, {&resetstep},
 +      "HIDDENReset cycle counters after these many time steps" },
 +    { "-resethway", FALSE, etBOOL, {&bResetCountersHalfWay},
 +      "HIDDENReset the cycle counters after half the number of steps or halfway [TT]-maxh[tt]" }
 +#ifdef GMX_OPENMM
 +    ,
 +    { "-device",  FALSE, etSTR, {&deviceOptions},
 +      "Device option string" }
 +#endif
 +  };
 +  gmx_edsam_t  ed;
 +  unsigned long Flags, PCA_Flags;
 +  ivec     ddxyz;
 +  int      dd_node_order;
 +  gmx_bool     bAddPart;
 +  FILE     *fplog,*fptest;
 +  int      sim_part,sim_part_fn;
 +  const char *part_suffix=".part";
 +  char     suffix[STRLEN];
 +  int      rc;
 +  char **multidir=NULL;
 +
 +
 +  cr = init_par(&argc,&argv);
 +
 +  if (MASTER(cr))
 +    CopyRight(stderr, argv[0]);
 +
++  PCA_Flags = (PCA_CAN_SET_DEFFNM | (MASTER(cr) ? 0 : PCA_QUIET));
 +  
 +  /* Comment this in to do fexist calls only on master
 +   * works not with rerun or tables at the moment
 +   * also comment out the version of init_forcerec in md.c 
 +   * with NULL instead of opt2fn
 +   */
 +  /*
 +     if (!MASTER(cr))
 +     {
 +     PCA_Flags |= PCA_NOT_READ_NODE;
 +     }
 +     */
 +
 +  parse_common_args(&argc,argv,PCA_Flags, NFILE,fnm,asize(pa),pa,
 +                    asize(desc),desc,0,NULL, &oenv);
 +
 +
 +
 +  /* we set these early because they might be used in init_multisystem() 
 +     Note that there is the potential for npme>nnodes until the number of
 +     threads is set later on, if there's thread parallelization. That shouldn't
 +     lead to problems. */ 
 +  dd_node_order = nenum(ddno_opt);
 +  cr->npmenodes = npme;
 +
 +#ifndef GMX_THREAD_MPI
 +  nthreads=1;
 +#endif
 +
 +  /* now check the -multi and -multidir option */
 +  if (opt2bSet("-multidir", NFILE, fnm))
 +  {
 +      int i;
 +      if (nmultisim > 0)
 +      {
 +          gmx_fatal(FARGS, "mdrun -multi and -multidir options are mutually exclusive.");
 +      }
 +      nmultisim = opt2fns(&multidir, "-multidir", NFILE, fnm);
 +  }
 +
 +
 +  if (repl_ex_nst != 0 && nmultisim < 2)
 +      gmx_fatal(FARGS,"Need at least two replicas for replica exchange (option -multi)");
 +
 +  if (nmultisim > 1) {
 +#ifndef GMX_THREAD_MPI
 +    gmx_bool bParFn = (multidir == NULL);
 +    init_multisystem(cr, nmultisim, multidir, NFILE, fnm, bParFn);
 +#else
 +    gmx_fatal(FARGS,"mdrun -multi is not supported with the thread library.Please compile GROMACS with MPI support");
 +#endif
 +  }
 +
 +  bAddPart = !bAppendFiles;
 +
 +  /* Check if there is ANY checkpoint file available */       
 +  sim_part    = 1;
 +  sim_part_fn = sim_part;
 +  if (opt2bSet("-cpi",NFILE,fnm))
 +  {
 +      if (bSepPot && bAppendFiles)
 +      {
 +          gmx_fatal(FARGS,"Output file appending is not supported with -seppot");
 +      }
 +
 +      bAppendFiles =
 +                read_checkpoint_simulation_part(opt2fn_master("-cpi", NFILE,
 +                                                              fnm,cr),
 +                                                &sim_part_fn,NULL,cr,
 +                                                bAppendFiles,NFILE,fnm,
 +                                                part_suffix,&bAddPart);
 +      if (sim_part_fn==0 && MASTER(cr))
 +      {
 +          fprintf(stdout,"No previous checkpoint file present, assuming this is a new run.\n");
 +      }
 +      else
 +      {
 +          sim_part = sim_part_fn + 1;
 +      }
 +
 +      if (MULTISIM(cr) && MASTER(cr))
 +      {
 +          check_multi_int(stdout,cr->ms,sim_part,"simulation part");
 +      }
 +  } 
 +  else
 +  {
 +      bAppendFiles = FALSE;
 +  }
 +
 +  if (!bAppendFiles)
 +  {
 +      sim_part_fn = sim_part;
 +  }
 +
 +  if (bAddPart)
 +  {
 +      /* Rename all output files (except checkpoint files) */
 +      /* create new part name first (zero-filled) */
 +      sprintf(suffix,"%s%04d",part_suffix,sim_part_fn);
 +
 +      add_suffix_to_output_names(fnm,NFILE,suffix);
 +      if (MASTER(cr))
 +      {
 +          fprintf(stdout,"Checkpoint file is from part %d, new output files will be suffixed '%s'.\n",sim_part-1,suffix);
 +      }
 +  }
 +
 +  Flags = opt2bSet("-rerun",NFILE,fnm) ? MD_RERUN : 0;
 +  Flags = Flags | (bSepPot       ? MD_SEPPOT       : 0);
 +  Flags = Flags | (bIonize       ? MD_IONIZE       : 0);
 +  Flags = Flags | (bPartDec      ? MD_PARTDEC      : 0);
 +  Flags = Flags | (bDDBondCheck  ? MD_DDBONDCHECK  : 0);
 +  Flags = Flags | (bDDBondComm   ? MD_DDBONDCOMM   : 0);
 +  Flags = Flags | (bConfout      ? MD_CONFOUT      : 0);
 +  Flags = Flags | (bRerunVSite   ? MD_RERUN_VSITE  : 0);
 +  Flags = Flags | (bReproducible ? MD_REPRODUCIBLE : 0);
 +  Flags = Flags | (bAppendFiles  ? MD_APPENDFILES  : 0); 
++  Flags = Flags | (opt2parg_bSet("-append", asize(pa),pa) ? MD_APPENDFILESSET : 0); 
 +  Flags = Flags | (bKeepAndNumCPT ? MD_KEEPANDNUMCPT : 0); 
 +  Flags = Flags | (sim_part>1    ? MD_STARTFROMCPT : 0); 
 +  Flags = Flags | (bResetCountersHalfWay ? MD_RESETCOUNTERSHALFWAY : 0);
 +
 +
 +  /* We postpone opening the log file if we are appending, so we can 
 +     first truncate the old log file and append to the correct position 
 +     there instead.  */
 +  if ((MASTER(cr) || bSepPot) && !bAppendFiles) 
 +  {
 +      gmx_log_open(ftp2fn(efLOG,NFILE,fnm),cr,!bSepPot,Flags,&fplog);
 +      CopyRight(fplog,argv[0]);
 +      please_cite(fplog,"Hess2008b");
 +      please_cite(fplog,"Spoel2005a");
 +      please_cite(fplog,"Lindahl2001a");
 +      please_cite(fplog,"Berendsen95a");
 +  }
 +  else if (!MASTER(cr) && bSepPot)
 +  {
 +      gmx_log_open(ftp2fn(efLOG,NFILE,fnm),cr,!bSepPot,Flags,&fplog);
 +  }
 +  else
 +  {
 +      fplog = NULL;
 +  }
 +
 +  ddxyz[XX] = (int)(realddxyz[XX] + 0.5);
 +  ddxyz[YY] = (int)(realddxyz[YY] + 0.5);
 +  ddxyz[ZZ] = (int)(realddxyz[ZZ] + 0.5);
 +
 +  rc = mdrunner(nthreads, fplog,cr,NFILE,fnm,oenv,bVerbose,bCompact,
 +                nstglobalcomm, ddxyz,dd_node_order,rdd,rconstr,
 +                dddlb_opt[0],dlb_scale,ddcsx,ddcsy,ddcsz,
 +                nstepout,resetstep,nmultisim,repl_ex_nst,repl_ex_seed,
 +                pforce, cpt_period,max_hours,deviceOptions,Flags);
 +
 +  if (gmx_parallel_env_initialized())
 +      gmx_finalize();
 +
 +  if (MULTIMASTER(cr)) {
 +      thanx(stderr);
 +  }
 +
 +  /* Log file has to be closed in mdrunner if we are appending to it 
 +     (fplog not set here) */
 +  if (MASTER(cr) && !bAppendFiles) 
 +  {
 +      gmx_log_close(fplog);
 +  }
 +
 +  return rc;
 +}
 +
index 8a3c710ee375e3b28816aa2cf4e63ba01031b119,0000000000000000000000000000000000000000..4a4739ea22aa2b105dea7e263265e37e2c00ac1a
mode 100644,000000..100644
--- /dev/null
@@@ -1,998 -1,0 +1,977 @@@
- #if ((defined WIN32 || defined _WIN32 || defined WIN64 || defined _WIN64) && !defined __CYGWIN__ && !defined __CYGWIN32__)
- /* _isnan() */
- #include <float.h>
- #endif
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +#ifdef __linux
 +#define _GNU_SOURCE
 +#include <sched.h>
 +#include <sys/syscall.h>
 +#endif
 +#include <signal.h>
 +#include <stdlib.h>
 +
- #include "pppm.h"
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "sysstuff.h"
 +#include "statutil.h"
 +#include "mdrun.h"
 +#include "network.h"
 +#include "pull.h"
 +#include "pull_rotation.h"
 +#include "names.h"
 +#include "disre.h"
 +#include "orires.h"
 +#include "dihre.h"
-                             (Flags & MD_APPENDFILES));
 +#include "pme.h"
 +#include "mdatoms.h"
 +#include "repl_ex.h"
 +#include "qmmm.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "coulomb.h"
 +#include "constr.h"
 +#include "mvdata.h"
 +#include "checkpoint.h"
 +#include "mtop_util.h"
 +#include "sighandler.h"
 +#include "tpxio.h"
 +#include "txtdump.h"
 +#include "pull_rotation.h"
 +#include "membed.h"
 +#include "macros.h"
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#endif
 +#ifdef GMX_THREAD_MPI
 +#include "tmpi.h"
 +#endif
 +
 +#ifdef GMX_FAHCORE
 +#include "corewrap.h"
 +#endif
 +
 +#ifdef GMX_OPENMM
 +#include "md_openmm.h"
 +#endif
 +
 +#ifdef GMX_OPENMP
 +#include <omp.h>
 +#endif
 +
 +
 +typedef struct { 
 +    gmx_integrator_t *func;
 +} gmx_intp_t;
 +
 +/* The array should match the eI array in include/types/enums.h */
 +#ifdef GMX_OPENMM  /* FIXME do_md_openmm needs fixing */
 +const gmx_intp_t integrator[eiNR] = { {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm},{do_md_openmm}};
 +#else
 +const gmx_intp_t integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md},{do_md}};
 +#endif
 +
 +gmx_large_int_t     deform_init_init_step_tpx;
 +matrix              deform_init_box_tpx;
 +#ifdef GMX_THREAD_MPI
 +tMPI_Thread_mutex_t deform_init_box_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
 +#endif
 +
 +
 +#ifdef GMX_THREAD_MPI
 +struct mdrunner_arglist
 +{
 +    FILE *fplog;
 +    t_commrec *cr;
 +    int nfile;
 +    const t_filenm *fnm;
 +    output_env_t oenv;
 +    gmx_bool bVerbose;
 +    gmx_bool bCompact;
 +    int nstglobalcomm;
 +    ivec ddxyz;
 +    int dd_node_order;
 +    real rdd;
 +    real rconstr;
 +    const char *dddlb_opt;
 +    real dlb_scale;
 +    const char *ddcsx;
 +    const char *ddcsy;
 +    const char *ddcsz;
 +    int nstepout;
 +    int resetstep;
 +    int nmultisim;
 +    int repl_ex_nst;
 +    int repl_ex_seed;
 +    real pforce;
 +    real cpt_period;
 +    real max_hours;
 +    const char *deviceOptions;
 +    unsigned long Flags;
 +    int ret; /* return value */
 +};
 +
 +
 +/* The function used for spawning threads. Extracts the mdrunner() 
 +   arguments from its one argument and calls mdrunner(), after making
 +   a commrec. */
 +static void mdrunner_start_fn(void *arg)
 +{
 +    struct mdrunner_arglist *mda=(struct mdrunner_arglist*)arg;
 +    struct mdrunner_arglist mc=*mda; /* copy the arg list to make sure 
 +                                        that it's thread-local. This doesn't
 +                                        copy pointed-to items, of course,
 +                                        but those are all const. */
 +    t_commrec *cr;  /* we need a local version of this */
 +    FILE *fplog=NULL;
 +    t_filenm *fnm;
 +
 +    fnm = dup_tfn(mc.nfile, mc.fnm);
 +
 +    cr = init_par_threads(mc.cr);
 +
 +    if (MASTER(cr))
 +    {
 +        fplog=mc.fplog;
 +    }
 +
 +    mda->ret=mdrunner(cr->nnodes, fplog, cr, mc.nfile, fnm, mc.oenv, 
 +                      mc.bVerbose, mc.bCompact, mc.nstglobalcomm, 
 +                      mc.ddxyz, mc.dd_node_order, mc.rdd,
 +                      mc.rconstr, mc.dddlb_opt, mc.dlb_scale, 
 +                      mc.ddcsx, mc.ddcsy, mc.ddcsz, mc.nstepout, mc.resetstep, 
 +                      mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_seed, mc.pforce, 
 +                      mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.Flags);
 +}
 +
 +/* called by mdrunner() to start a specific number of threads (including 
 +   the main thread) for thread-parallel runs. This in turn calls mdrunner()
 +   for each thread. 
 +   All options besides nthreads are the same as for mdrunner(). */
 +static t_commrec *mdrunner_start_threads(int nthreads, 
 +              FILE *fplog,t_commrec *cr,int nfile, 
 +              const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
 +              gmx_bool bCompact, int nstglobalcomm,
 +              ivec ddxyz,int dd_node_order,real rdd,real rconstr,
 +              const char *dddlb_opt,real dlb_scale,
 +              const char *ddcsx,const char *ddcsy,const char *ddcsz,
 +              int nstepout,int resetstep,int nmultisim,int repl_ex_nst,
 +              int repl_ex_seed, real pforce,real cpt_period, real max_hours, 
 +              const char *deviceOptions, unsigned long Flags)
 +{
 +    int ret;
 +    struct mdrunner_arglist *mda;
 +    t_commrec *crn; /* the new commrec */
 +    t_filenm *fnmn;
 +
 +    /* first check whether we even need to start tMPI */
 +    if (nthreads<2)
 +        return cr;
 +
 +    /* a few small, one-time, almost unavoidable memory leaks: */
 +    snew(mda,1);
 +    fnmn=dup_tfn(nfile, fnm);
 +
 +    /* fill the data structure to pass as void pointer to thread start fn */
 +    mda->fplog=fplog;
 +    mda->cr=cr;
 +    mda->nfile=nfile;
 +    mda->fnm=fnmn;
 +    mda->oenv=oenv;
 +    mda->bVerbose=bVerbose;
 +    mda->bCompact=bCompact;
 +    mda->nstglobalcomm=nstglobalcomm;
 +    mda->ddxyz[XX]=ddxyz[XX];
 +    mda->ddxyz[YY]=ddxyz[YY];
 +    mda->ddxyz[ZZ]=ddxyz[ZZ];
 +    mda->dd_node_order=dd_node_order;
 +    mda->rdd=rdd;
 +    mda->rconstr=rconstr;
 +    mda->dddlb_opt=dddlb_opt;
 +    mda->dlb_scale=dlb_scale;
 +    mda->ddcsx=ddcsx;
 +    mda->ddcsy=ddcsy;
 +    mda->ddcsz=ddcsz;
 +    mda->nstepout=nstepout;
 +    mda->resetstep=resetstep;
 +    mda->nmultisim=nmultisim;
 +    mda->repl_ex_nst=repl_ex_nst;
 +    mda->repl_ex_seed=repl_ex_seed;
 +    mda->pforce=pforce;
 +    mda->cpt_period=cpt_period;
 +    mda->max_hours=max_hours;
 +    mda->deviceOptions=deviceOptions;
 +    mda->Flags=Flags;
 +
 +    fprintf(stderr, "Starting %d threads\n",nthreads);
 +    fflush(stderr);
 +    /* now spawn new threads that start mdrunner_start_fn(), while 
 +       the main thread returns */
 +    ret=tMPI_Init_fn(TRUE, nthreads, mdrunner_start_fn, (void*)(mda) );
 +    if (ret!=TMPI_SUCCESS)
 +        return NULL;
 +
 +    /* make a new comm_rec to reflect the new situation */
 +    crn=init_par_threads(cr);
 +    return crn;
 +}
 +
 +
 +/* Get the number of threads to use for thread-MPI based on how many
 + * were requested, which algorithms we're using,
 + * and how many particles there are.
 + */
 +static int get_nthreads_mpi(int nthreads_requested, t_inputrec *inputrec,
 +                            gmx_mtop_t *mtop)
 +{
 +    int nthreads,nthreads_new;
 +    int min_atoms_per_thread;
 +    char *env;
 +
 +    nthreads = nthreads_requested;
 +
 +    /* determine # of hardware threads. */
 +    if (nthreads_requested < 1)
 +    {
 +        if ((env = getenv("GMX_MAX_THREADS")) != NULL)
 +        {
 +            nthreads = 0;
 +            sscanf(env,"%d",&nthreads);
 +            if (nthreads < 1)
 +            {
 +                gmx_fatal(FARGS,"GMX_MAX_THREADS (%d) should be larger than 0",
 +                          nthreads);
 +            }
 +        }
 +        else
 +        {
 +            nthreads = tMPI_Thread_get_hw_number();
 +        }
 +    }
 +
 +    if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
 +    {
 +        /* Steps are divided over the nodes iso splitting the atoms */
 +        min_atoms_per_thread = 0;
 +    }
 +    else
 +    {
 +        min_atoms_per_thread = MIN_ATOMS_PER_THREAD;
 +    }
 +
 +    /* Check if an algorithm does not support parallel simulation.  */
 +    if (nthreads != 1 && 
 +        ( inputrec->eI == eiLBFGS ||
 +          inputrec->coulombtype == eelEWALD ) )
 +    {
 +        fprintf(stderr,"\nThe integration or electrostatics algorithm doesn't support parallel runs. Not starting any threads.\n");
 +        nthreads = 1;
 +    }
 +    else if (nthreads_requested < 1 &&
 +             mtop->natoms/nthreads < min_atoms_per_thread)
 +    {
 +        /* the thread number was chosen automatically, but there are too many
 +           threads (too few atoms per thread) */
 +        nthreads_new = max(1,mtop->natoms/min_atoms_per_thread);
 +
 +        if (nthreads_new > 8 || (nthreads == 8 && nthreads_new > 4))
 +        {
 +            /* Use only multiples of 4 above 8 threads
 +             * or with an 8-core processor
 +             * (to avoid 6 threads on 8 core processors with 4 real cores).
 +             */
 +            nthreads_new = (nthreads_new/4)*4;
 +        }
 +        else if (nthreads_new > 4)
 +        {
 +            /* Avoid 5 or 7 threads */
 +            nthreads_new = (nthreads_new/2)*2;
 +        }
 +
 +        nthreads = nthreads_new;
 +
 +        fprintf(stderr,"\n");
 +        fprintf(stderr,"NOTE: Parallelization is limited by the small number of atoms,\n");
 +        fprintf(stderr,"      only starting %d threads.\n",nthreads);
 +        fprintf(stderr,"      You can use the -nt option to optimize the number of threads.\n\n");
 +    }
 +    return nthreads;
 +}
 +#endif
 +
 +
 +int mdrunner(int nthreads_requested, FILE *fplog,t_commrec *cr,int nfile,
 +             const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
 +             gmx_bool bCompact, int nstglobalcomm,
 +             ivec ddxyz,int dd_node_order,real rdd,real rconstr,
 +             const char *dddlb_opt,real dlb_scale,
 +             const char *ddcsx,const char *ddcsy,const char *ddcsz,
 +             int nstepout,int resetstep,int nmultisim,int repl_ex_nst,
 +             int repl_ex_seed, real pforce,real cpt_period,real max_hours,
 +             const char *deviceOptions, unsigned long Flags)
 +{
 +    double     nodetime=0,realtime;
 +    t_inputrec *inputrec;
 +    t_state    *state=NULL;
 +    matrix     box;
 +    gmx_ddbox_t ddbox={0};
 +    int        npme_major,npme_minor;
 +    real       tmpr1,tmpr2;
 +    t_nrnb     *nrnb;
 +    gmx_mtop_t *mtop=NULL;
 +    t_mdatoms  *mdatoms=NULL;
 +    t_forcerec *fr=NULL;
 +    t_fcdata   *fcd=NULL;
 +    real       ewaldcoeff=0;
 +    gmx_pme_t  *pmedata=NULL;
 +    gmx_vsite_t *vsite=NULL;
 +    gmx_constr_t constr;
 +    int        i,m,nChargePerturbed=-1,status,nalloc;
 +    char       *gro;
 +    gmx_wallcycle_t wcycle;
 +    gmx_bool       bReadRNG,bReadEkin;
 +    int        list;
 +    gmx_runtime_t runtime;
 +    int        rc;
 +    gmx_large_int_t reset_counters;
 +    gmx_edsam_t ed=NULL;
 +    t_commrec   *cr_old=cr; 
 +    int         nthreads_mpi=1;
 +    int         nthreads_pme=1;
 +    gmx_membed_t *membed=NULL;
 +
 +    /* CAUTION: threads may be started later on in this function, so
 +       cr doesn't reflect the final parallel state right now */
 +    snew(inputrec,1);
 +    snew(mtop,1);
 +
 +    if (bVerbose && SIMMASTER(cr))
 +    {
 +        fprintf(stderr,"Getting Loaded...\n");
 +    }
 +    
 +    if (Flags & MD_APPENDFILES) 
 +    {
 +        fplog = NULL;
 +    }
 +
 +    snew(state,1);
 +    if (MASTER(cr)) 
 +    {
 +        /* Read (nearly) all data required for the simulation */
 +        read_tpx_state(ftp2fn(efTPX,nfile,fnm),inputrec,state,NULL,mtop);
 +
 +        /* NOW the threads will be started: */
 +#ifdef GMX_THREAD_MPI
 +        nthreads_mpi = get_nthreads_mpi(nthreads_requested, inputrec, mtop);
 +
 +        if (nthreads_mpi > 1)
 +        {
 +            /* now start the threads. */
 +            cr=mdrunner_start_threads(nthreads_mpi, fplog, cr_old, nfile, fnm,
 +                                      oenv, bVerbose, bCompact, nstglobalcomm, 
 +                                      ddxyz, dd_node_order, rdd, rconstr, 
 +                                      dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
 +                                      nstepout, resetstep, nmultisim, 
 +                                      repl_ex_nst, repl_ex_seed, pforce, 
 +                                      cpt_period, max_hours, deviceOptions, 
 +                                      Flags);
 +            /* the main thread continues here with a new cr. We don't deallocate
 +               the old cr because other threads may still be reading it. */
 +            if (cr == NULL)
 +            {
 +                gmx_comm("Failed to spawn threads");
 +            }
 +        }
 +#endif
 +    }
 +    /* END OF CAUTION: cr is now reliable */
 +
 +    /* g_membed initialisation *
 +     * Because we change the mtop, init_membed is called before the init_parallel *
 +     * (in case we ever want to make it run in parallel) */
 +    if (opt2bSet("-membed",nfile,fnm))
 +    {
 +      fprintf(stderr,"Entering membed code");
 +        snew(membed,1);
 +        init_membed(fplog,membed,nfile,fnm,mtop,inputrec,state,cr,&cpt_period);
 +    }
 +
 +    if (PAR(cr))
 +    {
 +        /* now broadcast everything to the non-master nodes/threads: */
 +        init_parallel(fplog, cr, inputrec, mtop);
 +    }
 +    if (fplog != NULL)
 +    {
 +        pr_inputrec(fplog,0,"Input Parameters",inputrec,FALSE);
 +    }
 +
 +    /* now make sure the state is initialized and propagated */
 +    set_state_entries(state,inputrec,cr->nnodes);
 +
 +    /* A parallel command line option consistency check that we can
 +       only do after any threads have started. */
 +    if (!PAR(cr) &&
 +        (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
 +    {
 +        gmx_fatal(FARGS,
 +                  "The -dd or -npme option request a parallel simulation, "
 +#ifndef GMX_MPI
 +                  "but mdrun was compiled without threads or MPI enabled"
 +#else
 +#ifdef GMX_THREAD_MPI
 +                  "but the number of threads (option -nt) is 1"
 +#else
 +                  "but mdrun was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec" 
 +#endif
 +#endif
 +            );
 +    }
 +
 +    if ((Flags & MD_RERUN) &&
 +        (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
 +    {
 +        gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
 +    }
 +
 +    if (can_use_allvsall(inputrec,mtop,TRUE,cr,fplog))
 +    {
 +        /* All-vs-all loops do not work with domain decomposition */
 +        Flags |= MD_PARTDEC;
 +    }
 +
 +    if (!EEL_PME(inputrec->coulombtype) || (Flags & MD_PARTDEC))
 +    {
 +        if (cr->npmenodes > 0)
 +        {
 +            if (!EEL_PME(inputrec->coulombtype))
 +            {
 +                gmx_fatal_collective(FARGS,cr,NULL,
 +                                     "PME nodes are requested, but the system does not use PME electrostatics");
 +            }
 +            if (Flags & MD_PARTDEC)
 +            {
 +                gmx_fatal_collective(FARGS,cr,NULL,
 +                                     "PME nodes are requested, but particle decomposition does not support separate PME nodes");
 +            }
 +        }
 +
 +        cr->npmenodes = 0;
 +    }
 +
 +#ifdef GMX_FAHCORE
 +    fcRegisterSteps(inputrec->nsteps,inputrec->init_step);
 +#endif
 +
 +    /* NMR restraints must be initialized before load_checkpoint,
 +     * since with time averaging the history is added to t_state.
 +     * For proper consistency check we therefore need to extend
 +     * t_state here.
 +     * So the PME-only nodes (if present) will also initialize
 +     * the distance restraints.
 +     */
 +    snew(fcd,1);
 +
 +    /* This needs to be called before read_checkpoint to extend the state */
 +    init_disres(fplog,mtop,inputrec,cr,Flags & MD_PARTDEC,fcd,state);
 +
 +    if (gmx_mtop_ftype_count(mtop,F_ORIRES) > 0)
 +    {
 +        if (PAR(cr) && !(Flags & MD_PARTDEC))
 +        {
 +            gmx_fatal(FARGS,"Orientation restraints do not work (yet) with domain decomposition, use particle decomposition (mdrun option -pd)");
 +        }
 +        /* Orientation restraints */
 +        if (MASTER(cr))
 +        {
 +            init_orires(fplog,mtop,state->x,inputrec,cr->ms,&(fcd->orires),
 +                        state);
 +        }
 +    }
 +
 +    if (DEFORM(*inputrec))
 +    {
 +        /* Store the deform reference box before reading the checkpoint */
 +        if (SIMMASTER(cr))
 +        {
 +            copy_mat(state->box,box);
 +        }
 +        if (PAR(cr))
 +        {
 +            gmx_bcast(sizeof(box),box,cr);
 +        }
 +        /* Because we do not have the update struct available yet
 +         * in which the reference values should be stored,
 +         * we store them temporarily in static variables.
 +         * This should be thread safe, since they are only written once
 +         * and with identical values.
 +         */
 +#ifdef GMX_THREAD_MPI
 +        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
 +#endif
 +        deform_init_init_step_tpx = inputrec->init_step;
 +        copy_mat(box,deform_init_box_tpx);
 +#ifdef GMX_THREAD_MPI
 +        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
 +#endif
 +    }
 +
 +    if (opt2bSet("-cpi",nfile,fnm)) 
 +    {
 +        /* Check if checkpoint file exists before doing continuation.
 +         * This way we can use identical input options for the first and subsequent runs...
 +         */
 +        if( gmx_fexist_master(opt2fn_master("-cpi",nfile,fnm,cr),cr) )
 +        {
 +            load_checkpoint(opt2fn_master("-cpi",nfile,fnm,cr),&fplog,
 +                            cr,Flags & MD_PARTDEC,ddxyz,
 +                            inputrec,state,&bReadRNG,&bReadEkin,
-         /* Initiate PPPM if necessary */
-         if (fr->eeltype == eelPPPM)
-         {
-             if (mdatoms->nChargePerturbed)
-             {
-                 gmx_fatal(FARGS,"Free energy with %s is not implemented",
-                           eel_names[fr->eeltype]);
-             }
-             status = gmx_pppm_init(fplog,cr,oenv,FALSE,TRUE,box,
-                                    getenv("GMXGHAT"),inputrec, (Flags & MD_REPRODUCIBLE));
-             if (status != 0)
-             {
-                 gmx_fatal(FARGS,"Error %d initializing PPPM",status);
-             }
-         }
++                            (Flags & MD_APPENDFILES),
++                            (Flags & MD_APPENDFILESSET));
 +            
 +            if (bReadRNG)
 +            {
 +                Flags |= MD_READ_RNG;
 +            }
 +            if (bReadEkin)
 +            {
 +                Flags |= MD_READ_EKIN;
 +            }
 +        }
 +    }
 +
 +    if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES))
 +#ifdef GMX_THREAD_MPI
 +        /* With thread MPI only the master node/thread exists in mdrun.c,
 +         * therefore non-master nodes need to open the "seppot" log file here.
 +         */
 +        || (!MASTER(cr) && (Flags & MD_SEPPOT))
 +#endif
 +        )
 +    {
 +        gmx_log_open(ftp2fn(efLOG,nfile,fnm),cr,!(Flags & MD_SEPPOT),
 +                             Flags,&fplog);
 +    }
 +
 +    if (SIMMASTER(cr)) 
 +    {
 +        copy_mat(state->box,box);
 +    }
 +
 +    if (PAR(cr)) 
 +    {
 +        gmx_bcast(sizeof(box),box,cr);
 +    }
 +
 +    /* Essential dynamics */
 +    if (opt2bSet("-ei",nfile,fnm))
 +    {
 +        /* Open input and output files, allocate space for ED data structure */
 +        ed = ed_open(nfile,fnm,Flags,cr);
 +    }
 +
 +    if (bVerbose && SIMMASTER(cr))
 +    {
 +        fprintf(stderr,"Loaded with Money\n\n");
 +    }
 +
 +    if (PAR(cr) && !((Flags & MD_PARTDEC) ||
 +                     EI_TPI(inputrec->eI) ||
 +                     inputrec->eI == eiNM))
 +    {
 +        cr->dd = init_domain_decomposition(fplog,cr,Flags,ddxyz,rdd,rconstr,
 +                                           dddlb_opt,dlb_scale,
 +                                           ddcsx,ddcsy,ddcsz,
 +                                           mtop,inputrec,
 +                                           box,state->x,
 +                                           &ddbox,&npme_major,&npme_minor);
 +
 +        make_dd_communicators(fplog,cr,dd_node_order);
 +
 +        /* Set overallocation to avoid frequent reallocation of arrays */
 +        set_over_alloc_dd(TRUE);
 +    }
 +    else
 +    {
 +        /* PME, if used, is done on all nodes with 1D decomposition */
 +        cr->npmenodes = 0;
 +        cr->duty = (DUTY_PP | DUTY_PME);
 +        npme_major = 1;
 +        npme_minor = 1;
 +        if (!EI_TPI(inputrec->eI))
 +        {
 +            npme_major = cr->nnodes;
 +        }
 +        
 +        if (inputrec->ePBC == epbcSCREW)
 +        {
 +            gmx_fatal(FARGS,
 +                      "pbc=%s is only implemented with domain decomposition",
 +                      epbc_names[inputrec->ePBC]);
 +        }
 +    }
 +
 +    if (PAR(cr))
 +    {
 +        /* After possible communicator splitting in make_dd_communicators.
 +         * we can set up the intra/inter node communication.
 +         */
 +        gmx_setup_nodecomm(fplog,cr);
 +    }
 +
 +    /* get number of OpenMP/PME threads
 +     * env variable should be read only on one node to make sure it is identical everywhere */
 +#ifdef GMX_OPENMP
 +    if (EEL_PME(inputrec->coulombtype))
 +    {
 +        if (MASTER(cr))
 +        {
 +            char *ptr;
 +            if ((ptr=getenv("GMX_PME_NTHREADS")) != NULL)
 +            {
 +                sscanf(ptr,"%d",&nthreads_pme);
 +            }
 +            if (fplog != NULL && nthreads_pme > 1)
 +            {
 +                fprintf(fplog,"Using %d threads for PME\n",nthreads_pme);
 +            }
 +        }
 +        if (PAR(cr))
 +        {
 +            gmx_bcast_sim(sizeof(nthreads_pme),&nthreads_pme,cr);
 +        }
 +    }
 +#endif
 +
 +    wcycle = wallcycle_init(fplog,resetstep,cr,nthreads_pme);
 +    if (PAR(cr))
 +    {
 +        /* Master synchronizes its value of reset_counters with all nodes 
 +         * including PME only nodes */
 +        reset_counters = wcycle_get_reset_counters(wcycle);
 +        gmx_bcast_sim(sizeof(reset_counters),&reset_counters,cr);
 +        wcycle_set_reset_counters(wcycle, reset_counters);
 +    }
 +
 +
 +    snew(nrnb,1);
 +    if (cr->duty & DUTY_PP)
 +    {
 +        /* For domain decomposition we allocate dynamically
 +         * in dd_partition_system.
 +         */
 +        if (DOMAINDECOMP(cr))
 +        {
 +            bcast_state_setup(cr,state);
 +        }
 +        else
 +        {
 +            if (PAR(cr))
 +            {
 +                bcast_state(cr,state,TRUE);
 +            }
 +        }
 +
 +        /* Dihedral Restraints */
 +        if (gmx_mtop_ftype_count(mtop,F_DIHRES) > 0)
 +        {
 +            init_dihres(fplog,mtop,inputrec,fcd);
 +        }
 +
 +        /* Initiate forcerecord */
 +        fr = mk_forcerec();
 +        init_forcerec(fplog,oenv,fr,fcd,inputrec,mtop,cr,box,FALSE,
 +                      opt2fn("-table",nfile,fnm),
 +                      opt2fn("-tabletf",nfile,fnm),
 +                      opt2fn("-tablep",nfile,fnm),
 +                      opt2fn("-tableb",nfile,fnm),FALSE,pforce);
 +
 +        /* version for PCA_NOT_READ_NODE (see md.c) */
 +        /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
 +          "nofile","nofile","nofile","nofile",FALSE,pforce);
 +          */        
 +        fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);
 +
 +        /* Initialize QM-MM */
 +        if(fr->bQMMM)
 +        {
 +            init_QMMMrec(cr,box,mtop,inputrec,fr);
 +        }
 +
 +        /* Initialize the mdatoms structure.
 +         * mdatoms is not filled with atom data,
 +         * as this can not be done now with domain decomposition.
 +         */
 +        mdatoms = init_mdatoms(fplog,mtop,inputrec->efep!=efepNO);
 +
 +        /* Initialize the virtual site communication */
 +        vsite = init_vsite(mtop,cr);
 +
 +        calc_shifts(box,fr->shift_vec);
 +
 +        /* With periodic molecules the charge groups should be whole at start up
 +         * and the virtual sites should not be far from their proper positions.
 +         */
 +        if (!inputrec->bContinuation && MASTER(cr) &&
 +            !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
 +        {
 +            /* Make molecules whole at start of run */
 +            if (fr->ePBC != epbcNONE)
 +            {
 +                do_pbc_first_mtop(fplog,inputrec->ePBC,box,mtop,state->x);
 +            }
 +            if (vsite)
 +            {
 +                /* Correct initial vsite positions are required
 +                 * for the initial distribution in the domain decomposition
 +                 * and for the initial shell prediction.
 +                 */
 +                construct_vsites_mtop(fplog,vsite,mtop,state->x);
 +            }
 +        }
 +
 +        if (EEL_PME(fr->eeltype))
 +        {
 +            ewaldcoeff = fr->ewaldcoeff;
 +            pmedata = &fr->pmedata;
 +        }
 +        else
 +        {
 +            pmedata = NULL;
 +        }
 +    }
 +    else
 +    {
 +        /* This is a PME only node */
 +
 +        /* We don't need the state */
 +        done_state(state);
 +
 +        ewaldcoeff = calc_ewaldcoeff(inputrec->rcoulomb, inputrec->ewald_rtol);
 +        snew(pmedata,1);
 +    }
 +
 +    /* Initiate PME if necessary,
 +     * either on all nodes or on dedicated PME nodes only. */
 +    if (EEL_PME(inputrec->coulombtype))
 +    {
 +        if (mdatoms)
 +        {
 +            nChargePerturbed = mdatoms->nChargePerturbed;
 +        }
 +        if (cr->npmenodes > 0)
 +        {
 +            /* The PME only nodes need to know nChargePerturbed */
 +            gmx_bcast_sim(sizeof(nChargePerturbed),&nChargePerturbed,cr);
 +        }
 +
 +
 +        /* Set CPU affinity. Can be important for performance.
 +           On some systems (e.g. Cray) CPU Affinity is set by default.
 +           But default assigning doesn't work (well) with only some ranks
 +           having threads. This causes very low performance.
 +           External tools have cumbersome syntax for setting affinity
 +           in the case that only some ranks have threads.
 +           Thus it is important that GROMACS sets the affinity internally at
 +           if only PME is using threads.
 +        */
 +
 +#ifdef GMX_OPENMP
 +#ifdef __linux
 +#ifdef GMX_LIB_MPI
 +        {
 +            int core;
 +            MPI_Comm comm_intra; /* intra communicator (but different to nc.comm_intra includes PME nodes) */
 +            MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),gmx_node_rank(),&comm_intra);
 +            int local_omp_nthreads = (cr->duty & DUTY_PME) ? nthreads_pme : 1; /* threads on this node */
 +            MPI_Scan(&local_omp_nthreads,&core, 1, MPI_INT, MPI_SUM, comm_intra);
 +            core-=local_omp_nthreads; /* make exclusive scan */
 +#pragma omp parallel firstprivate(core) num_threads(local_omp_nthreads)
 +            {
 +                cpu_set_t mask;
 +                CPU_ZERO(&mask);
 +                core+=omp_get_thread_num();
 +                CPU_SET(core,&mask);
 +                sched_setaffinity((pid_t) syscall (SYS_gettid),sizeof(cpu_set_t),&mask);
 +            }
 +        }
 +#endif /*GMX_MPI*/
 +#endif /*__linux*/
 +#endif /*GMX_OPENMP*/
 +
 +        if (cr->duty & DUTY_PME)
 +        {
 +            status = gmx_pme_init(pmedata,cr,npme_major,npme_minor,inputrec,
 +                                  mtop ? mtop->natoms : 0,nChargePerturbed,
 +                                  (Flags & MD_REPRODUCIBLE),nthreads_pme);
 +            if (status != 0) 
 +            {
 +                gmx_fatal(FARGS,"Error %d initializing PME",status);
 +            }
 +        }
 +    }
 +
 +
 +    if (integrator[inputrec->eI].func == do_md
 +#ifdef GMX_OPENMM
 +        ||
 +        integrator[inputrec->eI].func == do_md_openmm
 +#endif
 +        )
 +    {
 +        /* Turn on signal handling on all nodes */
 +        /*
 +         * (A user signal from the PME nodes (if any)
 +         * is communicated to the PP nodes.
 +         */
 +        signal_handler_install();
 +    }
 +
 +    if (cr->duty & DUTY_PP)
 +    {
 +        if (inputrec->ePull != epullNO)
 +        {
 +            /* Initialize pull code */
 +            init_pull(fplog,inputrec,nfile,fnm,mtop,cr,oenv,
 +                      EI_DYNAMICS(inputrec->eI) && MASTER(cr),Flags);
 +        }
 +        
 +        if (inputrec->bRot)
 +        {
 +           /* Initialize enforced rotation code */
 +           init_rot(fplog,inputrec,nfile,fnm,cr,state->x,state->box,mtop,oenv,
 +                    bVerbose,Flags);
 +        }
 +
 +        constr = init_constraints(fplog,mtop,inputrec,ed,state,cr);
 +
 +        if (DOMAINDECOMP(cr))
 +        {
 +            dd_init_bondeds(fplog,cr->dd,mtop,vsite,constr,inputrec,
 +                            Flags & MD_DDBONDCHECK,fr->cginfo_mb);
 +
 +            set_dd_parameters(fplog,cr->dd,dlb_scale,inputrec,fr,&ddbox);
 +
 +            setup_dd_grid(fplog,cr->dd);
 +        }
 +
 +        /* Now do whatever the user wants us to do (how flexible...) */
 +        integrator[inputrec->eI].func(fplog,cr,nfile,fnm,
 +                                      oenv,bVerbose,bCompact,
 +                                      nstglobalcomm,
 +                                      vsite,constr,
 +                                      nstepout,inputrec,mtop,
 +                                      fcd,state,
 +                                      mdatoms,nrnb,wcycle,ed,fr,
 +                                      repl_ex_nst,repl_ex_seed,
 +                                      membed,
 +                                      cpt_period,max_hours,
 +                                      deviceOptions,
 +                                      Flags,
 +                                      &runtime);
 +
 +        if (inputrec->ePull != epullNO)
 +        {
 +            finish_pull(fplog,inputrec->pull);
 +        }
 +        
 +        if (inputrec->bRot)
 +        {
 +            finish_rot(fplog,inputrec->rot);
 +        }
 +
 +    } 
 +    else 
 +    {
 +        /* do PME only */
 +        gmx_pmeonly(*pmedata,cr,nrnb,wcycle,ewaldcoeff,FALSE,inputrec);
 +    }
 +
 +    if (EI_DYNAMICS(inputrec->eI) || EI_TPI(inputrec->eI))
 +    {
 +        /* Some timing stats */  
 +        if (SIMMASTER(cr))
 +        {
 +            if (runtime.proc == 0)
 +            {
 +                runtime.proc = runtime.real;
 +            }
 +        }
 +        else
 +        {
 +            runtime.real = 0;
 +        }
 +    }
 +
 +    wallcycle_stop(wcycle,ewcRUN);
 +
 +    /* Finish up, write some stuff
 +     * if rerunMD, don't write last frame again 
 +     */
 +    finish_run(fplog,cr,ftp2fn(efSTO,nfile,fnm),
 +               inputrec,nrnb,wcycle,&runtime,
 +               EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
 +    
 +    if (opt2bSet("-membed",nfile,fnm))
 +    {
 +        sfree(membed);
 +    }
 +
 +    /* Does what it says */  
 +    print_date_and_time(fplog,cr->nodeid,"Finished mdrun",&runtime);
 +
 +    /* Close logfile already here if we were appending to it */
 +    if (MASTER(cr) && (Flags & MD_APPENDFILES))
 +    {
 +        gmx_log_close(fplog);
 +    } 
 +
 +    rc=(int)gmx_get_stop_condition();
 +
 +#ifdef GMX_THREAD_MPI
 +    /* we need to join all threads. The sub-threads join when they
 +       exit this function, but the master thread needs to be told to 
 +       wait for that. */
 +    if (PAR(cr) && MASTER(cr))
 +    {
 +        tMPI_Finalize();
 +    }
 +#endif
 +
 +    return rc;
 +}
index 2a15467d2bd44610587b74006a2303f403a2101a,0000000000000000000000000000000000000000..04900f7d4fc9d080fbce7654c876aa6b4ce8a49e
mode 100644,000000..100644
--- /dev/null
@@@ -1,468 -1,0 +1,477 @@@
 +/*
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
++#include <assert.h>
++
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "strdb.h"
 +#include "string2.h"
 +#include "xmdrun.h"
 +#include "vec.h"
 +#include "genalg.h"
 +#include "random.h"
 +#include "macros.h"
 +
 +real mol_dipole(int k0,int k1,rvec x[],real q[])
 +{
 +  int  k,m;
 +  rvec mu;
 +  
 +  clear_rvec(mu);
 +  for(k=k0; (k<k1); k++) {
 +    for(m=0; (m<DIM); m++) 
 +      mu[m] += q[k]*x[k][m];
 +  }
 +  return norm(mu);  /* Dipole moment of this molecule in e nm */
 +}
 +
 +real calc_mu_aver(t_commrec *cr,rvec x[],real q[],rvec mu,
 +                t_block *mols,t_mdatoms *md,int gnx,atom_id grpindex[])
 +{
 +  int     i,start,end;
 +  real    mu_ave;
 +  
 +  start = md->start;
 +  end   = md->homenr + start;  
 +
 +  /*
 +  clear_rvec(mu);
 +  for(i=start; (i<end); i++)
 +    for(m=0; (m<DIM); m++)
 +      mu[m] += q[i]*x[i][m];
 +  if (PAR(cr)) {
 +    gmx_sum(DIM,mu,cr);
 +  }
 +  */
 +  /* I guess we have to parallelise this one! */
 +
 +  if (gnx > 0) {
 +    mu_ave = 0.0;
 +    for(i=0; (i<gnx); i++) {
 +      int gi = grpindex[i];
 +      mu_ave += mol_dipole(mols->index[gi],mols->index[gi+1],x,q);
 +    }
 +    
 +    return(mu_ave/gnx);
 +  }
 +  else
 +    return 0;
 +}
 +
 +/* Lots of global variables! Yummy... */
 +static t_ffscan ff;
 +
 +void set_ffvars(t_ffscan *fff)
 +{
 +  ff = *fff;
 +}
 +
 +real cost(tensor P,real MSF,real E)
 +{
 +  return (ff.fac_msf*MSF+ff.fac_epot*sqr(E-ff.epot)+ff.fac_pres*
 +        (sqr(P[XX][XX]-ff.pres)+sqr(P[YY][YY]-ff.pres)+sqr(P[ZZ][ZZ]-ff.pres)));
 +  
 +}
 +
 +static const char     *esenm[eseNR] = { "SIG", "EPS", "BHAMA", "BHAMB", "BHAMC", "CELlX", "CELLY", "CELLZ" };
 +static int      nparm=0,*param_val=NULL;
 +static t_range  *range=NULL;
 +static t_genalg *ga=NULL;
 +static rvec     scale = { 1,1,1 };
 +
 +static void init_range(t_range *r,int np,int atype,int ptype,
 +                     real rmin,real rmax)
 +{
 +  if (rmin > rmax)
 +    gmx_fatal(FARGS,"rmin (%f) > rmax (%f)",rmin,rmax);
 +  if (np <= 0)
 +    gmx_fatal(FARGS,"np (%d) should be > 0",np);
 +  if ((rmax > rmin) && (np <= 1))
 +    gmx_fatal(FARGS,"If rmax > rmin, np should be > 1");
 +  if ((ptype < 0) || (ptype >= eseNR))
 +    gmx_fatal(FARGS,"ptype (%d) should be < %d",ptype,eseNR);
 +  r->np    = np;
 +  r->atype = atype;
 +  r->ptype = ptype;
 +  r->rmin  = rmin;
 +  r->rmax  = rmax;
 +  r->rval  = rmin;
 +  r->dr    = r->rmax - r->rmin;
 +}
 +
 +static t_range *read_range(const char *db,int *nrange)
 +{
 +  int     nlines,nr,np,i;
 +  char    **lines;
 +  t_range *range;
 +  int     atype,ptype;
 +  double  rmin,rmax;
 +  
 +  nlines = get_file(db,&lines);
 +  snew(range,nlines);
 +  
 +  nr=0;
 +  for(i=0; (i < nlines); i++) {
 +    strip_comment(lines[i]);
 +    if (sscanf(lines[i],"%d%d%d%lf%lf",&np,&atype,&ptype,&rmin,&rmax) == 5) {
 +      if (ff.bLogEps && (ptype == eseEPSILON) && (rmin <= 0))
 +      gmx_fatal(FARGS,"When using logarithmic epsilon increments the minimum"
 +                  "value must be > 0");
 +      init_range(&range[nr],np,atype,ptype,rmin,rmax);
 +      nr++;
 +    }
 +  }
 +  fprintf(stderr,"found %d variables to iterate over\n",nr);
 +  
 +  *nrange = nr;
 +
 +  for(nr=0; (nr < nlines); nr++)
 +    sfree(lines[nr]);
 +  sfree(lines);
 +    
 +  return range;
 +}
 +
 +static real value_range(t_range *r,int n)
 +{
 +  real logrmin,logrmax;
 +  
 +  if ((n < 0) || (n > r->np))
 +    gmx_fatal(FARGS,"Value (%d) out of range for value_range (max %d)",n,r->np);
 +
 +  if (r->np == 1)
 +    r->rval = r->rmin;
 +  else {
 +    if ((r->ptype == eseEPSILON) && ff.bLogEps) {
 +      logrmin = log(r->rmin);
 +      logrmax = log(r->rmax);
 +      r->rval = exp(logrmin + (n*(logrmax-logrmin))/(r->np-1));
 +    }
 +    else
 +      r->rval = r->rmin+(n*(r->dr))/(r->np-1);
 +  }
 +  return r->rval;
 +}
 +
 +real value_rand(t_range *r,int *seed)
 +{
 +  real logrmin,logrmax;
 +  real mr;
 +  
 +  if (r->np == 1)
 +    r->rval = r->rmin;
 +  else {
 +    mr = rando(seed);
 +    if ((r->ptype == eseEPSILON) && ff.bLogEps) {
 +      logrmin = log(r->rmin);
 +      logrmax = log(r->rmax);
 +      r->rval = exp(logrmin + mr*(logrmax-logrmin));
 +    }
 +    else
 +      r->rval = r->rmin + mr*(r->rmax-r->rmin);
 +  }
 +  if (debug)
 +    fprintf(debug,"type: %s, value: %g\n",esenm[r->ptype],r->rval);
 +  return r->rval;
 +}
 +
 +static void update_ff(t_forcerec *fr,int nparm,t_range range[],int param_val[])
 +{
 +  static double *sigma=NULL,*eps=NULL,*c6=NULL,*cn=NULL,*bhama=NULL,*bhamb=NULL,*bhamc=NULL;
 +  real   val,*nbfp;
 +  int    i,j,atnr;
 +  
 +  atnr = fr->ntype;
 +  nbfp = fr->nbfp;
 +  
 +  if (fr->bBHAM) {
 +    if (bhama == NULL) {
++      assert(bhamb==NULL && bhamc==NULL);
 +      snew(bhama,atnr);
 +      snew(bhamb,atnr);
 +      snew(bhamc,atnr);
 +    }
 +  }
 +  else {
 +    if (sigma == NULL) {
++      assert(eps==NULL && c6==NULL && cn==NULL);
 +      snew(sigma,atnr);
 +      snew(eps,atnr);
 +      snew(c6,atnr);
 +      snew(cn,atnr);
 +    }
 +  }
 +  /* Get current values for everything */
 +  for(i=0; (i<nparm); i++) {
 +    if (ga)
 +      val = range[i].rval;
 +    else
 +      val = value_range(&range[i],param_val[i]);
 +    if(debug)
 +      fprintf(debug,"val = %g\n",val);
 +    switch (range[i].ptype) {
 +    case eseSIGMA:
++      assert(!fr->bBHAM);
 +      sigma[range[i].atype] = val;
 +      break;
 +    case eseEPSILON:
++      assert(!fr->bBHAM);
 +      eps[range[i].atype] = val;
 +      break;
 +    case eseBHAMA:
++      assert(fr->bBHAM);
 +      bhama[range[i].atype] = val;
 +      break;
 +    case eseBHAMB:
++      assert(fr->bBHAM);
 +      bhamb[range[i].atype] = val;
 +      break;
 +    case eseBHAMC:
++      assert(fr->bBHAM);
 +      bhamc[range[i].atype] = val;
 +      break;
 +    case eseCELLX:
 +      scale[XX] = val;
 +      break;
 +    case eseCELLY:
 +      scale[YY] = val;
 +      break;
 +    case eseCELLZ:
 +      scale[ZZ] = val;
 +      break;
 +    default:
 +      gmx_fatal(FARGS,"Unknown ptype");
 +    }
 +  }
 +  if (fr->bBHAM) {
 +    for(i=0; (i<atnr); i++) {
 +      for(j=0; (j<=i); j++) {
 +      BHAMA(nbfp,atnr,i,j) = BHAMA(nbfp,atnr,j,i) = sqrt(bhama[i]*bhama[j]);
 +      BHAMB(nbfp,atnr,i,j) = BHAMB(nbfp,atnr,j,i) = sqrt(bhamb[i]*bhamb[j]);
 +      BHAMC(nbfp,atnr,i,j) = BHAMC(nbfp,atnr,j,i) = sqrt(bhamc[i]*bhamc[j]);
 +      }
 +    }
 +  }
 +  else {  
 +    /* Now build a new matrix */
 +    for(i=0; (i<atnr); i++) {
 +      c6[i] = 4*eps[i]*pow(sigma[i],6.0);
 +      cn[i] = 4*eps[i]*pow(sigma[i],ff.npow);
 +    }
 +    for(i=0; (i<atnr); i++) {
 +      for(j=0; (j<=i); j++) {
 +      C6(nbfp,atnr,i,j)  = C6(nbfp,atnr,j,i)  = sqrt(c6[i]*c6[j]);
 +      C12(nbfp,atnr,i,j) = C12(nbfp,atnr,j,i) = sqrt(cn[i]*cn[j]);
 +      }
 +    }
 +  }
 +  
 +  if (debug) {
 +    if (!fr->bBHAM) 
 +      for(i=0; (i<atnr); i++)
 +      fprintf(debug,"atnr = %2d  sigma = %8.4f  eps = %8.4f\n",i,sigma[i],eps[i]);
 +    for(i=0; (i<atnr); i++) {
 +      for(j=0; (j<atnr); j++) {
 +      if (fr->bBHAM)
 +        fprintf(debug,"i: %2d  j: %2d  A:  %10.5e  B:  %10.5e  C:  %10.5e\n",i,j,
 +                BHAMA(nbfp,atnr,i,j),BHAMB(nbfp,atnr,i,j),BHAMC(nbfp,atnr,i,j));
 +      else
 +        fprintf(debug,"i: %2d  j: %2d  c6:  %10.5e  cn:  %10.5e\n",i,j,
 +                C6(nbfp,atnr,i,j),C12(nbfp,atnr,i,j));
 +      }
 +    }
 +  }
 +}
 +
 +static void scale_box(int natoms,rvec x[],matrix box)
 +{
 +  int i,m;
 +  
 +  if ((scale[XX] != 1.0) ||   (scale[YY] != 1.0) ||   (scale[ZZ] != 1.0)) {
 +    if (debug)
 +      fprintf(debug,"scale = %8.4f  %8.4f  %8.4f\n",
 +            scale[XX],scale[YY],scale[ZZ]);
 +    for(m=0; (m<DIM); m++)
 +      box[m][m] *= scale[m];
 +    for(i=0; (i<natoms); i++) 
 +      for(m=0; (m<DIM); m++)
 +      x[i][m] *= scale[m];
 +  }
 +}
 +
 +gmx_bool update_forcefield(FILE *fplog,
 +                     int nfile,const t_filenm fnm[],t_forcerec *fr,
 +                     int natoms,rvec x[],matrix box)
 +{
 +  static int ntry,ntried;
 +  int    i,j;
 +  gmx_bool   bDone;
 +
 +  /* First time around we have to read the parameters */  
 +  if (nparm == 0) {    
 +    range = read_range(ftp2fn(efDAT,nfile,fnm),&nparm);
 +    if (nparm == 0) 
 +      gmx_fatal(FARGS,"No correct parameter info in %s",ftp2fn(efDAT,nfile,fnm));
 +    snew(param_val,nparm);
 +
 +    if (opt2bSet("-ga",nfile,fnm)) {
 +      /* Genetic algorithm time */
 +      ga = init_ga(fplog,opt2fn("-ga",nfile,fnm),nparm,range);
 +    }
 +    else {  
 +      /* Determine the grid size */
 +      ntry = 1;
 +      for(i=0; (i<nparm); i++)
 +      ntry *= range[i].np;
 +      ntried = 0;
 +      
 +      fprintf(fplog,"Going to try %d different combinations of %d parameters\n",
 +            ntry,nparm);
 +    }
 +  }
 +  if (ga) {
 +    update_ga(fplog,range,ga);
 +  }
 +  else {
 +    /* Increment the counter
 +     * Non-trivial, since this is nparm nested loops in principle 
 +     */
 +    for(i=0; (i<nparm); i++) {
 +      if (param_val[i] < (range[i].np-1)) {
 +      param_val[i]++;
 +      for(j=0; (j<i); j++)
 +        param_val[j] = 0;
 +      ntried++;
 +      break;
 +      }
 +    }
 +    if (i == nparm) {
 +      fprintf(fplog,"Finished with %d out of %d iterations\n",ntried+1,ntry);
 +      return TRUE;
 +    }
 +  }
 +
 +  /* Now do the real updating */
 +  update_ff(fr,nparm,range,param_val);
 +  
 +  /* Update box and coordinates if necessary */
 +  scale_box(natoms,x,box);
 +  
 +  return FALSE;
 +}
 +
 +static void print_range(FILE *fp,tensor P,real MSF,real energy)
 +{
 +  int  i;
 +  
 +  fprintf(fp,"%8.3f  %8.3f  %8.3f  %8.3f",
 +        cost(P,MSF,energy),trace(P)/3,MSF,energy);
 +  for(i=0; (i<nparm); i++)
 +    fprintf(fp," %s %10g",esenm[range[i].ptype],range[i].rval);
 +  fprintf(fp," FF\n");
 +  fflush(fp);
 +}
 +
 +static real msf(int n,rvec f1[],rvec f2[])
 +{
 +  int  i,j;
 +  rvec ff2;
 +  real msf1=0;
 +  
 +  for(i=0; (i<n); ) {
 +    clear_rvec(ff2);
 +    for(j=0; ((j<ff.molsize) && (i<n)); j++,i++) {
 +      rvec_inc(ff2,f1[i]);
 +      if (f2)
 +      rvec_inc(ff2,f2[i]);
 +    }
 +    msf1 += iprod(ff2,ff2);
 +  }
 +  
 +  return msf1/n;
 +}
 +
 +static void print_grid(FILE *fp,real ener[],int natoms,rvec f[],rvec fshake[],
 +                     rvec x[],t_block *mols,real mass[],tensor pres)
 +{
 +  static gmx_bool bFirst = TRUE;
 +  static const char *desc[] = {
 +    "------------------------------------------------------------------------",
 +    "In the output from the forcefield scan we have the potential energy,", 
 +    "then the root mean square force on the atoms, and finally the parameters",
 +    "in the order they appear in the input file.",
 +    "------------------------------------------------------------------------" 
 +  };
 +  real msf1;
 +  int  i;
 +  
 +  if (bFirst) {
 +    for(i=0; (i<asize(desc)); i++)
 +      fprintf(fp,"%s\n",desc[i]);
 +    fflush(fp);
 +    bFirst = FALSE;
 +  }
 +  if ((ff.tol == 0) || (fabs(ener[F_EPOT]/ff.nmol-ff.epot) < ff.tol)) {
 +    msf1 = msf(natoms,f,fshake);
 +    if ((ff.f_max == 0) || (msf1 < sqr(ff.f_max))) 
 +      print_range(fp,pres,msf1,ener[F_EPOT]/ff.nmol);
 +  }
 +}
 +
 +gmx_bool print_forcefield(FILE *fp,real ener[],int natoms,rvec f[],rvec fshake[],
 +                    rvec x[],t_block *mols,real mass[],tensor pres)
 +{
 +  real msf1;
 +  
 +  if (ga) {
 +    msf1 = msf(natoms,f,fshake);
 +    if (debug)
 +      fprintf(fp,"Pressure: %12g, RMSF: %12g, Energy-Epot: %12g, cost: %12g\n",
 +            ener[F_PRES],sqrt(msf1),ener[F_EPOT]/ff.nmol-ff.epot,
 +            cost(pres,msf1,ener[F_EPOT]/ff.nmol));
 +    if (print_ga(fp,ga,msf1,pres,scale,(ener[F_EPOT]/ff.nmol),range,ff.tol)) {
 +      return TRUE;
 +    }
 +    fflush(fp);
 +  }
 +  else
 +    print_grid(fp,ener,natoms,f,fshake,x,mols,mass,pres);
 +  return FALSE;
 +}
 + 
Simple merge
index baff1487d2fe0c690a7b0e390eee2c429bf5b6a3,cb73909dc4df5d4c1d4a309a138d340887ae12b9..14c47cfc442fba6eccee6a1a8abbde8b1d71190f
@@@ -3626,47 -3741,84 +3611,48 @@@ int gmx_membed(int argc,char *argv[]
                bAppendFiles = FALSE;
        }
  
 -      if (!bAppendFiles)
 +        data_out = ffopen(opt2fn("-dat",NFILE,fnm),"w");
 +        fprintf(data_out,"nxy = %d\nnz = %d\nxyinit = %f\nxyend = %f\nzinit = %f\nzend = %f\n"
 +                      "rad = %f\npieces = %d\nasymmetry = %s\nndiff = %d\nmaxwarn = %d\n",
 +                      it_xy,it_z,xy_fac,xy_max,z_fac,z_max,probe_rad,pieces,
 +                      bALLOW_ASYMMETRY ? "yes" : "no",low_up_rm,maxwarn);
 +        fclose(data_out);
 +
 +        sprintf(buf,"%s -s %s -membed %s -o %s -c %s -e %s -nt 1 -cpt -1",
 +                  (mdrun_path==NULL) ? "mdrun" : mdrun_path,
 +                  opt2fn("-f",NFILE,fnm),opt2fn("-dat",NFILE,fnm),opt2fn("-o",NFILE,fnm),
 +                  opt2fn("-c",NFILE,fnm),opt2fn("-e",NFILE,fnm));
 +        if (opt2bSet("-n",NFILE,fnm))
        {
 -              sim_part_fn = sim_part;
 -      }
 -
 -      if (bAddPart && sim_part_fn > 1)
 +              sprintf(buf2," -mn %s",opt2fn("-n",NFILE,fnm));
 +              strcat(buf,buf2);
 +        }
 +      if (opt2bSet("-x",NFILE,fnm))
        {
 -              /* This is a continuation run, rename trajectory output files
 -       (except checkpoint files) */
 -              /* create new part name first (zero-filled) */
 -              sprintf(suffix,"%s%04d",part_suffix,sim_part_fn);
 -
 -              add_suffix_to_output_names(fnm,NFILE,suffix);
 -              fprintf(stdout,"Checkpoint file is from part %d, new output files will be suffixed '%s'.\n",sim_part-1,suffix);
 +              sprintf(buf2," -x %s",opt2fn("-x",NFILE,fnm));
 +                strcat(buf,buf2);
        }
 -
 -      Flags = opt2bSet("-rerun",NFILE,fnm) ? MD_RERUN : 0;
 -      Flags = Flags | (bSepPot       ? MD_SEPPOT       : 0);
 -      Flags = Flags | (bIonize       ? MD_IONIZE       : 0);
 -      Flags = Flags | (bPartDec      ? MD_PARTDEC      : 0);
 -      Flags = Flags | (bDDBondCheck  ? MD_DDBONDCHECK  : 0);
 -      Flags = Flags | (bDDBondComm   ? MD_DDBONDCOMM   : 0);
 -      Flags = Flags | (bConfout      ? MD_CONFOUT      : 0);
 -      Flags = Flags | (bRerunVSite   ? MD_RERUN_VSITE  : 0);
 -      Flags = Flags | (bReproducible ? MD_REPRODUCIBLE : 0);
 -      Flags = Flags | (bAppendFiles  ? MD_APPENDFILES  : 0);
 +        if (opt2bSet("-p",NFILE,fnm))
+       Flags = Flags | (opt2parg_bSet("-append", asize(pa),pa) ? MD_APPENDFILESSET : 0); 
 -      Flags = Flags | (sim_part>1    ? MD_STARTFROMCPT : 0);
 -      Flags = Flags | (bResetCountersHalfWay ? MD_RESETCOUNTERSHALFWAY : 0);
 -
 -
 -      /* We postpone opening the log file if we are appending, so we can
 -   first truncate the old log file and append to the correct position
 -   there instead.  */
 -      if ((MASTER(cr) || bSepPot) && !bAppendFiles)
 -      {
 -              gmx_log_open(ftp2fn(efLOG,NFILE,fnm),cr,!bSepPot,Flags,&fplog);
 -              CopyRight(fplog,argv[0]);
 -              please_cite(fplog,"Hess2008b");
 -              please_cite(fplog,"Spoel2005a");
 -              please_cite(fplog,"Lindahl2001a");
 -              please_cite(fplog,"Berendsen95a");
 -      }
 -      else
 +        {
 +                sprintf(buf2," -mp %s",opt2fn("-p",NFILE,fnm));
 +                strcat(buf,buf2);
 +        }
 +      if (bVerbose)
        {
 -              fplog = NULL;
 +              sprintf(buf2," -v -stepout %d",nstepout);
 +              strcat(buf,buf2);
        }
  
 -      ddxyz[XX] = (int)(realddxyz[XX] + 0.5);
 -      ddxyz[YY] = (int)(realddxyz[YY] + 0.5);
 -      ddxyz[ZZ] = (int)(realddxyz[ZZ] + 0.5);
 -
 -      /* even if nthreads = 1, we still call this one */
 -
 -      rc = mdrunner_membed(fplog, cr, NFILE, fnm, oenv, bVerbose, bCompact,
 -                      nstglobalcomm,
 -                      ddxyz, dd_node_order, rdd, rconstr, dddlb_opt[0], dlb_scale,
 -                      ddcsx, ddcsy, ddcsz, nstepout, resetstep, nmultisim, repl_ex_nst,
 -                      repl_ex_seed, pforce, cpt_period, max_hours, deviceOptions, Flags,
 -                      xy_fac,xy_max,z_fac,z_max,
 -                      it_xy,it_z,probe_rad,low_up_rm,
 -                      pieces,bALLOW_ASYMMETRY,maxwarn);
 -
 -      if (gmx_parallel_env_initialized())
 -              gmx_finalize();
 -
 -      if (MULTIMASTER(cr)) {
 -              thanx(stderr);
 -      }
 -
 -      /* Log file has to be closed in mdrunner if we are appending to it
 -   (fplog not set here) */
 -      fprintf(stderr,"Please cite:\nWolf et al, J Comp Chem 31 (2010) 2169-2174.\n");
 +        printf("%s\n",buf);
 +        if (bStart)
 +        {
 +                system(buf);
 +        } else {
 +                printf("You can membed your protein now by:\n%s\n",buf);
 +        }
  
 -      if (MASTER(cr) && !bAppendFiles)
 -      {
 -              gmx_log_close(fplog);
 -      }
 +        fprintf(stderr,"Please cite:\nWolf et al, J Comp Chem 31 (2010) 2169-2174.\n");
  
 -      return rc;
 +      return 0;
  }
Simple merge
Simple merge