Merge branch 'release-4-5-patches' into release-4-6
author Szilard Pall <pszilard@cbr.su.se>
Mon, 3 Sep 2012 10:40:56 +0000 (12:40 +0200)
committer Szilard Pall <pszilard@cbr.su.se>
Mon, 3 Sep 2012 10:40:56 +0000 (12:40 +0200)
Conflicts:
src/mdlib/tpi.c

Change-Id: I1b5d92291ece7cf3194b4e1d88cd686a5aa18696

1  2 
share/top/charmm27.ff/aminoacids.rtp
src/kernel/gmxcheck.c
src/mdlib/domdec.c
src/mdlib/edsam.c
src/mdlib/pme.c
src/mdlib/sim_util.c
src/mdlib/tpi.c
src/tools/gmx_energy.c
src/tools/gmx_trjconv.c
src/tools/make_edi.c

diff --combined share/top/charmm27.ff/aminoacids.rtp
index 7eef837af42ea2867b6f1623e35cf152378fb15b,f3e84ae860bc4e5286a762ad06c5dd5d23de5b73..d24ef05e5173956f03aa56c6cba390f58c9a9345
@@@ -1,26 -1,14 +1,26 @@@
  [ bondedtypes ] 
 -; Col 1: Type of bond 
 -; Col 2: Type of angles 
 -; Col 3: Type of proper dihedrals 
 -; Col 4: Type of improper dihedrals 
 -; Col 5: Generate all dihedrals if 1, only heavy atoms of 0. 
 -; Col 6: Number of excluded neighbors for nonbonded interactions 
 -; Col 7: Generate 1,4 interactions between pairs of hydrogens if 1 
 -; Col 8: Remove propers over the same bond as an improper if it is 1 
 -; bonds  angles  dihedrals  impropers all_dihedrals nrexcl HH14 RemoveDih 
 -     1       5          9        2        1           3      1     0 
 +; Column 1 : default bondtype
 +; Column 2 : default angletype
 +; Column 3 : default proper dihedraltype
 +; Column 4 : default improper dihedraltype
 +; Column 5 : This controls the generation of dihedrals from the bonding.
 +;            All possible dihedrals are generated automatically. A value of
 +;            1 here means that all these are retained. A value of
 +;            0 here requires generated dihedrals be removed if
 +;              * there are any dihedrals on the same central atoms
 +;                specified in the residue topology, or
 +;              * there are other identical generated dihedrals
 +;                sharing the same central atoms, or
 +;              * there are other generated dihedrals sharing the
 +;                same central bond that have fewer hydrogen atoms
 +; Column 6 : number of neighbors to exclude from non-bonded interactions
 +; Column 7 : 1 = generate 1,4 interactions between pairs of hydrogen atoms
 +;            0 = do not generate such
 +; Column 8 : 1 = remove proper dihedrals if found centered on the same
 +;                bond as an improper dihedral
 +;            0 = do not generate such
 +; bondtype angletype dihedraltype impropertype all_dih nrexcl HH14 bRemoveDih
 +     1       5            9            2         1       3     1       0
  
  [ ALA ]
   [ atoms ]
        OH2     H1
        OH2     H2
  
+ [ HO4 ]
+ ; TIP4P
+  [ atoms ]
+        OW      OWT4    0.00    0
+        HW1     HWT4    0.52    0
+        HW2     HWT4    0.52    0
+        MW      MWT4    -1.04   0
+  [ bonds ]
+        OW      HW1
+        OW      HW2
  [ SOD ]
   [ atoms ]
        SOD     SOD     1.00    0
diff --combined src/kernel/gmxcheck.c
index 84e3880b0613775c6504cef18aa9c8ddbbd0e31d,ede1bbf354de45bd044704d374bccbc0c261a7e7..0c4d26054d0587b10fc19608d4fd0d885cb47d9a
@@@ -41,7 -41,7 +41,7 @@@
  #include <ctype.h>
  #include "main.h"
  #include "macros.h"
 -#include "math.h"
 +#include <math.h>
  #include "futil.h"
  #include "statutil.h"
  #include "copyrite.h"
@@@ -157,10 -157,12 +157,12 @@@ static void chk_coords(int frame,int na
        printf("Warning at frame %d: coordinates for atom %d are large (%g)\n",
               frame,i,x[i][j]);
      }
-     if ((fabs(x[j][XX]) < tol) && 
-       (fabs(x[j][YY]) < tol) && 
-       (fabs(x[j][ZZ]) < tol))
-       nNul++;
+     if ((fabs(x[i][XX]) < tol) && 
+         (fabs(x[i][YY]) < tol) && 
+         (fabs(x[i][ZZ]) < tol))
+     {
+         nNul++;
+     }
    }
    if (nNul > 0)
      printf("Warning at frame %d: there are %d particles with all coordinates zero\n",
@@@ -215,7 -217,7 +217,7 @@@ static void chk_bonds(t_idef *idef,int 
          b0 = idef->iparams[type].harmonic.rA;
          break;
        case F_MORSE:
 -        b0 = idef->iparams[type].morse.b0;
 +        b0 = idef->iparams[type].morse.b0A;
          break;
        case F_CUBICBONDS:
          b0 = idef->iparams[type].cubic.b0;
@@@ -519,7 -521,7 +521,7 @@@ void chk_tps(const char *fn, real vdw_f
  void chk_ndx(const char *fn)
  {
    t_blocka *grps;
 -  char **grpname=NULL;
 +  char **grpname;
    int  i,j;
    
    grps = init_index(fn,&grpname);
@@@ -663,7 -665,7 +665,7 @@@ int main(int argc,char *argv[]
        "Last energy term to compare (if not given all are tested). It makes sense to go up until the Pressure." }
    };
  
 -  CopyRight(stdout,argv[0]);
 +  CopyRight(stderr,argv[0]);
    parse_common_args(&argc,argv,0,NFILE,fnm,asize(pa),pa,
                    asize(desc),desc,0,NULL,&oenv);
  
diff --combined src/mdlib/domdec.c
index 5bfe1f2fa772e8b64dc960e1c184bf1e6b91bab1,285e66b42e46787a44f95a7e0a19e34194e9c33f..472778af04349f2ac47f6e16b70e96bbbc5df8b5
@@@ -41,7 -41,6 +41,7 @@@
  #include "force.h"
  #include "pme.h"
  #include "pull.h"
 +#include "pull_rotation.h"
  #include "gmx_wallcycle.h"
  #include "mdrun.h"
  #include "nsgrid.h"
@@@ -54,7 -53,7 +54,7 @@@
  #ifdef GMX_LIB_MPI
  #include <mpi.h>
  #endif
 -#ifdef GMX_THREADS
 +#ifdef GMX_THREAD_MPI
  #include "tmpi.h"
  #endif
  
@@@ -1454,10 -1453,7 +1454,10 @@@ void dd_collect_state(gmx_domdec_t *dd
  
      if (DDMASTER(dd))
      {
 -        state->lambda = state_local->lambda;
 +        for (i=0;i<efptNR;i++) {
 +            state->lambda[i] = state_local->lambda[i];
 +        }
 +        state->fep_state = state_local->fep_state;
          state->veta = state_local->veta;
          state->vol0 = state_local->vol0;
          copy_mat(state_local->box,state->box);
      }
      for(est=0; est<estNR; est++)
      {
 -        if (EST_DISTR(est) && state_local->flags & (1<<est))
 +        if (EST_DISTR(est) && (state_local->flags & (1<<est)))
          {
              switch (est) {
              case estX:
@@@ -1567,7 -1563,7 +1567,7 @@@ static void dd_realloc_state(t_state *s
      
      for(est=0; est<estNR; est++)
      {
 -        if (EST_DISTR(est) && state->flags & (1<<est))
 +        if (EST_DISTR(est) && (state->flags & (1<<est)))
          {
              switch(est) {
              case estX:
@@@ -1710,17 -1706,13 +1710,17 @@@ static void dd_distribute_state(gmx_dom
                                  t_state *state,t_state *state_local,
                                  rvec **f)
  {
 -    int  i,j,ngtch,ngtcp,nh;
 +    int  i,j,nh;
  
      nh = state->nhchainlength;
  
      if (DDMASTER(dd))
      {
 -        state_local->lambda = state->lambda;
 +        for(i=0;i<efptNR;i++)
 +        {
 +            state_local->lambda[i] = state->lambda[i];
 +        }
 +        state_local->fep_state = state->fep_state;
          state_local->veta   = state->veta;
          state_local->vol0   = state->vol0;
          copy_mat(state->box,state_local->box);
              }
          }
      }
 -    dd_bcast(dd,sizeof(real),&state_local->lambda);
 +    dd_bcast(dd,((efptNR)*sizeof(real)),state_local->lambda);
 +    dd_bcast(dd,sizeof(int),&state_local->fep_state);
      dd_bcast(dd,sizeof(real),&state_local->veta);
      dd_bcast(dd,sizeof(real),&state_local->vol0);
      dd_bcast(dd,sizeof(state_local->box),state_local->box);
      }
      for(i=0; i<estNR; i++)
      {
 -        if (EST_DISTR(i) && state_local->flags & (1<<i))
 +        if (EST_DISTR(i) && (state_local->flags & (1<<i)))
          {
              switch (i) {
              case estX:
@@@ -1866,7 -1857,7 +1866,7 @@@ static void write_dd_grid_pdb(const cha
                  }
                  else
                  {
 -                    if (dd->nc[d] > 1 && d < ddbox->npbcdim)
 +                    if (d < ddbox->npbcdim && dd->nc[d] > 1)
                      {
                          tric[d][i] = box[i][d]/box[i][i];
                      }
@@@ -3822,8 -3813,8 +3822,8 @@@ static void get_cg_distribution(FILE *f
      ivec npulse;
      int  i,cg_gl;
      int  *ibuf,buf2[2] = { 0, 0 };
 -    
 -    if (DDMASTER(dd))
 +    gmx_bool bMaster = DDMASTER(dd);
 +    if (bMaster)
      {
          ma = dd->ma;
          
          srenew(dd->index_gl,dd->cg_nalloc);
          srenew(dd->cgindex,dd->cg_nalloc+1);
      }
 -    if (DDMASTER(dd))
 +    if (bMaster)
      {
          for(i=0; i<dd->nnodes; i++)
          {
@@@ -4092,7 -4083,7 +4092,7 @@@ static void print_cg_move(FILE *fplog
      fprintf(fplog,"\nStep %s:\n",gmx_step_str(step,buf));
      if (bHaveLimitdAndCMOld)
      {
-         fprintf(fplog,"The charge group starting at atom %d moved than the distance allowed by the domain decomposition (%f) in direction %c\n",
+         fprintf(fplog,"The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
                  ddglatnr(dd,dd->cgindex[cg]),limitd,dim2char(dim));
      }
      else
@@@ -4141,7 -4132,7 +4141,7 @@@ static void rotate_state_atom(t_state *
  
      for(est=0; est<estNR; est++)
      {
 -        if (EST_DISTR(est) && state->flags & (1<<est)) {
 +        if (EST_DISTR(est) && (state->flags & (1<<est))) {
              switch (est) {
              case estX:
                  /* Rotate the complete state; for a rectangular box only */
@@@ -4578,19 -4569,19 +4578,19 @@@ static int dd_redistribute_cg(FILE *fpl
              if (dim >= npbcdim && dd->nc[dim] > 2)
              {
                  /* No pbc in this dim and more than one domain boundary.
-                  * We to a separate check if a charge did not move too far.
+                  * We do a separate check if a charge group didn't move too far.
                   */
                  if (((flag & DD_FLAG_FW(d)) &&
-                      comm->vbuf.v[buf_pos][d] > cell_x1[dim]) ||
+                      comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
                      ((flag & DD_FLAG_BW(d)) &&
-                      comm->vbuf.v[buf_pos][d] < cell_x0[dim]))
+                      comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
                  {
-                     cg_move_error(fplog,dd,step,cg,d,
+                     cg_move_error(fplog,dd,step,cg,dim,
                                    (flag & DD_FLAG_FW(d)) ? 1 : 0,
                                     FALSE,0,
                                     comm->vbuf.v[buf_pos],
                                     comm->vbuf.v[buf_pos],
-                                    comm->vbuf.v[buf_pos][d]);
+                                    comm->vbuf.v[buf_pos][dim]);
                  }
              }
  
@@@ -5207,30 -5198,34 +5207,30 @@@ static void dd_print_load_verbose(gmx_d
  }
  
  #ifdef GMX_MPI
 -static void make_load_communicator(gmx_domdec_t *dd,MPI_Group g_all,
 -                                   int dim_ind,ivec loc)
 +static void make_load_communicator(gmx_domdec_t *dd, int dim_ind,ivec loc)
  {
 -    MPI_Group g_row;
      MPI_Comm  c_row;
 -    int  dim,i,*rank;
 +    int  dim, i, rank;
      ivec loc_c;
      gmx_domdec_root_t *root;
 +    gmx_bool bPartOfGroup = FALSE;
      
      dim = dd->dim[dim_ind];
      copy_ivec(loc,loc_c);
 -    snew(rank,dd->nc[dim]);
      for(i=0; i<dd->nc[dim]; i++)
      {
          loc_c[dim] = i;
 -        rank[i] = dd_index(dd->nc,loc_c);
 +        rank = dd_index(dd->nc,loc_c);
 +        if (rank == dd->rank)
 +        {
 +            /* This process is part of the group */
 +            bPartOfGroup = TRUE;
 +        }
      }
 -    /* Here we create a new group, that does not necessarily
 -     * include our process. But MPI_Comm_create needs to be
 -     * called by all the processes in the original communicator.
 -     * Calling MPI_Group_free afterwards gives errors, so I assume
 -     * also the group is needed by all processes. (B. Hess)
 -     */
 -    MPI_Group_incl(g_all,dd->nc[dim],rank,&g_row);
 -    MPI_Comm_create(dd->mpi_comm_all,g_row,&c_row);
 -    if (c_row != MPI_COMM_NULL)
 +    MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup?0:MPI_UNDEFINED, dd->rank,
 +                   &c_row);
 +    if (bPartOfGroup)
      {
 -        /* This process is part of the group */
          dd->comm->mpi_comm_load[dim_ind] = c_row;
          if (dd->comm->eDLB != edlbNO)
          {
              snew(dd->comm->load[dim_ind].load,dd->nc[dim]*DD_NLOAD_MAX);
          }
      }
 -    sfree(rank);
  }
  #endif
  
  static void make_load_communicators(gmx_domdec_t *dd)
  {
  #ifdef GMX_MPI
 -  MPI_Group g_all;
    int  dim0,dim1,i,j;
    ivec loc;
  
    if (debug)
      fprintf(debug,"Making load communicators\n");
  
 -  MPI_Comm_group(dd->mpi_comm_all,&g_all);
 -  
    snew(dd->comm->load,dd->ndim);
    snew(dd->comm->mpi_comm_load,dd->ndim);
    
    clear_ivec(loc);
 -  make_load_communicator(dd,g_all,0,loc);
 +  make_load_communicator(dd,0,loc);
    if (dd->ndim > 1) {
      dim0 = dd->dim[0];
      for(i=0; i<dd->nc[dim0]; i++) {
        loc[dim0] = i;
 -      make_load_communicator(dd,g_all,1,loc);
 +      make_load_communicator(dd,1,loc);
      }
    }
    if (dd->ndim > 2) {
        dim1 = dd->dim[1];
        for(j=0; j<dd->nc[dim1]; j++) {
          loc[dim1] = j;
 -        make_load_communicator(dd,g_all,2,loc);
 +        make_load_communicator(dd,2,loc);
        }
      }
    }
  
 -  MPI_Group_free(&g_all);
 -
    if (debug)
      fprintf(debug,"Finished making load communicators\n");
  #endif
@@@ -7990,7 -7991,7 +7990,7 @@@ static void dd_sort_state(gmx_domdec_t 
      /* Reorder the state */
      for(i=0; i<estNR; i++)
      {
 -        if (EST_DISTR(i) && state->flags & (1<<i))
 +        if (EST_DISTR(i) && (state->flags & (1<<i)))
          {
              switch (i)
              {
@@@ -8599,13 -8600,6 +8599,13 @@@ void dd_partition_system(FIL
          /* Update the local pull groups */
          dd_make_local_pull_groups(dd,ir->pull,mdatoms);
      }
 +    
 +    if (ir->bRot)
 +    {
 +        /* Update the local rotation groups */
 +        dd_make_local_rotation_groups(dd,ir->rot);
 +    }
 +
  
      add_dd_statistics(dd);
      
diff --combined src/mdlib/edsam.c
index 3a0b21dc91703b83aeb11cf58af22e4d6c7f2f09,c629ec60d39a173c251522cc2628f14d2eaaa548..15fdd6873b416f23058f858bd719c56cc4e94f4d
@@@ -46,7 -46,7 +46,7 @@@
  #include "mvdata.h"
  #include "txtdump.h"
  #include "vec.h"
 -#include "time.h"
 +#include <time.h>
  #include "nrnb.h"
  #include "mshift.h"
  #include "mdrun.h"
@@@ -857,7 -857,8 +857,8 @@@ static void do_single_flood
          t_edpar *edi,
          gmx_large_int_t step,
          matrix box,
-         t_commrec *cr)
+         t_commrec *cr,
+         gmx_bool bNS)       /* Are we in a neighbor searching step? */
  {
      int i;
      matrix  rotmat;         /* rotation matrix */
  
      buf=edi->buf->do_edsam;
  
      /* Broadcast the positions of the AVERAGE structure such that they are known on
       * every processor. Each node contributes its local positions x and stores them in
       * the collective ED array buf->xcoll */
-     communicate_group_positions(cr, buf->xcoll, buf->shifts_xcoll, buf->extra_shifts_xcoll, buf->bUpdateShifts, x,
+     communicate_group_positions(cr, buf->xcoll, buf->shifts_xcoll, buf->extra_shifts_xcoll, bNS, x,
                      edi->sav.nr, edi->sav.nr_loc, edi->sav.anrs_loc, edi->sav.c_ind, edi->sav.x_old, box);
  
      /* Only assembly REFERENCE positions if their indices differ from the average ones */
      if (!edi->bRefEqAv)
-         communicate_group_positions(cr, buf->xc_ref, buf->shifts_xc_ref, buf->extra_shifts_xc_ref, buf->bUpdateShifts, x,
+         communicate_group_positions(cr, buf->xc_ref, buf->shifts_xc_ref, buf->extra_shifts_xc_ref, bNS, x,
                  edi->sref.nr, edi->sref.nr_loc, edi->sref.anrs_loc, edi->sref.c_ind, edi->sref.x_old, box);
  
      /* If bUpdateShifts was TRUE, the shifts have just been updated in get_positions.
@@@ -935,7 -937,8 +937,8 @@@ extern void do_flood
          rvec            force[], /* forcefield forces, to these the flooding forces are added */
          gmx_edsam_t     ed,      /* ed data structure contains all ED and flooding datasets */
          matrix          box,     /* the box */
-         gmx_large_int_t step)    /* The relative time step since ir->init_step is already subtracted */
+         gmx_large_int_t step,    /* The relative time step since ir->init_step is already subtracted */
+         gmx_bool        bNS)     /* Are we in a neighbor searching step? */
  {
      t_edpar *edi;
  
      {
          /* Call flooding for one matrix */
          if (edi->flood.vecs.neig)
-             do_single_flood(ed->edo,x,force,edi,step,box,cr);
+             do_single_flood(ed->edo,x,force,edi,step,box,cr,bNS);
          edi = edi->next_edi;
      }
  }
@@@ -1497,7 -1500,9 +1500,7 @@@ static int read_edi(FILE* in, gmx_edsam
      {
          if (readmagic==666 || readmagic==667 || readmagic==668)
              gmx_fatal(FARGS,"Wrong magic number: Use newest version of make_edi to produce edi file");
 -        else if (readmagic == 669)
 -            ;
 -        else
 +        else if (readmagic != 669)
              gmx_fatal(FARGS,"Wrong magic number %d in %s",readmagic,ed->edinam);
      }
  
      /* allocate space for reference positions and read them */
      snew(edi->sref.anrs,edi->sref.nr);
      snew(edi->sref.x   ,edi->sref.nr);
-     if (PAR(cr))
-         snew(edi->sref.x_old,edi->sref.nr);
+     snew(edi->sref.x_old,edi->sref.nr);
      edi->sref.sqrtm    =NULL;
      read_edx(in,edi->sref.nr,edi->sref.anrs,edi->sref.x);
  
      edi->sav.nr=read_checked_edint(in,"NAV");
      snew(edi->sav.anrs,edi->sav.nr);
      snew(edi->sav.x   ,edi->sav.nr);
-     if (PAR(cr))
-         snew(edi->sav.x_old,edi->sav.nr);
+     snew(edi->sav.x_old,edi->sav.nr);
      read_edx(in,edi->sav.nr,edi->sav.anrs,edi->sav.x);
  
      /* Check if the same atom indices are used for reference and average positions */
@@@ -2118,13 -2121,16 +2119,16 @@@ static int ed_constraints(gmx_bool edty
   * umbrella sampling simulations. */
  static void copyEvecReference(t_eigvec* floodvecs)
  {
-       int i;
+     int i;
  
+     if (NULL==floodvecs->refproj0)
+         snew(floodvecs->refproj0, floodvecs->neig);
  
-       for (i=0; i<floodvecs->neig; i++)
-       {
-               floodvecs->refproj0[i] = floodvecs->refproj[i];
-       }
+     for (i=0; i<floodvecs->neig; i++)
+     {
+         floodvecs->refproj0[i] = floodvecs->refproj[i];
+     }
  }
  
  
@@@ -2137,7 -2143,7 +2141,7 @@@ void init_edsam(gmx_mtop_t  *mtop,   /
  {
      t_edpar *edi = NULL;    /* points to a single edi data set */
      int     numedis=0;      /* keep track of the number of ED data sets in edi file */
-     int     i,nr_edi;
+     int     i,nr_edi,avindex;
      rvec    *x_pbc  = NULL; /* positions of the whole MD system with pbc removed  */
      rvec    *xfit   = NULL; /* the positions which will be fitted to the reference structure  */
      rvec    *xstart = NULL; /* the positions which are subject to ED sampling */
              {
                  copy_rvec(x_pbc[edi->sref.anrs[i]], xfit[i]);
  
-                 /* Save the sref positions such that in the next time step the molecule can
-                  * be made whole again (in the parallel case) */
-                 if (PAR(cr))
-                     copy_rvec(xfit[i], edi->sref.x_old[i]);
+                 /* Save the sref positions such that in the next time step we can make the ED group whole
+                  * in case any of the atoms do not have the correct PBC representation */
+                 copy_rvec(xfit[i], edi->sref.x_old[i]);
              }
  
              /* Extract the positions of the atoms subject to ED sampling */
              {
                  copy_rvec(x_pbc[edi->sav.anrs[i]], xstart[i]);
  
-                 /* Save the sav positions such that in the next time step the molecule can
-                  * be made whole again (in the parallel case) */
-                 if (PAR(cr))
-                     copy_rvec(xstart[i], edi->sav.x_old[i]);
+                 /* Save the sav positions such that in the next time step we can make the ED group whole
+                  * in case any of the atoms do not have the correct PBC representation */
+                 copy_rvec(xstart[i], edi->sav.x_old[i]);
              }
  
              /* Make the fit to the REFERENCE structure, get translation and rotation */
              /* calculate initial projections */
              project(xstart, edi);
  
+             /* For the target and origin structure both a reference (fit) and an
+              * average structure can be provided in make_edi. If both structures
+              * are the same, make_edi only stores one of them in the .edi file.
+              * If they differ, first the fit and then the average structure is stored
+              * in star (or sor), thus the number of entries in star/sor is
+              * (n_fit + n_av) with n_fit the size of the fitting group and n_av
+              * the size of the average group. */
              /* process target structure, if required */
              if (edi->star.nr > 0)
              {
                  fprintf(stderr, "ED: Fitting target structure to reference structure\n");
                  /* get translation & rotation for fit of target structure to reference structure */
                  fit_to_reference(edi->star.x, fit_transvec, fit_rotmat, edi);
                  /* do the fit */
-                 translate_and_rotate(edi->star.x, edi->sav.nr, fit_transvec, fit_rotmat);
-                 rad_project(edi, edi->star.x, &edi->vecs.radcon, cr);
+                 translate_and_rotate(edi->star.x, edi->star.nr, fit_transvec, fit_rotmat);
+                 if (edi->star.nr == edi->sav.nr)
+                 {
+                     avindex = 0;
+                 }
+                 else /* edi->star.nr = edi->sref.nr + edi->sav.nr */
+                 {
+                     /* The last sav.nr indices of the target structure correspond to
+                      * the average structure, which must be projected */
+                     avindex = edi->star.nr - edi->sav.nr;
+                 }
+                 rad_project(edi, &edi->star.x[avindex], &edi->vecs.radcon, cr);
              } else
                  rad_project(edi, xstart, &edi->vecs.radcon, cr);
  
              /* process structure that will serve as origin of expansion circle */
              if ( (eEDflood == ed->eEDtype) && (FALSE == edi->flood.bConstForce) )
                  fprintf(stderr, "ED: Setting center of flooding potential (0 = average structure)\n");
              if (edi->sori.nr > 0)
              {
                  fprintf(stderr, "ED: Fitting origin structure to reference structure\n");
                  /* fit this structure to reference structure */
                  fit_to_reference(edi->sori.x, fit_transvec, fit_rotmat, edi);
                  /* do the fit */
-                 translate_and_rotate(edi->sori.x, edi->sav.nr, fit_transvec, fit_rotmat);
-                 rad_project(edi, edi->sori.x, &edi->vecs.radacc, cr);
-                 rad_project(edi, edi->sori.x, &edi->vecs.radfix, cr);
+                 translate_and_rotate(edi->sori.x, edi->sori.nr, fit_transvec, fit_rotmat);
+                 if (edi->sori.nr == edi->sav.nr)
+                 {
+                     avindex = 0;
+                 }
+                 else /* edi->sori.nr = edi->sref.nr + edi->sav.nr */
+                 {
+                     /* For the projection, we need the last sav.nr indices of sori */
+                     avindex = edi->sori.nr - edi->sav.nr;
+                 }
+                 rad_project(edi, &edi->sori.x[avindex], &edi->vecs.radacc, cr);
+                 rad_project(edi, &edi->sori.x[avindex], &edi->vecs.radfix, cr);
                  if ( (eEDflood == ed->eEDtype) && (FALSE == edi->flood.bConstForce) )
                  {
                      fprintf(stderr, "ED: The ORIGIN structure will define the flooding potential center.\n");
                      /* Set center of flooding potential to the ORIGIN structure */
-                     rad_project(edi, edi->sori.x, &edi->flood.vecs, cr);
+                     rad_project(edi, &edi->sori.x[avindex], &edi->flood.vecs, cr);
                      /* We already know that no (moving) reference position was provided,
                       * therefore we can overwrite refproj[0]*/
                      copyEvecReference(&edi->flood.vecs);
@@@ -2470,7 -2505,7 +2503,7 @@@ void do_edsam(t_inputrec  *ir
               * the collective buf->xcoll array. Note that for edinr > 1
               * xs could already have been modified by an earlier ED */
  
-             communicate_group_positions(cr, buf->xcoll, buf->shifts_xcoll, buf->extra_shifts_xcoll, buf->bUpdateShifts, xs,
+             communicate_group_positions(cr, buf->xcoll, buf->shifts_xcoll, buf->extra_shifts_xcoll, PAR(cr) ? buf->bUpdateShifts : TRUE, xs,
                      edi->sav.nr, edi->sav.nr_loc, edi->sav.anrs_loc, edi->sav.c_ind, edi->sav.x_old,  box);
  
  #ifdef DEBUG_ED
  #endif
              /* Only assembly reference positions if their indices differ from the average ones */
              if (!edi->bRefEqAv)
-                 communicate_group_positions(cr, buf->xc_ref, buf->shifts_xc_ref, buf->extra_shifts_xc_ref, buf->bUpdateShifts, xs,
+                 communicate_group_positions(cr, buf->xc_ref, buf->shifts_xc_ref, buf->extra_shifts_xc_ref, PAR(cr) ? buf->bUpdateShifts : TRUE, xs,
                          edi->sref.nr, edi->sref.nr_loc, edi->sref.anrs_loc, edi->sref.c_ind, edi->sref.x_old, box);
  
-             /* If bUpdateShifts was TRUE then the shifts have just been updated in get_positions.
-              * We do not need to uptdate the shifts until the next NS step */
+             /* If bUpdateShifts was TRUE then the shifts have just been updated in communicate_group_positions.
+              * We do not need to update the shifts until the next NS step. Note that dd_make_local_ed_indices
+              * set bUpdateShifts=TRUE in the parallel case. */
              buf->bUpdateShifts = FALSE;
  
              /* Now all nodes have all of the ED positions in edi->sav->xcoll,
diff --combined src/mdlib/pme.c
index c39c978f605de808143b77c114da2bfe481a333c,2f5ce0a84bdb02f730a76a55205029b73fd9a8a3..8342621cad818ecbe0e5fd396580cdf47e2656fa
@@@ -1,12 -1,12 +1,12 @@@
  /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   *
 - * 
 + *
   *                This source code is part of
 - * 
 + *
   *                 G   R   O   M   A   C   S
 - * 
 + *
   *          GROningen MAchine for Chemical Simulations
 - * 
 + *
   *                        VERSION 3.2.0
   * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * modify it under the terms of the GNU General Public License
   * as published by the Free Software Foundation; either version 2
   * of the License, or (at your option) any later version.
 - * 
 + *
   * If you want to redistribute modifications, please consider that
   * scientific software is very special. Version control is crucial -
   * bugs must be traceable. We will be happy to consider code for
   * inclusion in the official distribution, but derived work must not
   * be called official GROMACS. Details are found in the README & COPYING
   * files - if they are missing, get the official version at www.gromacs.org.
 - * 
 + *
   * To help us fund GROMACS development, we humbly ask that you cite
   * the papers on the package - you can find them in the top README file.
 - * 
 + *
   * For more info, check our website at http://www.gromacs.org
 - * 
 + *
   * And Hey:
   * GROwing Monsters And Cloning Shrimps
   */
@@@ -54,7 -54,7 +54,7 @@@
   *
   * It might seem an overkill, but better safe than sorry.
   * /Erik 001109
 - */ 
 + */
  
  #ifdef HAVE_CONFIG_H
  #include <config.h>
  #ifdef GMX_LIB_MPI
  #include <mpi.h>
  #endif
 -#ifdef GMX_THREADS
 +#ifdef GMX_THREAD_MPI
  #include "tmpi.h"
  #endif
  
 -
  #include <stdio.h>
  #include <string.h>
  #include <math.h>
 +#include <assert.h>
  #include "typedefs.h"
  #include "txtdump.h"
  #include "vec.h"
  #include "gmx_wallcycle.h"
  #include "gmx_parallel_3dfft.h"
  #include "pdbio.h"
 +#include "gmx_cyclecounter.h"
 +
 +/* Single precision, with SSE2 or higher available */
 +#if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE)
 +
 +#include "gmx_x86_sse2.h"
 +#include "gmx_math_x86_sse2_single.h"
  
 -#if ( !defined(GMX_DOUBLE) && ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_X86_64_SSE2) ) )
 -#include "gmx_sse2_single.h"
 +#define PME_SSE
 +/* Some old AMD processors could have problems with unaligned loads+stores */
 +#ifndef GMX_FAHCORE
 +#define PME_SSE_UNALIGNED
 +#endif
  #endif
  
  #include "mpelogging.h"
  /* #define TAKETIME (step > 1 && timesteps < 10) */
  #define TAKETIME FALSE
  
 +/* #define PME_TIME_THREADS */
 +
  #ifdef GMX_DOUBLE
  #define mpi_type MPI_DOUBLE
  #else
  #define mpi_type MPI_FLOAT
  #endif
  
 +/* GMX_CACHE_SEP should be a multiple of 16 to preserve alignment */
 +#define GMX_CACHE_SEP 64
 +
 +/* We only define a maximum to be able to use local arrays without allocation.
 + * An order larger than 12 should never be needed, even for test cases.
 + * If needed it can be changed here.
 + */
 +#define PME_ORDER_MAX 12
 +
  /* Internal datastructures */
  typedef struct {
      int send_index0;
@@@ -145,23 -124,8 +145,23 @@@ typedef struct 
      int  noverlap_nodes;
      int  *send_id,*recv_id;
      pme_grid_comm_t *comm_data;
 +    real *sendbuf;
 +    real *recvbuf;
  } pme_overlap_t;
  
 +typedef struct {
 +    int *n;     /* Cumulative counts of the number of particles per thread:
 +                 * once calc_interpolation_idx has finished, n[t] is the end
 +                 * offset in i of the particles assigned to thread t */
 +    int nalloc; /* Number of elements allocated for i */
 +    int *i;     /* Particle indices grouped by the thread index they were
 +                 * assigned to; the group boundaries are given by n */
 +} thread_plist_t;
 +
 +typedef struct {
 +    int  n;           /* Number of entries of ind in use; set by make_thread_local_ind */
 +    int  *ind;        /* Particle indices covered by this spline data; reset to the
 +                       * identity mapping by pme_realloc_splinedata */
 +    splinevec theta;  /* One array per dimension, allocated with pme_order entries
 +                       * per particle; presumably the spline coefficients - confirm */
 +    splinevec dtheta; /* Same allocation as theta; presumably the derivatives of
 +                       * theta - confirm */
 +} splinedata_t;
 +
  typedef struct {
      int  dimind;            /* The index of the dimension, 0=x, 1=y */
      int  nslab;
      int  pd_nalloc;
      int  *pd;
      int  *count;            /* The number of atoms to send to each node */
 +    int  **count_thread;
      int  *rcount;           /* The number of atoms to receive */
  
      int  n;
      rvec *x;
      real *q;
      rvec *f;
 -    gmx_bool bSpread;           /* These coordinates are used for spreading */
 +    gmx_bool bSpread;       /* These coordinates are used for spreading */
      int  pme_order;
 -    splinevec theta,dtheta;
      ivec *idx;
      rvec *fractx;            /* Fractional coordinate relative to the
 -                              * lower cell boundary 
 +                              * lower cell boundary
                                */
 +    int  nthread;
 +    int  *thread_idx;        /* Which thread should spread which charge */
 +    thread_plist_t *thread_plist;
 +    splinedata_t *spline;
  } pme_atomcomm_t;
  
 +#define FLBS  3 /* Grid points along x and along y covered by one clear_grid flag */
 +#define FLBSZ 4 /* Grid points along z covered by one clear_grid flag */
 +
 +typedef struct { /* One PME grid; used both for the full node grid and for
 +                  * the per-thread grids held in pmegrids_t */
 +    ivec ci;     /* The spatial location of this grid       */
 +    ivec n;      /* The size of *grid, including order-1    */
 +    ivec offset; /* The grid offset from the full node grid */
 +    int  order;  /* PME spreading order                     */
 +    real *grid;  /* The grid data; spread_q_bsplines_thread treats it as
 +                  * n[XX]*n[YY]*n[ZZ] reals                 */
 +} pmegrid_t;
 +
 +typedef struct { /* A node grid together with its per-thread grids */
 +    pmegrid_t grid;     /* The full node grid (non thread-local)            */
 +    int  nthread;       /* The number of threads operating on this grid     */
 +    ivec nc;            /* The local spatial decomposition over the threads */
 +    pmegrid_t *grid_th; /* Array of grids for each thread                   */
 +    int  **g2t;         /* The grid to thread index, one lookup array per
 +                         * dimension; calc_interpolation_idx adds the XX, YY
 +                         * and ZZ entries to obtain the thread of a particle */
 +    ivec nthread_comm;  /* The number of threads to communicate with        */
 +} pmegrids_t;
 +
 +
 +typedef struct { /* Work data for spreading and gathering; struct gmx_pme
 +                  * keeps a pointer to it in its spline_work member */
 +#ifdef PME_SSE
 +    /* Masks for SSE aligned spreading and gathering */
 +    __m128 mask_SSE0[6],mask_SSE1[6];
 +#else
 +    int dummy; /* C89 requires that struct has at least one member */
 +#endif
 +} pme_spline_work_t;
 +
 +typedef struct {
 +    /* Work data for solve_pme. struct gmx_pme holds a pointer to this type
 +     * (member work) and describes it there as thread local.
 +     */
 +    int      nalloc;     /* presumably the allocated length of the arrays below - confirm */
 +    real *   mhx;
 +    real *   mhy;
 +    real *   mhz;
 +    real *   m2;
 +    real *   denom;
 +    real *   tmp1_alloc; /* NOTE(review): looks like the allocation backing tmp1 - confirm */
 +    real *   tmp1;
 +    real *   eterm;
 +    real *   m2inv;
 +
 +    real     energy;     /* NOTE(review): looks like a per-thread energy result - confirm */
 +    matrix   vir;        /* NOTE(review): looks like a per-thread virial result - confirm */
 +} pme_work_t;
 +
  typedef struct gmx_pme {
      int  ndecompdim;         /* The number of decomposition dimensions */
      int  nodeid;             /* Our nodeid in mpi->mpi_comm */
      MPI_Datatype  rvec_mpi;  /* the pme vector's MPI type */
  #endif
  
 -    gmx_bool bPPnode;            /* Node also does particle-particle forces */
 -    gmx_bool bFEP;               /* Compute Free energy contribution */
 +    int  nthread;            /* The number of threads doing PME */
 +
 +    gmx_bool bPPnode;        /* Node also does particle-particle forces */
 +    gmx_bool bFEP;           /* Compute Free energy contribution */
      int nkx,nky,nkz;         /* Grid dimensions */
 +    gmx_bool bP3M;           /* Do P3M: optimize the influence function */
      int pme_order;
 -    real epsilon_r;           
 -    
 -    real *  pmegridA;  /* Grids on which we do spreading/interpolation, includes overlap */
 -    real *  pmegridB;
 +    real epsilon_r;
 +
 +    pmegrids_t pmegridA;  /* Grids on which we do spreading/interpolation, includes overlap */
 +    pmegrids_t pmegridB;
 +    /* The PME charge spreading grid sizes/strides, includes pme_order-1 */
      int     pmegrid_nx,pmegrid_ny,pmegrid_nz;
 -    int     pmegrid_start_ix,pmegrid_start_iy,pmegrid_start_iz;    
 -    
 -    real *  pmegrid_sendbuf;
 -    real *  pmegrid_recvbuf;
 -    
 +    /* pmegrid_nz might be larger than strictly necessary to ensure
 +     * memory alignment, pmegrid_nz_base gives the real base size.
 +     */
 +    int     pmegrid_nz_base;
 +    /* The local PME grid starting indices */
 +    int     pmegrid_start_ix,pmegrid_start_iy,pmegrid_start_iz;
 +
 +    /* Work data for spreading and gathering */
 +    pme_spline_work_t *spline_work;
 +
      real *fftgridA;             /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
      real *fftgridB;             /* inside the interpolation grid, but separate for 2D PME decomp. */
      int   fftgrid_nx,fftgrid_ny,fftgrid_nz;
 -    
 +
      t_complex *cfftgridA;             /* Grids for complex FFT data */
 -    t_complex *cfftgridB;            
 +    t_complex *cfftgridB;
      int   cfftgrid_nx,cfftgrid_ny,cfftgrid_nz;
 -    
 +
      gmx_parallel_3dfft_t  pfft_setupA;
      gmx_parallel_3dfft_t  pfft_setupB;
 -    
 +
      int  *nnx,*nny,*nnz;
      real *fshx,*fshy,*fshz;
 -    
 +
      pme_atomcomm_t atc[2];  /* Indexed on decomposition index */
      matrix    recipbox;
      splinevec bsp_mod;
 -    
 -    pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */
  
 +    pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */
  
      pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */
 -    
 +
      rvec *bufv;             /* Communication buffer */
      real *bufr;             /* Communication buffer */
      int  buf_nalloc;        /* The communication buffer size */
  
 -    /* work data for solve_pme */
 -    int      work_nalloc;
 -    real *   work_mhx;
 -    real *   work_mhy;
 -    real *   work_mhz;
 -    real *   work_m2;
 -    real *   work_denom;
 -    real *   work_tmp1_alloc;
 -    real *   work_tmp1;
 -    real *   work_m2inv;
 +    /* thread local work data for solve_pme */
 +    pme_work_t *work;
  
      /* Work data for PME_redist */
 -    gmx_bool     redist_init;
 -    int *    scounts; 
 +    gmx_bool redist_init;
 +    int *    scounts;
      int *    rcounts;
      int *    sdispls;
      int *    rdispls;
      int *    sidx;
 -    int *    idxa;    
 +    int *    idxa;
      real *   redist_buf;
      int      redist_buf_nalloc;
 -    
 +
      /* Work data for sum_qgrid */
      real *   sum_qgrid_tmp;
      real *   sum_qgrid_dd_tmp;
  } t_gmx_pme;
  
  
 -static void calc_interpolation_idx(gmx_pme_t pme,pme_atomcomm_t *atc)
 +static void calc_interpolation_idx(gmx_pme_t pme,pme_atomcomm_t *atc,
 +                                   int start,int end,int thread)
  {
      int  i;
      int  *idxptr,tix,tiy,tiz;
      real rxx,ryx,ryy,rzx,rzy,rzz;
      int  nx,ny,nz;
      int  start_ix,start_iy,start_iz;
 -    
 +    int  *g2tx,*g2ty,*g2tz;
 +    gmx_bool bThreads;
 +    int  *thread_idx=NULL;
 +    thread_plist_t *tpl=NULL;
 +    int  *tpl_n=NULL;
 +    int  thread_i;
 +
      nx  = pme->nkx;
      ny  = pme->nky;
      nz  = pme->nkz;
 -    
 +
      start_ix = pme->pmegrid_start_ix;
      start_iy = pme->pmegrid_start_iy;
      start_iz = pme->pmegrid_start_iz;
 -    
 +
      rxx = pme->recipbox[XX][XX];
      ryx = pme->recipbox[YY][XX];
      ryy = pme->recipbox[YY][YY];
      rzx = pme->recipbox[ZZ][XX];
      rzy = pme->recipbox[ZZ][YY];
      rzz = pme->recipbox[ZZ][ZZ];
 -    
 -    for(i=0; (i<atc->n); i++) {
 +
 +    g2tx = pme->pmegridA.g2t[XX];
 +    g2ty = pme->pmegridA.g2t[YY];
 +    g2tz = pme->pmegridA.g2t[ZZ];
 +
 +    bThreads = (atc->nthread > 1);
 +    if (bThreads)
 +    {
 +        thread_idx = atc->thread_idx;
 +
 +        tpl   = &atc->thread_plist[thread];
 +        tpl_n = tpl->n;
 +        for(i=0; i<atc->nthread; i++)
 +        {
 +            tpl_n[i] = 0;
 +        }
 +    }
 +
 +    for(i=start; i<end; i++) {
          xptr   = atc->x[i];
          idxptr = atc->idx[i];
          fptr   = atc->fractx[i];
 -        
 +
          /* Fractional coordinates along box vectors, add 2.0 to make 100% sure we are positive for triclinic boxes */
          tx = nx * ( xptr[XX] * rxx + xptr[YY] * ryx + xptr[ZZ] * rzx + 2.0 );
          ty = ny * (                  xptr[YY] * ryy + xptr[ZZ] * rzy + 2.0 );
          tz = nz * (                                   xptr[ZZ] * rzz + 2.0 );
 -        
 +
          tix = (int)(tx);
          tiy = (int)(ty);
          tiz = (int)(tz);
 -        
 +
          /* Because decomposition only occurs in x and y,
           * we never have a fraction correction in z.
           */
          fptr[XX] = tx - tix + pme->fshx[tix];
          fptr[YY] = ty - tiy + pme->fshy[tiy];
 -        fptr[ZZ] = tz - tiz;   
 +        fptr[ZZ] = tz - tiz;
  
          idxptr[XX] = pme->nnx[tix];
          idxptr[YY] = pme->nny[tiy];
          range_check(idxptr[YY],0,pme->pmegrid_ny);
          range_check(idxptr[ZZ],0,pme->pmegrid_nz);
  #endif
 -  }  
 +
 +        if (bThreads)
 +        {
 +            thread_i = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
 +            thread_idx[i] = thread_i;
 +            tpl_n[thread_i]++;
 +        }
 +    }
 +
 +    if (bThreads)
 +    {
 +        /* Make a list of particle indices sorted on thread */
 +
 +        /* Get the cumulative count */
 +        for(i=1; i<atc->nthread; i++)
 +        {
 +            tpl_n[i] += tpl_n[i-1];
 +        }
 +        /* The current implementation distributes particles equally
 +         * over the threads, so we could actually allocate for that
 +         * in pme_realloc_atomcomm_things.
 +         */
 +        if (tpl_n[atc->nthread-1] > tpl->nalloc)
 +        {
 +            tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]);
 +            srenew(tpl->i,tpl->nalloc);
 +        }
 +        /* Set tpl_n to the cumulative start */
 +        for(i=atc->nthread-1; i>=1; i--)
 +        {
 +            tpl_n[i] = tpl_n[i-1];
 +        }
 +        tpl_n[0] = 0;
 +
 +        /* Fill our thread local array with indices sorted on thread */
 +        for(i=start; i<end; i++)
 +        {
 +            tpl->i[tpl_n[atc->thread_idx[i]]++] = i;
 +        }
 +        /* Now tpl_n contains the cumulative count again */
 +    }
 +}
 +
 +static void make_thread_local_ind(pme_atomcomm_t *atc,
 +                                  int thread,splinedata_t *spline)
 +{   /* Build the particle index list of one thread.
 +     *
 +     * atc    - supplies nthread and the per-thread particle lists
 +     *          (atc->thread_plist)
 +     * thread - the thread whose particles are collected
 +     * spline - receives the indices in spline->ind and their number
 +     *          in spline->n
 +     */
 +    int  n,t,i,start,end;
 +    thread_plist_t *tpl;
 +
 +    /* Combine the indices made by each thread into one index */
 +
 +    n = 0;
 +    start = 0; /* Stays 0 for thread 0: tpl->n holds cumulative end offsets */
 +    for(t=0; t<atc->nthread; t++)
 +    {
 +        tpl = &atc->thread_plist[t];
 +        /* Copy our part (start - end) from the list of thread t */
 +        if (thread > 0)
 +        {
 +            start = tpl->n[thread-1];
 +        }
 +        end = tpl->n[thread];
 +        for(i=start; i<end; i++)
 +        {
 +            spline->ind[n++] = tpl->i[i];
 +        }
 +    }
 +
 +    spline->n = n;
  }
  
 -static void pme_calc_pidx(int natoms, matrix recipbox, rvec x[],
 -                          pme_atomcomm_t *atc)
 +
 +static void pme_calc_pidx(int start, int end,
 +                          matrix recipbox, rvec x[],
 +                          pme_atomcomm_t *atc, int *count)
  {
      int  nslab,i;
      int  si;
      real *xptr,s;
      real rxx,ryx,rzx,ryy,rzy;
 -    int *pd,*count;
 +    int *pd;
  
      /* Calculate PME task index (pidx) for each grid index.
       * Here we always assign equally sized slabs to each node
       * for load balancing reasons (the PME grid spacing is not used).
       */
 -    
 +
      nslab = atc->nslab;
      pd    = atc->pd;
 -    count = atc->count;
  
      /* Reset the count */
      for(i=0; i<nslab; i++)
      {
          count[i] = 0;
      }
 -    
 +
      if (atc->dimind == 0)
      {
          rxx = recipbox[XX][XX];
          ryx = recipbox[YY][XX];
          rzx = recipbox[ZZ][XX];
          /* Calculate the node index in x-dimension */
 -        for(i=0; (i<natoms); i++)
 +        for(i=start; i<end; i++)
          {
              xptr   = x[i];
              /* Fractional coordinates along box vectors */
          ryy = recipbox[YY][YY];
          rzy = recipbox[ZZ][YY];
          /* Calculate the node index in y-dimension */
 -        for(i=0; (i<natoms); i++)
 +        for(i=start; i<end; i++)
          {
              xptr   = x[i];
              /* Fractional coordinates along box vectors */
      }
  }
  
 +static void pme_calc_pidx_wrapper(int natoms, matrix recipbox, rvec x[],
 +                                  pme_atomcomm_t *atc)
 +{   /* Call pme_calc_pidx for all natoms atoms, split into atc->nthread
 +     * contiguous index ranges inside an OpenMP parallel for loop.
 +     * Each range counts into its own array atc->count_thread[thread];
 +     * afterwards the counts of all threads are summed into
 +     * atc->count_thread[0].
 +     */
 +    int nthread,thread,slab;
 +
 +    nthread = atc->nthread;
 +
 +#pragma omp parallel for num_threads(nthread) schedule(static)
 +    for(thread=0; thread<nthread; thread++)
 +    {
 +        /* Range of this thread: [natoms*thread/nthread, natoms*(thread+1)/nthread) */
 +        pme_calc_pidx(natoms* thread   /nthread,
 +                      natoms*(thread+1)/nthread,
 +                      recipbox,x,atc,atc->count_thread[thread]);
 +    }
 +    /* Non-parallel reduction, since nslab is small */
 +
 +    for(thread=1; thread<nthread; thread++)
 +    {
 +        for(slab=0; slab<atc->nslab; slab++)
 +        {
 +            atc->count_thread[0][slab] += atc->count_thread[thread][slab];
 +        }
 +    }
 +}
 +
 +static void pme_realloc_splinedata(splinedata_t *spline, pme_atomcomm_t *atc)
 +{   /* Resize the arrays of spline to match atc->nalloc.
 +     * spline->ind gets atc->nalloc entries and is reset to the identity
 +     * mapping; theta and dtheta get atc->pme_order*atc->nalloc entries
 +     * for each of the DIM dimensions.
 +     */
 +    int i,d;
 +
 +    srenew(spline->ind,atc->nalloc);
 +    /* Initialize the index to identity so it works without threads */
 +    for(i=0; i<atc->nalloc; i++)
 +    {
 +        spline->ind[i] = i;
 +    }
 +
 +    for(d=0;d<DIM;d++)
 +    {
 +        srenew(spline->theta[d] ,atc->pme_order*atc->nalloc);
 +        srenew(spline->dtheta[d],atc->pme_order*atc->nalloc);
 +    }
 +}
 +
  static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
  {
 -    int nalloc_old,i;
 -    
 +    int nalloc_old,i,j,nalloc_tpl;
 +
      /* We have to avoid a NULL pointer for atc->x to avoid
       * possible fatal errors in MPI routines.
       */
      {
          nalloc_old = atc->nalloc;
          atc->nalloc = over_alloc_dd(max(atc->n,1));
 -        
 +
          if (atc->nslab > 1) {
              srenew(atc->x,atc->nalloc);
              srenew(atc->q,atc->nalloc);
              }
          }
          if (atc->bSpread) {
 -            for(i=0;i<DIM;i++) {
 -                srenew(atc->theta[i] ,atc->pme_order*atc->nalloc); 
 -                srenew(atc->dtheta[i],atc->pme_order*atc->nalloc);
 -            }
 -            srenew(atc->fractx,atc->nalloc); 
 +            srenew(atc->fractx,atc->nalloc);
              srenew(atc->idx   ,atc->nalloc);
 +
 +            if (atc->nthread > 1)
 +            {
 +                srenew(atc->thread_idx,atc->nalloc);
 +            }
 +
 +            for(i=0; i<atc->nthread; i++)
 +            {
 +                pme_realloc_splinedata(&atc->spline[i],atc);
 +            }
          }
      }
  }
@@@ -627,7 -398,7 +627,7 @@@ static void pmeredist_pd(gmx_pme_t pme
  {
      int *idxa;
      int i, ii;
 -    
 +
      if(FALSE == pme->redist_init) {
          snew(pme->scounts,atc->nslab);
          snew(pme->rcounts,atc->nslab);
          pme->redist_buf_nalloc = over_alloc_dd(n);
          srenew(pme->redist_buf,pme->redist_buf_nalloc*DIM);
      }
 -    
 +
      pme->idxa = atc->pd;
  
  #ifdef GMX_MPI
      if (forw && bXF) {
 -        /* forward, redistribution from pp to pme */ 
 -        
 +        /* forward, redistribution from pp to pme */
 +
          /* Calculate send counts and exchange them with other nodes */
          for(i=0; (i<atc->nslab); i++) pme->scounts[i]=0;
          for(i=0; (i<n); i++) pme->scounts[pme->idxa[i]]++;
          MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);
 -        
 -        /* Calculate send and receive displacements and index into send 
 +
 +        /* Calculate send and receive displacements and index into send
             buffer */
          pme->sdispls[0]=0;
          pme->rdispls[0]=0;
          }
          /* Total # of particles to be received */
          atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];
 -        
 +
          pme_realloc_atomcomm_things(atc);
 -        
 +
          /* Copy particle coordinates into send buffer and exchange*/
          for(i=0; (i<n); i++) {
              ii=DIM*pme->sidx[pme->idxa[i]];
              pme->redist_buf[ii+YY]=x_f[i][YY];
              pme->redist_buf[ii+ZZ]=x_f[i][ZZ];
          }
 -        MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, 
 -                      pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls, 
 +        MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
 +                      pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
                        pme->rvec_mpi, atc->mpi_comm);
      }
      if (forw) {
                        atc->q, pme->rcounts, pme->rdispls, mpi_type,
                        atc->mpi_comm);
      }
 -    else { /* backward, redistribution from pme to pp */ 
 +    else { /* backward, redistribution from pme to pp */
          MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
 -                      pme->redist_buf, pme->scounts, pme->sdispls, 
 +                      pme->redist_buf, pme->scounts, pme->sdispls,
                        pme->rvec_mpi, atc->mpi_comm);
 -        
 +
          /* Copy data from receive buffer */
          for(i=0; i<atc->nslab; i++)
              pme->sidx[i] = pme->sdispls[i];
              pme->sidx[pme->idxa[i]]++;
          }
      }
 -#endif 
 +#endif
  }
  
  static void pme_dd_sendrecv(pme_atomcomm_t *atc,
  #ifdef GMX_MPI
      int dest,src;
      MPI_Status stat;
 -    
 +
      if (bBackward == FALSE) {
          dest = atc->node_dest[shift];
          src  = atc->node_src[shift];
          dest = atc->node_src[shift];
          src  = atc->node_dest[shift];
      }
 -    
 +
      if (nbyte_s > 0 && nbyte_r > 0) {
          MPI_Sendrecv(buf_s,nbyte_s,MPI_BYTE,
                       dest,shift,
  #endif
  }
  
 -static void dd_pmeredist_x_q(gmx_pme_t pme, 
 +static void dd_pmeredist_x_q(gmx_pme_t pme,
                               int n, gmx_bool bX, rvec *x, real *charge,
                               pme_atomcomm_t *atc)
  {
      int *commnode,*buf_index;
      int nnodes_comm,i,nsend,local_pos,buf_pos,node,scount,rcount;
 -    
 +
      commnode  = atc->node_dest;
      buf_index = atc->buf_index;
 -    
 +
      nnodes_comm = min(2*atc->maxshift,atc->nslab-1);
 -    
 +
      nsend = 0;
      for(i=0; i<nnodes_comm; i++) {
          buf_index[commnode[i]] = nsend;
                        "This usually means that your system is not well equilibrated.",
                        n - (atc->count[atc->nodeid] + nsend),
                        pme->nodeid,'x'+atc->dimind);
 -        
 +
          if (nsend > pme->buf_nalloc) {
              pme->buf_nalloc = over_alloc_dd(nsend);
              srenew(pme->bufv,pme->buf_nalloc);
              srenew(pme->bufr,pme->buf_nalloc);
          }
 -        
 +
          atc->n = atc->count[atc->nodeid];
          for(i=0; i<nnodes_comm; i++) {
              scount = atc->count[commnode[i]];
                              &atc->rcount[i],sizeof(int));
              atc->n += atc->rcount[i];
          }
 -        
 +
          pme_realloc_atomcomm_things(atc);
      }
 -    
 +
      local_pos = 0;
      for(i=0; i<n; i++) {
          node = atc->pd[i];
              buf_index[node]++;
          }
      }
 -    
 +
      buf_pos = 0;
      for(i=0; i<nnodes_comm; i++) {
          scount = atc->count[commnode[i]];
@@@ -902,7 -673,7 +902,7 @@@ static void dd_pmeredist_f(gmx_pme_t pm
  }
  
  #ifdef GMX_MPI
 -static void 
 +static void
  gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
  {
      pme_overlap_t *overlap;
      int ipulse,send_id,recv_id,datasize;
      real *p;
      real *sendptr,*recvptr;
 -    
 +
      /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
      overlap = &pme->overlap[1];
 -    
 +
      for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
      {
          /* Since we have already (un)wrapped the overlap in the z-dimension,
              send_id = overlap->recv_id[ipulse];
              recv_id = overlap->send_id[ipulse];
              send_index0   = overlap->comm_data[ipulse].recv_index0;
 -            send_nindex   = overlap->comm_data[ipulse].recv_nindex;            
 +            send_nindex   = overlap->comm_data[ipulse].recv_nindex;
              recv_index0   = overlap->comm_data[ipulse].send_index0;
              recv_nindex   = overlap->comm_data[ipulse].send_nindex;
          }
                  for(k=0;k<pme->nkz;k++)
                  {
                      iz = k;
 -                    pme->pmegrid_sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
 +                    overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
                  }
              }
          }
 -            
 +
          datasize      = pme->pmegrid_nx * pme->nkz;
 -        
 -        MPI_Sendrecv(pme->pmegrid_sendbuf,send_nindex*datasize,GMX_MPI_REAL,
 +
 +        MPI_Sendrecv(overlap->sendbuf,send_nindex*datasize,GMX_MPI_REAL,
                       send_id,ipulse,
 -                     pme->pmegrid_recvbuf,recv_nindex*datasize,GMX_MPI_REAL,
 +                     overlap->recvbuf,recv_nindex*datasize,GMX_MPI_REAL,
                       recv_id,ipulse,
                       overlap->mpi_comm,&stat);
 -        
 +
          /* Get data from contiguous recv buffer */
          if (debug)
          {
                      iz = k;
                      if(direction==GMX_SUM_QGRID_FORWARD)
                      {
 -                        grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += pme->pmegrid_recvbuf[icnt++];
 +                        grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
                      }
                      else
                      {
 -                        grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz]  = pme->pmegrid_recvbuf[icnt++];
 +                        grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz]  = overlap->recvbuf[icnt++];
                      }
                  }
              }
          }
      }
 -    
 +
      /* Major dimension is easier, no copying required,
       * but we might have to sum to separate array.
       * Since we don't copy, we have to communicate up to pmegrid_nz,
       * not nkz as for the minor direction.
       */
      overlap = &pme->overlap[0];
 -    
 +
      for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
      {
          if(direction==GMX_SUM_QGRID_FORWARD)
              send_nindex   = overlap->comm_data[ipulse].send_nindex;
              recv_index0   = overlap->comm_data[ipulse].recv_index0;
              recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
 -            recvptr   = pme->pmegrid_recvbuf;
 +            recvptr   = overlap->recvbuf;
          }
          else
          {
              send_id = overlap->recv_id[ipulse];
              recv_id = overlap->send_id[ipulse];
              send_index0   = overlap->comm_data[ipulse].recv_index0;
 -            send_nindex   = overlap->comm_data[ipulse].recv_nindex;            
 +            send_nindex   = overlap->comm_data[ipulse].recv_nindex;
              recv_index0   = overlap->comm_data[ipulse].send_index0;
              recv_nindex   = overlap->comm_data[ipulse].send_nindex;
              recvptr   = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
          }
 -                
 +
          sendptr       = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
          datasize      = pme->pmegrid_ny * pme->pmegrid_nz;
  
                       recvptr,recv_nindex*datasize,GMX_MPI_REAL,
                       recv_id,ipulse,
                       overlap->mpi_comm,&stat);
 -        
 +
          /* ADD data from contiguous recv buffer */
          if(direction==GMX_SUM_QGRID_FORWARD)
 -        {        
 +        {
              p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
              for(i=0;i<recv_nindex*datasize;i++)
              {
 -                p[i] += pme->pmegrid_recvbuf[i];
 +                p[i] += overlap->recvbuf[i];
              }
          }
      }
@@@ -1085,12 -856,12 +1085,12 @@@ copy_pmegrid_to_fftgrid(gmx_pme_t pme, 
                                     local_fft_ndata,
                                     local_fft_offset,
                                     local_fft_size);
 -    
 +
      local_pme_size[0] = pme->pmegrid_nx;
      local_pme_size[1] = pme->pmegrid_ny;
      local_pme_size[2] = pme->pmegrid_nz;
 -    
 -    /* The fftgrid is always 'justified' to the lower-left corner of the PME grid, 
 +
 +    /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
       the offset is identical, and the PME grid always has more data (due to overlap)
       */
      {
          fp2 = ffopen(fn,"w");
       sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
  #endif
 +
      for(ix=0;ix<local_fft_ndata[XX];ix++)
      {
          for(iy=0;iy<local_fft_ndata[YY];iy++)
          }
      }
  #ifdef DEBUG_PME
 -    fclose(fp);
 -    fclose(fp2);
 +    ffclose(fp);
 +    ffclose(fp2);
  #endif
      }
      return 0;
  }
  
  
 +static gmx_cycles_t omp_cyc_start()
 +{   /* Return the current value of gmx_cycles_read(), to be passed to
 +     * omp_cyc_end later */
 +    return gmx_cycles_read();
 +}
 +
 +static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
 +{   /* Return the difference between the current gmx_cycles_read() value
 +     * and c, the value obtained earlier from omp_cyc_start */
 +    return gmx_cycles_read() - c;
 +}
 +
 +
  static int
 -copy_fftgrid_to_pmegrid(gmx_pme_t pme, real *fftgrid, real *pmegrid)
 +copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid,
 +                        int nthread,int thread)
  {
      ivec    local_fft_ndata,local_fft_offset,local_fft_size;
      ivec    local_pme_size;
 -    int     i,ix,iy,iz;
 +    int     ixy0,ixy1,ixy,ix,iy,iz;
      int     pmeidx,fftidx;
 -    
 +#ifdef PME_TIME_THREADS
 +    gmx_cycles_t c1;
 +    static double cs1=0;
 +    static int cnt=0;
 +#endif
 +
 +#ifdef PME_TIME_THREADS
 +    c1 = omp_cyc_start();
 +#endif
      /* Dimensions should be identical for A/B grid, so we just use A here */
      gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
                                     local_fft_ndata,
      local_pme_size[0] = pme->pmegrid_nx;
      local_pme_size[1] = pme->pmegrid_ny;
      local_pme_size[2] = pme->pmegrid_nz;
 -    
 -    /* The fftgrid is always 'justified' to the lower-left corner of the PME grid, 
 +
 +    /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
       the offset is identical, and the PME grid always has more data (due to overlap)
       */
 -    for(ix=0;ix<local_fft_ndata[XX];ix++)
 +    ixy0 = ((thread  )*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
 +    ixy1 = ((thread+1)*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
 +
 +    for(ixy=ixy0;ixy<ixy1;ixy++)
      {
 -        for(iy=0;iy<local_fft_ndata[YY];iy++)
 +        ix = ixy/local_fft_ndata[YY];
 +        iy = ixy - ix*local_fft_ndata[YY];
 +
 +        pmeidx = (ix*local_pme_size[YY] + iy)*local_pme_size[ZZ];
 +        fftidx = (ix*local_fft_size[YY] + iy)*local_fft_size[ZZ];
 +        for(iz=0;iz<local_fft_ndata[ZZ];iz++)
          {
 -            for(iz=0;iz<local_fft_ndata[ZZ];iz++)
 -            {
 -                pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
 -                fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
 -                pmegrid[pmeidx] = fftgrid[fftidx];
 -            }
 +            pmegrid[pmeidx+iz] = fftgrid[fftidx+iz];
          }
 -    }   
 +    }
 +
 +#ifdef PME_TIME_THREADS
 +    c1 = omp_cyc_end(c1);
 +    cs1 += (double)c1;
 +    cnt++;
 +    if (cnt % 20 == 0)
 +    {
 +        printf("copy %.2f\n",cs1*1e-9);
 +    }
 +#endif
 +
      return 0;
  }
  
@@@ -1226,9 -962,9 +1226,9 @@@ wrap_periodic_pmegrid(gmx_pme_t pme, re
      overlap = pme->pme_order - 1;
  
      /* Add periodic overlap in z */
 -    for(ix=0; ix<pnx; ix++)
 +    for(ix=0; ix<pme->pmegrid_nx; ix++)
      {
 -        for(iy=0; iy<pny; iy++)
 +        for(iy=0; iy<pme->pmegrid_ny; iy++)
          {
              for(iz=0; iz<overlap; iz++)
              {
  
      if (pme->nnodes_minor == 1)
      {
 -       for(ix=0; ix<pnx; ix++)
 +       for(ix=0; ix<pme->pmegrid_nx; ix++)
         {
             for(iy=0; iy<overlap; iy++)
             {
             }
         }
      }
 -     
 +
      if (pme->nnodes_major == 1)
      {
 -        ny_x = (pme->nnodes_minor == 1 ? ny : pny);
 +        ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
  
          for(ix=0; ix<overlap; ix++)
          {
  static void
  unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
  {
 -    int     nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix,iy,iz;
 +    int     nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix;
  
      nx = pme->nkx;
      ny = pme->nky;
  
      if (pme->nnodes_major == 1)
      {
 -        ny_x = (pme->nnodes_minor == 1 ? ny : pny);
 +        ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
  
          for(ix=0; ix<overlap; ix++)
          {
 +            int iy,iz;
 +
              for(iy=0; iy<ny_x; iy++)
              {
                  for(iz=0; iz<nz; iz++)
  
      if (pme->nnodes_minor == 1)
      {
 -       for(ix=0; ix<pnx; ix++)
 +#pragma omp parallel for num_threads(pme->nthread) schedule(static)
 +       for(ix=0; ix<pme->pmegrid_nx; ix++)
         {
 +           int iy,iz;
 +
             for(iy=0; iy<overlap; iy++)
             {
                 for(iz=0; iz<nz; iz++)
      }
  
      /* Copy periodic overlap in z */
 -    for(ix=0; ix<pnx; ix++)
 +#pragma omp parallel for num_threads(pme->nthread) schedule(static)
 +    for(ix=0; ix<pme->pmegrid_nx; ix++)
      {
 -        for(iy=0; iy<pny; iy++)
 +        int iy,iz;
 +
 +        for(iy=0; iy<pme->pmegrid_ny; iy++)
          {
              for(iz=0; iz<overlap; iz++)
              {
      }
  }
  
 +static void clear_grid(int nx,int ny,int nz,real *grid,
 +                       ivec fs,int *flag,
 +                       int fx,int fy,int fz,
 +                       int order)
 +{   /* Zero the not yet cleared blocks of grid within a range of flag blocks.
 +     *
 +     * ny,nz    - grid strides: element (x,y,z) is at (x*ny + y)*nz + z;
 +     *            nx is not used in this function
 +     * grid     - the grid data to clear
 +     * fs       - dimensions of the flag array; entry (fsx,fsy,fsz) is at
 +     *            (fsx*fs[YY] + fsy)*fs[ZZ] + fsz
 +     * flag     - one entry per block of FLBS x FLBS x FLBSZ grid points;
 +     *            0 means the block has not been cleared yet
 +     * fx,fy,fz - first flag block to visit in each dimension
 +     * order    - determines how many flag blocks are visited per dimension
 +     *
 +     * Each visited block whose flag is 0 is set to zero in grid and its flag
 +     * is set to 1, so that later visits skip it.
 +     */
 +    int nc,ncz;
 +    int fsx,fsy,fsz,gx,gy,gz,g0x,g0y,x,y,z;
 +    int flind;
 +
 +    nc  = 2 + (order - 2)/FLBS;  /* Flag blocks visited along x and along y */
 +    ncz = 2 + (order - 2)/FLBSZ; /* Flag blocks visited along z */
 +
 +    for(fsx=fx; fsx<fx+nc; fsx++)
 +    {
 +        for(fsy=fy; fsy<fy+nc; fsy++)
 +        {
 +            for(fsz=fz; fsz<fz+ncz; fsz++)
 +            {
 +                flind = (fsx*fs[YY] + fsy)*fs[ZZ] + fsz;
 +                if (flag[flind] == 0)
 +                {
 +                    /* Grid coordinates of the first point of this block */
 +                    gx = fsx*FLBS;
 +                    gy = fsy*FLBS;
 +                    gz = fsz*FLBSZ;
 +                    g0x = (gx*ny + gy)*nz + gz;
 +                    for(x=0; x<FLBS; x++)
 +                    {
 +                        g0y = g0x;
 +                        for(y=0; y<FLBS; y++)
 +                        {
 +                            for(z=0; z<FLBSZ; z++)
 +                            {
 +                                grid[g0y+z] = 0;
 +                            }
 +                            g0y += nz;
 +                        }
 +                        g0x += ny*nz;
 +                    }
 +
 +                    flag[flind] = 1;
 +                }
 +            }
 +        }
 +    }
 +}
  
  /* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
  #define DO_BSPLINE(order)                            \
@@@ -1408,13 -1091,11 +1408,13 @@@ for(ithx=0; (ithx<order); ithx++
  }
  
  
 -static void spread_q_bsplines(gmx_pme_t pme, pme_atomcomm_t *atc, 
 -                              real *grid)
 +static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
 +                                     pme_atomcomm_t *atc, splinedata_t *spline,
 +                                     pme_spline_work_t *work)
  {
  
      /* spread charges from home atoms to local grid */
 +    real     *grid;
      pme_overlap_t *ol;
      int      b,i,nn,n,ithx,ithy,ithz,i0,j0,k0;
      int *    idxptr;
      real     valx,valxy,qn;
      real     *thx,*thy,*thz;
      int      localsize, bndsize;
 -  
      int      pnx,pny,pnz,ndatatot;
 -  
 -    pnx = pme->pmegrid_nx;
 -    pny = pme->pmegrid_ny;
 -    pnz = pme->pmegrid_nz;
 +    int      offx,offy,offz;
 +
 +    pnx = pmegrid->n[XX];
 +    pny = pmegrid->n[YY];
 +    pnz = pmegrid->n[ZZ];
 +
 +    offx = pmegrid->offset[XX];
 +    offy = pmegrid->offset[YY];
 +    offz = pmegrid->offset[ZZ];
 +
      ndatatot = pnx*pny*pnz;
 -    
 +    grid = pmegrid->grid;
      for(i=0;i<ndatatot;i++)
      {
          grid[i] = 0;
      }
  
 -    order = pme->pme_order;
 +    order = pmegrid->order;
  
 -    for(nn=0; (nn<atc->n);nn++) 
 +    for(nn=0; nn<spline->n; nn++)
      {
 -        n      = nn;
 -        qn     = atc->q[n];
 +        n  = spline->ind[nn];
 +        qn = atc->q[n];
  
 -        if (qn != 0) 
 +        if (qn != 0)
          {
              idxptr = atc->idx[n];
 -            norder = n*order;
 -            
 -            i0   = idxptr[XX]; 
 -            j0   = idxptr[YY];
 -            k0   = idxptr[ZZ];
 -            thx = atc->theta[XX] + norder;
 -            thy = atc->theta[YY] + norder;
 -            thz = atc->theta[ZZ] + norder;
 -            
 +            norder = nn*order;
 +
 +            i0   = idxptr[XX] - offx;
 +            j0   = idxptr[YY] - offy;
 +            k0   = idxptr[ZZ] - offz;
 +
 +            thx = spline->theta[XX] + norder;
 +            thy = spline->theta[YY] + norder;
 +            thz = spline->theta[ZZ] + norder;
 +
              switch (order) {
 -            case 4:  DO_BSPLINE(4);     break;
 -            case 5:  DO_BSPLINE(5);     break;
 -            default: DO_BSPLINE(order); break;
 +            case 4:
 +#ifdef PME_SSE
 +#ifdef PME_SSE_UNALIGNED
 +#define PME_SPREAD_SSE_ORDER4
 +#else
 +#define PME_SPREAD_SSE_ALIGNED
 +#define PME_ORDER 4
 +#endif
 +#include "pme_sse_single.h"
 +#else
 +                DO_BSPLINE(4);
 +#endif
 +                break;
 +            case 5:
 +#ifdef PME_SSE
 +#define PME_SPREAD_SSE_ALIGNED
 +#define PME_ORDER 5
 +#include "pme_sse_single.h"
 +#else
 +                DO_BSPLINE(5);
 +#endif
 +                break;
 +            default:
 +                DO_BSPLINE(order);
 +                break;
              }
          }
 -    } 
 +    }
  }
  
 -
 -#if ( !defined(GMX_DOUBLE) && ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_X86_64_SSE2) ) )
 -    /* Calculate exponentials through SSE in float precision */
 -#define CALC_EXPONENTIALS(start,end,r_aligned)      \
 -    {                                               \
 -        __m128 tmp_sse;                             \
 -        for(kx=0; kx<end; kx+=4)                    \
 -        {                                           \
 -            tmp_sse = _mm_load_ps(r_aligned+kx);    \
 -            tmp_sse = gmx_mm_exp_ps(tmp_sse);       \
 -            _mm_store_ps(r_aligned+kx,tmp_sse);     \
 -        }                                           \
 +static void set_grid_alignment(int *pmegrid_nz,int pme_order)
 +{
 +#ifdef PME_SSE
 +    if (pme_order == 5
 +#ifndef PME_SSE_UNALIGNED
 +        || pme_order == 4
 +#endif
 +        )
 +    {
 +        /* Round nz up to a multiple of 4 to ensure alignment */
 +        *pmegrid_nz = ((*pmegrid_nz + 3) & ~3);
      }
 -#else
 -#define CALC_EXPONENTIALS(start,end,r)          \
 -    for(kx=start; kx<end; kx++)                 \
 -    {                                           \
 -        r[kx] = exp(r[kx]);                     \
 +#endif
 +}
 +
 +static void set_gridsize_alignment(int *gridsize,int pme_order)
 +{
 +#ifdef PME_SSE
 +#ifndef PME_SSE_UNALIGNED
 +    if (pme_order == 4)
 +    {
 +        /* Add extra elements to ensure aligned operations do not go
 +         * beyond the allocated grid size.
 +         * Note that for pme_order=5, the pme grid z-size alignment
 +         * ensures that we will not go beyond the grid size.
 +         */
 +         *gridsize += 4;
      }
  #endif
 +#endif
 +}
 +
 +static void pmegrid_init(pmegrid_t *grid,
 +                         int cx, int cy, int cz,
 +                         int x0, int y0, int z0,
 +                         int x1, int y1, int z1,
 +                         gmx_bool set_alignment,
 +                         int pme_order,
 +                         real *ptr)
 +{
 +    int nz,gridsize;
 +
 +    grid->ci[XX] = cx;
 +    grid->ci[YY] = cy;
 +    grid->ci[ZZ] = cz;
 +    grid->offset[XX] = x0;
 +    grid->offset[YY] = y0;
 +    grid->offset[ZZ] = z0;
 +    grid->n[XX]      = x1 - x0 + pme_order - 1;
 +    grid->n[YY]      = y1 - y0 + pme_order - 1;
 +    grid->n[ZZ]      = z1 - z0 + pme_order - 1;
 +
 +    nz = grid->n[ZZ];
 +    set_grid_alignment(&nz,pme_order);
 +    if (set_alignment)
 +    {
 +        grid->n[ZZ] = nz;
 +    }
 +    else if (nz != grid->n[ZZ])
 +    {
 +        gmx_incons("pmegrid_init call with an unaligned z size");
 +    }
  
 +    grid->order = pme_order;
 +    if (ptr == NULL)
 +    {
 +        gridsize = grid->n[XX]*grid->n[YY]*grid->n[ZZ];
 +        set_gridsize_alignment(&gridsize,pme_order);
 +        snew_aligned(grid->grid,gridsize,16);
 +    }
 +    else
 +    {
 +        grid->grid = ptr;
 +    }
 +}
  
 -static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
 -                         real ewaldcoeff,real vol,
 -                         gmx_bool bEnerVir,real *mesh_energy,matrix vir)
 +static int div_round_up(int enumerator,int denominator)
  {
 -    /* do recip sum over local cells in grid */
 -    /* y major, z middle, x minor or continuous */
 -    t_complex *p0;
 +    return (enumerator + denominator - 1)/denominator;
 +}
 +
 +static void make_subgrid_division(const ivec n,int ovl,int nthread,
 +                                  ivec nsub)
 +{
 +    int gsize_opt,gsize;
 +    int nsx,nsy,nsz;
 +    char *env;
 +
 +    gsize_opt = -1;
 +    for(nsx=1; nsx<=nthread; nsx++)
 +    {
 +        if (nthread % nsx == 0)
 +        {
 +            for(nsy=1; nsy<=nthread; nsy++)
 +            {
 +                if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0)
 +                {
 +                    nsz = nthread/(nsx*nsy);
 +
 +                    /* Determine the number of grid points per thread */
 +                    gsize =
 +                        (div_round_up(n[XX],nsx) + ovl)*
 +                        (div_round_up(n[YY],nsy) + ovl)*
 +                        (div_round_up(n[ZZ],nsz) + ovl);
 +
 +                    /* Minimize the number of grids points per thread
 +                     * and, secondarily, the number of cuts in minor dimensions.
 +                     */
 +                    if (gsize_opt == -1 ||
 +                        gsize < gsize_opt ||
 +                        (gsize == gsize_opt &&
 +                         (nsz < nsub[ZZ] || (nsz == nsub[ZZ] && nsy < nsub[YY]))))
 +                    {
 +                        nsub[XX] = nsx;
 +                        nsub[YY] = nsy;
 +                        nsub[ZZ] = nsz;
 +                        gsize_opt = gsize;
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    env = getenv("GMX_PME_THREAD_DIVISION");
 +    if (env != NULL)
 +    {
 +        sscanf(env,"%d %d %d",&nsub[XX],&nsub[YY],&nsub[ZZ]);
 +    }
 +
 +    if (nsub[XX]*nsub[YY]*nsub[ZZ] != nthread)
 +    {
 +        gmx_fatal(FARGS,"PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)",nsub[XX],nsub[YY],nsub[ZZ],nthread);
 +    }
 +}
 +
 +static void pmegrids_init(pmegrids_t *grids,
 +                          int nx,int ny,int nz,int nz_base,
 +                          int pme_order,
 +                          int nthread,
 +                          int overlap_x,
 +                          int overlap_y)
 +{
 +    ivec n,n_base,g0,g1;
 +    int t,x,y,z,d,i,tfac;
 +    int max_comm_lines;
 +
 +    n[XX] = nx - (pme_order - 1);
 +    n[YY] = ny - (pme_order - 1);
 +    n[ZZ] = nz - (pme_order - 1);
 +
 +    copy_ivec(n,n_base);
 +    n_base[ZZ] = nz_base;
 +
 +    pmegrid_init(&grids->grid,0,0,0,0,0,0,n[XX],n[YY],n[ZZ],FALSE,pme_order,
 +                 NULL);
 +
 +    grids->nthread = nthread;
 +
 +    make_subgrid_division(n_base,pme_order-1,grids->nthread,grids->nc);
 +
 +    if (grids->nthread > 1)
 +    {
 +        ivec nst;
 +        int gridsize;
 +        real *grid_all;
 +
 +        for(d=0; d<DIM; d++)
 +        {
 +            nst[d] = div_round_up(n[d],grids->nc[d]) + pme_order - 1;
 +        }
 +        set_grid_alignment(&nst[ZZ],pme_order);
 +
 +        if (debug)
 +        {
 +            fprintf(debug,"pmegrid thread local division: %d x %d x %d\n",
 +                    grids->nc[XX],grids->nc[YY],grids->nc[ZZ]);
 +            fprintf(debug,"pmegrid %d %d %d max thread pmegrid %d %d %d\n",
 +                    nx,ny,nz,
 +                    nst[XX],nst[YY],nst[ZZ]);
 +        }
 +
 +        snew(grids->grid_th,grids->nthread);
 +        t = 0;
 +        gridsize = nst[XX]*nst[YY]*nst[ZZ];
 +        set_gridsize_alignment(&gridsize,pme_order);
 +        snew_aligned(grid_all,
 +                     grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
 +                     16);
 +
 +        for(x=0; x<grids->nc[XX]; x++)
 +        {
 +            for(y=0; y<grids->nc[YY]; y++)
 +            {
 +                for(z=0; z<grids->nc[ZZ]; z++)
 +                {
 +                    pmegrid_init(&grids->grid_th[t],
 +                                 x,y,z,
 +                                 (n[XX]*(x  ))/grids->nc[XX],
 +                                 (n[YY]*(y  ))/grids->nc[YY],
 +                                 (n[ZZ]*(z  ))/grids->nc[ZZ],
 +                                 (n[XX]*(x+1))/grids->nc[XX],
 +                                 (n[YY]*(y+1))/grids->nc[YY],
 +                                 (n[ZZ]*(z+1))/grids->nc[ZZ],
 +                                 TRUE,
 +                                 pme_order,
 +                                 grid_all+GMX_CACHE_SEP+t*(gridsize+GMX_CACHE_SEP));
 +                    t++;
 +                }
 +            }
 +        }
 +    }
 +
 +    snew(grids->g2t,DIM);
 +    tfac = 1;
 +    for(d=DIM-1; d>=0; d--)
 +    {
 +        snew(grids->g2t[d],n[d]);
 +        t = 0;
 +        for(i=0; i<n[d]; i++)
 +        {
 +            /* The second check should match the parameters
 +             * of the pmegrid_init call above.
 +             */
 +            while (t + 1 < grids->nc[d] && i >= (n[d]*(t+1))/grids->nc[d])
 +            {
 +                t++;
 +            }
 +            grids->g2t[d][i] = t*tfac;
 +        }
 +
 +        tfac *= grids->nc[d];
 +
 +        switch (d)
 +        {
 +        case XX: max_comm_lines = overlap_x;     break;
 +        case YY: max_comm_lines = overlap_y;     break;
 +        case ZZ: max_comm_lines = pme_order - 1; break;
 +        }
 +        grids->nthread_comm[d] = 0;
 +        while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines)
 +        {
 +            grids->nthread_comm[d]++;
 +        }
 +        if (debug != NULL)
 +        {
 +            fprintf(debug,"pmegrid thread grid communication range in %c: %d\n",
 +                    'x'+d,grids->nthread_comm[d]);
 +        }
 +        /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
 +         * work, but this is not a problematic restriction.
 +         */
 +        if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
 +        {
 +            gmx_fatal(FARGS,"Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME",grids->nthread);
 +        }
 +    }
 +}
 +
 +
 +static void pmegrids_destroy(pmegrids_t *grids)
 +{
 +    int t;
 +
 +    if (grids->grid.grid != NULL)
 +    {
 +        sfree(grids->grid.grid);
 +
 +        if (grids->nthread > 0)
 +        {
 +            for(t=0; t<grids->nthread; t++)
 +            {
 +                sfree(grids->grid_th[t].grid);
 +            }
 +            sfree(grids->grid_th);
 +        }
 +    }
 +}
 +
 +
 +static void realloc_work(pme_work_t *work,int nkx)
 +{
 +    if (nkx > work->nalloc)
 +    {
 +        work->nalloc = nkx;
 +        srenew(work->mhx  ,work->nalloc);
 +        srenew(work->mhy  ,work->nalloc);
 +        srenew(work->mhz  ,work->nalloc);
 +        srenew(work->m2   ,work->nalloc);
 +        /* Allocate an aligned pointer for SSE operations, including 3 extra
 +         * elements at the end since SSE operates on 4 elements at a time.
 +         */
 +        sfree_aligned(work->denom);
 +        sfree_aligned(work->tmp1);
 +        sfree_aligned(work->eterm);
 +        snew_aligned(work->denom,work->nalloc+3,16);
 +        snew_aligned(work->tmp1 ,work->nalloc+3,16);
 +        snew_aligned(work->eterm,work->nalloc+3,16);
 +        srenew(work->m2inv,work->nalloc);
 +    }
 +}
 +
 +
 +static void free_work(pme_work_t *work)
 +{
 +    sfree(work->mhx);
 +    sfree(work->mhy);
 +    sfree(work->mhz);
 +    sfree(work->m2);
 +    sfree_aligned(work->denom);
 +    sfree_aligned(work->tmp1);
 +    sfree_aligned(work->eterm);
 +    sfree(work->m2inv);
 +}
 +
 +
 +#ifdef PME_SSE
 +    /* Calculate exponentials through SSE in float precision */
 +inline static void calc_exponentials(int start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
 +{
 +    {
 +        const __m128 two = _mm_set_ps(2.0f,2.0f,2.0f,2.0f);
 +        __m128 f_sse;
 +        __m128 lu;
 +        __m128 tmp_d1,d_inv,tmp_r,tmp_e;
 +        int kx;
 +        f_sse = _mm_load1_ps(&f);
 +        for(kx=0; kx<end; kx+=4)
 +        {
 +            tmp_d1   = _mm_load_ps(d_aligned+kx);
 +            lu       = _mm_rcp_ps(tmp_d1);
 +            d_inv    = _mm_mul_ps(lu,_mm_sub_ps(two,_mm_mul_ps(lu,tmp_d1)));
 +            tmp_r    = _mm_load_ps(r_aligned+kx);
 +            tmp_r    = gmx_mm_exp_ps(tmp_r);
 +            tmp_e    = _mm_mul_ps(f_sse,d_inv);
 +            tmp_e    = _mm_mul_ps(tmp_e,tmp_r);
 +            _mm_store_ps(e_aligned+kx,tmp_e);
 +        }
 +    }
 +}
 +#else
 +inline static void calc_exponentials(int start, int end, real f, real *d, real *r, real *e)
 +{
 +    int kx;
 +    for(kx=start; kx<end; kx++)
 +    {
 +        d[kx] = 1.0/d[kx];
 +    }
 +    for(kx=start; kx<end; kx++)
 +    {
 +        r[kx] = exp(r[kx]);
 +    }
 +    for(kx=start; kx<end; kx++)
 +    {
 +        e[kx] = f*r[kx]*d[kx];
 +    }
 +}
 +#endif
 +
 +
 +static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
 +                         real ewaldcoeff,real vol,
 +                         gmx_bool bEnerVir,
 +                         int nthread,int thread)
 +{
 +    /* do recip sum over local cells in grid */
 +    /* y major, z middle, x minor or continuous */
 +    t_complex *p0;
      int     kx,ky,kz,maxkx,maxky,maxkz;
 -    int     nx,ny,nz,iy,iz,kxstart,kxend;
 +    int     nx,ny,nz,iyz0,iyz1,iyz,iy,iz,kxstart,kxend;
      real    mx,my,mz;
      real    factor=M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
      real    ets2,struct2,vfactor,ets2vf;
 -    real    eterm,d1,d2,energy=0;
 +    real    d1,d2,energy=0;
      real    by,bz;
      real    virxx=0,virxy=0,virxz=0,viryy=0,viryz=0,virzz=0;
      real    rxx,ryx,ryy,rzx,rzy,rzz;
 -      real    *mhx,*mhy,*mhz,*m2,*denom,*tmp1,*m2inv;
 +    pme_work_t *work;
 +    real    *mhx,*mhy,*mhz,*m2,*denom,*tmp1,*eterm,*m2inv;
      real    mhxk,mhyk,mhzk,m2k;
      real    corner_fac;
      ivec    complex_order;
      ivec    local_ndata,local_offset,local_size;
 -    
 +    real    elfac;
 +
 +    elfac = ONE_4PI_EPS0/pme->epsilon_r;
 +
      nx = pme->nkx;
      ny = pme->nky;
      nz = pme->nkz;
 -    
 +
      /* Dimensions should be identical for A/B grid, so we just use A here */
      gmx_parallel_3dfft_complex_limits(pme->pfft_setupA,
                                        complex_order,
                                        local_ndata,
                                        local_offset,
                                        local_size);
 -    
 +
      rxx = pme->recipbox[XX][XX];
      ryx = pme->recipbox[YY][XX];
      ryy = pme->recipbox[YY][YY];
      rzx = pme->recipbox[ZZ][XX];
      rzy = pme->recipbox[ZZ][YY];
      rzz = pme->recipbox[ZZ][ZZ];
 -    
 +
      maxkx = (nx+1)/2;
      maxky = (ny+1)/2;
      maxkz = nz/2+1;
 -      
 -      mhx   = pme->work_mhx;
 -      mhy   = pme->work_mhy;
 -      mhz   = pme->work_mhz;
 -      m2    = pme->work_m2;
 -      denom = pme->work_denom;
 -      tmp1  = pme->work_tmp1;
 -      m2inv = pme->work_m2inv;        
  
 -    for(iy=0;iy<local_ndata[YY];iy++)
 +    work = &pme->work[thread];
 +    mhx   = work->mhx;
 +    mhy   = work->mhy;
 +    mhz   = work->mhz;
 +    m2    = work->m2;
 +    denom = work->denom;
 +    tmp1  = work->tmp1;
 +    eterm = work->eterm;
 +    m2inv = work->m2inv;
 +
 +    iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread   /nthread;
 +    iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
 +
 +    for(iyz=iyz0; iyz<iyz1; iyz++)
      {
 +        iy = iyz/local_ndata[ZZ];
 +        iz = iyz - iy*local_ndata[ZZ];
 +
          ky = iy + local_offset[YY];
 -        
 -        if (ky < maxky) 
 +
 +        if (ky < maxky)
          {
              my = ky;
          }
 -        else 
 +        else
          {
              my = (ky - ny);
          }
 -        
 +
          by = M_PI*vol*pme->bsp_mod[YY][ky];
  
 -        for(iz=0;iz<local_ndata[ZZ];iz++)
 -        {
 -            kz = iz + local_offset[ZZ];
 -            
 -            mz = kz;
 -
 -            bz = pme->bsp_mod[ZZ][kz];
 -            
 -            /* 0.5 correction for corner points */
 -                      corner_fac = 1;
 -            if (kz == 0)
 -                corner_fac = 0.5;
 -            if (kz == (nz+1)/2)
 -                corner_fac = 0.5;
 -                      
 -            p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
 -            
 -            /* We should skip the k-space point (0,0,0) */
 -            if (local_offset[XX] > 0 ||
 -                local_offset[YY] > 0 || ky > 0 ||
 -                kz > 0)
 +        kz = iz + local_offset[ZZ];
 +
 +        mz = kz;
 +
 +        bz = pme->bsp_mod[ZZ][kz];
 +
 +        /* 0.5 correction for corner points */
 +        corner_fac = 1;
 +        if (kz == 0 || kz == (nz+1)/2)
 +        {
 +            corner_fac = 0.5;
 +        }
 +
 +        p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
 +
 +        /* We should skip the k-space point (0,0,0) */
 +        if (local_offset[XX] > 0 || ky > 0 || kz > 0)
 +        {
 +            kxstart = local_offset[XX];
 +        }
 +        else
 +        {
 +            kxstart = local_offset[XX] + 1;
 +            p0++;
 +        }
 +        kxend = local_offset[XX] + local_ndata[XX];
 +
 +        if (bEnerVir)
 +        {
 +            /* More expensive inner loop, especially because of the storage
 +             * of the mh elements in arrays.
 +             * Because x is the minor grid index, all mh elements
 +             * depend on kx for triclinic unit cells.
 +             */
 +
 +                /* Two explicit loops to avoid a conditional inside the loop */
 +            for(kx=kxstart; kx<maxkx; kx++)
              {
 -                kxstart = local_offset[XX];
 +                mx = kx;
 +
 +                mhxk      = mx * rxx;
 +                mhyk      = mx * ryx + my * ryy;
 +                mhzk      = mx * rzx + my * rzy + mz * rzz;
 +                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
 +                mhx[kx]   = mhxk;
 +                mhy[kx]   = mhyk;
 +                mhz[kx]   = mhzk;
 +                m2[kx]    = m2k;
 +                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
 +                tmp1[kx]  = -factor*m2k;
              }
 -            else
 +
 +            for(kx=maxkx; kx<kxend; kx++)
              {
 -                kxstart = local_offset[XX] + 1;
 -                p0++;
 +                mx = (kx - nx);
 +
 +                mhxk      = mx * rxx;
 +                mhyk      = mx * ryx + my * ryy;
 +                mhzk      = mx * rzx + my * rzy + mz * rzz;
 +                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
 +                mhx[kx]   = mhxk;
 +                mhy[kx]   = mhyk;
 +                mhz[kx]   = mhzk;
 +                m2[kx]    = m2k;
 +                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
 +                tmp1[kx]  = -factor*m2k;
              }
 -            kxend = local_offset[XX] + local_ndata[XX];
 -                      
 -            if (bEnerVir)
 +
 +            for(kx=kxstart; kx<kxend; kx++)
              {
 -                /* More expensive inner loop, especially because of the storage
 -                 * of the mh elements in array's.
 -                 * Because x is the minor grid index, all mh elements
 -                 * depend on kx for triclinic unit cells.
 -                 */
 +                m2inv[kx] = 1.0/m2[kx];
 +            }
  
 -                /* Two explicit loops to avoid a conditional inside the loop */
 -                for(kx=kxstart; kx<maxkx; kx++)
 -                {
 -                    mx = kx;
 -                    
 -                    mhxk      = mx * rxx;
 -                    mhyk      = mx * ryx + my * ryy;
 -                    mhzk      = mx * rzx + my * rzy + mz * rzz;
 -                    m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
 -                    mhx[kx]   = mhxk;
 -                    mhy[kx]   = mhyk;
 -                    mhz[kx]   = mhzk;
 -                    m2[kx]    = m2k;
 -                    denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
 -                    tmp1[kx]  = -factor*m2k;
 -                }
 -                
 -                for(kx=maxkx; kx<kxend; kx++)
 -                {
 -                    mx = (kx - nx);
 -
 -                    mhxk      = mx * rxx;
 -                    mhyk      = mx * ryx + my * ryy;
 -                    mhzk      = mx * rzx + my * rzy + mz * rzz;
 -                    m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
 -                    mhx[kx]   = mhxk;
 -                    mhy[kx]   = mhyk;
 -                    mhz[kx]   = mhzk;
 -                    m2[kx]    = m2k;
 -                    denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
 -                    tmp1[kx]  = -factor*m2k;
 -                }
 -                
 -                for(kx=kxstart; kx<kxend; kx++)
 -                {
 -                    m2inv[kx] = 1.0/m2[kx];
 -                }
 -                for(kx=kxstart; kx<kxend; kx++)
 -                {
 -                    denom[kx] = 1.0/denom[kx];
 -                }
 +            calc_exponentials(kxstart,kxend,elfac,denom,tmp1,eterm);
  
 -                CALC_EXPONENTIALS(kxstart,kxend,tmp1);
 +            for(kx=kxstart; kx<kxend; kx++,p0++)
 +            {
 +                d1      = p0->re;
 +                d2      = p0->im;
  
 -                for(kx=kxstart; kx<kxend; kx++,p0++)
 -                {
 -                    d1      = p0->re;
 -                    d2      = p0->im;
 -                    
 -                    eterm    = ONE_4PI_EPS0/pme->epsilon_r*tmp1[kx]*denom[kx];
 -                    
 -                    p0->re  = d1*eterm;
 -                    p0->im  = d2*eterm;
 -                    
 -                    struct2 = 2.0*(d1*d1+d2*d2);
 -                    
 -                    tmp1[kx] = eterm*struct2;
 -                }
 -                
 -                for(kx=kxstart; kx<kxend; kx++)
 -                {
 -                    ets2     = corner_fac*tmp1[kx];
 -                    vfactor  = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
 -                    energy  += ets2;
 -                    
 -                    ets2vf   = ets2*vfactor;
 -                    virxx   += ets2vf*mhx[kx]*mhx[kx] - ets2;
 -                    virxy   += ets2vf*mhx[kx]*mhy[kx];
 -                    virxz   += ets2vf*mhx[kx]*mhz[kx];
 -                    viryy   += ets2vf*mhy[kx]*mhy[kx] - ets2;
 -                    viryz   += ets2vf*mhy[kx]*mhz[kx];
 -                    virzz   += ets2vf*mhz[kx]*mhz[kx] - ets2;
 -                }
 +                p0->re  = d1*eterm[kx];
 +                p0->im  = d2*eterm[kx];
 +
 +                struct2 = 2.0*(d1*d1+d2*d2);
 +
 +                tmp1[kx] = eterm[kx]*struct2;
              }
 -            else
 +
 +            for(kx=kxstart; kx<kxend; kx++)
              {
 -                /* We don't need to calculate the energy and the virial.
 -                 * In this case the triclinic overhead is small.
 -                 */
 +                ets2     = corner_fac*tmp1[kx];
 +                vfactor  = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
 +                energy  += ets2;
 +
 +                ets2vf   = ets2*vfactor;
 +                virxx   += ets2vf*mhx[kx]*mhx[kx] - ets2;
 +                virxy   += ets2vf*mhx[kx]*mhy[kx];
 +                virxz   += ets2vf*mhx[kx]*mhz[kx];
 +                viryy   += ets2vf*mhy[kx]*mhy[kx] - ets2;
 +                viryz   += ets2vf*mhy[kx]*mhz[kx];
 +                virzz   += ets2vf*mhz[kx]*mhz[kx] - ets2;
 +            }
 +        }
 +        else
 +        {
 +            /* We don't need to calculate the energy and the virial.
 +             * In this case the triclinic overhead is small.
 +             */
  
 -                /* Two explicit loops to avoid a conditional inside the loop */
 +            /* Two explicit loops to avoid a conditional inside the loop */
  
 -                for(kx=kxstart; kx<maxkx; kx++)
 -                {
 -                    mx = kx;
 -                    
 -                    mhxk      = mx * rxx;
 -                    mhyk      = mx * ryx + my * ryy;
 -                    mhzk      = mx * rzx + my * rzy + mz * rzz;
 -                    m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
 -                    denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
 -                    tmp1[kx]  = -factor*m2k;
 -                }
 -                
 -                for(kx=maxkx; kx<kxend; kx++)
 -                {
 -                    mx = (kx - nx);
 -                    
 -                    mhxk      = mx * rxx;
 -                    mhyk      = mx * ryx + my * ryy;
 -                    mhzk      = mx * rzx + my * rzy + mz * rzz;
 -                    m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
 -                    denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
 -                    tmp1[kx]  = -factor*m2k;
 -                }
 -                
 -                for(kx=kxstart; kx<kxend; kx++)
 -                {
 -                    denom[kx] = 1.0/denom[kx];
 -                }
 +            for(kx=kxstart; kx<maxkx; kx++)
 +            {
 +                mx = kx;
 +
 +                mhxk      = mx * rxx;
 +                mhyk      = mx * ryx + my * ryy;
 +                mhzk      = mx * rzx + my * rzy + mz * rzz;
 +                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
 +                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
 +                tmp1[kx]  = -factor*m2k;
 +            }
  
 -                CALC_EXPONENTIALS(kxstart,kxend,tmp1);
 -               
 -                for(kx=kxstart; kx<kxend; kx++,p0++)
 -                {
 -                    d1      = p0->re;
 -                    d2      = p0->im;
 -                    
 -                    eterm    = ONE_4PI_EPS0/pme->epsilon_r*tmp1[kx]*denom[kx];
 -                    
 -                    p0->re  = d1*eterm;
 -                    p0->im  = d2*eterm;
 -                }
 +            for(kx=maxkx; kx<kxend; kx++)
 +            {
 +                mx = (kx - nx);
 +
 +                mhxk      = mx * rxx;
 +                mhyk      = mx * ryx + my * ryy;
 +                mhzk      = mx * rzx + my * rzy + mz * rzz;
 +                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
 +                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
 +                tmp1[kx]  = -factor*m2k;
 +            }
 +
 +            calc_exponentials(kxstart,kxend,elfac,denom,tmp1,eterm);
 +
 +            for(kx=kxstart; kx<kxend; kx++,p0++)
 +            {
 +                d1      = p0->re;
 +                d2      = p0->im;
 +
 +                p0->re  = d1*eterm[kx];
 +                p0->im  = d2*eterm[kx];
              }
          }
      }
 -    
 +
      if (bEnerVir)
      {
          /* Update virial with local values.
           * experiencing problems on semiisotropic membranes.
           * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
           */
 -        vir[XX][XX] = 0.25*virxx;
 -        vir[YY][YY] = 0.25*viryy;
 -        vir[ZZ][ZZ] = 0.25*virzz;
 -        vir[XX][YY] = vir[YY][XX] = 0.25*virxy;
 -        vir[XX][ZZ] = vir[ZZ][XX] = 0.25*virxz;
 -        vir[YY][ZZ] = vir[ZZ][YY] = 0.25*viryz;
 -        
 +        work->vir[XX][XX] = 0.25*virxx;
 +        work->vir[YY][YY] = 0.25*viryy;
 +        work->vir[ZZ][ZZ] = 0.25*virzz;
 +        work->vir[XX][YY] = work->vir[YY][XX] = 0.25*virxy;
 +        work->vir[XX][ZZ] = work->vir[ZZ][XX] = 0.25*virxz;
 +        work->vir[YY][ZZ] = work->vir[ZZ][YY] = 0.25*viryz;
 +
          /* This energy should be corrected for a charged system */
 -        *mesh_energy = 0.5*energy;
 +        work->energy = 0.5*energy;
      }
  
      /* Return the loop count */
 -    return local_ndata[YY]*local_ndata[ZZ]*local_ndata[XX];
 +    return local_ndata[YY]*local_ndata[XX];
  }
  
 +static void get_pme_ener_vir(const gmx_pme_t pme,int nthread,
 +                             real *mesh_energy,matrix vir)
 +{
 +    /* This function sums output over threads
 +     * and should therefore only be called after thread synchronization.
 +     */
 +    int thread;
 +
 +    *mesh_energy = pme->work[0].energy;
 +    copy_mat(pme->work[0].vir,vir);
 +
 +    for(thread=1; thread<nthread; thread++)
 +    {
 +        *mesh_energy += pme->work[thread].energy;
 +        m_add(vir,pme->work[thread].vir,vir);
 +    }
 +}
  
  #define DO_FSPLINE(order)                      \
  for(ithx=0; (ithx<order); ithx++)              \
 -{                                                                        \
 +{                                              \
      index_x = (i0+ithx)*pny*pnz;               \
      tx      = thx[ithx];                       \
      dx      = dthx[ithx];                      \
                                                 \
      for(ithy=0; (ithy<order); ithy++)          \
 -    {                                                                            \
 +    {                                          \
          index_xy = index_x+(j0+ithy)*pnz;      \
          ty       = thy[ithy];                  \
          dy       = dthy[ithy];                 \
          fxy1     = fz1 = 0;                    \
                                                 \
          for(ithz=0; (ithz<order); ithz++)      \
 -        {                                                                        \
 +        {                                      \
              gval  = grid[index_xy+(k0+ithz)];  \
              fxy1 += thz[ithz]*gval;            \
              fz1  += dthz[ithz]*gval;           \
  }
  
  
 -void gather_f_bsplines(gmx_pme_t pme,real *grid,
 -                       gmx_bool bClearF,pme_atomcomm_t *atc,real scale)
 +static void gather_f_bsplines(gmx_pme_t pme,real *grid,
 +                              gmx_bool bClearF,pme_atomcomm_t *atc,
 +                              splinedata_t *spline,
 +                              real scale)
  {
 -    /* sum forces for local particles */  
 +    /* sum forces for local particles */
      int     nn,n,ithx,ithy,ithz,i0,j0,k0;
      int     index_x,index_xy;
      int     nx,ny,nz,pnx,pny,pnz;
      int     norder;
      real    rxx,ryx,ryy,rzx,rzy,rzz;
      int     order;
 -    
 +
 +    pme_spline_work_t *work;
 +
 +    work = pme->spline_work;
 +
      order = pme->pme_order;
 -    thx   = atc->theta[XX];
 -    thy   = atc->theta[YY];
 -    thz   = atc->theta[ZZ];
 -    dthx  = atc->dtheta[XX];
 -    dthy  = atc->dtheta[YY];
 -    dthz  = atc->dtheta[ZZ];
 +    thx   = spline->theta[XX];
 +    thy   = spline->theta[YY];
 +    thz   = spline->theta[ZZ];
 +    dthx  = spline->dtheta[XX];
 +    dthy  = spline->dtheta[YY];
 +    dthz  = spline->dtheta[ZZ];
      nx    = pme->nkx;
      ny    = pme->nky;
      nz    = pme->nkz;
      pnx   = pme->pmegrid_nx;
      pny   = pme->pmegrid_ny;
      pnz   = pme->pmegrid_nz;
 -    
 +
      rxx   = pme->recipbox[XX][XX];
      ryx   = pme->recipbox[YY][XX];
      ryy   = pme->recipbox[YY][YY];
      rzy   = pme->recipbox[ZZ][YY];
      rzz   = pme->recipbox[ZZ][ZZ];
  
 -    for(nn=0; (nn<atc->n); nn++) 
 +    for(nn=0; nn<spline->n; nn++)
      {
 -        n = nn;
 -        qn      = scale*atc->q[n];
 -        
 -        if (bClearF) 
 +        n  = spline->ind[nn];
 +        qn = scale*atc->q[n];
 +
 +        if (bClearF)
          {
              atc->f[n][XX] = 0;
              atc->f[n][YY] = 0;
              atc->f[n][ZZ] = 0;
          }
 -        if (qn != 0) 
 +        if (qn != 0)
          {
              fx     = 0;
              fy     = 0;
              fz     = 0;
              idxptr = atc->idx[n];
 -            norder = n*order;
 -            
 -            i0   = idxptr[XX]; 
 +            norder = nn*order;
 +
 +            i0   = idxptr[XX];
              j0   = idxptr[YY];
              k0   = idxptr[ZZ];
 -            
 +
              /* Pointer arithmetic alert, next six statements */
 -            thx  = atc->theta[XX] + norder;
 -            thy  = atc->theta[YY] + norder;
 -            thz  = atc->theta[ZZ] + norder;
 -            dthx = atc->dtheta[XX] + norder;
 -            dthy = atc->dtheta[YY] + norder;
 -            dthz = atc->dtheta[ZZ] + norder;
 -            
 +            thx  = spline->theta[XX] + norder;
 +            thy  = spline->theta[YY] + norder;
 +            thz  = spline->theta[ZZ] + norder;
 +            dthx = spline->dtheta[XX] + norder;
 +            dthy = spline->dtheta[YY] + norder;
 +            dthz = spline->dtheta[ZZ] + norder;
 +
              switch (order) {
 -            case 4:  DO_FSPLINE(4);     break;
 -            case 5:  DO_FSPLINE(5);     break;
 -            default: DO_FSPLINE(order); break;
 +            case 4:
 +#ifdef PME_SSE
 +#ifdef PME_SSE_UNALIGNED
 +#define PME_GATHER_F_SSE_ORDER4
 +#else
 +#define PME_GATHER_F_SSE_ALIGNED
 +#define PME_ORDER 4
 +#endif
 +#include "pme_sse_single.h"
 +#else
 +                DO_FSPLINE(4);
 +#endif
 +                break;
 +            case 5:
 +#ifdef PME_SSE
 +#define PME_GATHER_F_SSE_ALIGNED
 +#define PME_ORDER 5
 +#include "pme_sse_single.h"
 +#else
 +                DO_FSPLINE(5);
 +#endif
 +                break;
 +            default:
 +                DO_FSPLINE(order);
 +                break;
              }
  
              atc->f[n][XX] += -qn*( fx*nx*rxx );
       */
  }
  
 +
  static real gather_energy_bsplines(gmx_pme_t pme,real *grid,
                                     pme_atomcomm_t *atc)
  {
 +    splinedata_t *spline;
      int     n,ithx,ithy,ithz,i0,j0,k0;
      int     index_x,index_xy;
      int *   idxptr;
      real    *thx,*thy,*thz;
      int     norder;
      int     order;
 -    
 -    
 +
 +    spline = &atc->spline[0];
 +
      order = pme->pme_order;
 -    
 +
      energy = 0;
      for(n=0; (n<atc->n); n++) {
          qn      = atc->q[n];
 -        
 +
          if (qn != 0) {
              idxptr = atc->idx[n];
              norder = n*order;
 -            
 -            i0   = idxptr[XX]; 
 +
 +            i0   = idxptr[XX];
              j0   = idxptr[YY];
              k0   = idxptr[ZZ];
 -            
 +
              /* Pointer arithmetic alert, next three statements */
 -            thx  = atc->theta[XX] + norder;
 -            thy  = atc->theta[YY] + norder;
 -            thz  = atc->theta[ZZ] + norder;
 +            thx  = spline->theta[XX] + norder;
 +            thy  = spline->theta[YY] + norder;
 +            thz  = spline->theta[ZZ] + norder;
  
              pot = 0;
              for(ithx=0; (ithx<order); ithx++)
      return energy;
  }
  
 +/* Macro to force loop unrolling by fixing order.
 + * This gives a significant performance gain.
 + *
 + * For each of the DIM dimensions the macro takes dr = xptr[j], builds
 + * the spline weights in the local array data[] by a recursion that
 + * raises the spline order one step at a time, and stores the weights
 + * in theta[j][i*order+k] and their derivatives in dtheta[j][i*order+k]
 + * for k = 0 .. order-1. The derivatives are taken from the data[]
 + * values that exist before the final recursion step is applied.
 + *
 + * The macro is not self-contained: i, xptr, theta and dtheta must be
 + * declared at the expansion site. The local arrays hold PME_ORDER_MAX
 + * entries, so order must not exceed PME_ORDER_MAX; data[1] is written
 + * unconditionally, so order is expected to be at least 2.
 + */
 +#define CALC_SPLINE(order)                     \
 +{                                              \
 +    int j,k,l;                                 \
 +    real dr,div;                               \
 +    real data[PME_ORDER_MAX];                  \
 +    real ddata[PME_ORDER_MAX];                 \
 +                                               \
 +    for(j=0; (j<DIM); j++)                     \
 +    {                                          \
 +        dr  = xptr[j];                         \
 +                                               \
 +        /* dr is relative offset from lower cell limit */ \
 +        data[order-1] = 0;                     \
 +        data[1] = dr;                          \
 +        data[0] = 1 - dr;                      \
 +                                               \
 +        for(k=3; (k<order); k++)               \
 +        {                                      \
 +            div = 1.0/(k - 1.0);               \
 +            data[k-1] = div*dr*data[k-2];      \
 +            for(l=1; (l<(k-1)); l++)           \
 +            {                                  \
 +                data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
 +                                   data[k-l-1]);                \
 +            }                                  \
 +            data[0] = div*(1-dr)*data[0];      \
 +        }                                      \
 +        /* differentiate, using the weights of order-1 */ \
 +        ddata[0] = -data[0];                   \
 +        for(k=1; (k<order); k++)               \
 +        {                                      \
 +            ddata[k] = data[k-1] - data[k];    \
 +        }                                      \
 +                                               \
 +        div = 1.0/(order - 1);                 \
 +        data[order-1] = div*dr*data[order-2];  \
 +        for(l=1; (l<(order-1)); l++)           \
 +        {                                      \
 +            data[order-l-1] = div*((dr+l)*data[order-l-2]+    \
 +                               (order-l-dr)*data[order-l-1]); \
 +        }                                      \
 +        data[0] = div*(1 - dr)*data[0];        \
 +                                               \
 +        for(k=0; k<order; k++)                 \
 +        {                                      \
 +            theta[j][i*order+k]  = data[k];    \
 +            dtheta[j][i*order+k] = ddata[k];   \
 +        }                                      \
 +    }                                          \
 +}
 +
  void make_bsplines(splinevec theta,splinevec dtheta,int order,
 -                   rvec fractx[],int nr,real charge[],
 +                   rvec fractx[],int nr,int ind[],real charge[],
                     gmx_bool bFreeEnergy)
  {
      /* construct splines for local atoms */
 -    int  i,j,k,l;
 -    real dr,div;
 -    real *data,*ddata,*xptr;
 -    
 -    for(i=0; (i<nr); i++) {
 +    int  i,ii;
 +    real *xptr;
 +
 +    for(i=0; i<nr; i++)
 +    {
          /* With free energy we do not use the charge check.
           * In most cases this will be more efficient than calling make_bsplines
           * twice, since usually more than half the particles have charges.
           */
 -        if (bFreeEnergy || charge[i] != 0.0) {
 -            xptr = fractx[i];
 -            for(j=0; (j<DIM); j++) {
 -                dr  = xptr[j];
 -                
 -                /* dr is relative offset from lower cell limit */
 -                data=&(theta[j][i*order]);
 -                data[order-1]=0;
 -                data[1]=dr;
 -                data[0]=1-dr;
 -                
 -                for(k=3; (k<order); k++) {
 -                    div=1.0/(k-1.0);    
 -                    data[k-1]=div*dr*data[k-2];
 -                    for(l=1; (l<(k-1)); l++) {
 -                        data[k-l-1]=div*((dr+l)*data[k-l-2]+(k-l-dr)*
 -                                         data[k-l-1]);
 -                    }
 -                    data[0]=div*(1-dr)*data[0];
 -                }
 -                /* differentiate */
 -                ddata    = &(dtheta[j][i*order]);
 -                ddata[0] = -data[0];
 -                for(k=1; (k<order); k++) {
 -                    ddata[k]=data[k-1]-data[k];
 -                }
 -                
 -                div=1.0/(order-1);
 -                data[order-1]=div*dr*data[order-2];
 -                for(l=1; (l<(order-1)); l++) {
 -                    data[order-l-1]=div*((dr+l)*data[order-l-2]+
 -                                         (order-l-dr)*data[order-l-1]);
 -                }
 -                data[0]=div*(1-dr)*data[0]; 
 +        ii = ind[i];
 +        if (bFreeEnergy || charge[ii] != 0.0) {
 +            xptr = fractx[ii];
 +            switch(order) {
 +            case 4:  CALC_SPLINE(4);     break;
 +            case 5:  CALC_SPLINE(5);     break;
 +            default: CALC_SPLINE(order); break;
              }
          }
      }
@@@ -2410,7 -1656,7 +2410,7 @@@ void make_dft_mod(real *mod,real *data,
  {
    int i,j;
    real sc,ss,arg;
 -    
 +
    for(i=0;i<ndata;i++) {
      sc=ss=0;
      for(j=0;j<ndata;j++) {
  }
  
  
 -
 -void make_bspline_moduli(splinevec bsp_mod,int nx,int ny,int nz,int order)
 +static void make_bspline_moduli(splinevec bsp_mod,
 +                                int nx,int ny,int nz,int order)
  {
    int nmax=max(nx,max(ny,nz));
    real *data,*ddata,*bsp_data;
    int i,k,l;
    real div;
 -    
 +
    snew(data,order);
    snew(ddata,order);
    snew(bsp_data,nmax);
    data[order-1]=0;
    data[1]=0;
    data[0]=1;
 -          
 +
    for(k=3;k<order;k++) {
      div=1.0/(k-1.0);
      data[k-1]=0;
    data[order-1]=0;
    for(l=1;l<(order-1);l++)
      data[order-l-1]=div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
 -  data[0]=div*data[0]; 
 +  data[0]=div*data[0];
  
    for(i=0;i<nmax;i++)
      bsp_data[i]=0;
    for(i=1;i<=order;i++)
      bsp_data[i]=data[i-1];
 -    
 +
    make_dft_mod(bsp_mod[XX],bsp_data,nx);
    make_dft_mod(bsp_mod[YY],bsp_data,ny);
    make_dft_mod(bsp_mod[ZZ],bsp_data,nz);
    sfree(bsp_data);
  }
  
 -static void setup_coordinate_communication(pme_atomcomm_t *atc)
 +
 +/* Return the P3M optimal influence function
 + *
 + * z     : argument of the polynomial; only z*z and its powers are used
 + * order : interpolation order, supported values are 2 through 8
 + *
 + * Returns the value of the order-specific polynomial in z*z,
 + * or 0.0 when order is outside the range 2..8.
 + */
 +static double do_p3m_influence(double z, int order)
  {
 -  int nslab,n,i;
 -  int fw,bw;
 +    double z2,z4;
  
 -  nslab = atc->nslab;
 +    z2 = z*z;
 +    z4 = z2*z2;
  
 -  n = 0;
 -  for(i=1; i<=nslab/2; i++) {
 -    fw = (atc->nodeid + i) % nslab;
 -    bw = (atc->nodeid - i + nslab) % nslab;
 -    if (n < nslab - 1) {
 -      atc->node_dest[n] = fw;
 -      atc->node_src[n]  = bw;
 -      n++;
 -    } 
 +    /* The formula and most constants can be found in:
 +     * Ballenegger et al., JCTC 8, 936 (2012)
 +     *
 +     * Every case returns directly, so the break statements
 +     * (absent for order 7) are never reached.
 +     */
 +    switch(order)
 +    {
 +    case 2:
 +        return 1.0 - 2.0*z2/3.0;
 +        break;
 +    case 3:
 +        return 1.0 - z2 + 2.0*z4/15.0;
 +        break;
 +    case 4:
 +        return 1.0 - 4.0*z2/3.0 + 2.0*z4/5.0 + 4.0*z2*z4/315.0;
 +        break;
 +    case 5:
 +        return 1.0 - 5.0*z2/3.0 + 7.0*z4/9.0 - 17.0*z2*z4/189.0 + 2.0*z4*z4/2835.0;
 +        break;
 +    case 6:
 +        return 1.0 - 2.0*z2 + 19.0*z4/15.0 - 256.0*z2*z4/945.0 + 62.0*z4*z4/4725.0 + 4.0*z2*z4*z4/155925.0;
 +        break;
 +    case 7:
 +        return 1.0 - 7.0*z2/3.0 + 28.0*z4/15.0 - 16.0*z2*z4/27.0 + 26.0*z4*z4/405.0 - 2.0*z2*z4*z4/1485.0 + 4.0*z4*z4*z4/6081075.0;
 +    case 8:
 +        return 1.0 - 8.0*z2/3.0 + 116.0*z4/45.0 - 344.0*z2*z4/315.0 + 914.0*z4*z4/4725.0 - 248.0*z4*z4*z2/22275.0 + 21844.0*z4*z4*z4/212837625.0 - 8.0*z4*z4*z4*z2/638512875.0;
 +        break;
 +    }
 +
 +    return 0.0;   /* order not in 2..8 */
 +}
 +
 +/* Calculate the P3M B-spline moduli for one dimension
 + *
 + * bsp_mod : output array; entries 0 .. n-1 are written
 + * n       : number of entries to fill (the callers in this file pass
 + *           the grid size of the dimension)
 + * order   : interpolation order; values above 8 are a fatal error
 + *
 + * Entry 0 is set to 1. Every other entry is
 + * infl*infl*(sin(zai)/zai)^(-2*order) with zai = i*M_PI/n and
 + * infl = do_p3m_influence(sin(zai),order). The upper part of the
 + * array is addressed as bsp_mod[n+i] with negative i.
 + */
 +static void make_p3m_bspline_moduli_dim(real *bsp_mod,int n,int order)
 +{
 +    double zarg,zai,sinzai,infl;
 +    int    maxk,i;
 +
 +    if (order > 8)
 +    {
 +        gmx_fatal(FARGS,"The current P3M code only supports orders up to 8");
 +    }
 +
 +    zarg = M_PI/n;
 +
 +    maxk = (n + 1)/2;   /* n/2 rounded up */
 +
 +    for(i=-maxk; i<0; i++)   /* fills bsp_mod[n-maxk] .. bsp_mod[n-1] */
 +    {
 +        zai    = zarg*i;
 +        sinzai = sin(zai);
 +        infl   = do_p3m_influence(sinzai,order);
 +        bsp_mod[n+i] = infl*infl*pow(sinzai/zai,-2.0*order);
 +    }
 +    bsp_mod[0] = 1.0;
 +    for(i=1; i<maxk; i++)   /* for odd n this overwrites bsp_mod[maxk-1] set above */
 +    {
 +        zai    = zarg*i;
 +        sinzai = sin(zai);
 +        infl   = do_p3m_influence(sinzai,order);
 +        bsp_mod[i] = infl*infl*pow(sinzai/zai,-2.0*order);
 +    }
 +}
 +
 +/* Calculate the P3M B-spline moduli
 + *
 + * Fills bsp_mod[XX], bsp_mod[YY] and bsp_mod[ZZ] by calling
 + * make_p3m_bspline_moduli_dim with sizes nx, ny and nz respectively;
 + * the same order is used for all three dimensions.
 + */
 +static void make_p3m_bspline_moduli(splinevec bsp_mod,
 +                                    int nx,int ny,int nz,int order)
 +{
 +    make_p3m_bspline_moduli_dim(bsp_mod[XX],nx,order);
 +    make_p3m_bspline_moduli_dim(bsp_mod[YY],ny,order);
 +    make_p3m_bspline_moduli_dim(bsp_mod[ZZ],nz,order);
 +}
 +
 +
 +static void setup_coordinate_communication(pme_atomcomm_t *atc)
 +{
 +  int nslab,n,i;
 +  int fw,bw;
 +
 +  nslab = atc->nslab;
 +
 +  n = 0;
 +  for(i=1; i<=nslab/2; i++) {
 +    fw = (atc->nodeid + i) % nslab;
 +    bw = (atc->nodeid - i + nslab) % nslab;
 +    if (n < nslab - 1) {
 +      atc->node_dest[n] = fw;
 +      atc->node_src[n]  = bw;
 +      n++;
 +    }
      if (n < nslab - 1) {
        atc->node_dest[n] = bw;
        atc->node_src[n]  = fw;
  
  int gmx_pme_destroy(FILE *log,gmx_pme_t *pmedata)
  {
 +    int thread;
 +
      if(NULL != log)
      {
          fprintf(log,"Destroying PME data structures.\n");
      sfree((*pmedata)->nnx);
      sfree((*pmedata)->nny);
      sfree((*pmedata)->nnz);
 -      
 -    sfree((*pmedata)->pmegridA);
 +
 +    pmegrids_destroy(&(*pmedata)->pmegridA);
 +
      sfree((*pmedata)->fftgridA);
      sfree((*pmedata)->cfftgridA);
      gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupA);
 -    
 -    if((*pmedata)->pmegridB)
 +
 +    if ((*pmedata)->pmegridB.grid.grid != NULL)
      {
 -        sfree((*pmedata)->pmegridB);
 +        pmegrids_destroy(&(*pmedata)->pmegridB);
          sfree((*pmedata)->fftgridB);
          sfree((*pmedata)->cfftgridB);
          gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB);
      }
 -    sfree((*pmedata)->work_mhz);
 -    sfree((*pmedata)->work_m2);
 -    sfree((*pmedata)->work_denom);
 -    sfree((*pmedata)->work_tmp1_alloc);
 -    sfree((*pmedata)->work_m2inv);
 -      
 +    for(thread=0; thread<(*pmedata)->nthread; thread++)
 +    {
 +        free_work(&(*pmedata)->work[thread]);
 +    }
 +    sfree((*pmedata)->work);
 +
      sfree(*pmedata);
      *pmedata = NULL;
 -  
 +
    return 0;
  }
  
@@@ -2642,7 -1804,7 +2642,7 @@@ static double pme_load_imbalance(gmx_pm
  static void init_atomcomm(gmx_pme_t pme,pme_atomcomm_t *atc, t_commrec *cr,
                            int dimind,gmx_bool bSpread)
  {
 -    int nk,k,s;
 +    int nk,k,s,thread;
  
      atc->dimind = dimind;
      atc->nslab  = 1;
          snew(atc->node_dest,atc->nslab);
          snew(atc->node_src,atc->nslab);
          setup_coordinate_communication(atc);
 -        
 -        snew(atc->count,atc->nslab);
 +
 +        snew(atc->count_thread,pme->nthread);
 +        for(thread=0; thread<pme->nthread; thread++)
 +        {
 +            snew(atc->count_thread[thread],atc->nslab);
 +        }
 +        atc->count = atc->count_thread[0];
          snew(atc->rcount,atc->nslab);
          snew(atc->buf_index,atc->nslab);
      }
 +
 +    atc->nthread = pme->nthread;
 +    if (atc->nthread > 1)
 +    {
 +        snew(atc->thread_plist,atc->nthread);
 +    }
 +    snew(atc->spline,atc->nthread);
 +    for(thread=0; thread<atc->nthread; thread++)
 +    {
 +        if (atc->nthread > 1)
 +        {
 +            snew(atc->thread_plist[thread].n,atc->nthread+2*GMX_CACHE_SEP);
 +            atc->thread_plist[thread].n += GMX_CACHE_SEP;
 +        }
 +    }
  }
  
 -static void 
 +static void
  init_overlap_comm(pme_overlap_t *  ol,
                    int              norder,
  #ifdef GMX_MPI
 -                  MPI_Comm         comm,  
 +                  MPI_Comm         comm,
  #endif
 -                  int              nnodes, 
 +                  int              nnodes,
                    int              nodeid,
 -                  int              ndata)
 +                  int              ndata,
 +                  int              commplainsize)
  {
      int lbnd,rbnd,maxlr,b,i;
      int exten;
      pme_grid_comm_t *pgc;
      gmx_bool bCont;
      int fft_start,fft_end,send_index1,recv_index1;
 -    
 +
  #ifdef GMX_MPI
      ol->mpi_comm = comm;
  #endif
 -    
 +
      ol->nnodes = nnodes;
      ol->nodeid = nodeid;
  
      snew(ol->s2g0,ol->nnodes+1);
      snew(ol->s2g1,ol->nnodes);
      if (debug) { fprintf(debug,"PME slab boundaries:"); }
 -    for(i=0; i<nnodes; i++) 
 +    for(i=0; i<nnodes; i++)
      {
          /* s2g0 the local interpolation grid start.
           * s2g1 the local interpolation grid end.
          pgc->recv_index0 = fft_start;
          pgc->recv_nindex = max(0,recv_index1 - pgc->recv_index0);
      }
 +
 +    /* For non-divisible grid we need pme_order instead of pme_order-1 */
 +    snew(ol->sendbuf,norder*commplainsize);
 +    snew(ol->recvbuf,norder*commplainsize);
  }
  
  static void
@@@ -2848,7 -1985,7 +2848,7 @@@ make_gridindex5_to_localindex(int n,in
                  if (gtl[i] == n-1)
                  {
                      gtl[i] = 0;
 -                    fsh[i] = -1; 
 +                    fsh[i] = -1;
                  }
                  else if (gtl[i] == local_range)
                  {
      *fraction_shift  = fsh;
  }
  
 +static pme_spline_work_t *make_pme_spline_work(int order)
 +{   /* Create the spline work data for the given spline order.
 +     * With PME_SSE defined, one pme_spline_work_t is allocated and its
 +     * mask_SSE0/mask_SSE1 tables are filled; without PME_SSE nothing
 +     * is allocated and NULL is returned.
 +     */
 +    pme_spline_work_t *work;
 +
 +#ifdef PME_SSE
 +    float  tmp[8];
 +    __m128 zero_SSE;
 +    int    of,i;
 +
 +    snew_aligned(work,1,16);   /* presumably a 16-byte aligned allocation - confirm */
 +
 +    zero_SSE = _mm_setzero_ps();
 +
 +    /* Generate bit masks to mask out the unused grid entries,
 +     * as we only operate on order of the 8 grid entries that are
 +     * loaded into 2 SSE float registers.
 +     * For each offset of, tmp holds 1 at positions of .. of+order-1
 +     * and 0 elsewhere; the greater-than-zero comparison turns the two
 +     * halves of tmp into the masks for the first and second register.
 +     */
 +    for(of=0; of<8-(order-1); of++)
 +    {
 +        for(i=0; i<8; i++)
 +        {
 +            tmp[i] = (i >= of && i < of+order ? 1 : 0);
 +        }
 +        work->mask_SSE0[of] = _mm_loadu_ps(tmp);
 +        work->mask_SSE1[of] = _mm_loadu_ps(tmp+4);
 +        work->mask_SSE0[of] = _mm_cmpgt_ps(work->mask_SSE0[of],zero_SSE);
 +        work->mask_SSE1[of] = _mm_cmpgt_ps(work->mask_SSE1[of],zero_SSE);
 +    }
 +#else
 +    work = NULL;
 +#endif
 +
 +    return work;
 +}
 +
  static void
  gmx_pme_check_grid_restrictions(FILE *fplog,char dim,int nnodes,int *nk)
  {
              gmx_fatal(FARGS,"The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). The grid size would have to be increased by more than 50%% to make the grid divisible. Change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).",
                        dim,*nk,dim,nnodes,dim);
          }
 -        
 +
          if (fplog != NULL)
          {
              fprintf(fplog,"\nNOTE: The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). Increasing the PME grid size in dim %c to %d. This will increase the accuracy and will not decrease the performance significantly on this number of nodes. For optimal performance change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).\n\n",
                      dim,*nk,dim,nnodes,dim,nk_new,dim);
          }
 -            
 +
          *nk = nk_new;
      }
  }
@@@ -2929,36 -2031,36 +2929,36 @@@ int gmx_pme_init(gmx_pme_t *         pm
                   int                 nnodes_minor,
                   t_inputrec *        ir,
                   int                 homenr,
 -                 gmx_bool                bFreeEnergy,
 -                 gmx_bool                bReproducible)
 +                 gmx_bool            bFreeEnergy,
 +                 gmx_bool            bReproducible,
 +                 int                 nthread)
  {
      gmx_pme_t pme=NULL;
 -    
 +
      pme_atomcomm_t *atc;
 -    int bufsizex,bufsizey,bufsize;
      ivec ndata;
 -    
 +
      if (debug)
          fprintf(debug,"Creating PME data structures.\n");
      snew(pme,1);
 -        
 +
      pme->redist_init         = FALSE;
      pme->sum_qgrid_tmp       = NULL;
      pme->sum_qgrid_dd_tmp    = NULL;
      pme->buf_nalloc          = 0;
      pme->redist_buf_nalloc   = 0;
 -    
 +
      pme->nnodes              = 1;
      pme->bPPnode             = TRUE;
 -    
 +
      pme->nnodes_major        = nnodes_major;
      pme->nnodes_minor        = nnodes_minor;
  
  #ifdef GMX_MPI
 -    if (nnodes_major*nnodes_minor > 1 && PAR(cr)) 
 +    if (nnodes_major*nnodes_minor > 1)
      {
          pme->mpi_comm = cr->mpi_comm_mygroup;
 -        
 +
          MPI_Comm_rank(pme->mpi_comm,&pme->nodeid);
          MPI_Comm_size(pme->mpi_comm,&pme->nnodes);
          if (pme->nnodes != nnodes_major*nnodes_minor)
  
      if (pme->nnodes == 1)
      {
+ #ifdef GMX_MPI
+         pme->mpi_comm_d[0] = MPI_COMM_NULL;
+         pme->mpi_comm_d[1] = MPI_COMM_NULL;
+ #endif
          pme->ndecompdim = 0;
          pme->nodeid_major = 0;
          pme->nodeid_minor = 0;
 +#ifdef GMX_MPI
 +        pme->mpi_comm_d[0] = pme->mpi_comm_d[1] = MPI_COMM_NULL;
 +#endif
      }
      else
      {
              pme->ndecompdim = 1;
              pme->nodeid_major = pme->nodeid;
              pme->nodeid_minor = 0;
 -            
 +
          }
          else if (nnodes_major == 1)
          {
              pme->nodeid_major = 0;
              pme->nodeid_minor = pme->nodeid;
          }
 -        else 
 +        else
          {
              if (pme->nnodes % nnodes_major != 0)
              {
                  gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
              }
              pme->ndecompdim = 2;
 -            
 +
  #ifdef GMX_MPI
              MPI_Comm_split(pme->mpi_comm,pme->nodeid % nnodes_minor,
                             pme->nodeid,&pme->mpi_comm_d[0]);  /* My communicator along major dimension */
              MPI_Comm_split(pme->mpi_comm,pme->nodeid/nnodes_minor,
                             pme->nodeid,&pme->mpi_comm_d[1]);  /* My communicator along minor dimension */
 -            
 +
              MPI_Comm_rank(pme->mpi_comm_d[0],&pme->nodeid_major);
              MPI_Comm_size(pme->mpi_comm_d[0],&pme->nnodes_major);
              MPI_Comm_rank(pme->mpi_comm_d[1],&pme->nodeid_minor);
          }
          pme->bPPnode = (cr->duty & DUTY_PP);
      }
 -    
 +
 +    pme->nthread = nthread;
 +
      if (ir->ePBC == epbcSCREW)
      {
          gmx_fatal(FARGS,"pme does not (yet) work with pbc = screw");
      }
 -    
 +
      pme->bFEP        = ((ir->efep != efepNO) && bFreeEnergy);
      pme->nkx         = ir->nkx;
      pme->nky         = ir->nky;
      pme->nkz         = ir->nkz;
 +    pme->bP3M        = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL);
      pme->pme_order   = ir->pme_order;
      pme->epsilon_r   = ir->epsilon_r;
 -    
 +
 +    if (pme->pme_order > PME_ORDER_MAX)
 +    {
 +        gmx_fatal(FARGS,"pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
 +                  pme->pme_order,PME_ORDER_MAX);
 +    }
 +
      /* Currently pme.c supports only the fft5d FFT code.
       * Therefore the grid always needs to be divisible by nnodes.
       * When the old 1D code is also supported again, change this check.
          pme->nky <= pme->pme_order*(pme->nnodes_minor > 1 ? 2 : 1) ||
          pme->nkz <= pme->pme_order)
      {
 -        gmx_fatal(FARGS,"The pme grid dimensions need to be larger than pme_order (%d) and in parallel larger than 2*pme_order for x and/or y",pme->pme_order);
 +        gmx_fatal(FARGS,"The pme grid dimensions need to be larger than pme_order (%d) and in parallel larger than 2*pme_ordern for x and/or y",pme->pme_order);
      }
  
      if (pme->nnodes > 1) {
          double imbal;
  
  #ifdef GMX_MPI
 -        MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
 -        MPI_Type_commit(&(pme->rvec_mpi));
 +        MPI_Type_contiguous(DIM, mpi_type, &(pme->rvec_mpi));
 +        MPI_Type_commit(&(pme->rvec_mpi));
 +#endif
 +
 +        /* Note that the charge spreading and force gathering, which usually
 +         * takes about the same amount of time as FFT+solve_pme,
 +         * is always fully load balanced
 +         * (unless the charge distribution is inhomogeneous).
 +         */
 +
 +        imbal = pme_load_imbalance(pme);
 +        if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
 +        {
 +            fprintf(stderr,
 +                    "\n"
 +                    "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
 +                    "      For optimal PME load balancing\n"
 +                    "      PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
 +                    "      and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
 +                    "\n",
 +                    (int)((imbal-1)*100 + 0.5),
 +                    pme->nkx,pme->nky,pme->nnodes_major,
 +                    pme->nky,pme->nkz,pme->nnodes_minor);
 +        }
 +    }
 +
 +    /* For non-divisible grid we need pme_order instead of pme_order-1 */
 +    /* In sum_qgrid_dd x overlap is copied in place: take padding into account.
 +     * y is always copied through a buffer: we don't need padding in z,
 +     * but we do need the overlap in x because of the communication order.
 +     */
 +    init_overlap_comm(&pme->overlap[0],pme->pme_order,
 +#ifdef GMX_MPI
 +                      pme->mpi_comm_d[0],
 +#endif
 +                      pme->nnodes_major,pme->nodeid_major,
 +                      pme->nkx,
 +                      (div_round_up(pme->nky,pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
 +
 +    init_overlap_comm(&pme->overlap[1],pme->pme_order,
 +#ifdef GMX_MPI
 +                      pme->mpi_comm_d[1],
 +#endif
 +                      pme->nnodes_minor,pme->nodeid_minor,
 +                      pme->nky,
 +                      (div_round_up(pme->nkx,pme->nnodes_major)+pme->pme_order)*pme->nkz);
 +
 +    /* Check for a limitation of the (current) sum_fftgrid_dd code */
 +    if (pme->nthread > 1 &&
 +        (pme->overlap[0].noverlap_nodes > 1 ||
 +         pme->overlap[1].noverlap_nodes > 1))
 +    {
 +        gmx_fatal(FARGS,"With threads the number of grid lines per node along x and or y should be pme_order (%d) or more or exactly pme_order-1",pme->pme_order);
 +    }
 +
 +    snew(pme->bsp_mod[XX],pme->nkx);
 +    snew(pme->bsp_mod[YY],pme->nky);
 +    snew(pme->bsp_mod[ZZ],pme->nkz);
 +
 +    /* The required size of the interpolation grid, including overlap.
 +     * The allocated size (pmegrid_n?) might be slightly larger.
 +     */
 +    pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
 +                      pme->overlap[0].s2g0[pme->nodeid_major];
 +    pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
 +                      pme->overlap[1].s2g0[pme->nodeid_minor];
 +    pme->pmegrid_nz_base = pme->nkz;
 +    pme->pmegrid_nz = pme->pmegrid_nz_base + pme->pme_order - 1;
 +    set_grid_alignment(&pme->pmegrid_nz,pme->pme_order);
 +
 +    pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
 +    pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
 +    pme->pmegrid_start_iz = 0;
 +
 +    make_gridindex5_to_localindex(pme->nkx,
 +                                  pme->pmegrid_start_ix,
 +                                  pme->pmegrid_nx - (pme->pme_order-1),
 +                                  &pme->nnx,&pme->fshx);
 +    make_gridindex5_to_localindex(pme->nky,
 +                                  pme->pmegrid_start_iy,
 +                                  pme->pmegrid_ny - (pme->pme_order-1),
 +                                  &pme->nny,&pme->fshy);
 +    make_gridindex5_to_localindex(pme->nkz,
 +                                  pme->pmegrid_start_iz,
 +                                  pme->pmegrid_nz_base,
 +                                  &pme->nnz,&pme->fshz);
 +
 +    pmegrids_init(&pme->pmegridA,
 +                  pme->pmegrid_nx,pme->pmegrid_ny,pme->pmegrid_nz,
 +                  pme->pmegrid_nz_base,
 +                  pme->pme_order,
 +                  pme->nthread,
 +                  pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
 +                  pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
 +
 +    pme->spline_work = make_pme_spline_work(pme->pme_order);
 +
 +    ndata[0] = pme->nkx;
 +    ndata[1] = pme->nky;
 +    ndata[2] = pme->nkz;
 +
 +    /* This routine will allocate the grid data to fit the FFTs */
 +    gmx_parallel_3dfft_init(&pme->pfft_setupA,ndata,
 +                            &pme->fftgridA,&pme->cfftgridA,
 +                            pme->mpi_comm_d,
 +                            pme->overlap[0].s2g0,pme->overlap[1].s2g0,
 +                            bReproducible,pme->nthread);
 +
 +    if (bFreeEnergy)
 +    {
 +        pmegrids_init(&pme->pmegridB,
 +                      pme->pmegrid_nx,pme->pmegrid_ny,pme->pmegrid_nz,
 +                      pme->pmegrid_nz_base,
 +                      pme->pme_order,
 +                      pme->nthread,
 +                      pme->nkx % pme->nnodes_major != 0,
 +                      pme->nky % pme->nnodes_minor != 0);
 +
 +        gmx_parallel_3dfft_init(&pme->pfft_setupB,ndata,
 +                                &pme->fftgridB,&pme->cfftgridB,
 +                                pme->mpi_comm_d,
 +                                pme->overlap[0].s2g0,pme->overlap[1].s2g0,
 +                                bReproducible,pme->nthread);
 +    }
 +    else
 +    {
 +        pme->pmegridB.grid.grid = NULL;
 +        pme->fftgridB           = NULL;
 +        pme->cfftgridB          = NULL;
 +    }
 +
 +    if (!pme->bP3M)
 +    {
 +        /* Use plain SPME B-spline interpolation */
 +        make_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
 +    }
 +    else
 +    {
 +        /* Use the P3M grid-optimized influence function */
 +        make_p3m_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
 +    }
 +
 +    /* Use atc[0] for spreading */
 +    init_atomcomm(pme,&pme->atc[0],cr,nnodes_major > 1 ? 0 : 1,TRUE);
 +    if (pme->ndecompdim >= 2)
 +    {
 +        init_atomcomm(pme,&pme->atc[1],cr,1,FALSE);
 +    }
 +
 +    if (pme->nnodes == 1) {
 +        pme->atc[0].n = homenr;
 +        pme_realloc_atomcomm_things(&pme->atc[0]);
 +    }
 +
 +    {
 +        int thread;
 +
 +        /* Use fft5d, order after FFT is y major, z, x minor */
 +
 +        snew(pme->work,pme->nthread);
 +        for(thread=0; thread<pme->nthread; thread++)
 +        {
 +            realloc_work(&pme->work[thread],pme->nkx);
 +        }
 +    }
 +
 +    *pmedata = pme;
 +
 +    return 0;
 +}
 +
 +
 +static void copy_local_grid(gmx_pme_t pme,
 +                            pmegrids_t *pmegrids,int thread,real *fftgrid)
 +{
 +    ivec local_fft_ndata,local_fft_offset,local_fft_size;
 +    int  fft_my,fft_mz;
 +    int  nsx,nsy,nsz;
 +    ivec nf;
 +    int  offx,offy,offz,x,y,z,i0,i0t;
 +    int  d;
 +    pmegrid_t *pmegrid;
 +    real *grid_th;
 +    /* Copy data from the thread-local grid pmegrids->grid_th[thread]
 +     * into fftgrid by plain assignment (nothing is accumulated here).
 +     * The destination position follows from the thread grid offset and
 +     * from the fftgrid y/z sizes queried from pme->pfft_setupA.
 +     */
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
 +                                   local_fft_ndata,
 +                                   local_fft_offset,
 +                                   local_fft_size);
 +    fft_my = local_fft_size[YY]; /* y stride of fftgrid, see i0 below */
 +    fft_mz = local_fft_size[ZZ]; /* z stride of fftgrid, see i0 below */
 +
 +    pmegrid = &pmegrids->grid_th[thread];
 +
 +    nsx = pmegrid->n[XX]; /* assigned but not used in this function */
 +    nsy = pmegrid->n[YY]; /* y stride of the thread grid, see i0t below */
 +    nsz = pmegrid->n[ZZ]; /* z stride of the thread grid, see i0t below */
 +
 +    for(d=0; d<DIM; d++)
 +    {
 +        /* Number of points to copy along d: the thread grid size minus
 +         * order-1 points, limited to what remains of local_fft_ndata[d]
 +         * beyond the offset of this thread grid.
 +         */
 +        nf[d] = min(pmegrid->n[d] - (pmegrid->order - 1),
 +                    local_fft_ndata[d] - pmegrid->offset[d]);
 +    }
 +
 +    offx = pmegrid->offset[XX];
 +    offy = pmegrid->offset[YY];
 +    offz = pmegrid->offset[ZZ];
 +
 +    /* Directly copy the non-overlapping parts of the local grids.
 +     * This also initializes the full grid.
 +     */
 +    grid_th = pmegrid->grid;
 +    for(x=0; x<nf[XX]; x++)
 +    {
 +        for(y=0; y<nf[YY]; y++)
 +        {
 +            /* i0 is the start of the z-row in fftgrid (shifted by the
 +             * thread grid offset), i0t the start of the same row in
 +             * the thread grid.
 +             */
 +            i0  = ((offx + x)*fft_my + (offy + y))*fft_mz + offz;
 +            i0t = (x*nsy + y)*nsz;
 +            for(z=0; z<nf[ZZ]; z++)
 +            {
 +                fftgrid[i0+z] = grid_th[i0t+z];
 +            }
 +        }
 +    }
 +}
 +
 +static void print_sendbuf(gmx_pme_t pme,real *sendbuf)
 +{
 +    ivec local_fft_ndata,local_fft_offset,local_fft_size;
 +    pme_overlap_t *overlap;
 +    int datasize,nind;
 +    int i,x,y,z,n;
 +    /* Print to stdout a table with one row per x index of sendbuf
 +     * (nind rows) and one column per y index (local_fft_ndata[YY] columns).
 +     * Each entry is the number of non-zero values found along z.
 +     * sendbuf is read consecutively in x, y, z order.
 +     * Note that datasize is declared but not used.
 +     */
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
 +                                   local_fft_ndata,
 +                                   local_fft_offset,
 +                                   local_fft_size);
 +    /* Major dimension */
 +    overlap = &pme->overlap[0];
 +
 +    nind   = overlap->comm_data[0].send_nindex;
 +
 +    /* Header line with the y indices */
 +    for(y=0; y<local_fft_ndata[YY]; y++) {
 +         printf(" %2d",y);
 +    }
 +    printf("\n");
 +
 +    i = 0; /* running index into sendbuf */
 +    for(x=0; x<nind; x++) {
 +        for(y=0; y<local_fft_ndata[YY]; y++) {
 +            n = 0; /* count of non-zero entries in this z-row */
 +            for(z=0; z<local_fft_ndata[ZZ]; z++) {
 +                if (sendbuf[i] != 0) n++;
 +                i++;
 +            }
 +            printf(" %2d",n);
 +        }
 +        printf("\n");
 +    }
 +}
 +
 +static void
 +reduce_threadgrid_overlap(gmx_pme_t pme,
 +                          const pmegrids_t *pmegrids,int thread,
 +                          real *fftgrid,real *commbuf_x,real *commbuf_y)
 +{
 +    ivec local_fft_ndata,local_fft_offset,local_fft_size;
 +    int  fft_nx,fft_ny,fft_nz;
 +    int  fft_my,fft_mz;
 +    int  buf_my=-1;
 +    int  nsx,nsy,nsz;
 +    ivec ne;
 +    int  offx,offy,offz,x,y,z,i0,i0t;
 +    int  sx,sy,sz,fx,fy,fz,tx1,ty1,tz1,ox,oy,oz;
 +    gmx_bool bClearBufX,bClearBufY,bClearBufXY,bClearBuf;
 +    gmx_bool bCommX,bCommY;
 +    int  d;
 +    int  thread_f;
 +    const pmegrid_t *pmegrid,*pmegrid_g,*pmegrid_f;
 +    const real *grid_th;
 +    real *commbuf=NULL;
 +    /* Contributions for which neither bCommX nor bCommY is set are summed
 +     * into fftgrid. The others are written to a communication buffer:
 +     * commbuf_y whenever bCommY is set, commbuf_x when only bCommX is set
 +     * (see the conditional further down).
 +     */
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
 +                                   local_fft_ndata,
 +                                   local_fft_offset,
 +                                   local_fft_size);
 +    fft_nx = local_fft_ndata[XX];
 +    fft_ny = local_fft_ndata[YY];
 +    fft_nz = local_fft_ndata[ZZ];
 +
 +    fft_my = local_fft_size[YY];
 +    fft_mz = local_fft_size[ZZ];
 +
 +    /* This routine is called when all threads have finished spreading.
 +     * Here each thread sums grid contributions calculated by other threads
 +     * to the thread local grid volume.
 +     * To minimize the number of grid copying operations,
 +     * this routine sums immediately from the pmegrid to the fftgrid.
 +     */
 +
 +    /* Determine which part of the full node grid we should operate on,
 +     * this is our thread local part of the full grid.
 +     */
 +    pmegrid = &pmegrids->grid_th[thread];
 +
 +    for(d=0; d<DIM; d++)
 +    {
 +        ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
 +                    local_fft_ndata[d]);
 +    }
 +
 +    offx = pmegrid->offset[XX];
 +    offy = pmegrid->offset[YY];
 +    offz = pmegrid->offset[ZZ];
 +
 +
 +    /* Each flag is TRUE until the corresponding buffer region has been
 +     * written once; the first write assigns, later writes accumulate.
 +     */
 +    bClearBufX  = TRUE;
 +    bClearBufY  = TRUE;
 +    bClearBufXY = TRUE;
 +
 +    /* Now loop over all the thread data blocks that contribute
 +     * to the grid region we (our thread) are operating on.
 +     */
 +    /* Note that fft_nx/y is equal to the number of grid points
 +     * between the first point of our node grid and the one of the next node.
 +     */
 +    for(sx=0; sx>=-pmegrids->nthread_comm[XX]; sx--)
 +    {
 +        fx = pmegrid->ci[XX] + sx;
 +        ox = 0;
 +        bCommX = FALSE;
 +        if (fx < 0) {
 +            /* Wrap around to the last thread cell along x */
 +            fx += pmegrids->nc[XX];
 +            ox -= fft_nx;
 +            bCommX = (pme->nnodes_major > 1);
 +        }
 +        pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
 +        ox += pmegrid_g->offset[XX];
 +        if (!bCommX)
 +        {
 +            tx1 = min(ox + pmegrid_g->n[XX],ne[XX]);
 +        }
 +        else
 +        {
 +            tx1 = min(ox + pmegrid_g->n[XX],pme->pme_order);
 +        }
 +
 +        for(sy=0; sy>=-pmegrids->nthread_comm[YY]; sy--)
 +        {
 +            fy = pmegrid->ci[YY] + sy;
 +            oy = 0;
 +            bCommY = FALSE;
 +            if (fy < 0) {
 +                /* Wrap around to the last thread cell along y */
 +                fy += pmegrids->nc[YY];
 +                oy -= fft_ny;
 +                bCommY = (pme->nnodes_minor > 1);
 +            }
 +            pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
 +            oy += pmegrid_g->offset[YY];
 +            if (!bCommY)
 +            {
 +                ty1 = min(oy + pmegrid_g->n[YY],ne[YY]);
 +            }
 +            else
 +            {
 +                ty1 = min(oy + pmegrid_g->n[YY],pme->pme_order);
 +            }
 +
 +            for(sz=0; sz>=-pmegrids->nthread_comm[ZZ]; sz--)
 +            {
 +                fz = pmegrid->ci[ZZ] + sz;
 +                oz = 0;
 +                if (fz < 0)
 +                {
 +                    /* Wrap around along z; no communication flag is set
 +                     * for this dimension.
 +                     */
 +                    fz += pmegrids->nc[ZZ];
 +                    oz -= fft_nz;
 +                }
 +                pmegrid_g = &pmegrids->grid_th[fz];
 +                oz += pmegrid_g->offset[ZZ];
 +                tz1 = min(oz + pmegrid_g->n[ZZ],ne[ZZ]);
 +
 +                if (sx == 0 && sy == 0 && sz == 0)
 +                {
 +                    /* We have already added our local contribution
 +                     * before calling this routine, so skip it here.
 +                     */
 +                    continue;
 +                }
 +
 +                /* Index of the thread grid we take the contribution from */
 +                thread_f = (fx*pmegrids->nc[YY] + fy)*pmegrids->nc[ZZ] + fz;
 +
 +                pmegrid_f = &pmegrids->grid_th[thread_f];
 +
 +                grid_th = pmegrid_f->grid;
 +
 +                nsx = pmegrid_f->n[XX]; /* assigned but not used below */
 +                nsy = pmegrid_f->n[YY];
 +                nsz = pmegrid_f->n[ZZ];
 +
 +#ifdef DEBUG_PME_REDUCE
 +                printf("n%d t%d add %d  %2d %2d %2d  %2d %2d %2d  %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
 +                       pme->nodeid,thread,thread_f,
 +                       pme->pmegrid_start_ix,
 +                       pme->pmegrid_start_iy,
 +                       pme->pmegrid_start_iz,
 +                       sx,sy,sz,
 +                       offx-ox,tx1-ox,offx,tx1,
 +                       offy-oy,ty1-oy,offy,ty1,
 +                       offz-oz,tz1-oz,offz,tz1);
 +#endif
 +
 +                if (!(bCommX || bCommY))
 +                {
 +                    /* Copy from the thread local grid to the node grid */
 +                    for(x=offx; x<tx1; x++)
 +                    {
 +                        for(y=offy; y<ty1; y++)
 +                        {
 +                            i0  = (x*fft_my + y)*fft_mz;
 +                            i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
 +                            for(z=offz; z<tz1; z++)
 +                            {
 +                                fftgrid[i0+z] += grid_th[i0t+z];
 +                            }
 +                        }
 +                    }
 +                }
 +                else
 +                {
 +                    /* The order of this conditional decides
 +                     * where the corner volume gets stored with x+y decomp.
 +                     * As written, data with both bCommX and bCommY set
 +                     * goes to commbuf_y, at an offset of
 +                     * buf_my*fft_nx*fft_nz elements.
 +                     */
 +                    if (bCommY)
 +                    {
 +                        commbuf = commbuf_y;
 +                        buf_my  = ty1 - offy;
 +                        if (bCommX)
 +                        {
 +                            /* We index commbuf modulo the local grid size */
 +                            commbuf += buf_my*fft_nx*fft_nz;
 +
 +                            bClearBuf  = bClearBufXY;
 +                            bClearBufXY = FALSE;
 +                        }
 +                        else
 +                        {
 +                            bClearBuf  = bClearBufY;
 +                            bClearBufY = FALSE;
 +                        }
 +                    }
 +                    else
 +                    {
 +                        commbuf = commbuf_x;
 +                        buf_my  = fft_ny;
 +                        bClearBuf  = bClearBufX;
 +                        bClearBufX = FALSE;
 +                    }
 +
 +                    /* Copy to the communication buffer */
 +                    for(x=offx; x<tx1; x++)
 +                    {
 +                        for(y=offy; y<ty1; y++)
 +                        {
 +                            i0  = (x*buf_my + y)*fft_nz;
 +                            i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
 +
 +                            if (bClearBuf)
 +                            {
 +                                /* First access of commbuf, initialize it */
 +                                for(z=offz; z<tz1; z++)
 +                                {
 +                                    commbuf[i0+z]  = grid_th[i0t+z];
 +                                }
 +                            }
 +                            else
 +                            {
 +                                for(z=offz; z<tz1; z++)
 +                                {
 +                                    commbuf[i0+z] += grid_th[i0t+z];
 +                                }
 +                            }
 +                        }
 +                    }
 +                }
 +            }
 +        }
 +    }
 +}
 +
 +
 +static void sum_fftgrid_dd(gmx_pme_t pme,real *fftgrid)
 +{
 +    ivec local_fft_ndata,local_fft_offset,local_fft_size;
 +    pme_overlap_t *overlap;
 +    int  send_nindex;
 +    int  recv_index0,recv_nindex;
 +#ifdef GMX_MPI
 +    MPI_Status stat;
 +#endif
 +    int  ipulse,send_id,recv_id,datasize,gridsize,size_yx;
 +    real *sendptr,*recvptr;
 +    int  x,y,z,indg,indb;
 +
 +    /* Note that this routine is only used for forward communication.
 +     * Since the force gathering, unlike the charge spreading,
 +     * can be trivially parallelized over the particles,
 +     * the backwards process is much simpler and can use the "old"
 +     * communication setup.
 +     */
 +
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
 +                                   local_fft_ndata,
 +                                   local_fft_offset,
 +                                   local_fft_size);
 +
 +    /* Currently supports only a single communication pulse */
 +
 +/* for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++) */
 +    if (pme->nnodes_minor > 1)
 +    {
 +        /* Minor dimension */
 +        overlap = &pme->overlap[1];
 +
 +        if (pme->nnodes_major > 1)
 +        {
 +             size_yx = pme->overlap[0].comm_data[0].send_nindex;
 +        }
 +        else
 +        {
 +            size_yx = 0;
 +        }
 +        datasize = (local_fft_ndata[XX]+size_yx)*local_fft_ndata[ZZ];
 +
 +        ipulse = 0;
 +
 +        send_id = overlap->send_id[ipulse];
 +        recv_id = overlap->recv_id[ipulse];
 +        send_nindex   = overlap->comm_data[ipulse].send_nindex;
 +        /* recv_index0   = overlap->comm_data[ipulse].recv_index0; */
 +        recv_index0 = 0;
 +        recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
 +
 +        sendptr = overlap->sendbuf;
 +        recvptr = overlap->recvbuf;
 +
 +        /*
 +        printf("node %d comm %2d x %2d x %2d\n",pme->nodeid,
 +               local_fft_ndata[XX]+size_yx,send_nindex,local_fft_ndata[ZZ]);
 +        printf("node %d send %f, %f\n",pme->nodeid,
 +               sendptr[0],sendptr[send_nindex*datasize-1]);
 +        */
 +
 +#ifdef GMX_MPI
 +        MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
 +                     send_id,ipulse,
 +                     recvptr,recv_nindex*datasize,GMX_MPI_REAL,
 +                     recv_id,ipulse,
 +                     overlap->mpi_comm,&stat);
 +#endif
 +
 +        for(x=0; x<local_fft_ndata[XX]; x++)
 +        {
 +            for(y=0; y<recv_nindex; y++)
 +            {
 +                indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
 +                indb = (x*recv_nindex        + y)*local_fft_ndata[ZZ];
 +                for(z=0; z<local_fft_ndata[ZZ]; z++)
 +                {
 +                    fftgrid[indg+z] += recvptr[indb+z];
 +                }
 +            }
 +        }
 +        if (pme->nnodes_major > 1)
 +        {
 +            sendptr = pme->overlap[0].sendbuf;
 +            for(x=0; x<size_yx; x++)
 +            {
 +                for(y=0; y<recv_nindex; y++)
 +                {
 +                    indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
 +                    indb = ((local_fft_ndata[XX] + x)*recv_nindex +y)*local_fft_ndata[ZZ];
 +                    for(z=0; z<local_fft_ndata[ZZ]; z++)
 +                    {
 +                        sendptr[indg+z] += recvptr[indb+z];
 +                    }
 +                }
 +            }
 +        }
 +    }
 +
 +    /* for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++) */
 +    if (pme->nnodes_major > 1)
 +    {
 +        /* Major dimension */
 +        overlap = &pme->overlap[0];
 +
 +        datasize = local_fft_ndata[YY]*local_fft_ndata[ZZ];
 +        gridsize = local_fft_size[YY] *local_fft_size[ZZ];
 +
 +        ipulse = 0;
 +
 +        send_id = overlap->send_id[ipulse];
 +        recv_id = overlap->recv_id[ipulse];
 +        send_nindex   = overlap->comm_data[ipulse].send_nindex;
 +        /* recv_index0   = overlap->comm_data[ipulse].recv_index0; */
 +        recv_index0 = 0;
 +        recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
 +
 +        sendptr = overlap->sendbuf;
 +        recvptr = overlap->recvbuf;
 +
 +        if (debug != NULL)
 +        {
 +            fprintf(debug,"PME fftgrid comm %2d x %2d x %2d\n",
 +                   send_nindex,local_fft_ndata[YY],local_fft_ndata[ZZ]);
 +        }
 +
 +#ifdef GMX_MPI
 +        MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
 +                     send_id,ipulse,
 +                     recvptr,recv_nindex*datasize,GMX_MPI_REAL,
 +                     recv_id,ipulse,
 +                     overlap->mpi_comm,&stat);
  #endif
 -        
 -        /* Note that the charge spreading and force gathering, which usually
 -         * takes about the same amount of time as FFT+solve_pme,
 -         * is always fully load balanced
 -         * (unless the charge distribution is inhomogeneous).
 -         */
 -        
 -        imbal = pme_load_imbalance(pme);
 -        if (imbal >= 1.2 && pme->nodeid_major == 0 && pme->nodeid_minor == 0)
 +
 +        for(x=0; x<recv_nindex; x++)
          {
 -            fprintf(stderr,
 -                    "\n"
 -                    "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
 -                    "      For optimal PME load balancing\n"
 -                    "      PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
 -                    "      and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
 -                    "\n",
 -                    (int)((imbal-1)*100 + 0.5),
 -                    pme->nkx,pme->nky,pme->nnodes_major,
 -                    pme->nky,pme->nkz,pme->nnodes_minor);
 +            for(y=0; y<local_fft_ndata[YY]; y++)
 +            {
 +                indg = (x*local_fft_size[YY]  + y)*local_fft_size[ZZ];
 +                indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
 +                for(z=0; z<local_fft_ndata[ZZ]; z++)
 +                {
 +                    fftgrid[indg+z] += recvptr[indb+z];
 +                }
 +            }
          }
      }
 +}
  
 -    init_overlap_comm(&pme->overlap[0],pme->pme_order,
 -#ifdef GMX_MPI
 -                      pme->mpi_comm_d[0],
 +
 +static void spread_on_grid(gmx_pme_t pme,
 +                           pme_atomcomm_t *atc,pmegrids_t *grids,
 +                           gmx_bool bCalcSplines,gmx_bool bSpread,
 +                           real *fftgrid)
 +{
 +    int nthread,thread;
 +#ifdef PME_TIME_THREADS
 +    gmx_cycles_t c1,c2,c3,ct1a,ct1b,ct1c;
 +    static double cs1=0,cs2=0,cs3=0;
 +    static double cs1a[6]={0,0,0,0,0,0};
 +    static int cnt=0;
  #endif
 -                      pme->nnodes_major,pme->nodeid_major,pme->nkx);
 -    
 -    init_overlap_comm(&pme->overlap[1],pme->pme_order,
 -#ifdef GMX_MPI
 -                      pme->mpi_comm_d[1],
 +
 +    nthread = pme->nthread;
 +    assert(nthread>0);
 +
 +#ifdef PME_TIME_THREADS
 +    c1 = omp_cyc_start();
  #endif
 -                      pme->nnodes_minor,pme->nodeid_minor,pme->nky);
 -    
 -    snew(pme->bsp_mod[XX],pme->nkx);
 -    snew(pme->bsp_mod[YY],pme->nky);
 -    snew(pme->bsp_mod[ZZ],pme->nkz);
 -    
 -    /* Allocate data for the interpolation grid, including overlap */
 -    pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
 -                      pme->overlap[0].s2g0[pme->nodeid_major];
 -    pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] - 
 -                      pme->overlap[1].s2g0[pme->nodeid_minor];
 -    pme->pmegrid_nz = pme->nkz + pme->pme_order - 1;
 -    
 -    pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
 -    pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
 -    pme->pmegrid_start_iz = 0;
 -    
 -    make_gridindex5_to_localindex(pme->nkx,
 -                                  pme->pmegrid_start_ix,
 -                                  pme->pmegrid_nx - (pme->pme_order-1),
 -                                  &pme->nnx,&pme->fshx);
 -    make_gridindex5_to_localindex(pme->nky,
 -                                  pme->pmegrid_start_iy,
 -                                  pme->pmegrid_ny - (pme->pme_order-1),
 -                                  &pme->nny,&pme->fshy);
 -    make_gridindex5_to_localindex(pme->nkz,
 -                                  pme->pmegrid_start_iz,
 -                                  pme->pmegrid_nz - (pme->pme_order-1),
 -                                  &pme->nnz,&pme->fshz);
 -    
 -    snew(pme->pmegridA,pme->pmegrid_nx*pme->pmegrid_ny*pme->pmegrid_nz);
 -    
 -    /* For non-divisible grid we need pme_order iso pme_order-1 */
 -    /* x overlap is copied in place: take padding into account.
 -     * y is always copied through a buffer: we don't need padding in z,
 -     * but we do need the overlap in x because of the communication order.
 -     */
 -    bufsizex = pme->pme_order*pme->pmegrid_ny*pme->pmegrid_nz;
 -    bufsizey = pme->pme_order*pme->pmegrid_nx*pme->nkz;
 -    bufsize  = (bufsizex>bufsizey) ? bufsizex : bufsizey;
 -    
 -    snew(pme->pmegrid_sendbuf,bufsize);
 -    snew(pme->pmegrid_recvbuf,bufsize);
 -    
 -    ndata[0] = pme->nkx;
 -    ndata[1] = pme->nky;
 -    ndata[2] = pme->nkz;
 -    
 -    /* This routine will allocate the grid data to fit the FFTs */
 -    gmx_parallel_3dfft_init(&pme->pfft_setupA,ndata,
 -                            &pme->fftgridA,&pme->cfftgridA,
 -                            pme->mpi_comm_d,
 -                            pme->overlap[0].s2g0,pme->overlap[1].s2g0,
 -                            bReproducible);
 -    
 -    if (bFreeEnergy)
 -    {
 -        snew(pme->pmegridB,pme->pmegrid_nx*pme->pmegrid_ny*pme->pmegrid_nz);    
 -        gmx_parallel_3dfft_init(&pme->pfft_setupB,ndata,
 -                                &pme->fftgridB,&pme->cfftgridB,
 -                                pme->mpi_comm_d,
 -                                pme->overlap[0].s2g0,pme->overlap[1].s2g0,
 -                                bReproducible);
 -    } else 
 +    if (bCalcSplines)
      {
 -        pme->pmegridB    = NULL;
 -        pme->fftgridB    = NULL;
 -        pme->cfftgridB   = NULL;
 +#pragma omp parallel for num_threads(nthread) schedule(static)
 +        for(thread=0; thread<nthread; thread++)
 +        {
 +            int start,end;
 +
 +            start = atc->n* thread   /nthread;
 +            end   = atc->n*(thread+1)/nthread;
 +
 +            /* Compute fftgrid index for all atoms,
 +             * with help of some extra variables.
 +             */
 +            calc_interpolation_idx(pme,atc,start,end,thread);
 +        }
      }
 -    
 -    make_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
 -    
 -    /* Use atc[0] for spreading */
 -    init_atomcomm(pme,&pme->atc[0],cr,nnodes_major > 1 ? 0 : 1,TRUE);
 -    if (pme->ndecompdim >= 2)
 +#ifdef PME_TIME_THREADS
 +    c1 = omp_cyc_end(c1);
 +    cs1 += (double)c1;
 +#endif
 +
 +#ifdef PME_TIME_THREADS
 +    c2 = omp_cyc_start();
 +#endif
 +#pragma omp parallel for num_threads(nthread) schedule(static)
 +    for(thread=0; thread<nthread; thread++)
      {
 -        init_atomcomm(pme,&pme->atc[1],cr,1,FALSE);
 +        splinedata_t *spline;
 +        pmegrid_t *grid;
 +
 +        /* make local bsplines  */
 +        if (grids == NULL || grids->nthread == 1)
 +        {
 +            spline = &atc->spline[0];
 +
 +            spline->n = atc->n;
 +
 +            grid = &grids->grid;
 +        }
 +        else
 +        {
 +            spline = &atc->spline[thread];
 +
 +            make_thread_local_ind(atc,thread,spline);
 +
 +            grid = &grids->grid_th[thread];
 +        }
 +
 +        if (bCalcSplines)
 +        {
 +            make_bsplines(spline->theta,spline->dtheta,pme->pme_order,
 +                          atc->fractx,spline->n,spline->ind,atc->q,pme->bFEP);
 +        }
 +
 +        if (bSpread)
 +        {
 +            /* put local atoms on grid. */
 +#ifdef PME_TIME_SPREAD
 +            ct1a = omp_cyc_start();
 +#endif
 +            spread_q_bsplines_thread(grid,atc,spline,pme->spline_work);
 +
 +            if (grids->nthread > 1)
 +            {
 +                copy_local_grid(pme,grids,thread,fftgrid);
 +            }
 +#ifdef PME_TIME_SPREAD
 +            ct1a = omp_cyc_end(ct1a);
 +            cs1a[thread] += (double)ct1a;
 +#endif
 +        }
      }
 -    
 -    if (pme->nnodes == 1) {
 -        pme->atc[0].n = homenr;
 -        pme_realloc_atomcomm_things(&pme->atc[0]);
 +#ifdef PME_TIME_THREADS
 +    c2 = omp_cyc_end(c2);
 +    cs2 += (double)c2;
 +#endif
 +
 +    if (bSpread && grids->nthread > 1)
 +    {
 +#ifdef PME_TIME_THREADS
 +        c3 = omp_cyc_start();
 +#endif
 +#pragma omp parallel for num_threads(grids->nthread) schedule(static)
 +        for(thread=0; thread<grids->nthread; thread++)
 +        {
 +            reduce_threadgrid_overlap(pme,grids,thread,
 +                                      fftgrid,
 +                                      pme->overlap[0].sendbuf,
 +                                      pme->overlap[1].sendbuf);
 +#ifdef PRINT_PME_SENDBUF
 +            print_sendbuf(pme,pme->overlap[0].sendbuf);
 +#endif
 +        }
 +#ifdef PME_TIME_THREADS
 +        c3 = omp_cyc_end(c3);
 +        cs3 += (double)c3;
 +#endif
 +
 +        if (pme->nnodes > 1)
 +        {
 +            /* Communicate the overlapping part of the fftgrid */
 +            sum_fftgrid_dd(pme,fftgrid);
 +        }
      }
 -    
 -    /* Use fft5d, order after FFT is y major, z, x minor */
 -    pme->work_nalloc = pme->nkx;
 -    snew(pme->work_mhx,pme->work_nalloc);
 -    snew(pme->work_mhy,pme->work_nalloc);
 -    snew(pme->work_mhz,pme->work_nalloc);
 -    snew(pme->work_m2,pme->work_nalloc);
 -    snew(pme->work_denom,pme->work_nalloc);
 -    /* Allocate an aligned pointer for SSE operations, including 3 extra
 -     * elements at the end since SSE operates on 4 elements at a time.
 -     */
 -    snew(pme->work_tmp1_alloc,pme->work_nalloc+8);
 -    pme->work_tmp1 = (real *) (((size_t) pme->work_tmp1_alloc + 16) & (~((size_t) 15)));
 -    snew(pme->work_m2inv,pme->work_nalloc);
  
 -    *pmedata = pme;
 -    
 -    return 0;
 +#ifdef PME_TIME_THREADS
 +    cnt++;
 +    if (cnt % 20 == 0)
 +    {
 +        printf("idx %.2f spread %.2f red %.2f",
 +               cs1*1e-9,cs2*1e-9,cs3*1e-9);
 +#ifdef PME_TIME_SPREAD
 +        for(thread=0; thread<nthread; thread++)
 +            printf(" %.2f",cs1a[thread]*1e-9);
 +#endif
 +        printf("\n");
 +    }
 +#endif
  }
  
 -static void spread_on_grid(gmx_pme_t pme,
 -                           pme_atomcomm_t *atc,real *grid,
 -                           gmx_bool bCalcSplines,gmx_bool bSpread)
 -{    
 -    if (bCalcSplines)
 -    {
 -    
 -        /* Compute fftgrid index for all atoms,
 -         * with help of some extra variables.
 -         */
 -        calc_interpolation_idx(pme,atc);
 -        
 -        /* make local bsplines  */
 -        make_bsplines(atc->theta,atc->dtheta,pme->pme_order,
 -                      atc->fractx,atc->n,atc->q,pme->bFEP);
 -    }    
 -    
 -    if (bSpread)
 +
 +static void dump_grid(FILE *fp,
 +                      int sx,int sy,int sz,int nx,int ny,int nz,
 +                      int my,int mz,const real *g)
 +{
 +    int x,y,z;
 +
 +    for(x=0; x<nx; x++)
      {
 -        /* put local atoms on grid. */
 -        spread_q_bsplines(pme,atc,grid);
 +        for(y=0; y<ny; y++)
 +        {
 +            for(z=0; z<nz; z++)
 +            {
 +                fprintf(fp,"%2d %2d %2d %6.3f\n",
 +                        sx+x,sy+y,sz+z,g[(x*my + y)*mz + z]);
 +            }
 +        }
      }
  }
  
 +static void dump_local_fftgrid(gmx_pme_t pme,const real *fftgrid)
 +{
 +    ivec local_fft_ndata,local_fft_offset,local_fft_size;
 +    /* Write the contents of fftgrid to stderr using dump_grid.
 +     * The printed indices start at pme->pmegrid_start_ix/iy/iz, the number
 +     * of points per dimension is the pmegrid size minus pme_order-1, and
 +     * local_fft_size[YY]/[ZZ] are passed as the y and z sizes used to
 +     * index fftgrid.
 +     */
 +    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
 +                                   local_fft_ndata,
 +                                   local_fft_offset,
 +                                   local_fft_size);
 +
 +    dump_grid(stderr,
 +              pme->pmegrid_start_ix,
 +              pme->pmegrid_start_iy,
 +              pme->pmegrid_start_iz,
 +              pme->pmegrid_nx-pme->pme_order+1,
 +              pme->pmegrid_ny-pme->pme_order+1,
 +              pme->pmegrid_nz-pme->pme_order+1,
 +              local_fft_size[YY],
 +              local_fft_size[ZZ],
 +              fftgrid);
 +}
 +
 +
  void gmx_pme_calc_energy(gmx_pme_t pme,int n,rvec *x,real *q,real *V)
  {
      pme_atomcomm_t *atc;
 -    real *grid;
 +    pmegrids_t *grid;
  
      if (pme->nnodes > 1)
      {
      }
  
      atc = &pme->atc_energy;
 +    atc->nthread   = 1;
 +    if (atc->spline == NULL)
 +    {
 +        snew(atc->spline,atc->nthread);
 +    }
      atc->nslab     = 1;
      atc->bSpread   = TRUE;
      atc->pme_order = pme->pme_order;
      pme_realloc_atomcomm_things(atc);
      atc->x         = x;
      atc->q         = q;
 -    
 +
      /* We only use the A-charges grid */
 -    grid = pme->pmegridA;
 +    grid = &pme->pmegridA;
  
 -    spread_on_grid(pme,atc,NULL,TRUE,FALSE);
 +    spread_on_grid(pme,atc,NULL,TRUE,FALSE,pme->fftgridA);
  
 -    *V = gather_energy_bsplines(pme,grid,atc);
 +    *V = gather_energy_bsplines(pme,grid->grid.grid,atc);
  }
  
  
@@@ -3968,12 -2399,12 +3972,12 @@@ int gmx_pmeonly(gmx_pme_t pme
      int  count;
      gmx_bool bEnerVir;
      gmx_large_int_t step,step_rel;
 -    
 -    
 +
 +
      pme_pp = gmx_pme_pp_init(cr);
 -    
 +
      init_nrnb(nrnb);
 -    
 +
      count = 0;
      do /****** this is a quasi-loop over time steps! */
      {
                                    &pme->bFEP,&lambda,
                                    &bEnerVir,
                                    &step);
 -        
 +
          if (natoms == -1) {
              /* We should stop: break out of the loop */
              break;
          }
 -        
 +
          step_rel = step - ir->init_step;
 -        
 +
          if (count == 0)
              wallcycle_start(wcycle,ewcRUN);
 -        
 +
          wallcycle_start(wcycle,ewcPMEMESH);
 -        
 +
          dvdlambda = 0;
          clear_mat(vir);
          gmx_pme_do(pme,0,natoms,x_pp,f_pp,chargeA,chargeB,box,
                     cr,maxshift_x,maxshift_y,nrnb,wcycle,vir,ewaldcoeff,
                     &energy,lambda,&dvdlambda,
                     GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
 -        
 +
          cycles = wallcycle_stop(wcycle,ewcPMEMESH);
 -        
 +
          gmx_pme_send_force_vir_ener(pme_pp,
                                      f_pp,vir,energy,dvdlambda,
                                      cycles);
 -        
 +
          count++;
  
          if (step_rel == wcycle_get_reset_counters(wcycle))
              reset_pmeonly_counters(cr,wcycle,nrnb,ir,step_rel);
              wcycle_set_reset_counters(wcycle, 0);
          }
 -        
 +
      } /***** end of quasi-loop, we stop with the break above */
      while (TRUE);
 -    
 +
      return 0;
  }
  
@@@ -4029,34 -2460,28 +4033,34 @@@ int gmx_pme_do(gmx_pme_t pme
                 int start,       int homenr,
                 rvec x[],        rvec f[],
                 real *chargeA,   real *chargeB,
 -               matrix box,    t_commrec *cr,
 +               matrix box, t_commrec *cr,
                 int  maxshift_x, int maxshift_y,
                 t_nrnb *nrnb,    gmx_wallcycle_t wcycle,
                 matrix vir,      real ewaldcoeff,
 -               real *energy,    real lambda, 
 +               real *energy,    real lambda,
                 real *dvdlambda, int flags)
  {
      int     q,d,i,j,ntot,npme;
      int     nx,ny,nz;
      int     n_d,local_ny;
 -    int     loop_count;
      pme_atomcomm_t *atc=NULL;
 -    real *  grid=NULL;
 +    pmegrids_t *pmegrid=NULL;
 +    real    *grid=NULL;
      real    *ptr;
      rvec    *x_d,*f_d;
 -    real    *charge=NULL,*q_d,vol;
 +    real    *charge=NULL,*q_d;
      real    energy_AB[2];
      matrix  vir_AB[2];
 -    gmx_bool    bClearF;
 +    gmx_bool bClearF;
      gmx_parallel_3dfft_t pfft_setup;
      real *  fftgrid;
      t_complex * cfftgrid;
 +    int     thread;
 +    const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
 +    const gmx_bool bCalcF = flags & GMX_PME_CALC_F;
 +
 +    assert(pme->nnodes > 0);
 +    assert(pme->nnodes == 1 || pme->ndecompdim > 0);
  
      if (pme->nnodes > 1) {
          atc = &pme->atc[0];
          /* This could be necessary for TPI */
          pme->atc[0].n = homenr;
      }
 -    
 +
      for(q=0; q<(pme->bFEP ? 2 : 1); q++) {
          if (q == 0) {
 -            grid = pme->pmegridA;
 +            pmegrid = &pme->pmegridA;
              fftgrid = pme->fftgridA;
              cfftgrid = pme->cfftgridA;
              pfft_setup = pme->pfft_setupA;
              charge = chargeA+start;
          } else {
 -            grid = pme->pmegridB;
 +            pmegrid = &pme->pmegridB;
              fftgrid = pme->fftgridB;
              cfftgrid = pme->cfftgridB;
              pfft_setup = pme->pfft_setupB;
              charge = chargeB+start;
          }
 +        grid = pmegrid->grid.grid;
          /* Unpack structure */
          if (debug) {
              fprintf(debug,"PME: nnodes = %d, nodeid = %d\n",
                  gmx_fatal(FARGS,"No grid!");
          }
          where();
 -        
 -        m_inv_ur0(box,pme->recipbox); 
 +
 +        m_inv_ur0(box,pme->recipbox);
  
          if (pme->nnodes == 1) {
              atc = &pme->atc[0];
                      srenew(atc->pd,atc->pd_nalloc);
                  }
                  atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
 -                pme_calc_pidx(n_d,pme->recipbox,x_d,atc);
 +                pme_calc_pidx_wrapper(n_d,pme->recipbox,x_d,atc);
                  where();
 -                
 +
                  GMX_BARRIER(cr->mpi_comm_mygroup);
                  /* Redistribute x (only once) and qA or qB */
                  if (DOMAINDECOMP(cr)) {
  
              wallcycle_stop(wcycle,ewcPME_REDISTXF);
          }
 -        
 +
          if (debug)
              fprintf(debug,"Node= %6d, pme local particles=%6d\n",
                      cr->nodeid,atc->n);
  
              /* Spread the charges on a grid */
              GMX_MPE_LOG(ev_spread_on_grid_start);
 -            
 +
              /* Spread the charges on a grid */
 -            spread_on_grid(pme,&pme->atc[0],grid,q==0,TRUE);
 +            spread_on_grid(pme,&pme->atc[0],pmegrid,q==0,TRUE,fftgrid);
              GMX_MPE_LOG(ev_spread_on_grid_finish);
  
              if (q == 0)
              inc_nrnb(nrnb,eNR_SPREADQBSP,
                       pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
  
 -            wrap_periodic_pmegrid(pme,grid);
 +            if (pme->nthread == 1)
 +            {
 +                wrap_periodic_pmegrid(pme,grid);
  
 -            /* sum contributions to local grid from other nodes */
 +                /* sum contributions to local grid from other nodes */
  #ifdef GMX_MPI
 -            if (pme->nnodes > 1) {
 -                GMX_BARRIER(cr->mpi_comm_mygroup);
 -                gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_FORWARD);
 -                where();
 -            }
 +                if (pme->nnodes > 1)
 +                {
 +                    GMX_BARRIER(cr->mpi_comm_mygroup);
 +                    gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_FORWARD);
 +                    where();
 +                }
  #endif
 -            where();
  
 -            copy_pmegrid_to_fftgrid(pme,grid,fftgrid);
 +                copy_pmegrid_to_fftgrid(pme,grid,fftgrid);
 +            }
  
              wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
 -        }
 -         
 -        if (flags & GMX_PME_SOLVE)
 -        {
 -            /* do 3d-fft */ 
 -            GMX_BARRIER(cr->mpi_comm_mygroup);
 -            GMX_MPE_LOG(ev_gmxfft3d_start);
 -            wallcycle_start(wcycle,ewcPME_FFT);
 -            gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_REAL_TO_COMPLEX,fftgrid,cfftgrid);
 -            wallcycle_stop(wcycle,ewcPME_FFT);
 -            GMX_MPE_LOG(ev_gmxfft3d_finish);
 -            where();
 -            
 -            /* solve in k-space for our local cells */
 -            vol = det(box);
 -            GMX_BARRIER(cr->mpi_comm_mygroup);
 -            GMX_MPE_LOG(ev_solve_pme_start);
 -            wallcycle_start(wcycle,ewcPME_SOLVE);
 -            loop_count =
 -                solve_pme_yzx(pme,cfftgrid,ewaldcoeff,vol,
 -                              flags & GMX_PME_CALC_ENER_VIR,
 -                              &energy_AB[q],vir_AB[q]);
 -            wallcycle_stop(wcycle,ewcPME_SOLVE);
 -            where();
 -            GMX_MPE_LOG(ev_solve_pme_finish);
 -            inc_nrnb(nrnb,eNR_SOLVEPME,loop_count);
 +
 +            /*
 +            dump_local_fftgrid(pme,fftgrid);
 +            exit(0);
 +            */
          }
  
 -        if ((flags & GMX_PME_CALC_F) ||
 -            (flags & GMX_PME_CALC_POT))
 +        /* Here we start a large thread parallel region */
 +#pragma omp parallel for num_threads(pme->nthread) schedule(static)
 +        for(thread=0; thread<pme->nthread; thread++)
          {
 -            
 -            /* do 3d-invfft */
 -            GMX_BARRIER(cr->mpi_comm_mygroup);
 -            GMX_MPE_LOG(ev_gmxfft3d_start);
 -            where();
 -            wallcycle_start(wcycle,ewcPME_FFT);
 -            gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_COMPLEX_TO_REAL,cfftgrid,fftgrid);
 -            wallcycle_stop(wcycle,ewcPME_FFT);
 +            if (flags & GMX_PME_SOLVE)
 +            {
 +                int loop_count;
  
 -            where();
 -            GMX_MPE_LOG(ev_gmxfft3d_finish);
 +                /* do 3d-fft */
 +                if (thread == 0)
 +                {
 +                    GMX_BARRIER(cr->mpi_comm_mygroup);
 +                    GMX_MPE_LOG(ev_gmxfft3d_start);
 +                    wallcycle_start(wcycle,ewcPME_FFT);
 +                }
 +                gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_REAL_TO_COMPLEX,
 +                                           fftgrid,cfftgrid,thread,wcycle);
 +                if (thread == 0)
 +                {
 +                    wallcycle_stop(wcycle,ewcPME_FFT);
 +                    GMX_MPE_LOG(ev_gmxfft3d_finish);
 +                }
 +                where();
  
 -            if (pme->nodeid == 0)
 -            {
 -                ntot = pme->nkx*pme->nky*pme->nkz;
 -                npme  = ntot*log((real)ntot)/log(2.0);
 -                inc_nrnb(nrnb,eNR_FFT,2*npme);
 +                /* solve in k-space for our local cells */
 +                if (thread == 0)
 +                {
 +                    GMX_BARRIER(cr->mpi_comm_mygroup);
 +                    GMX_MPE_LOG(ev_solve_pme_start);
 +                    wallcycle_start(wcycle,ewcPME_SOLVE);
 +                }
 +                loop_count =
 +                    solve_pme_yzx(pme,cfftgrid,ewaldcoeff,
 +                                  box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
 +                                  bCalcEnerVir,
 +                                  pme->nthread,thread);
 +                if (thread == 0)
 +                {
 +                    wallcycle_stop(wcycle,ewcPME_SOLVE);
 +                    where();
 +                    GMX_MPE_LOG(ev_solve_pme_finish);
 +                    inc_nrnb(nrnb,eNR_SOLVEPME,loop_count);
 +                }
              }
  
 -            wallcycle_start(wcycle,ewcPME_SPREADGATHER);
 +            if (bCalcF)
 +            {
 +                /* do 3d-invfft */
 +                if (thread == 0)
 +                {
 +                    GMX_BARRIER(cr->mpi_comm_mygroup);
 +                    GMX_MPE_LOG(ev_gmxfft3d_start);
 +                    where();
 +                    wallcycle_start(wcycle,ewcPME_FFT);
 +                }
 +                gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_COMPLEX_TO_REAL,
 +                                           cfftgrid,fftgrid,thread,wcycle);
 +                if (thread == 0)
 +                {
 +                    wallcycle_stop(wcycle,ewcPME_FFT);
  
 -            copy_fftgrid_to_pmegrid(pme,fftgrid,grid);
 +                    where();
 +                    GMX_MPE_LOG(ev_gmxfft3d_finish);
 +
 +                    if (pme->nodeid == 0)
 +                    {
 +                        ntot = pme->nkx*pme->nky*pme->nkz;
 +                        npme  = ntot*log((real)ntot)/log(2.0);
 +                        inc_nrnb(nrnb,eNR_FFT,2*npme);
 +                    }
 +
 +                    wallcycle_start(wcycle,ewcPME_SPREADGATHER);
 +                }
 +
 +                copy_fftgrid_to_pmegrid(pme,fftgrid,grid,pme->nthread,thread);
 +            }
 +        }
 +        /* End of thread parallel section.
 +         * With MPI we have to synchronize here before gmx_sum_qgrid_dd.
 +         */
  
 +        if (bCalcF)
 +        {
              /* distribute local grid to all nodes */
  #ifdef GMX_MPI
              if (pme->nnodes > 1) {
              where();
  
              unwrap_periodic_pmegrid(pme,grid);
 -        }
  
 -        if (flags & GMX_PME_CALC_F)
 -        {
              /* interpolate forces for our local atoms */
              GMX_BARRIER(cr->mpi_comm_mygroup);
              GMX_MPE_LOG(ev_gather_f_bsplines_start);
  
              where();
 -            
 +
              /* If we are running without parallelization,
               * atc->f is the actual force array, not a buffer,
               * therefore we should not clear it.
               */
              bClearF = (q == 0 && PAR(cr));
 -            gather_f_bsplines(pme,grid,bClearF,&pme->atc[0],
 -                              pme->bFEP ? (q==0 ? 1.0-lambda : lambda) : 1.0);
 +#pragma omp parallel for num_threads(pme->nthread) schedule(static)
 +            for(thread=0; thread<pme->nthread; thread++)
 +            {
 +                gather_f_bsplines(pme,grid,bClearF,atc,
 +                                  &atc->spline[thread],
 +                                  pme->bFEP ? (q==0 ? 1.0-lambda : lambda) : 1.0);
 +            }
 +
              where();
 -            
 +
              GMX_MPE_LOG(ev_gather_f_bsplines_finish);
 -            
 +
              inc_nrnb(nrnb,eNR_GATHERFBSP,
                       pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
              wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
 -       }
 +        }
 +
 +        if (bCalcEnerVir)
 +        {
 +            /* This should only be called on the master thread
 +             * and after the threads have synchronized.
 +             */
 +            get_pme_ener_vir(pme,pme->nthread,&energy_AB[q],vir_AB[q]);
 +        }
      } /* of q-loop */
 -    
 -    if ((flags & GMX_PME_CALC_F) && pme->nnodes > 1) {
 +
 +    if (bCalcF && pme->nnodes > 1) {
          wallcycle_start(wcycle,ewcPME_REDISTXF);
          for(d=0; d<pme->ndecompdim; d++)
          {
          wallcycle_stop(wcycle,ewcPME_REDISTXF);
      }
      where();
 -    
 -    if (!pme->bFEP) {
 -        *energy = energy_AB[0];
 -        m_add(vir,vir_AB[0],vir);
 -    } else {
 -        *energy = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1];
 -        *dvdlambda += energy_AB[1] - energy_AB[0];
 -        for(i=0; i<DIM; i++)
 -            for(j=0; j<DIM; j++)
 -                vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] + lambda*vir_AB[1][i][j];
 +
 +    if (bCalcEnerVir)
 +    {
 +        if (!pme->bFEP) {
 +            *energy = energy_AB[0];
 +            m_add(vir,vir_AB[0],vir);
 +        } else {
 +            *energy = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1];
 +            *dvdlambda += energy_AB[1] - energy_AB[0];
 +            for(i=0; i<DIM; i++)
 +            {
 +                for(j=0; j<DIM; j++)
 +                {
 +                    vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] + 
 +                        lambda*vir_AB[1][i][j];
 +                }
 +            }
 +        }
 +    }
 +    else
 +    {
 +        *energy = 0;
      }
  
      if (debug)
 +    {
          fprintf(debug,"PME mesh energy: %g\n",*energy);
 -    
 +    }
 +
      return 0;
  }
diff --combined src/mdlib/sim_util.c
index bf6743544041b800f1be9c05cca044848f8bbfca,047dc9d4c2b1cef66333fd09f3c2909991ca803c..e06f3d88ce411a6dcdc26d2978d30fe31491b061
@@@ -1,12 -1,12 +1,12 @@@
  /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   *
 - * 
 + *
   *                This source code is part of
 - * 
 + *
   *                 G   R   O   M   A   C   S
 - * 
 + *
   *          GROningen MAchine for Chemical Simulations
 - * 
 + *
   *                        VERSION 3.2.0
   * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * modify it under the terms of the GNU General Public License
   * as published by the Free Software Foundation; either version 2
   * of the License, or (at your option) any later version.
 - * 
 + *
   * If you want to redistribute modifications, please consider that
   * scientific software is very special. Version control is crucial -
   * bugs must be traceable. We will be happy to consider code for
   * inclusion in the official distribution, but derived work must not
   * be called official GROMACS. Details are found in the README & COPYING
   * files - if they are missing, get the official version at www.gromacs.org.
 - * 
 + *
   * To help us fund GROMACS development, we humbly ask that you cite
   * the papers on the package - you can find them in the top README file.
 - * 
 + *
   * For more info, check our website at http://www.gromacs.org
 - * 
 + *
   * And Hey:
   * GROwing Monsters And Cloning Shrimps
   */
@@@ -59,7 -59,7 +59,7 @@@
  #include "pbc.h"
  #include "chargegroup.h"
  #include "vec.h"
 -#include "time.h"
 +#include <time.h>
  #include "nrnb.h"
  #include "mshift.h"
  #include "mdrun.h"
@@@ -70,6 -70,7 +70,6 @@@
  #include "force.h"
  #include "bondf.h"
  #include "pme.h"
 -#include "pppm.h"
  #include "disre.h"
  #include "orires.h"
  #include "network.h"
@@@ -79,8 -80,7 +79,8 @@@
  #include "trnio.h"
  #include "xtcio.h"
  #include "copyrite.h"
 -
 +#include "pull_rotation.h"
 +#include "gmx_random.h"
  #include "mpelogging.h"
  #include "domdec.h"
  #include "partdec.h"
  #ifdef GMX_LIB_MPI
  #include <mpi.h>
  #endif
 -#ifdef GMX_THREADS
 +#ifdef GMX_THREAD_MPI
  #include "tmpi.h"
  #endif
  
 +#include "adress.h"
  #include "qmmm.h"
  
  #if 0
  typedef struct gmx_timeprint {
 -    
 +
  } t_gmx_timeprint;
  #endif
  
@@@ -114,17 -113,17 +114,17 @@@ gmx_gettime(
  #ifdef HAVE_GETTIMEOFDAY
        struct timeval t;
        double seconds;
 -      
 +
        gettimeofday(&t,NULL);
 -      
 +
        seconds = (double) t.tv_sec + 1e-6*(double)t.tv_usec;
 -      
 +
        return seconds;
  #else
        double  seconds;
 -      
 +
        seconds = time(NULL);
 -      
 +
        return seconds;
  #endif
  }
  
  #define difftime(end,start) ((double)(end)-(double)(start))
  
 -void print_time(FILE *out,gmx_runtime_t *runtime,gmx_large_int_t step,   
 +void print_time(FILE *out,gmx_runtime_t *runtime,gmx_large_int_t step,
                  t_inputrec *ir, t_commrec *cr)
  {
      time_t finish;
      char   timebuf[STRLEN];
      double dt;
      char buf[48];
 -    
 -#ifndef GMX_THREADS
 +
 +#ifndef GMX_THREAD_MPI
      if (!PAR(cr))
  #endif
      {
              runtime->time_per_step = dt/(step - ir->init_step + 1);
          }
          dt = (ir->nsteps + ir->init_step - step)*runtime->time_per_step;
 -        
 +
          if (ir->nsteps >= 0)
          {
              if (dt >= 300)
 -            {    
 +            {
                  finish = (time_t) (runtime->last + dt);
                  gmx_ctime_r(&finish,timebuf,STRLEN);
                  sprintf(buf,"%s",timebuf);
                      ir->delta_t/1000*24*60*60/runtime->time_per_step);
          }
      }
 -#ifndef GMX_THREADS
 +#ifndef GMX_THREAD_MPI
      if (PAR(cr))
      {
          fprintf(out,"\n");
      fflush(out);
  }
  
 -#ifdef NO_CLOCK 
 +#ifdef NO_CLOCK
  #define clock() -1
  #endif
  
@@@ -199,7 -198,7 +199,7 @@@ static double set_proctime(gmx_runtime_
  
      prev = runtime->proc;
      runtime->proc = dclock();
 -    
 +
      diff = runtime->proc - prev;
  #else
      clock_t prev;
@@@ -232,9 -231,9 +232,9 @@@ void runtime_start(gmx_runtime_t *runti
  void runtime_end(gmx_runtime_t *runtime)
  {
      double now;
 -    
 +
      now = gmx_gettime();
 -    
 +
      runtime->proctime += set_proctime(runtime);
      runtime->realtime  = now - runtime->real;
      runtime->real      = now;
@@@ -278,7 -277,7 +278,7 @@@ void print_date_and_time(FILE *fplog,in
  static void sum_forces(int start,int end,rvec f[],rvec flr[])
  {
    int i;
 -  
 +
    if (gmx_debug_at) {
      pr_rvecs(debug,0,"fsr",f+start,end-start);
      pr_rvecs(debug,0,"flr",flr+start,end-start);
      rvec_inc(f[i],flr[i]);
  }
  
 -/* 
 +/*
   * calc_f_el calculates forces due to an electric field.
   *
 - * force is kJ mol^-1 nm^-1 = e * kJ mol^-1 nm^-1 / e 
 + * force is kJ mol^-1 nm^-1 = e * kJ mol^-1 nm^-1 / e
   *
 - * Et[] contains the parameters for the time dependent 
 - * part of the field (not yet used). 
 + * Et[] contains the parameters for the time dependent
 + * part of the field (not yet used).
   * Ex[] contains the parameters for
   * the spatial dependent part of the field. You can have cool periodic
   * fields in principle, but only a constant field is supported
 - * now. 
 + * now.
   * The function should return the energy due to the electric field
   * (if any) but for now returns 0.
   *
@@@ -317,7 -316,7 +317,7 @@@ static void calc_f_el(FILE *fp,int  sta
      rvec Ext;
      real t0;
      int  i,m;
 -    
 +
      for(m=0; (m<DIM); m++)
      {
          if (Et[m].n > 0)
@@@ -366,9 -365,9 +366,9 @@@ static void calc_virial(FILE *fplog,in
    clear_mat(vir_part);
    calc_vir(fplog,SHIFTS,fr->shift_vec,fr->fshift,vir_part,ePBC==epbcSCREW,box);
    inc_nrnb(nrnb,eNR_VIRIAL,SHIFTS);
 -  
 -  /* Calculate partial virial, for local atoms only, based on short range. 
 -   * Total virial is computed in global_stat, called from do_md 
 +
 +  /* Calculate partial virial, for local atoms only, based on short range.
 +   * Total virial is computed in global_stat, called from do_md
     */
    f_calc_vir(fplog,start,start+homenr,x,f,vir_part,graph,box);
    inc_nrnb(nrnb,eNR_VIRIAL,homenr);
@@@ -417,7 -416,7 +417,7 @@@ void do_force(FILE *fplog,t_commrec *cr
                tensor vir_force,
                t_mdatoms *mdatoms,
                gmx_enerdata_t *enerd,t_fcdata *fcd,
 -              real lambda,t_graph *graph,
 +              real *lambda,t_graph *graph,
                t_forcerec *fr,gmx_vsite_t *vsite,rvec mu_tot,
                double t,FILE *field,gmx_edsam_t ed,
                gmx_bool bBornRadii,
  {
      int    cg0,cg1,i,j;
      int    start,homenr;
 -    double mu[2*DIM]; 
 +    double mu[2*DIM];
      gmx_bool   bSepDVDL,bStateChanged,bNS,bFillGrid,bCalcCGCM,bBS;
      gmx_bool   bDoLongRange,bDoForces,bSepLRF;
 +    gmx_bool   bDoAdressWF;
      matrix boxs;
 -    real   e,v,dvdl;
 +    real   e,v,dvdlambda[efptNR];
 +    real   dvdl_dum,lambda_dum;
      t_pbc  pbc;
      float  cycles_ppdpme,cycles_pme,cycles_seppme,cycles_force;
 -  
 +
      start  = mdatoms->start;
      homenr = mdatoms->homenr;
  
      }
  
      bStateChanged = (flags & GMX_FORCE_STATECHANGED);
 -    bNS           = (flags & GMX_FORCE_NS) && (fr->bAllvsAll==FALSE); 
 +    bNS           = (flags & GMX_FORCE_NS) && (fr->bAllvsAll==FALSE);
      bFillGrid     = (bNS && bStateChanged);
      bCalcCGCM     = (bFillGrid && !DOMAINDECOMP(cr));
      bDoLongRange  = (fr->bTwinRange && bNS && (flags & GMX_FORCE_DOLR));
      bDoForces     = (flags & GMX_FORCE_FORCES);
      bSepLRF       = (bDoLongRange && bDoForces && (flags & GMX_FORCE_SEPLRF));
 +    /* should probably move this to the forcerec since it doesn't change */
 +    bDoAdressWF   = ((fr->adress_type!=eAdressOff));
  
      if (bStateChanged)
      {
          update_forcerec(fplog,fr,box);
 -        
 -        /* Calculate total (local) dipole moment in a temporary common array. 
 +
 +        /* Calculate total (local) dipole moment in a temporary common array.
           * This makes it possible to sum them over nodes faster.
           */
          calc_mu(start,homenr,
                  x,mdatoms->chargeA,mdatoms->chargeB,mdatoms->nChargePerturbed,
                  mu,mu+DIM);
      }
 -  
 -  if (fr->ePBC != epbcNONE) { 
 +
 +  if (fr->ePBC != epbcNONE) {
      /* Compute shift vectors every step,
       * because of pressure coupling or box deformation!
       */
      if ((flags & GMX_FORCE_DYNAMICBOX) && bStateChanged)
        calc_shifts(box,fr->shift_vec);
 -    
 -    if (bCalcCGCM) { 
 +
 +    if (bCalcCGCM) {
        put_charge_groups_in_box(fplog,cg0,cg1,fr->ePBC,box,
                               &(top->cgs),x,fr->cg_cm);
        inc_nrnb(nrnb,eNR_CGCM,homenr);
        inc_nrnb(nrnb,eNR_RESETX,cg1-cg0);
 -    } 
 +    }
      else if (EI_ENERGY_MINIMIZATION(inputrec->eI) && graph) {
        unshift_self(graph,box,x);
      }
 -  } 
 +  }
    else if (bCalcCGCM) {
      calc_cgcm(fplog,cg0,cg1,&(top->cgs),x,fr->cg_cm);
      inc_nrnb(nrnb,eNR_CGCM,homenr);
    }
 -  
 +
    if (bCalcCGCM) {
      if (PAR(cr)) {
        move_cgcm(fplog,cr,fr->cg_cm);
       * Since this is only implemented for domain decomposition
       * and domain decomposition does not use the graph,
       * we do not need to worry about shifting.
 -     */    
 +     */
  
      wallcycle_start(wcycle,ewcPP_PMESENDX);
      GMX_MPE_LOG(ev_send_coordinates_start);
      }
  
      gmx_pme_send_x(cr,bBS ? boxs : box,x,
 -                   mdatoms->nChargePerturbed,lambda,
 +                   mdatoms->nChargePerturbed,lambda[efptCOUL],
                     ( flags & GMX_FORCE_VIRIAL),step);
  
      GMX_MPE_LOG(ev_send_coordinates_finish);
      }
      if (bStateChanged)
      {
 +
 +        /* update adress weight beforehand */
 +        if(bDoAdressWF)
 +        {
 +            /* need pbc for adress weight calculation with pbc_dx */
 +            set_pbc(&pbc,inputrec->ePBC,box);
 +            if(fr->adress_site == eAdressSITEcog)
 +            {
 +                update_adress_weights_cog(top->idef.iparams,top->idef.il,x,fr,mdatoms,
 +                                          inputrec->ePBC==epbcNONE ? NULL : &pbc);
 +            }
 +            else if (fr->adress_site == eAdressSITEcom)
 +            {
 +                update_adress_weights_com(fplog,cg0,cg1,&(top->cgs),x,fr,mdatoms,
 +                                          inputrec->ePBC==epbcNONE ? NULL : &pbc);
 +            }
 +            else if (fr->adress_site == eAdressSITEatomatom){
 +                update_adress_weights_atom_per_atom(cg0,cg1,&(top->cgs),x,fr,mdatoms,
 +                                          inputrec->ePBC==epbcNONE ? NULL : &pbc);
 +            }
 +            else
 +            {
 +                update_adress_weights_atom(cg0,cg1,&(top->cgs),x,fr,mdatoms,
 +                                           inputrec->ePBC==epbcNONE ? NULL : &pbc);
 +            }
 +        }
 +
          for(i=0; i<2; i++)
          {
              for(j=0;j<DIM;j++)
          for(j=0; j<DIM; j++)
          {
              mu_tot[j] =
 -                (1.0 - lambda)*fr->mu_tot[0][j] + lambda*fr->mu_tot[1][j];
 +                (1.0 - lambda[efptCOUL])*fr->mu_tot[0][j] + lambda[efptCOUL]*fr->mu_tot[1][j];
          }
      }
  
      if (bNS)
      {
          wallcycle_start(wcycle,ewcNS);
 -        
 +
          if (graph && bStateChanged)
          {
              /* Calculate intramolecular shift vectors to make molecules whole */
          /* Do the actual neighbour searching and if twin range electrostatics
           * also do the calculation of long range forces and energies.
           */
 -        dvdl = 0; 
 +        for (i=0;i<efptNR;i++) {dvdlambda[i] = 0;}
          ns(fplog,fr,x,box,
             groups,&(inputrec->opts),top,mdatoms,
 -           cr,nrnb,lambda,&dvdl,&enerd->grpp,bFillGrid,
 +           cr,nrnb,lambda,dvdlambda,&enerd->grpp,bFillGrid,
             bDoLongRange,bDoForces,bSepLRF ? fr->f_twin : f);
          if (bSepDVDL)
          {
 -            fprintf(fplog,sepdvdlformat,"LR non-bonded",0.0,dvdl);
 +            fprintf(fplog,sepdvdlformat,"LR non-bonded",0.0,dvdlambda);
          }
 -        enerd->dvdl_lin += dvdl;
 -        
 +        enerd->dvdl_lin[efptVDW] += dvdlambda[efptVDW];
 +        enerd->dvdl_lin[efptCOUL] += dvdlambda[efptCOUL];
 +
          wallcycle_stop(wcycle,ewcNS);
      }
 -      
 -    if (inputrec->implicit_solvent && bNS) 
 +
 +    if (inputrec->implicit_solvent && bNS)
      {
          make_gb_nblist(cr,inputrec->gb_algorithm,inputrec->rlist,
                         x,box,fr,&top->idef,graph,fr->born);
      }
 -      
 +
      if (DOMAINDECOMP(cr))
      {
          if (!(cr->duty & DUTY_PME))
              dd_force_flop_start(cr->dd,nrnb);
          }
      }
 -      
 +
 +    if (inputrec->bRot)
 +    {
 +        /* Enforced rotation has its own cycle counter that starts after the collective
 +         * coordinates have been communicated. It is added to ddCyclF to allow
 +         * for proper load-balancing */
 +        wallcycle_start(wcycle,ewcROT);
 +        do_rotation(cr,inputrec,box,x,t,step,wcycle,bNS);
 +        wallcycle_stop(wcycle,ewcROT);
 +    }
 +
      /* Start the force cycle counter.
       * This counter is stopped in do_forcelow_level.
       * No parallel communication should occur while this counter is running,
       * since that will interfere with the dynamic load balancing.
       */
      wallcycle_start(wcycle,ewcFORCE);
 -    
 +
      if (bDoForces)
      {
          /* Reset forces for which the virial is calculated separately:
           * PME/Ewald forces if necessary */
 -        if (fr->bF_NoVirSum) 
 +        if (fr->bF_NoVirSum)
          {
              if (flags & GMX_FORCE_VIRIAL)
              {
  
      if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
      {
 -        /* Position restraints always require full pbc */
 -        set_pbc(&pbc,inputrec->ePBC,box);
 +        /* Position restraints always require full pbc. Check if we already did it for Adress */
 +        if(!(bStateChanged && bDoAdressWF))
 +        {
 +            set_pbc(&pbc,inputrec->ePBC,box);
 +        }
          v = posres(top->idef.il[F_POSRES].nr,top->idef.il[F_POSRES].iatoms,
                     top->idef.iparams_posres,
                     (const rvec*)x,fr->f_novirsum,fr->vir_diag_posres,
 -                   inputrec->ePBC==epbcNONE ? NULL : &pbc,lambda,&dvdl,
 +                   inputrec->ePBC==epbcNONE ? NULL : &pbc,lambda[efptRESTRAINT],&(dvdlambda[efptRESTRAINT]),
                     fr->rc_scaling,fr->ePBC,fr->posres_com,fr->posres_comB);
          if (bSepDVDL)
          {
              fprintf(fplog,sepdvdlformat,
 -                    interaction_function[F_POSRES].longname,v,dvdl);
 +                    interaction_function[F_POSRES].longname,v,dvdlambda);
          }
          enerd->term[F_POSRES] += v;
          /* This linear lambda dependence assumption is only correct
           * when only k depends on lambda,
           * not when the reference position depends on lambda.
 -         * grompp checks for this.
 +         * grompp checks for this.  (verify this is still the case?)
           */
 -        enerd->dvdl_lin += dvdl;
 +        enerd->dvdl_nonlin[efptRESTRAINT] += dvdlambda[efptRESTRAINT]; /* if just the force constant changes, this is linear,
 +                                                                          but we can't be sure w/o additional checking that is
 +                                                                          hard to do at this level of code. Otherwise,
 +                                                                          the dvdl is not differentiable */
          inc_nrnb(nrnb,eNR_POSRES,top->idef.il[F_POSRES].nr/2);
 -    }
 +        if ((inputrec->fepvals->n_lambda > 0) && (flags & GMX_FORCE_DHDL))
 +        {
 +            for(i=0; i<enerd->n_lambda; i++)
 +            {
 +                lambda_dum = (i==0 ? lambda[efptRESTRAINT] : inputrec->fepvals->all_lambda[efptRESTRAINT][i-1]);
 +                v = posres(top->idef.il[F_POSRES].nr,top->idef.il[F_POSRES].iatoms,
 +                           top->idef.iparams_posres,
 +                           (const rvec*)x,NULL,NULL,
 +                           inputrec->ePBC==epbcNONE ? NULL : &pbc,lambda_dum,&dvdl_dum,
 +                           fr->rc_scaling,fr->ePBC,fr->posres_com,fr->posres_comB);
 +                enerd->enerpart_lambda[i] += v;
 +            }
 +        }
 +   }
  
 -    /* Compute the bonded and non-bonded energies and optionally forces */    
 +    /* Compute the bonded and non-bonded energies and optionally forces */
      do_force_lowlevel(fplog,step,fr,inputrec,&(top->idef),
                        cr,nrnb,wcycle,mdatoms,&(inputrec->opts),
                        x,hist,f,enerd,fcd,mtop,top,fr->born,
                        &(top->atomtypes),bBornRadii,box,
 -                      lambda,graph,&(top->excls),fr->mu_tot,
 +                      inputrec->fepvals,lambda,graph,&(top->excls),fr->mu_tot,
                        flags,&cycles_pme);
 -    
 +
      cycles_force = wallcycle_stop(wcycle,ewcFORCE);
      GMX_BARRIER(cr->mpi_comm_mygroup);
 -    
 +
      if (ed)
      {
-         do_flood(fplog,cr,x,f,ed,box,step);
+         do_flood(fplog,cr,x,f,ed,box,step,bNS);
      }
 -      
 +
      if (DOMAINDECOMP(cr))
      {
          dd_force_flop_stop(cr->dd,nrnb);
              dd_cycles_add(cr->dd,cycles_force-cycles_pme,ddCyclF);
          }
      }
 -    
 +
      if (bDoForces)
      {
          if (IR_ELEC_FIELD(*inputrec))
                        start,homenr,mdatoms->chargeA,x,fr->f_novirsum,
                        inputrec->ex,inputrec->et,t);
          }
 -        
 +
 +        if (bDoAdressWF && fr->adress_icor == eAdressICThermoForce)
 +        {
 +            /* Compute thermodynamic force in hybrid AdResS region */
 +            adress_thermo_force(start,homenr,&(top->cgs),x,fr->f_novirsum,fr,mdatoms,
 +                                inputrec->ePBC==epbcNONE ? NULL : &pbc);
 +        }
 +
          /* Communicate the forces */
          if (PAR(cr))
          {
                  wallcycle_stop(wcycle,ewcVSITESPREAD);
              }
          }
 -        
 +
          if (flags & GMX_FORCE_VIRIAL)
          {
              /* Calculation of the virial must be done after vsites! */
          }
      }
  
 +    enerd->term[F_COM_PULL] = 0;
      if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
      {
          /* Calculate the center of mass forces, this requires communication,
           * which is why we call pull_potential after calc_virial.
           */
          set_pbc(&pbc,inputrec->ePBC,box);
 -        dvdl = 0; 
 -        enerd->term[F_COM_PULL] =
 +        dvdlambda[efptRESTRAINT] = 0;
 +        enerd->term[F_COM_PULL] +=
              pull_potential(inputrec->ePull,inputrec->pull,mdatoms,&pbc,
 -                           cr,t,lambda,x,f,vir_force,&dvdl);
 +                           cr,t,lambda[efptRESTRAINT],x,f,vir_force,&(dvdlambda[efptRESTRAINT]));
          if (bSepDVDL)
          {
 -            fprintf(fplog,sepdvdlformat,"Com pull",enerd->term[F_COM_PULL],dvdl);
 +            fprintf(fplog,sepdvdlformat,"Com pull",enerd->term[F_COM_PULL],dvdlambda[efptRESTRAINT]);
          }
 -        enerd->dvdl_lin += dvdl;
 +        enerd->dvdl_lin[efptRESTRAINT] += dvdlambda[efptRESTRAINT];
 +    }
 +
 +    /* Add the forces from enforced rotation potentials (if any) */
 +    if (inputrec->bRot)
 +    {
 +        wallcycle_start(wcycle,ewcROTadd);
 +        enerd->term[F_COM_PULL] += add_rot_forces(inputrec->rot, f, cr,step,t);
 +        wallcycle_stop(wcycle,ewcROTadd);
      }
  
      if (PAR(cr) && !(cr->duty & DUTY_PME))
          cycles_ppdpme = wallcycle_stop(wcycle,ewcPPDURINGPME);
          dd_cycles_add(cr->dd,cycles_ppdpme,ddCyclPPduringPME);
  
 -        /* In case of node-splitting, the PP nodes receive the long-range 
 +        /* In case of node-splitting, the PP nodes receive the long-range
           * forces, virial and energy from the PME nodes here.
 -         */    
 +         */
          wallcycle_start(wcycle,ewcPP_PMEWAITRECVF);
 -        dvdl = 0;
 -        gmx_pme_receive_f(cr,fr->f_novirsum,fr->vir_el_recip,&e,&dvdl,
 +        dvdlambda[efptCOUL] = 0;
 +        gmx_pme_receive_f(cr,fr->f_novirsum,fr->vir_el_recip,&e,&dvdlambda[efptCOUL],
                            &cycles_seppme);
          if (bSepDVDL)
          {
 -            fprintf(fplog,sepdvdlformat,"PME mesh",e,dvdl);
 +            fprintf(fplog,sepdvdlformat,"PME mesh",e,dvdlambda[efptCOUL]);
          }
          enerd->term[F_COUL_RECIP] += e;
 -        enerd->dvdl_lin += dvdl;
 +        enerd->dvdl_lin[efptCOUL] += dvdlambda[efptCOUL];
          if (wcycle)
          {
              dd_cycles_add(cr->dd,cycles_seppme,ddCyclPME);
      {
          if (vsite)
          {
 -            /* Spread the mesh force on virtual sites to the other particles... 
 +            /* Spread the mesh force on virtual sites to the other particles...
               * This is parallellized. MPI communication is performed
               * if the constructing atoms aren't local.
               */
              }
          }
      }
 -    
 +
      /* Sum the potential energy terms from group contributions */
      sum_epot(&(inputrec->opts),enerd);
 -    
 +
      if (fr->print_force >= 0 && bDoForces)
      {
          print_large_forces(stderr,mdatoms,cr,step,fr->print_force,x,f);
@@@ -1000,14 -922,14 +1000,14 @@@ void do_constrain_first(FILE *fplog,gmx
      int    i,m,start,end;
      gmx_large_int_t step;
      real   dt=ir->delta_t;
 -    real   dvdlambda;
 +    real   dvdl_dum;
      rvec   *savex;
 -    
 +
      snew(savex,state->natoms);
  
      start = md->start;
      end   = md->homenr + start;
 -    
 +
      if (debug)
          fprintf(debug,"vcm: start=%d, homenr=%d, end=%d\n",
                  start,md->homenr,end);
          fprintf(fplog,"\nConstraining the starting coordinates (step %s)\n",
                  gmx_step_str(step,buf));
      }
 -    dvdlambda = 0;
 -    
 +    dvdl_dum = 0;
 +
      /* constrain the current position */
      constrain(NULL,TRUE,FALSE,constr,&(top->idef),
                ir,NULL,cr,step,0,md,
                state->x,state->x,NULL,
 -              state->box,state->lambda,&dvdlambda,
 +              state->box,state->lambda[efptBONDED],&dvdl_dum,
                NULL,NULL,nrnb,econqCoord,ir->epc==epcMTTK,state->veta,state->veta);
 -    if (EI_VV(ir->eI)) 
 +    if (EI_VV(ir->eI))
      {
          /* constrain the inital velocity, and save it */
          /* also may be useful if we need the ekin from the halfstep for velocity verlet */
          constrain(NULL,TRUE,FALSE,constr,&(top->idef),
                    ir,NULL,cr,step,0,md,
                    state->x,state->v,state->v,
 -                  state->box,state->lambda,&dvdlambda,
 +                  state->box,state->lambda[efptBONDED],&dvdl_dum,
                    NULL,NULL,nrnb,econqVeloc,ir->epc==epcMTTK,state->veta,state->veta);
      }
      /* constrain the inital velocities at t-dt/2 */
      if (EI_STATE_VELOCITY(ir->eI) && ir->eI!=eiVV)
      {
 -        for(i=start; (i<end); i++) 
 +        for(i=start; (i<end); i++)
          {
 -            for(m=0; (m<DIM); m++) 
 +            for(m=0; (m<DIM); m++)
              {
                  /* Reverse the velocity */
                  state->v[i][m] = -state->v[i][m];
                  savex[i][m] = state->x[i][m] + dt*state->v[i][m];
              }
          }
 -    /* Shake the positions at t=-dt with the positions at t=0                        
 -     * as reference coordinates.                                                     
 +    /* Shake the positions at t=-dt with the positions at t=0
 +     * as reference coordinates.
           */
          if (fplog)
          {
              fprintf(fplog,"\nConstraining the coordinates at t0-dt (step %s)\n",
                      gmx_step_str(step,buf));
          }
 -        dvdlambda = 0;
 +        dvdl_dum = 0;
          constrain(NULL,TRUE,FALSE,constr,&(top->idef),
                    ir,NULL,cr,step,-1,md,
                    state->x,savex,NULL,
 -                  state->box,state->lambda,&dvdlambda,
 +                  state->box,state->lambda[efptBONDED],&dvdl_dum,
                    state->v,NULL,nrnb,econqCoord,ir->epc==epcMTTK,state->veta,state->veta);
 -        
 +
          for(i=start; i<end; i++) {
              for(m=0; m<DIM; m++) {
                  /* Re-reverse the velocities */
              }
          }
      }
 -    
      sfree(savex);
  }
  
@@@ -1083,7 -1006,7 +1083,7 @@@ void calc_enervirdiff(FILE *fplog,int e
    double r0,r1,r,rc3,rc9,ea,eb,ec,pa,pb,pc,pd;
    double invscale,invscale2,invscale3;
    int    ri0,ri1,ri,i,offstart,offset;
 -  real   scale,*vdwtab; 
 +  real   scale,*vdwtab;
  
    fr->enershiftsix = 0;
    fr->enershifttwelve = 0;
         */
        eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
        eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
 -      
 -      invscale = 1.0/(scale);  
 +
 +      invscale = 1.0/(scale);
        invscale2 = invscale*invscale;
        invscale3 = invscale*invscale2;
  
        switched function.  We perform both the pressure and energy
        loops at the same time for simplicity, as the computational
        cost is low. */
 -      
 +
        for (i=0;i<2;i++) {
          enersum = 0.0; virsum = 0.0;
          if (i==0)
            ea = invscale3;
            eb = 2.0*invscale2*r;
            ec = invscale*r*r;
 -          
 +
            pa = invscale3;
            pb = 3.0*invscale2*r;
            pc = 3.0*invscale*r*r;
            pd = r*r*r;
 -          
 +
            /* this "8" is from the packing in the vdwtab array - perhaps
            should be #define'ed? */
            offset = 8*ri + offstart;
            f = vdwtab[offset+1];
            g = vdwtab[offset+2];
            h = vdwtab[offset+3];
 -        
 +
            enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2)+
 -            g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);  
 -          virsum  +=  f*(pa/4 + pb/3 + pc/2 + pd) + 
 +            g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
 +          virsum  +=  f*(pa/4 + pb/3 + pc/2 + pd) +
              2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
 -        
 +
          }
          enersum *= 4.0*M_PI;
 -        virsum  *= 4.0*M_PI; 
 +        virsum  *= 4.0*M_PI;
          eners[i] -= enersum;
          virs[i]  -= virsum;
        }
        eners[1] +=  4.0*M_PI/(9.0*rc9);
        virs[0]  +=  8.0*M_PI/rc3;
        virs[1]  += -16.0*M_PI/(3.0*rc9);
 -    } 
 +    }
      else if ((fr->vdwtype == evdwCUT) || (fr->vdwtype == evdwUSER)) {
        if (fr->vdwtype == evdwUSER && fplog)
        fprintf(fplog,
@@@ -1215,62 -1138,62 +1215,62 @@@ void calc_dispcorr(FILE *fplog,t_inputr
      gmx_bool bCorrAll,bCorrPres;
      real dvdlambda,invvol,dens,ninter,avcsix,avctwelve,enerdiff,svir=0,spres=0;
      int  m;
 -    
 +
      *prescorr = 0;
      *enercorr = 0;
      *dvdlcorr = 0;
 -    
 +
      clear_mat(virial);
      clear_mat(pres);
 -    
 +
      if (ir->eDispCorr != edispcNO) {
          bCorrAll  = (ir->eDispCorr == edispcAllEner ||
                       ir->eDispCorr == edispcAllEnerPres);
          bCorrPres = (ir->eDispCorr == edispcEnerPres ||
                       ir->eDispCorr == edispcAllEnerPres);
 -        
 +
          invvol = 1/det(box);
 -        if (fr->n_tpi) 
 +        if (fr->n_tpi)
          {
              /* Only correct for the interactions with the inserted molecule */
              dens = (natoms - fr->n_tpi)*invvol;
              ninter = fr->n_tpi;
 -        } 
 -        else 
 +        }
 +        else
          {
              dens = natoms*invvol;
              ninter = 0.5*natoms;
          }
 -        
 -        if (ir->efep == efepNO) 
 +
 +        if (ir->efep == efepNO)
          {
              avcsix    = fr->avcsix[0];
              avctwelve = fr->avctwelve[0];
 -        } 
 -        else 
 +        }
 +        else
          {
              avcsix    = (1 - lambda)*fr->avcsix[0]    + lambda*fr->avcsix[1];
              avctwelve = (1 - lambda)*fr->avctwelve[0] + lambda*fr->avctwelve[1];
          }
 -        
 +
          enerdiff = ninter*(dens*fr->enerdiffsix - fr->enershiftsix);
          *enercorr += avcsix*enerdiff;
          dvdlambda = 0.0;
 -        if (ir->efep != efepNO) 
 +        if (ir->efep != efepNO)
          {
              dvdlambda += (fr->avcsix[1] - fr->avcsix[0])*enerdiff;
          }
 -        if (bCorrAll) 
 +        if (bCorrAll)
          {
              enerdiff = ninter*(dens*fr->enerdifftwelve - fr->enershifttwelve);
              *enercorr += avctwelve*enerdiff;
 -            if (fr->efep != efepNO) 
 +            if (fr->efep != efepNO)
              {
                  dvdlambda += (fr->avctwelve[1] - fr->avctwelve[0])*enerdiff;
              }
          }
 -        
 -        if (bCorrPres) 
 +
 +        if (bCorrPres)
          {
              svir = ninter*dens*avcsix*fr->virdiffsix/3.0;
              if (ir->eDispCorr == edispcAllEnerPres)
              }
              /* The factor 2 is because of the Gromacs virial definition */
              spres = -2.0*invvol*svir*PRESFAC;
 -            
 +
              for(m=0; m<DIM; m++) {
                  virial[m][m] += svir;
                  pres[m][m] += spres;
              }
              *prescorr += spres;
          }
 -        
 +
          /* Can't currently control when it prints, for now, just print when degugging */
          if (debug)
          {
                  fprintf(debug,"Long Range LJ corr.: <C6> %10.4e, <C12> %10.4e\n",
                          avcsix,avctwelve);
              }
 -            if (bCorrPres) 
 +            if (bCorrPres)
              {
                  fprintf(debug,
                          "Long Range LJ corr.: Epot %10g, Pres: %10g, Vir: %10g\n",
                  fprintf(debug,"Long Range LJ corr.: Epot %10g\n",*enercorr);
              }
          }
 -        
 +
          if (fr->bSepDVDL && do_per_step(step,ir->nstlog))
          {
              fprintf(fplog,sepdvdlformat,"Dispersion correction",
                      *enercorr,dvdlambda);
          }
 -        if (fr->efep != efepNO) 
 +        if (fr->efep != efepNO)
          {
              *dvdlcorr += dvdlambda;
          }
@@@ -1357,7 -1280,7 +1357,7 @@@ static void low_do_pbc_mtop(FILE *fplog
    as = 0;
    for(mb=0; mb<mtop->nmolblock; mb++) {
      molb = &mtop->molblock[mb];
 -    if (molb->natoms_mol == 1 || 
 +    if (molb->natoms_mol == 1 ||
        (!bFirst && mtop->moltype[molb->type].cgs.nr == 1)) {
        /* Just one atom or charge group in the molecule, no PBC required */
        as += molb->nmol*molb->natoms_mol;
        /* Pass NULL iso fplog to avoid graph prints for each molecule type */
        mk_graph_ilist(NULL,mtop->moltype[molb->type].ilist,
                     0,molb->natoms_mol,FALSE,FALSE,graph);
 -      
 +
        for(mol=0; mol<molb->nmol; mol++) {
        mk_mshift(fplog,graph,ePBC,box,x+as);
 -      
 +
        shift_self(graph,box,x+as);
        /* The molecule is whole now.
         * We don't need the second mk_mshift call as in do_pbc_first,
         * since we no longer need this graph.
         */
 -      
 +
        as += molb->natoms_mol;
        }
        done_graph(graph);
@@@ -1415,11 -1338,11 +1415,11 @@@ void finish_run(FILE *fplog,t_commrec *
  #ifdef GMX_MPI
      MPI_Reduce(nrnb->n,nrnb_tot->n,eNRNB,MPI_DOUBLE,MPI_SUM,
                 MASTERRANK(cr),cr->mpi_comm_mysim);
 -#endif  
 +#endif
    } else {
      nrnb_tot = nrnb;
    }
 -    
 +
    if (SIMMASTER(cr)) {
      print_flop(fplog,nrnb_tot,&nbfs,&mflop);
      if (cr->nnodes > 1) {
                       cr->mpi_comm_mysim);
          }
      }
 -#endif  
 +#endif
  
    if (SIMMASTER(cr)) {
      wallcycle_print(fplog,cr->nnodes,cr->npmenodes,runtime->realtime,
      } else {
        delta_t = 0;
      }
 -    
 +
      if (fplog) {
          print_perf(fplog,runtime->proctime,runtime->realtime,
                     cr->nnodes-cr->npmenodes,
    }
  }
  
 +extern void initialize_lambdas(FILE *fplog,t_inputrec *ir,int *fep_state,real *lambda,double *lam0)
 +{
 +    /* Set the starting values of the efptNR lambda components.
 +       lambda[] is always written; lam0[] receives the same values when lam0
 +       is non-NULL.
 +       With efep==efepNO and bSimTemp==FALSE every component is set to 0.0 and
 +       the function returns at once: *fep_state is left untouched and nothing
 +       is written to fplog.
 +       Otherwise *fep_state is taken from fepvals->init_fep_state, each
 +       component comes from init_lambda (when >= 0) or from the all_lambda
 +       table, the positive ir->opts.ref_t entries are replaced by
 +       simtempvals->temperatures[*fep_state] when bSimTemp is set, and the
 +       resulting vector is printed to fplog when fplog is non-NULL.
 +       Note that fep_state is dereferenced without a NULL check on that
 +       second path, and that ir is modified (ref_t) when bSimTemp is set.
 +
 +       this function works, but could probably use a logic rewrite to keep all the different
 +       types of efep straight. */
 +
 +    int i;
 +    t_lambda *fep = ir->fepvals;
 +
 +    if ((ir->efep==efepNO) && (ir->bSimTemp == FALSE)) {
 +        for (i=0;i<efptNR;i++)  {
 +            lambda[i] = 0.0;
 +            if (lam0)
 +            {
 +                lam0[i] = 0.0;
 +            }
 +        }
 +        return; /* early exit: skips the fep_state assignment and the log output below */
 +    } else {
 +        *fep_state = fep->init_fep_state; /* this might overwrite the checkpoint
 +                                             if checkpoint is set -- a kludge is in for now
 +                                             to prevent this.*/
 +        for (i=0;i<efptNR;i++)
 +        {
 +            /* overwrite lambda state with init_lambda for now for backwards compatibility;
 +               the same single value is then used for every component i */
 +            if (fep->init_lambda>=0) /* if it's -1, it was never initialized */
 +            {
 +                lambda[i] = fep->init_lambda;
 +                if (lam0) {
 +                    lam0[i] = lambda[i];
 +                }
 +            }
 +            else /* init_lambda < 0: take component i from the table entry for *fep_state */
 +            {
 +                lambda[i] = fep->all_lambda[i][*fep_state];
 +                if (lam0) {
 +                    lam0[i] = lambda[i];
 +                }
 +            }
 +        }
 +        if (ir->bSimTemp) {
 +            /* need to rescale control temperatures to match current state */
 +            for (i=0;i<ir->opts.ngtc;i++) {
 +                if (ir->opts.ref_t[i] > 0) { /* entries <= 0 are left as they are */
 +                    ir->opts.ref_t[i] = ir->simtempvals->temperatures[*fep_state];
 +                }
 +            }
 +        }
 +    }
 +
 +    /* Send to the log the information on the current lambdas */
 +    if (fplog != NULL)
 +    {
 +        fprintf(fplog,"Initial vector of lambda components:[ ");
 +        for (i=0;i<efptNR;i++)
 +        {
 +            fprintf(fplog,"%10.4f ",lambda[i]);
 +        }
 +        fprintf(fplog,"]\n");
 +    }
 +    return;
 +}
 +
 +
  void init_md(FILE *fplog,
               t_commrec *cr,t_inputrec *ir,const output_env_t oenv,
               double *t,double *t0,
 -             real *lambda,double *lam0,
 +             real *lambda, int *fep_state, double *lam0,
               t_nrnb *nrnb,gmx_mtop_t *mtop,
               gmx_update_t *upd,
               int nfile,const t_filenm fnm[],
  {
      int  i,j,n;
      real tmpt,mod;
 -      
 +
      /* Initial values */
      *t = *t0       = ir->init_t;
 -    if (ir->efep != efepNO)
 -    {
 -        *lam0 = ir->init_lambda;
 -        *lambda = *lam0 + ir->init_step*ir->delta_lambda;
 -    }
 -    else
 -    {
 -        *lambda = *lam0   = 0.0;
 -    } 
  
      *bSimAnn=FALSE;
      for(i=0;i<ir->opts.ngtc;i++)
      {
          update_annealing_target_temp(&(ir->opts),ir->init_t);
      }
 -    
 +
 +    /* Initialize lambda variables */
 +    initialize_lambdas(fplog,ir,fep_state,lambda,lam0);
 +
      if (upd)
      {
          *upd = init_update(fplog,ir);
      }
 -    
 +
 +
      if (vcm != NULL)
      {
          *vcm = init_vcm(fplog,&mtop->groups,ir);
      }
 -    
 +
      if (EI_DYNAMICS(ir->eI) && !(Flags & MD_APPENDFILES))
      {
          if (ir->etc == etcBERENDSEN)
              please_cite(fplog,"Bussi2007a");
          }
      }
 -    
 +
      init_nrnb(nrnb);
 -    
 +
      if (nfile != -1)
      {
          *outf = init_mdoutf(nfile,fnm,Flags,cr,ir,oenv);
          *mdebin = init_mdebin((Flags & MD_APPENDFILES) ? NULL : (*outf)->fp_ene,
                                mtop,ir, (*outf)->fp_dhdl);
      }
 -    
 -    /* Initiate variables */  
 +
 +    if (ir->bAdress)
 +    {
 +      please_cite(fplog,"Fritsch12");
 +      please_cite(fplog,"Junghans10");
 +    }
 +    /* Initiate variables */
      clear_mat(force_vir);
      clear_mat(shake_vir);
      clear_rvec(mu_tot);
 -    
 +
      debug_gmx();
  }
  
 +
 +
 +
diff --combined src/mdlib/tpi.c
index 9e0fe967c6ab7f1d7662c939925ce4b5f9ff1a8b,4c2e396107c70190b99ea6b0b9c5a7f24c3b93ee..65e3fde04336ab0b28763c7ea27c64e532cc2571
  #include "pme.h"
  #include "gbutil.h"
  
 -#if ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_X86_64_SSE2) )
 -#if defined(GMX_DOUBLE)
 -#include "gmx_sse2_double.h"
 -#else
 -#include "gmx_sse2_single.h"
 -#endif
 +#ifdef GMX_X86_SSE2
 +#include "gmx_x86_sse2.h"
  #endif
  
  
@@@ -122,8 -126,7 +122,8 @@@ double do_tpi(FILE *fplog,t_commrec *cr
                t_nrnb *nrnb,gmx_wallcycle_t wcycle,
                gmx_edsam_t ed,
                t_forcerec *fr,
 -              int repl_ex_nst,int repl_ex_seed,
 +              int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
 +              gmx_membed_t membed,
                real cpt_period,real max_hours,
                const char *deviceOptions,
                unsigned long Flags,
      sscanf(dump_pdb,"%lf",&dump_ener);
  
    atoms2md(top_global,inputrec,0,NULL,0,top_global->natoms,mdatoms);
 -  update_mdatoms(mdatoms,inputrec->init_lambda);
 +  update_mdatoms(mdatoms,inputrec->fepvals->init_lambda);
  
    snew(enerd,1);
 -  init_enerdata(groups->grps[egcENER].nr,inputrec->n_flambda,enerd);
 +  init_enerdata(groups->grps[egcENER].nr,inputrec->fepvals->n_lambda,enerd);
    snew(f,top_global->natoms);
  
    /* Print to log file  */
  
    refvolshift = log(det(rerun_fr.box));
  
 -#if ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_X86_64_SSE2) )
 +#ifdef GMX_X86_SSE2
      /* Make sure we don't detect SSE overflow generated before this point */
      gmx_mm_check_and_reset_overflow();
  #endif
          {
              copy_rvec(rerun_fr.x[i],state->x[i]);
          }
+         copy_mat(rerun_fr.box,state->box);
          
-         V = det(rerun_fr.box);
+         V = det(state->box);
          logV = log(V);
          
          bStateChanged = TRUE;
                  cr->nnodes = 1;
                  do_force(fplog,cr,inputrec,
                           step,nrnb,wcycle,top,top_global,&top_global->groups,
-                          rerun_fr.box,state->x,&state->hist,
+                          state->box,state->x,&state->hist,
                           f,force_vir,mdatoms,enerd,fcd,
 -                         lambda,NULL,fr,NULL,mu_tot,t,NULL,NULL,FALSE,
 +                         state->lambda,
 +                         NULL,fr,NULL,mu_tot,t,NULL,NULL,FALSE,
                           GMX_FORCE_NONBONDED |
-                          (bNS ? GMX_FORCE_NS | GMX_FORCE_DOLR : 0) |
+                          (bNS ? GMX_FORCE_DYNAMICBOX | GMX_FORCE_NS | GMX_FORCE_DOLR : 0) |
                           (bStateChanged ? GMX_FORCE_STATECHANGED : 0)); 
                  cr->nnodes = nnodes;
                  bStateChanged = FALSE;
                  bNS = FALSE;
                  
                  /* Calculate long range corrections to pressure and energy */
-                 calc_dispcorr(fplog,inputrec,fr,step,top_global->natoms,
-                               rerun_fr.box,
+                 calc_dispcorr(fplog,inputrec,fr,step,top_global->natoms,state->box,
                                lambda,pres,vir,&prescorr,&enercorr,&dvdlcorr);
                  /* figure out how to rearrange the next 4 lines MRS 8/4/2009 */
                  enerd->term[F_DISPCORR] = enercorr;
                  enerd->term[F_EPOT] += enercorr;
                  enerd->term[F_PRES] += prescorr;
 -                enerd->term[F_DVDL] += dvdlcorr;
 +                enerd->term[F_DVDL_VDW] += dvdlcorr;
  
                  epot = enerd->term[F_EPOT];
                  bEnergyOutOfBounds = FALSE;
 -#if ( defined(GMX_IA32_SSE) || defined(GMX_X86_64_SSE) || defined(GMX_X86_64_SSE2) )
 +#ifdef GMX_X86_SSE2
                  /* With SSE the energy can overflow, check for this */
                  if (gmx_mm_check_and_reset_overflow())
                  {
diff --combined src/tools/gmx_energy.c
index 52ec7ffbd14d7aff53e061a015647f3a2aa4cbb7,7931b808839aaaccbceb01acd79bd782fb3592f5..83c00bf2a718afb8a1ac562dd21b997f33d42e61
@@@ -1,12 -1,12 +1,12 @@@
  /*  -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   *
 - * 
 + *
   *                This source code is part of
 - * 
 + *
   *                 G   R   O   M   A   C   S
 - * 
 + *
   *          GROningen MAchine for Chemical Simulations
 - * 
 + *
   *                        VERSION 3.2.0
   * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * modify it under the terms of the GNU General Public License
   * as published by the Free Software Foundation; either version 2
   * of the License, or (at your option) any later version.
 - * 
 + *
   * If you want to redistribute modifications, please consider that
   * scientific software is very special. Version control is crucial -
   * bugs must be traceable. We will be happy to consider code for
   * inclusion in the official distribution, but derived work must not
   * be called official GROMACS. Details are found in the README & COPYING
   * files - if they are missing, get the official version at www.gromacs.org.
 - * 
 + *
   * To help us fund GROMACS development, we humbly ask that you cite
   * the papers on the package - you can find them in the top README file.
 - * 
 + *
   * For more info, check our website at http://www.gromacs.org
 - * 
 + *
   * And Hey:
   * Green Red Orange Magenta Azure Cyan Skyblue
   */
@@@ -58,7 -58,7 +58,7 @@@
  #include "viewit.h"
  #include "mtop_util.h"
  #include "gmx_ana.h"
 -
 +#include "mdebin.h"
  
  static real       minthird=-1.0/3.0,minsixth=-1.0/6.0;
  
@@@ -91,7 -91,7 +91,7 @@@ static double mypow(double x,double y
  {
    if (x > 0)
      return pow(x,y);
 -  else 
 +  else
      return 0.0;
  }
  
@@@ -101,16 -101,16 +101,16 @@@ static int *select_it(int nre,char *nm[
    int  n,k,j,i;
    int  *set;
    gmx_bool bVerbose = TRUE;
 -  
 +
    if ((getenv("VERBOSE")) != NULL)
      bVerbose = FALSE;
 -  
 +
    fprintf(stderr,"Select the terms you want from the following list\n");
    fprintf(stderr,"End your selection with 0\n");
  
    if ( bVerbose ) {
      for(k=0; (k<nre); ) {
 -      for(j=0; (j<4) && (k<nre); j++,k++) 
 +      for(j=0; (j<4) && (k<nre); j++,k++)
        fprintf(stderr," %3d=%14s",k+1,nm[k]);
        fprintf(stderr,"\n");
      }
    for(i=(*nset)=0; (i<nre); i++)
      if (bE[i])
        set[(*nset)++]=i;
 - 
 +
    sfree(bE);
 -  
 -  return set;
 -}
  
 -static int strcount(const char *s1,const char *s2)
 -{
 -  int n=0;
 -  while (s1 && s2 && (toupper(s1[n]) == toupper(s2[n])))
 -    n++;
 -  return n;
 +  return set;
  }
  
  static void chomp(char *buf)
  {
    int len = strlen(buf);
 -  
 +
    while ((len > 0) && (buf[len-1] == '\n')) {
      buf[len-1] = '\0';
      len--;
@@@ -156,16 -164,16 +156,16 @@@ static int *select_by_name(int nre,gmx_
    const char *fm4="%3d  %-14s";
    const char *fm2="%3d  %-34s";
    char **newnm=NULL;
 -  
 +
    if ((getenv("VERBOSE")) != NULL)
      bVerbose = FALSE;
 - 
 +
    fprintf(stderr,"\n");
    fprintf(stderr,"Select the terms you want from the following list by\n");
    fprintf(stderr,"selecting either (part of) the name or the number or a combination.\n");
    fprintf(stderr,"End your selection with an empty line or a zero.\n");
    fprintf(stderr,"-------------------------------------------------------------------\n");
 -  
 +
    snew(newnm,nre);
    j = 0;
    for(k=0; k<nre; k++) {
    if ( bVerbose ) {
      fprintf(stderr,"\n\n");
    }
 -  
 +
    snew(bE,nre);
 -  
 +
    bEOF = FALSE;
    while (!bEOF && (fgets2(buf,STRLEN-1,stdin))) {
      /* Remove newlines */
      chomp(buf);
 -    
 +
      /* Remove spaces */
      trim(buf);
 -    
 +
      /* Empty line means end of input */
      bEOF = (strlen(buf) == 0);
      if (!bEOF) {
        } while (!bEOF && (ptr && (strlen(ptr) > 0)));
      }
    }
 -  
 +
    snew(set,nre);
    for(i=(*nset)=0; (i<nre); i++)
      if (bE[i])
        set[(*nset)++]=i;
 - 
 +
    sfree(bE);
 -  
 +
    if (*nset == 0)
      gmx_fatal(FARGS,"No energy terms selected");
  
 -  for(i=0; (i<nre); i++) 
 +  for(i=0; (i<nre); i++)
      sfree(newnm[i]);
    sfree(newnm);
 -  
 +
    return set;
  }
  
 +static void get_dhdl_parms(const char *topnm, t_inputrec *ir)
 +{
 +    gmx_mtop_t mtop;
 +    int        natoms;
 +    t_iatom    *iatom;
 +    matrix     box;
 +
 +    /* all we need is the ir to be able to write the label.
 +       box, natoms and mtop are only passed so that read_tpx has somewhere to
 +       store its other results; none of them is used afterwards, and mtop is
 +       not released here. iatom is declared but never referenced.
 +       NOTE(review): presumably read_tpx fills *ir from the run input file
 +       named by topnm -- confirm against its declaration. */
 +    read_tpx(topnm,ir,box,&natoms,NULL,NULL,NULL,&mtop);
 +}
 +
  static void get_orires_parms(const char *topnm,
 -                           int *nor,int *nex,int **label,real **obs)
 +                               int *nor,int *nex,int **label,real **obs)
  {
    gmx_mtop_t mtop;
    gmx_localtop_t *top;
  
    ip       = top->idef.iparams;
    iatom    = top->idef.il[F_ORIRES].iatoms;
 -  
 +
    /* Count how many distance restraint there are... */
    nb = top->idef.il[F_ORIRES].nr;
    if (nb == 0)
      gmx_fatal(FARGS,"No orientation restraints in topology!\n");
 -  
 +
    *nor = nb/3;
    *nex = 0;
    snew(*label,*nor);
@@@ -355,17 -352,17 +355,17 @@@ static int get_bounds(const char *topnm
  
    functype = top->idef.functype;
    ip       = top->idef.iparams;
 -  
 +
    /* Count how many distance restraint there are... */
    nb=top->idef.il[F_DISRES].nr;
    if (nb == 0)
      gmx_fatal(FARGS,"No distance restraints in topology!\n");
 -  
 +
    /* Allocate memory */
    snew(b,nb);
    snew(ind,nb);
    snew(pair,nb+1);
 -  
 +
    /* Fill the bound array */
    nb=0;
    for(i=0; (i<top->idef.ntypes); i++) {
      }
    }
    *bounds = b;
 -  
 +
    /* Fill the index array */
    label1  = -1;
    disres  = &(top->idef.il[F_DISRES]);
      natom = interaction_function[ftype].nratoms+1;
      if (label1 != top->idef.iparams[type].disres.label) {
        pair[j] = k;
 -      label1  = top->idef.iparams[type].disres.label; 
 +      label1  = top->idef.iparams[type].disres.label;
        j ++;
      }
      k++;
  
    *index   = ind;
    *dr_pair = pair;
 -  
 +
    return nb;
  }
  
@@@ -413,7 -410,7 +413,7 @@@ static void calc_violations(real rt[],r
    const   real sixth=1.0/6.0;
    int     i,j;
    double  rsum,rav,sumaver,sumt;
 -  
 +
    sumaver = 0;
    sumt    = 0;
    for(i=0; (i<nb); i++) {
      }
      rsum    = max(0.0,mypow(rsum,-sixth)-bounds[i]);
      rav     = max(0.0,mypow(rav, -sixth)-bounds[i]);
 -    
 +
      sumt    += rsum;
      sumaver += rav;
    }
@@@ -453,7 -450,7 +453,7 @@@ static void analyse_disre(const char *v
              sumaver);
      fprintf(stdout,"Largest violation averaged over simulation: %g nm\n\n",
              sumt);
 -#endif                    
 +#endif
      vout=xvgropen(voutfn,"r\\S-3\\N average violations","DR Index","nm",
              oenv);
      sum  = 0.0;
      for(i=0; (i<nbounds); i++) {
          /* Do ensemble averaging */
          sumaver = 0;
 -        for(j=pair[i]; (j<pair[i+1]); j++) 
 -            sumaver += sqr(violaver[j]/nframes); 
 +        for(j=pair[i]; (j<pair[i+1]); j++)
 +            sumaver += sqr(violaver[j]/nframes);
          sumaver = max(0.0,mypow(sumaver,minsixth)-bounds[i]);
  
          sumt   += sumaver;
@@@ -566,7 -563,7 +566,7 @@@ static void add_ee_sum(ee_sum_t *ees,do
  static void add_ee_av(ee_sum_t *ees)
  {
      double av;
 -    
 +
      av = ees->sum/ees->np;
      ees->sav  += av;
      ees->sav2 += av*av;
@@@ -637,7 -634,7 +637,7 @@@ static void calc_averages(int nset,ener
      for(i=0; i<nset; i++)
      {
          ed = &edat->s[i];
 -        
 +
          sum  = 0;
          sum2 = 0;
          np   = 0;
                  sump  = ed->ener[f];
                  sum2 += dsqr(sump);
              }
 -            
 +
              /* sum has to be increased after sum2 */
              np  += p;
              sum += sump;
@@@ -782,14 -779,14 +782,14 @@@ static enerdata_t *calc_sum(int nset,en
      enerdat_t *s;
      int f,i;
      double sum;
 -    
 +
      snew(esum,1);
      *esum = *edat;
      snew(esum->s,1);
      s = &esum->s[0];
      snew(s->ener,esum->nframes);
      snew(s->es  ,esum->nframes);
 -    
 +
      s->bExactStat = TRUE;
      s->slope      = 0;
      for(i=0; i<nset; i++)
          }
          s->slope += edat->s[i].slope;
      }
 -    
 +
      for(f=0; f<edat->nframes; f++)
      {
          sum = 0;
          s->es[f].sum  = sum;
          s->es[f].sum2 = 0;
      }
 -    
 +
      calc_averages(1,esum,nbmin,nbmax);
  
      return esum;
@@@ -827,7 -824,7 +827,7 @@@ static char *ee_pr(double ee,char *buf
  {
      char   tmp[100];
      double rnd;
 -    
 +
      if (ee < 0)
      {
          sprintf(buf,"%s","--");
@@@ -852,16 -849,16 +852,16 @@@ static void remove_drift(int nset,int n
  
      edat->npoints = edat->nframes;
      edat->nsteps = edat->nframes;
 -        
 +
      for(k=0; (k<5); k++)
      {
 -        for(i=0; (i<nset); i++) 
 +        for(i=0; (i<nset); i++)
          {
              delta = edat->s[i].slope*dt;
 -            
 +
              if (NULL != debug)
                  fprintf(debug,"slope for set %d is %g\n",i,edat->s[i].slope);
 -            
 +
              for(j=0; (j<edat->nframes); j++)
              {
                  edat->s[i].ener[j]   -= j*delta;
@@@ -885,7 -882,7 +885,7 @@@ static void calc_fluctuation_props(FIL
      enum { eVol, eEnth, eTemp, eEtot, eNR };
      const char *my_ener[] = { "Volume", "Enthalpy", "Temperature", "Total Energy" };
      int ii[eNR];
 -    
 +
      NANO3 = NANO*NANO*NANO;
      if (!bDriftCorr)
      {
      {
          remove_drift(nset,nbmin,nbmax,dt,edat);
      }
 -    for(i=0; (i<eNR); i++) 
 +    for(i=0; (i<eNR); i++)
      {
 -        for(ii[i]=0; (ii[i]<nset && 
 +        for(ii[i]=0; (ii[i]<nset &&
                        (gmx_strcasecmp(leg[ii[i]],my_ener[i]) != 0)); ii[i]++)
              ;
  /*        if (ii[i] < nset)
  */  }
      /* Compute it all! */
      vvhh = alpha = kappa = cp = dcp = cv = NOTSET;
 -    
 +
      /* Temperature */
      tt = NOTSET;
      if (ii[eTemp] < nset)
  
          if (debug != NULL)
          {
 -            if (varv != NOTSET)    
 +            if (varv != NOTSET)
                  fprintf(fp,"varv  =  %10g (m^6)\n",varv*AVOGADRO/nmol);
              if (vvhh != NOTSET)
                  fprintf(fp,"vvhh  =  %10g (m^3 J)\n",vvhh);
                      dcp);
          please_cite(fp,"Allen1987a");
      }
 -    else 
 +    else
      {
          fprintf(fp,"You should select the temperature in order to obtain fluctuation properties.\n");
      }
@@@ -1043,12 -1040,12 +1043,12 @@@ static void analyse_ener(gmx_bool bCorr
    else {
      /* Calculate the time difference */
      delta_t = t - start_t;
 -    
 +
      fprintf(stdout,"\nStatistics over %s steps [ %.4f through %.4f ps ], %d data sets\n",
            gmx_step_str(nsteps,buf),start_t,t,nset);
  
      calc_averages(nset,edat,nbmin,nbmax);
 -    
 +
      if (bSum) {
          esum = calc_sum(nset,edat,nbmin,nbmax);
      }
        }
        }
      }
 -    
 +
      if (nnotexact == 0) {
        fprintf(stdout,"All statistics are over %s points\n",
              gmx_step_str(edat->npoints,buf));
      else
        fprintf(stdout,"\n");
      fprintf(stdout,"-------------------------------------------------------------------------------\n");
 -    
 +
      /* Initiate locals, only used with -sum */
      expEtot=0;
      if (bFee) {
        for(j=0; (j<edat->nframes); j++) {
          expE += exp(beta*(edat->s[i].ener[j] - aver)/nmol);
        }
 -      if (bSum) 
 +      if (bSum)
          expEtot+=expE/edat->nframes;
 -      
 +
        fee[i] = log(expE/edat->nframes)/beta + aver/nmol;
        }
        if (strstr(leg[i],"empera") != NULL) {
        Vaver= aver;
        } else if (strstr(leg[i],"essure") != NULL) {
        Pres = aver;
 -      } 
 +      }
        if (bIsEner[i]) {
        pr_aver   = aver/nmol-ezero;
        pr_stddev = stddev/nmol;
  
        fprintf(stdout,"%-24s %10g %10s %10g %10g",
              leg[i],pr_aver,ee_pr(pr_errest,eebuf),pr_stddev,totaldrift);
 -      if (bFee) 
 +      if (bFee)
        fprintf(stdout,"  %10g",fee[i]);
 -      
 +
        fprintf(stdout,"  (%s)\n",enm[set[i]].unit);
  
        if (bFluct) {
              "Total",esum->s[0].av/nmol,ee_pr(esum->s[0].ee/nmol,eebuf),
              "--",totaldrift/nmol,enm[set[0]].unit);
        /* pr_aver,pr_stddev,a,totaldrift */
 -      if (bFee) 
 +      if (bFee)
        fprintf(stdout,"  %10g  %10g\n",
                log(expEtot)/beta + esum->s[0].av/nmol,log(expEtot)/beta);
        else
        fprintf(stdout,"\n");
      }
 -      
 +
      /* Do correlation function */
      if (edat->nframes > 1)
      {
        real factor;
        real **eneset;
        real **enesum;
 -    
 +
        /* Assume pressure tensor is in Pxx Pxy Pxz Pyx Pyy Pyz Pzx Pzy Pzz */
 -      
 -      /* Symmetrise tensor! (and store in first three elements) 
 +
 +      /* Symmetrise tensor! (and store in first three elements)
         * And subtract average pressure!
         */
        snew(eneset,12);
        enesum[1][i] = 0.5*(edat->s[2].es[i].sum+edat->s[6].es[i].sum);
        enesum[2][i] = 0.5*(edat->s[5].es[i].sum+edat->s[7].es[i].sum);
        }
 -      
 +
        einstein_visco("evisco.xvg","eviscoi.xvg",
                     3,edat->nframes,enesum,Vaver,Temp,nsteps,time,oenv);
 -      
 +
        /*do_autocorr(corrfn,buf,nenergy,3,eneset,Dt,eacNormal,TRUE);*/
        /* Do it for shear viscosity */
        strcpy(buf,"Shear Viscosity");
        low_do_autocorr(corrfn,oenv,buf,edat->nframes,3,
                      (edat->nframes+1)/2,eneset,Dt,
                      eacNormal,1,TRUE,FALSE,FALSE,0.0,0.0,0,1);
 -      
 +
        /* Now for bulk viscosity */
        strcpy(buf,"Bulk Viscosity");
        low_do_autocorr(corrfn,oenv,buf,edat->nframes,1,
                      (edat->nframes+1)/2,&(eneset[11]),Dt,
                      eacNormal,1,TRUE,FALSE,FALSE,0.0,0.0,0,1);
 -      
 +
        factor = (Vaver*1e-26/(BOLTZMANN*Temp))*Dt;
        fp=xvgropen(visfn,buf,"Time (ps)","\\8h\\4 (cp)",oenv);
        xvgr_legend(fp,asize(leg),leg,oenv);
 -      
 +
        /* Use trapezium rule for integration */
        integral = 0;
        intBulk  = 0;
        nout = get_acfnout();
        if ((nout < 2) || (nout >= edat->nframes/2))
            nout = edat->nframes/2;
 -      for(i=1; (i<nout); i++) 
 +      for(i=1; (i<nout); i++)
        {
            integral += 0.5*(eneset[0][i-1]  + eneset[0][i])*factor;
            intBulk  += 0.5*(eneset[11][i-1] + eneset[11][i])*factor;
@@@ -1270,12 -1267,12 +1270,12 @@@ static void print1(FILE *fp,gmx_bool bD
      fprintf(fp,"  %10.6f",e);
  }
  
 -static void fec(const char *ene2fn, const char *runavgfn, 
 -              real reftemp, int nset, int set[], char *leg[], 
 +static void fec(const char *ene2fn, const char *runavgfn,
 +              real reftemp, int nset, int set[], char *leg[],
                enerdata_t *edat, double time[],
                  const output_env_t oenv)
  {
 -  const char* ravgleg[] = { "\\8D\\4E = E\\sB\\N-E\\sA\\N", 
 +  const char* ravgleg[] = { "\\8D\\4E = E\\sB\\N-E\\sA\\N",
                             "<e\\S-\\8D\\4E/kT\\N>\\s0..t\\N" };
    FILE *fp;
    ener_file_t enx;
    gmx_enxnm_t *enm=NULL;
    t_enxframe *fr;
    char buf[22];
 -  
 +
    /* read second energy file */
    snew(fr,1);
    enm = NULL;
    enx = open_enx(ene2fn,"r");
    do_enxnms(enx,&(fr->nre),&enm);
 -  
 +
    snew(eneset2,nset+1);
    nenergy2=0;
    maxenergy=0;
    timecheck=0;
    do {
 -    /* This loop searches for the first frame (when -b option is given), 
 +    /* This loop searches for the first frame (when -b option is given),
       * or when this has been found it reads just one energy frame
       */
      do {
        bCont = do_enx(enx,fr);
 -      
 +
        if (bCont)
        timecheck = check_times(fr->t);
 -      
 +
      } while (bCont && (timecheck < 0));
 -    
 +
      /* Store energies for analysis afterwards... */
      if ((timecheck == 0) && bCont) {
        if (fr->nre > 0) {
        }
      }
    } while (bCont && (timecheck == 0));
 -  
 +
    /* check */
    if (edat->nframes != nenergy2) {
      fprintf(stderr,"\nWARNING file length mismatch %d!=%d\n",
            edat->nframes,nenergy2);
    }
    nenergy = min(edat->nframes,nenergy2);
 -  
 +
    /* calculate fe difference dF = -kT ln < exp(-(E_B-E_A)/kT) >_A */
    fp=NULL;
    if (runavgfn) {
        dE = eneset2[i][j] - edat->s[i].ener[j];
        sum += exp(-dE*beta);
        if (fp)
 -      fprintf(fp,"%10g %10g %10g\n", 
 +      fprintf(fp,"%10g %10g %10g\n",
                time[j], dE, -BOLTZ*reftemp*log(sum/(j+1)) );
      }
      aver = -BOLTZ*reftemp*log(sum/nenergy);
  }
  
  
 -static void do_dhdl(t_enxframe *fr, FILE **fp_dhdl, const char *filename,
 -                    int *blocks, int *hists, int *samples, int *nlambdas,
 -                    const output_env_t oenv)
 +static void do_dhdl(t_enxframe *fr, t_inputrec *ir, FILE **fp_dhdl, const char *filename, gmx_bool bDp,
 +                    int *blocks, int *hists, int *samples, int *nlambdas, const output_env_t oenv)
  {
      const char *dhdl="dH/d\\lambda",*deltag="\\DeltaH",*lambda="\\lambda";
      char title[STRLEN],label_x[STRLEN],label_y[STRLEN], legend[STRLEN];
      {
          if (nblock_dh>0)
          {
 -            sprintf(title,"%s, %s",dhdl,deltag);
 -            sprintf(label_x,"%s (%s)","Time",unit_time);
 -            sprintf(label_y,"(%s)",unit_energy);
 +            /* we have standard, non-histogram data -- call open_dhdl to open the file */
 +            *fp_dhdl=open_dhdl(filename,ir,oenv);
          }
          else
          {
              sprintf(title,"N(%s)",deltag);
              sprintf(label_x,"%s (%s)",deltag,unit_energy);
              sprintf(label_y,"Samples");
 -        }
 -        *fp_dhdl=xvgropen_type(filename, title, label_x, label_y, exvggtXNY, 
 -                               oenv);
 -        if (! changing_lambda)
 -        {
 +            *fp_dhdl=xvgropen_type(filename, title, label_x, label_y, exvggtXNY,oenv);
              sprintf(buf,"T = %g (K), %s = %g", temp, lambda, start_lambda);
 +            xvgr_subtitle(*fp_dhdl,buf,oenv);
          }
 -        else
 -        {
 -            sprintf(buf,"T = %g (K)", temp);
 -        }
 -        xvgr_subtitle(*fp_dhdl,buf,oenv);
 -        first=TRUE;
      }
  
 -
 -
      (*hists)+=nblock_hist;
      (*blocks)+=nblock_dh;
      (*nlambdas) = nblock_hist+nblock_dh;
  
 -
      /* write the data */
      if (nblock_hist > 0)
      {
                      if (!derivative)
                      {
                          sprintf(legend, "N(%s(%s=%g) | %s=%g)",
 -                                deltag, lambda, foreign_lambda, 
 +                                deltag, lambda, foreign_lambda,
                                  lambda, start_lambda);
                      }
                      else
                      {
 -                        sprintf(legend, "N(%s | %s=%g)", 
 +                        sprintf(legend, "N(%s | %s=%g)",
                                  dhdl, lambda, start_lambda);
                      }
 -                                       
 +
                      lg[0]=legend;
 -                    xvgr_new_dataset(*fp_dhdl, setnr, 1, lg, oenv); 
 +                    xvgr_new_dataset(*fp_dhdl, setnr, 1, lg, oenv);
                      setnr++;
                      for(k=0;k<blk->sub[j+2].nr;k++)
                      {
                          int hist;
                          double xmin, xmax;
 -                    
 +
                          hist=blk->sub[j+2].ival[k];
                          xmin=(x0+k)*dx;
                          xmax=(x0+k+1)*dx;
 -                        fprintf(*fp_dhdl,"%g %d\n%g %d\n", xmin, hist, 
 +                        fprintf(*fp_dhdl,"%g %d\n%g %d\n", xmin, hist,
                                  xmax, hist);
                          sum+=hist;
                      }
                  }
              }
          }
 -
          (*samples) += (int)(sum/nblock_hist);
      }
      else
          char **setnames=NULL;
          int nnames=nblock_dh;
  
 -        if (changing_lambda)
 -        {
 -            nnames++;
 -        }
 -        if (first)
 -        {
 -            snew(setnames, nnames);
 -        }
 -        j=0;
 -
 -        if ( changing_lambda && first)
 -        {
 -            /* lambda is a plotted value */
 -            setnames[j]=gmx_strdup(lambda);
 -            j++;
 -        }
 -
 -
          for(i=0;i<fr->nblock;i++)
          {
              t_enxblock *blk=&(fr->block[i]);
              if (blk->id == enxDH)
              {
 -                if (first)
 -                {
 -                    /* do the legends */
 -                    int derivative;
 -                    double foreign_lambda;
 -
 -                    derivative=blk->sub[0].ival[0];
 -                    foreign_lambda=blk->sub[1].dval[0];
 -
 -                    if (derivative)
 -                    {
 -                        sprintf(buf, "%s %s %g",dhdl,lambda,start_lambda);
 -                    }
 -                    else
 -                    {
 -                        sprintf(buf, "%s %s %g",deltag,lambda, foreign_lambda);
 -                    }
 -                    setnames[j] = gmx_strdup(buf);
 -                    j++;
 -                }
 -
                  if (len == 0)
 -                {   
 +                {
                      len=blk->sub[2].nr;
                  }
                  else
                  }
              }
          }
 -
 -
 -        if (first)
 -        {
 -            xvgr_legend(*fp_dhdl, nblock_dh, (const char**)setnames, oenv);
 -            setnr += nblock_dh;
 -            for(i=0;i<nblock_dh;i++)
 -            {
 -                sfree(setnames[i]);
 -            }
 -            sfree(setnames);
 -        }
 -
          (*samples) += len;
 +
          for(i=0;i<len;i++)
          {
              double time=start_time + delta_time*i;
  
 -            fprintf(*fp_dhdl,"%.4f", time);
 -            if (fabs(delta_lambda) > 1e-9)
 -            {
 -                double lambda_now=i*delta_lambda + start_lambda;
 -                fprintf(*fp_dhdl,"  %.4f", lambda_now);
 -            }
 +            fprintf(*fp_dhdl,"%.4f ", time);
 +
              for(j=0;j<fr->nblock;j++)
              {
                  t_enxblock *blk=&(fr->block[j]);
                      {
                          value=blk->sub[2].dval[i];
                      }
 -                    fprintf(*fp_dhdl,"  %g", value);
 +                    /* we need to decide which data type it is based on the count*/
 +
 +                    if (j==1 && ir->bExpanded)
 +                    {
 +                        fprintf(*fp_dhdl,"%4d", (int)value);   /* if expanded ensembles and zero, this is a state value, it's an integer. We need a cleaner conditional than if j==1! */
 +                    } else {
 +                        if (bDp) {
 +                            fprintf(*fp_dhdl," %#.12g", value);   /* print high precision (12 significant digits) */
 +                        }
 +                        else
 +                        {
 +                            fprintf(*fp_dhdl," %#.8g", value);   /* print normal precision */
 +                        }
 +                    }
                  }
              }
              fprintf(*fp_dhdl, "\n");
  int gmx_energy(int argc,char *argv[])
  {
    const char *desc[] = {
 -    
 +
      "[TT]g_energy[tt] extracts energy components or distance restraint",
      "data from an energy file. The user is prompted to interactively",
      "select the desired energy terms.[PAR]",
 -    
 +
      "Average, RMSD, and drift are calculated with full precision from the",
      "simulation (see printed manual). Drift is calculated by performing",
      "a least-squares fit of the data to a straight line. The reported total drift",
      "energy values.[PAR]",
  
      "The term fluctuation gives the RMSD around the least-squares fit.[PAR]",
 -    
 +
      "Some fluctuation-dependent properties can be calculated provided",
-     "the correct energy terms are selected. The following properties",
+     "the correct energy terms are selected, and that the command line option",
+     "[TT]-fluct_props[tt] is given. The following properties",
      "will be computed:[BR]",
      "Property                        Energy terms needed[BR]",
      "---------------------------------------------------[BR]",
      "  [GRK]Delta[grk] S(N,V,T) = S(N,V,T) - S[SUB]idealgas[sub](N,V,T) = ([CHEVRON]U[SUB]pot[sub][chevron] - [GRK]Delta[grk] A)/T[BR]",
      "  [GRK]Delta[grk] S(N,p,T) = S(N,p,T) - S[SUB]idealgas[sub](N,p,T) = ([CHEVRON]U[SUB]pot[sub][chevron] + pV - [GRK]Delta[grk] G)/T",
      "[PAR]",
 -    
 +
      "When a second energy file is specified ([TT]-f2[tt]), a free energy",
      "difference is calculated [BR] dF = -kT [LN][CHEVRON][EXP]-(E[SUB]B[sub]-E[SUB]A[sub])/kT[exp][chevron][SUB]A[sub][ln] ,",
      "where E[SUB]A[sub] and E[SUB]B[sub] are the energies from the first and second energy",
      "files, and the average is over the ensemble A. The running average",
      "of the free energy difference is printed to a file specified by [TT]-ravg[tt].",
      "[BB]Note[bb] that the energies must both be calculated from the same trajectory."
 -    
 +
    };
    static gmx_bool bSum=FALSE,bFee=FALSE,bPrAll=FALSE,bFluct=FALSE,bDriftCorr=FALSE;
-   static gmx_bool bDp=FALSE,bMutot=FALSE,bOrinst=FALSE,bOvec=FALSE;
+   static gmx_bool bDp=FALSE,bMutot=FALSE,bOrinst=FALSE,bOvec=FALSE,bFluctProps=FALSE;
    static int  skip=0,nmol=1,nbmin=5,nbmax=5;
    static real reftemp=300.0,ezero=0;
    t_pargs pa[] = {
        "Print energies in high precision" },
      { "-nbmin", FALSE, etINT, {&nbmin},
        "Minimum number of blocks for error estimate" },
 -    { "-nbmax", FALSE, etINT, {&nbmax}, 
 +    { "-nbmax", FALSE, etINT, {&nbmax},
        "Maximum number of blocks for error estimate" },
      { "-mutot",FALSE, etBOOL, {&bMutot},
        "Compute the total dipole moment from the components" },
        "Also print the exact average and rmsd stored in the energy frames (only when 1 term is requested)" },
      { "-nmol", FALSE, etINT,  {&nmol},
        "Number of molecules in your sample: the energies are divided by this number" },
+     { "-fluct_props", FALSE, etBOOL, {&bFluctProps},
+       "Compute properties based on energy fluctuations, like heat capacity" },
      { "-driftcorr", FALSE, etBOOL, {&bDriftCorr},
        "Useful only for calculations of fluctuation properties. The drift in the observables will be subtracted before computing the fluctuation properties."},
      { "-fluc", FALSE, etBOOL, {&bFluct},
      "Pres-YZ", "Pres-ZX", "Pres-ZY", "Pres-ZZ", "Temperature",
      "Volume",  "Pressure"
    };
 -  
 +
    FILE       *out=NULL,*fp_pairs=NULL,*fort=NULL,*fodt=NULL,*foten=NULL;
    FILE       *fp_dhdl=NULL;
    FILE       **drout;
  #define NFILE asize(fnm)
    int     npargs;
    t_pargs *ppa;
 -  
 +
    CopyRight(stderr,argv[0]);
    npargs = asize(pa);
    ppa    = add_acf_pargs(&npargs,pa);
    parse_common_args(&argc,argv,
                      PCA_CAN_VIEW | PCA_CAN_BEGIN | PCA_CAN_END | PCA_BE_NICE,
                    NFILE,fnm,npargs,ppa,asize(desc),desc,0,NULL,&oenv);
 -  
 +
    bDRAll = opt2bSet("-pairs",NFILE,fnm);
    bDisRe = opt2bSet("-viol",NFILE,fnm) || bDRAll;
    bORA   = opt2bSet("-ora",NFILE,fnm);
    do_enxnms(fp,&nre,&enm);
  
    Vaver = -1;
 -  
 +
    bVisco = opt2bSet("-vis",NFILE,fnm);
 -  
 -  if (!bDisRe && !bDHDL) 
 +
 +  if ((!bDisRe) && (!bDHDL))
    {
        if (bVisco) {
            nset=asize(setnm);
                }
            }
        }
 -      else 
 +      else
        {
            set=select_by_name(nre,enm,&nset);
        }
        snew(violaver,npairs);
        out=xvgropen(opt2fn("-o",NFILE,fnm),"Sum of Violations",
                     "Time (ps)","nm",oenv);
 -      xvgr_legend(out,2,drleg,oenv);  
 -      if (bDRAll) { 
 +      xvgr_legend(out,2,drleg,oenv);
 +      if (bDRAll) {
            fp_pairs=xvgropen(opt2fn("-pairs",NFILE,fnm),"Pair Distances",
                              "Time (ps)","Distance (nm)",oenv);
            if (output_env_get_print_xvgr_codes(oenv))
                fprintf(fp_pairs,"@ subtitle \"averaged (tau=%g) and instantaneous\"\n",
                        ir.dr_tau);
        }
 +  } else if (bDHDL) {
 +      get_dhdl_parms(ftp2fn(efTPX,NFILE,fnm),&ir);
    }
  
 -
 -  /* Initiate energies and set them to zero */
 -  edat.nsteps  = 0;
 -  edat.npoints = 0;
 -  edat.nframes = 0;
 -  edat.step    = NULL;
 -  edat.steps   = NULL;
 -  edat.points  = NULL;
 -  snew(edat.s,nset);
 -  
 -  /* Initiate counters */
 -  teller       = 0;
 -  teller_disre = 0;
 -  bFoundStart  = FALSE;
 -  start_step   = 0;
 -  start_t      = 0;
 -  do {
 -    /* This loop searches for the first frame (when -b option is given), 
 -     * or when this has been found it reads just one energy frame
 -     */
 -    do {
 -      bCont = do_enx(fp,&(frame[NEXT]));
 -      
 -      if (bCont) {
 -      timecheck = check_times(frame[NEXT].t);
 -      }      
 -    } while (bCont && (timecheck < 0));
 -    
 -    if ((timecheck == 0) && bCont) {
 -      /* We read a valid frame, so we can use it */
 -      fr = &(frame[NEXT]);
 -      
 -      if (fr->nre > 0) {
 -      /* The frame contains energies, so update cur */
 -      cur  = NEXT;
 -
 -              if (edat.nframes % 1000 == 0)
 -            {
 -                srenew(edat.step,edat.nframes+1000);
 -                memset(&(edat.step[edat.nframes]),0,1000*sizeof(edat.step[0]));
 -                srenew(edat.steps,edat.nframes+1000);
 -                memset(&(edat.steps[edat.nframes]),0,1000*sizeof(edat.steps[0]));
 -                srenew(edat.points,edat.nframes+1000);
 -                memset(&(edat.points[edat.nframes]),0,1000*sizeof(edat.points[0]));
 -                for(i=0; i<nset; i++)
 -                {
 -                    srenew(edat.s[i].ener,edat.nframes+1000);
 -                    memset(&(edat.s[i].ener[edat.nframes]),0,
 -                           1000*sizeof(edat.s[i].ener[0]));
 -
 -                    srenew(edat.s[i].es  ,edat.nframes+1000);
 -                    memset(&(edat.s[i].es[edat.nframes]),0,
 -                           1000*sizeof(edat.s[i].es[0]));
 -                }
 -            }
 -
 -              nfr = edat.nframes;
 -            edat.step[nfr] = fr->step;
 -
 -            if (!bFoundStart)
 -            {
 -                bFoundStart = TRUE;
 -                /* Initiate the previous step data */
 -                start_step = fr->step;
 -                start_t    = fr->t;
 -                /* Initiate the energy sums */
 -                edat.steps[nfr]  = 1;
 -                edat.points[nfr] = 1;
 -                for(i=0; i<nset; i++)
 -                {
 -                    sss = set[i];
 -                    edat.s[i].es[nfr].sum  = fr->ener[sss].e;
 -                    edat.s[i].es[nfr].sum2 = 0;
 -                }
 -                edat.nsteps  = 1;
 -                edat.npoints = 1;
 -            }
 -            else
 -            {
 -                edat.steps[nfr] = fr->nsteps;
 -                {
 -                    if (fr->step - start_step + 1 == edat.nsteps + fr->nsteps)
 -                    {
 -                        if (fr->nsum <= 1)
 -                        {
 -                            edat.points[nfr] = 1;
 -                            for(i=0; i<nset; i++)
 -                            {
 -                                sss = set[i];
 -                                edat.s[i].es[nfr].sum  = fr->ener[sss].e;
 -                                edat.s[i].es[nfr].sum2 = 0;
 -                            }
 -                            edat.npoints += 1;
 -                        }
 -                        else
 -                        {
 -                            edat.points[nfr] = fr->nsum;
 -                            for(i=0; i<nset; i++)
 -                            {
 -                                sss = set[i];
 -                                edat.s[i].es[nfr].sum  = fr->ener[sss].esum;
 -                                edat.s[i].es[nfr].sum2 = fr->ener[sss].eav;
 -                            }
 -                            edat.npoints += fr->nsum;
 -                        }
 -                    }
 -                    else
 -                    {
 -                        /* The interval does not match fr->nsteps:
 -                         * can not do exact averages.
 -                         */
 -                        edat.npoints = 0;
 -                    }
 -                    edat.nsteps = fr->step - start_step + 1;
 -                }
 -            }
 -            for(i=0; i<nset; i++)
 -            {
 -                edat.s[i].ener[nfr] = fr->ener[set[i]].e;
 -            }
 -      }
 -      /*
 -       * Define distance restraint legends. Can only be done after
 -       * the first frame has been read... (Then we know how many there are)
 -       */
 -      blk_disre=find_block_id_enxframe(fr, enxDISRE, NULL);
 -      if (bDisRe && bDRAll && !leg && blk_disre) 
 -      {
 -          t_iatom   *fa;
 -          t_iparams *ip;
 -
 -          fa = top->idef.il[F_DISRES].iatoms; 
 -          ip = top->idef.iparams;
 -          if (blk_disre->nsub != 2 || 
 -              (blk_disre->sub[0].nr != blk_disre->sub[1].nr) )
 -          {
 -              gmx_incons("Number of disre sub-blocks not equal to 2");
 -          }
 -
 -          ndisre=blk_disre->sub[0].nr ;
 -          if (ndisre != top->idef.il[F_DISRES].nr/3)
 -          {
 -              gmx_fatal(FARGS,"Number of disre pairs in the energy file (%d) does not match the number in the run input file (%d)\n",
 -                        ndisre,top->idef.il[F_DISRES].nr/3);
 -          }
 -          snew(pairleg,ndisre);
 -          for(i=0; i<ndisre; i++) 
 -          {
 -              snew(pairleg[i],30);
 -              j=fa[3*i+1];
 -              k=fa[3*i+2];
 -              gmx_mtop_atominfo_global(&mtop,j,&anm_j,&resnr_j,&resnm_j);
 -              gmx_mtop_atominfo_global(&mtop,k,&anm_k,&resnr_k,&resnm_k);
 -              sprintf(pairleg[i],"%d %s %d %s (%d)",
 -                      resnr_j,anm_j,resnr_k,anm_k,
 -                      ip[fa[3*i]].disres.label);
 -          }
 -          set=select_it(ndisre,pairleg,&nset);
 -          snew(leg,2*nset);
 -          for(i=0; (i<nset); i++) 
 -          {
 -              snew(leg[2*i],32);
 -              sprintf(leg[2*i],  "a %s",pairleg[set[i]]);
 -              snew(leg[2*i+1],32);
 -              sprintf(leg[2*i+1],"i %s",pairleg[set[i]]);
 -          }
 -          xvgr_legend(fp_pairs,2*nset,(const char**)leg,oenv);    
 -      }
 -
 -      /* 
 -       * Store energies for analysis afterwards... 
 -       */
 -      if (!bDisRe && !bDHDL && (fr->nre > 0)) {
 -      if (edat.nframes % 1000 == 0) {
 -        srenew(time,edat.nframes+1000);
 -      }
 -      time[edat.nframes] = fr->t;
 -      edat.nframes++;
 -      }
 -      /* 
 -       * Printing time, only when we do not want to skip frames
 -       */
 -      if (!skip || teller % skip == 0) {
 -      if (bDisRe) {
 -        /*******************************************
 -         * D I S T A N C E   R E S T R A I N T S  
 -         *******************************************/
 -        if (ndisre > 0) 
 -          {
 -#ifndef GMX_DOUBLE
 -            float *disre_rt =     blk_disre->sub[0].fval;
 -            float *disre_rm3tav = blk_disre->sub[1].fval;
 -#else
 -            double *disre_rt =     blk_disre->sub[0].dval;
 -            double *disre_rm3tav = blk_disre->sub[1].dval;
 -#endif
 -
 -          print_time(out,fr->t);
 -          if (violaver == NULL)
 -            snew(violaver,ndisre);
 -          
 -          /* Subtract bounds from distances, to calculate violations */
 -          calc_violations(disre_rt, disre_rm3tav,
 -                          nbounds,pair,bounds,violaver,&sumt,&sumaver);
 -
 -          fprintf(out,"  %8.4f  %8.4f\n",sumaver,sumt);
 -          if (bDRAll) {
 -            print_time(fp_pairs,fr->t);
 -            for(i=0; (i<nset); i++) {
 -              sss=set[i];
 -              fprintf(fp_pairs,"  %8.4f", mypow(disre_rm3tav[sss],minthird));
 -              fprintf(fp_pairs,"  %8.4f", disre_rt[sss]);
 -            }
 -            fprintf(fp_pairs,"\n");
 -          }
 -          teller_disre++;
 -        }
 -      }
 -        else if (bDHDL)
 -        {
 -            do_dhdl(fr, &fp_dhdl, opt2fn("-odh",NFILE,fnm), 
 -                    &dh_blocks, &dh_hists, &dh_samples, &dh_lambdas,
 -                    oenv);
 -        }
 -      /*******************************************
 -       * E N E R G I E S
 -       *******************************************/
 -      else {
 -        if (fr->nre > 0) {
 -            if (bPrAll)
 -            {
 -                /* We skip frames with single points (usually only the first frame),
 -                 * since they would result in an average plot with outliers.
 -                 */
 -                if (fr->nsum > 1) {
 -                    print_time(out,fr->t);
 -                     print1(out,bDp,fr->ener[set[0]].e);
 -                     print1(out,bDp,fr->ener[set[0]].esum/fr->nsum);
 -                     print1(out,bDp,sqrt(fr->ener[set[0]].eav/fr->nsum));
 -                     fprintf(out,"\n");
 -                }
 -            }
 -            else
 -            {
 -                print_time(out,fr->t);
 -                if (bSum)
 -                {
 -                    sum = 0;
 -                    for(i=0; i<nset; i++)
 -                    {
 -                        sum += fr->ener[set[i]].e;
 -                    }
 -                    print1(out,bDp,sum/nmol-ezero);
 -                }
 -                else
 -                {
 -                    for(i=0; (i<nset); i++)
 -                    {
 -                        if (bIsEner[i])
 -                        {
 -                            print1(out,bDp,(fr->ener[set[i]].e)/nmol-ezero);
 -                        }
 -                        else
 -                        {
 -                            print1(out,bDp,fr->ener[set[i]].e);
 -                        }
 -                    }
 -                }
 -                fprintf(out,"\n");
 -            }
 -        }
 -#if 0
 -          /* we first count the blocks that have id 0: the orire blocks */
 -          block_orire=0;
 -          for(b=0;b<fr->nblock;b++)
 -          {
 -              if (fr->block[b].id == mde_block_type_orire)
 -                  nblock_orire++;
 -          }
 -#endif
 +   /* Initiate energies and set them to zero */
 +   edat.nsteps  = 0;
 +   edat.npoints = 0;
 +   edat.nframes = 0;
 +   edat.step    = NULL;
 +   edat.steps   = NULL;
 +   edat.points  = NULL;
 +   snew(edat.s,nset);
 +
 +   /* Initiate counters */
 +   teller       = 0;
 +   teller_disre = 0;
 +   bFoundStart  = FALSE;
 +   start_step   = 0;
 +   start_t      = 0;
 +   do {
 +     /* This loop searches for the first frame (when -b option is given),
 +      * or when this has been found it reads just one energy frame
 +      */
 +     do {
 +         bCont = do_enx(fp,&(frame[NEXT]));
 +         if (bCont) {
 +             timecheck = check_times(frame[NEXT].t);
 +         }
 +     } while (bCont && (timecheck < 0));
 +
 +     if ((timecheck == 0) && bCont) {
 +       /* We read a valid frame, so we can use it */
 +       fr = &(frame[NEXT]);
 +
 +       if (fr->nre > 0) {
 +         /* The frame contains energies, so update cur */
 +         cur  = NEXT;
 +
 +             if (edat.nframes % 1000 == 0)
 +             {
 +                 srenew(edat.step,edat.nframes+1000);
 +                 memset(&(edat.step[edat.nframes]),0,1000*sizeof(edat.step[0]));
 +                 srenew(edat.steps,edat.nframes+1000);
 +                 memset(&(edat.steps[edat.nframes]),0,1000*sizeof(edat.steps[0]));
 +                 srenew(edat.points,edat.nframes+1000);
 +                 memset(&(edat.points[edat.nframes]),0,1000*sizeof(edat.points[0]));
 +
 +                 for(i=0; i<nset; i++)
 +                 {
 +                     srenew(edat.s[i].ener,edat.nframes+1000);
 +                     memset(&(edat.s[i].ener[edat.nframes]),0,
 +                            1000*sizeof(edat.s[i].ener[0]));
 +                     srenew(edat.s[i].es  ,edat.nframes+1000);
 +                     memset(&(edat.s[i].es[edat.nframes]),0,
 +                            1000*sizeof(edat.s[i].es[0]));
 +                 }
 +             }
 +
 +             nfr = edat.nframes;
 +             edat.step[nfr] = fr->step;
 +
 +             if (!bFoundStart)
 +             {
 +                 bFoundStart = TRUE;
 +                 /* Initiate the previous step data */
 +                 start_step = fr->step;
 +                 start_t    = fr->t;
 +                 /* Initiate the energy sums */
 +                 edat.steps[nfr]  = 1;
 +                 edat.points[nfr] = 1;
 +                 for(i=0; i<nset; i++)
 +                 {
 +                     sss = set[i];
 +                     edat.s[i].es[nfr].sum  = fr->ener[sss].e;
 +                     edat.s[i].es[nfr].sum2 = 0;
 +                 }
 +                 edat.nsteps  = 1;
 +                 edat.npoints = 1;
 +             }
 +             else
 +             {
 +                 edat.steps[nfr] = fr->nsteps;
 +                 {
 +                     if (fr->step - start_step + 1 == edat.nsteps + fr->nsteps)
 +                     {
 +                         if (fr->nsum <= 1)
 +                         {
 +                             edat.points[nfr] = 1;
 +                             for(i=0; i<nset; i++)
 +                             {
 +                                 sss = set[i];
 +                                 edat.s[i].es[nfr].sum  = fr->ener[sss].e;
 +                                 edat.s[i].es[nfr].sum2 = 0;
 +                             }
 +                             edat.npoints += 1;
 +                         }
 +                         else
 +                         {
 +                             edat.points[nfr] = fr->nsum;
 +                             for(i=0; i<nset; i++)
 +                             {
 +                                 sss = set[i];
 +                                 edat.s[i].es[nfr].sum  = fr->ener[sss].esum;
 +                                 edat.s[i].es[nfr].sum2 = fr->ener[sss].eav;
 +                             }
 +                             edat.npoints += fr->nsum;
 +                         }
 +                     }
 +                     else
 +                     {
 +                         /* The interval does not match fr->nsteps:
 +                          * can not do exact averages.
 +                          */
 +                         edat.npoints = 0;
 +                     }
 +                     edat.nsteps = fr->step - start_step + 1;
 +                 }
 +             }
 +             for(i=0; i<nset; i++)
 +             {
 +                 edat.s[i].ener[nfr] = fr->ener[set[i]].e;
 +             }
 +       }
 +       /*
 +        * Define distance restraint legends. Can only be done after
 +        * the first frame has been read... (Then we know how many there are)
 +        */
 +       blk_disre=find_block_id_enxframe(fr, enxDISRE, NULL);
 +       if (bDisRe && bDRAll && !leg && blk_disre)
 +       {
 +           t_iatom   *fa;
 +           t_iparams *ip;
 +
 +           fa = top->idef.il[F_DISRES].iatoms;
 +           ip = top->idef.iparams;
 +           if (blk_disre->nsub != 2 ||
 +               (blk_disre->sub[0].nr != blk_disre->sub[1].nr) )
 +           {
 +               gmx_incons("Number of disre sub-blocks not equal to 2");
 +           }
 +
 +           ndisre=blk_disre->sub[0].nr ;
 +           if (ndisre != top->idef.il[F_DISRES].nr/3)
 +           {
 +               gmx_fatal(FARGS,"Number of disre pairs in the energy file (%d) does not match the number in the run input file (%d)\n",
 +                         ndisre,top->idef.il[F_DISRES].nr/3);
 +           }
 +           snew(pairleg,ndisre);
 +           for(i=0; i<ndisre; i++)
 +           {
 +               snew(pairleg[i],30);
 +               j=fa[3*i+1];
 +               k=fa[3*i+2];
 +               gmx_mtop_atominfo_global(&mtop,j,&anm_j,&resnr_j,&resnm_j);
 +               gmx_mtop_atominfo_global(&mtop,k,&anm_k,&resnr_k,&resnm_k);
 +               sprintf(pairleg[i],"%d %s %d %s (%d)",
 +                       resnr_j,anm_j,resnr_k,anm_k,
 +                       ip[fa[3*i]].disres.label);
 +           }
 +           set=select_it(ndisre,pairleg,&nset);
 +           snew(leg,2*nset);
 +           for(i=0; (i<nset); i++)
 +           {
 +               snew(leg[2*i],32);
 +               sprintf(leg[2*i],  "a %s",pairleg[set[i]]);
 +               snew(leg[2*i+1],32);
 +               sprintf(leg[2*i+1],"i %s",pairleg[set[i]]);
 +           }
 +           xvgr_legend(fp_pairs,2*nset,(const char**)leg,oenv);
 +       }
 +
 +       /*
 +        * Store energies for analysis afterwards...
 +        */
 +       if (!bDisRe && !bDHDL && (fr->nre > 0)) {
 +           if (edat.nframes % 1000 == 0) {
 +               srenew(time,edat.nframes+1000);
 +           }
 +           time[edat.nframes] = fr->t;
 +           edat.nframes++;
 +       }
 +       /*
 +        * Printing time, only when we do not want to skip frames
 +        */
 +       if (!skip || teller % skip == 0) {
 +     if (bDisRe) {
 +       /*******************************************
 +        * D I S T A N C E   R E S T R A I N T S
 +        *******************************************/
 +       if (ndisre > 0)
 +           {
 + #ifndef GMX_DOUBLE
 +             float *disre_rt =     blk_disre->sub[0].fval;
 +             float *disre_rm3tav = blk_disre->sub[1].fval;
 + #else
 +             double *disre_rt =     blk_disre->sub[0].dval;
 +             double *disre_rm3tav = blk_disre->sub[1].dval;
 + #endif
 +
 +         print_time(out,fr->t);
 +         if (violaver == NULL)
 +           snew(violaver,ndisre);
 +
 +         /* Subtract bounds from distances, to calculate violations */
 +         calc_violations(disre_rt, disre_rm3tav,
 +                 nbounds,pair,bounds,violaver,&sumt,&sumaver);
 +
 +         fprintf(out,"  %8.4f  %8.4f\n",sumaver,sumt);
 +         if (bDRAll) {
 +           print_time(fp_pairs,fr->t);
 +           for(i=0; (i<nset); i++) {
 +         sss=set[i];
 +         fprintf(fp_pairs,"  %8.4f", mypow(disre_rm3tav[sss],minthird));
 +         fprintf(fp_pairs,"  %8.4f", disre_rt[sss]);
 +           }
 +           fprintf(fp_pairs,"\n");
 +         }
 +         teller_disre++;
 +       }
 +     }
 +     else if (bDHDL)
 +     {
 +         do_dhdl(fr, &ir, &fp_dhdl, opt2fn("-odh",NFILE,fnm), bDp, &dh_blocks, &dh_hists, &dh_samples, &dh_lambdas, oenv);
 +     }
 +
 +     /*******************************************
 +      * E N E R G I E S
 +      *******************************************/
 +     else {
 +         if (fr->nre > 0) {
 +             if (bPrAll)
 +             {
 +                 /* We skip frames with single points (usually only the first frame),
 +                  * since they would result in an average plot with outliers.
 +                  */
 +                 if (fr->nsum > 1) {
 +                     print_time(out,fr->t);
 +                      print1(out,bDp,fr->ener[set[0]].e);
 +                      print1(out,bDp,fr->ener[set[0]].esum/fr->nsum);
 +                      print1(out,bDp,sqrt(fr->ener[set[0]].eav/fr->nsum));
 +                      fprintf(out,"\n");
 +                 }
 +             }
 +             else
 +             {
 +                 print_time(out,fr->t);
 +                 if (bSum)
 +                 {
 +                     sum = 0;
 +                     for(i=0; i<nset; i++)
 +                     {
 +                         sum += fr->ener[set[i]].e;
 +                     }
 +                     print1(out,bDp,sum/nmol-ezero);
 +                 }
 +                 else
 +                 {
 +                     for(i=0; (i<nset); i++)
 +                     {
 +                         if (bIsEner[i])
 +                         {
 +                             print1(out,bDp,(fr->ener[set[i]].e)/nmol-ezero);
 +                         }
 +                         else
 +                         {
 +                             print1(out,bDp,fr->ener[set[i]].e);
 +                         }
 +                     }
 +                 }
 +                 fprintf(out,"\n");
 +             }
 +         }
            blk = find_block_id_enxframe(fr, enx_i, NULL);
          if (bORIRE && blk)
            {
                vals=blk->sub[0].dval;
  #endif
  
 -              if (blk->sub[0].nr != (size_t)nor) 
 +              if (blk->sub[0].nr != (size_t)nor)
                    gmx_fatal(FARGS,"Number of orientation restraints in energy file (%d) does not match with the topology (%d)", blk->sub[0].nr);
                if (bORA || bODA)
                {
                    for(i=0; i<nor; i++)
                        odrms[i] += sqr(vals[i]-oobs[i]);
                }
 -              if (bORT) 
 +              if (bORT)
                {
                    fprintf(fort,"  %10f",fr->t);
                    for(i=0; i<norsel; i++)
                        fprintf(fort," %g",vals[orsel[i]]);
                    fprintf(fort,"\n");
                }
 -              if (bODT) 
 +              if (bODT)
                {
                    fprintf(fodt,"  %10f",fr->t);
                    for(i=0; i<norsel; i++)
                }
                norfr++;
            }
 -          blk = find_block_id_enxframe(fr, enxORT, NULL);
 -          if (bOTEN && blk) 
 -          {
 +         blk = find_block_id_enxframe(fr, enxORT, NULL);
 +         if (bOTEN && blk)
 +         {
  #ifndef GMX_DOUBLE
 -              xdr_datatype dt=xdr_datatype_float;
 +             xdr_datatype dt=xdr_datatype_float;
  #else
 -              xdr_datatype dt=xdr_datatype_double;
 +             xdr_datatype dt=xdr_datatype_double;
  #endif
 -              real *vals;
 - 
 -              if ( (blk->nsub != 1) || (blk->sub[0].type!=dt) )
 -                  gmx_fatal(FARGS,"Orientational restraints read in incorrectly");
 +             real *vals;
 +
 +             if ( (blk->nsub != 1) || (blk->sub[0].type!=dt) )
 +                 gmx_fatal(FARGS,"Orientational restraints read in incorrectly");
  #ifndef GMX_DOUBLE
 -              vals=blk->sub[0].fval;
 +             vals=blk->sub[0].fval;
  #else
 -              vals=blk->sub[0].dval;
 +             vals=blk->sub[0].dval;
  #endif
  
                if (blk->sub[0].nr != (size_t)(nex*12))
        teller++;
      }
    } while (bCont && (timecheck == 0));
 -  
 +
    fprintf(stderr,"\n");
    close_enx(fp);
 -  if (out) 
 +  if (out)
        ffclose(out);
  
    if (bDRAll)
        ffclose(fort);
    if (bODT)
        ffclose(fodt);
 -  if (bORA) 
 +  if (bORA)
    {
        out = xvgropen(opt2fn("-ora",NFILE,fnm),
                       "Average calculated orientations",
    if (bOTEN)
        ffclose(foten);
  
 -  if (bDisRe) 
 +  if (bDisRe)
    {
        analyse_disre(opt2fn("-viol",NFILE,fnm),
                      teller_disre,violaver,bounds,index,pair,nbounds,oenv);
 -  } 
 +  }
    else if (bDHDL)
    {
        if (fp_dhdl)
        {
            ffclose(fp_dhdl);
 -          printf("\n\nWrote %d lambda values with %d samples as ", 
 +          printf("\n\nWrote %d lambda values with %d samples as ",
                   dh_lambdas, dh_samples);
            if (dh_hists > 0)
            {
                     time,reftemp,&edat,
                     nset,set,bIsEner,leg,enm,Vaver,ezero,nbmin,nbmax,
                     oenv);
-       calc_fluctuation_props(stdout,bDriftCorr,dt,nset,set,nmol,leg,&edat,
-                              nbmin,nbmax);
+       if (bFluctProps)
+           calc_fluctuation_props(stdout,bDriftCorr,dt,nset,set,nmol,leg,&edat,
+                                  nbmin,nbmax);
    }
    if (opt2bSet("-f2",NFILE,fnm)) {
 -      fec(opt2fn("-f2",NFILE,fnm), opt2fn("-ravg",NFILE,fnm), 
 +      fec(opt2fn("-f2",NFILE,fnm), opt2fn("-ravg",NFILE,fnm),
            reftemp, nset, set, leg, &edat, time ,oenv);
    }
  
diff --combined src/tools/gmx_trjconv.c
index 330a67766809387e604a768ad9b28968eb2e87a1,6b564ddb15dc06bed5d0a38f4f28a8aa50073c85..b7c323358176282300963d4d5e367bc100bdd22a
@@@ -35,7 -35,6 +35,7 @@@
  #ifdef HAVE_CONFIG_H
  #include <config.h>
  #endif
 +#include "gmx_header_config.h"
  
  #include <string.h>
  #include <math.h>
@@@ -59,6 -58,7 +59,6 @@@
  #include "do_fit.h"
  #include "rmpbc.h"
  #include "wgms.h"
 -#include "magic.h"
  #include "pbc.h"
  #include "viewit.h"
  #include "xvgr.h"
  enum { euSel,euRect, euTric, euCompact, euNR};
  
  
 -static int 
 -sort_comdist2(void *thunk, const void *a, const void *b)
 -{
 -    /* Thunk should point to a real array with the distance to the cluster COM for each molecule,
 -     * a/b point to integers that refer to the molecule number. 
 -     */
 -    real *pcomdist2 = thunk;
 -    int  ia    = * (int *)a;
 -    int  ib    = * (int *)b;
 -    int  rc;
 -    
 -    if(pcomdist2[ia]<pcomdist2[ib])
 -    {
 -        rc=-1;
 -    }
 -    else if (pcomdist2[ia]>pcomdist2[ib])
 -    {
 -        rc=1;
 -    }
 -    else
 -    {
 -        rc=0;
 -    }
 -    return rc;
 -}
 -
 -
  static void calc_pbc_cluster(int ecenter,int nrefat,t_topology *top,int ePBC,
                               rvec x[],atom_id index[],
                               rvec clust_com,matrix box, rvec clustercenter)
@@@ -414,7 -441,7 +414,7 @@@ void check_trn(const char *fn
          gmx_fatal(FARGS,"%s is not a trajectory file, exiting\n",fn);
  }
  
 -#if (!defined WIN32 && !defined _WIN32 && !defined WIN64 && !defined _WIN64)
 +#ifndef GMX_NATIVE_WINDOWS
  void do_trunc(const char *fn, real t0)
  {
      t_fileio     *in;
@@@ -717,7 -744,7 +717,7 @@@ int gmx_trjconv(int argc,char *argv[]
                          { &bVels }, "Read and write velocities if possible" },
                      { "-force", FALSE, etBOOL,
                          { &bForce }, "Read and write forces if possible" },
 -#if (!defined WIN32 && !defined _WIN32 && !defined WIN64 && !defined _WIN64)
 +#ifndef GMX_NATIVE_WINDOWS
                      { "-trunc", FALSE, etTIME,
                          { &ttrunc }, 
                          "Truncate input trajectory file after this time (%t)" },
      in_file=opt2fn("-f",NFILE,fnm);
  
      if (ttrunc != -1) {
 -#if (!defined WIN32 && !defined _WIN32 && !defined WIN64 && !defined _WIN64)
 +#ifndef GMX_NATIVE_WINDOWS
          do_trunc(in_file,ttrunc);
  #endif
      }
                          }
                          /* Copy the input trxframe struct to the output trxframe struct */
                          frout = fr;
-                       frout.bV    &= bVels;
-                       frout.bF    &= bForce;
+                       frout.bV = (frout.bV && bVels);
+                       frout.bF = (frout.bF && bForce);
                          frout.natoms = nout;
                          if (bNeedPrec && (bSetPrec || !fr.bPrec)) {
                              frout.bPrec = TRUE;
          fprintf(stderr,"\n");
  
          close_trj(status);
 +        sfree(outf_base);
 +
        if (bRmPBC)
          gmx_rmpbc_done(gpbc);
        
diff --combined src/tools/make_edi.c
index d715409034828199dcc59a39065510bac422e65d,d6f8d94663b82b4625c6e98f78c08bce12b545c1..1a9b9ef622a1c860730b7c092047661aa9cd06e2
@@@ -34,7 -34,6 +34,7 @@@
  #ifdef HAVE_CONFIG_H
  #include <config.h>
  #endif
 +#include "gmx_header_config.h"
  
  #include <math.h>
  #include <stdlib.h>
@@@ -236,15 -235,14 +236,15 @@@ int sscan_list(int *list[], const char 
  
         /* format error occured */
         case sError:
 -       gmx_fatal(FARGS,"Error in the list of eigenvectors for %s at pos %d with char %c",listname,pos-startpos,*(pos-1));
 -
 +         gmx_fatal(FARGS,"Error in the list of eigenvectors for %s at pos %d with char %c",listname,pos-startpos,*(pos-1));
 +         break;
         /* logical error occured */
         case sZero:
 -               gmx_fatal(FARGS,"Error in the list of eigenvectors for %s at pos %d: eigenvector 0 is not valid",listname,pos-startpos);
 +                 gmx_fatal(FARGS,"Error in the list of eigenvectors for %s at pos %d: eigenvector 0 is not valid",listname,pos-startpos);
 +                 break;
         case sSmaller:
 -               gmx_fatal(FARGS,"Error in the list of eigenvectors for %s at pos %d: second index %d is not bigger than %d",listname,pos-startpos,end_number,number);
 -
 +                 gmx_fatal(FARGS,"Error in the list of eigenvectors for %s at pos %d: second index %d is not bigger than %d",listname,pos-startpos,end_number,number);
 +                 break;
       }
     ++pos; /* read next character */
     } /*scanner has finished */
@@@ -443,7 -441,7 +443,7 @@@ void filter2edx(struct edix *edx,int ni
  
  void get_structure(t_atoms *atoms,const char *IndexFile,
                     const char *StructureFile,struct edix *edx,int nfit,
-                    atom_id ifit[],int natoms, atom_id index[])
+                    atom_id ifit[],int nav, atom_id index[])
  {
    atom_id *igro;  /*index corresponding to target or origin structure*/
    int ngro;
       gmx_fatal(FARGS,"You selected an index group with %d elements instead of %d",ngro,ntar);
    init_edx(edx);
    filter2edx(edx,nfit,ifit,ngro,igro,xtar,StructureFile);
+   /* If average and reference/fitting structure differ, append the average structure as well */
    if (ifit!=index) /*if fit structure is different append these coordinates, too -- don't mind duplicates*/
-      filter2edx(edx,natoms,index,ngro,igro,xtar,StructureFile);
+      filter2edx(edx,nav,index,ngro,igro,xtar,StructureFile);
  }
  
  int main(int argc,char *argv[])
      int        nvec1,*eignr1=NULL;
      rvec       *xav1,**eigvec1=NULL;
      t_atoms    *atoms=NULL;
-     int natoms;
+     int        nav;  /* Number of atoms in the average structure */
      char       *grpname;
      const char *indexfile;
      int        i;
      atom_id    *index,*ifit;
-     int        nfit;
+     int        nfit; /* Number of atoms in the reference/fit structure */
      int ev_class; /* parameter _class i.e. evMON, evRADFIX etc. */
      int nvecs;
      real *eigval1=NULL; /* in V3.3 this is parameter of read_eigenvectors */
      EigvecFile=opt2fn("-f",NFILE,fnm);
  
      /*read eigenvectors from eigvec.trr*/
-     read_eigenvectors(EigvecFile,&natoms,&bFit1,
+     read_eigenvectors(EigvecFile,&nav,&bFit1,
                        &xref1,&edi_params.fitmas,&xav1,&edi_params.pcamas,&nvec1,&eignr1,&eigvec1,&eigval1);
  
      bTop=read_tps_conf(ftp2fn(efTPS,NFILE,fnm),
      atoms=&top.atoms;
  
  
-     printf("\nSelect an index group of %d elements that corresponds to the eigenvectors\n",natoms);
+     printf("\nSelect an index group of %d elements that corresponds to the eigenvectors\n",nav);
      get_index(atoms,indexfile,1,&i,&index,&grpname); /*if indexfile != NULL parameter 'atoms' is ignored */
-     if (i!=natoms) {
+     if (i!=nav) {
          gmx_fatal(FARGS,"you selected a group with %d elements instead of %d",
-                   i,natoms);
+                   i,nav);
      }
      printf("\n");
  
      }
      else
      {
-         nfit=natoms;
+         nfit=nav;
          ifit=index;
      }
  
          }
      }
  
-     edi_params.ned=natoms;
+     edi_params.ned=nav;
  
    /*number of system atoms  */
    edi_params.nini=atoms->nr;
  
  
    /*store reference and average structure in edi_params*/
-   make_t_edx(&edi_params.sref,nfit,xref1,ifit);
-   make_t_edx(&edi_params.sav,natoms,xav1,index);
+   make_t_edx(&edi_params.sref,nfit,xref1,ifit );
+   make_t_edx(&edi_params.sav ,nav ,xav1 ,index);
  
  
    /* Store target positions in edi_params */
            fprintf(stderr, "\nNote: Providing a TARGET structure has no effect when using flooding.\n"
                            "      You may want to use -ori to define the flooding potential center.\n\n");
        }
-       get_structure(atoms,indexfile,TargetFile,&edi_params.star,nfit,ifit,natoms,index);
+       get_structure(atoms,indexfile,TargetFile,&edi_params.star,nfit,ifit,nav,index);
    }
    else
    {
    /* Store origin positions */
    if (opt2bSet("-ori",NFILE,fnm))
    {
-       get_structure(atoms,indexfile,OriginFile,&edi_params.sori,nfit,ifit,natoms,index);
+       get_structure(atoms,indexfile,OriginFile,&edi_params.sori,nfit,ifit,nav,index);
    }
    else
    {