Merge branch 'release-4-6'
authorRoland Schulz <roland@utk.edu>
Fri, 19 Oct 2012 18:50:56 +0000 (14:50 -0400)
committerRoland Schulz <roland@utk.edu>
Fri, 19 Oct 2012 19:22:32 +0000 (15:22 -0400)
Conflicts:
src/kernel/CMakeLists.txt (applied to programs/mdrun)
src/programs/mdrun/pme_switch.c (renamed)
src/programs/mdrun/pme_switch.h (renamed)

Change-Id: I0914b274baed69a25736dca50b57d1b8658d4ca8

16 files changed:
1  2 
src/gromacs/gmxlib/thread_mpi/pthreads.c
src/gromacs/gmxlib/thread_mpi/pthreads.h
src/gromacs/gmxlib/thread_mpi/tmpi_init.c
src/gromacs/gmxlib/thread_mpi/winthreads.c
src/gromacs/gmxlib/thread_mpi/winthreads.h
src/gromacs/legacyheaders/nbnxn_cuda_data_mgmt.h
src/gromacs/legacyheaders/thread_mpi/threads.h
src/gromacs/legacyheaders/thread_mpi/tmpi.h
src/gromacs/mdlib/clincs.c
src/gromacs/mdlib/gmx_wallcycle.c
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
src/programs/mdrun/CMakeLists.txt
src/programs/mdrun/md.c
src/programs/mdrun/pme_loadbal.c
src/programs/mdrun/pme_loadbal.h
src/programs/mdrun/runner.c

Simple merge
Simple merge
index 6c499aa0020dd1a7074a6adb1236f29c5f46ec5c,0000000000000000000000000000000000000000..85f65b9a96dd88945182374f3f3d70863a72699d
mode 100644,000000..100644
--- /dev/null
@@@ -1,52 -1,0 +1,52 @@@
-     do_gct.c    gctio.c         genalg.c    ionize.c
-     md.c        md_openmm.c     mdrun.c     membed.c
-     pme_switch.c  repl_ex.c   runner.c        xutils.c)
 +include_directories(${CMAKE_SOURCE_DIR}/src/gromacs/gmxpreprocess)
 +
 +set(MDRUN_SOURCES
++    do_gct.c      gctio.c       genalg.c    ionize.c
++    md.c          md_openmm.c   mdrun.c     membed.c
++    pme_loadbal.c repl_ex.c     runner.c    xutils.c)
 +
 +if(GMX_OPENMM) 
 +    include_directories(./gmx_gpu_utils ${OpenMM_INCLUDE_DIR})
 +    link_directories(${OpenMM_LIBRARY_DIR}) 
 +    # with this define no evn.var. is needed with OPENMM_PLUGIN_DIR
 +    # if the same OpenMM installation is used for running and building 
 +    add_definitions( -DOPENMM_PLUGIN_DIR="${OpenMM_PLUGIN_DIR}" ) 
 +    file(TO_CMAKE_PATH ${OpenMM_PLUGIN_DIR} _path)
 +    add_library(openmm_api_wrapper STATIC openmm_wrapper.cpp)
 +    target_link_libraries(openmm_api_wrapper gmx_gpu_utils ${OpenMM_LIBRARIES})
 +    set(GMX_OPENMM_LIBRARIES openmm_api_wrapper gmx_gpu_utils ${OpenMM_LIBRARIES})   
 +endif(GMX_OPENMM)
 +
 +if(GMX_FAHCORE)
 +    add_library(fahcore ${MDRUN_SOURCES})
 +else(GMX_FAHCORE)
 +    add_executable(mdrun ${MDRUN_SOURCES})
 +    gmx_add_man_page(mdrun)
 +    target_link_libraries(mdrun ${GMX_EXTRA_LIBRARIES} libgromacs ${GMX_OPENMM_LIBRARIES}
 +        ${OpenMP_LINKER_FLAGS})
 +    set_target_properties(mdrun PROPERTIES OUTPUT_NAME "mdrun${GMX_BINARY_SUFFIX}"
 +        COMPILE_FLAGS "${OpenMP_C_FLAGS}")
 +    install(TARGETS mdrun DESTINATION ${BIN_INSTALL_DIR} COMPONENT mdrun)
 +
 +    if(GMX_OPENMM AND MSVC)
 +        set_target_properties(mdrun PROPERTIES LINK_FLAGS "/NODEFAULTLIB:LIBCMT")
 +    endif()
 +
 +    # Create the custom install-mdrun target
 +    if (BUILD_SHARED_LIBS)
 +        # If shared libraries are used, we need to install the libraries in
 +        # addition to the mdrun binary.
 +        add_custom_target(install-mdrun
 +            COMMAND ${CMAKE_COMMAND} -DCOMPONENT=libraries
 +                    -P ${CMAKE_BINARY_DIR}/cmake_install.cmake
 +            COMMAND ${CMAKE_COMMAND} -DCOMPONENT=mdrun
 +                    -P ${CMAKE_BINARY_DIR}/cmake_install.cmake
 +            COMMENT "Installing mdrun")
 +    else (BUILD_SHARED_LIBS)
 +        add_custom_target(install-mdrun
 +            COMMAND ${CMAKE_COMMAND} -DCOMPONENT=mdrun
 +                    -P ${CMAKE_BINARY_DIR}/cmake_install.cmake
 +            COMMENT "Installing mdrun")
 +    endif (BUILD_SHARED_LIBS)
 +    add_dependencies(install-mdrun mdrun)
 +endif(GMX_FAHCORE)
index d6f4017607a1cd61c9269fe67a9e6022c389982f,0000000000000000000000000000000000000000..2a99d5f7ee53cb006df15b76139f621d48556db0
mode 100644,000000..100644
--- /dev/null
@@@ -1,2127 -1,0 +1,2132 @@@
- #include "pme_switch.h"
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "sysstuff.h"
 +#include "vec.h"
 +#include "statutil.h"
 +#include "vcm.h"
 +#include "mdebin.h"
 +#include "nrnb.h"
 +#include "calcmu.h"
 +#include "index.h"
 +#include "vsite.h"
 +#include "update.h"
 +#include "ns.h"
 +#include "trnio.h"
 +#include "xtcio.h"
 +#include "mdrun.h"
 +#include "md_support.h"
 +#include "confio.h"
 +#include "network.h"
 +#include "pull.h"
 +#include "xvgr.h"
 +#include "physics.h"
 +#include "names.h"
 +#include "xmdrun.h"
 +#include "ionize.h"
 +#include "disre.h"
 +#include "orires.h"
 +#include "pme.h"
 +#include "mdatoms.h"
 +#include "repl_ex.h"
 +#include "qmmm.h"
 +#include "domdec.h"
 +#include "domdec_network.h"
 +#include "partdec.h"
 +#include "topsort.h"
 +#include "coulomb.h"
 +#include "constr.h"
 +#include "shellfc.h"
 +#include "compute_io.h"
 +#include "mvdata.h"
 +#include "checkpoint.h"
 +#include "mtop_util.h"
 +#include "sighandler.h"
 +#include "txtdump.h"
 +#include "string2.h"
-     pme_switch_t    pme_switch=NULL;
++#include "pme_loadbal.h"
 +#include "bondf.h"
 +#include "membed.h"
 +#include "types/nlistheuristics.h"
 +#include "types/iteratedconstraints.h"
 +#include "nbnxn_cuda_data_mgmt.h"
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#endif
 +#ifdef GMX_THREAD_MPI
 +#include "tmpi.h"
 +#endif
 +
 +#ifdef GMX_FAHCORE
 +#include "corewrap.h"
 +#endif
 +
 +static void reset_all_counters(FILE *fplog,t_commrec *cr,
 +                               gmx_large_int_t step,
 +                               gmx_large_int_t *step_rel,t_inputrec *ir,
 +                               gmx_wallcycle_t wcycle,t_nrnb *nrnb,
 +                               gmx_runtime_t *runtime,
 +                               nbnxn_cuda_ptr_t cu_nbv)
 +{
 +    char sbuf[STEPSTRSIZE];
 +
 +    /* Reset all the counters related to performance over the run */
 +    md_print_warn(cr,fplog,"step %s: resetting all time and cycle counters\n",
 +                  gmx_step_str(step,sbuf));
 +
 +    if (cu_nbv)
 +    {
 +        nbnxn_cuda_reset_timings(cu_nbv);
 +    }
 +
 +    wallcycle_stop(wcycle,ewcRUN);
 +    wallcycle_reset_all(wcycle);
 +    if (DOMAINDECOMP(cr))
 +    {
 +        reset_dd_statistics_counters(cr->dd);
 +    }
 +    init_nrnb(nrnb);
 +    ir->init_step += *step_rel;
 +    ir->nsteps    -= *step_rel;
 +    *step_rel = 0;
 +    wallcycle_start(wcycle,ewcRUN);
 +    runtime_start(runtime);
 +    print_date_and_time(fplog,cr->nodeid,"Restarted time",runtime);
 +}
 +
 +double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[],
 +             const output_env_t oenv, gmx_bool bVerbose,gmx_bool bCompact,
 +             int nstglobalcomm,
 +             gmx_vsite_t *vsite,gmx_constr_t constr,
 +             int stepout,t_inputrec *ir,
 +             gmx_mtop_t *top_global,
 +             t_fcdata *fcd,
 +             t_state *state_global,
 +             t_mdatoms *mdatoms,
 +             t_nrnb *nrnb,gmx_wallcycle_t wcycle,
 +             gmx_edsam_t ed,t_forcerec *fr,
 +             int repl_ex_nst,int repl_ex_nex,int repl_ex_seed,gmx_membed_t membed,
 +             real cpt_period,real max_hours,
 +             const char *deviceOptions,
 +             unsigned long Flags,
 +             gmx_runtime_t *runtime)
 +{
 +    gmx_mdoutf_t *outf;
 +    gmx_large_int_t step,step_rel;
 +    double     run_time;
 +    double     t,t0,lam0[efptNR];
 +    gmx_bool       bGStatEveryStep,bGStat,bCalcVir,bCalcEner;
 +    gmx_bool       bNS,bNStList,bSimAnn,bStopCM,bRerunMD,bNotLastFrame=FALSE,
 +               bFirstStep,bStateFromCP,bStateFromTPX,bInitStep,bLastStep,
 +               bBornRadii,bStartingFromCpt;
 +    gmx_bool   bDoDHDL=FALSE,bDoFEP=FALSE,bDoExpanded=FALSE;
 +    gmx_bool       do_ene,do_log,do_verbose,bRerunWarnNoV=TRUE,
 +               bForceUpdate=FALSE,bCPT;
 +    int        mdof_flags;
 +    gmx_bool       bMasterState;
 +    int        force_flags,cglo_flags;
 +    tensor     force_vir,shake_vir,total_vir,tmp_vir,pres;
 +    int        i,m;
 +    t_trxstatus *status;
 +    rvec       mu_tot;
 +    t_vcm      *vcm;
 +    t_state    *bufstate=NULL;   
 +    matrix     *scale_tot,pcoupl_mu,M,ebox;
 +    gmx_nlheur_t nlh;
 +    t_trxframe rerun_fr;
 +    gmx_repl_ex_t repl_ex=NULL;
 +    int        nchkpt=1;
 +    gmx_localtop_t *top;      
 +    t_mdebin *mdebin=NULL;
 +    df_history_t df_history;
 +    t_state    *state=NULL;
 +    rvec       *f_global=NULL;
 +    int        n_xtc=-1;
 +    rvec       *x_xtc=NULL;
 +    gmx_enerdata_t *enerd;
 +    rvec       *f=NULL;
 +    gmx_global_stat_t gstat;
 +    gmx_update_t upd=NULL;
 +    t_graph    *graph=NULL;
 +    globsig_t   gs;
 +    gmx_rng_t mcrng=NULL;
 +    gmx_bool        bFFscan;
 +    gmx_groups_t *groups;
 +    gmx_ekindata_t *ekind, *ekind_save;
 +    gmx_shellfc_t shellfc;
 +    int         count,nconverged=0;
 +    real        timestep=0;
 +    double      tcount=0;
 +    gmx_bool        bIonize=FALSE;
 +    gmx_bool        bTCR=FALSE,bConverged=TRUE,bOK,bSumEkinhOld,bExchanged;
 +    gmx_bool        bAppend;
 +    gmx_bool        bResetCountersHalfMaxH=FALSE;
 +    gmx_bool        bVV,bIterations,bFirstIterate,bTemp,bPres,bTrotter;
 +    real        mu_aver=0,dvdl;
 +    int         a0,a1,gnx=0,ii;
 +    atom_id     *grpindex=NULL;
 +    char        *grpname;
 +    t_coupl_rec *tcr=NULL;
 +    rvec        *xcopy=NULL,*vcopy=NULL,*cbuf=NULL;
 +    matrix      boxcopy={{0}},lastbox;
 +      tensor      tmpvir;
 +      real        fom,oldfom,veta_save,pcurr,scalevir,tracevir;
 +      real        vetanew = 0;
 +    int         lamnew=0;
 +    /* for FEP */
 +    int         fep_state=0;
 +    int         nstfep;
 +    real        rate;
 +    double      cycles;
 +      real        saved_conserved_quantity = 0;
 +    real        last_ekin = 0;
 +      int         iter_i;
 +      t_extmass   MassQ;
 +    int         **trotter_seq; 
 +    char        sbuf[STEPSTRSIZE],sbuf2[STEPSTRSIZE];
 +    int         handled_stop_condition=gmx_stop_cond_none; /* compare to get_stop_condition*/
 +    gmx_iterate_t iterate;
 +    gmx_large_int_t multisim_nsteps=-1; /* number of steps to do  before first multisim 
 +                                          simulation stops. If equal to zero, don't
 +                                          communicate any more between multisims.*/
 +    /* PME load balancing data for GPU kernels */
-         switch_pme_init(&pme_switch,ir,state->box,fr->ic,fr->pmedata);
++    pme_load_balancing_t pme_loadbal=NULL;
 +    double          cycles_pmes;
 +    gmx_bool        bPMETuneTry=FALSE,bPMETuneRunning=FALSE;
 +
 +    if(MASTER(cr))
 +    {
 +        fprintf(stderr,
 +                "\n* WARNING * WARNING * WARNING * WARNING * WARNING * WARNING *\n"
 +                "We have just committed the new CPU detection code in this branch,\n"
 +                "and will commit new SSE/AVX kernels in a few days. However, this\n"
 +                "means that currently only the NxN kernels are accelerated!\n"
 +                "In the mean time, you might want to avoid production runs in 4.6.\n\n");
 +    }
 +
 +#ifdef GMX_FAHCORE
 +    /* Temporary addition for FAHCORE checkpointing */
 +    int chkpt_ret;
 +#endif
 +    
 +    /* Check for special mdrun options */
 +    bRerunMD = (Flags & MD_RERUN);
 +    bIonize  = (Flags & MD_IONIZE);
 +    bFFscan  = (Flags & MD_FFSCAN);
 +    bAppend  = (Flags & MD_APPENDFILES);
 +    if (Flags & MD_RESETCOUNTERSHALFWAY)
 +    {
 +        if (ir->nsteps > 0)
 +        {
 +            /* Signal to reset the counters half the simulation steps. */
 +            wcycle_set_reset_counters(wcycle,ir->nsteps/2);
 +        }
 +        /* Signal to reset the counters halfway the simulation time. */
 +        bResetCountersHalfMaxH = (max_hours > 0);
 +    }
 +
 +    /* md-vv uses averaged full step velocities for T-control 
 +       md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
 +       md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
 +    bVV = EI_VV(ir->eI);
 +    if (bVV) /* to store the initial velocities while computing virial */
 +    {
 +        snew(cbuf,top_global->natoms);
 +    }
 +    /* all the iteratative cases - only if there are constraints */ 
 +    bIterations = ((IR_NPH_TROTTER(ir) || IR_NPT_TROTTER(ir)) && (constr) && (!bRerunMD));
 +    bTrotter = (bVV && (IR_NPT_TROTTER(ir) || IR_NPH_TROTTER(ir) || IR_NVT_TROTTER(ir)));
 +    
 +    if (bRerunMD)
 +    {
 +        /* Since we don't know if the frames read are related in any way,
 +         * rebuild the neighborlist at every step.
 +         */
 +        ir->nstlist       = 1;
 +        ir->nstcalcenergy = 1;
 +        nstglobalcomm     = 1;
 +    }
 +
 +    check_ir_old_tpx_versions(cr,fplog,ir,top_global);
 +
 +    nstglobalcomm = check_nstglobalcomm(fplog,cr,nstglobalcomm,ir);
 +    bGStatEveryStep = (nstglobalcomm == 1);
 +
 +    if (!bGStatEveryStep && ir->nstlist == -1 && fplog != NULL)
 +    {
 +        fprintf(fplog,
 +                "To reduce the energy communication with nstlist = -1\n"
 +                "the neighbor list validity should not be checked at every step,\n"
 +                "this means that exact integration is not guaranteed.\n"
 +                "The neighbor list validity is checked after:\n"
 +                "  <n.list life time> - 2*std.dev.(n.list life time)  steps.\n"
 +                "In most cases this will result in exact integration.\n"
 +                "This reduces the energy communication by a factor of 2 to 3.\n"
 +                "If you want less energy communication, set nstlist > 3.\n\n");
 +    }
 +
 +    if (bRerunMD || bFFscan)
 +    {
 +        ir->nstxtcout = 0;
 +    }
 +    groups = &top_global->groups;
 +
 +    /* Initial values */
 +    init_md(fplog,cr,ir,oenv,&t,&t0,state_global->lambda,
 +            &(state_global->fep_state),lam0,
 +            nrnb,top_global,&upd,
 +            nfile,fnm,&outf,&mdebin,
 +            force_vir,shake_vir,mu_tot,&bSimAnn,&vcm,state_global,Flags);
 +
 +    clear_mat(total_vir);
 +    clear_mat(pres);
 +    /* Energy terms and groups */
 +    snew(enerd,1);
 +    init_enerdata(top_global->groups.grps[egcENER].nr,ir->fepvals->n_lambda,
 +                  enerd);
 +    if (DOMAINDECOMP(cr))
 +    {
 +        f = NULL;
 +    }
 +    else
 +    {
 +        snew(f,top_global->natoms);
 +    }
 +
 +    /* lambda Monte carlo random number generator  */
 +    if (ir->bExpanded)
 +    {
 +        mcrng = gmx_rng_init(ir->expandedvals->lmc_seed);
 +    }
 +    /* copy the state into df_history */
 +    copy_df_history(&df_history,&state_global->dfhist);
 +
 +    /* Kinetic energy data */
 +    snew(ekind,1);
 +    init_ekindata(fplog,top_global,&(ir->opts),ekind);
 +    /* needed for iteration of constraints */
 +    snew(ekind_save,1);
 +    init_ekindata(fplog,top_global,&(ir->opts),ekind_save);
 +    /* Copy the cos acceleration to the groups struct */    
 +    ekind->cosacc.cos_accel = ir->cos_accel;
 +
 +    gstat = global_stat_init(ir);
 +    debug_gmx();
 +
 +    /* Check for polarizable models and flexible constraints */
 +    shellfc = init_shell_flexcon(fplog,
 +                                 top_global,n_flexible_constraints(constr),
 +                                 (ir->bContinuation || 
 +                                  (DOMAINDECOMP(cr) && !MASTER(cr))) ?
 +                                 NULL : state_global->x);
 +
 +    if (DEFORM(*ir))
 +    {
 +#ifdef GMX_THREAD_MPI
 +        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
 +#endif
 +        set_deform_reference_box(upd,
 +                                 deform_init_init_step_tpx,
 +                                 deform_init_box_tpx);
 +#ifdef GMX_THREAD_MPI
 +        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
 +#endif
 +    }
 +
 +    {
 +        double io = compute_io(ir,top_global->natoms,groups,mdebin->ebin->nener,1);
 +        if ((io > 2000) && MASTER(cr))
 +            fprintf(stderr,
 +                    "\nWARNING: This run will generate roughly %.0f Mb of data\n\n",
 +                    io);
 +    }
 +
 +    if (DOMAINDECOMP(cr)) {
 +        top = dd_init_local_top(top_global);
 +
 +        snew(state,1);
 +        dd_init_local_state(cr->dd,state_global,state);
 +
 +        if (DDMASTER(cr->dd) && ir->nstfout) {
 +            snew(f_global,state_global->natoms);
 +        }
 +    } else {
 +        if (PAR(cr)) {
 +            /* Initialize the particle decomposition and split the topology */
 +            top = split_system(fplog,top_global,ir,cr);
 +
 +            pd_cg_range(cr,&fr->cg0,&fr->hcg);
 +            pd_at_range(cr,&a0,&a1);
 +        } else {
 +            top = gmx_mtop_generate_local_top(top_global,ir);
 +
 +            a0 = 0;
 +            a1 = top_global->natoms;
 +        }
 +
 +        forcerec_set_excl_load(fr,top,cr);
 +
 +        state = partdec_init_local_state(cr,state_global);
 +        f_global = f;
 +
 +        atoms2md(top_global,ir,0,NULL,a0,a1-a0,mdatoms);
 +
 +        if (vsite) {
 +            set_vsite_top(vsite,top,mdatoms,cr);
 +        }
 +
 +        if (ir->ePBC != epbcNONE && !fr->bMolPBC) {
 +            graph = mk_graph(fplog,&(top->idef),0,top_global->natoms,FALSE,FALSE);
 +        }
 +
 +        if (shellfc) {
 +            make_local_shells(cr,mdatoms,shellfc);
 +        }
 +
 +        init_bonded_thread_force_reduction(fr,&top->idef);
 +
 +        if (ir->pull && PAR(cr)) {
 +            dd_make_local_pull_groups(NULL,ir->pull,mdatoms);
 +        }
 +    }
 +
 +    if (DOMAINDECOMP(cr))
 +    {
 +        /* Distribute the charge groups over the nodes from the master node */
 +        dd_partition_system(fplog,ir->init_step,cr,TRUE,1,
 +                            state_global,top_global,ir,
 +                            state,&f,mdatoms,top,fr,
 +                            vsite,shellfc,constr,
 +                            nrnb,wcycle,FALSE);
 +
 +    }
 +
 +    update_mdatoms(mdatoms,state->lambda[efptMASS]);
 +
 +    if (opt2bSet("-cpi",nfile,fnm))
 +    {
 +        bStateFromCP = gmx_fexist_master(opt2fn_master("-cpi",nfile,fnm,cr),cr);
 +    }
 +    else
 +    {
 +        bStateFromCP = FALSE;
 +    }
 +
 +    if (MASTER(cr))
 +    {
 +        if (bStateFromCP)
 +        {
 +            /* Update mdebin with energy history if appending to output files */
 +            if ( Flags & MD_APPENDFILES )
 +            {
 +                restore_energyhistory_from_state(mdebin,&state_global->enerhist);
 +            }
 +            else
 +            {
 +                /* We might have read an energy history from checkpoint,
 +                 * free the allocated memory and reset the counts.
 +                 */
 +                done_energyhistory(&state_global->enerhist);
 +                init_energyhistory(&state_global->enerhist);
 +            }
 +        }
 +        /* Set the initial energy history in state by updating once */
 +        update_energyhistory(&state_global->enerhist,mdebin);
 +    } 
 +
 +    if ((state->flags & (1<<estLD_RNG)) && (Flags & MD_READ_RNG)) 
 +    {
 +        /* Set the random state if we read a checkpoint file */
 +        set_stochd_state(upd,state);
 +    }
 +
 +    if (state->flags & (1<<estMC_RNG))
 +    {
 +        set_mc_state(mcrng,state);
 +    }
 +
 +    /* Initialize constraints */
 +    if (constr) {
 +        if (!DOMAINDECOMP(cr))
 +            set_constraints(constr,top,ir,mdatoms,cr);
 +    }
 +
 +    /* Check whether we have to GCT stuff */
 +    bTCR = ftp2bSet(efGCT,nfile,fnm);
 +    if (bTCR) {
 +        if (MASTER(cr)) {
 +            fprintf(stderr,"Will do General Coupling Theory!\n");
 +        }
 +        gnx = top_global->mols.nr;
 +        snew(grpindex,gnx);
 +        for(i=0; (i<gnx); i++) {
 +            grpindex[i] = i;
 +        }
 +    }
 +
 +    if (repl_ex_nst > 0)
 +    {
 +        /* We need to be sure replica exchange can only occur
 +         * when the energies are current */
 +        check_nst_param(fplog,cr,"nstcalcenergy",ir->nstcalcenergy,
 +                        "repl_ex_nst",&repl_ex_nst);
 +        /* This check needs to happen before inter-simulation
 +         * signals are initialized, too */
 +    }
 +    if (repl_ex_nst > 0 && MASTER(cr))
 +    {
 +        repl_ex = init_replica_exchange(fplog,cr->ms,state_global,ir,
 +                                        repl_ex_nst,repl_ex_nex,repl_ex_seed);
 +    }
 +
 +    /* PME tuning is only supported with GPUs or PME nodes and not with rerun */
 +    if ((Flags & MD_TUNEPME) &&
 +        EEL_PME(fr->eeltype) &&
 +        fr->cutoff_scheme == ecutsVERLET &&
 +        (fr->nbv->bUseGPU || !(cr->duty & DUTY_PME)) &&
 +        !bRerunMD)
 +    {
-                         switch_pme(pme_switch,cr,
-                                    (bVerbose && MASTER(cr)) ? stderr : NULL,
-                                    fplog,
-                                    ir,state,cycles_pmes,
-                                    fr->ic,fr->nbv,&fr->pmedata,
-                                    step);
++        pme_loadbal_init(&pme_loadbal,ir,state->box,fr->ic,fr->pmedata);
 +        cycles_pmes = 0;
 +        if (cr->duty & DUTY_PME)
 +        {
 +            /* Start tuning right away, as we can't measure the load */
 +            bPMETuneRunning = TRUE;
 +        }
 +        else
 +        {
 +            /* Separate PME nodes, we can measure the PP/PME load balance */
 +            bPMETuneTry = TRUE;
 +        }
 +    }
 +
 +    if (!ir->bContinuation && !bRerunMD)
 +    {
 +        if (mdatoms->cFREEZE && (state->flags & (1<<estV)))
 +        {
 +            /* Set the velocities of frozen particles to zero */
 +            for(i=mdatoms->start; i<mdatoms->start+mdatoms->homenr; i++)
 +            {
 +                for(m=0; m<DIM; m++)
 +                {
 +                    if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
 +                    {
 +                        state->v[i][m] = 0;
 +                    }
 +                }
 +            }
 +        }
 +
 +        if (constr)
 +        {
 +            /* Constrain the initial coordinates and velocities */
 +            do_constrain_first(fplog,constr,ir,mdatoms,state,f,
 +                               graph,cr,nrnb,fr,top,shake_vir);
 +        }
 +        if (vsite)
 +        {
 +            /* Construct the virtual sites for the initial configuration */
 +            construct_vsites(fplog,vsite,state->x,nrnb,ir->delta_t,NULL,
 +                             top->idef.iparams,top->idef.il,
 +                             fr->ePBC,fr->bMolPBC,graph,cr,state->box);
 +        }
 +    }
 +
 +    debug_gmx();
 +  
 +    /* set free energy calculation frequency as the minimum of nstdhdl, nstexpanded, and nstrepl_ex_nst*/
 +    nstfep = ir->fepvals->nstdhdl;
 +    if (ir->bExpanded && (nstfep > ir->expandedvals->nstexpanded))
 +    {
 +        nstfep = ir->expandedvals->nstexpanded;
 +    }
 +    if (repl_ex_nst > 0 && repl_ex_nst > nstfep)
 +    {
 +        nstfep = repl_ex_nst;
 +    }
 +
 +    /* I'm assuming we need global communication the first time! MRS */
 +    cglo_flags = (CGLO_TEMPERATURE | CGLO_GSTAT
 +                  | ((ir->comm_mode != ecmNO) ? CGLO_STOPCM:0)
 +                  | (bVV ? CGLO_PRESSURE:0)
 +                  | (bVV ? CGLO_CONSTRAINT:0)
 +                  | (bRerunMD ? CGLO_RERUNMD:0)
 +                  | ((Flags & MD_READ_EKIN) ? CGLO_READEKIN:0));
 +    
 +    bSumEkinhOld = FALSE;
 +    compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
 +                    NULL,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
 +                    constr,NULL,FALSE,state->box,
 +                    top_global,&pcurr,top_global->natoms,&bSumEkinhOld,cglo_flags);
 +    if (ir->eI == eiVVAK) {
 +        /* a second call to get the half step temperature initialized as well */ 
 +        /* we do the same call as above, but turn the pressure off -- internally to 
 +           compute_globals, this is recognized as a velocity verlet half-step 
 +           kinetic energy calculation.  This minimized excess variables, but 
 +           perhaps loses some logic?*/
 +        
 +        compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
 +                        NULL,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
 +                        constr,NULL,FALSE,state->box,
 +                        top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
 +                        cglo_flags &~ (CGLO_STOPCM | CGLO_PRESSURE));
 +    }
 +    
 +    /* Calculate the initial half step temperature, and save the ekinh_old */
 +    if (!(Flags & MD_STARTFROMCPT)) 
 +    {
 +        for(i=0; (i<ir->opts.ngtc); i++) 
 +        {
 +            copy_mat(ekind->tcstat[i].ekinh,ekind->tcstat[i].ekinh_old);
 +        } 
 +    }
 +    if (ir->eI != eiVV) 
 +    {
 +        enerd->term[F_TEMP] *= 2; /* result of averages being done over previous and current step,
 +                                     and there is no previous step */
 +    }
 +    
 +    /* if using an iterative algorithm, we need to create a working directory for the state. */
 +    if (bIterations) 
 +    {
 +            bufstate = init_bufstate(state);
 +    }
 +    if (bFFscan) 
 +    {
 +        snew(xcopy,state->natoms);
 +        snew(vcopy,state->natoms);
 +        copy_rvecn(state->x,xcopy,0,state->natoms);
 +        copy_rvecn(state->v,vcopy,0,state->natoms);
 +        copy_mat(state->box,boxcopy);
 +    } 
 +    
 +    /* need to make an initiation call to get the Trotter variables set, as well as other constants for non-trotter
 +       temperature control */
 +    trotter_seq = init_npt_vars(ir,state,&MassQ,bTrotter);
 +    
 +    if (MASTER(cr))
 +    {
 +        if (constr && !ir->bContinuation && ir->eConstrAlg == econtLINCS)
 +        {
 +            fprintf(fplog,
 +                    "RMS relative constraint deviation after constraining: %.2e\n",
 +                    constr_rmsd(constr,FALSE));
 +        }
 +        if (EI_STATE_VELOCITY(ir->eI))
 +        {
 +            fprintf(fplog,"Initial temperature: %g K\n",enerd->term[F_TEMP]);
 +        }
 +        if (bRerunMD)
 +        {
 +            fprintf(stderr,"starting md rerun '%s', reading coordinates from"
 +                    " input trajectory '%s'\n\n",
 +                    *(top_global->name),opt2fn("-rerun",nfile,fnm));
 +            if (bVerbose)
 +            {
 +                fprintf(stderr,"Calculated time to finish depends on nsteps from "
 +                        "run input file,\nwhich may not correspond to the time "
 +                        "needed to process input trajectory.\n\n");
 +            }
 +        }
 +        else
 +        {
 +            char tbuf[20];
 +            fprintf(stderr,"starting mdrun '%s'\n",
 +                    *(top_global->name));
 +            if (ir->nsteps >= 0)
 +            {
 +                sprintf(tbuf,"%8.1f",(ir->init_step+ir->nsteps)*ir->delta_t);
 +            }
 +            else
 +            {
 +                sprintf(tbuf,"%s","infinite");
 +            }
 +            if (ir->init_step > 0)
 +            {
 +                fprintf(stderr,"%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
 +                        gmx_step_str(ir->init_step+ir->nsteps,sbuf),tbuf,
 +                        gmx_step_str(ir->init_step,sbuf2),
 +                        ir->init_step*ir->delta_t);
 +            }
 +            else
 +            {
 +                fprintf(stderr,"%s steps, %s ps.\n",
 +                        gmx_step_str(ir->nsteps,sbuf),tbuf);
 +            }
 +        }
 +        fprintf(fplog,"\n");
 +    }
 +
 +    /* Set and write start time */
 +    runtime_start(runtime);
 +    print_date_and_time(fplog,cr->nodeid,"Started mdrun",runtime);
 +    wallcycle_start(wcycle,ewcRUN);
 +    if (fplog)
 +    {
 +        fprintf(fplog,"\n");
 +    }
 +
 +    /* safest point to do file checkpointing is here.  More general point would be immediately before integrator call */
 +#ifdef GMX_FAHCORE
 +    chkpt_ret=fcCheckPointParallel( cr->nodeid,
 +                                    NULL,0);
 +    if ( chkpt_ret == 0 ) 
 +        gmx_fatal( 3,__FILE__,__LINE__, "Checkpoint error on step %d\n", 0 );
 +#endif
 +
 +    debug_gmx();
 +    /***********************************************************
 +     *
 +     *             Loop over MD steps 
 +     *
 +     ************************************************************/
 +
 +    /* if rerunMD then read coordinates and velocities from input trajectory */
 +    if (bRerunMD)
 +    {
 +        if (getenv("GMX_FORCE_UPDATE"))
 +        {
 +            bForceUpdate = TRUE;
 +        }
 +
 +        rerun_fr.natoms = 0;
 +        if (MASTER(cr))
 +        {
 +            bNotLastFrame = read_first_frame(oenv,&status,
 +                                             opt2fn("-rerun",nfile,fnm),
 +                                             &rerun_fr,TRX_NEED_X | TRX_READ_V);
 +            if (rerun_fr.natoms != top_global->natoms)
 +            {
 +                gmx_fatal(FARGS,
 +                          "Number of atoms in trajectory (%d) does not match the "
 +                          "run input file (%d)\n",
 +                          rerun_fr.natoms,top_global->natoms);
 +            }
 +            if (ir->ePBC != epbcNONE)
 +            {
 +                if (!rerun_fr.bBox)
 +                {
 +                    gmx_fatal(FARGS,"Rerun trajectory frame step %d time %f does not contain a box, while pbc is used",rerun_fr.step,rerun_fr.time);
 +                }
 +                if (max_cutoff2(ir->ePBC,rerun_fr.box) < sqr(fr->rlistlong))
 +                {
 +                    gmx_fatal(FARGS,"Rerun trajectory frame step %d time %f has too small box dimensions",rerun_fr.step,rerun_fr.time);
 +                }
 +            }
 +        }
 +
 +        if (PAR(cr))
 +        {
 +            rerun_parallel_comm(cr,&rerun_fr,&bNotLastFrame);
 +        }
 +
 +        if (ir->ePBC != epbcNONE)
 +        {
 +            /* Set the shift vectors.
 +             * Necessary here when have a static box different from the tpr box.
 +             */
 +            calc_shifts(rerun_fr.box,fr->shift_vec);
 +        }
 +    }
 +
 +    /* loop over MD steps or if rerunMD to end of input trajectory */
 +    bFirstStep = TRUE;
 +    /* Skip the first Nose-Hoover integration when we get the state from tpx */
 +    bStateFromTPX = !bStateFromCP;
 +    bInitStep = bFirstStep && (bStateFromTPX || bVV);
 +    bStartingFromCpt = (Flags & MD_STARTFROMCPT) && bInitStep;
 +    bLastStep    = FALSE;
 +    bSumEkinhOld = FALSE;
 +    bExchanged   = FALSE;
 +
 +    init_global_signals(&gs,cr,ir,repl_ex_nst);
 +
 +    step = ir->init_step;
 +    step_rel = 0;
 +
 +    if (ir->nstlist == -1)
 +    {
 +        init_nlistheuristics(&nlh,bGStatEveryStep,step);
 +    }
 +
 +    if (MULTISIM(cr) && (repl_ex_nst <=0 ))
 +    {
 +        /* check how many steps are left in other sims */
 +        multisim_nsteps=get_multisim_nsteps(cr, ir->nsteps);
 +    }
 +
 +
 +    /* and stop now if we should */
 +    bLastStep = (bRerunMD || (ir->nsteps >= 0 && step_rel > ir->nsteps) ||
 +                 ((multisim_nsteps >= 0) && (step_rel >= multisim_nsteps )));
 +    while (!bLastStep || (bRerunMD && bNotLastFrame)) {
 +
 +        wallcycle_start(wcycle,ewcSTEP);
 +
 +        if (bRerunMD) {
 +            if (rerun_fr.bStep) {
 +                step = rerun_fr.step;
 +                step_rel = step - ir->init_step;
 +            }
 +            if (rerun_fr.bTime) {
 +                t = rerun_fr.time;
 +            }
 +            else
 +            {
 +                t = step;
 +            }
 +        } 
 +        else 
 +        {
 +            bLastStep = (step_rel == ir->nsteps);
 +            t = t0 + step*ir->delta_t;
 +        }
 +
 +        if (ir->efep != efepNO || ir->bSimTemp)
 +            {
 +            /* find and set the current lambdas.  If rerunning, we either read in a state, or a lambda value,
 +               requiring different logic. */
 +            
 +            set_current_lambdas(step,ir->fepvals,bRerunMD,&rerun_fr,state_global,state,lam0);
 +            bDoDHDL = do_per_step(step,ir->fepvals->nstdhdl);
 +            bDoFEP  = (do_per_step(step,nstfep) && (ir->efep != efepNO));
 +            bDoExpanded  = (do_per_step(step,ir->expandedvals->nstexpanded) && (ir->bExpanded) && (step > 0));
 +        }
 +
 +        if (bSimAnn) 
 +        {
 +            update_annealing_target_temp(&(ir->opts),t);
 +        }
 +
 +        if (bRerunMD)
 +        {
 +            if (!(DOMAINDECOMP(cr) && !MASTER(cr)))
 +            {
 +                for(i=0; i<state_global->natoms; i++)
 +                {
 +                    copy_rvec(rerun_fr.x[i],state_global->x[i]);
 +                }
 +                if (rerun_fr.bV)
 +                {
 +                    for(i=0; i<state_global->natoms; i++)
 +                    {
 +                        copy_rvec(rerun_fr.v[i],state_global->v[i]);
 +                    }
 +                }
 +                else
 +                {
 +                    for(i=0; i<state_global->natoms; i++)
 +                    {
 +                        clear_rvec(state_global->v[i]);
 +                    }
 +                    if (bRerunWarnNoV)
 +                    {
 +                        fprintf(stderr,"\nWARNING: Some frames do not contain velocities.\n"
 +                                "         Ekin, temperature and pressure are incorrect,\n"
 +                                "         the virial will be incorrect when constraints are present.\n"
 +                                "\n");
 +                        bRerunWarnNoV = FALSE;
 +                    }
 +                }
 +            }
 +            copy_mat(rerun_fr.box,state_global->box);
 +            copy_mat(state_global->box,state->box);
 +
 +            if (vsite && (Flags & MD_RERUN_VSITE))
 +            {
 +                if (DOMAINDECOMP(cr))
 +                {
 +                    gmx_fatal(FARGS,"Vsite recalculation with -rerun is not implemented for domain decomposition, use particle decomposition");
 +                }
 +                if (graph)
 +                {
 +                    /* Following is necessary because the graph may get out of sync
 +                     * with the coordinates if we only have every N'th coordinate set
 +                     */
 +                    mk_mshift(fplog,graph,fr->ePBC,state->box,state->x);
 +                    shift_self(graph,state->box,state->x);
 +                }
 +                construct_vsites(fplog,vsite,state->x,nrnb,ir->delta_t,state->v,
 +                                 top->idef.iparams,top->idef.il,
 +                                 fr->ePBC,fr->bMolPBC,graph,cr,state->box);
 +                if (graph)
 +                {
 +                    unshift_self(graph,state->box,state->x);
 +                }
 +            }
 +        }
 +
 +        /* Stop Center of Mass motion */
 +        bStopCM = (ir->comm_mode != ecmNO && do_per_step(step,ir->nstcomm));
 +
 +        /* Copy back starting coordinates in case we're doing a forcefield scan */
 +        if (bFFscan)
 +        {
 +            for(ii=0; (ii<state->natoms); ii++)
 +            {
 +                copy_rvec(xcopy[ii],state->x[ii]);
 +                copy_rvec(vcopy[ii],state->v[ii]);
 +            }
 +            copy_mat(boxcopy,state->box);
 +        }
 +
 +        if (bRerunMD)
 +        {
 +            /* for rerun MD always do Neighbour Searching */
 +            bNS = (bFirstStep || ir->nstlist != 0);
 +            bNStList = bNS;
 +        }
 +        else
 +        {
 +            /* Determine whether or not to do Neighbour Searching and LR */
 +            bNStList = (ir->nstlist > 0  && step % ir->nstlist == 0);
 +            
 +            bNS = (bFirstStep || bExchanged || bNStList || bDoFEP ||
 +                   (ir->nstlist == -1 && nlh.nabnsb > 0));
 +
 +            if (bNS && ir->nstlist == -1)
 +            {
 +                set_nlistheuristics(&nlh,bFirstStep || bExchanged || bDoFEP, step);
 +            }
 +        } 
 +
 +        /* check whether we should stop because another simulation has 
 +           stopped. */
 +        if (MULTISIM(cr))
 +        {
 +            if ( (multisim_nsteps >= 0) &&  (step_rel >= multisim_nsteps)  &&  
 +                 (multisim_nsteps != ir->nsteps) )  
 +            {
 +                if (bNS)
 +                {
 +                    if (MASTER(cr))
 +                    {
 +                        fprintf(stderr, 
 +                                "Stopping simulation %d because another one has finished\n",
 +                                cr->ms->sim);
 +                    }
 +                    bLastStep=TRUE;
 +                    gs.sig[eglsCHKPT] = 1;
 +                }
 +            }
 +        }
 +
 +        /* < 0 means stop at next step, > 0 means stop at next NS step */
 +        if ( (gs.set[eglsSTOPCOND] < 0 ) ||
 +             ( (gs.set[eglsSTOPCOND] > 0 ) && ( bNS || ir->nstlist==0)) )
 +        {
 +            bLastStep = TRUE;
 +        }
 +
 +        /* Determine whether or not to update the Born radii if doing GB */
 +        bBornRadii=bFirstStep;
 +        if (ir->implicit_solvent && (step % ir->nstgbradii==0))
 +        {
 +            bBornRadii=TRUE;
 +        }
 +        
 +        do_log = do_per_step(step,ir->nstlog) || bFirstStep || bLastStep;
 +        do_verbose = bVerbose &&
 +                  (step % stepout == 0 || bFirstStep || bLastStep);
 +
 +        if (bNS && !(bFirstStep && ir->bContinuation && !bRerunMD))
 +        {
 +            if (bRerunMD)
 +            {
 +                bMasterState = TRUE;
 +            }
 +            else
 +            {
 +                bMasterState = FALSE;
 +                /* Correct the new box if it is too skewed */
 +                if (DYNAMIC_BOX(*ir))
 +                {
 +                    if (correct_box(fplog,step,state->box,graph))
 +                    {
 +                        bMasterState = TRUE;
 +                    }
 +                }
 +                if (DOMAINDECOMP(cr) && bMasterState)
 +                {
 +                    dd_collect_state(cr->dd,state,state_global);
 +                }
 +            }
 +
 +            if (DOMAINDECOMP(cr))
 +            {
 +                /* Repartition the domain decomposition */
 +                wallcycle_start(wcycle,ewcDOMDEC);
 +                dd_partition_system(fplog,step,cr,
 +                                    bMasterState,nstglobalcomm,
 +                                    state_global,top_global,ir,
 +                                    state,&f,mdatoms,top,fr,
 +                                    vsite,shellfc,constr,
 +                                    nrnb,wcycle,
 +                                    do_verbose && !bPMETuneRunning);
 +                wallcycle_stop(wcycle,ewcDOMDEC);
 +                /* If using an iterative integrator, reallocate space to match the decomposition */
 +            }
 +        }
 +
 +        if (MASTER(cr) && do_log && !bFFscan)
 +        {
 +            print_ebin_header(fplog,step,t,state->lambda[efptFEP]); /* can we improve the information printed here? */
 +        }
 +
 +        if (ir->efep != efepNO)
 +        {
 +            update_mdatoms(mdatoms,state->lambda[efptMASS]);
 +        }
 +
 +        if ((bRerunMD && rerun_fr.bV) || bExchanged)
 +        {
 +            
 +            /* We need the kinetic energy at minus the half step for determining
 +             * the full step kinetic energy and possibly for T-coupling.*/
 +            /* This may not be quite working correctly yet . . . . */
 +            compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
 +                            wcycle,enerd,NULL,NULL,NULL,NULL,mu_tot,
 +                            constr,NULL,FALSE,state->box,
 +                            top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
 +                            CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
 +        }
 +        clear_mat(force_vir);
 +        
 +        /* Ionize the atoms if necessary */
 +        if (bIonize)
 +        {
 +            ionize(fplog,oenv,mdatoms,top_global,t,ir,state->x,state->v,
 +                   mdatoms->start,mdatoms->start+mdatoms->homenr,state->box,cr);
 +        }
 +        
 +        /* Update force field in ffscan program */
 +        if (bFFscan)
 +        {
 +            if (update_forcefield(fplog,
 +                                  nfile,fnm,fr,
 +                                  mdatoms->nr,state->x,state->box))
 +            {
 +                gmx_finalize_par();
 +
 +                exit(0);
 +            }
 +        }
 +
 +        /* We write a checkpoint at this MD step when:
 +         * either at an NS step when we signalled through gs,
 +         * or at the last step (but not when we do not want confout),
 +         * but never at the first step or with rerun.
 +         */
 +        bCPT = (((gs.set[eglsCHKPT] && (bNS || ir->nstlist == 0)) ||
 +                 (bLastStep && (Flags & MD_CONFOUT))) &&
 +                step > ir->init_step && !bRerunMD);
 +        if (bCPT)
 +        {
 +            gs.set[eglsCHKPT] = 0;
 +        }
 +
 +        /* Determine the energy and pressure:
 +         * at nstcalcenergy steps and at energy output steps (set below).
 +         */
 +        if (EI_VV(ir->eI) && (!bInitStep))
 +        {
 +            /* for vv, the first half actually corresponds to the last step */
 +            bCalcEner = do_per_step(step-1,ir->nstcalcenergy);
 +        }
 +        else
 +        {
 +            bCalcEner = do_per_step(step,ir->nstcalcenergy);
 +        }
 +        bCalcVir = bCalcEner ||
 +            (ir->epc != epcNO && do_per_step(step,ir->nstpcouple));
 +
 +        /* Do we need global communication ? */
 +        bGStat = (bCalcVir || bCalcEner || bStopCM ||
 +                  do_per_step(step,nstglobalcomm) ||
 +                  (ir->nstlist == -1 && !bRerunMD && step >= nlh.step_nscheck));
 +
 +        do_ene = (do_per_step(step,ir->nstenergy) || bLastStep);
 +
 +        if (do_ene || do_log)
 +        {
 +            bCalcVir  = TRUE;
 +            bCalcEner = TRUE;
 +            bGStat    = TRUE;
 +        }
 +        
 +        /* these CGLO_ options remain the same throughout the iteration */
 +        cglo_flags = ((bRerunMD ? CGLO_RERUNMD : 0) |
 +                      (bGStat ? CGLO_GSTAT : 0)
 +            );
 +        
 +        force_flags = (GMX_FORCE_STATECHANGED |
 +                       ((DYNAMIC_BOX(*ir) || bRerunMD) ? GMX_FORCE_DYNAMICBOX : 0) |
 +                       GMX_FORCE_ALLFORCES |
 +                       (bNStList ? GMX_FORCE_DOLR : 0) |
 +                       GMX_FORCE_SEPLRF |
 +                       (bCalcVir ? GMX_FORCE_VIRIAL : 0) |
 +                       (bCalcEner ? GMX_FORCE_ENERGY : 0) |
 +                       (bDoFEP ? GMX_FORCE_DHDL : 0)
 +            );
 +        
 +        if (shellfc)
 +        {
 +            /* Now is the time to relax the shells */
 +            count=relax_shell_flexcon(fplog,cr,bVerbose,bFFscan ? step+1 : step,
 +                                      ir,bNS,force_flags,
 +                                      bStopCM,top,top_global,
 +                                      constr,enerd,fcd,
 +                                      state,f,force_vir,mdatoms,
 +                                      nrnb,wcycle,graph,groups,
 +                                      shellfc,fr,bBornRadii,t,mu_tot,
 +                                      state->natoms,&bConverged,vsite,
 +                                      outf->fp_field);
 +            tcount+=count;
 +
 +            if (bConverged)
 +            {
 +                nconverged++;
 +            }
 +        }
 +        else
 +        {
 +            /* The coordinates (x) are shifted (to get whole molecules)
 +             * in do_force.
 +             * This is parallellized as well, and does communication too. 
 +             * Check comments in sim_util.c
 +             */
 +             do_force(fplog,cr,ir,step,nrnb,wcycle,top,top_global,groups,
 +                     state->box,state->x,&state->hist,
 +                     f,force_vir,mdatoms,enerd,fcd,
 +                     state->lambda,graph,
 +                     fr,vsite,mu_tot,t,outf->fp_field,ed,bBornRadii,
 +                     (bNS ? GMX_FORCE_NS : 0) | force_flags);
 +        }
 +        
 +        if (bTCR)
 +        {
 +            mu_aver = calc_mu_aver(cr,state->x,mdatoms->chargeA,
 +                                   mu_tot,&top_global->mols,mdatoms,gnx,grpindex);
 +        }
 +        
 +        if (bTCR && bFirstStep)
 +        {
 +            tcr=init_coupling(fplog,nfile,fnm,cr,fr,mdatoms,&(top->idef));
 +            fprintf(fplog,"Done init_coupling\n"); 
 +            fflush(fplog);
 +        }
 +        
 +        if (bVV && !bStartingFromCpt && !bRerunMD)
 +        /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
 +        {
 +            if (ir->eI==eiVV && bInitStep) 
 +            {
 +                /* if using velocity verlet with full time step Ekin,
 +                 * take the first half step only to compute the 
 +                 * virial for the first step. From there,
 +                 * revert back to the initial coordinates
 +                 * so that the input is actually the initial step.
 +                 */
 +                copy_rvecn(state->v,cbuf,0,state->natoms); /* should make this better for parallelizing? */
 +            } else {
 +                /* this is for NHC in the Ekin(t+dt/2) version of vv */
 +                trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ1);            
 +            }
 +
 +            update_coords(fplog,step,ir,mdatoms,state,fr->bMolPBC,
 +                          f,fr->bTwinRange && bNStList,fr->f_twin,fcd,
 +                          ekind,M,wcycle,upd,bInitStep,etrtVELOCITY1,
 +                          cr,nrnb,constr,&top->idef);
 +            
 +            if (bIterations)
 +            {
 +                gmx_iterate_init(&iterate,bIterations && !bInitStep);
 +            }
 +            /* for iterations, we save these vectors, as we will be self-consistently iterating
 +               the calculations */
 +
 +            /*#### UPDATE EXTENDED VARIABLES IN TROTTER FORMULATION */
 +            
 +            /* save the state */
 +            if (bIterations && iterate.bIterate) { 
 +                copy_coupling_state(state,bufstate,ekind,ekind_save,&(ir->opts));
 +            }
 +            
 +            bFirstIterate = TRUE;
 +            while (bFirstIterate || (bIterations && iterate.bIterate))
 +            {
 +                if (bIterations && iterate.bIterate) 
 +                {
 +                    copy_coupling_state(bufstate,state,ekind_save,ekind,&(ir->opts));
 +                    if (bFirstIterate && bTrotter) 
 +                    {
 +                        /* The first time through, we need a decent first estimate
 +                           of veta(t+dt) to compute the constraints.  Do
 +                           this by computing the box volume part of the
 +                           trotter integration at this time. Nothing else
 +                           should be changed by this routine here.  If
 +                           !(first time), we start with the previous value
 +                           of veta.  */
 +                        
 +                        veta_save = state->veta;
 +                        trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ0);
 +                        vetanew = state->veta;
 +                        state->veta = veta_save;
 +                    } 
 +                } 
 +                
 +                bOK = TRUE;
 +                if ( !bRerunMD || rerun_fr.bV || bForceUpdate) {  /* Why is rerun_fr.bV here?  Unclear. */
 +                    dvdl = 0;
 +                    
 +                    update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,
 +                                       state,fr->bMolPBC,graph,f,
 +                                       &top->idef,shake_vir,NULL,
 +                                       cr,nrnb,wcycle,upd,constr,
 +                                       bInitStep,TRUE,bCalcVir,vetanew);
 +                    
 +                    if (!bOK && !bFFscan)
 +                    {
 +                        gmx_fatal(FARGS,"Constraint error: Shake, Lincs or Settle could not solve the constrains");
 +                    }
 +                    
 +                } 
 +                else if (graph)
 +                { /* Need to unshift here if a do_force has been
 +                     called in the previous step */
 +                    unshift_self(graph,state->box,state->x);
 +                }
 +                
 +                
 +                /* if VV, compute the pressure and constraints */
 +                /* For VV2, we strictly only need this if using pressure
 +                 * control, but we really would like to have accurate pressures
 +                 * printed out.
 +                 * Think about ways around this in the future?
 +                 * For now, keep this choice in comments.
 +                 */
 +                /*bPres = (ir->eI==eiVV || IR_NPT_TROTTER(ir)); */
 +                    /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && IR_NPT_TROTTER(ir)));*/
 +                bPres = TRUE;
 +                bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK));
 +                if (bCalcEner && ir->eI==eiVVAK)  /*MRS:  7/9/2010 -- this still doesn't fix it?*/
 +                {
 +                    bSumEkinhOld = TRUE;
 +                }
 +                compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
 +                                wcycle,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
 +                                constr,NULL,FALSE,state->box,
 +                                top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
 +                                cglo_flags 
 +                                | CGLO_ENERGY 
 +                                | (bStopCM ? CGLO_STOPCM : 0)
 +                                | (bTemp ? CGLO_TEMPERATURE:0) 
 +                                | (bPres ? CGLO_PRESSURE : 0) 
 +                                | (bPres ? CGLO_CONSTRAINT : 0)
 +                                | ((bIterations && iterate.bIterate) ? CGLO_ITERATE : 0)  
 +                                | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
 +                                | CGLO_SCALEEKIN 
 +                    );
 +                /* explanation of above: 
 +                   a) We compute Ekin at the full time step
 +                   if 1) we are using the AveVel Ekin, and it's not the
 +                   initial step, or 2) if we are using AveEkin, but need the full
 +                   time step kinetic energy for the pressure (always true now, since we want accurate statistics).
 +                   b) If we are using EkinAveEkin for the kinetic energy for the temperture control, we still feed in 
 +                   EkinAveVel because it's needed for the pressure */
 +                
 +                /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
 +                if (!bInitStep) 
 +                {
 +                    if (bTrotter)
 +                    {
 +                        trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ2);
 +                    } 
 +                    else 
 +                    {
 +                        if (bExchanged)
 +                        {
 +            
 +                            /* We need the kinetic energy at minus the half step for determining
 +                             * the full step kinetic energy and possibly for T-coupling.*/
 +                            /* This may not be quite working correctly yet . . . . */
 +                            compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
 +                                            wcycle,enerd,NULL,NULL,NULL,NULL,mu_tot,
 +                                            constr,NULL,FALSE,state->box,
 +                                            top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
 +                                            CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
 +                        }
 +
 +
 +                        update_tcouple(fplog,step,ir,state,ekind,wcycle,upd,&MassQ,mdatoms);
 +                    }
 +                }
 +                
 +                if (bIterations &&
 +                    done_iterating(cr,fplog,step,&iterate,bFirstIterate,
 +                                   state->veta,&vetanew)) 
 +                {
 +                    break;
 +                }
 +                bFirstIterate = FALSE;
 +            }
 +
 +            if (bTrotter && !bInitStep) {
 +                enerd->term[F_DVDL_BONDED] += dvdl;        /* only add after iterations */
 +                copy_mat(shake_vir,state->svir_prev);
 +                copy_mat(force_vir,state->fvir_prev);
 +                if (IR_NVT_TROTTER(ir) && ir->eI==eiVV) {
 +                    /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
 +                    enerd->term[F_TEMP] = sum_ekin(&(ir->opts),ekind,NULL,(ir->eI==eiVV),FALSE,FALSE);
 +                    enerd->term[F_EKIN] = trace(ekind->ekin);
 +                }
 +            }
 +            /* if it's the initial step, we performed this first step just to get the constraint virial */
 +            if (bInitStep && ir->eI==eiVV) {
 +                copy_rvecn(cbuf,state->v,0,state->natoms);
 +            }
 +            
 +            if (fr->bSepDVDL && fplog && do_log) 
 +            {
 +                fprintf(fplog,sepdvdlformat,"Constraint",0.0,dvdl);
 +            }
 +            enerd->term[F_DVDL_BONDED] += dvdl;
 +        }
 +    
 +        /* MRS -- now done iterating -- compute the conserved quantity */
 +        if (bVV) {
 +            saved_conserved_quantity = compute_conserved_from_auxiliary(ir,state,&MassQ);
 +            if (ir->eI==eiVV) 
 +            {
 +                last_ekin = enerd->term[F_EKIN];
 +            }
 +            if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres)) 
 +            {
 +                saved_conserved_quantity -= enerd->term[F_DISPCORR];
 +            }
 +            /* sum up the foreign energy and dhdl terms for vv.  currently done every step so that dhdl is correct in the .edr */
 +            sum_dhdl(enerd,state->lambda,ir->fepvals);
 +        }
 +        
 +        /* ########  END FIRST UPDATE STEP  ############## */
 +        /* ########  If doing VV, we now have v(dt) ###### */
 +        if (bDoExpanded) {
 +            /* perform extended ensemble sampling in lambda - we don't
 +               actually move to the new state before outputting
 +               statistics, but if performing simulated tempering, we
 +               do update the velocities and the tau_t. */
 +        
 +            lamnew = ExpandedEnsembleDynamics(fplog,ir,enerd,state,&MassQ,&df_history,step,mcrng,state->v,mdatoms);
 +        }
 +        /* ################## START TRAJECTORY OUTPUT ################# */
 +        
 +        /* Now we have the energies and forces corresponding to the 
 +         * coordinates at time t. We must output all of this before
 +         * the update.
 +         * for RerunMD t is read from input trajectory
 +         */
 +        mdof_flags = 0;
 +        if (do_per_step(step,ir->nstxout)) { mdof_flags |= MDOF_X; }
 +        if (do_per_step(step,ir->nstvout)) { mdof_flags |= MDOF_V; }
 +        if (do_per_step(step,ir->nstfout)) { mdof_flags |= MDOF_F; }
 +        if (do_per_step(step,ir->nstxtcout)) { mdof_flags |= MDOF_XTC; }
 +        if (bCPT) { mdof_flags |= MDOF_CPT; };
 +
 +#if defined(GMX_FAHCORE) || defined(GMX_WRITELASTSTEP)
 +        if (bLastStep)
 +        {
 +            /* Enforce writing positions and velocities at end of run */
 +            mdof_flags |= (MDOF_X | MDOF_V);
 +        }
 +#endif
 +#ifdef GMX_FAHCORE
 +        if (MASTER(cr))
 +            fcReportProgress( ir->nsteps, step );
 +
 +        /* sync bCPT and fc record-keeping */
 +        if (bCPT && MASTER(cr))
 +            fcRequestCheckPoint();
 +#endif
 +        
 +        if (mdof_flags != 0)
 +        {
 +            wallcycle_start(wcycle,ewcTRAJ);
 +            if (bCPT)
 +            {
 +                if (state->flags & (1<<estLD_RNG))
 +                {
 +                    get_stochd_state(upd,state);
 +                }
 +                if (state->flags  & (1<<estMC_RNG))
 +                {
 +                    get_mc_state(mcrng,state);
 +                }
 +                if (MASTER(cr))
 +                {
 +                    if (bSumEkinhOld)
 +                    {
 +                        state_global->ekinstate.bUpToDate = FALSE;
 +                    }
 +                    else
 +                    {
 +                        update_ekinstate(&state_global->ekinstate,ekind);
 +                        state_global->ekinstate.bUpToDate = TRUE;
 +                    }
 +                    update_energyhistory(&state_global->enerhist,mdebin);
 +                    if (ir->efep!=efepNO || ir->bSimTemp) 
 +                    {
 +                        state_global->fep_state = state->fep_state; /* MRS: seems kludgy. The code should be
 +                                                                       structured so this isn't necessary.
 +                                                                       Note this reassignment is only necessary
 +                                                                       for single threads.*/
 +                        copy_df_history(&state_global->dfhist,&df_history);
 +                    }
 +                }
 +            }
 +            write_traj(fplog,cr,outf,mdof_flags,top_global,
 +                       step,t,state,state_global,f,f_global,&n_xtc,&x_xtc);
 +            if (bCPT)
 +            {
 +                nchkpt++;
 +                bCPT = FALSE;
 +            }
 +            debug_gmx();
 +            if (bLastStep && step_rel == ir->nsteps &&
 +                (Flags & MD_CONFOUT) && MASTER(cr) &&
 +                !bRerunMD && !bFFscan)
 +            {
 +                /* x and v have been collected in write_traj,
 +                 * because a checkpoint file will always be written
 +                 * at the last step.
 +                 */
 +                fprintf(stderr,"\nWriting final coordinates.\n");
 +                if (fr->bMolPBC)
 +                {
 +                    /* Make molecules whole only for confout writing */
 +                    do_pbc_mtop(fplog,ir->ePBC,state->box,top_global,state_global->x);
 +                }
 +                write_sto_conf_mtop(ftp2fn(efSTO,nfile,fnm),
 +                                    *top_global->name,top_global,
 +                                    state_global->x,state_global->v,
 +                                    ir->ePBC,state->box);
 +                debug_gmx();
 +            }
 +            wallcycle_stop(wcycle,ewcTRAJ);
 +        }
 +        
 +        /* kludge -- virial is lost with restart for NPT control. Must restart */
 +        if (bStartingFromCpt && bVV) 
 +        {
 +            copy_mat(state->svir_prev,shake_vir);
 +            copy_mat(state->fvir_prev,force_vir);
 +        }
 +        /*  ################## END TRAJECTORY OUTPUT ################ */
 +        
 +        /* Determine the wallclock run time up till now */
 +        run_time = gmx_gettime() - (double)runtime->real;
 +
 +        /* Check whether everything is still allright */    
 +        if (((int)gmx_get_stop_condition() > handled_stop_condition)
 +#ifdef GMX_THREAD_MPI
 +            && MASTER(cr)
 +#endif
 +            )
 +        {
 +            /* this is just make gs.sig compatible with the hack 
 +               of sending signals around by MPI_Reduce with together with
 +               other floats */
 +            if ( gmx_get_stop_condition() == gmx_stop_cond_next_ns )
 +                gs.sig[eglsSTOPCOND]=1;
 +            if ( gmx_get_stop_condition() == gmx_stop_cond_next )
 +                gs.sig[eglsSTOPCOND]=-1;
 +            /* < 0 means stop at next step, > 0 means stop at next NS step */
 +            if (fplog)
 +            {
 +                fprintf(fplog,
 +                        "\n\nReceived the %s signal, stopping at the next %sstep\n\n",
 +                        gmx_get_signal_name(),
 +                        gs.sig[eglsSTOPCOND]==1 ? "NS " : "");
 +                fflush(fplog);
 +            }
 +            fprintf(stderr,
 +                    "\n\nReceived the %s signal, stopping at the next %sstep\n\n",
 +                    gmx_get_signal_name(),
 +                    gs.sig[eglsSTOPCOND]==1 ? "NS " : "");
 +            fflush(stderr);
 +            handled_stop_condition=(int)gmx_get_stop_condition();
 +        }
 +        else if (MASTER(cr) && (bNS || ir->nstlist <= 0) &&
 +                 (max_hours > 0 && run_time > max_hours*60.0*60.0*0.99) &&
 +                 gs.sig[eglsSTOPCOND] == 0 && gs.set[eglsSTOPCOND] == 0)
 +        {
 +            /* Signal to terminate the run */
 +            gs.sig[eglsSTOPCOND] = 1;
 +            if (fplog)
 +            {
 +                fprintf(fplog,"\nStep %s: Run time exceeded %.3f hours, will terminate the run\n",gmx_step_str(step,sbuf),max_hours*0.99);
 +            }
 +            fprintf(stderr, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n",gmx_step_str(step,sbuf),max_hours*0.99);
 +        }
 +
 +        if (bResetCountersHalfMaxH && MASTER(cr) &&
 +            run_time > max_hours*60.0*60.0*0.495)
 +        {
 +            gs.sig[eglsRESETCOUNTERS] = 1;
 +        }
 +
 +        if (ir->nstlist == -1 && !bRerunMD)
 +        {
 +            /* When bGStatEveryStep=FALSE, global_stat is only called
 +             * when we check the atom displacements, not at NS steps.
 +             * This means that also the bonded interaction count check is not
 +             * performed immediately after NS. Therefore a few MD steps could
 +             * be performed with missing interactions.
 +             * But wrong energies are never written to file,
 +             * since energies are only written after global_stat
 +             * has been called.
 +             */
 +            if (step >= nlh.step_nscheck)
 +            {
 +                nlh.nabnsb = natoms_beyond_ns_buffer(ir,fr,&top->cgs,
 +                                                     nlh.scale_tot,state->x);
 +            }
 +            else
 +            {
 +                /* This is not necessarily true,
 +                 * but step_nscheck is determined quite conservatively.
 +                 */
 +                nlh.nabnsb = 0;
 +            }
 +        }
 +
 +        /* In parallel we only have to check for checkpointing in steps
 +         * where we do global communication,
 +         *  otherwise the other nodes don't know.
 +         */
 +        if (MASTER(cr) && ((bGStat || !PAR(cr)) &&
 +                           cpt_period >= 0 &&
 +                           (cpt_period == 0 || 
 +                            run_time >= nchkpt*cpt_period*60.0)) &&
 +            gs.set[eglsCHKPT] == 0)
 +        {
 +            gs.sig[eglsCHKPT] = 1;
 +        }
 +  
 +
 +        /* at the start of step, randomize the velocities */
 +        if (ETC_ANDERSEN(ir->etc) && EI_VV(ir->eI))
 +        {
 +            gmx_bool bDoAndersenConstr;
 +            bDoAndersenConstr = (constr && update_randomize_velocities(ir,step,mdatoms,state,upd,&top->idef,constr));
 +            /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
 +            if (bDoAndersenConstr)
 +            {
 +                update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,
 +                                   state,fr->bMolPBC,graph,f,
 +                                   &top->idef,tmp_vir,NULL,
 +                                   cr,nrnb,wcycle,upd,constr,
 +                                   bInitStep,TRUE,bCalcVir,vetanew);
 +            }
 +        }
 +
 +        if (bIterations)
 +        {
 +            gmx_iterate_init(&iterate,bIterations);
 +        }
 +    
 +        /* for iterations, we save these vectors, as we will be redoing the calculations */
 +        if (bIterations && iterate.bIterate) 
 +        {
 +            copy_coupling_state(state,bufstate,ekind,ekind_save,&(ir->opts));
 +        }
 +        bFirstIterate = TRUE;
 +        while (bFirstIterate || (bIterations && iterate.bIterate))
 +        {
 +            /* We now restore these vectors to redo the calculation with improved extended variables */    
 +            if (bIterations) 
 +            { 
 +                copy_coupling_state(bufstate,state,ekind_save,ekind,&(ir->opts));
 +            }
 +
 +            /* We make the decision to break or not -after- the calculation of Ekin and Pressure,
 +               so scroll down for that logic */
 +            
 +            /* #########   START SECOND UPDATE STEP ################# */
 +            /* Box is changed in update() when we do pressure coupling,
 +             * but we should still use the old box for energy corrections and when
 +             * writing it to the energy file, so it matches the trajectory files for
 +             * the same timestep above. Make a copy in a separate array.
 +             */
 +            copy_mat(state->box,lastbox);
 +
 +            bOK = TRUE;
 +            if (!(bRerunMD && !rerun_fr.bV && !bForceUpdate))
 +            {
 +                wallcycle_start(wcycle,ewcUPDATE);
 +                dvdl = 0;
 +                /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
 +                if (bTrotter) 
 +                {
 +                    if (bIterations && iterate.bIterate) 
 +                    {
 +                        if (bFirstIterate) 
 +                        {
 +                            scalevir = 1;
 +                        }
 +                        else 
 +                        {
 +                            /* we use a new value of scalevir to converge the iterations faster */
 +                            scalevir = tracevir/trace(shake_vir);
 +                        }
 +                        msmul(shake_vir,scalevir,shake_vir); 
 +                        m_add(force_vir,shake_vir,total_vir);
 +                        clear_mat(shake_vir);
 +                    }
 +                    trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ3);
 +                /* We can only do Berendsen coupling after we have summed
 +                 * the kinetic energy or virial. Since the happens
 +                 * in global_state after update, we should only do it at
 +                 * step % nstlist = 1 with bGStatEveryStep=FALSE.
 +                 */
 +                }
 +                else 
 +                {
 +                    update_tcouple(fplog,step,ir,state,ekind,wcycle,upd,&MassQ,mdatoms);
 +                    update_pcouple(fplog,step,ir,state,pcoupl_mu,M,wcycle,
 +                                   upd,bInitStep);
 +                }
 +
 +                if (bVV)
 +                {
 +                    /* velocity half-step update */
 +                    update_coords(fplog,step,ir,mdatoms,state,fr->bMolPBC,f,
 +                                  fr->bTwinRange && bNStList,fr->f_twin,fcd,
 +                                  ekind,M,wcycle,upd,FALSE,etrtVELOCITY2,
 +                                  cr,nrnb,constr,&top->idef);
 +                }
 +
 +                /* Above, initialize just copies ekinh into ekin,
 +                 * it doesn't copy position (for VV),
 +                 * and entire integrator for MD.
 +                 */
 +                
 +                if (ir->eI==eiVVAK) 
 +                {
 +                    copy_rvecn(state->x,cbuf,0,state->natoms);
 +                }
 +                
 +                update_coords(fplog,step,ir,mdatoms,state,fr->bMolPBC,f,
 +                              fr->bTwinRange && bNStList,fr->f_twin,fcd,
 +                              ekind,M,wcycle,upd,bInitStep,etrtPOSITION,cr,nrnb,constr,&top->idef);
 +                wallcycle_stop(wcycle,ewcUPDATE);
 +
 +                update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,state,
 +                                   fr->bMolPBC,graph,f,
 +                                   &top->idef,shake_vir,force_vir,
 +                                   cr,nrnb,wcycle,upd,constr,
 +                                   bInitStep,FALSE,bCalcVir,state->veta);  
 +                
 +                if (ir->eI==eiVVAK) 
 +                {
 +                    /* erase F_EKIN and F_TEMP here? */
 +                    /* just compute the kinetic energy at the half step to perform a trotter step */
 +                    compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
 +                                    wcycle,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
 +                                    constr,NULL,FALSE,lastbox,
 +                                    top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
 +                                    cglo_flags | CGLO_TEMPERATURE    
 +                        );
 +                    wallcycle_start(wcycle,ewcUPDATE);
 +                    trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ4);            
 +                    /* now we know the scaling, we can compute the positions again again */
 +                    copy_rvecn(cbuf,state->x,0,state->natoms);
 +
 +                    update_coords(fplog,step,ir,mdatoms,state,fr->bMolPBC,f,
 +                                  fr->bTwinRange && bNStList,fr->f_twin,fcd,
 +                                  ekind,M,wcycle,upd,bInitStep,etrtPOSITION,cr,nrnb,constr,&top->idef);
 +                    wallcycle_stop(wcycle,ewcUPDATE);
 +
 +                    /* do we need an extra constraint here? just need to copy out of state->v to upd->xp? */
 +                    /* are the small terms in the shake_vir here due
 +                     * to numerical errors, or are they important
 +                     * physically? I'm thinking they are just errors, but not completely sure. 
 +                     * For now, will call without actually constraining, constr=NULL*/
 +                    update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,
 +                                       state,fr->bMolPBC,graph,f,
 +                                       &top->idef,tmp_vir,force_vir,
 +                                       cr,nrnb,wcycle,upd,NULL,
 +                                       bInitStep,FALSE,bCalcVir,
 +                                       state->veta);  
 +                }
 +                if (!bOK && !bFFscan) 
 +                {
 +                    gmx_fatal(FARGS,"Constraint error: Shake, Lincs or Settle could not solve the constrains");
 +                }
 +                
 +                if (fr->bSepDVDL && fplog && do_log) 
 +                {
 +                    fprintf(fplog,sepdvdlformat,"Constraint dV/dl",0.0,dvdl);
 +                }
 +                enerd->term[F_DVDL_BONDED] += dvdl;
 +            } 
 +            else if (graph) 
 +            {
 +                /* Need to unshift here */
 +                unshift_self(graph,state->box,state->x);
 +            }
 +
 +            if (vsite != NULL) 
 +            {
 +                wallcycle_start(wcycle,ewcVSITECONSTR);
 +                if (graph != NULL) 
 +                {
 +                    shift_self(graph,state->box,state->x);
 +                }
 +                construct_vsites(fplog,vsite,state->x,nrnb,ir->delta_t,state->v,
 +                                 top->idef.iparams,top->idef.il,
 +                                 fr->ePBC,fr->bMolPBC,graph,cr,state->box);
 +                
 +                if (graph != NULL) 
 +                {
 +                    unshift_self(graph,state->box,state->x);
 +                }
 +                wallcycle_stop(wcycle,ewcVSITECONSTR);
 +            }
 +            
 +            /* ############## IF NOT VV, Calculate globals HERE, also iterate constraints ############ */
 +            /* With Leap-Frog we can skip compute_globals at
 +             * non-communication steps, but we need to calculate
 +             * the kinetic energy one step before communication.
 +             */
 +            if (bGStat || do_per_step(step+1,nstglobalcomm) ||
 +                EI_VV(ir->eI))
 +            {
 +                if (ir->nstlist == -1 && bFirstIterate)
 +                {
 +                    gs.sig[eglsNABNSB] = nlh.nabnsb;
 +                }
 +                compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
 +                                wcycle,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
 +                                constr,
 +                                bFirstIterate ? &gs : NULL, 
 +                                (step_rel % gs.nstms == 0) && 
 +                                (multisim_nsteps<0 || (step_rel<multisim_nsteps)),
 +                                lastbox,
 +                                top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
 +                                cglo_flags 
 +                                | (!EI_VV(ir->eI) ? CGLO_ENERGY : 0)
 +                                | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
 +                                | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0) 
 +                                | (!EI_VV(ir->eI) || bRerunMD ? CGLO_PRESSURE : 0) 
 +                                | (bIterations && iterate.bIterate ? CGLO_ITERATE : 0) 
 +                                | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
 +                                | CGLO_CONSTRAINT 
 +                    );
 +                if (ir->nstlist == -1 && bFirstIterate)
 +                {
 +                    nlh.nabnsb = gs.set[eglsNABNSB];
 +                    gs.set[eglsNABNSB] = 0;
 +                }
 +            }
 +            /* bIterate is set to keep it from eliminating the old ekin kinetic energy terms */
 +            /* #############  END CALC EKIN AND PRESSURE ################# */
 +        
 +            /* Note: this is OK, but there are some numerical precision issues with using the convergence of
 +               the virial that should probably be addressed eventually. state->veta has better properies,
 +               but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
 +               generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
 +
 +            if (bIterations && 
 +                done_iterating(cr,fplog,step,&iterate,bFirstIterate,
 +                               trace(shake_vir),&tracevir)) 
 +            {
 +                break;
 +            }
 +            bFirstIterate = FALSE;
 +        }
 +
 +        /* only add constraint dvdl after constraints */
 +        enerd->term[F_DVDL_BONDED] += dvdl;
 +        if (!bVV)
 +        {
 +            /* sum up the foreign energy and dhdl terms for md and sd. currently done every step so that dhdl is correct in the .edr */
 +            sum_dhdl(enerd,state->lambda,ir->fepvals);
 +        }
 +        update_box(fplog,step,ir,mdatoms,state,graph,f,
 +                   ir->nstlist==-1 ? &nlh.scale_tot : NULL,pcoupl_mu,nrnb,wcycle,upd,bInitStep,FALSE);
 +        
 +        /* ################# END UPDATE STEP 2 ################# */
 +        /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
 +    
 +        /* The coordinates (x) were unshifted in update */
 +        if (bFFscan && (shellfc==NULL || bConverged))
 +        {
 +            if (print_forcefield(fplog,enerd->term,mdatoms->homenr,
 +                                 f,NULL,xcopy,
 +                                 &(top_global->mols),mdatoms->massT,pres))
 +            {
 +                gmx_finalize_par();
 +
 +                fprintf(stderr,"\n");
 +                exit(0);
 +            }
 +        }
 +        if (!bGStat)
 +        {
 +            /* We will not sum ekinh_old,                                                            
 +             * so signal that we still have to do it.                                                
 +             */
 +            bSumEkinhOld = TRUE;
 +        }
 +        
 +        if (bTCR)
 +        {
 +            /* Only do GCT when the relaxation of shells (minimization) has converged,
 +             * otherwise we might be coupling to bogus energies. 
 +             * In parallel we must always do this, because the other sims might
 +             * update the FF.
 +             */
 +
 +            /* Since this is called with the new coordinates state->x, I assume
 +             * we want the new box state->box too. / EL 20040121
 +             */
 +            do_coupling(fplog,oenv,nfile,fnm,tcr,t,step,enerd->term,fr,
 +                        ir,MASTER(cr),
 +                        mdatoms,&(top->idef),mu_aver,
 +                        top_global->mols.nr,cr,
 +                        state->box,total_vir,pres,
 +                        mu_tot,state->x,f,bConverged);
 +            debug_gmx();
 +        }
 +
 +        /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
 +        
 +        /* use the directly determined last velocity, not actually the averaged half steps */
 +        if (bTrotter && ir->eI==eiVV) 
 +        {
 +            enerd->term[F_EKIN] = last_ekin;
 +        }
 +        enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
 +        
 +        if (bVV)
 +        {
 +            enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
 +        }
 +        else 
 +        {
 +            enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + compute_conserved_from_auxiliary(ir,state,&MassQ);
 +        }
 +        /* Check for excessively large energies */
 +        if (bIonize) 
 +        {
 +#ifdef GMX_DOUBLE
 +            real etot_max = 1e200;
 +#else
 +            real etot_max = 1e30;
 +#endif
 +            if (fabs(enerd->term[F_ETOT]) > etot_max) 
 +            {
 +                fprintf(stderr,"Energy too large (%g), giving up\n",
 +                        enerd->term[F_ETOT]);
 +            }
 +        }
 +        /* #########  END PREPARING EDR OUTPUT  ###########  */
 +        
 +        /* Time for performance */
 +        if (((step % stepout) == 0) || bLastStep) 
 +        {
 +            runtime_upd_proc(runtime);
 +        }
 +        
 +        /* Output stuff */
 +        if (MASTER(cr))
 +        {
 +            gmx_bool do_dr,do_or;
 +            
 +            if (fplog && do_log && bDoExpanded)
 +            {
 +                /* only needed if doing expanded ensemble */
 +                PrintFreeEnergyInfoToFile(fplog,ir->fepvals,ir->expandedvals,ir->bSimTemp?ir->simtempvals:NULL,
 +                                          &df_history,state->fep_state,ir->nstlog,step);
 +            }
 +            if (!(bStartingFromCpt && (EI_VV(ir->eI)))) 
 +            {
 +                if (bCalcEner)
 +                {
 +                    upd_mdebin(mdebin,bDoDHDL, TRUE,
 +                               t,mdatoms->tmass,enerd,state,
 +                               ir->fepvals,ir->expandedvals,lastbox,
 +                               shake_vir,force_vir,total_vir,pres,
 +                               ekind,mu_tot,constr);
 +                }
 +                else
 +                {
 +                    upd_mdebin_step(mdebin);
 +                }
 +                
 +                do_dr  = do_per_step(step,ir->nstdisreout);
 +                do_or  = do_per_step(step,ir->nstorireout);
 +                
 +                print_ebin(outf->fp_ene,do_ene,do_dr,do_or,do_log?fplog:NULL,
 +                           step,t,
 +                           eprNORMAL,bCompact,mdebin,fcd,groups,&(ir->opts));
 +            }
 +            if (ir->ePull != epullNO)
 +            {
 +                pull_print_output(ir->pull,step,t);
 +            }
 +            
 +            if (do_per_step(step,ir->nstlog))
 +            {
 +                if(fflush(fplog) != 0)
 +                {
 +                    gmx_fatal(FARGS,"Cannot flush logfile - maybe you are out of disk space?");
 +                }
 +            }
 +        }
 +        if (bDoExpanded)
 +        {
 +            /* Have to do this part after outputting the logfile and the edr file */
 +            state->fep_state = lamnew;
 +            for (i=0;i<efptNR;i++)
 +            {
 +                state->lambda[i] = ir->fepvals->all_lambda[i][lamnew];
 +            }
 +        }
 +        /* Remaining runtime */
 +        if (MULTIMASTER(cr) && (do_verbose || gmx_got_usr_signal()) && !bPMETuneRunning)
 +        {
 +            if (shellfc) 
 +            {
 +                fprintf(stderr,"\n");
 +            }
 +            print_time(stderr,runtime,step,ir,cr);
 +        }
 +
 +        /* Replica exchange */
 +        bExchanged = FALSE;
 +        if ((repl_ex_nst > 0) && (step > 0) && !bLastStep &&
 +            do_per_step(step,repl_ex_nst)) 
 +        {
 +            bExchanged = replica_exchange(fplog,cr,repl_ex,
 +                                          state_global,enerd,
 +                                          state,step,t);
 +
 +            if (bExchanged && DOMAINDECOMP(cr)) 
 +            {
 +                dd_partition_system(fplog,step,cr,TRUE,1,
 +                                    state_global,top_global,ir,
 +                                    state,&f,mdatoms,top,fr,
 +                                    vsite,shellfc,constr,
 +                                    nrnb,wcycle,FALSE);
 +            }
 +        }
 +        
 +        bFirstStep = FALSE;
 +        bInitStep = FALSE;
 +        bStartingFromCpt = FALSE;
 +
 +        /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
 +        /* With all integrators, except VV, we need to retain the pressure
 +         * at the current step for coupling at the next step.
 +         */
 +        if ((state->flags & (1<<estPRES_PREV)) &&
 +            (bGStatEveryStep ||
 +             (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
 +        {
 +            /* Store the pressure in t_state for pressure coupling
 +             * at the next MD step.
 +             */
 +            copy_mat(pres,state->pres_prev);
 +        }
 +        
 +        /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
 +
 +        if ( (membed!=NULL) && (!bLastStep) )
 +        {
 +            rescale_membed(step_rel,membed,state_global->x);
 +        }
 +
 +        if (bRerunMD) 
 +        {
 +            if (MASTER(cr))
 +            {
 +                /* read next frame from input trajectory */
 +                bNotLastFrame = read_next_frame(oenv,status,&rerun_fr);
 +            }
 +
 +            if (PAR(cr))
 +            {
 +                rerun_parallel_comm(cr,&rerun_fr,&bNotLastFrame);
 +            }
 +        }
 +        
 +        if (!bRerunMD || !rerun_fr.bStep)
 +        {
 +            /* increase the MD step number */
 +            step++;
 +            step_rel++;
 +        }
 +        
 +        cycles = wallcycle_stop(wcycle,ewcSTEP);
 +        if (DOMAINDECOMP(cr) && wcycle)
 +        {
 +            dd_cycles_add(cr->dd,cycles,ddCyclStep);
 +        }
 +
 +        if (bPMETuneRunning || bPMETuneTry)
 +        {
 +            /* PME grid + cut-off optimization with GPUs or PME nodes */
 +
 +            /* Count the total cycles over the last steps */
 +            cycles_pmes += cycles;
 +
 +            /* We can only switch cut-off at NS steps */
 +            if (step % ir->nstlist == 0)
 +            {
 +                /* PME grid + cut-off optimization with GPUs or PME nodes */
 +                if (bPMETuneTry)
 +                {
 +                    if (DDMASTER(cr->dd))
 +                    {
 +                        /* PME node load is too high, start tuning */
 +                        bPMETuneRunning = (dd_pme_f_ratio(cr->dd) >= 1.05);
 +                    }
 +                    dd_bcast(cr->dd,sizeof(gmx_bool),&bPMETuneRunning);
 +
 +                    if (bPMETuneRunning || step_rel > ir->nstlist*50)
 +                    {
 +                        bPMETuneTry     = FALSE;
 +                    }
 +                }
 +                if (bPMETuneRunning)
 +                {
 +                    /* init_step might not be a multiple of nstlist,
 +                     * but the first cycle is always skipped anyhow.
 +                     */
 +                    bPMETuneRunning =
-     
++                        pme_load_balance(pme_loadbal,cr,
++                                         (bVerbose && MASTER(cr)) ? stderr : NULL,
++                                         fplog,
++                                         ir,state,cycles_pmes,
++                                         fr->ic,fr->nbv,&fr->pmedata,
++                                         step);
 +
 +                    fr->ewaldcoeff = fr->ic->ewaldcoeff;
 +                }
 +
 +                cycles_pmes = 0;
 +            }
 +        }
 +        
 +        if (step_rel == wcycle_get_reset_counters(wcycle) ||
 +            gs.set[eglsRESETCOUNTERS] != 0)
 +        {
 +            /* Reset all the counters related to performance over the run */
 +            reset_all_counters(fplog,cr,step,&step_rel,ir,wcycle,nrnb,runtime,
 +                               fr->nbv != NULL && fr->nbv->bUseGPU ? fr->nbv->cu_nbv : NULL);
 +            wcycle_set_reset_counters(wcycle,-1);
 +            /* Correct max_hours for the elapsed time */
 +            max_hours -= run_time/(60.0*60.0);
 +            bResetCountersHalfMaxH = FALSE;
 +            gs.set[eglsRESETCOUNTERS] = 0;
 +        }
 +
 +    }
 +    /* End of main MD loop */
 +    debug_gmx();
 +    
 +    /* Stop the time */
 +    runtime_end(runtime);
 +    
 +    if (bRerunMD && MASTER(cr))
 +    {
 +        close_trj(status);
 +    }
 +    
 +    if (!(cr->duty & DUTY_PME))
 +    {
 +        /* Tell the PME only node to finish */
 +        gmx_pme_send_finish(cr);
 +    }
 +    
 +    if (MASTER(cr))
 +    {
 +        if (ir->nstcalcenergy > 0 && !bRerunMD) 
 +        {
 +            print_ebin(outf->fp_ene,FALSE,FALSE,FALSE,fplog,step,t,
 +                       eprAVER,FALSE,mdebin,fcd,groups,&(ir->opts));
 +        }
 +    }
 +
 +    done_mdoutf(outf);
 +
 +    debug_gmx();
 +
 +    if (ir->nstlist == -1 && nlh.nns > 0 && fplog)
 +    {
 +        fprintf(fplog,"Average neighborlist lifetime: %.1f steps, std.dev.: %.1f steps\n",nlh.s1/nlh.nns,sqrt(nlh.s2/nlh.nns - sqr(nlh.s1/nlh.nns)));
 +        fprintf(fplog,"Average number of atoms that crossed the half buffer length: %.1f\n\n",nlh.ab/nlh.nns);
 +    }
++
++    if (pme_loadbal != NULL)
++    {
++        pme_loadbal_done(pme_loadbal,fplog);
++    }
++
 +    if (shellfc && fplog)
 +    {
 +        fprintf(fplog,"Fraction of iterations that converged:           %.2f %%\n",
 +                (nconverged*100.0)/step_rel);
 +        fprintf(fplog,"Average number of force evaluations per MD step: %.2f\n\n",
 +                tcount/step_rel);
 +    }
 +    
 +    if (repl_ex_nst > 0 && MASTER(cr))
 +    {
 +        print_replica_exchange_statistics(fplog,repl_ex);
 +    }
 +    
 +    runtime->nsteps_done = step_rel;
 +
 +   return 0;
 +}
index 0000000000000000000000000000000000000000,0000000000000000000000000000000000000000..c96d61eac613f0ccde62ddc10b44edc915a38b28
new file mode 100644 (file)
--- /dev/null
--- /dev/null
@@@ -1,0 -1,0 +1,650 @@@
++/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
++ *
++ * 
++ *                This source code is part of
++ * 
++ *                 G   R   O   M   A   C   S
++ * 
++ *          GROningen MAchine for Chemical Simulations
++ * 
++ *                        VERSION 4.6.0
++ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
++ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
++ * Copyright (c) 2001-2011, The GROMACS development team,
++ * check out http://www.gromacs.org for more information.
++
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version 2
++ * of the License, or (at your option) any later version.
++ * 
++ * If you want to redistribute modifications, please consider that
++ * scientific software is very special. Version control is crucial -
++ * bugs must be traceable. We will be happy to consider code for
++ * inclusion in the official distribution, but derived work must not
++ * be called official GROMACS. Details are found in the README & COPYING
++ * files - if they are missing, get the official version at www.gromacs.org.
++ * 
++ * To help us fund GROMACS development, we humbly ask that you cite
++ * the papers on the package - you can find them in the top README file.
++ * 
++ * For more info, check our website at http://www.gromacs.org
++ * 
++ * And Hey:
++ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
++ */
++#ifdef HAVE_CONFIG_H
++#include <config.h>
++#endif
++
++#include "smalloc.h"
++#include "network.h"
++#include "calcgrid.h"
++#include "pme.h"
++#include "vec.h"
++#include "domdec.h"
++#include "nbnxn_cuda_data_mgmt.h"
++#include "force.h"
++#include "macros.h"
++#include "pme_loadbal.h"
++
++/* Parameters and setting for one PP-PME setup */
++typedef struct {
++    real rcut;            /* Coulomb cut-off                              */
++    real rlist;           /* pair-list cut-off                            */
++    real spacing;         /* (largest) PME grid spacing                   */
++    ivec grid;            /* the PME grid dimensions                      */
++    real grid_efficiency; /* ineffiency factor for non-uniform grids <= 1 */
++    real ewaldcoeff;      /* the Ewald coefficient                        */
++    gmx_pme_t pmedata;    /* the data structure used in the PME code      */
++
++    int  count;           /* number of times this setup has been timed    */
++    double cycles;        /* the fastest time for this setup in cycles    */
++} pme_setup_t;
++
++/* In the initial scan, step by grids that are at least a factor 0.8 coarser */
++#define PME_LB_GRID_SCALE_FAC  0.8
++/* In the initial scan, try to skip grids with uneven x/y/z spacing,
++ * checking if the "efficiency" is more than 5% worse than the previous grid.
++ */
++#define PME_LB_GRID_EFFICIENCY_REL_FAC  1.05
++/* Rerun up till 12% slower setups than the fastest up till now */
++#define PME_LB_SLOW_FAC  1.12
++/* If setups get more than 2% faster, do another round to avoid
++ * choosing a slower setup due to acceleration or fluctuations.
++ */
++#define PME_LB_ACCEL_TOL 1.02
++
++enum { epmelblimNO, epmelblimBOX, epmelblimDD, epmelblimNR };
++
++const char *pmelblim_str[epmelblimNR] =
++{ "no", "box size", "domain decompostion" };
++
++struct pme_load_balancing {
++    int  nstage;        /* the current maximum number of stages */
++
++    real cut_spacing;   /* the minimum cutoff / PME grid spacing ratio */
++    real rbuf;          /* the pairlist buffer size */
++    matrix box_start;   /* the initial simulation box */
++    int n;              /* the count of setup as well as the allocation size */
++    pme_setup_t *setup; /* the PME+cutoff setups */
++    int cur;            /* the current setup */
++    int fastest;        /* fastest setup up till now */
++    int start;          /* start of setup range to consider in stage>0 */
++    int end;            /* end   of setup range to consider in stage>0 */
++    int elimited;       /* was the balancing limited, uses enum above */
++
++    int stage;          /* the current stage */
++};
++
++void pme_loadbal_init(pme_load_balancing_t *pme_lb_p,
++                      const t_inputrec *ir,matrix box,
++                      const interaction_const_t *ic,
++                      gmx_pme_t pmedata)
++{
++    pme_load_balancing_t pme_lb;
++    real spm,sp;
++    int  d;
++
++    snew(pme_lb,1);
++
++    /* Any number of stages >= 2 is supported */
++    pme_lb->nstage   = 2;
++
++    pme_lb->rbuf = ic->rlist - ic->rcoulomb;
++
++    copy_mat(box,pme_lb->box_start);
++    if (ir->ePBC==epbcXY && ir->nwall==2)
++    {
++        svmul(ir->wall_ewald_zfac,pme_lb->box_start[ZZ],pme_lb->box_start[ZZ]);
++    }
++
++    pme_lb->n = 1;
++    snew(pme_lb->setup,pme_lb->n);
++
++    pme_lb->cur = 0;
++    pme_lb->setup[0].rcut       = ic->rcoulomb;
++    pme_lb->setup[0].rlist      = ic->rlist;
++    pme_lb->setup[0].grid[XX]   = ir->nkx;
++    pme_lb->setup[0].grid[YY]   = ir->nky;
++    pme_lb->setup[0].grid[ZZ]   = ir->nkz;
++    pme_lb->setup[0].ewaldcoeff = ic->ewaldcoeff;
++
++    pme_lb->setup[0].pmedata  = pmedata;
++    
++    spm = 0;
++    for(d=0; d<DIM; d++)
++    {
++        sp = norm(pme_lb->box_start[d])/pme_lb->setup[0].grid[d];
++        if (sp > spm)
++        {
++            spm = sp;
++        }
++    }
++    pme_lb->setup[0].spacing = spm;
++
++    if (ir->fourier_spacing > 0)
++    {
++        pme_lb->cut_spacing = ir->rcoulomb/ir->fourier_spacing;
++    }
++    else
++    {
++        pme_lb->cut_spacing = ir->rcoulomb/pme_lb->setup[0].spacing;
++    }
++
++    pme_lb->stage = 0;
++
++    pme_lb->fastest  = 0;
++    pme_lb->start    = 0;
++    pme_lb->end      = 0;
++    pme_lb->elimited = epmelblimNO;
++
++    *pme_lb_p = pme_lb;
++}
++
++static gmx_bool pme_loadbal_increase_cutoff(pme_load_balancing_t pme_lb,
++                                            int pme_order)
++{
++    pme_setup_t *set;
++    real fac,sp;
++    int d;
++
++    /* Try to add a new setup with next larger cut-off to the list */
++    pme_lb->n++;
++    srenew(pme_lb->setup,pme_lb->n);
++    set = &pme_lb->setup[pme_lb->n-1];
++    set->pmedata = NULL;
++
++    fac = 1;
++    do
++    {
++        fac *= 1.01;
++        clear_ivec(set->grid);
++        sp = calc_grid(NULL,pme_lb->box_start,
++                       fac*pme_lb->setup[pme_lb->cur].spacing,
++                       &set->grid[XX],
++                       &set->grid[YY],
++                       &set->grid[ZZ]);
++
++        /* In parallel we can't have grids smaller than 2*pme_order,
++         * and we would anyhow not gain much speed at these grid sizes.
++         */
++        for(d=0; d<DIM; d++)
++        {
++            if (set->grid[d] <= 2*pme_order)
++            {
++                pme_lb->n--;
++
++                return FALSE;
++            }
++        }
++    }
++    while (sp <= 1.001*pme_lb->setup[pme_lb->cur].spacing);
++
++    set->rcut    = pme_lb->cut_spacing*sp;
++    set->rlist   = set->rcut + pme_lb->rbuf;
++    set->spacing = sp;
++    /* The grid efficiency is the size wrt a grid with uniform x/y/z spacing */
++    set->grid_efficiency = 1;
++    for(d=0; d<DIM; d++)
++    {
++        set->grid_efficiency *= (set->grid[d]*sp)/norm(pme_lb->box_start[d]);
++    }
++    /* The Ewald coefficient is inversly proportional to the cut-off */
++    set->ewaldcoeff =
++        pme_lb->setup[0].ewaldcoeff*pme_lb->setup[0].rcut/set->rcut;
++
++    set->count   = 0;
++    set->cycles  = 0;
++
++    if (debug)
++    {
++        fprintf(debug,"PME loadbal: grid %d %d %d, cutoff %f\n",
++                set->grid[XX],set->grid[YY],set->grid[ZZ],set->rcut);
++    }
++
++    return TRUE;
++}
++
++static void print_grid(FILE *fp_err,FILE *fp_log,
++                       const char *pre,
++                       const char *desc,
++                       const pme_setup_t *set,
++                       double cycles)
++{
++    char buf[STRLEN],buft[STRLEN];
++    
++    if (cycles >= 0)
++    {
++        sprintf(buft,": %.1f M-cycles",cycles*1e-6);
++    }
++    else
++    {
++        buft[0] = '\0';
++    }
++    sprintf(buf,"%-11s%10s pme grid %d %d %d, cutoff %.3f%s",
++            pre,
++            desc,set->grid[XX],set->grid[YY],set->grid[ZZ],set->rcut,
++            buft);
++    if (fp_err != NULL)
++    {
++        fprintf(fp_err,"\r%s\n",buf);
++    }
++    if (fp_log != NULL)
++    {
++        fprintf(fp_log,"%s\n",buf);
++    }
++}
++
++static int pme_loadbal_end(pme_load_balancing_t pme_lb)
++{
++    /* In the initial stage only n is set; end is not set yet */
++    if (pme_lb->end > 0)
++    {
++        return pme_lb->end;
++    }
++    else
++    {
++        return pme_lb->n;
++    }
++}
++
++static void print_loadbal_limited(FILE *fp_err,FILE *fp_log,
++                                  gmx_large_int_t step,
++                                  pme_load_balancing_t pme_lb)
++{
++    char buf[STRLEN],sbuf[22];
++
++    sprintf(buf,"step %4s: the %s limited the PME load balancing to a cut-off of %.3f",
++            gmx_step_str(step,sbuf),
++            pmelblim_str[pme_lb->elimited],
++            pme_lb->setup[pme_loadbal_end(pme_lb)].rcut);
++    if (fp_err != NULL)
++    {
++        fprintf(fp_err,"\r%s\n",buf);
++    }
++    if (fp_log != NULL)
++    {
++        fprintf(fp_log,"%s\n",buf);
++    }
++}
++
++static void switch_to_stage1(pme_load_balancing_t pme_lb)
++{
++    pme_lb->start = 0;
++    while (pme_lb->start+1 < pme_lb->n &&
++           (pme_lb->setup[pme_lb->start].count == 0 ||
++            pme_lb->setup[pme_lb->start].cycles >
++            pme_lb->setup[pme_lb->fastest].cycles*PME_LB_SLOW_FAC))
++    {
++        pme_lb->start++;
++    }
++    while (pme_lb->start > 0 && pme_lb->setup[pme_lb->start-1].cycles == 0)
++    {
++        pme_lb->start--;
++    }
++
++    pme_lb->end = pme_lb->n;
++    if (pme_lb->setup[pme_lb->end-1].count > 0 &&
++        pme_lb->setup[pme_lb->end-1].cycles >
++        pme_lb->setup[pme_lb->fastest].cycles*PME_LB_SLOW_FAC)
++    {
++        pme_lb->end--;
++    }
++
++    pme_lb->stage = 1;
++
++    /* Next we want to choose setup pme_lb->start, but as we will increase
++     * pme_ln->cur by one right after returning, we subtract 1 here.
++     */
++    pme_lb->cur = pme_lb->start - 1;
++}
++
++gmx_bool pme_load_balance(pme_load_balancing_t pme_lb,
++                          t_commrec *cr,
++                          FILE *fp_err,
++                          FILE *fp_log,
++                          t_inputrec *ir,
++                          t_state *state,
++                          double cycles,
++                          interaction_const_t *ic,
++                          nonbonded_verlet_t *nbv,
++                          gmx_pme_t *pmedata,
++                          gmx_large_int_t step)
++{
++    gmx_bool OK;
++    pme_setup_t *set;
++    double cycles_fast;
++    char buf[STRLEN],sbuf[22];
++
++    if (pme_lb->stage == pme_lb->nstage)
++    {
++        return FALSE;
++    }
++
++    if (PAR(cr))
++    {
++        gmx_sumd(1,&cycles,cr);
++        cycles /= cr->nnodes;
++    }
++
++    set = &pme_lb->setup[pme_lb->cur];
++
++    set->count++;
++    if (set->count % 2 == 1)
++    {
++        /* Skip the first cycle, because the first step after a switch
++         * is much slower due to allocation and/or caching effects.
++         */
++        return TRUE;
++    }
++
++    sprintf(buf, "step %4s: ", gmx_step_str(step,sbuf));
++    print_grid(fp_err,fp_log,buf,"timed with",set,cycles);
++
++    if (set->count <= 2)
++    {
++        set->cycles = cycles;
++    }
++    else
++    {
++        if (cycles*PME_LB_ACCEL_TOL < set->cycles &&
++            pme_lb->stage == pme_lb->nstage - 1)
++        {
++            /* The performance went up a lot (due to e.g. DD load balancing).
++             * Add a stage, keep the minima, but rescan all setups.
++             */
++            pme_lb->nstage++;
++
++            if (debug)
++            {
++                fprintf(debug,"The performance for grid %d %d %d went from %.3f to %.1f M-cycles, this is more than %f\n"
++                        "Increased the number stages to %d"
++                        " and ignoring the previous performance\n",
++                        set->grid[XX],set->grid[YY],set->grid[ZZ],
++                        cycles*1e-6,set->cycles*1e-6,PME_LB_ACCEL_TOL,
++                        pme_lb->nstage);
++            }
++        }
++        set->cycles = min(set->cycles,cycles);
++    }
++
++    if (set->cycles < pme_lb->setup[pme_lb->fastest].cycles)
++    {
++        pme_lb->fastest = pme_lb->cur;
++    }
++    cycles_fast = pme_lb->setup[pme_lb->fastest].cycles;
++
++    /* Check in stage 0 if we should stop scanning grids.
++     * Stop when the time is more than SLOW_FAC longer than the fastest.
++     */
++    if (pme_lb->stage == 0 && pme_lb->cur > 0 &&
++        cycles > pme_lb->setup[pme_lb->fastest].cycles*PME_LB_SLOW_FAC)
++    {
++        pme_lb->n = pme_lb->cur + 1;
++        /* Done with scanning, go to stage 1 */
++        switch_to_stage1(pme_lb);
++    }
++
++    if (pme_lb->stage == 0)
++    {
++        int gridsize_start;
++
++        gridsize_start = set->grid[XX]*set->grid[YY]*set->grid[ZZ];
++
++        do
++        {
++            if (pme_lb->cur+1 < pme_lb->n)
++            {
++                /* We had already generated the next setup */
++                OK = TRUE;
++            }
++            else
++            {
++                /* Find the next setup */
++                OK = pme_loadbal_increase_cutoff(pme_lb,ir->pme_order);
++            }
++                
++            if (OK && ir->ePBC != epbcNONE)
++            {
++                OK = (sqr(pme_lb->setup[pme_lb->cur+1].rlist)
++                      <= max_cutoff2(ir->ePBC,state->box));
++                if (!OK)
++                {
++                    pme_lb->elimited = epmelblimBOX;
++                }
++            }
++
++            if (OK)
++            {
++                pme_lb->cur++;
++
++                if (DOMAINDECOMP(cr))
++                {
++                    OK = change_dd_cutoff(cr,state,ir,
++                                          pme_lb->setup[pme_lb->cur].rlist);
++                    if (!OK)
++                    {
++                        /* Failed: do not use this setup */
++                        pme_lb->cur--;
++                        pme_lb->elimited = epmelblimDD;
++                    }
++                }
++            }
++            if (!OK)
++            {
++                /* We hit the upper limit for the cut-off,
++                 * the setup should not go further than cur.
++                 */
++                pme_lb->n = pme_lb->cur + 1;
++                print_loadbal_limited(fp_err,fp_log,step,pme_lb);
++                /* Switch to the next stage */
++                switch_to_stage1(pme_lb);
++            }
++        }
++        while (OK &&
++               !(pme_lb->setup[pme_lb->cur].grid[XX]*
++                 pme_lb->setup[pme_lb->cur].grid[YY]*
++                 pme_lb->setup[pme_lb->cur].grid[ZZ] <
++                 gridsize_start*PME_LB_GRID_SCALE_FAC
++                 &&
++                 pme_lb->setup[pme_lb->cur].grid_efficiency <
++                 pme_lb->setup[pme_lb->cur-1].grid_efficiency*PME_LB_GRID_EFFICIENCY_REL_FAC));
++    }
++
++    if (pme_lb->stage > 0 && pme_lb->end == 1)
++    {
++        pme_lb->cur = 0;
++        pme_lb->stage = pme_lb->nstage;
++    }
++    else if (pme_lb->stage > 0 && pme_lb->end > 1)
++    {
++        /* If stage = nstage-1:
++         *   scan over all setups, rerunning only those setups
++         *   which are not much slower than the fastest
++         * else:
++         *   use the next setup
++         */
++        do
++        {
++            pme_lb->cur++;
++            if (pme_lb->cur == pme_lb->end)
++            {
++                pme_lb->stage++;
++                pme_lb->cur = pme_lb->start;
++            }
++        }
++        while (pme_lb->stage == pme_lb->nstage - 1 &&
++               pme_lb->setup[pme_lb->cur].count > 0 &&
++               pme_lb->setup[pme_lb->cur].cycles > cycles_fast*PME_LB_SLOW_FAC);
++
++        if (pme_lb->stage == pme_lb->nstage)
++        {
++            /* We are done optimizing, use the fastest setup we found */
++            pme_lb->cur = pme_lb->fastest;
++        }
++    }
++
++    if (DOMAINDECOMP(cr) && pme_lb->stage > 0)
++    {
++        OK = change_dd_cutoff(cr,state,ir,pme_lb->setup[pme_lb->cur].rlist);
++        if (!OK)
++        {
++            /* Failsafe solution */
++            if (pme_lb->cur > 1 && pme_lb->stage == pme_lb->nstage)
++            {
++                pme_lb->stage--;
++            }
++            pme_lb->fastest  = 0;
++            pme_lb->start    = 0;
++            pme_lb->end      = pme_lb->cur;
++            pme_lb->cur      = pme_lb->start;
++            pme_lb->elimited = epmelblimDD;
++            print_loadbal_limited(fp_err,fp_log,step,pme_lb);
++        }
++    }
++
++    /* Change the Coulomb cut-off and the PME grid */
++
++    set = &pme_lb->setup[pme_lb->cur];
++
++    ic->rcoulomb   = set->rcut;
++    ic->rlist      = set->rlist;
++    ic->ewaldcoeff = set->ewaldcoeff;
++
++    if (nbv->grp[0].kernel_type == nbk8x8x8_CUDA)
++    {
++        nbnxn_cuda_pme_loadbal_update_param(nbv->cu_nbv,ic);
++    }
++    else
++    {
++        init_interaction_const_tables(NULL,ic,nbv->grp[0].kernel_type);
++    }
++
++    if (nbv->ngrp > 1)
++    {
++        init_interaction_const_tables(NULL,ic,nbv->grp[1].kernel_type);
++    }
++
++    if (cr->duty & DUTY_PME)
++    {
++        if (pme_lb->setup[pme_lb->cur].pmedata == NULL)
++        {
++            /* Generate a new PME data structure,
++             * copying part of the old pointers.
++             */
++            gmx_pme_reinit(&set->pmedata,
++                           cr,pme_lb->setup[0].pmedata,ir,
++                           set->grid);
++        }
++        *pmedata = set->pmedata;
++    }
++    else
++    {
++        /* Tell our PME-only node to switch grid */
++        gmx_pme_send_switch(cr, set->grid, set->ewaldcoeff);
++    }
++
++    if (debug)
++    {
++        print_grid(NULL,debug,"","switched to",set,-1);
++    }
++
++    if (pme_lb->stage == pme_lb->nstage)
++    {
++        print_grid(fp_err,fp_log,"","optimal",set,-1);
++    }
++
++    return TRUE;
++}
++
++void restart_pme_loadbal(pme_load_balancing_t pme_lb, int n)
++{
++    pme_lb->nstage += n;
++}
++
++static int pme_grid_points(const pme_setup_t *setup)
++{
++    return setup->grid[XX]*setup->grid[YY]*setup->grid[ZZ];
++}
++
++static void print_pme_loadbal_setting(FILE *fplog,
++                                     char *name,
++                                     const pme_setup_t *setup)
++{
++    fprintf(fplog,
++            "   %-7s %6.3f nm %6.3f nm     %3d %3d %3d   %5.3f nm  %5.3f nm\n",
++            name,
++            setup->rcut,setup->rlist,
++            setup->grid[XX],setup->grid[YY],setup->grid[ZZ],
++            setup->spacing,1/setup->ewaldcoeff);
++}
++
++static void print_pme_loadbal_settings(pme_load_balancing_t pme_lb,
++                                       FILE *fplog)
++{
++    double pp_ratio,grid_ratio;
++
++    pp_ratio   = pow(pme_lb->setup[pme_lb->cur].rlist/pme_lb->setup[0].rlist,3.0);
++    grid_ratio = pme_grid_points(&pme_lb->setup[pme_lb->cur])/
++        (double)pme_grid_points(&pme_lb->setup[0]);
++
++    fprintf(fplog,"\n");
++    fprintf(fplog,"       P P   -   P M E   L O A D   B A L A N C I N G\n");
++    fprintf(fplog,"\n");
++    /* Here we only warn when the optimal setting is the last one */
++    if (pme_lb->elimited != epmelblimNO &&
++        pme_lb->cur == pme_loadbal_end(pme_lb)-1)
++    {
++        fprintf(fplog," NOTE: The PP/PME load balancing was limited by the %s,\n",
++                pmelblim_str[pme_lb->elimited]);
++        fprintf(fplog,"       you might not have reached a good load balance.\n");
++        if (pme_lb->elimited == epmelblimDD)
++        {
++            fprintf(fplog,"       Try different mdrun -dd settings or lower the -dds value.\n");
++        }
++        fprintf(fplog,"\n");
++    }
++    fprintf(fplog," PP/PME load balancing changed the cut-off and PME settings:\n");
++    fprintf(fplog,"           particle-particle                    PME\n");
++    fprintf(fplog,"            rcoulomb  rlist            grid      spacing   1/beta\n");
++    print_pme_loadbal_setting(fplog,"initial",&pme_lb->setup[0]);
++    print_pme_loadbal_setting(fplog,"final"  ,&pme_lb->setup[pme_lb->cur]);
++    fprintf(fplog," cost-ratio           %4.2f             %4.2f\n",
++            pp_ratio,grid_ratio);
++    fprintf(fplog," (note that these numbers concern only part of the total PP and PME load)\n");
++    fprintf(fplog,"\n");
++}
++
++void pme_loadbal_done(pme_load_balancing_t pme_lb, FILE *fplog)
++{
++    if (fplog != NULL && (pme_lb->cur > 0 || pme_lb->elimited != epmelblimNO))
++    {
++        print_pme_loadbal_settings(pme_lb,fplog);
++    }
++
++    /* TODO: Here we should free all pointers in pme_lb,
++     * but as it contains pme data structures,
++     * we need to first make pme.c free all data.
++     */
++}
index 0000000000000000000000000000000000000000,0254a29153b588805ec7ef975844df63dc9e1eaf..0254a29153b588805ec7ef975844df63dc9e1eaf
mode 000000,100644..100644
--- /dev/null
index ab7884ac21c1e45835ee21bbb41a4e8bd2aae085,0000000000000000000000000000000000000000..1b9803446c7e044b3397a070f422f2fb19dcb4fa
mode 100644,000000..100644
--- /dev/null
@@@ -1,1776 -1,0 +1,1776 @@@
-     ret=tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi,
 +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 + *
 + * 
 + *                This source code is part of
 + * 
 + *                 G   R   O   M   A   C   S
 + * 
 + *          GROningen MAchine for Chemical Simulations
 + * 
 + *                        VERSION 3.2.0
 + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
 + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 + * Copyright (c) 2001-2004, The GROMACS development team,
 + * check out http://www.gromacs.org for more information.
 +
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License
 + * as published by the Free Software Foundation; either version 2
 + * of the License, or (at your option) any later version.
 + * 
 + * If you want to redistribute modifications, please consider that
 + * scientific software is very special. Version control is crucial -
 + * bugs must be traceable. We will be happy to consider code for
 + * inclusion in the official distribution, but derived work must not
 + * be called official GROMACS. Details are found in the README & COPYING
 + * files - if they are missing, get the official version at www.gromacs.org.
 + * 
 + * To help us fund GROMACS development, we humbly ask that you cite
 + * the papers on the package - you can find them in the top README file.
 + * 
 + * For more info, check our website at http://www.gromacs.org
 + * 
 + * And Hey:
 + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
 + */
 +#ifdef HAVE_CONFIG_H
 +#include <config.h>
 +#endif
 +#ifdef __linux
 +#define _GNU_SOURCE
 +#include <sched.h>
 +#include <sys/syscall.h>
 +#endif
 +#include <signal.h>
 +#include <stdlib.h>
 +#ifdef HAVE_UNISTD_H
 +#include <unistd.h>
 +#endif
 +#include <string.h>
 +#include <assert.h>
 +
 +#include "typedefs.h"
 +#include "smalloc.h"
 +#include "sysstuff.h"
 +#include "statutil.h"
 +#include "mdrun.h"
 +#include "md_logging.h"
 +#include "md_support.h"
 +#include "network.h"
 +#include "pull.h"
 +#include "pull_rotation.h"
 +#include "names.h"
 +#include "disre.h"
 +#include "orires.h"
 +#include "pme.h"
 +#include "mdatoms.h"
 +#include "repl_ex.h"
 +#include "qmmm.h"
 +#include "domdec.h"
 +#include "partdec.h"
 +#include "coulomb.h"
 +#include "constr.h"
 +#include "mvdata.h"
 +#include "checkpoint.h"
 +#include "mtop_util.h"
 +#include "sighandler.h"
 +#include "tpxio.h"
 +#include "txtdump.h"
 +#include "gmx_detect_hardware.h"
 +#include "gmx_omp_nthreads.h"
 +#include "pull_rotation.h"
 +#include "calc_verletbuf.h"
 +#include "nbnxn_search.h"
 +#include "../mdlib/nbnxn_consts.h"
 +#include "gmx_fatal_collective.h"
 +#include "membed.h"
 +#include "macros.h"
 +#include "gmx_omp.h"
 +
 +#ifdef GMX_LIB_MPI
 +#include <mpi.h>
 +#endif
 +#ifdef GMX_THREAD_MPI
 +#include "tmpi.h"
 +#endif
 +
 +#ifdef GMX_FAHCORE
 +#include "corewrap.h"
 +#endif
 +
 +#ifdef GMX_OPENMM
 +#include "md_openmm.h"
 +#endif
 +
 +#include "gpu_utils.h"
 +#include "nbnxn_cuda_data_mgmt.h"
 +
 +typedef struct { 
 +    gmx_integrator_t *func;
 +} gmx_intp_t;
 +
 +/* The array should match the eI array in include/types/enums.h */
 +#ifdef GMX_OPENMM  /* FIXME do_md_openmm needs fixing */
 +const gmx_intp_t integrator[eiNR] = { {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm},{do_md_openmm}};
 +#else
 +const gmx_intp_t integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md},{do_md}};
 +#endif
 +
 +gmx_large_int_t     deform_init_init_step_tpx;
 +matrix              deform_init_box_tpx;
 +#ifdef GMX_THREAD_MPI
 +tMPI_Thread_mutex_t deform_init_box_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
 +#endif
 +
 +
 +#ifdef GMX_THREAD_MPI
 +struct mdrunner_arglist
 +{
 +    gmx_hw_opt_t *hw_opt;
 +    FILE *fplog;
 +    t_commrec *cr;
 +    int nfile;
 +    const t_filenm *fnm;
 +    output_env_t oenv;
 +    gmx_bool bVerbose;
 +    gmx_bool bCompact;
 +    int nstglobalcomm;
 +    ivec ddxyz;
 +    int dd_node_order;
 +    real rdd;
 +    real rconstr;
 +    const char *dddlb_opt;
 +    real dlb_scale;
 +    const char *ddcsx;
 +    const char *ddcsy;
 +    const char *ddcsz;
 +    const char *nbpu_opt;
 +    int nsteps_cmdline;
 +    int nstepout;
 +    int resetstep;
 +    int nmultisim;
 +    int repl_ex_nst;
 +    int repl_ex_nex;
 +    int repl_ex_seed;
 +    real pforce;
 +    real cpt_period;
 +    real max_hours;
 +    const char *deviceOptions;
 +    unsigned long Flags;
 +    int ret; /* return value */
 +};
 +
 +
 +/* The function used for spawning threads. Extracts the mdrunner() 
 +   arguments from its one argument and calls mdrunner(), after making
 +   a commrec. */
 +static void mdrunner_start_fn(void *arg)
 +{
 +    struct mdrunner_arglist *mda=(struct mdrunner_arglist*)arg;
 +    struct mdrunner_arglist mc=*mda; /* copy the arg list to make sure 
 +                                        that it's thread-local. This doesn't
 +                                        copy pointed-to items, of course,
 +                                        but those are all const. */
 +    t_commrec *cr;  /* we need a local version of this */
 +    FILE *fplog=NULL;
 +    t_filenm *fnm;
 +
 +    fnm = dup_tfn(mc.nfile, mc.fnm);
 +
 +    cr = init_par_threads(mc.cr);
 +
 +    if (MASTER(cr))
 +    {
 +        fplog=mc.fplog;
 +    }
 +
 +    mda->ret=mdrunner(mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv, 
 +                      mc.bVerbose, mc.bCompact, mc.nstglobalcomm, 
 +                      mc.ddxyz, mc.dd_node_order, mc.rdd,
 +                      mc.rconstr, mc.dddlb_opt, mc.dlb_scale, 
 +                      mc.ddcsx, mc.ddcsy, mc.ddcsz,
 +                      mc.nbpu_opt,
 +                      mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
 +                      mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce, 
 +                      mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.Flags);
 +}
 +
 +/* called by mdrunner() to start a specific number of threads (including 
 +   the main thread) for thread-parallel runs. This in turn calls mdrunner()
 +   for each thread. 
 +   All options besides nthreads are the same as for mdrunner(). */
 +static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt, 
 +              FILE *fplog,t_commrec *cr,int nfile, 
 +              const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
 +              gmx_bool bCompact, int nstglobalcomm,
 +              ivec ddxyz,int dd_node_order,real rdd,real rconstr,
 +              const char *dddlb_opt,real dlb_scale,
 +              const char *ddcsx,const char *ddcsy,const char *ddcsz,
 +              const char *nbpu_opt,
 +              int nsteps_cmdline, int nstepout,int resetstep,
 +              int nmultisim,int repl_ex_nst,int repl_ex_nex, int repl_ex_seed,
 +              real pforce,real cpt_period, real max_hours, 
 +              const char *deviceOptions, unsigned long Flags)
 +{
 +    int ret;
 +    struct mdrunner_arglist *mda;
 +    t_commrec *crn; /* the new commrec */
 +    t_filenm *fnmn;
 +
 +    /* first check whether we even need to start tMPI */
 +    if (hw_opt->nthreads_tmpi < 2)
 +    {
 +        return cr;
 +    }
 +
 +    /* a few small, one-time, almost unavoidable memory leaks: */
 +    snew(mda,1);
 +    fnmn=dup_tfn(nfile, fnm);
 +
 +    /* fill the data structure to pass as void pointer to thread start fn */
 +    mda->hw_opt=hw_opt;
 +    mda->fplog=fplog;
 +    mda->cr=cr;
 +    mda->nfile=nfile;
 +    mda->fnm=fnmn;
 +    mda->oenv=oenv;
 +    mda->bVerbose=bVerbose;
 +    mda->bCompact=bCompact;
 +    mda->nstglobalcomm=nstglobalcomm;
 +    mda->ddxyz[XX]=ddxyz[XX];
 +    mda->ddxyz[YY]=ddxyz[YY];
 +    mda->ddxyz[ZZ]=ddxyz[ZZ];
 +    mda->dd_node_order=dd_node_order;
 +    mda->rdd=rdd;
 +    mda->rconstr=rconstr;
 +    mda->dddlb_opt=dddlb_opt;
 +    mda->dlb_scale=dlb_scale;
 +    mda->ddcsx=ddcsx;
 +    mda->ddcsy=ddcsy;
 +    mda->ddcsz=ddcsz;
 +    mda->nbpu_opt=nbpu_opt;
 +    mda->nsteps_cmdline=nsteps_cmdline;
 +    mda->nstepout=nstepout;
 +    mda->resetstep=resetstep;
 +    mda->nmultisim=nmultisim;
 +    mda->repl_ex_nst=repl_ex_nst;
 +    mda->repl_ex_nex=repl_ex_nex;
 +    mda->repl_ex_seed=repl_ex_seed;
 +    mda->pforce=pforce;
 +    mda->cpt_period=cpt_period;
 +    mda->max_hours=max_hours;
 +    mda->deviceOptions=deviceOptions;
 +    mda->Flags=Flags;
 +
 +    fprintf(stderr, "Starting %d tMPI threads\n",hw_opt->nthreads_tmpi);
 +    fflush(stderr);
 +    /* now spawn new threads that start mdrunner_start_fn(), while 
 +       the main thread returns */
++    ret=tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi, TMPI_AFFINITY_ALL_CORES,
 +                     mdrunner_start_fn, (void*)(mda) );
 +    if (ret!=TMPI_SUCCESS)
 +        return NULL;
 +
 +    /* make a new comm_rec to reflect the new situation */
 +    crn=init_par_threads(cr);
 +    return crn;
 +}
 +
 +
 +static int get_tmpi_omp_thread_distribution(const gmx_hw_opt_t *hw_opt,
 +                                            int nthreads_tot,
 +                                            int ngpu)
 +{
 +    int nthreads_tmpi;
 +
 +    /* There are no separate PME nodes here, as we ensured in
 +     * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
 +     * and a conditional ensures we would not have ended up here.
 +     * Note that separate PME nodes might be switched on later.
 +     */
 +    if (ngpu > 0)
 +    {
 +        nthreads_tmpi = ngpu;
 +        if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
 +        {
 +            nthreads_tmpi = nthreads_tot;
 +        }
 +    }
 +    else if (hw_opt->nthreads_omp > 0)
 +    {
 +        if (hw_opt->nthreads_omp > nthreads_tot)
 +        {
 +            gmx_fatal(FARGS,"More OpenMP threads requested (%d) than the total number of threads requested (%d)",hw_opt->nthreads_omp,nthreads_tot);
 +        }
 +        nthreads_tmpi = nthreads_tot/hw_opt->nthreads_omp;
 +    }
 +    else
 +    {
 +        /* TODO choose nthreads_omp based on hardware topology
 +           when we have a hardware topology detection library */
 +        /* Don't use OpenMP parallelization */
 +        nthreads_tmpi = nthreads_tot;
 +    }
 +
 +    return nthreads_tmpi;
 +}
 +
 +
 +/* Get the number of threads to use for thread-MPI based on how many
 + * were requested, which algorithms we're using,
 + * and how many particles there are.
 + * At the point we have already called check_and_update_hw_opt.
 + * Thus all options should be internally consistent and consistent
 + * with the hardware, except that ntmpi could be larger than #GPU.
 + */
 +static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
 +                            gmx_hw_opt_t *hw_opt,
 +                            t_inputrec *inputrec, gmx_mtop_t *mtop,
 +                            const t_commrec *cr,
 +                            FILE *fplog)
 +{
 +    int nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu;
 +    int min_atoms_per_mpi_thread;
 +    char *env;
 +    char sbuf[STRLEN];
 +    gmx_bool bCanUseGPU;
 +
 +    if (hw_opt->nthreads_tmpi > 0)
 +    {
 +        /* Trivial, return right away */
 +        return hw_opt->nthreads_tmpi;
 +    }
 +
 +    /* How many total (#tMPI*#OpenMP) threads can we start? */ 
 +    if (hw_opt->nthreads_tot > 0)
 +    {
 +        nthreads_tot_max = hw_opt->nthreads_tot;
 +    }
 +    else
 +    {
 +        nthreads_tot_max = tMPI_Thread_get_hw_number();
 +    }
 +
 +    bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU);
 +    if (bCanUseGPU)
 +    {
 +        ngpu = hwinfo->gpu_info.ncuda_dev_use;
 +    }
 +    else
 +    {
 +        ngpu = 0;
 +    }
 +
 +    nthreads_tmpi =
 +        get_tmpi_omp_thread_distribution(hw_opt,nthreads_tot_max,ngpu);
 +
 +    if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
 +    {
 +        /* Steps are divided over the nodes iso splitting the atoms */
 +        min_atoms_per_mpi_thread = 0;
 +    }
 +    else
 +    {
 +        if (bCanUseGPU)
 +        {
 +            min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU;
 +        }
 +        else
 +        {
 +            min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD;
 +        }
 +    }
 +
 +    /* Check if an algorithm does not support parallel simulation.  */
 +    if (nthreads_tmpi != 1 &&
 +        ( inputrec->eI == eiLBFGS ||
 +          inputrec->coulombtype == eelEWALD ) )
 +    {
 +        nthreads_tmpi = 1;
 +
 +        md_print_warn(cr,fplog,"The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
 +        if (hw_opt->nthreads_tmpi > nthreads_tmpi)
 +        {
 +            gmx_fatal(FARGS,"You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
 +        }
 +    }
 +    else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread)
 +    {
 +        /* the thread number was chosen automatically, but there are too many
 +           threads (too few atoms per thread) */
 +        nthreads_new = max(1,mtop->natoms/min_atoms_per_mpi_thread);
 +
 +        if (nthreads_new > 8 || (nthreads_tmpi == 8 && nthreads_new > 4))
 +        {
 +            /* TODO replace this once we have proper HT detection
 +             * Use only multiples of 4 above 8 threads
 +             * or with an 8-core processor
 +             * (to avoid 6 threads on 8 core processors with 4 real cores).
 +             */
 +            nthreads_new = (nthreads_new/4)*4;
 +        }
 +        else if (nthreads_new > 4)
 +        {
 +            /* Avoid 5 or 7 threads */
 +            nthreads_new = (nthreads_new/2)*2;
 +        }
 +
 +        nthreads_tmpi = nthreads_new;
 +
 +        fprintf(stderr,"\n");
 +        fprintf(stderr,"NOTE: Parallelization is limited by the small number of atoms,\n");
 +        fprintf(stderr,"      only starting %d thread-MPI threads.\n",nthreads_tmpi);
 +        fprintf(stderr,"      You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
 +    }
 +
 +    return nthreads_tmpi;
 +}
 +#endif /* GMX_THREAD_MPI */
 +
 +
 +/* Environment variable for setting nstlist */
 +static const char*  NSTLIST_ENVVAR          =  "GMX_NSTLIST";
 +/* Try to increase nstlist when using a GPU with nstlist less than this */
 +static const int    NSTLIST_GPU_ENOUGH      = 20;
 +/* Increase nstlist until the non-bonded cost increases more than this factor */
 +static const float  NBNXN_GPU_LIST_OK_FAC   = 1.25;
 +/* Don't increase nstlist beyond a non-bonded cost increases of this factor */
 +static const float  NBNXN_GPU_LIST_MAX_FAC  = 1.40;
 +
 +/* Try to increase nstlist when running on a GPU */
 +static void increase_nstlist(FILE *fp,t_commrec *cr,
 +                             t_inputrec *ir,const gmx_mtop_t *mtop,matrix box)
 +{
 +    char *env;
 +    int  nstlist_orig,nstlist_prev;
 +    verletbuf_list_setup_t ls;
 +    real rlist_inc,rlist_ok,rlist_max,rlist_new,rlist_prev;
 +    int  i;
 +    t_state state_tmp;
 +    gmx_bool bBox,bDD,bCont;
 +    const char *nstl_fmt="\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
 +    const char *vbd_err="Can not increase nstlist for GPU run because verlet-buffer-drift is not set or used";
 +    const char *box_err="Can not increase nstlist for GPU run because the box is too small";
 +    const char *dd_err ="Can not increase nstlist for GPU run because of domain decomposition limitations";
 +    char buf[STRLEN];
 +
 +    /* Number of + nstlist alternative values to try when switching  */
 +    const int nstl[]={ 20, 25, 40, 50 };
 +#define NNSTL  sizeof(nstl)/sizeof(nstl[0])
 +
 +    env = getenv(NSTLIST_ENVVAR);
 +    if (env == NULL)
 +    {
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,nstl_fmt,ir->nstlist);
 +        }
 +    }
 +
 +    if (ir->verletbuf_drift == 0)
 +    {
 +        gmx_fatal(FARGS,"You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
 +    }
 +
 +    if (ir->verletbuf_drift < 0)
 +    {
 +        if (MASTER(cr))
 +        {
 +            fprintf(stderr,"%s\n",vbd_err);
 +        }
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,"%s\n",vbd_err);
 +        }
 +
 +        return;
 +    }
 +
 +    nstlist_orig = ir->nstlist;
 +    if (env != NULL)
 +    {
 +        sprintf(buf,"Getting nstlist from environment variable GMX_NSTLIST=%s",env);
 +        if (MASTER(cr))
 +        {
 +            fprintf(stderr,"%s\n",buf);
 +        }
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,"%s\n",buf);
 +        }
 +        sscanf(env,"%d",&ir->nstlist);
 +    }
 +
 +    verletbuf_get_list_setup(TRUE,&ls);
 +
 +    /* Allow rlist to make the list double the size of the cut-off sphere */
 +    rlist_inc = nbnxn_get_rlist_effective_inc(NBNXN_GPU_CLUSTER_SIZE,mtop->natoms/det(box));
 +    rlist_ok  = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_OK_FAC,1.0/3.0) - rlist_inc;
 +    rlist_max = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_MAX_FAC,1.0/3.0) - rlist_inc;
 +    if (debug)
 +    {
 +        fprintf(debug,"GPU nstlist tuning: rlist_inc %.3f rlist_max %.3f\n",
 +                rlist_inc,rlist_max);
 +    }
 +
 +    i = 0;
 +    nstlist_prev = nstlist_orig;
 +    rlist_prev   = ir->rlist;
 +    do
 +    {
 +        if (env == NULL)
 +        {
 +            ir->nstlist = nstl[i];
 +        }
 +
 +        /* Set the pair-list buffer size in ir */
 +        calc_verlet_buffer_size(mtop,det(box),ir,ir->verletbuf_drift,&ls,
 +                                NULL,&rlist_new);
 +
 +        /* Does rlist fit in the box? */
 +        bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC,box));
 +        bDD  = TRUE;
 +        if (bBox && DOMAINDECOMP(cr))
 +        {
 +            /* Check if rlist fits in the domain decomposition */
 +            if (inputrec2nboundeddim(ir) < DIM)
 +            {
 +                gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
 +            }
 +            copy_mat(box,state_tmp.box);
 +            bDD = change_dd_cutoff(cr,&state_tmp,ir,rlist_new);
 +        }
 +
 +        bCont = FALSE;
 +
 +        if (env == NULL)
 +        {
 +            if (bBox && bDD && rlist_new <= rlist_max)
 +            {
 +                /* Increase nstlist */
 +                nstlist_prev = ir->nstlist;
 +                rlist_prev   = rlist_new;
 +                bCont = (i+1 < NNSTL && rlist_new < rlist_ok);
 +            }
 +            else
 +            {
 +                /* Stick with the previous nstlist */
 +                ir->nstlist = nstlist_prev;
 +                rlist_new   = rlist_prev;
 +                bBox = TRUE;
 +                bDD  = TRUE;
 +            }
 +        }
 +
 +        i++;
 +    }
 +    while (bCont);
 +
 +    if (!bBox || !bDD)
 +    {
 +        gmx_warning(!bBox ? box_err : dd_err);
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,"\n%s\n",bBox ? box_err : dd_err);
 +        }
 +        ir->nstlist = nstlist_orig;
 +    }
 +    else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
 +    {
 +        sprintf(buf,"Changing nstlist from %d to %d, rlist from %g to %g",
 +                nstlist_orig,ir->nstlist,
 +                ir->rlist,rlist_new);
 +        if (MASTER(cr))
 +        {
 +            fprintf(stderr,"%s\n\n",buf);
 +        }
 +        if (fp != NULL)
 +        {
 +            fprintf(fp,"%s\n\n",buf);
 +        }
 +        ir->rlist     = rlist_new;
 +        ir->rlistlong = rlist_new;
 +    }
 +}
 +
 +static void prepare_verlet_scheme(FILE *fplog,
 +                                  gmx_hw_info_t *hwinfo,
 +                                  t_commrec *cr,
 +                                  gmx_hw_opt_t *hw_opt,
 +                                  const char *nbpu_opt,
 +                                  t_inputrec *ir,
 +                                  const gmx_mtop_t *mtop,
 +                                  matrix box,
 +                                  gmx_bool *bUseGPU)
 +{
 +    /* Here we only check for GPU usage on the MPI master process,
 +     * as here we don't know how many GPUs we will use yet.
 +     * We check for a GPU on all processes later.
 +     */
 +    *bUseGPU = hwinfo->bCanUseGPU || (getenv("GMX_EMULATE_GPU") != NULL);
 +
 +    if (ir->verletbuf_drift > 0)
 +    {
 +        /* Update the Verlet buffer size for the current run setup */
 +        verletbuf_list_setup_t ls;
 +        real rlist_new;
 +
 +        /* Here we assume CPU acceleration is on. But as currently
 +         * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
 +         * and 4x2 gives a larger buffer than 4x4, this is ok.
 +         */
 +        verletbuf_get_list_setup(*bUseGPU,&ls);
 +
 +        calc_verlet_buffer_size(mtop,det(box),ir,
 +                                ir->verletbuf_drift,&ls,
 +                                NULL,&rlist_new);
 +        if (rlist_new != ir->rlist)
 +        {
 +            if (fplog != NULL)
 +            {
 +                fprintf(fplog,"\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
 +                        ir->rlist,rlist_new,
 +                        ls.cluster_size_i,ls.cluster_size_j);
 +            }
 +            ir->rlist     = rlist_new;
 +            ir->rlistlong = rlist_new;
 +        }
 +    }
 +
 +    /* With GPU or emulation we should check nstlist for performance */
 +    if ((EI_DYNAMICS(ir->eI) &&
 +         *bUseGPU &&
 +         ir->nstlist < NSTLIST_GPU_ENOUGH) ||
 +        getenv(NSTLIST_ENVVAR) != NULL)
 +    {
 +        /* Choose a better nstlist */
 +        increase_nstlist(fplog,cr,ir,mtop,box);
 +    }
 +}
 +
 +static void convert_to_verlet_scheme(FILE *fplog,
 +                                     t_inputrec *ir,
 +                                     gmx_mtop_t *mtop,real box_vol)
 +{
 +    char *conv_mesg="Converting input file with group cut-off scheme to the Verlet cut-off scheme";
 +
 +    md_print_warn(NULL,fplog,"%s\n",conv_mesg);
 +
 +    ir->cutoff_scheme   = ecutsVERLET;
 +    ir->verletbuf_drift = 0.005;
 +
 +    if (ir->rcoulomb != ir->rvdw)
 +    {
 +        gmx_fatal(FARGS,"The VdW and Coulomb cut-offs are different, whereas the Verlet scheme only supports equal cut-offs");
 +    }
 +
 +    if (ir->vdwtype == evdwUSER || EEL_USER(ir->coulombtype))
 +    {
 +        gmx_fatal(FARGS,"User non-bonded potentials are not (yet) supported with the Verlet scheme");
 +    }
 +    else if (EVDW_SWITCHED(ir->vdwtype) || EEL_SWITCHED(ir->coulombtype))
 +    {
 +        md_print_warn(NULL,fplog,"Converting switched or shifted interactions to a shifted potential (without force shift), this will lead to slightly different interaction potentials");
 +
 +        if (EVDW_SWITCHED(ir->vdwtype))
 +        {
 +            ir->vdwtype = evdwCUT;
 +        }
 +        if (EEL_SWITCHED(ir->coulombtype))
 +        {
 +            if (EEL_FULL(ir->coulombtype))
 +            {
 +                /* With full electrostatic only PME can be switched */
 +                ir->coulombtype = eelPME;
 +            }
 +            else
 +            {
 +                md_print_warn(NULL,fplog,"NOTE: Replacing %s electrostatics with reaction-field with epsilon-rf=inf\n",eel_names[ir->coulombtype]);
 +                ir->coulombtype = eelRF;
 +                ir->epsilon_rf  = 0.0;
 +            }
 +        }
 +
 +        /* We set the target energy drift to a small number.
 +         * Note that this is only for testing. For production the user
 +         * should think about this and set the mdp options.
 +         */
 +        ir->verletbuf_drift = 1e-4;
 +    }
 +
 +    if (inputrec2nboundeddim(ir) != 3)
 +    {
 +        gmx_fatal(FARGS,"Can only convert old tpr files to the Verlet cut-off scheme with 3D pbc");
 +    }
 +
 +    if (ir->efep != efepNO || ir->implicit_solvent != eisNO)
 +    {
 +        gmx_fatal(FARGS,"Will not convert old tpr files to the Verlet cut-off scheme with free-energy calculations or implicit solvent");
 +    }
 +
 +    if (EI_DYNAMICS(ir->eI) && !(EI_MD(ir->eI) && ir->etc == etcNO))
 +    {
 +        verletbuf_list_setup_t ls;
 +
 +        verletbuf_get_list_setup(FALSE,&ls);
 +        calc_verlet_buffer_size(mtop,box_vol,ir,ir->verletbuf_drift,&ls,
 +                                NULL,&ir->rlist);
 +    }
 +    else
 +    {
 +        ir->verletbuf_drift = -1;
 +        ir->rlist           = 1.05*max(ir->rvdw,ir->rcoulomb);
 +    }
 +
 +    gmx_mtop_remove_chargegroups(mtop);
 +}
 +
 +
 +/* Set CPU affinity. Can be important for performance.
 +   On some systems (e.g. Cray) CPU Affinity is set by default.
 +   But default assigning doesn't work (well) with only some ranks
 +   having threads. This causes very low performance.
 +   External tools have cumbersome syntax for setting affinity
 +   in the case that only some ranks have threads.
 +   Thus it is important that GROMACS sets the affinity internally
 +   if only PME is using threads.
 +*/
 +static void set_cpu_affinity(FILE *fplog,
 +                             const t_commrec *cr,
 +                             const gmx_hw_opt_t *hw_opt,
 +                             int nthreads_pme,
 +                             const gmx_hw_info_t *hwinfo,
 +                             const t_inputrec *inputrec)
 +{
 +#ifdef GMX_OPENMP /* TODO: actually we could do this even without OpenMP?! */
 +#ifdef __linux /* TODO: only linux? why not everywhere if sched_setaffinity is available */
 +    if (hw_opt->bThreadPinning)
 +    {
 +        int thread, nthread_local, nthread_node, nthread_hw_max, nphyscore;
 +        int offset;
 +        char *env;
 +
 +        /* threads on this MPI process or TMPI thread */
 +        if (cr->duty & DUTY_PP)
 +        {
 +            nthread_local = gmx_omp_nthreads_get(emntNonbonded);
 +        }
 +        else
 +        {
 +            nthread_local = gmx_omp_nthreads_get(emntPME);
 +        }
 +
 +        /* map the current process to cores */
 +        thread = 0;
 +        nthread_node = nthread_local;
 +#ifdef GMX_MPI
 +        if (PAR(cr) || MULTISIM(cr))
 +        {
 +            /* We need to determine a scan of the thread counts in this
 +             * compute node.
 +             */
 +            MPI_Comm comm_intra;
 +
 +            MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),cr->nodeid_intra,
 +                           &comm_intra);
 +            MPI_Scan(&nthread_local,&thread,1,MPI_INT,MPI_SUM,comm_intra);
 +            /* MPI_Scan is inclusive, but here we need exclusive */
 +            thread -= nthread_local;
 +            /* Get the total number of threads on this physical node */
 +            MPI_Allreduce(&nthread_local,&nthread_node,1,MPI_INT,MPI_SUM,comm_intra);
 +            MPI_Comm_free(&comm_intra);
 +        }
 +#endif
 +
 +        offset = 0;
 +        if (hw_opt->core_pinning_offset > 0)
 +        {
 +            offset = hw_opt->core_pinning_offset;
 +            if (SIMMASTER(cr))
 +            {
 +                fprintf(stderr, "Applying core pinning offset %d\n", offset);
 +            }
 +            if (fplog)
 +            {
 +                fprintf(fplog, "Applying core pinning offset %d\n", offset);
 +            }
 +        }
 +
 +        /* With Intel Hyper-Threading enabled, we want to pin consecutive
 +         * threads to physical cores when using more threads than physical
 +         * cores or when the user requests so.
 +         */
 +        nthread_hw_max = hwinfo->nthreads_hw_avail;
 +        nphyscore = -1;
 +        if (hw_opt->bPinHyperthreading ||
 +            (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
 +             nthread_node > nthread_hw_max/2 && getenv("GMX_DISABLE_PINHT") == NULL))
 +        {
 +            if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) != GMX_CPUID_X86_SMT_ENABLED)
 +            {
 +                /* We print to stderr on all processes, as we might have
 +                 * different settings on different physical nodes.
 +                 */
 +                if (gmx_cpuid_vendor(hwinfo->cpuid_info) != GMX_CPUID_VENDOR_INTEL)
 +                {
 +                    md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
 +                                  "but non-Intel CPU detected (vendor: %s)\n",
 +                                  gmx_cpuid_vendor_string[gmx_cpuid_vendor(hwinfo->cpuid_info)]);
 +                }
 +                else
 +                {
 +                    md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
 +                                  "but the CPU detected does not have Intel Hyper-Threading support "
 +                                  "(or it is turned off)\n");
 +                }
 +            }
 +            nphyscore = nthread_hw_max/2;
 +
 +            if (SIMMASTER(cr))
 +            {
 +                fprintf(stderr, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
 +                        nphyscore);
 +            }
 +            if (fplog)
 +            {
 +                fprintf(fplog, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
 +                        nphyscore);
 +            }
 +        }
 +
 +        /* set the per-thread affinity */
 +#pragma omp parallel firstprivate(thread) num_threads(nthread_local)
 +        {
 +            cpu_set_t mask;
 +            int core;
 +
 +            CPU_ZERO(&mask);
 +            thread += gmx_omp_get_thread_num();
 +            if (nphyscore <= 0)
 +            {
 +                core = offset + thread;
 +            }
 +            else
 +            {
 +                /* Lock pairs of threads to the same hyperthreaded core */
 +                core = offset + thread/2 + (thread % 2)*nphyscore;
 +            }
 +            CPU_SET(core, &mask);
 +            sched_setaffinity((pid_t) syscall (SYS_gettid), sizeof(cpu_set_t), &mask);
 +        }
 +    }
 +#endif /* __linux    */
 +#endif /* GMX_OPENMP */
 +}
 +
 +
 +static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt,
 +                                    int cutoff_scheme)
 +{
 +    gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp);
 +
 +#ifndef GMX_THREAD_MPI
 +    if (hw_opt->nthreads_tot > 0)
 +    {
 +        gmx_fatal(FARGS,"Setting the total number of threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
 +    }
 +    if (hw_opt->nthreads_tmpi > 0)
 +    {
 +        gmx_fatal(FARGS,"Setting the number of thread-MPI threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
 +    }
 +#endif
 +
 +    if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
 +    {
 +        /* We have the same number of OpenMP threads for PP and PME processes,
 +         * thus we can perform several consistency checks.
 +         */
 +        if (hw_opt->nthreads_tmpi > 0 &&
 +            hw_opt->nthreads_omp > 0 &&
 +            hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
 +        {
 +            gmx_fatal(FARGS,"The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
 +                      hw_opt->nthreads_tot,hw_opt->nthreads_tmpi,hw_opt->nthreads_omp);
 +        }
 +
 +        if (hw_opt->nthreads_tmpi > 0 &&
 +            hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
 +        {
 +            gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
 +                      hw_opt->nthreads_tot,hw_opt->nthreads_tmpi);
 +        }
 +
 +        if (hw_opt->nthreads_omp > 0 &&
 +            hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
 +        {
 +            gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
 +                      hw_opt->nthreads_tot,hw_opt->nthreads_omp);
 +        }
 +
 +        if (hw_opt->nthreads_tmpi > 0 &&
 +            hw_opt->nthreads_omp <= 0)
 +        {
 +            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
 +        }
 +    }
 +
 +#ifndef GMX_OPENMP
 +    if (hw_opt->nthreads_omp > 1)
 +    {
 +        gmx_fatal(FARGS,"OpenMP threads are requested, but Gromacs was compiled without OpenMP support");
 +    }
 +#endif
 +
 +    if (cutoff_scheme == ecutsGROUP)
 +    {
 +        /* We only have OpenMP support for PME only nodes */
 +        if (hw_opt->nthreads_omp > 1)
 +        {
 +            gmx_fatal(FARGS,"OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
 +                      ecutscheme_names[cutoff_scheme],
 +                      ecutscheme_names[ecutsVERLET]);
 +        }
 +        hw_opt->nthreads_omp = 1;
 +    }
 +
 +    if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
 +    {
 +        gmx_fatal(FARGS,"You need to specify -ntomp in addition to -ntomp_pme");
 +    }
 +
 +    if (hw_opt->nthreads_tot == 1)
 +    {
 +        hw_opt->nthreads_tmpi = 1;
 +
 +        if (hw_opt->nthreads_omp > 1)
 +        {
 +            gmx_fatal(FARGS,"You requested %d OpenMP threads with %d total threads",
 +                      hw_opt->nthreads_tmpi,hw_opt->nthreads_tot);
 +        }
 +        hw_opt->nthreads_omp = 1;
 +    }
 +
 +    if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
 +    {
 +        hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
 +    }
 +
 +    if (debug)
 +    {
 +        fprintf(debug,"hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
 +                hw_opt->nthreads_tot,
 +                hw_opt->nthreads_tmpi,
 +                hw_opt->nthreads_omp,
 +                hw_opt->nthreads_omp_pme,
 +                hw_opt->gpu_id!=NULL ? hw_opt->gpu_id : "");
 +                
 +    }
 +}
 +
 +
 +/* Override the value in inputrec with value passed on the command line (if any) */
 +static void override_nsteps_cmdline(FILE *fplog,
 +                                    int nsteps_cmdline,
 +                                    t_inputrec *ir,
 +                                    const t_commrec *cr)
 +{
 +    assert(ir);
 +    assert(cr);
 +
 +    /* override with anything else than the default -2 */
 +    if (nsteps_cmdline > -2)
 +    {
 +        char stmp[STRLEN];
 +
 +        ir->nsteps = nsteps_cmdline;
 +        if (EI_DYNAMICS(ir->eI))
 +        {
 +            sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps, %.3f ps",
 +                    nsteps_cmdline, nsteps_cmdline*ir->delta_t);
 +        }
 +        else
 +        {
 +            sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps",
 +                    nsteps_cmdline);
 +        }
 +
 +        md_print_warn(cr, fplog, "%s\n", stmp);
 +    }
 +}
 +
 +/* Data structure set by SIMMASTER which needs to be passed to all nodes
 + * before the other nodes have read the tpx file and called gmx_detect_hardware.
 + */
 +typedef struct {
 +    int      cutoff_scheme; /* The cutoff-scheme from inputrec_t */
 +    gmx_bool bUseGPU;       /* Use GPU or GPU emulation          */
 +} master_inf_t;
 +
 +int mdrunner(gmx_hw_opt_t *hw_opt,
 +             FILE *fplog,t_commrec *cr,int nfile,
 +             const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
 +             gmx_bool bCompact, int nstglobalcomm,
 +             ivec ddxyz,int dd_node_order,real rdd,real rconstr,
 +             const char *dddlb_opt,real dlb_scale,
 +             const char *ddcsx,const char *ddcsy,const char *ddcsz,
 +             const char *nbpu_opt,
 +             int nsteps_cmdline, int nstepout,int resetstep,
 +             int nmultisim,int repl_ex_nst,int repl_ex_nex,
 +             int repl_ex_seed, real pforce,real cpt_period,real max_hours,
 +             const char *deviceOptions, unsigned long Flags)
 +{
 +    gmx_bool   bForceUseGPU,bTryUseGPU;
 +    double     nodetime=0,realtime;
 +    t_inputrec *inputrec;
 +    t_state    *state=NULL;
 +    matrix     box;
 +    gmx_ddbox_t ddbox={0};
 +    int        npme_major,npme_minor;
 +    real       tmpr1,tmpr2;
 +    t_nrnb     *nrnb;
 +    gmx_mtop_t *mtop=NULL;
 +    t_mdatoms  *mdatoms=NULL;
 +    t_forcerec *fr=NULL;
 +    t_fcdata   *fcd=NULL;
 +    real       ewaldcoeff=0;
 +    gmx_pme_t  *pmedata=NULL;
 +    gmx_vsite_t *vsite=NULL;
 +    gmx_constr_t constr;
 +    int        i,m,nChargePerturbed=-1,status,nalloc;
 +    char       *gro;
 +    gmx_wallcycle_t wcycle;
 +    gmx_bool       bReadRNG,bReadEkin;
 +    int        list;
 +    gmx_runtime_t runtime;
 +    int        rc;
 +    gmx_large_int_t reset_counters;
 +    gmx_edsam_t ed=NULL;
 +    t_commrec   *cr_old=cr; 
 +    int         nthreads_pme=1;
 +    int         nthreads_pp=1;
 +    gmx_membed_t membed=NULL;
 +    gmx_hw_info_t *hwinfo=NULL;
 +    master_inf_t minf={-1,FALSE};
 +
 +    /* CAUTION: threads may be started later on in this function, so
 +       cr doesn't reflect the final parallel state right now */
 +    snew(inputrec,1);
 +    snew(mtop,1);
 +    
 +    if (Flags & MD_APPENDFILES) 
 +    {
 +        fplog = NULL;
 +    }
 +
 +    bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
 +    bTryUseGPU   = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
 +
 +    snew(state,1);
 +    if (SIMMASTER(cr)) 
 +    {
 +        /* Read (nearly) all data required for the simulation */
 +        read_tpx_state(ftp2fn(efTPX,nfile,fnm),inputrec,state,NULL,mtop);
 +
 +        if (inputrec->cutoff_scheme != ecutsVERLET &&
 +            ((Flags & MD_TESTVERLET) || getenv("GMX_VERLET_SCHEME") != NULL))
 +        {
 +            convert_to_verlet_scheme(fplog,inputrec,mtop,det(state->box));
 +        }
 +
 +        /* Detect hardware, gather information. With tMPI only thread 0 does it
 +         * and after threads are started broadcasts hwinfo around. */
 +        snew(hwinfo, 1);
 +        gmx_detect_hardware(fplog, hwinfo, cr,
 +                            bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
 +
 +        minf.cutoff_scheme = inputrec->cutoff_scheme;
 +        minf.bUseGPU       = FALSE;
 +
 +        if (inputrec->cutoff_scheme == ecutsVERLET)
 +        {
 +            prepare_verlet_scheme(fplog,hwinfo,cr,hw_opt,nbpu_opt,
 +                                  inputrec,mtop,state->box,
 +                                  &minf.bUseGPU);
 +        }
 +        else if (hwinfo->bCanUseGPU)
 +        {
 +            md_print_warn(cr,fplog,
 +                          "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
 +                          "      To use a GPU, set the mdp option: cutoff-scheme = Verlet\n"
 +                          "      (for quick performance testing you can use the -testverlet option)\n");
 +
 +            if (bForceUseGPU)
 +            {
 +                gmx_fatal(FARGS,"GPU requested, but can't be used without cutoff-scheme=Verlet");
 +            }
 +        }
 +    }
 +#ifndef GMX_THREAD_MPI
 +    if (PAR(cr))
 +    {
 +        gmx_bcast_sim(sizeof(minf),&minf,cr);
 +    }
 +#endif
 +    if (minf.bUseGPU && cr->npmenodes == -1)
 +    {
 +        /* Don't automatically use PME-only nodes with GPUs */
 +        cr->npmenodes = 0;
 +    }
 +
 +#ifdef GMX_THREAD_MPI
 +    /* With thread-MPI inputrec is only set here on the master thread */
 +    if (SIMMASTER(cr))
 +#endif
 +    {
 +        check_and_update_hw_opt(hw_opt,minf.cutoff_scheme);
 +
 +#ifdef GMX_THREAD_MPI
 +        if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
 +        {
 +            gmx_fatal(FARGS,"You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME nodes");
 +        }
 +#endif
 +
 +        if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
 +            cr->npmenodes <= 0)
 +        {
 +            gmx_fatal(FARGS,"You need to explicitly specify the number of PME nodes (-npme) when using different number of OpenMP threads for PP and PME nodes");
 +        }
 +    }
 +
 +#ifdef GMX_THREAD_MPI
 +    if (SIMMASTER(cr))
 +    {
 +        /* NOW the threads will be started: */
 +        hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
 +                                                 hw_opt,
 +                                                 inputrec, mtop,
 +                                                 cr, fplog);
 +        if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
 +        {
 +            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
 +        }
 +
 +        if (hw_opt->nthreads_tmpi > 1)
 +        {
 +            /* now start the threads. */
 +            cr=mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm, 
 +                                      oenv, bVerbose, bCompact, nstglobalcomm, 
 +                                      ddxyz, dd_node_order, rdd, rconstr, 
 +                                      dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
 +                                      nbpu_opt,
 +                                      nsteps_cmdline, nstepout, resetstep, nmultisim, 
 +                                      repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
 +                                      cpt_period, max_hours, deviceOptions, 
 +                                      Flags);
 +            /* the main thread continues here with a new cr. We don't deallocate
 +               the old cr because other threads may still be reading it. */
 +            if (cr == NULL)
 +            {
 +                gmx_comm("Failed to spawn threads");
 +            }
 +        }
 +    }
 +#endif
 +    /* END OF CAUTION: cr is now reliable */
 +
 +    /* g_membed initialisation *
 +     * Because we change the mtop, init_membed is called before the init_parallel *
 +     * (in case we ever want to make it run in parallel) */
 +    if (opt2bSet("-membed",nfile,fnm))
 +    {
 +        if (MASTER(cr))
 +        {
 +            fprintf(stderr,"Initializing membed");
 +        }
 +        membed = init_membed(fplog,nfile,fnm,mtop,inputrec,state,cr,&cpt_period);
 +    }
 +
 +    if (PAR(cr))
 +    {
 +        /* now broadcast everything to the non-master nodes/threads: */
 +        init_parallel(fplog, cr, inputrec, mtop);
 +
 +        /* This check needs to happen after get_nthreads_mpi() */
 +        if (inputrec->cutoff_scheme == ecutsVERLET && (Flags & MD_PARTDEC))
 +        {
 +            gmx_fatal_collective(FARGS,cr,NULL,
 +                                 "The Verlet cut-off scheme is not supported with particle decomposition.\n"
 +                                 "You can achieve the same effect as particle decomposition by running in parallel using only OpenMP threads.");
 +        }
 +    }
 +    if (fplog != NULL)
 +    {
 +        pr_inputrec(fplog,0,"Input Parameters",inputrec,FALSE);
 +    }
 +
 +#if defined GMX_THREAD_MPI
 +    /* With tMPI we detected on thread 0 and we'll just pass the hwinfo pointer
 +     * to the other threads  -- slightly uncool, but works fine, just need to
 +     * make sure that the data doesn't get freed twice. */
 +    if (cr->nnodes > 1)
 +    {
 +        if (!SIMMASTER(cr))
 +        {
 +            snew(hwinfo, 1);
 +        }
 +        gmx_bcast(sizeof(&hwinfo), &hwinfo, cr);
 +    }
 +#else
 +    if (PAR(cr) && !SIMMASTER(cr))
 +    {
 +        /* now we have inputrec on all nodes, can run the detection */
 +        /* TODO: perhaps it's better to propagate within a node instead? */
 +        snew(hwinfo, 1);
 +        gmx_detect_hardware(fplog, hwinfo, cr,
 +                                 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
 +    }
 +#endif
 +
 +    /* now make sure the state is initialized and propagated */
 +    set_state_entries(state,inputrec,cr->nnodes);
 +
 +    /* remove when vv and rerun works correctly! */
 +    if (PAR(cr) && EI_VV(inputrec->eI) && ((Flags & MD_RERUN) || (Flags & MD_RERUN_VSITE)))
 +    {
 +        gmx_fatal(FARGS,
 +                  "Currently can't do velocity verlet with rerun in parallel.");
 +    }
 +
 +    /* A parallel command line option consistency check that we can
 +       only do after any threads have started. */
 +    if (!PAR(cr) &&
 +        (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
 +    {
 +        gmx_fatal(FARGS,
 +                  "The -dd or -npme option request a parallel simulation, "
 +#ifndef GMX_MPI
 +                  "but %s was compiled without threads or MPI enabled"
 +#else
 +#ifdef GMX_THREAD_MPI
 +                  "but the number of threads (option -nt) is 1"
 +#else
 +                  "but %s was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec"
 +#endif
 +#endif
 +                  , ShortProgram()
 +            );
 +    }
 +
 +    if ((Flags & MD_RERUN) &&
 +        (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
 +    {
 +        gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
 +    }
 +
 +    if (can_use_allvsall(inputrec,mtop,TRUE,cr,fplog) && PAR(cr))
 +    {
 +        /* All-vs-all loops do not work with domain decomposition */
 +        Flags |= MD_PARTDEC;
 +    }
 +
 +    if (!EEL_PME(inputrec->coulombtype) || (Flags & MD_PARTDEC))
 +    {
 +        if (cr->npmenodes > 0)
 +        {
 +            if (!EEL_PME(inputrec->coulombtype))
 +            {
 +                gmx_fatal_collective(FARGS,cr,NULL,
 +                                     "PME nodes are requested, but the system does not use PME electrostatics");
 +            }
 +            if (Flags & MD_PARTDEC)
 +            {
 +                gmx_fatal_collective(FARGS,cr,NULL,
 +                                     "PME nodes are requested, but particle decomposition does not support separate PME nodes");
 +            }
 +        }
 +
 +        cr->npmenodes = 0;
 +    }
 +
 +#ifdef GMX_FAHCORE
 +    fcRegisterSteps(inputrec->nsteps,inputrec->init_step);
 +#endif
 +
 +    /* NMR restraints must be initialized before load_checkpoint,
 +     * since with time averaging the history is added to t_state.
 +     * For proper consistency check we therefore need to extend
 +     * t_state here.
 +     * So the PME-only nodes (if present) will also initialize
 +     * the distance restraints.
 +     */
 +    snew(fcd,1);
 +
 +    /* This needs to be called before read_checkpoint to extend the state */
 +    init_disres(fplog,mtop,inputrec,cr,Flags & MD_PARTDEC,fcd,state);
 +
 +    if (gmx_mtop_ftype_count(mtop,F_ORIRES) > 0)
 +    {
 +        if (PAR(cr) && !(Flags & MD_PARTDEC))
 +        {
 +            gmx_fatal(FARGS,"Orientation restraints do not work (yet) with domain decomposition, use particle decomposition (mdrun option -pd)");
 +        }
 +        /* Orientation restraints */
 +        if (MASTER(cr))
 +        {
 +            init_orires(fplog,mtop,state->x,inputrec,cr->ms,&(fcd->orires),
 +                        state);
 +        }
 +    }
 +
 +    if (DEFORM(*inputrec))
 +    {
 +        /* Store the deform reference box before reading the checkpoint */
 +        if (SIMMASTER(cr))
 +        {
 +            copy_mat(state->box,box);
 +        }
 +        if (PAR(cr))
 +        {
 +            gmx_bcast(sizeof(box),box,cr);
 +        }
 +        /* Because we do not have the update struct available yet
 +         * in which the reference values should be stored,
 +         * we store them temporarily in static variables.
 +         * This should be thread safe, since they are only written once
 +         * and with identical values.
 +         */
 +#ifdef GMX_THREAD_MPI
 +        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
 +#endif
 +        deform_init_init_step_tpx = inputrec->init_step;
 +        copy_mat(box,deform_init_box_tpx);
 +#ifdef GMX_THREAD_MPI
 +        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
 +#endif
 +    }
 +
 +    if (opt2bSet("-cpi",nfile,fnm)) 
 +    {
 +        /* Check if checkpoint file exists before doing continuation.
 +         * This way we can use identical input options for the first and subsequent runs...
 +         */
 +        if( gmx_fexist_master(opt2fn_master("-cpi",nfile,fnm,cr),cr) )
 +        {
 +            load_checkpoint(opt2fn_master("-cpi",nfile,fnm,cr),&fplog,
 +                            cr,Flags & MD_PARTDEC,ddxyz,
 +                            inputrec,state,&bReadRNG,&bReadEkin,
 +                            (Flags & MD_APPENDFILES),
 +                            (Flags & MD_APPENDFILESSET));
 +            
 +            if (bReadRNG)
 +            {
 +                Flags |= MD_READ_RNG;
 +            }
 +            if (bReadEkin)
 +            {
 +                Flags |= MD_READ_EKIN;
 +            }
 +        }
 +    }
 +
 +    if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES))
 +#ifdef GMX_THREAD_MPI
 +        /* With thread MPI only the master node/thread exists in mdrun.c,
 +         * therefore non-master nodes need to open the "seppot" log file here.
 +         */
 +        || (!MASTER(cr) && (Flags & MD_SEPPOT))
 +#endif
 +        )
 +    {
 +        gmx_log_open(ftp2fn(efLOG,nfile,fnm),cr,!(Flags & MD_SEPPOT),
 +                             Flags,&fplog);
 +    }
 +
 +    /* override nsteps with value from cmdline */
 +    override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);
 +
 +    if (SIMMASTER(cr)) 
 +    {
 +        copy_mat(state->box,box);
 +    }
 +
 +    if (PAR(cr)) 
 +    {
 +        gmx_bcast(sizeof(box),box,cr);
 +    }
 +
 +    /* Essential dynamics */
 +    if (opt2bSet("-ei",nfile,fnm))
 +    {
 +        /* Open input and output files, allocate space for ED data structure */
 +        ed = ed_open(nfile,fnm,Flags,cr);
 +    }
 +
 +    if (PAR(cr) && !((Flags & MD_PARTDEC) ||
 +                     EI_TPI(inputrec->eI) ||
 +                     inputrec->eI == eiNM))
 +    {
 +        cr->dd = init_domain_decomposition(fplog,cr,Flags,ddxyz,rdd,rconstr,
 +                                           dddlb_opt,dlb_scale,
 +                                           ddcsx,ddcsy,ddcsz,
 +                                           mtop,inputrec,
 +                                           box,state->x,
 +                                           &ddbox,&npme_major,&npme_minor);
 +
 +        make_dd_communicators(fplog,cr,dd_node_order);
 +
 +        /* Set overallocation to avoid frequent reallocation of arrays */
 +        set_over_alloc_dd(TRUE);
 +    }
 +    else
 +    {
 +        /* PME, if used, is done on all nodes with 1D decomposition */
 +        cr->npmenodes = 0;
 +        cr->duty = (DUTY_PP | DUTY_PME);
 +        npme_major = 1;
 +        npme_minor = 1;
 +        if (!EI_TPI(inputrec->eI))
 +        {
 +            npme_major = cr->nnodes;
 +        }
 +        
 +        if (inputrec->ePBC == epbcSCREW)
 +        {
 +            gmx_fatal(FARGS,
 +                      "pbc=%s is only implemented with domain decomposition",
 +                      epbc_names[inputrec->ePBC]);
 +        }
 +    }
 +
 +    if (PAR(cr))
 +    {
 +        /* After possible communicator splitting in make_dd_communicators.
 +         * we can set up the intra/inter node communication.
 +         */
 +        gmx_setup_nodecomm(fplog,cr);
 +    }
 +
 +    /* Initialize per-node process ID and counters. */
 +    gmx_init_intra_counters(cr);
 +
 +#ifdef GMX_MPI
 +    md_print_info(cr,fplog,"Using %d MPI %s\n",
 +                  cr->nnodes,
 +#ifdef GMX_THREAD_MPI
 +                  cr->nnodes==1 ? "thread" : "threads"
 +#else
 +                  cr->nnodes==1 ? "process" : "processes"
 +#endif
 +                  );
 +#endif
 +
 +    gmx_omp_nthreads_init(fplog, cr,
 +                          hwinfo->nthreads_hw_avail,
 +                          hw_opt->nthreads_omp,
 +                          hw_opt->nthreads_omp_pme,
 +                          (cr->duty & DUTY_PP) == 0,
 +                          inputrec->cutoff_scheme == ecutsVERLET);
 +
 +    gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi, minf.bUseGPU);
 +
 +    /* getting number of PP/PME threads
 +       PME: env variable should be read only on one node to make sure it is 
 +       identical everywhere;
 +     */
 +    /* TODO nthreads_pp is only used for pinning threads.
 +     * This is a temporary solution until we have a hw topology library.
 +     */
 +    nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
 +    nthreads_pme = gmx_omp_nthreads_get(emntPME);
 +
 +    wcycle = wallcycle_init(fplog,resetstep,cr,nthreads_pp,nthreads_pme);
 +
 +    if (PAR(cr))
 +    {
 +        /* Master synchronizes its value of reset_counters with all nodes 
 +         * including PME only nodes */
 +        reset_counters = wcycle_get_reset_counters(wcycle);
 +        gmx_bcast_sim(sizeof(reset_counters),&reset_counters,cr);
 +        wcycle_set_reset_counters(wcycle, reset_counters);
 +    }
 +
 +    snew(nrnb,1);
 +    if (cr->duty & DUTY_PP)
 +    {
 +        /* For domain decomposition we allocate dynamically
 +         * in dd_partition_system.
 +         */
 +        if (DOMAINDECOMP(cr))
 +        {
 +            bcast_state_setup(cr,state);
 +        }
 +        else
 +        {
 +            if (PAR(cr))
 +            {
 +                bcast_state(cr,state,TRUE);
 +            }
 +        }
 +
 +        /* Initiate forcerecord */
 +        fr = mk_forcerec();
 +        fr->hwinfo = hwinfo;
 +        init_forcerec(fplog,oenv,fr,fcd,inputrec,mtop,cr,box,FALSE,
 +                      opt2fn("-table",nfile,fnm),
 +                      opt2fn("-tabletf",nfile,fnm),
 +                      opt2fn("-tablep",nfile,fnm),
 +                      opt2fn("-tableb",nfile,fnm),
 +                      nbpu_opt,
 +                      FALSE,pforce);
 +
 +        /* version for PCA_NOT_READ_NODE (see md.c) */
 +        /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
 +          "nofile","nofile","nofile","nofile",FALSE,pforce);
 +          */        
 +        fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);
 +
 +        /* Initialize QM-MM */
 +        if(fr->bQMMM)
 +        {
 +            init_QMMMrec(cr,box,mtop,inputrec,fr);
 +        }
 +
 +        /* Initialize the mdatoms structure.
 +         * mdatoms is not filled with atom data,
 +         * as this can not be done now with domain decomposition.
 +         */
 +        mdatoms = init_mdatoms(fplog,mtop,inputrec->efep!=efepNO);
 +
 +        /* Initialize the virtual site communication */
 +        vsite = init_vsite(mtop,cr);
 +
 +        calc_shifts(box,fr->shift_vec);
 +
 +        /* With periodic molecules the charge groups should be whole at start up
 +         * and the virtual sites should not be far from their proper positions.
 +         */
 +        if (!inputrec->bContinuation && MASTER(cr) &&
 +            !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
 +        {
 +            /* Make molecules whole at start of run */
 +            if (fr->ePBC != epbcNONE)
 +            {
 +                do_pbc_first_mtop(fplog,inputrec->ePBC,box,mtop,state->x);
 +            }
 +            if (vsite)
 +            {
 +                /* Correct initial vsite positions are required
 +                 * for the initial distribution in the domain decomposition
 +                 * and for the initial shell prediction.
 +                 */
 +                construct_vsites_mtop(fplog,vsite,mtop,state->x);
 +            }
 +        }
 +
 +        if (EEL_PME(fr->eeltype))
 +        {
 +            ewaldcoeff = fr->ewaldcoeff;
 +            pmedata = &fr->pmedata;
 +        }
 +        else
 +        {
 +            pmedata = NULL;
 +        }
 +    }
 +    else
 +    {
 +        /* This is a PME only node */
 +
 +        /* We don't need the state */
 +        done_state(state);
 +
 +        ewaldcoeff = calc_ewaldcoeff(inputrec->rcoulomb, inputrec->ewald_rtol);
 +        snew(pmedata,1);
 +    }
 +
 +#if defined GMX_THREAD_MPI
 +    /* With the number of TMPI threads equal to the number of cores
 +     * we already pinned in thread-MPI, so don't pin again here.
 +     */
 +    if (hw_opt->nthreads_tmpi != tMPI_Thread_get_hw_number())
 +#endif
 +    {
 +        /* Set the CPU affinity */
 +        set_cpu_affinity(fplog,cr,hw_opt,nthreads_pme,hwinfo,inputrec);
 +    }
 +
 +    /* Initiate PME if necessary,
 +     * either on all nodes or on dedicated PME nodes only. */
 +    if (EEL_PME(inputrec->coulombtype))
 +    {
 +        if (mdatoms)
 +        {
 +            nChargePerturbed = mdatoms->nChargePerturbed;
 +        }
 +        if (cr->npmenodes > 0)
 +        {
 +            /* The PME only nodes need to know nChargePerturbed */
 +            gmx_bcast_sim(sizeof(nChargePerturbed),&nChargePerturbed,cr);
 +        }
 +
 +        if (cr->duty & DUTY_PME)
 +        {
 +            status = gmx_pme_init(pmedata,cr,npme_major,npme_minor,inputrec,
 +                                  mtop ? mtop->natoms : 0,nChargePerturbed,
 +                                  (Flags & MD_REPRODUCIBLE),nthreads_pme);
 +            if (status != 0) 
 +            {
 +                gmx_fatal(FARGS,"Error %d initializing PME",status);
 +            }
 +        }
 +    }
 +
 +
 +    if (integrator[inputrec->eI].func == do_md
 +#ifdef GMX_OPENMM
 +        ||
 +        integrator[inputrec->eI].func == do_md_openmm
 +#endif
 +        )
 +    {
 +        /* Turn on signal handling on all nodes */
 +        /*
 +         * (A user signal from the PME nodes (if any)
 +         * is communicated to the PP nodes.
 +         */
 +        signal_handler_install();
 +    }
 +
 +    if (cr->duty & DUTY_PP)
 +    {
 +        if (inputrec->ePull != epullNO)
 +        {
 +            /* Initialize pull code */
 +            init_pull(fplog,inputrec,nfile,fnm,mtop,cr,oenv, inputrec->fepvals->init_lambda,
 +                      EI_DYNAMICS(inputrec->eI) && MASTER(cr),Flags);
 +        }
 +        
 +        if (inputrec->bRot)
 +        {
 +           /* Initialize enforced rotation code */
 +           init_rot(fplog,inputrec,nfile,fnm,cr,state->x,box,mtop,oenv,
 +                    bVerbose,Flags);
 +        }
 +
 +        constr = init_constraints(fplog,mtop,inputrec,ed,state,cr);
 +
 +        if (DOMAINDECOMP(cr))
 +        {
 +            dd_init_bondeds(fplog,cr->dd,mtop,vsite,constr,inputrec,
 +                            Flags & MD_DDBONDCHECK,fr->cginfo_mb);
 +
 +            set_dd_parameters(fplog,cr->dd,dlb_scale,inputrec,fr,&ddbox);
 +
 +            setup_dd_grid(fplog,cr->dd);
 +        }
 +
 +        /* Now do whatever the user wants us to do (how flexible...) */
 +        integrator[inputrec->eI].func(fplog,cr,nfile,fnm,
 +                                      oenv,bVerbose,bCompact,
 +                                      nstglobalcomm,
 +                                      vsite,constr,
 +                                      nstepout,inputrec,mtop,
 +                                      fcd,state,
 +                                      mdatoms,nrnb,wcycle,ed,fr,
 +                                      repl_ex_nst,repl_ex_nex,repl_ex_seed,
 +                                      membed,
 +                                      cpt_period,max_hours,
 +                                      deviceOptions,
 +                                      Flags,
 +                                      &runtime);
 +
 +        if (inputrec->ePull != epullNO)
 +        {
 +            finish_pull(fplog,inputrec->pull);
 +        }
 +        
 +        if (inputrec->bRot)
 +        {
 +            finish_rot(fplog,inputrec->rot);
 +        }
 +
 +    } 
 +    else 
 +    {
 +        /* do PME only */
 +        gmx_pmeonly(*pmedata,cr,nrnb,wcycle,ewaldcoeff,FALSE,inputrec);
 +    }
 +
 +    if (EI_DYNAMICS(inputrec->eI) || EI_TPI(inputrec->eI))
 +    {
 +        /* Some timing stats */  
 +        if (SIMMASTER(cr))
 +        {
 +            if (runtime.proc == 0)
 +            {
 +                runtime.proc = runtime.real;
 +            }
 +        }
 +        else
 +        {
 +            runtime.real = 0;
 +        }
 +    }
 +
 +    wallcycle_stop(wcycle,ewcRUN);
 +
 +    /* Finish up, write some stuff
 +     * if rerunMD, don't write last frame again 
 +     */
 +    finish_run(fplog,cr,ftp2fn(efSTO,nfile,fnm),
 +               inputrec,nrnb,wcycle,&runtime,
 +               fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ?
 +                 nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL,
 +               nthreads_pp, 
 +               EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
 +
 +    if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU)
 +    {
 +        char gpu_err_str[STRLEN];
 +
 +        /* free GPU memory and uninitialize GPU (by destroying the context) */
 +        nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
 +
 +        if (!free_gpu(gpu_err_str))
 +        {
 +            gmx_warning("On node %d failed to free GPU #%d: %s",
 +                        cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
 +        }
 +    }
 +
 +    if (opt2bSet("-membed",nfile,fnm))
 +    {
 +        sfree(membed);
 +    }
 +
 +#ifdef GMX_THREAD_MPI
 +    if (PAR(cr) && SIMMASTER(cr))
 +#endif
 +    {
 +        gmx_hardware_info_free(hwinfo);
 +    }
 +
 +    /* Does what it says */  
 +    print_date_and_time(fplog,cr->nodeid,"Finished mdrun",&runtime);
 +
 +    /* Close logfile already here if we were appending to it */
 +    if (MASTER(cr) && (Flags & MD_APPENDFILES))
 +    {
 +        gmx_log_close(fplog);
 +    } 
 +
 +    rc=(int)gmx_get_stop_condition();
 +
 +#ifdef GMX_THREAD_MPI
 +    /* we need to join all threads. The sub-threads join when they
 +       exit this function, but the master thread needs to be told to 
 +       wait for that. */
 +    if (PAR(cr) && MASTER(cr))
 +    {
 +        tMPI_Finalize();
 +    }
 +#endif
 +
 +    return rc;
 +}