Merge branch 'release-4-6'

author Roland Schulz <roland@utk.edu>

Fri, 19 Oct 2012 18:50:56 +0000 (14:50 -0400)

committer Roland Schulz <roland@utk.edu>

Fri, 19 Oct 2012 19:22:32 +0000 (15:22 -0400)
author Roland Schulz <roland@utk.edu>
Fri, 19 Oct 2012 18:50:56 +0000 (14:50 -0400)
committer Roland Schulz <roland@utk.edu>
Fri, 19 Oct 2012 19:22:32 +0000 (15:22 -0400)
diff --cc src/gromacs/gmxlib/thread_mpi/pthreads.c
Simple merge
diff --cc src/gromacs/gmxlib/thread_mpi/pthreads.h
Simple merge
diff --cc src/gromacs/gmxlib/thread_mpi/tmpi_init.c
Simple merge
diff --cc src/gromacs/gmxlib/thread_mpi/winthreads.c
Simple merge
diff --cc src/gromacs/gmxlib/thread_mpi/winthreads.h
Simple merge
diff --cc src/gromacs/legacyheaders/nbnxn_cuda_data_mgmt.h
Simple merge
diff --cc src/gromacs/legacyheaders/thread_mpi/threads.h
Simple merge
diff --cc src/gromacs/legacyheaders/thread_mpi/tmpi.h
Simple merge
diff --cc src/gromacs/mdlib/clincs.c
Simple merge
diff --cc src/gromacs/mdlib/gmx_wallcycle.c
Simple merge
diff --cc src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
Simple merge
diff --cc src/programs/mdrun/CMakeLists.txt

index 6c499aa0020dd1a7074a6adb1236f29c5f46ec5c,0000000000000000000000000000000000000000..85f65b9a96dd88945182374f3f3d70863a72699d

mode 100644,000000..100644
--- 1/src/programs/mdrun/CMakeLists.txt
--- /dev/null
+++ b/src/programs/mdrun/CMakeLists.txt
@@@ -1,52 -1,0 +1,52 @@@
-     do_gct.c    gctio.c         genalg.c    ionize.c
-     md.c        md_openmm.c     mdrun.c     membed.c
-     pme_switch.c  repl_ex.c   runner.c        xutils.c)
+ +include_directories(${CMAKE_SOURCE_DIR}/src/gromacs/gmxpreprocess)
+ +
+ +set(MDRUN_SOURCES
++    do_gct.c      gctio.c       genalg.c    ionize.c
++    md.c          md_openmm.c   mdrun.c     membed.c
++    pme_loadbal.c repl_ex.c     runner.c    xutils.c)
+ +
+ +if(GMX_OPENMM) 
+ +    include_directories(./gmx_gpu_utils ${OpenMM_INCLUDE_DIR})
+ +    link_directories(${OpenMM_LIBRARY_DIR}) 
+ +    # with this define no evn.var. is needed with OPENMM_PLUGIN_DIR
+ +    # if the same OpenMM installation is used for running and building 
+ +    add_definitions( -DOPENMM_PLUGIN_DIR="${OpenMM_PLUGIN_DIR}" ) 
+ +    file(TO_CMAKE_PATH ${OpenMM_PLUGIN_DIR} _path)
+ +    add_library(openmm_api_wrapper STATIC openmm_wrapper.cpp)
+ +    target_link_libraries(openmm_api_wrapper gmx_gpu_utils ${OpenMM_LIBRARIES})
+ +    set(GMX_OPENMM_LIBRARIES openmm_api_wrapper gmx_gpu_utils ${OpenMM_LIBRARIES})   
+ +endif(GMX_OPENMM)
+ +
+ +if(GMX_FAHCORE)
+ +    add_library(fahcore ${MDRUN_SOURCES})
+ +else(GMX_FAHCORE)
+ +    add_executable(mdrun ${MDRUN_SOURCES})
+ +    gmx_add_man_page(mdrun)
+ +    target_link_libraries(mdrun ${GMX_EXTRA_LIBRARIES} libgromacs ${GMX_OPENMM_LIBRARIES}
+ +        ${OpenMP_LINKER_FLAGS})
+ +    set_target_properties(mdrun PROPERTIES OUTPUT_NAME "mdrun${GMX_BINARY_SUFFIX}"
+ +        COMPILE_FLAGS "${OpenMP_C_FLAGS}")
+ +    install(TARGETS mdrun DESTINATION ${BIN_INSTALL_DIR} COMPONENT mdrun)
+ +
+ +    if(GMX_OPENMM AND MSVC)
+ +        set_target_properties(mdrun PROPERTIES LINK_FLAGS "/NODEFAULTLIB:LIBCMT")
+ +    endif()
+ +
+ +    # Create the custom install-mdrun target
+ +    if (BUILD_SHARED_LIBS)
+ +        # If shared libraries are used, we need to install the libraries in
+ +        # addition to the mdrun binary.
+ +        add_custom_target(install-mdrun
+ +            COMMAND ${CMAKE_COMMAND} -DCOMPONENT=libraries
+ +                    -P ${CMAKE_BINARY_DIR}/cmake_install.cmake
+ +            COMMAND ${CMAKE_COMMAND} -DCOMPONENT=mdrun
+ +                    -P ${CMAKE_BINARY_DIR}/cmake_install.cmake
+ +            COMMENT "Installing mdrun")
+ +    else (BUILD_SHARED_LIBS)
+ +        add_custom_target(install-mdrun
+ +            COMMAND ${CMAKE_COMMAND} -DCOMPONENT=mdrun
+ +                    -P ${CMAKE_BINARY_DIR}/cmake_install.cmake
+ +            COMMENT "Installing mdrun")
+ +    endif (BUILD_SHARED_LIBS)
+ +    add_dependencies(install-mdrun mdrun)
+ +endif(GMX_FAHCORE)
diff --cc src/programs/mdrun/md.c

index d6f4017607a1cd61c9269fe67a9e6022c389982f,0000000000000000000000000000000000000000..2a99d5f7ee53cb006df15b76139f621d48556db0

mode 100644,000000..100644
--- 1/src/programs/mdrun/md.c
--- /dev/null
+++ b/src/programs/mdrun/md.c
@@@ -1,2127 -1,0 +1,2132 @@@
- #include "pme_switch.h"
+ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ + *
+ + * 
+ + *                This source code is part of
+ + * 
+ + *                 G   R   O   M   A   C   S
+ + * 
+ + *          GROningen MAchine for Chemical Simulations
+ + * 
+ + *                        VERSION 3.2.0
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2004, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ +
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + * 
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + * 
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + * 
+ + * For more info, check our website at http://www.gromacs.org
+ + * 
+ + * And Hey:
+ + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ + */
+ +#ifdef HAVE_CONFIG_H
+ +#include <config.h>
+ +#endif
+ +
+ +#include "typedefs.h"
+ +#include "smalloc.h"
+ +#include "sysstuff.h"
+ +#include "vec.h"
+ +#include "statutil.h"
+ +#include "vcm.h"
+ +#include "mdebin.h"
+ +#include "nrnb.h"
+ +#include "calcmu.h"
+ +#include "index.h"
+ +#include "vsite.h"
+ +#include "update.h"
+ +#include "ns.h"
+ +#include "trnio.h"
+ +#include "xtcio.h"
+ +#include "mdrun.h"
+ +#include "md_support.h"
+ +#include "confio.h"
+ +#include "network.h"
+ +#include "pull.h"
+ +#include "xvgr.h"
+ +#include "physics.h"
+ +#include "names.h"
+ +#include "xmdrun.h"
+ +#include "ionize.h"
+ +#include "disre.h"
+ +#include "orires.h"
+ +#include "pme.h"
+ +#include "mdatoms.h"
+ +#include "repl_ex.h"
+ +#include "qmmm.h"
+ +#include "domdec.h"
+ +#include "domdec_network.h"
+ +#include "partdec.h"
+ +#include "topsort.h"
+ +#include "coulomb.h"
+ +#include "constr.h"
+ +#include "shellfc.h"
+ +#include "compute_io.h"
+ +#include "mvdata.h"
+ +#include "checkpoint.h"
+ +#include "mtop_util.h"
+ +#include "sighandler.h"
+ +#include "txtdump.h"
+ +#include "string2.h"
-     pme_switch_t    pme_switch=NULL;
++#include "pme_loadbal.h"
+ +#include "bondf.h"
+ +#include "membed.h"
+ +#include "types/nlistheuristics.h"
+ +#include "types/iteratedconstraints.h"
+ +#include "nbnxn_cuda_data_mgmt.h"
+ +
+ +#ifdef GMX_LIB_MPI
+ +#include <mpi.h>
+ +#endif
+ +#ifdef GMX_THREAD_MPI
+ +#include "tmpi.h"
+ +#endif
+ +
+ +#ifdef GMX_FAHCORE
+ +#include "corewrap.h"
+ +#endif
+ +
+ +static void reset_all_counters(FILE *fplog,t_commrec *cr,
+ +                               gmx_large_int_t step,
+ +                               gmx_large_int_t *step_rel,t_inputrec *ir,
+ +                               gmx_wallcycle_t wcycle,t_nrnb *nrnb,
+ +                               gmx_runtime_t *runtime,
+ +                               nbnxn_cuda_ptr_t cu_nbv)
+ +{
+ +    char sbuf[STEPSTRSIZE];
+ +
+ +    /* Reset all the counters related to performance over the run */
+ +    md_print_warn(cr,fplog,"step %s: resetting all time and cycle counters\n",
+ +                  gmx_step_str(step,sbuf));
+ +
+ +    if (cu_nbv)
+ +    {
+ +        nbnxn_cuda_reset_timings(cu_nbv);
+ +    }
+ +
+ +    wallcycle_stop(wcycle,ewcRUN);
+ +    wallcycle_reset_all(wcycle);
+ +    if (DOMAINDECOMP(cr))
+ +    {
+ +        reset_dd_statistics_counters(cr->dd);
+ +    }
+ +    init_nrnb(nrnb);
+ +    ir->init_step += *step_rel;
+ +    ir->nsteps    -= *step_rel;
+ +    *step_rel = 0;
+ +    wallcycle_start(wcycle,ewcRUN);
+ +    runtime_start(runtime);
+ +    print_date_and_time(fplog,cr->nodeid,"Restarted time",runtime);
+ +}
+ +
+ +double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[],
+ +             const output_env_t oenv, gmx_bool bVerbose,gmx_bool bCompact,
+ +             int nstglobalcomm,
+ +             gmx_vsite_t *vsite,gmx_constr_t constr,
+ +             int stepout,t_inputrec *ir,
+ +             gmx_mtop_t *top_global,
+ +             t_fcdata *fcd,
+ +             t_state *state_global,
+ +             t_mdatoms *mdatoms,
+ +             t_nrnb *nrnb,gmx_wallcycle_t wcycle,
+ +             gmx_edsam_t ed,t_forcerec *fr,
+ +             int repl_ex_nst,int repl_ex_nex,int repl_ex_seed,gmx_membed_t membed,
+ +             real cpt_period,real max_hours,
+ +             const char *deviceOptions,
+ +             unsigned long Flags,
+ +             gmx_runtime_t *runtime)
+ +{
+ +    gmx_mdoutf_t *outf;
+ +    gmx_large_int_t step,step_rel;
+ +    double     run_time;
+ +    double     t,t0,lam0[efptNR];
+ +    gmx_bool       bGStatEveryStep,bGStat,bCalcVir,bCalcEner;
+ +    gmx_bool       bNS,bNStList,bSimAnn,bStopCM,bRerunMD,bNotLastFrame=FALSE,
+ +               bFirstStep,bStateFromCP,bStateFromTPX,bInitStep,bLastStep,
+ +               bBornRadii,bStartingFromCpt;
+ +    gmx_bool   bDoDHDL=FALSE,bDoFEP=FALSE,bDoExpanded=FALSE;
+ +    gmx_bool       do_ene,do_log,do_verbose,bRerunWarnNoV=TRUE,
+ +               bForceUpdate=FALSE,bCPT;
+ +    int        mdof_flags;
+ +    gmx_bool       bMasterState;
+ +    int        force_flags,cglo_flags;
+ +    tensor     force_vir,shake_vir,total_vir,tmp_vir,pres;
+ +    int        i,m;
+ +    t_trxstatus *status;
+ +    rvec       mu_tot;
+ +    t_vcm      *vcm;
+ +    t_state    *bufstate=NULL;   
+ +    matrix     *scale_tot,pcoupl_mu,M,ebox;
+ +    gmx_nlheur_t nlh;
+ +    t_trxframe rerun_fr;
+ +    gmx_repl_ex_t repl_ex=NULL;
+ +    int        nchkpt=1;
+ +    gmx_localtop_t *top;      
+ +    t_mdebin *mdebin=NULL;
+ +    df_history_t df_history;
+ +    t_state    *state=NULL;
+ +    rvec       *f_global=NULL;
+ +    int        n_xtc=-1;
+ +    rvec       *x_xtc=NULL;
+ +    gmx_enerdata_t *enerd;
+ +    rvec       *f=NULL;
+ +    gmx_global_stat_t gstat;
+ +    gmx_update_t upd=NULL;
+ +    t_graph    *graph=NULL;
+ +    globsig_t   gs;
+ +    gmx_rng_t mcrng=NULL;
+ +    gmx_bool        bFFscan;
+ +    gmx_groups_t *groups;
+ +    gmx_ekindata_t *ekind, *ekind_save;
+ +    gmx_shellfc_t shellfc;
+ +    int         count,nconverged=0;
+ +    real        timestep=0;
+ +    double      tcount=0;
+ +    gmx_bool        bIonize=FALSE;
+ +    gmx_bool        bTCR=FALSE,bConverged=TRUE,bOK,bSumEkinhOld,bExchanged;
+ +    gmx_bool        bAppend;
+ +    gmx_bool        bResetCountersHalfMaxH=FALSE;
+ +    gmx_bool        bVV,bIterations,bFirstIterate,bTemp,bPres,bTrotter;
+ +    real        mu_aver=0,dvdl;
+ +    int         a0,a1,gnx=0,ii;
+ +    atom_id     *grpindex=NULL;
+ +    char        *grpname;
+ +    t_coupl_rec *tcr=NULL;
+ +    rvec        *xcopy=NULL,*vcopy=NULL,*cbuf=NULL;
+ +    matrix      boxcopy={{0}},lastbox;
+ +      tensor      tmpvir;
+ +      real        fom,oldfom,veta_save,pcurr,scalevir,tracevir;
+ +      real        vetanew = 0;
+ +    int         lamnew=0;
+ +    /* for FEP */
+ +    int         fep_state=0;
+ +    int         nstfep;
+ +    real        rate;
+ +    double      cycles;
+ +      real        saved_conserved_quantity = 0;
+ +    real        last_ekin = 0;
+ +      int         iter_i;
+ +      t_extmass   MassQ;
+ +    int         **trotter_seq; 
+ +    char        sbuf[STEPSTRSIZE],sbuf2[STEPSTRSIZE];
+ +    int         handled_stop_condition=gmx_stop_cond_none; /* compare to get_stop_condition*/
+ +    gmx_iterate_t iterate;
+ +    gmx_large_int_t multisim_nsteps=-1; /* number of steps to do  before first multisim 
+ +                                          simulation stops. If equal to zero, don't
+ +                                          communicate any more between multisims.*/
+ +    /* PME load balancing data for GPU kernels */
-         switch_pme_init(&pme_switch,ir,state->box,fr->ic,fr->pmedata);
++    pme_load_balancing_t pme_loadbal=NULL;
+ +    double          cycles_pmes;
+ +    gmx_bool        bPMETuneTry=FALSE,bPMETuneRunning=FALSE;
+ +
+ +    if(MASTER(cr))
+ +    {
+ +        fprintf(stderr,
+ +                "\n* WARNING * WARNING * WARNING * WARNING * WARNING * WARNING *\n"
+ +                "We have just committed the new CPU detection code in this branch,\n"
+ +                "and will commit new SSE/AVX kernels in a few days. However, this\n"
+ +                "means that currently only the NxN kernels are accelerated!\n"
+ +                "In the mean time, you might want to avoid production runs in 4.6.\n\n");
+ +    }
+ +
+ +#ifdef GMX_FAHCORE
+ +    /* Temporary addition for FAHCORE checkpointing */
+ +    int chkpt_ret;
+ +#endif
+ +    
+ +    /* Check for special mdrun options */
+ +    bRerunMD = (Flags & MD_RERUN);
+ +    bIonize  = (Flags & MD_IONIZE);
+ +    bFFscan  = (Flags & MD_FFSCAN);
+ +    bAppend  = (Flags & MD_APPENDFILES);
+ +    if (Flags & MD_RESETCOUNTERSHALFWAY)
+ +    {
+ +        if (ir->nsteps > 0)
+ +        {
+ +            /* Signal to reset the counters half the simulation steps. */
+ +            wcycle_set_reset_counters(wcycle,ir->nsteps/2);
+ +        }
+ +        /* Signal to reset the counters halfway the simulation time. */
+ +        bResetCountersHalfMaxH = (max_hours > 0);
+ +    }
+ +
+ +    /* md-vv uses averaged full step velocities for T-control 
+ +       md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
+ +       md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
+ +    bVV = EI_VV(ir->eI);
+ +    if (bVV) /* to store the initial velocities while computing virial */
+ +    {
+ +        snew(cbuf,top_global->natoms);
+ +    }
+ +    /* all the iteratative cases - only if there are constraints */ 
+ +    bIterations = ((IR_NPH_TROTTER(ir) || IR_NPT_TROTTER(ir)) && (constr) && (!bRerunMD));
+ +    bTrotter = (bVV && (IR_NPT_TROTTER(ir) || IR_NPH_TROTTER(ir) || IR_NVT_TROTTER(ir)));
+ +    
+ +    if (bRerunMD)
+ +    {
+ +        /* Since we don't know if the frames read are related in any way,
+ +         * rebuild the neighborlist at every step.
+ +         */
+ +        ir->nstlist       = 1;
+ +        ir->nstcalcenergy = 1;
+ +        nstglobalcomm     = 1;
+ +    }
+ +
+ +    check_ir_old_tpx_versions(cr,fplog,ir,top_global);
+ +
+ +    nstglobalcomm = check_nstglobalcomm(fplog,cr,nstglobalcomm,ir);
+ +    bGStatEveryStep = (nstglobalcomm == 1);
+ +
+ +    if (!bGStatEveryStep && ir->nstlist == -1 && fplog != NULL)
+ +    {
+ +        fprintf(fplog,
+ +                "To reduce the energy communication with nstlist = -1\n"
+ +                "the neighbor list validity should not be checked at every step,\n"
+ +                "this means that exact integration is not guaranteed.\n"
+ +                "The neighbor list validity is checked after:\n"
+ +                "  <n.list life time> - 2*std.dev.(n.list life time)  steps.\n"
+ +                "In most cases this will result in exact integration.\n"
+ +                "This reduces the energy communication by a factor of 2 to 3.\n"
+ +                "If you want less energy communication, set nstlist > 3.\n\n");
+ +    }
+ +
+ +    if (bRerunMD || bFFscan)
+ +    {
+ +        ir->nstxtcout = 0;
+ +    }
+ +    groups = &top_global->groups;
+ +
+ +    /* Initial values */
+ +    init_md(fplog,cr,ir,oenv,&t,&t0,state_global->lambda,
+ +            &(state_global->fep_state),lam0,
+ +            nrnb,top_global,&upd,
+ +            nfile,fnm,&outf,&mdebin,
+ +            force_vir,shake_vir,mu_tot,&bSimAnn,&vcm,state_global,Flags);
+ +
+ +    clear_mat(total_vir);
+ +    clear_mat(pres);
+ +    /* Energy terms and groups */
+ +    snew(enerd,1);
+ +    init_enerdata(top_global->groups.grps[egcENER].nr,ir->fepvals->n_lambda,
+ +                  enerd);
+ +    if (DOMAINDECOMP(cr))
+ +    {
+ +        f = NULL;
+ +    }
+ +    else
+ +    {
+ +        snew(f,top_global->natoms);
+ +    }
+ +
+ +    /* lambda Monte carlo random number generator  */
+ +    if (ir->bExpanded)
+ +    {
+ +        mcrng = gmx_rng_init(ir->expandedvals->lmc_seed);
+ +    }
+ +    /* copy the state into df_history */
+ +    copy_df_history(&df_history,&state_global->dfhist);
+ +
+ +    /* Kinetic energy data */
+ +    snew(ekind,1);
+ +    init_ekindata(fplog,top_global,&(ir->opts),ekind);
+ +    /* needed for iteration of constraints */
+ +    snew(ekind_save,1);
+ +    init_ekindata(fplog,top_global,&(ir->opts),ekind_save);
+ +    /* Copy the cos acceleration to the groups struct */    
+ +    ekind->cosacc.cos_accel = ir->cos_accel;
+ +
+ +    gstat = global_stat_init(ir);
+ +    debug_gmx();
+ +
+ +    /* Check for polarizable models and flexible constraints */
+ +    shellfc = init_shell_flexcon(fplog,
+ +                                 top_global,n_flexible_constraints(constr),
+ +                                 (ir->bContinuation || 
+ +                                  (DOMAINDECOMP(cr) && !MASTER(cr))) ?
+ +                                 NULL : state_global->x);
+ +
+ +    if (DEFORM(*ir))
+ +    {
+ +#ifdef GMX_THREAD_MPI
+ +        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
+ +#endif
+ +        set_deform_reference_box(upd,
+ +                                 deform_init_init_step_tpx,
+ +                                 deform_init_box_tpx);
+ +#ifdef GMX_THREAD_MPI
+ +        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
+ +#endif
+ +    }
+ +
+ +    {
+ +        double io = compute_io(ir,top_global->natoms,groups,mdebin->ebin->nener,1);
+ +        if ((io > 2000) && MASTER(cr))
+ +            fprintf(stderr,
+ +                    "\nWARNING: This run will generate roughly %.0f Mb of data\n\n",
+ +                    io);
+ +    }
+ +
+ +    if (DOMAINDECOMP(cr)) {
+ +        top = dd_init_local_top(top_global);
+ +
+ +        snew(state,1);
+ +        dd_init_local_state(cr->dd,state_global,state);
+ +
+ +        if (DDMASTER(cr->dd) && ir->nstfout) {
+ +            snew(f_global,state_global->natoms);
+ +        }
+ +    } else {
+ +        if (PAR(cr)) {
+ +            /* Initialize the particle decomposition and split the topology */
+ +            top = split_system(fplog,top_global,ir,cr);
+ +
+ +            pd_cg_range(cr,&fr->cg0,&fr->hcg);
+ +            pd_at_range(cr,&a0,&a1);
+ +        } else {
+ +            top = gmx_mtop_generate_local_top(top_global,ir);
+ +
+ +            a0 = 0;
+ +            a1 = top_global->natoms;
+ +        }
+ +
+ +        forcerec_set_excl_load(fr,top,cr);
+ +
+ +        state = partdec_init_local_state(cr,state_global);
+ +        f_global = f;
+ +
+ +        atoms2md(top_global,ir,0,NULL,a0,a1-a0,mdatoms);
+ +
+ +        if (vsite) {
+ +            set_vsite_top(vsite,top,mdatoms,cr);
+ +        }
+ +
+ +        if (ir->ePBC != epbcNONE && !fr->bMolPBC) {
+ +            graph = mk_graph(fplog,&(top->idef),0,top_global->natoms,FALSE,FALSE);
+ +        }
+ +
+ +        if (shellfc) {
+ +            make_local_shells(cr,mdatoms,shellfc);
+ +        }
+ +
+ +        init_bonded_thread_force_reduction(fr,&top->idef);
+ +
+ +        if (ir->pull && PAR(cr)) {
+ +            dd_make_local_pull_groups(NULL,ir->pull,mdatoms);
+ +        }
+ +    }
+ +
+ +    if (DOMAINDECOMP(cr))
+ +    {
+ +        /* Distribute the charge groups over the nodes from the master node */
+ +        dd_partition_system(fplog,ir->init_step,cr,TRUE,1,
+ +                            state_global,top_global,ir,
+ +                            state,&f,mdatoms,top,fr,
+ +                            vsite,shellfc,constr,
+ +                            nrnb,wcycle,FALSE);
+ +
+ +    }
+ +
+ +    update_mdatoms(mdatoms,state->lambda[efptMASS]);
+ +
+ +    if (opt2bSet("-cpi",nfile,fnm))
+ +    {
+ +        bStateFromCP = gmx_fexist_master(opt2fn_master("-cpi",nfile,fnm,cr),cr);
+ +    }
+ +    else
+ +    {
+ +        bStateFromCP = FALSE;
+ +    }
+ +
+ +    if (MASTER(cr))
+ +    {
+ +        if (bStateFromCP)
+ +        {
+ +            /* Update mdebin with energy history if appending to output files */
+ +            if ( Flags & MD_APPENDFILES )
+ +            {
+ +                restore_energyhistory_from_state(mdebin,&state_global->enerhist);
+ +            }
+ +            else
+ +            {
+ +                /* We might have read an energy history from checkpoint,
+ +                 * free the allocated memory and reset the counts.
+ +                 */
+ +                done_energyhistory(&state_global->enerhist);
+ +                init_energyhistory(&state_global->enerhist);
+ +            }
+ +        }
+ +        /* Set the initial energy history in state by updating once */
+ +        update_energyhistory(&state_global->enerhist,mdebin);
+ +    } 
+ +
+ +    if ((state->flags & (1<<estLD_RNG)) && (Flags & MD_READ_RNG)) 
+ +    {
+ +        /* Set the random state if we read a checkpoint file */
+ +        set_stochd_state(upd,state);
+ +    }
+ +
+ +    if (state->flags & (1<<estMC_RNG))
+ +    {
+ +        set_mc_state(mcrng,state);
+ +    }
+ +
+ +    /* Initialize constraints */
+ +    if (constr) {
+ +        if (!DOMAINDECOMP(cr))
+ +            set_constraints(constr,top,ir,mdatoms,cr);
+ +    }
+ +
+ +    /* Check whether we have to GCT stuff */
+ +    bTCR = ftp2bSet(efGCT,nfile,fnm);
+ +    if (bTCR) {
+ +        if (MASTER(cr)) {
+ +            fprintf(stderr,"Will do General Coupling Theory!\n");
+ +        }
+ +        gnx = top_global->mols.nr;
+ +        snew(grpindex,gnx);
+ +        for(i=0; (i<gnx); i++) {
+ +            grpindex[i] = i;
+ +        }
+ +    }
+ +
+ +    if (repl_ex_nst > 0)
+ +    {
+ +        /* We need to be sure replica exchange can only occur
+ +         * when the energies are current */
+ +        check_nst_param(fplog,cr,"nstcalcenergy",ir->nstcalcenergy,
+ +                        "repl_ex_nst",&repl_ex_nst);
+ +        /* This check needs to happen before inter-simulation
+ +         * signals are initialized, too */
+ +    }
+ +    if (repl_ex_nst > 0 && MASTER(cr))
+ +    {
+ +        repl_ex = init_replica_exchange(fplog,cr->ms,state_global,ir,
+ +                                        repl_ex_nst,repl_ex_nex,repl_ex_seed);
+ +    }
+ +
+ +    /* PME tuning is only supported with GPUs or PME nodes and not with rerun */
+ +    if ((Flags & MD_TUNEPME) &&
+ +        EEL_PME(fr->eeltype) &&
+ +        fr->cutoff_scheme == ecutsVERLET &&
+ +        (fr->nbv->bUseGPU || !(cr->duty & DUTY_PME)) &&
+ +        !bRerunMD)
+ +    {
-                         switch_pme(pme_switch,cr,
-                                    (bVerbose && MASTER(cr)) ? stderr : NULL,
-                                    fplog,
-                                    ir,state,cycles_pmes,
-                                    fr->ic,fr->nbv,&fr->pmedata,
-                                    step);
++        pme_loadbal_init(&pme_loadbal,ir,state->box,fr->ic,fr->pmedata);
+ +        cycles_pmes = 0;
+ +        if (cr->duty & DUTY_PME)
+ +        {
+ +            /* Start tuning right away, as we can't measure the load */
+ +            bPMETuneRunning = TRUE;
+ +        }
+ +        else
+ +        {
+ +            /* Separate PME nodes, we can measure the PP/PME load balance */
+ +            bPMETuneTry = TRUE;
+ +        }
+ +    }
+ +
+ +    if (!ir->bContinuation && !bRerunMD)
+ +    {
+ +        if (mdatoms->cFREEZE && (state->flags & (1<<estV)))
+ +        {
+ +            /* Set the velocities of frozen particles to zero */
+ +            for(i=mdatoms->start; i<mdatoms->start+mdatoms->homenr; i++)
+ +            {
+ +                for(m=0; m<DIM; m++)
+ +                {
+ +                    if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
+ +                    {
+ +                        state->v[i][m] = 0;
+ +                    }
+ +                }
+ +            }
+ +        }
+ +
+ +        if (constr)
+ +        {
+ +            /* Constrain the initial coordinates and velocities */
+ +            do_constrain_first(fplog,constr,ir,mdatoms,state,f,
+ +                               graph,cr,nrnb,fr,top,shake_vir);
+ +        }
+ +        if (vsite)
+ +        {
+ +            /* Construct the virtual sites for the initial configuration */
+ +            construct_vsites(fplog,vsite,state->x,nrnb,ir->delta_t,NULL,
+ +                             top->idef.iparams,top->idef.il,
+ +                             fr->ePBC,fr->bMolPBC,graph,cr,state->box);
+ +        }
+ +    }
+ +
+ +    debug_gmx();
+ +  
+ +    /* set free energy calculation frequency as the minimum of nstdhdl, nstexpanded, and nstrepl_ex_nst*/
+ +    nstfep = ir->fepvals->nstdhdl;
+ +    if (ir->bExpanded && (nstfep > ir->expandedvals->nstexpanded))
+ +    {
+ +        nstfep = ir->expandedvals->nstexpanded;
+ +    }
+ +    if (repl_ex_nst > 0 && repl_ex_nst > nstfep)
+ +    {
+ +        nstfep = repl_ex_nst;
+ +    }
+ +
+ +    /* I'm assuming we need global communication the first time! MRS */
+ +    cglo_flags = (CGLO_TEMPERATURE | CGLO_GSTAT
+ +                  | ((ir->comm_mode != ecmNO) ? CGLO_STOPCM:0)
+ +                  | (bVV ? CGLO_PRESSURE:0)
+ +                  | (bVV ? CGLO_CONSTRAINT:0)
+ +                  | (bRerunMD ? CGLO_RERUNMD:0)
+ +                  | ((Flags & MD_READ_EKIN) ? CGLO_READEKIN:0));
+ +    
+ +    bSumEkinhOld = FALSE;
+ +    compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
+ +                    NULL,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
+ +                    constr,NULL,FALSE,state->box,
+ +                    top_global,&pcurr,top_global->natoms,&bSumEkinhOld,cglo_flags);
+ +    if (ir->eI == eiVVAK) {
+ +        /* a second call to get the half step temperature initialized as well */ 
+ +        /* we do the same call as above, but turn the pressure off -- internally to 
+ +           compute_globals, this is recognized as a velocity verlet half-step 
+ +           kinetic energy calculation.  This minimized excess variables, but 
+ +           perhaps loses some logic?*/
+ +        
+ +        compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
+ +                        NULL,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
+ +                        constr,NULL,FALSE,state->box,
+ +                        top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
+ +                        cglo_flags &~ (CGLO_STOPCM | CGLO_PRESSURE));
+ +    }
+ +    
+ +    /* Calculate the initial half step temperature, and save the ekinh_old */
+ +    if (!(Flags & MD_STARTFROMCPT)) 
+ +    {
+ +        for(i=0; (i<ir->opts.ngtc); i++) 
+ +        {
+ +            copy_mat(ekind->tcstat[i].ekinh,ekind->tcstat[i].ekinh_old);
+ +        } 
+ +    }
+ +    if (ir->eI != eiVV) 
+ +    {
+ +        enerd->term[F_TEMP] *= 2; /* result of averages being done over previous and current step,
+ +                                     and there is no previous step */
+ +    }
+ +    
+ +    /* if using an iterative algorithm, we need to create a working directory for the state. */
+ +    if (bIterations) 
+ +    {
+ +            bufstate = init_bufstate(state);
+ +    }
+ +    if (bFFscan) 
+ +    {
+ +        snew(xcopy,state->natoms);
+ +        snew(vcopy,state->natoms);
+ +        copy_rvecn(state->x,xcopy,0,state->natoms);
+ +        copy_rvecn(state->v,vcopy,0,state->natoms);
+ +        copy_mat(state->box,boxcopy);
+ +    } 
+ +    
+ +    /* need to make an initiation call to get the Trotter variables set, as well as other constants for non-trotter
+ +       temperature control */
+ +    trotter_seq = init_npt_vars(ir,state,&MassQ,bTrotter);
+ +    
+ +    if (MASTER(cr))
+ +    {
+ +        if (constr && !ir->bContinuation && ir->eConstrAlg == econtLINCS)
+ +        {
+ +            fprintf(fplog,
+ +                    "RMS relative constraint deviation after constraining: %.2e\n",
+ +                    constr_rmsd(constr,FALSE));
+ +        }
+ +        if (EI_STATE_VELOCITY(ir->eI))
+ +        {
+ +            fprintf(fplog,"Initial temperature: %g K\n",enerd->term[F_TEMP]);
+ +        }
+ +        if (bRerunMD)
+ +        {
+ +            fprintf(stderr,"starting md rerun '%s', reading coordinates from"
+ +                    " input trajectory '%s'\n\n",
+ +                    *(top_global->name),opt2fn("-rerun",nfile,fnm));
+ +            if (bVerbose)
+ +            {
+ +                fprintf(stderr,"Calculated time to finish depends on nsteps from "
+ +                        "run input file,\nwhich may not correspond to the time "
+ +                        "needed to process input trajectory.\n\n");
+ +            }
+ +        }
+ +        else
+ +        {
+ +            char tbuf[20];
+ +            fprintf(stderr,"starting mdrun '%s'\n",
+ +                    *(top_global->name));
+ +            if (ir->nsteps >= 0)
+ +            {
+ +                sprintf(tbuf,"%8.1f",(ir->init_step+ir->nsteps)*ir->delta_t);
+ +            }
+ +            else
+ +            {
+ +                sprintf(tbuf,"%s","infinite");
+ +            }
+ +            if (ir->init_step > 0)
+ +            {
+ +                fprintf(stderr,"%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
+ +                        gmx_step_str(ir->init_step+ir->nsteps,sbuf),tbuf,
+ +                        gmx_step_str(ir->init_step,sbuf2),
+ +                        ir->init_step*ir->delta_t);
+ +            }
+ +            else
+ +            {
+ +                fprintf(stderr,"%s steps, %s ps.\n",
+ +                        gmx_step_str(ir->nsteps,sbuf),tbuf);
+ +            }
+ +        }
+ +        fprintf(fplog,"\n");
+ +    }
+ +
+ +    /* Set and write start time */
+ +    runtime_start(runtime);
+ +    print_date_and_time(fplog,cr->nodeid,"Started mdrun",runtime);
+ +    wallcycle_start(wcycle,ewcRUN);
+ +    if (fplog)
+ +    {
+ +        fprintf(fplog,"\n");
+ +    }
+ +
+ +    /* safest point to do file checkpointing is here.  More general point would be immediately before integrator call */
+ +#ifdef GMX_FAHCORE
+ +    chkpt_ret=fcCheckPointParallel( cr->nodeid,
+ +                                    NULL,0);
+ +    if ( chkpt_ret == 0 ) 
+ +        gmx_fatal( 3,__FILE__,__LINE__, "Checkpoint error on step %d\n", 0 );
+ +#endif
+ +
+ +    debug_gmx();
+ +    /***********************************************************
+ +     *
+ +     *             Loop over MD steps 
+ +     *
+ +     ************************************************************/
+ +
+ +    /* if rerunMD then read coordinates and velocities from input trajectory */
+ +    if (bRerunMD)
+ +    {
+ +        if (getenv("GMX_FORCE_UPDATE"))
+ +        {
+ +            bForceUpdate = TRUE;
+ +        }
+ +
+ +        rerun_fr.natoms = 0;
+ +        if (MASTER(cr))
+ +        {
+ +            bNotLastFrame = read_first_frame(oenv,&status,
+ +                                             opt2fn("-rerun",nfile,fnm),
+ +                                             &rerun_fr,TRX_NEED_X | TRX_READ_V);
+ +            if (rerun_fr.natoms != top_global->natoms)
+ +            {
+ +                gmx_fatal(FARGS,
+ +                          "Number of atoms in trajectory (%d) does not match the "
+ +                          "run input file (%d)\n",
+ +                          rerun_fr.natoms,top_global->natoms);
+ +            }
+ +            if (ir->ePBC != epbcNONE)
+ +            {
+ +                if (!rerun_fr.bBox)
+ +                {
+ +                    gmx_fatal(FARGS,"Rerun trajectory frame step %d time %f does not contain a box, while pbc is used",rerun_fr.step,rerun_fr.time);
+ +                }
+ +                if (max_cutoff2(ir->ePBC,rerun_fr.box) < sqr(fr->rlistlong))
+ +                {
+ +                    gmx_fatal(FARGS,"Rerun trajectory frame step %d time %f has too small box dimensions",rerun_fr.step,rerun_fr.time);
+ +                }
+ +            }
+ +        }
+ +
+ +        if (PAR(cr))
+ +        {
+ +            rerun_parallel_comm(cr,&rerun_fr,&bNotLastFrame);
+ +        }
+ +
+ +        if (ir->ePBC != epbcNONE)
+ +        {
+ +            /* Set the shift vectors.
+ +             * Necessary here when have a static box different from the tpr box.
+ +             */
+ +            calc_shifts(rerun_fr.box,fr->shift_vec);
+ +        }
+ +    }
+ +
+ +    /* loop over MD steps or if rerunMD to end of input trajectory */
+ +    bFirstStep = TRUE;
+ +    /* Skip the first Nose-Hoover integration when we get the state from tpx */
+ +    bStateFromTPX = !bStateFromCP;
+ +    bInitStep = bFirstStep && (bStateFromTPX || bVV);
+ +    bStartingFromCpt = (Flags & MD_STARTFROMCPT) && bInitStep;
+ +    bLastStep    = FALSE;
+ +    bSumEkinhOld = FALSE;
+ +    bExchanged   = FALSE;
+ +
+ +    init_global_signals(&gs,cr,ir,repl_ex_nst);
+ +
+ +    step = ir->init_step;
+ +    step_rel = 0;
+ +
+ +    if (ir->nstlist == -1)
+ +    {
+ +        init_nlistheuristics(&nlh,bGStatEveryStep,step);
+ +    }
+ +
+ +    if (MULTISIM(cr) && (repl_ex_nst <=0 ))
+ +    {
+ +        /* check how many steps are left in other sims */
+ +        multisim_nsteps=get_multisim_nsteps(cr, ir->nsteps);
+ +    }
+ +
+ +
+ +    /* and stop now if we should */
+ +    bLastStep = (bRerunMD || (ir->nsteps >= 0 && step_rel > ir->nsteps) ||
+ +                 ((multisim_nsteps >= 0) && (step_rel >= multisim_nsteps )));
+ +    while (!bLastStep || (bRerunMD && bNotLastFrame)) {
+ +
+ +        wallcycle_start(wcycle,ewcSTEP);
+ +
+ +        if (bRerunMD) {
+ +            if (rerun_fr.bStep) {
+ +                step = rerun_fr.step;
+ +                step_rel = step - ir->init_step;
+ +            }
+ +            if (rerun_fr.bTime) {
+ +                t = rerun_fr.time;
+ +            }
+ +            else
+ +            {
+ +                t = step;
+ +            }
+ +        } 
+ +        else 
+ +        {
+ +            bLastStep = (step_rel == ir->nsteps);
+ +            t = t0 + step*ir->delta_t;
+ +        }
+ +
+ +        if (ir->efep != efepNO || ir->bSimTemp)
+ +            {
+ +            /* find and set the current lambdas.  If rerunning, we either read in a state, or a lambda value,
+ +               requiring different logic. */
+ +            
+ +            set_current_lambdas(step,ir->fepvals,bRerunMD,&rerun_fr,state_global,state,lam0);
+ +            bDoDHDL = do_per_step(step,ir->fepvals->nstdhdl);
+ +            bDoFEP  = (do_per_step(step,nstfep) && (ir->efep != efepNO));
+ +            bDoExpanded  = (do_per_step(step,ir->expandedvals->nstexpanded) && (ir->bExpanded) && (step > 0));
+ +        }
+ +
+ +        if (bSimAnn) 
+ +        {
+ +            update_annealing_target_temp(&(ir->opts),t);
+ +        }
+ +
+ +        if (bRerunMD)
+ +        {
+ +            if (!(DOMAINDECOMP(cr) && !MASTER(cr)))
+ +            {
+ +                for(i=0; i<state_global->natoms; i++)
+ +                {
+ +                    copy_rvec(rerun_fr.x[i],state_global->x[i]);
+ +                }
+ +                if (rerun_fr.bV)
+ +                {
+ +                    for(i=0; i<state_global->natoms; i++)
+ +                    {
+ +                        copy_rvec(rerun_fr.v[i],state_global->v[i]);
+ +                    }
+ +                }
+ +                else
+ +                {
+ +                    for(i=0; i<state_global->natoms; i++)
+ +                    {
+ +                        clear_rvec(state_global->v[i]);
+ +                    }
+ +                    if (bRerunWarnNoV)
+ +                    {
+ +                        fprintf(stderr,"\nWARNING: Some frames do not contain velocities.\n"
+ +                                "         Ekin, temperature and pressure are incorrect,\n"
+ +                                "         the virial will be incorrect when constraints are present.\n"
+ +                                "\n");
+ +                        bRerunWarnNoV = FALSE;
+ +                    }
+ +                }
+ +            }
+ +            copy_mat(rerun_fr.box,state_global->box);
+ +            copy_mat(state_global->box,state->box);
+ +
+ +            if (vsite && (Flags & MD_RERUN_VSITE))
+ +            {
+ +                if (DOMAINDECOMP(cr))
+ +                {
+ +                    gmx_fatal(FARGS,"Vsite recalculation with -rerun is not implemented for domain decomposition, use particle decomposition");
+ +                }
+ +                if (graph)
+ +                {
+ +                    /* Following is necessary because the graph may get out of sync
+ +                     * with the coordinates if we only have every N'th coordinate set
+ +                     */
+ +                    mk_mshift(fplog,graph,fr->ePBC,state->box,state->x);
+ +                    shift_self(graph,state->box,state->x);
+ +                }
+ +                construct_vsites(fplog,vsite,state->x,nrnb,ir->delta_t,state->v,
+ +                                 top->idef.iparams,top->idef.il,
+ +                                 fr->ePBC,fr->bMolPBC,graph,cr,state->box);
+ +                if (graph)
+ +                {
+ +                    unshift_self(graph,state->box,state->x);
+ +                }
+ +            }
+ +        }
+ +
+ +        /* Stop Center of Mass motion */
+ +        bStopCM = (ir->comm_mode != ecmNO && do_per_step(step,ir->nstcomm));
+ +
+ +        /* Copy back starting coordinates in case we're doing a forcefield scan */
+ +        if (bFFscan)
+ +        {
+ +            for(ii=0; (ii<state->natoms); ii++)
+ +            {
+ +                copy_rvec(xcopy[ii],state->x[ii]);
+ +                copy_rvec(vcopy[ii],state->v[ii]);
+ +            }
+ +            copy_mat(boxcopy,state->box);
+ +        }
+ +
+ +        if (bRerunMD)
+ +        {
+ +            /* for rerun MD always do Neighbour Searching */
+ +            bNS = (bFirstStep || ir->nstlist != 0);
+ +            bNStList = bNS;
+ +        }
+ +        else
+ +        {
+ +            /* Determine whether or not to do Neighbour Searching and LR */
+ +            bNStList = (ir->nstlist > 0  && step % ir->nstlist == 0);
+ +            
+ +            bNS = (bFirstStep || bExchanged || bNStList || bDoFEP ||
+ +                   (ir->nstlist == -1 && nlh.nabnsb > 0));
+ +
+ +            if (bNS && ir->nstlist == -1)
+ +            {
+ +                set_nlistheuristics(&nlh,bFirstStep || bExchanged || bDoFEP, step);
+ +            }
+ +        } 
+ +
+ +        /* check whether we should stop because another simulation has 
+ +           stopped. */
+ +        if (MULTISIM(cr))
+ +        {
+ +            if ( (multisim_nsteps >= 0) &&  (step_rel >= multisim_nsteps)  &&  
+ +                 (multisim_nsteps != ir->nsteps) )  
+ +            {
+ +                if (bNS)
+ +                {
+ +                    if (MASTER(cr))
+ +                    {
+ +                        fprintf(stderr, 
+ +                                "Stopping simulation %d because another one has finished\n",
+ +                                cr->ms->sim);
+ +                    }
+ +                    bLastStep=TRUE;
+ +                    gs.sig[eglsCHKPT] = 1;
+ +                }
+ +            }
+ +        }
+ +
+ +        /* < 0 means stop at next step, > 0 means stop at next NS step */
+ +        if ( (gs.set[eglsSTOPCOND] < 0 ) ||
+ +             ( (gs.set[eglsSTOPCOND] > 0 ) && ( bNS || ir->nstlist==0)) )
+ +        {
+ +            bLastStep = TRUE;
+ +        }
+ +
+ +        /* Determine whether or not to update the Born radii if doing GB */
+ +        bBornRadii=bFirstStep;
+ +        if (ir->implicit_solvent && (step % ir->nstgbradii==0))
+ +        {
+ +            bBornRadii=TRUE;
+ +        }
+ +        
+ +        do_log = do_per_step(step,ir->nstlog) || bFirstStep || bLastStep;
+ +        do_verbose = bVerbose &&
+ +                  (step % stepout == 0 || bFirstStep || bLastStep);
+ +
+ +        if (bNS && !(bFirstStep && ir->bContinuation && !bRerunMD))
+ +        {
+ +            if (bRerunMD)
+ +            {
+ +                bMasterState = TRUE;
+ +            }
+ +            else
+ +            {
+ +                bMasterState = FALSE;
+ +                /* Correct the new box if it is too skewed */
+ +                if (DYNAMIC_BOX(*ir))
+ +                {
+ +                    if (correct_box(fplog,step,state->box,graph))
+ +                    {
+ +                        bMasterState = TRUE;
+ +                    }
+ +                }
+ +                if (DOMAINDECOMP(cr) && bMasterState)
+ +                {
+ +                    dd_collect_state(cr->dd,state,state_global);
+ +                }
+ +            }
+ +
+ +            if (DOMAINDECOMP(cr))
+ +            {
+ +                /* Repartition the domain decomposition */
+ +                wallcycle_start(wcycle,ewcDOMDEC);
+ +                dd_partition_system(fplog,step,cr,
+ +                                    bMasterState,nstglobalcomm,
+ +                                    state_global,top_global,ir,
+ +                                    state,&f,mdatoms,top,fr,
+ +                                    vsite,shellfc,constr,
+ +                                    nrnb,wcycle,
+ +                                    do_verbose && !bPMETuneRunning);
+ +                wallcycle_stop(wcycle,ewcDOMDEC);
+ +                /* If using an iterative integrator, reallocate space to match the decomposition */
+ +            }
+ +        }
+ +
+ +        if (MASTER(cr) && do_log && !bFFscan)
+ +        {
+ +            print_ebin_header(fplog,step,t,state->lambda[efptFEP]); /* can we improve the information printed here? */
+ +        }
+ +
+ +        if (ir->efep != efepNO)
+ +        {
+ +            update_mdatoms(mdatoms,state->lambda[efptMASS]);
+ +        }
+ +
+ +        if ((bRerunMD && rerun_fr.bV) || bExchanged)
+ +        {
+ +            
+ +            /* We need the kinetic energy at minus the half step for determining
+ +             * the full step kinetic energy and possibly for T-coupling.*/
+ +            /* This may not be quite working correctly yet . . . . */
+ +            compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
+ +                            wcycle,enerd,NULL,NULL,NULL,NULL,mu_tot,
+ +                            constr,NULL,FALSE,state->box,
+ +                            top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
+ +                            CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
+ +        }
+ +        clear_mat(force_vir);
+ +        
+ +        /* Ionize the atoms if necessary */
+ +        if (bIonize)
+ +        {
+ +            ionize(fplog,oenv,mdatoms,top_global,t,ir,state->x,state->v,
+ +                   mdatoms->start,mdatoms->start+mdatoms->homenr,state->box,cr);
+ +        }
+ +        
+ +        /* Update force field in ffscan program */
+ +        if (bFFscan)
+ +        {
+ +            if (update_forcefield(fplog,
+ +                                  nfile,fnm,fr,
+ +                                  mdatoms->nr,state->x,state->box))
+ +            {
+ +                gmx_finalize_par();
+ +
+ +                exit(0);
+ +            }
+ +        }
+ +
+ +        /* We write a checkpoint at this MD step when:
+ +         * either at an NS step when we signalled through gs,
+ +         * or at the last step (but not when we do not want confout),
+ +         * but never at the first step or with rerun.
+ +         */
+ +        bCPT = (((gs.set[eglsCHKPT] && (bNS || ir->nstlist == 0)) ||
+ +                 (bLastStep && (Flags & MD_CONFOUT))) &&
+ +                step > ir->init_step && !bRerunMD);
+ +        if (bCPT)
+ +        {
+ +            gs.set[eglsCHKPT] = 0;
+ +        }
+ +
+ +        /* Determine the energy and pressure:
+ +         * at nstcalcenergy steps and at energy output steps (set below).
+ +         */
+ +        if (EI_VV(ir->eI) && (!bInitStep))
+ +        {
+ +            /* for vv, the first half actually corresponds to the last step */
+ +            bCalcEner = do_per_step(step-1,ir->nstcalcenergy);
+ +        }
+ +        else
+ +        {
+ +            bCalcEner = do_per_step(step,ir->nstcalcenergy);
+ +        }
+ +        bCalcVir = bCalcEner ||
+ +            (ir->epc != epcNO && do_per_step(step,ir->nstpcouple));
+ +
+ +        /* Do we need global communication ? */
+ +        bGStat = (bCalcVir || bCalcEner || bStopCM ||
+ +                  do_per_step(step,nstglobalcomm) ||
+ +                  (ir->nstlist == -1 && !bRerunMD && step >= nlh.step_nscheck));
+ +
+ +        do_ene = (do_per_step(step,ir->nstenergy) || bLastStep);
+ +
+ +        if (do_ene || do_log)
+ +        {
+ +            bCalcVir  = TRUE;
+ +            bCalcEner = TRUE;
+ +            bGStat    = TRUE;
+ +        }
+ +        
+ +        /* these CGLO_ options remain the same throughout the iteration */
+ +        cglo_flags = ((bRerunMD ? CGLO_RERUNMD : 0) |
+ +                      (bGStat ? CGLO_GSTAT : 0)
+ +            );
+ +        
+ +        force_flags = (GMX_FORCE_STATECHANGED |
+ +                       ((DYNAMIC_BOX(*ir) || bRerunMD) ? GMX_FORCE_DYNAMICBOX : 0) |
+ +                       GMX_FORCE_ALLFORCES |
+ +                       (bNStList ? GMX_FORCE_DOLR : 0) |
+ +                       GMX_FORCE_SEPLRF |
+ +                       (bCalcVir ? GMX_FORCE_VIRIAL : 0) |
+ +                       (bCalcEner ? GMX_FORCE_ENERGY : 0) |
+ +                       (bDoFEP ? GMX_FORCE_DHDL : 0)
+ +            );
+ +        
+ +        if (shellfc)
+ +        {
+ +            /* Now is the time to relax the shells */
+ +            count=relax_shell_flexcon(fplog,cr,bVerbose,bFFscan ? step+1 : step,
+ +                                      ir,bNS,force_flags,
+ +                                      bStopCM,top,top_global,
+ +                                      constr,enerd,fcd,
+ +                                      state,f,force_vir,mdatoms,
+ +                                      nrnb,wcycle,graph,groups,
+ +                                      shellfc,fr,bBornRadii,t,mu_tot,
+ +                                      state->natoms,&bConverged,vsite,
+ +                                      outf->fp_field);
+ +            tcount+=count;
+ +
+ +            if (bConverged)
+ +            {
+ +                nconverged++;
+ +            }
+ +        }
+ +        else
+ +        {
+ +            /* The coordinates (x) are shifted (to get whole molecules)
+ +             * in do_force.
+ +             * This is parallellized as well, and does communication too. 
+ +             * Check comments in sim_util.c
+ +             */
+ +             do_force(fplog,cr,ir,step,nrnb,wcycle,top,top_global,groups,
+ +                     state->box,state->x,&state->hist,
+ +                     f,force_vir,mdatoms,enerd,fcd,
+ +                     state->lambda,graph,
+ +                     fr,vsite,mu_tot,t,outf->fp_field,ed,bBornRadii,
+ +                     (bNS ? GMX_FORCE_NS : 0) | force_flags);
+ +        }
+ +        
+ +        if (bTCR)
+ +        {
+ +            mu_aver = calc_mu_aver(cr,state->x,mdatoms->chargeA,
+ +                                   mu_tot,&top_global->mols,mdatoms,gnx,grpindex);
+ +        }
+ +        
+ +        if (bTCR && bFirstStep)
+ +        {
+ +            tcr=init_coupling(fplog,nfile,fnm,cr,fr,mdatoms,&(top->idef));
+ +            fprintf(fplog,"Done init_coupling\n"); 
+ +            fflush(fplog);
+ +        }
+ +        
+ +        if (bVV && !bStartingFromCpt && !bRerunMD)
+ +        /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
+ +        {
+ +            if (ir->eI==eiVV && bInitStep) 
+ +            {
+ +                /* if using velocity verlet with full time step Ekin,
+ +                 * take the first half step only to compute the 
+ +                 * virial for the first step. From there,
+ +                 * revert back to the initial coordinates
+ +                 * so that the input is actually the initial step.
+ +                 */
+ +                copy_rvecn(state->v,cbuf,0,state->natoms); /* should make this better for parallelizing? */
+ +            } else {
+ +                /* this is for NHC in the Ekin(t+dt/2) version of vv */
+ +                trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ1);            
+ +            }
+ +
+ +            update_coords(fplog,step,ir,mdatoms,state,fr->bMolPBC,
+ +                          f,fr->bTwinRange && bNStList,fr->f_twin,fcd,
+ +                          ekind,M,wcycle,upd,bInitStep,etrtVELOCITY1,
+ +                          cr,nrnb,constr,&top->idef);
+ +            
+ +            if (bIterations)
+ +            {
+ +                gmx_iterate_init(&iterate,bIterations && !bInitStep);
+ +            }
+ +            /* for iterations, we save these vectors, as we will be self-consistently iterating
+ +               the calculations */
+ +
+ +            /*#### UPDATE EXTENDED VARIABLES IN TROTTER FORMULATION */
+ +            
+ +            /* save the state */
+ +            if (bIterations && iterate.bIterate) { 
+ +                copy_coupling_state(state,bufstate,ekind,ekind_save,&(ir->opts));
+ +            }
+ +            
+ +            bFirstIterate = TRUE;
+ +            while (bFirstIterate || (bIterations && iterate.bIterate))
+ +            {
+ +                if (bIterations && iterate.bIterate) 
+ +                {
+ +                    copy_coupling_state(bufstate,state,ekind_save,ekind,&(ir->opts));
+ +                    if (bFirstIterate && bTrotter) 
+ +                    {
+ +                        /* The first time through, we need a decent first estimate
+ +                           of veta(t+dt) to compute the constraints.  Do
+ +                           this by computing the box volume part of the
+ +                           trotter integration at this time. Nothing else
+ +                           should be changed by this routine here.  If
+ +                           !(first time), we start with the previous value
+ +                           of veta.  */
+ +                        
+ +                        veta_save = state->veta;
+ +                        trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ0);
+ +                        vetanew = state->veta;
+ +                        state->veta = veta_save;
+ +                    } 
+ +                } 
+ +                
+ +                bOK = TRUE;
+ +                if ( !bRerunMD || rerun_fr.bV || bForceUpdate) {  /* Why is rerun_fr.bV here?  Unclear. */
+ +                    dvdl = 0;
+ +                    
+ +                    update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,
+ +                                       state,fr->bMolPBC,graph,f,
+ +                                       &top->idef,shake_vir,NULL,
+ +                                       cr,nrnb,wcycle,upd,constr,
+ +                                       bInitStep,TRUE,bCalcVir,vetanew);
+ +                    
+ +                    if (!bOK && !bFFscan)
+ +                    {
+ +                        gmx_fatal(FARGS,"Constraint error: Shake, Lincs or Settle could not solve the constrains");
+ +                    }
+ +                    
+ +                } 
+ +                else if (graph)
+ +                { /* Need to unshift here if a do_force has been
+ +                     called in the previous step */
+ +                    unshift_self(graph,state->box,state->x);
+ +                }
+ +                
+ +                
+ +                /* if VV, compute the pressure and constraints */
+ +                /* For VV2, we strictly only need this if using pressure
+ +                 * control, but we really would like to have accurate pressures
+ +                 * printed out.
+ +                 * Think about ways around this in the future?
+ +                 * For now, keep this choice in comments.
+ +                 */
+ +                /*bPres = (ir->eI==eiVV || IR_NPT_TROTTER(ir)); */
+ +                    /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && IR_NPT_TROTTER(ir)));*/
+ +                bPres = TRUE;
+ +                bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK));
+ +                if (bCalcEner && ir->eI==eiVVAK)  /*MRS:  7/9/2010 -- this still doesn't fix it?*/
+ +                {
+ +                    bSumEkinhOld = TRUE;
+ +                }
+ +                compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
+ +                                wcycle,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
+ +                                constr,NULL,FALSE,state->box,
+ +                                top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
+ +                                cglo_flags 
+ +                                | CGLO_ENERGY 
+ +                                | (bStopCM ? CGLO_STOPCM : 0)
+ +                                | (bTemp ? CGLO_TEMPERATURE:0) 
+ +                                | (bPres ? CGLO_PRESSURE : 0) 
+ +                                | (bPres ? CGLO_CONSTRAINT : 0)
+ +                                | ((bIterations && iterate.bIterate) ? CGLO_ITERATE : 0)  
+ +                                | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
+ +                                | CGLO_SCALEEKIN 
+ +                    );
+ +                /* explanation of above: 
+ +                   a) We compute Ekin at the full time step
+ +                   if 1) we are using the AveVel Ekin, and it's not the
+ +                   initial step, or 2) if we are using AveEkin, but need the full
+ +                   time step kinetic energy for the pressure (always true now, since we want accurate statistics).
+ +                   b) If we are using EkinAveEkin for the kinetic energy for the temperture control, we still feed in 
+ +                   EkinAveVel because it's needed for the pressure */
+ +                
+ +                /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
+ +                if (!bInitStep) 
+ +                {
+ +                    if (bTrotter)
+ +                    {
+ +                        trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ2);
+ +                    } 
+ +                    else 
+ +                    {
+ +                        if (bExchanged)
+ +                        {
+ +            
+ +                            /* We need the kinetic energy at minus the half step for determining
+ +                             * the full step kinetic energy and possibly for T-coupling.*/
+ +                            /* This may not be quite working correctly yet . . . . */
+ +                            compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
+ +                                            wcycle,enerd,NULL,NULL,NULL,NULL,mu_tot,
+ +                                            constr,NULL,FALSE,state->box,
+ +                                            top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
+ +                                            CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
+ +                        }
+ +
+ +
+ +                        update_tcouple(fplog,step,ir,state,ekind,wcycle,upd,&MassQ,mdatoms);
+ +                    }
+ +                }
+ +                
+ +                if (bIterations &&
+ +                    done_iterating(cr,fplog,step,&iterate,bFirstIterate,
+ +                                   state->veta,&vetanew)) 
+ +                {
+ +                    break;
+ +                }
+ +                bFirstIterate = FALSE;
+ +            }
+ +
+ +            if (bTrotter && !bInitStep) {
+ +                enerd->term[F_DVDL_BONDED] += dvdl;        /* only add after iterations */
+ +                copy_mat(shake_vir,state->svir_prev);
+ +                copy_mat(force_vir,state->fvir_prev);
+ +                if (IR_NVT_TROTTER(ir) && ir->eI==eiVV) {
+ +                    /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
+ +                    enerd->term[F_TEMP] = sum_ekin(&(ir->opts),ekind,NULL,(ir->eI==eiVV),FALSE,FALSE);
+ +                    enerd->term[F_EKIN] = trace(ekind->ekin);
+ +                }
+ +            }
+ +            /* if it's the initial step, we performed this first step just to get the constraint virial */
+ +            if (bInitStep && ir->eI==eiVV) {
+ +                copy_rvecn(cbuf,state->v,0,state->natoms);
+ +            }
+ +            
+ +            if (fr->bSepDVDL && fplog && do_log) 
+ +            {
+ +                fprintf(fplog,sepdvdlformat,"Constraint",0.0,dvdl);
+ +            }
+ +            enerd->term[F_DVDL_BONDED] += dvdl;
+ +        }
+ +    
+ +        /* MRS -- now done iterating -- compute the conserved quantity */
+ +        if (bVV) {
+ +            saved_conserved_quantity = compute_conserved_from_auxiliary(ir,state,&MassQ);
+ +            if (ir->eI==eiVV) 
+ +            {
+ +                last_ekin = enerd->term[F_EKIN];
+ +            }
+ +            if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres)) 
+ +            {
+ +                saved_conserved_quantity -= enerd->term[F_DISPCORR];
+ +            }
+ +            /* sum up the foreign energy and dhdl terms for vv.  currently done every step so that dhdl is correct in the .edr */
+ +            sum_dhdl(enerd,state->lambda,ir->fepvals);
+ +        }
+ +        
+ +        /* ########  END FIRST UPDATE STEP  ############## */
+ +        /* ########  If doing VV, we now have v(dt) ###### */
+ +        if (bDoExpanded) {
+ +            /* perform extended ensemble sampling in lambda - we don't
+ +               actually move to the new state before outputting
+ +               statistics, but if performing simulated tempering, we
+ +               do update the velocities and the tau_t. */
+ +        
+ +            lamnew = ExpandedEnsembleDynamics(fplog,ir,enerd,state,&MassQ,&df_history,step,mcrng,state->v,mdatoms);
+ +        }
+ +        /* ################## START TRAJECTORY OUTPUT ################# */
+ +        
+ +        /* Now we have the energies and forces corresponding to the 
+ +         * coordinates at time t. We must output all of this before
+ +         * the update.
+ +         * for RerunMD t is read from input trajectory
+ +         */
+ +        mdof_flags = 0;
+ +        if (do_per_step(step,ir->nstxout)) { mdof_flags |= MDOF_X; }
+ +        if (do_per_step(step,ir->nstvout)) { mdof_flags |= MDOF_V; }
+ +        if (do_per_step(step,ir->nstfout)) { mdof_flags |= MDOF_F; }
+ +        if (do_per_step(step,ir->nstxtcout)) { mdof_flags |= MDOF_XTC; }
+ +        if (bCPT) { mdof_flags |= MDOF_CPT; };
+ +
+ +#if defined(GMX_FAHCORE) || defined(GMX_WRITELASTSTEP)
+ +        if (bLastStep)
+ +        {
+ +            /* Enforce writing positions and velocities at end of run */
+ +            mdof_flags |= (MDOF_X | MDOF_V);
+ +        }
+ +#endif
+ +#ifdef GMX_FAHCORE
+ +        if (MASTER(cr))
+ +            fcReportProgress( ir->nsteps, step );
+ +
+ +        /* sync bCPT and fc record-keeping */
+ +        if (bCPT && MASTER(cr))
+ +            fcRequestCheckPoint();
+ +#endif
+ +        
+ +        if (mdof_flags != 0)
+ +        {
+ +            wallcycle_start(wcycle,ewcTRAJ);
+ +            if (bCPT)
+ +            {
+ +                if (state->flags & (1<<estLD_RNG))
+ +                {
+ +                    get_stochd_state(upd,state);
+ +                }
+ +                if (state->flags  & (1<<estMC_RNG))
+ +                {
+ +                    get_mc_state(mcrng,state);
+ +                }
+ +                if (MASTER(cr))
+ +                {
+ +                    if (bSumEkinhOld)
+ +                    {
+ +                        state_global->ekinstate.bUpToDate = FALSE;
+ +                    }
+ +                    else
+ +                    {
+ +                        update_ekinstate(&state_global->ekinstate,ekind);
+ +                        state_global->ekinstate.bUpToDate = TRUE;
+ +                    }
+ +                    update_energyhistory(&state_global->enerhist,mdebin);
+ +                    if (ir->efep!=efepNO || ir->bSimTemp) 
+ +                    {
+ +                        state_global->fep_state = state->fep_state; /* MRS: seems kludgy. The code should be
+ +                                                                       structured so this isn't necessary.
+ +                                                                       Note this reassignment is only necessary
+ +                                                                       for single threads.*/
+ +                        copy_df_history(&state_global->dfhist,&df_history);
+ +                    }
+ +                }
+ +            }
+ +            write_traj(fplog,cr,outf,mdof_flags,top_global,
+ +                       step,t,state,state_global,f,f_global,&n_xtc,&x_xtc);
+ +            if (bCPT)
+ +            {
+ +                nchkpt++;
+ +                bCPT = FALSE;
+ +            }
+ +            debug_gmx();
+ +            if (bLastStep && step_rel == ir->nsteps &&
+ +                (Flags & MD_CONFOUT) && MASTER(cr) &&
+ +                !bRerunMD && !bFFscan)
+ +            {
+ +                /* x and v have been collected in write_traj,
+ +                 * because a checkpoint file will always be written
+ +                 * at the last step.
+ +                 */
+ +                fprintf(stderr,"\nWriting final coordinates.\n");
+ +                if (fr->bMolPBC)
+ +                {
+ +                    /* Make molecules whole only for confout writing */
+ +                    do_pbc_mtop(fplog,ir->ePBC,state->box,top_global,state_global->x);
+ +                }
+ +                write_sto_conf_mtop(ftp2fn(efSTO,nfile,fnm),
+ +                                    *top_global->name,top_global,
+ +                                    state_global->x,state_global->v,
+ +                                    ir->ePBC,state->box);
+ +                debug_gmx();
+ +            }
+ +            wallcycle_stop(wcycle,ewcTRAJ);
+ +        }
+ +        
+ +        /* kludge -- virial is lost with restart for NPT control. Must restart */
+ +        if (bStartingFromCpt && bVV) 
+ +        {
+ +            copy_mat(state->svir_prev,shake_vir);
+ +            copy_mat(state->fvir_prev,force_vir);
+ +        }
+ +        /*  ################## END TRAJECTORY OUTPUT ################ */
+ +        
+ +        /* Determine the wallclock run time up till now */
+ +        run_time = gmx_gettime() - (double)runtime->real;
+ +
+ +        /* Check whether everything is still allright */    
+ +        if (((int)gmx_get_stop_condition() > handled_stop_condition)
+ +#ifdef GMX_THREAD_MPI
+ +            && MASTER(cr)
+ +#endif
+ +            )
+ +        {
+ +            /* this is just make gs.sig compatible with the hack 
+ +               of sending signals around by MPI_Reduce with together with
+ +               other floats */
+ +            if ( gmx_get_stop_condition() == gmx_stop_cond_next_ns )
+ +                gs.sig[eglsSTOPCOND]=1;
+ +            if ( gmx_get_stop_condition() == gmx_stop_cond_next )
+ +                gs.sig[eglsSTOPCOND]=-1;
+ +            /* < 0 means stop at next step, > 0 means stop at next NS step */
+ +            if (fplog)
+ +            {
+ +                fprintf(fplog,
+ +                        "\n\nReceived the %s signal, stopping at the next %sstep\n\n",
+ +                        gmx_get_signal_name(),
+ +                        gs.sig[eglsSTOPCOND]==1 ? "NS " : "");
+ +                fflush(fplog);
+ +            }
+ +            fprintf(stderr,
+ +                    "\n\nReceived the %s signal, stopping at the next %sstep\n\n",
+ +                    gmx_get_signal_name(),
+ +                    gs.sig[eglsSTOPCOND]==1 ? "NS " : "");
+ +            fflush(stderr);
+ +            handled_stop_condition=(int)gmx_get_stop_condition();
+ +        }
+ +        else if (MASTER(cr) && (bNS || ir->nstlist <= 0) &&
+ +                 (max_hours > 0 && run_time > max_hours*60.0*60.0*0.99) &&
+ +                 gs.sig[eglsSTOPCOND] == 0 && gs.set[eglsSTOPCOND] == 0)
+ +        {
+ +            /* Signal to terminate the run */
+ +            gs.sig[eglsSTOPCOND] = 1;
+ +            if (fplog)
+ +            {
+ +                fprintf(fplog,"\nStep %s: Run time exceeded %.3f hours, will terminate the run\n",gmx_step_str(step,sbuf),max_hours*0.99);
+ +            }
+ +            fprintf(stderr, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n",gmx_step_str(step,sbuf),max_hours*0.99);
+ +        }
+ +
+ +        if (bResetCountersHalfMaxH && MASTER(cr) &&
+ +            run_time > max_hours*60.0*60.0*0.495)
+ +        {
+ +            gs.sig[eglsRESETCOUNTERS] = 1;
+ +        }
+ +
+ +        if (ir->nstlist == -1 && !bRerunMD)
+ +        {
+ +            /* When bGStatEveryStep=FALSE, global_stat is only called
+ +             * when we check the atom displacements, not at NS steps.
+ +             * This means that also the bonded interaction count check is not
+ +             * performed immediately after NS. Therefore a few MD steps could
+ +             * be performed with missing interactions.
+ +             * But wrong energies are never written to file,
+ +             * since energies are only written after global_stat
+ +             * has been called.
+ +             */
+ +            if (step >= nlh.step_nscheck)
+ +            {
+ +                nlh.nabnsb = natoms_beyond_ns_buffer(ir,fr,&top->cgs,
+ +                                                     nlh.scale_tot,state->x);
+ +            }
+ +            else
+ +            {
+ +                /* This is not necessarily true,
+ +                 * but step_nscheck is determined quite conservatively.
+ +                 */
+ +                nlh.nabnsb = 0;
+ +            }
+ +        }
+ +
+ +        /* In parallel we only have to check for checkpointing in steps
+ +         * where we do global communication,
+ +         *  otherwise the other nodes don't know.
+ +         */
+ +        if (MASTER(cr) && ((bGStat || !PAR(cr)) &&
+ +                           cpt_period >= 0 &&
+ +                           (cpt_period == 0 || 
+ +                            run_time >= nchkpt*cpt_period*60.0)) &&
+ +            gs.set[eglsCHKPT] == 0)
+ +        {
+ +            gs.sig[eglsCHKPT] = 1;
+ +        }
+ +  
+ +
+ +        /* at the start of step, randomize the velocities */
+ +        if (ETC_ANDERSEN(ir->etc) && EI_VV(ir->eI))
+ +        {
+ +            gmx_bool bDoAndersenConstr;
+ +            bDoAndersenConstr = (constr && update_randomize_velocities(ir,step,mdatoms,state,upd,&top->idef,constr));
+ +            /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
+ +            if (bDoAndersenConstr)
+ +            {
+ +                update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,
+ +                                   state,fr->bMolPBC,graph,f,
+ +                                   &top->idef,tmp_vir,NULL,
+ +                                   cr,nrnb,wcycle,upd,constr,
+ +                                   bInitStep,TRUE,bCalcVir,vetanew);
+ +            }
+ +        }
+ +
+ +        if (bIterations)
+ +        {
+ +            gmx_iterate_init(&iterate,bIterations);
+ +        }
+ +    
+ +        /* for iterations, we save these vectors, as we will be redoing the calculations */
+ +        if (bIterations && iterate.bIterate) 
+ +        {
+ +            copy_coupling_state(state,bufstate,ekind,ekind_save,&(ir->opts));
+ +        }
+ +        bFirstIterate = TRUE;
+ +        while (bFirstIterate || (bIterations && iterate.bIterate))
+ +        {
+ +            /* We now restore these vectors to redo the calculation with improved extended variables */    
+ +            if (bIterations) 
+ +            { 
+ +                copy_coupling_state(bufstate,state,ekind_save,ekind,&(ir->opts));
+ +            }
+ +
+ +            /* We make the decision to break or not -after- the calculation of Ekin and Pressure,
+ +               so scroll down for that logic */
+ +            
+ +            /* #########   START SECOND UPDATE STEP ################# */
+ +            /* Box is changed in update() when we do pressure coupling,
+ +             * but we should still use the old box for energy corrections and when
+ +             * writing it to the energy file, so it matches the trajectory files for
+ +             * the same timestep above. Make a copy in a separate array.
+ +             */
+ +            copy_mat(state->box,lastbox);
+ +
+ +            bOK = TRUE;
+ +            if (!(bRerunMD && !rerun_fr.bV && !bForceUpdate))
+ +            {
+ +                wallcycle_start(wcycle,ewcUPDATE);
+ +                dvdl = 0;
+ +                /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
+ +                if (bTrotter) 
+ +                {
+ +                    if (bIterations && iterate.bIterate) 
+ +                    {
+ +                        if (bFirstIterate) 
+ +                        {
+ +                            scalevir = 1;
+ +                        }
+ +                        else 
+ +                        {
+ +                            /* we use a new value of scalevir to converge the iterations faster */
+ +                            scalevir = tracevir/trace(shake_vir);
+ +                        }
+ +                        msmul(shake_vir,scalevir,shake_vir); 
+ +                        m_add(force_vir,shake_vir,total_vir);
+ +                        clear_mat(shake_vir);
+ +                    }
+ +                    trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ3);
+ +                /* We can only do Berendsen coupling after we have summed
+ +                 * the kinetic energy or virial. Since the happens
+ +                 * in global_state after update, we should only do it at
+ +                 * step % nstlist = 1 with bGStatEveryStep=FALSE.
+ +                 */
+ +                }
+ +                else 
+ +                {
+ +                    update_tcouple(fplog,step,ir,state,ekind,wcycle,upd,&MassQ,mdatoms);
+ +                    update_pcouple(fplog,step,ir,state,pcoupl_mu,M,wcycle,
+ +                                   upd,bInitStep);
+ +                }
+ +
+ +                if (bVV)
+ +                {
+ +                    /* velocity half-step update */
+ +                    update_coords(fplog,step,ir,mdatoms,state,fr->bMolPBC,f,
+ +                                  fr->bTwinRange && bNStList,fr->f_twin,fcd,
+ +                                  ekind,M,wcycle,upd,FALSE,etrtVELOCITY2,
+ +                                  cr,nrnb,constr,&top->idef);
+ +                }
+ +
+ +                /* Above, initialize just copies ekinh into ekin,
+ +                 * it doesn't copy position (for VV),
+ +                 * and entire integrator for MD.
+ +                 */
+ +                
+ +                if (ir->eI==eiVVAK) 
+ +                {
+ +                    copy_rvecn(state->x,cbuf,0,state->natoms);
+ +                }
+ +                
+ +                update_coords(fplog,step,ir,mdatoms,state,fr->bMolPBC,f,
+ +                              fr->bTwinRange && bNStList,fr->f_twin,fcd,
+ +                              ekind,M,wcycle,upd,bInitStep,etrtPOSITION,cr,nrnb,constr,&top->idef);
+ +                wallcycle_stop(wcycle,ewcUPDATE);
+ +
+ +                update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,state,
+ +                                   fr->bMolPBC,graph,f,
+ +                                   &top->idef,shake_vir,force_vir,
+ +                                   cr,nrnb,wcycle,upd,constr,
+ +                                   bInitStep,FALSE,bCalcVir,state->veta);  
+ +                
+ +                if (ir->eI==eiVVAK) 
+ +                {
+ +                    /* erase F_EKIN and F_TEMP here? */
+ +                    /* just compute the kinetic energy at the half step to perform a trotter step */
+ +                    compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
+ +                                    wcycle,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
+ +                                    constr,NULL,FALSE,lastbox,
+ +                                    top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
+ +                                    cglo_flags | CGLO_TEMPERATURE    
+ +                        );
+ +                    wallcycle_start(wcycle,ewcUPDATE);
+ +                    trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ4);            
+ +                    /* now we know the scaling, we can compute the positions again again */
+ +                    copy_rvecn(cbuf,state->x,0,state->natoms);
+ +
+ +                    update_coords(fplog,step,ir,mdatoms,state,fr->bMolPBC,f,
+ +                                  fr->bTwinRange && bNStList,fr->f_twin,fcd,
+ +                                  ekind,M,wcycle,upd,bInitStep,etrtPOSITION,cr,nrnb,constr,&top->idef);
+ +                    wallcycle_stop(wcycle,ewcUPDATE);
+ +
+ +                    /* do we need an extra constraint here? just need to copy out of state->v to upd->xp? */
+ +                    /* are the small terms in the shake_vir here due
+ +                     * to numerical errors, or are they important
+ +                     * physically? I'm thinking they are just errors, but not completely sure. 
+ +                     * For now, will call without actually constraining, constr=NULL*/
+ +                    update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,
+ +                                       state,fr->bMolPBC,graph,f,
+ +                                       &top->idef,tmp_vir,force_vir,
+ +                                       cr,nrnb,wcycle,upd,NULL,
+ +                                       bInitStep,FALSE,bCalcVir,
+ +                                       state->veta);  
+ +                }
+ +                if (!bOK && !bFFscan) 
+ +                {
+ +                    gmx_fatal(FARGS,"Constraint error: Shake, Lincs or Settle could not solve the constrains");
+ +                }
+ +                
+ +                if (fr->bSepDVDL && fplog && do_log) 
+ +                {
+ +                    fprintf(fplog,sepdvdlformat,"Constraint dV/dl",0.0,dvdl);
+ +                }
+ +                enerd->term[F_DVDL_BONDED] += dvdl;
+ +            } 
+ +            else if (graph) 
+ +            {
+ +                /* Need to unshift here */
+ +                unshift_self(graph,state->box,state->x);
+ +            }
+ +
+ +            if (vsite != NULL) 
+ +            {
+ +                wallcycle_start(wcycle,ewcVSITECONSTR);
+ +                if (graph != NULL) 
+ +                {
+ +                    shift_self(graph,state->box,state->x);
+ +                }
+ +                construct_vsites(fplog,vsite,state->x,nrnb,ir->delta_t,state->v,
+ +                                 top->idef.iparams,top->idef.il,
+ +                                 fr->ePBC,fr->bMolPBC,graph,cr,state->box);
+ +                
+ +                if (graph != NULL) 
+ +                {
+ +                    unshift_self(graph,state->box,state->x);
+ +                }
+ +                wallcycle_stop(wcycle,ewcVSITECONSTR);
+ +            }
+ +            
+ +            /* ############## IF NOT VV, Calculate globals HERE, also iterate constraints ############ */
+ +            /* With Leap-Frog we can skip compute_globals at
+ +             * non-communication steps, but we need to calculate
+ +             * the kinetic energy one step before communication.
+ +             */
+ +            if (bGStat || do_per_step(step+1,nstglobalcomm) ||
+ +                EI_VV(ir->eI))
+ +            {
+ +                if (ir->nstlist == -1 && bFirstIterate)
+ +                {
+ +                    gs.sig[eglsNABNSB] = nlh.nabnsb;
+ +                }
+ +                compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
+ +                                wcycle,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
+ +                                constr,
+ +                                bFirstIterate ? &gs : NULL, 
+ +                                (step_rel % gs.nstms == 0) && 
+ +                                (multisim_nsteps<0 || (step_rel<multisim_nsteps)),
+ +                                lastbox,
+ +                                top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
+ +                                cglo_flags 
+ +                                | (!EI_VV(ir->eI) ? CGLO_ENERGY : 0)
+ +                                | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
+ +                                | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0) 
+ +                                | (!EI_VV(ir->eI) || bRerunMD ? CGLO_PRESSURE : 0) 
+ +                                | (bIterations && iterate.bIterate ? CGLO_ITERATE : 0) 
+ +                                | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
+ +                                | CGLO_CONSTRAINT 
+ +                    );
+ +                if (ir->nstlist == -1 && bFirstIterate)
+ +                {
+ +                    nlh.nabnsb = gs.set[eglsNABNSB];
+ +                    gs.set[eglsNABNSB] = 0;
+ +                }
+ +            }
+ +            /* bIterate is set to keep it from eliminating the old ekin kinetic energy terms */
+ +            /* #############  END CALC EKIN AND PRESSURE ################# */
+ +        
+ +            /* Note: this is OK, but there are some numerical precision issues with using the convergence of
+ +               the virial that should probably be addressed eventually. state->veta has better properies,
+ +               but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
+ +               generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
+ +
+ +            if (bIterations && 
+ +                done_iterating(cr,fplog,step,&iterate,bFirstIterate,
+ +                               trace(shake_vir),&tracevir)) 
+ +            {
+ +                break;
+ +            }
+ +            bFirstIterate = FALSE;
+ +        }
+ +
+ +        /* only add constraint dvdl after constraints */
+ +        enerd->term[F_DVDL_BONDED] += dvdl;
+ +        if (!bVV)
+ +        {
+ +            /* sum up the foreign energy and dhdl terms for md and sd. currently done every step so that dhdl is correct in the .edr */
+ +            sum_dhdl(enerd,state->lambda,ir->fepvals);
+ +        }
+ +        update_box(fplog,step,ir,mdatoms,state,graph,f,
+ +                   ir->nstlist==-1 ? &nlh.scale_tot : NULL,pcoupl_mu,nrnb,wcycle,upd,bInitStep,FALSE);
+ +        
+ +        /* ################# END UPDATE STEP 2 ################# */
+ +        /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
+ +    
+ +        /* The coordinates (x) were unshifted in update */
+ +        if (bFFscan && (shellfc==NULL || bConverged))
+ +        {
+ +            if (print_forcefield(fplog,enerd->term,mdatoms->homenr,
+ +                                 f,NULL,xcopy,
+ +                                 &(top_global->mols),mdatoms->massT,pres))
+ +            {
+ +                gmx_finalize_par();
+ +
+ +                fprintf(stderr,"\n");
+ +                exit(0);
+ +            }
+ +        }
+ +        if (!bGStat)
+ +        {
+ +            /* We will not sum ekinh_old,                                                            
+ +             * so signal that we still have to do it.                                                
+ +             */
+ +            bSumEkinhOld = TRUE;
+ +        }
+ +        
+ +        if (bTCR)
+ +        {
+ +            /* Only do GCT when the relaxation of shells (minimization) has converged,
+ +             * otherwise we might be coupling to bogus energies. 
+ +             * In parallel we must always do this, because the other sims might
+ +             * update the FF.
+ +             */
+ +
+ +            /* Since this is called with the new coordinates state->x, I assume
+ +             * we want the new box state->box too. / EL 20040121
+ +             */
+ +            do_coupling(fplog,oenv,nfile,fnm,tcr,t,step,enerd->term,fr,
+ +                        ir,MASTER(cr),
+ +                        mdatoms,&(top->idef),mu_aver,
+ +                        top_global->mols.nr,cr,
+ +                        state->box,total_vir,pres,
+ +                        mu_tot,state->x,f,bConverged);
+ +            debug_gmx();
+ +        }
+ +
+ +        /* #########  BEGIN PREPARING EDR OUTPUT  ###########  */
+ +        
+ +        /* use the directly determined last velocity, not actually the averaged half steps */
+ +        if (bTrotter && ir->eI==eiVV) 
+ +        {
+ +            enerd->term[F_EKIN] = last_ekin;
+ +        }
+ +        enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
+ +        
+ +        if (bVV)
+ +        {
+ +            enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
+ +        }
+ +        else 
+ +        {
+ +            enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + compute_conserved_from_auxiliary(ir,state,&MassQ);
+ +        }
+ +        /* Check for excessively large energies */
+ +        if (bIonize) 
+ +        {
+ +#ifdef GMX_DOUBLE
+ +            real etot_max = 1e200;
+ +#else
+ +            real etot_max = 1e30;
+ +#endif
+ +            if (fabs(enerd->term[F_ETOT]) > etot_max) 
+ +            {
+ +                fprintf(stderr,"Energy too large (%g), giving up\n",
+ +                        enerd->term[F_ETOT]);
+ +            }
+ +        }
+ +        /* #########  END PREPARING EDR OUTPUT  ###########  */
+ +        
+ +        /* Time for performance */
+ +        if (((step % stepout) == 0) || bLastStep) 
+ +        {
+ +            runtime_upd_proc(runtime);
+ +        }
+ +        
+ +        /* Output stuff */
+ +        if (MASTER(cr))
+ +        {
+ +            gmx_bool do_dr,do_or;
+ +            
+ +            if (fplog && do_log && bDoExpanded)
+ +            {
+ +                /* only needed if doing expanded ensemble */
+ +                PrintFreeEnergyInfoToFile(fplog,ir->fepvals,ir->expandedvals,ir->bSimTemp?ir->simtempvals:NULL,
+ +                                          &df_history,state->fep_state,ir->nstlog,step);
+ +            }
+ +            if (!(bStartingFromCpt && (EI_VV(ir->eI)))) 
+ +            {
+ +                if (bCalcEner)
+ +                {
+ +                    upd_mdebin(mdebin,bDoDHDL, TRUE,
+ +                               t,mdatoms->tmass,enerd,state,
+ +                               ir->fepvals,ir->expandedvals,lastbox,
+ +                               shake_vir,force_vir,total_vir,pres,
+ +                               ekind,mu_tot,constr);
+ +                }
+ +                else
+ +                {
+ +                    upd_mdebin_step(mdebin);
+ +                }
+ +                
+ +                do_dr  = do_per_step(step,ir->nstdisreout);
+ +                do_or  = do_per_step(step,ir->nstorireout);
+ +                
+ +                print_ebin(outf->fp_ene,do_ene,do_dr,do_or,do_log?fplog:NULL,
+ +                           step,t,
+ +                           eprNORMAL,bCompact,mdebin,fcd,groups,&(ir->opts));
+ +            }
+ +            if (ir->ePull != epullNO)
+ +            {
+ +                pull_print_output(ir->pull,step,t);
+ +            }
+ +            
+ +            if (do_per_step(step,ir->nstlog))
+ +            {
+ +                if(fflush(fplog) != 0)
+ +                {
+ +                    gmx_fatal(FARGS,"Cannot flush logfile - maybe you are out of disk space?");
+ +                }
+ +            }
+ +        }
+ +        if (bDoExpanded)
+ +        {
+ +            /* Have to do this part after outputting the logfile and the edr file */
+ +            state->fep_state = lamnew;
+ +            for (i=0;i<efptNR;i++)
+ +            {
+ +                state->lambda[i] = ir->fepvals->all_lambda[i][lamnew];
+ +            }
+ +        }
+ +        /* Remaining runtime */
+ +        if (MULTIMASTER(cr) && (do_verbose || gmx_got_usr_signal()) && !bPMETuneRunning)
+ +        {
+ +            if (shellfc) 
+ +            {
+ +                fprintf(stderr,"\n");
+ +            }
+ +            print_time(stderr,runtime,step,ir,cr);
+ +        }
+ +
+ +        /* Replica exchange */
+ +        bExchanged = FALSE;
+ +        if ((repl_ex_nst > 0) && (step > 0) && !bLastStep &&
+ +            do_per_step(step,repl_ex_nst)) 
+ +        {
+ +            bExchanged = replica_exchange(fplog,cr,repl_ex,
+ +                                          state_global,enerd,
+ +                                          state,step,t);
+ +
+ +            if (bExchanged && DOMAINDECOMP(cr)) 
+ +            {
+ +                dd_partition_system(fplog,step,cr,TRUE,1,
+ +                                    state_global,top_global,ir,
+ +                                    state,&f,mdatoms,top,fr,
+ +                                    vsite,shellfc,constr,
+ +                                    nrnb,wcycle,FALSE);
+ +            }
+ +        }
+ +        
+ +        bFirstStep = FALSE;
+ +        bInitStep = FALSE;
+ +        bStartingFromCpt = FALSE;
+ +
+ +        /* #######  SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
+ +        /* With all integrators, except VV, we need to retain the pressure
+ +         * at the current step for coupling at the next step.
+ +         */
+ +        if ((state->flags & (1<<estPRES_PREV)) &&
+ +            (bGStatEveryStep ||
+ +             (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
+ +        {
+ +            /* Store the pressure in t_state for pressure coupling
+ +             * at the next MD step.
+ +             */
+ +            copy_mat(pres,state->pres_prev);
+ +        }
+ +        
+ +        /* #######  END SET VARIABLES FOR NEXT ITERATION ###### */
+ +
+ +        if ( (membed!=NULL) && (!bLastStep) )
+ +        {
+ +            rescale_membed(step_rel,membed,state_global->x);
+ +        }
+ +
+ +        if (bRerunMD) 
+ +        {
+ +            if (MASTER(cr))
+ +            {
+ +                /* read next frame from input trajectory */
+ +                bNotLastFrame = read_next_frame(oenv,status,&rerun_fr);
+ +            }
+ +
+ +            if (PAR(cr))
+ +            {
+ +                rerun_parallel_comm(cr,&rerun_fr,&bNotLastFrame);
+ +            }
+ +        }
+ +        
+ +        if (!bRerunMD || !rerun_fr.bStep)
+ +        {
+ +            /* increase the MD step number */
+ +            step++;
+ +            step_rel++;
+ +        }
+ +        
+ +        cycles = wallcycle_stop(wcycle,ewcSTEP);
+ +        if (DOMAINDECOMP(cr) && wcycle)
+ +        {
+ +            dd_cycles_add(cr->dd,cycles,ddCyclStep);
+ +        }
+ +
+ +        if (bPMETuneRunning || bPMETuneTry)
+ +        {
+ +            /* PME grid + cut-off optimization with GPUs or PME nodes */
+ +
+ +            /* Count the total cycles over the last steps */
+ +            cycles_pmes += cycles;
+ +
+ +            /* We can only switch cut-off at NS steps */
+ +            if (step % ir->nstlist == 0)
+ +            {
+ +                /* PME grid + cut-off optimization with GPUs or PME nodes */
+ +                if (bPMETuneTry)
+ +                {
+ +                    if (DDMASTER(cr->dd))
+ +                    {
+ +                        /* PME node load is too high, start tuning */
+ +                        bPMETuneRunning = (dd_pme_f_ratio(cr->dd) >= 1.05);
+ +                    }
+ +                    dd_bcast(cr->dd,sizeof(gmx_bool),&bPMETuneRunning);
+ +
+ +                    if (bPMETuneRunning || step_rel > ir->nstlist*50)
+ +                    {
+ +                        bPMETuneTry     = FALSE;
+ +                    }
+ +                }
+ +                if (bPMETuneRunning)
+ +                {
+ +                    /* init_step might not be a multiple of nstlist,
+ +                     * but the first cycle is always skipped anyhow.
+ +                     */
+ +                    bPMETuneRunning =
-     
++                        pme_load_balance(pme_loadbal,cr,
++                                         (bVerbose && MASTER(cr)) ? stderr : NULL,
++                                         fplog,
++                                         ir,state,cycles_pmes,
++                                         fr->ic,fr->nbv,&fr->pmedata,
++                                         step);
+ +
+ +                    fr->ewaldcoeff = fr->ic->ewaldcoeff;
+ +                }
+ +
+ +                cycles_pmes = 0;
+ +            }
+ +        }
+ +        
+ +        if (step_rel == wcycle_get_reset_counters(wcycle) ||
+ +            gs.set[eglsRESETCOUNTERS] != 0)
+ +        {
+ +            /* Reset all the counters related to performance over the run */
+ +            reset_all_counters(fplog,cr,step,&step_rel,ir,wcycle,nrnb,runtime,
+ +                               fr->nbv != NULL && fr->nbv->bUseGPU ? fr->nbv->cu_nbv : NULL);
+ +            wcycle_set_reset_counters(wcycle,-1);
+ +            /* Correct max_hours for the elapsed time */
+ +            max_hours -= run_time/(60.0*60.0);
+ +            bResetCountersHalfMaxH = FALSE;
+ +            gs.set[eglsRESETCOUNTERS] = 0;
+ +        }
+ +
+ +    }
+ +    /* End of main MD loop */
+ +    debug_gmx();
+ +    
+ +    /* Stop the time */
+ +    runtime_end(runtime);
+ +    
+ +    if (bRerunMD && MASTER(cr))
+ +    {
+ +        close_trj(status);
+ +    }
+ +    
+ +    if (!(cr->duty & DUTY_PME))
+ +    {
+ +        /* Tell the PME only node to finish */
+ +        gmx_pme_send_finish(cr);
+ +    }
+ +    
+ +    if (MASTER(cr))
+ +    {
+ +        if (ir->nstcalcenergy > 0 && !bRerunMD) 
+ +        {
+ +            print_ebin(outf->fp_ene,FALSE,FALSE,FALSE,fplog,step,t,
+ +                       eprAVER,FALSE,mdebin,fcd,groups,&(ir->opts));
+ +        }
+ +    }
+ +
+ +    done_mdoutf(outf);
+ +
+ +    debug_gmx();
+ +
+ +    if (ir->nstlist == -1 && nlh.nns > 0 && fplog)
+ +    {
+ +        fprintf(fplog,"Average neighborlist lifetime: %.1f steps, std.dev.: %.1f steps\n",nlh.s1/nlh.nns,sqrt(nlh.s2/nlh.nns - sqr(nlh.s1/nlh.nns)));
+ +        fprintf(fplog,"Average number of atoms that crossed the half buffer length: %.1f\n\n",nlh.ab/nlh.nns);
+ +    }
++
++    if (pme_loadbal != NULL)
++    {
++        pme_loadbal_done(pme_loadbal,fplog);
++    }
++
+ +    if (shellfc && fplog)
+ +    {
+ +        fprintf(fplog,"Fraction of iterations that converged:           %.2f %%\n",
+ +                (nconverged*100.0)/step_rel);
+ +        fprintf(fplog,"Average number of force evaluations per MD step: %.2f\n\n",
+ +                tcount/step_rel);
+ +    }
+ +    
+ +    if (repl_ex_nst > 0 && MASTER(cr))
+ +    {
+ +        print_replica_exchange_statistics(fplog,repl_ex);
+ +    }
+ +    
+ +    runtime->nsteps_done = step_rel;
+ +
+ +   return 0;
+ +}
diff --cc src/programs/mdrun/pme_loadbal.c

index 0000000000000000000000000000000000000000,0000000000000000000000000000000000000000..c96d61eac613f0ccde62ddc10b44edc915a38b28

new file mode 100644 (file)
--- /dev/null
--- /dev/null
+++ b/src/programs/mdrun/pme_loadbal.c
@@@ -1,0 -1,0 +1,650 @@@
++/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
++ *
++ * 
++ *                This source code is part of
++ * 
++ *                 G   R   O   M   A   C   S
++ * 
++ *          GROningen MAchine for Chemical Simulations
++ * 
++ *                        VERSION 4.6.0
++ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
++ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
++ * Copyright (c) 2001-2011, The GROMACS development team,
++ * check out http://www.gromacs.org for more information.
++
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version 2
++ * of the License, or (at your option) any later version.
++ * 
++ * If you want to redistribute modifications, please consider that
++ * scientific software is very special. Version control is crucial -
++ * bugs must be traceable. We will be happy to consider code for
++ * inclusion in the official distribution, but derived work must not
++ * be called official GROMACS. Details are found in the README & COPYING
++ * files - if they are missing, get the official version at www.gromacs.org.
++ * 
++ * To help us fund GROMACS development, we humbly ask that you cite
++ * the papers on the package - you can find them in the top README file.
++ * 
++ * For more info, check our website at http://www.gromacs.org
++ * 
++ * And Hey:
++ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
++ */
++#ifdef HAVE_CONFIG_H
++#include <config.h>
++#endif
++
++#include "smalloc.h"
++#include "network.h"
++#include "calcgrid.h"
++#include "pme.h"
++#include "vec.h"
++#include "domdec.h"
++#include "nbnxn_cuda_data_mgmt.h"
++#include "force.h"
++#include "macros.h"
++#include "pme_loadbal.h"
++
++/* Parameters and setting for one PP-PME setup */
++typedef struct {
++    real rcut;            /* Coulomb cut-off                              */
++    real rlist;           /* pair-list cut-off                            */
++    real spacing;         /* (largest) PME grid spacing                   */
++    ivec grid;            /* the PME grid dimensions                      */
++    real grid_efficiency; /* ineffiency factor for non-uniform grids <= 1 */
++    real ewaldcoeff;      /* the Ewald coefficient                        */
++    gmx_pme_t pmedata;    /* the data structure used in the PME code      */
++
++    int  count;           /* number of times this setup has been timed    */
++    double cycles;        /* the fastest time for this setup in cycles    */
++} pme_setup_t;
++
++/* In the initial scan, step by grids that are at least a factor 0.8 coarser */
++#define PME_LB_GRID_SCALE_FAC  0.8
++/* In the initial scan, try to skip grids with uneven x/y/z spacing,
++ * checking if the "efficiency" is more than 5% worse than the previous grid.
++ */
++#define PME_LB_GRID_EFFICIENCY_REL_FAC  1.05
++/* Rerun up till 12% slower setups than the fastest up till now */
++#define PME_LB_SLOW_FAC  1.12
++/* If setups get more than 2% faster, do another round to avoid
++ * choosing a slower setup due to acceleration or fluctuations.
++ */
++#define PME_LB_ACCEL_TOL 1.02
++
++enum { epmelblimNO, epmelblimBOX, epmelblimDD, epmelblimNR };
++
++const char *pmelblim_str[epmelblimNR] =
++{ "no", "box size", "domain decompostion" };
++
++struct pme_load_balancing {
++    int  nstage;        /* the current maximum number of stages */
++
++    real cut_spacing;   /* the minimum cutoff / PME grid spacing ratio */
++    real rbuf;          /* the pairlist buffer size */
++    matrix box_start;   /* the initial simulation box */
++    int n;              /* the count of setup as well as the allocation size */
++    pme_setup_t *setup; /* the PME+cutoff setups */
++    int cur;            /* the current setup */
++    int fastest;        /* fastest setup up till now */
++    int start;          /* start of setup range to consider in stage>0 */
++    int end;            /* end   of setup range to consider in stage>0 */
++    int elimited;       /* was the balancing limited, uses enum above */
++
++    int stage;          /* the current stage */
++};
++
++void pme_loadbal_init(pme_load_balancing_t *pme_lb_p,
++                      const t_inputrec *ir,matrix box,
++                      const interaction_const_t *ic,
++                      gmx_pme_t pmedata)
++{
++    pme_load_balancing_t pme_lb;
++    real spm,sp;
++    int  d;
++
++    snew(pme_lb,1);
++
++    /* Any number of stages >= 2 is supported */
++    pme_lb->nstage   = 2;
++
++    pme_lb->rbuf = ic->rlist - ic->rcoulomb;
++
++    copy_mat(box,pme_lb->box_start);
++    if (ir->ePBC==epbcXY && ir->nwall==2)
++    {
++        svmul(ir->wall_ewald_zfac,pme_lb->box_start[ZZ],pme_lb->box_start[ZZ]);
++    }
++
++    pme_lb->n = 1;
++    snew(pme_lb->setup,pme_lb->n);
++
++    pme_lb->cur = 0;
++    pme_lb->setup[0].rcut       = ic->rcoulomb;
++    pme_lb->setup[0].rlist      = ic->rlist;
++    pme_lb->setup[0].grid[XX]   = ir->nkx;
++    pme_lb->setup[0].grid[YY]   = ir->nky;
++    pme_lb->setup[0].grid[ZZ]   = ir->nkz;
++    pme_lb->setup[0].ewaldcoeff = ic->ewaldcoeff;
++
++    pme_lb->setup[0].pmedata  = pmedata;
++    
++    spm = 0;
++    for(d=0; d<DIM; d++)
++    {
++        sp = norm(pme_lb->box_start[d])/pme_lb->setup[0].grid[d];
++        if (sp > spm)
++        {
++            spm = sp;
++        }
++    }
++    pme_lb->setup[0].spacing = spm;
++
++    if (ir->fourier_spacing > 0)
++    {
++        pme_lb->cut_spacing = ir->rcoulomb/ir->fourier_spacing;
++    }
++    else
++    {
++        pme_lb->cut_spacing = ir->rcoulomb/pme_lb->setup[0].spacing;
++    }
++
++    pme_lb->stage = 0;
++
++    pme_lb->fastest  = 0;
++    pme_lb->start    = 0;
++    pme_lb->end      = 0;
++    pme_lb->elimited = epmelblimNO;
++
++    *pme_lb_p = pme_lb;
++}
++
++static gmx_bool pme_loadbal_increase_cutoff(pme_load_balancing_t pme_lb,
++                                            int pme_order)
++{
++    pme_setup_t *set;
++    real fac,sp;
++    int d;
++
++    /* Try to add a new setup with next larger cut-off to the list */
++    pme_lb->n++;
++    srenew(pme_lb->setup,pme_lb->n);
++    set = &pme_lb->setup[pme_lb->n-1];
++    set->pmedata = NULL;
++
++    fac = 1;
++    do
++    {
++        fac *= 1.01;
++        clear_ivec(set->grid);
++        sp = calc_grid(NULL,pme_lb->box_start,
++                       fac*pme_lb->setup[pme_lb->cur].spacing,
++                       &set->grid[XX],
++                       &set->grid[YY],
++                       &set->grid[ZZ]);
++
++        /* In parallel we can't have grids smaller than 2*pme_order,
++         * and we would anyhow not gain much speed at these grid sizes.
++         */
++        for(d=0; d<DIM; d++)
++        {
++            if (set->grid[d] <= 2*pme_order)
++            {
++                pme_lb->n--;
++
++                return FALSE;
++            }
++        }
++    }
++    while (sp <= 1.001*pme_lb->setup[pme_lb->cur].spacing);
++
++    set->rcut    = pme_lb->cut_spacing*sp;
++    set->rlist   = set->rcut + pme_lb->rbuf;
++    set->spacing = sp;
++    /* The grid efficiency is the size wrt a grid with uniform x/y/z spacing */
++    set->grid_efficiency = 1;
++    for(d=0; d<DIM; d++)
++    {
++        set->grid_efficiency *= (set->grid[d]*sp)/norm(pme_lb->box_start[d]);
++    }
++    /* The Ewald coefficient is inversly proportional to the cut-off */
++    set->ewaldcoeff =
++        pme_lb->setup[0].ewaldcoeff*pme_lb->setup[0].rcut/set->rcut;
++
++    set->count   = 0;
++    set->cycles  = 0;
++
++    if (debug)
++    {
++        fprintf(debug,"PME loadbal: grid %d %d %d, cutoff %f\n",
++                set->grid[XX],set->grid[YY],set->grid[ZZ],set->rcut);
++    }
++
++    return TRUE;
++}
++
++static void print_grid(FILE *fp_err,FILE *fp_log,
++                       const char *pre,
++                       const char *desc,
++                       const pme_setup_t *set,
++                       double cycles)
++{
++    char buf[STRLEN],buft[STRLEN];
++    
++    if (cycles >= 0)
++    {
++        sprintf(buft,": %.1f M-cycles",cycles*1e-6);
++    }
++    else
++    {
++        buft[0] = '\0';
++    }
++    sprintf(buf,"%-11s%10s pme grid %d %d %d, cutoff %.3f%s",
++            pre,
++            desc,set->grid[XX],set->grid[YY],set->grid[ZZ],set->rcut,
++            buft);
++    if (fp_err != NULL)
++    {
++        fprintf(fp_err,"\r%s\n",buf);
++    }
++    if (fp_log != NULL)
++    {
++        fprintf(fp_log,"%s\n",buf);
++    }
++}
++
++static int pme_loadbal_end(pme_load_balancing_t pme_lb)
++{
++    /* In the initial stage only n is set; end is not set yet */
++    if (pme_lb->end > 0)
++    {
++        return pme_lb->end;
++    }
++    else
++    {
++        return pme_lb->n;
++    }
++}
++
++static void print_loadbal_limited(FILE *fp_err,FILE *fp_log,
++                                  gmx_large_int_t step,
++                                  pme_load_balancing_t pme_lb)
++{
++    char buf[STRLEN],sbuf[22];
++
++    sprintf(buf,"step %4s: the %s limited the PME load balancing to a cut-off of %.3f",
++            gmx_step_str(step,sbuf),
++            pmelblim_str[pme_lb->elimited],
++            pme_lb->setup[pme_loadbal_end(pme_lb)].rcut);
++    if (fp_err != NULL)
++    {
++        fprintf(fp_err,"\r%s\n",buf);
++    }
++    if (fp_log != NULL)
++    {
++        fprintf(fp_log,"%s\n",buf);
++    }
++}
++
++static void switch_to_stage1(pme_load_balancing_t pme_lb)
++{
++    pme_lb->start = 0;
++    while (pme_lb->start+1 < pme_lb->n &&
++           (pme_lb->setup[pme_lb->start].count == 0 ||
++            pme_lb->setup[pme_lb->start].cycles >
++            pme_lb->setup[pme_lb->fastest].cycles*PME_LB_SLOW_FAC))
++    {
++        pme_lb->start++;
++    }
++    while (pme_lb->start > 0 && pme_lb->setup[pme_lb->start-1].cycles == 0)
++    {
++        pme_lb->start--;
++    }
++
++    pme_lb->end = pme_lb->n;
++    if (pme_lb->setup[pme_lb->end-1].count > 0 &&
++        pme_lb->setup[pme_lb->end-1].cycles >
++        pme_lb->setup[pme_lb->fastest].cycles*PME_LB_SLOW_FAC)
++    {
++        pme_lb->end--;
++    }
++
++    pme_lb->stage = 1;
++
++    /* Next we want to choose setup pme_lb->start, but as we will increase
++     * pme_ln->cur by one right after returning, we subtract 1 here.
++     */
++    pme_lb->cur = pme_lb->start - 1;
++}
++
++gmx_bool pme_load_balance(pme_load_balancing_t pme_lb,
++                          t_commrec *cr,
++                          FILE *fp_err,
++                          FILE *fp_log,
++                          t_inputrec *ir,
++                          t_state *state,
++                          double cycles,
++                          interaction_const_t *ic,
++                          nonbonded_verlet_t *nbv,
++                          gmx_pme_t *pmedata,
++                          gmx_large_int_t step)
++{
++    gmx_bool OK;
++    pme_setup_t *set;
++    double cycles_fast;
++    char buf[STRLEN],sbuf[22];
++
++    if (pme_lb->stage == pme_lb->nstage)
++    {
++        return FALSE;
++    }
++
++    if (PAR(cr))
++    {
++        gmx_sumd(1,&cycles,cr);
++        cycles /= cr->nnodes;
++    }
++
++    set = &pme_lb->setup[pme_lb->cur];
++
++    set->count++;
++    if (set->count % 2 == 1)
++    {
++        /* Skip the first cycle, because the first step after a switch
++         * is much slower due to allocation and/or caching effects.
++         */
++        return TRUE;
++    }
++
++    sprintf(buf, "step %4s: ", gmx_step_str(step,sbuf));
++    print_grid(fp_err,fp_log,buf,"timed with",set,cycles);
++
++    if (set->count <= 2)
++    {
++        set->cycles = cycles;
++    }
++    else
++    {
++        if (cycles*PME_LB_ACCEL_TOL < set->cycles &&
++            pme_lb->stage == pme_lb->nstage - 1)
++        {
++            /* The performance went up a lot (due to e.g. DD load balancing).
++             * Add a stage, keep the minima, but rescan all setups.
++             */
++            pme_lb->nstage++;
++
++            if (debug)
++            {
++                fprintf(debug,"The performance for grid %d %d %d went from %.3f to %.1f M-cycles, this is more than %f\n"
++                        "Increased the number stages to %d"
++                        " and ignoring the previous performance\n",
++                        set->grid[XX],set->grid[YY],set->grid[ZZ],
++                        cycles*1e-6,set->cycles*1e-6,PME_LB_ACCEL_TOL,
++                        pme_lb->nstage);
++            }
++        }
++        set->cycles = min(set->cycles,cycles);
++    }
++
++    if (set->cycles < pme_lb->setup[pme_lb->fastest].cycles)
++    {
++        pme_lb->fastest = pme_lb->cur;
++    }
++    cycles_fast = pme_lb->setup[pme_lb->fastest].cycles;
++
++    /* Check in stage 0 if we should stop scanning grids.
++     * Stop when the time is more than SLOW_FAC longer than the fastest.
++     */
++    if (pme_lb->stage == 0 && pme_lb->cur > 0 &&
++        cycles > pme_lb->setup[pme_lb->fastest].cycles*PME_LB_SLOW_FAC)
++    {
++        pme_lb->n = pme_lb->cur + 1;
++        /* Done with scanning, go to stage 1 */
++        switch_to_stage1(pme_lb);
++    }
++
++    if (pme_lb->stage == 0)
++    {
++        int gridsize_start;
++
++        gridsize_start = set->grid[XX]*set->grid[YY]*set->grid[ZZ];
++
++        do
++        {
++            if (pme_lb->cur+1 < pme_lb->n)
++            {
++                /* We had already generated the next setup */
++                OK = TRUE;
++            }
++            else
++            {
++                /* Find the next setup */
++                OK = pme_loadbal_increase_cutoff(pme_lb,ir->pme_order);
++            }
++                
++            if (OK && ir->ePBC != epbcNONE)
++            {
++                OK = (sqr(pme_lb->setup[pme_lb->cur+1].rlist)
++                      <= max_cutoff2(ir->ePBC,state->box));
++                if (!OK)
++                {
++                    pme_lb->elimited = epmelblimBOX;
++                }
++            }
++
++            if (OK)
++            {
++                pme_lb->cur++;
++
++                if (DOMAINDECOMP(cr))
++                {
++                    OK = change_dd_cutoff(cr,state,ir,
++                                          pme_lb->setup[pme_lb->cur].rlist);
++                    if (!OK)
++                    {
++                        /* Failed: do not use this setup */
++                        pme_lb->cur--;
++                        pme_lb->elimited = epmelblimDD;
++                    }
++                }
++            }
++            if (!OK)
++            {
++                /* We hit the upper limit for the cut-off,
++                 * the setup should not go further than cur.
++                 */
++                pme_lb->n = pme_lb->cur + 1;
++                print_loadbal_limited(fp_err,fp_log,step,pme_lb);
++                /* Switch to the next stage */
++                switch_to_stage1(pme_lb);
++            }
++        }
++        while (OK &&
++               !(pme_lb->setup[pme_lb->cur].grid[XX]*
++                 pme_lb->setup[pme_lb->cur].grid[YY]*
++                 pme_lb->setup[pme_lb->cur].grid[ZZ] <
++                 gridsize_start*PME_LB_GRID_SCALE_FAC
++                 &&
++                 pme_lb->setup[pme_lb->cur].grid_efficiency <
++                 pme_lb->setup[pme_lb->cur-1].grid_efficiency*PME_LB_GRID_EFFICIENCY_REL_FAC));
++    }
++
++    if (pme_lb->stage > 0 && pme_lb->end == 1)
++    {
++        pme_lb->cur = 0;
++        pme_lb->stage = pme_lb->nstage;
++    }
++    else if (pme_lb->stage > 0 && pme_lb->end > 1)
++    {
++        /* If stage = nstage-1:
++         *   scan over all setups, rerunning only those setups
++         *   which are not much slower than the fastest
++         * else:
++         *   use the next setup
++         */
++        do
++        {
++            pme_lb->cur++;
++            if (pme_lb->cur == pme_lb->end)
++            {
++                pme_lb->stage++;
++                pme_lb->cur = pme_lb->start;
++            }
++        }
++        while (pme_lb->stage == pme_lb->nstage - 1 &&
++               pme_lb->setup[pme_lb->cur].count > 0 &&
++               pme_lb->setup[pme_lb->cur].cycles > cycles_fast*PME_LB_SLOW_FAC);
++
++        if (pme_lb->stage == pme_lb->nstage)
++        {
++            /* We are done optimizing, use the fastest setup we found */
++            pme_lb->cur = pme_lb->fastest;
++        }
++    }
++
++    if (DOMAINDECOMP(cr) && pme_lb->stage > 0)
++    {
++        OK = change_dd_cutoff(cr,state,ir,pme_lb->setup[pme_lb->cur].rlist);
++        if (!OK)
++        {
++            /* Failsafe solution */
++            if (pme_lb->cur > 1 && pme_lb->stage == pme_lb->nstage)
++            {
++                pme_lb->stage--;
++            }
++            pme_lb->fastest  = 0;
++            pme_lb->start    = 0;
++            pme_lb->end      = pme_lb->cur;
++            pme_lb->cur      = pme_lb->start;
++            pme_lb->elimited = epmelblimDD;
++            print_loadbal_limited(fp_err,fp_log,step,pme_lb);
++        }
++    }
++
++    /* Change the Coulomb cut-off and the PME grid */
++
++    set = &pme_lb->setup[pme_lb->cur];
++
++    ic->rcoulomb   = set->rcut;
++    ic->rlist      = set->rlist;
++    ic->ewaldcoeff = set->ewaldcoeff;
++
++    if (nbv->grp[0].kernel_type == nbk8x8x8_CUDA)
++    {
++        nbnxn_cuda_pme_loadbal_update_param(nbv->cu_nbv,ic);
++    }
++    else
++    {
++        init_interaction_const_tables(NULL,ic,nbv->grp[0].kernel_type);
++    }
++
++    if (nbv->ngrp > 1)
++    {
++        init_interaction_const_tables(NULL,ic,nbv->grp[1].kernel_type);
++    }
++
++    if (cr->duty & DUTY_PME)
++    {
++        if (pme_lb->setup[pme_lb->cur].pmedata == NULL)
++        {
++            /* Generate a new PME data structure,
++             * copying part of the old pointers.
++             */
++            gmx_pme_reinit(&set->pmedata,
++                           cr,pme_lb->setup[0].pmedata,ir,
++                           set->grid);
++        }
++        *pmedata = set->pmedata;
++    }
++    else
++    {
++        /* Tell our PME-only node to switch grid */
++        gmx_pme_send_switch(cr, set->grid, set->ewaldcoeff);
++    }
++
++    if (debug)
++    {
++        print_grid(NULL,debug,"","switched to",set,-1);
++    }
++
++    if (pme_lb->stage == pme_lb->nstage)
++    {
++        print_grid(fp_err,fp_log,"","optimal",set,-1);
++    }
++
++    return TRUE;
++}
++
++void restart_pme_loadbal(pme_load_balancing_t pme_lb, int n)
++{
++    pme_lb->nstage += n;
++}
++
++static int pme_grid_points(const pme_setup_t *setup)
++{
++    return setup->grid[XX]*setup->grid[YY]*setup->grid[ZZ];
++}
++
++static void print_pme_loadbal_setting(FILE *fplog,
++                                     char *name,
++                                     const pme_setup_t *setup)
++{
++    fprintf(fplog,
++            "   %-7s %6.3f nm %6.3f nm     %3d %3d %3d   %5.3f nm  %5.3f nm\n",
++            name,
++            setup->rcut,setup->rlist,
++            setup->grid[XX],setup->grid[YY],setup->grid[ZZ],
++            setup->spacing,1/setup->ewaldcoeff);
++}
++
++static void print_pme_loadbal_settings(pme_load_balancing_t pme_lb,
++                                       FILE *fplog)
++{
++    double pp_ratio,grid_ratio;
++
++    pp_ratio   = pow(pme_lb->setup[pme_lb->cur].rlist/pme_lb->setup[0].rlist,3.0);
++    grid_ratio = pme_grid_points(&pme_lb->setup[pme_lb->cur])/
++        (double)pme_grid_points(&pme_lb->setup[0]);
++
++    fprintf(fplog,"\n");
++    fprintf(fplog,"       P P   -   P M E   L O A D   B A L A N C I N G\n");
++    fprintf(fplog,"\n");
++    /* Here we only warn when the optimal setting is the last one */
++    if (pme_lb->elimited != epmelblimNO &&
++        pme_lb->cur == pme_loadbal_end(pme_lb)-1)
++    {
++        fprintf(fplog," NOTE: The PP/PME load balancing was limited by the %s,\n",
++                pmelblim_str[pme_lb->elimited]);
++        fprintf(fplog,"       you might not have reached a good load balance.\n");
++        if (pme_lb->elimited == epmelblimDD)
++        {
++            fprintf(fplog,"       Try different mdrun -dd settings or lower the -dds value.\n");
++        }
++        fprintf(fplog,"\n");
++    }
++    fprintf(fplog," PP/PME load balancing changed the cut-off and PME settings:\n");
++    fprintf(fplog,"           particle-particle                    PME\n");
++    fprintf(fplog,"            rcoulomb  rlist            grid      spacing   1/beta\n");
++    print_pme_loadbal_setting(fplog,"initial",&pme_lb->setup[0]);
++    print_pme_loadbal_setting(fplog,"final"  ,&pme_lb->setup[pme_lb->cur]);
++    fprintf(fplog," cost-ratio           %4.2f             %4.2f\n",
++            pp_ratio,grid_ratio);
++    fprintf(fplog," (note that these numbers concern only part of the total PP and PME load)\n");
++    fprintf(fplog,"\n");
++}
++
++void pme_loadbal_done(pme_load_balancing_t pme_lb, FILE *fplog)
++{
++    if (fplog != NULL && (pme_lb->cur > 0 || pme_lb->elimited != epmelblimNO))
++    {
++        print_pme_loadbal_settings(pme_lb,fplog);
++    }
++
++    /* TODO: Here we should free all pointers in pme_lb,
++     * but as it contains pme data structures,
++     * we need to first make pme.c free all data.
++     */
++}
diff --cc src/programs/mdrun/pme_loadbal.h

index 0000000000000000000000000000000000000000,0254a29153b588805ec7ef975844df63dc9e1eaf..0254a29153b588805ec7ef975844df63dc9e1eaf

mode 000000,100644..100644
--- /dev/null
--- 2/src/kernel/pme_loadbal.h
+++ b/src/programs/mdrun/pme_loadbal.h
diff --cc src/programs/mdrun/runner.c

index ab7884ac21c1e45835ee21bbb41a4e8bd2aae085,0000000000000000000000000000000000000000..1b9803446c7e044b3397a070f422f2fb19dcb4fa

mode 100644,000000..100644
--- 1/src/programs/mdrun/runner.c
--- /dev/null
+++ b/src/programs/mdrun/runner.c
@@@ -1,1776 -1,0 +1,1776 @@@
-     ret=tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi,
+ +/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ + *
+ + * 
+ + *                This source code is part of
+ + * 
+ + *                 G   R   O   M   A   C   S
+ + * 
+ + *          GROningen MAchine for Chemical Simulations
+ + * 
+ + *                        VERSION 3.2.0
+ + * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ + * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ + * Copyright (c) 2001-2004, The GROMACS development team,
+ + * check out http://www.gromacs.org for more information.
+ +
+ + * This program is free software; you can redistribute it and/or
+ + * modify it under the terms of the GNU General Public License
+ + * as published by the Free Software Foundation; either version 2
+ + * of the License, or (at your option) any later version.
+ + * 
+ + * If you want to redistribute modifications, please consider that
+ + * scientific software is very special. Version control is crucial -
+ + * bugs must be traceable. We will be happy to consider code for
+ + * inclusion in the official distribution, but derived work must not
+ + * be called official GROMACS. Details are found in the README & COPYING
+ + * files - if they are missing, get the official version at www.gromacs.org.
+ + * 
+ + * To help us fund GROMACS development, we humbly ask that you cite
+ + * the papers on the package - you can find them in the top README file.
+ + * 
+ + * For more info, check our website at http://www.gromacs.org
+ + * 
+ + * And Hey:
+ + * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ + */
+ +#ifdef HAVE_CONFIG_H
+ +#include <config.h>
+ +#endif
+ +#ifdef __linux
+ +#define _GNU_SOURCE
+ +#include <sched.h>
+ +#include <sys/syscall.h>
+ +#endif
+ +#include <signal.h>
+ +#include <stdlib.h>
+ +#ifdef HAVE_UNISTD_H
+ +#include <unistd.h>
+ +#endif
+ +#include <string.h>
+ +#include <assert.h>
+ +
+ +#include "typedefs.h"
+ +#include "smalloc.h"
+ +#include "sysstuff.h"
+ +#include "statutil.h"
+ +#include "mdrun.h"
+ +#include "md_logging.h"
+ +#include "md_support.h"
+ +#include "network.h"
+ +#include "pull.h"
+ +#include "pull_rotation.h"
+ +#include "names.h"
+ +#include "disre.h"
+ +#include "orires.h"
+ +#include "pme.h"
+ +#include "mdatoms.h"
+ +#include "repl_ex.h"
+ +#include "qmmm.h"
+ +#include "domdec.h"
+ +#include "partdec.h"
+ +#include "coulomb.h"
+ +#include "constr.h"
+ +#include "mvdata.h"
+ +#include "checkpoint.h"
+ +#include "mtop_util.h"
+ +#include "sighandler.h"
+ +#include "tpxio.h"
+ +#include "txtdump.h"
+ +#include "gmx_detect_hardware.h"
+ +#include "gmx_omp_nthreads.h"
+ +#include "pull_rotation.h"
+ +#include "calc_verletbuf.h"
+ +#include "nbnxn_search.h"
+ +#include "../mdlib/nbnxn_consts.h"
+ +#include "gmx_fatal_collective.h"
+ +#include "membed.h"
+ +#include "macros.h"
+ +#include "gmx_omp.h"
+ +
+ +#ifdef GMX_LIB_MPI
+ +#include <mpi.h>
+ +#endif
+ +#ifdef GMX_THREAD_MPI
+ +#include "tmpi.h"
+ +#endif
+ +
+ +#ifdef GMX_FAHCORE
+ +#include "corewrap.h"
+ +#endif
+ +
+ +#ifdef GMX_OPENMM
+ +#include "md_openmm.h"
+ +#endif
+ +
+ +#include "gpu_utils.h"
+ +#include "nbnxn_cuda_data_mgmt.h"
+ +
+ +typedef struct { 
+ +    gmx_integrator_t *func;
+ +} gmx_intp_t;
+ +
+ +/* The array should match the eI array in include/types/enums.h */
+ +#ifdef GMX_OPENMM  /* FIXME do_md_openmm needs fixing */
+ +const gmx_intp_t integrator[eiNR] = { {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm},{do_md_openmm}};
+ +#else
+ +const gmx_intp_t integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md},{do_md}};
+ +#endif
+ +
+ +gmx_large_int_t     deform_init_init_step_tpx;
+ +matrix              deform_init_box_tpx;
+ +#ifdef GMX_THREAD_MPI
+ +tMPI_Thread_mutex_t deform_init_box_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
+ +#endif
+ +
+ +
+ +#ifdef GMX_THREAD_MPI
+ +struct mdrunner_arglist
+ +{
+ +    gmx_hw_opt_t *hw_opt;
+ +    FILE *fplog;
+ +    t_commrec *cr;
+ +    int nfile;
+ +    const t_filenm *fnm;
+ +    output_env_t oenv;
+ +    gmx_bool bVerbose;
+ +    gmx_bool bCompact;
+ +    int nstglobalcomm;
+ +    ivec ddxyz;
+ +    int dd_node_order;
+ +    real rdd;
+ +    real rconstr;
+ +    const char *dddlb_opt;
+ +    real dlb_scale;
+ +    const char *ddcsx;
+ +    const char *ddcsy;
+ +    const char *ddcsz;
+ +    const char *nbpu_opt;
+ +    int nsteps_cmdline;
+ +    int nstepout;
+ +    int resetstep;
+ +    int nmultisim;
+ +    int repl_ex_nst;
+ +    int repl_ex_nex;
+ +    int repl_ex_seed;
+ +    real pforce;
+ +    real cpt_period;
+ +    real max_hours;
+ +    const char *deviceOptions;
+ +    unsigned long Flags;
+ +    int ret; /* return value */
+ +};
+ +
+ +
+ +/* The function used for spawning threads. Extracts the mdrunner() 
+ +   arguments from its one argument and calls mdrunner(), after making
+ +   a commrec. */
+ +static void mdrunner_start_fn(void *arg)
+ +{
+ +    struct mdrunner_arglist *mda=(struct mdrunner_arglist*)arg;
+ +    struct mdrunner_arglist mc=*mda; /* copy the arg list to make sure 
+ +                                        that it's thread-local. This doesn't
+ +                                        copy pointed-to items, of course,
+ +                                        but those are all const. */
+ +    t_commrec *cr;  /* we need a local version of this */
+ +    FILE *fplog=NULL;
+ +    t_filenm *fnm;
+ +
+ +    fnm = dup_tfn(mc.nfile, mc.fnm);
+ +
+ +    cr = init_par_threads(mc.cr);
+ +
+ +    if (MASTER(cr))
+ +    {
+ +        fplog=mc.fplog;
+ +    }
+ +
+ +    mda->ret=mdrunner(mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv, 
+ +                      mc.bVerbose, mc.bCompact, mc.nstglobalcomm, 
+ +                      mc.ddxyz, mc.dd_node_order, mc.rdd,
+ +                      mc.rconstr, mc.dddlb_opt, mc.dlb_scale, 
+ +                      mc.ddcsx, mc.ddcsy, mc.ddcsz,
+ +                      mc.nbpu_opt,
+ +                      mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
+ +                      mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce, 
+ +                      mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.Flags);
+ +}
+ +
+ +/* called by mdrunner() to start a specific number of threads (including 
+ +   the main thread) for thread-parallel runs. This in turn calls mdrunner()
+ +   for each thread. 
+ +   All options besides nthreads are the same as for mdrunner(). */
+ +static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt, 
+ +              FILE *fplog,t_commrec *cr,int nfile, 
+ +              const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
+ +              gmx_bool bCompact, int nstglobalcomm,
+ +              ivec ddxyz,int dd_node_order,real rdd,real rconstr,
+ +              const char *dddlb_opt,real dlb_scale,
+ +              const char *ddcsx,const char *ddcsy,const char *ddcsz,
+ +              const char *nbpu_opt,
+ +              int nsteps_cmdline, int nstepout,int resetstep,
+ +              int nmultisim,int repl_ex_nst,int repl_ex_nex, int repl_ex_seed,
+ +              real pforce,real cpt_period, real max_hours, 
+ +              const char *deviceOptions, unsigned long Flags)
+ +{
+ +    int ret;
+ +    struct mdrunner_arglist *mda;
+ +    t_commrec *crn; /* the new commrec */
+ +    t_filenm *fnmn;
+ +
+ +    /* first check whether we even need to start tMPI */
+ +    if (hw_opt->nthreads_tmpi < 2)
+ +    {
+ +        return cr;
+ +    }
+ +
+ +    /* a few small, one-time, almost unavoidable memory leaks: */
+ +    snew(mda,1);
+ +    fnmn=dup_tfn(nfile, fnm);
+ +
+ +    /* fill the data structure to pass as void pointer to thread start fn */
+ +    mda->hw_opt=hw_opt;
+ +    mda->fplog=fplog;
+ +    mda->cr=cr;
+ +    mda->nfile=nfile;
+ +    mda->fnm=fnmn;
+ +    mda->oenv=oenv;
+ +    mda->bVerbose=bVerbose;
+ +    mda->bCompact=bCompact;
+ +    mda->nstglobalcomm=nstglobalcomm;
+ +    mda->ddxyz[XX]=ddxyz[XX];
+ +    mda->ddxyz[YY]=ddxyz[YY];
+ +    mda->ddxyz[ZZ]=ddxyz[ZZ];
+ +    mda->dd_node_order=dd_node_order;
+ +    mda->rdd=rdd;
+ +    mda->rconstr=rconstr;
+ +    mda->dddlb_opt=dddlb_opt;
+ +    mda->dlb_scale=dlb_scale;
+ +    mda->ddcsx=ddcsx;
+ +    mda->ddcsy=ddcsy;
+ +    mda->ddcsz=ddcsz;
+ +    mda->nbpu_opt=nbpu_opt;
+ +    mda->nsteps_cmdline=nsteps_cmdline;
+ +    mda->nstepout=nstepout;
+ +    mda->resetstep=resetstep;
+ +    mda->nmultisim=nmultisim;
+ +    mda->repl_ex_nst=repl_ex_nst;
+ +    mda->repl_ex_nex=repl_ex_nex;
+ +    mda->repl_ex_seed=repl_ex_seed;
+ +    mda->pforce=pforce;
+ +    mda->cpt_period=cpt_period;
+ +    mda->max_hours=max_hours;
+ +    mda->deviceOptions=deviceOptions;
+ +    mda->Flags=Flags;
+ +
+ +    fprintf(stderr, "Starting %d tMPI threads\n",hw_opt->nthreads_tmpi);
+ +    fflush(stderr);
+ +    /* now spawn new threads that start mdrunner_start_fn(), while 
+ +       the main thread returns */
++    ret=tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi, TMPI_AFFINITY_ALL_CORES,
+ +                     mdrunner_start_fn, (void*)(mda) );
+ +    if (ret!=TMPI_SUCCESS)
+ +        return NULL;
+ +
+ +    /* make a new comm_rec to reflect the new situation */
+ +    crn=init_par_threads(cr);
+ +    return crn;
+ +}
+ +
+ +
+ +static int get_tmpi_omp_thread_distribution(const gmx_hw_opt_t *hw_opt,
+ +                                            int nthreads_tot,
+ +                                            int ngpu)
+ +{
+ +    int nthreads_tmpi;
+ +
+ +    /* There are no separate PME nodes here, as we ensured in
+ +     * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
+ +     * and a conditional ensures we would not have ended up here.
+ +     * Note that separate PME nodes might be switched on later.
+ +     */
+ +    if (ngpu > 0)
+ +    {
+ +        nthreads_tmpi = ngpu;
+ +        if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
+ +        {
+ +            nthreads_tmpi = nthreads_tot;
+ +        }
+ +    }
+ +    else if (hw_opt->nthreads_omp > 0)
+ +    {
+ +        if (hw_opt->nthreads_omp > nthreads_tot)
+ +        {
+ +            gmx_fatal(FARGS,"More OpenMP threads requested (%d) than the total number of threads requested (%d)",hw_opt->nthreads_omp,nthreads_tot);
+ +        }
+ +        nthreads_tmpi = nthreads_tot/hw_opt->nthreads_omp;
+ +    }
+ +    else
+ +    {
+ +        /* TODO choose nthreads_omp based on hardware topology
+ +           when we have a hardware topology detection library */
+ +        /* Don't use OpenMP parallelization */
+ +        nthreads_tmpi = nthreads_tot;
+ +    }
+ +
+ +    return nthreads_tmpi;
+ +}
+ +
+ +
+ +/* Get the number of threads to use for thread-MPI based on how many
+ + * were requested, which algorithms we're using,
+ + * and how many particles there are.
+ + * At the point we have already called check_and_update_hw_opt.
+ + * Thus all options should be internally consistent and consistent
+ + * with the hardware, except that ntmpi could be larger than #GPU.
+ + */
+ +static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
+ +                            gmx_hw_opt_t *hw_opt,
+ +                            t_inputrec *inputrec, gmx_mtop_t *mtop,
+ +                            const t_commrec *cr,
+ +                            FILE *fplog)
+ +{
+ +    int nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu;
+ +    int min_atoms_per_mpi_thread;
+ +    char *env;
+ +    char sbuf[STRLEN];
+ +    gmx_bool bCanUseGPU;
+ +
+ +    if (hw_opt->nthreads_tmpi > 0)
+ +    {
+ +        /* Trivial, return right away */
+ +        return hw_opt->nthreads_tmpi;
+ +    }
+ +
+ +    /* How many total (#tMPI*#OpenMP) threads can we start? */ 
+ +    if (hw_opt->nthreads_tot > 0)
+ +    {
+ +        nthreads_tot_max = hw_opt->nthreads_tot;
+ +    }
+ +    else
+ +    {
+ +        nthreads_tot_max = tMPI_Thread_get_hw_number();
+ +    }
+ +
+ +    bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU);
+ +    if (bCanUseGPU)
+ +    {
+ +        ngpu = hwinfo->gpu_info.ncuda_dev_use;
+ +    }
+ +    else
+ +    {
+ +        ngpu = 0;
+ +    }
+ +
+ +    nthreads_tmpi =
+ +        get_tmpi_omp_thread_distribution(hw_opt,nthreads_tot_max,ngpu);
+ +
+ +    if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
+ +    {
+ +        /* Steps are divided over the nodes iso splitting the atoms */
+ +        min_atoms_per_mpi_thread = 0;
+ +    }
+ +    else
+ +    {
+ +        if (bCanUseGPU)
+ +        {
+ +            min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU;
+ +        }
+ +        else
+ +        {
+ +            min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD;
+ +        }
+ +    }
+ +
+ +    /* Check if an algorithm does not support parallel simulation.  */
+ +    if (nthreads_tmpi != 1 &&
+ +        ( inputrec->eI == eiLBFGS ||
+ +          inputrec->coulombtype == eelEWALD ) )
+ +    {
+ +        nthreads_tmpi = 1;
+ +
+ +        md_print_warn(cr,fplog,"The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
+ +        if (hw_opt->nthreads_tmpi > nthreads_tmpi)
+ +        {
+ +            gmx_fatal(FARGS,"You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
+ +        }
+ +    }
+ +    else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread)
+ +    {
+ +        /* the thread number was chosen automatically, but there are too many
+ +           threads (too few atoms per thread) */
+ +        nthreads_new = max(1,mtop->natoms/min_atoms_per_mpi_thread);
+ +
+ +        if (nthreads_new > 8 || (nthreads_tmpi == 8 && nthreads_new > 4))
+ +        {
+ +            /* TODO replace this once we have proper HT detection
+ +             * Use only multiples of 4 above 8 threads
+ +             * or with an 8-core processor
+ +             * (to avoid 6 threads on 8 core processors with 4 real cores).
+ +             */
+ +            nthreads_new = (nthreads_new/4)*4;
+ +        }
+ +        else if (nthreads_new > 4)
+ +        {
+ +            /* Avoid 5 or 7 threads */
+ +            nthreads_new = (nthreads_new/2)*2;
+ +        }
+ +
+ +        nthreads_tmpi = nthreads_new;
+ +
+ +        fprintf(stderr,"\n");
+ +        fprintf(stderr,"NOTE: Parallelization is limited by the small number of atoms,\n");
+ +        fprintf(stderr,"      only starting %d thread-MPI threads.\n",nthreads_tmpi);
+ +        fprintf(stderr,"      You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
+ +    }
+ +
+ +    return nthreads_tmpi;
+ +}
+ +#endif /* GMX_THREAD_MPI */
+ +
+ +
+ +/* Environment variable for setting nstlist */
+ +static const char*  NSTLIST_ENVVAR          =  "GMX_NSTLIST";
+ +/* Try to increase nstlist when using a GPU with nstlist less than this */
+ +static const int    NSTLIST_GPU_ENOUGH      = 20;
+ +/* Increase nstlist until the non-bonded cost increases more than this factor */
+ +static const float  NBNXN_GPU_LIST_OK_FAC   = 1.25;
+ +/* Don't increase nstlist beyond a non-bonded cost increases of this factor */
+ +static const float  NBNXN_GPU_LIST_MAX_FAC  = 1.40;
+ +
+ +/* Try to increase nstlist when running on a GPU */
+ +static void increase_nstlist(FILE *fp,t_commrec *cr,
+ +                             t_inputrec *ir,const gmx_mtop_t *mtop,matrix box)
+ +{
+ +    char *env;
+ +    int  nstlist_orig,nstlist_prev;
+ +    verletbuf_list_setup_t ls;
+ +    real rlist_inc,rlist_ok,rlist_max,rlist_new,rlist_prev;
+ +    int  i;
+ +    t_state state_tmp;
+ +    gmx_bool bBox,bDD,bCont;
+ +    const char *nstl_fmt="\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
+ +    const char *vbd_err="Can not increase nstlist for GPU run because verlet-buffer-drift is not set or used";
+ +    const char *box_err="Can not increase nstlist for GPU run because the box is too small";
+ +    const char *dd_err ="Can not increase nstlist for GPU run because of domain decomposition limitations";
+ +    char buf[STRLEN];
+ +
+ +    /* Number of + nstlist alternative values to try when switching  */
+ +    const int nstl[]={ 20, 25, 40, 50 };
+ +#define NNSTL  sizeof(nstl)/sizeof(nstl[0])
+ +
+ +    env = getenv(NSTLIST_ENVVAR);
+ +    if (env == NULL)
+ +    {
+ +        if (fp != NULL)
+ +        {
+ +            fprintf(fp,nstl_fmt,ir->nstlist);
+ +        }
+ +    }
+ +
+ +    if (ir->verletbuf_drift == 0)
+ +    {
+ +        gmx_fatal(FARGS,"You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
+ +    }
+ +
+ +    if (ir->verletbuf_drift < 0)
+ +    {
+ +        if (MASTER(cr))
+ +        {
+ +            fprintf(stderr,"%s\n",vbd_err);
+ +        }
+ +        if (fp != NULL)
+ +        {
+ +            fprintf(fp,"%s\n",vbd_err);
+ +        }
+ +
+ +        return;
+ +    }
+ +
+ +    nstlist_orig = ir->nstlist;
+ +    if (env != NULL)
+ +    {
+ +        sprintf(buf,"Getting nstlist from environment variable GMX_NSTLIST=%s",env);
+ +        if (MASTER(cr))
+ +        {
+ +            fprintf(stderr,"%s\n",buf);
+ +        }
+ +        if (fp != NULL)
+ +        {
+ +            fprintf(fp,"%s\n",buf);
+ +        }
+ +        sscanf(env,"%d",&ir->nstlist);
+ +    }
+ +
+ +    verletbuf_get_list_setup(TRUE,&ls);
+ +
+ +    /* Allow rlist to make the list double the size of the cut-off sphere */
+ +    rlist_inc = nbnxn_get_rlist_effective_inc(NBNXN_GPU_CLUSTER_SIZE,mtop->natoms/det(box));
+ +    rlist_ok  = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_OK_FAC,1.0/3.0) - rlist_inc;
+ +    rlist_max = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_MAX_FAC,1.0/3.0) - rlist_inc;
+ +    if (debug)
+ +    {
+ +        fprintf(debug,"GPU nstlist tuning: rlist_inc %.3f rlist_max %.3f\n",
+ +                rlist_inc,rlist_max);
+ +    }
+ +
+ +    i = 0;
+ +    nstlist_prev = nstlist_orig;
+ +    rlist_prev   = ir->rlist;
+ +    do
+ +    {
+ +        if (env == NULL)
+ +        {
+ +            ir->nstlist = nstl[i];
+ +        }
+ +
+ +        /* Set the pair-list buffer size in ir */
+ +        calc_verlet_buffer_size(mtop,det(box),ir,ir->verletbuf_drift,&ls,
+ +                                NULL,&rlist_new);
+ +
+ +        /* Does rlist fit in the box? */
+ +        bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC,box));
+ +        bDD  = TRUE;
+ +        if (bBox && DOMAINDECOMP(cr))
+ +        {
+ +            /* Check if rlist fits in the domain decomposition */
+ +            if (inputrec2nboundeddim(ir) < DIM)
+ +            {
+ +                gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
+ +            }
+ +            copy_mat(box,state_tmp.box);
+ +            bDD = change_dd_cutoff(cr,&state_tmp,ir,rlist_new);
+ +        }
+ +
+ +        bCont = FALSE;
+ +
+ +        if (env == NULL)
+ +        {
+ +            if (bBox && bDD && rlist_new <= rlist_max)
+ +            {
+ +                /* Increase nstlist */
+ +                nstlist_prev = ir->nstlist;
+ +                rlist_prev   = rlist_new;
+ +                bCont = (i+1 < NNSTL && rlist_new < rlist_ok);
+ +            }
+ +            else
+ +            {
+ +                /* Stick with the previous nstlist */
+ +                ir->nstlist = nstlist_prev;
+ +                rlist_new   = rlist_prev;
+ +                bBox = TRUE;
+ +                bDD  = TRUE;
+ +            }
+ +        }
+ +
+ +        i++;
+ +    }
+ +    while (bCont);
+ +
+ +    if (!bBox || !bDD)
+ +    {
+ +        gmx_warning(!bBox ? box_err : dd_err);
+ +        if (fp != NULL)
+ +        {
+ +            fprintf(fp,"\n%s\n",bBox ? box_err : dd_err);
+ +        }
+ +        ir->nstlist = nstlist_orig;
+ +    }
+ +    else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
+ +    {
+ +        sprintf(buf,"Changing nstlist from %d to %d, rlist from %g to %g",
+ +                nstlist_orig,ir->nstlist,
+ +                ir->rlist,rlist_new);
+ +        if (MASTER(cr))
+ +        {
+ +            fprintf(stderr,"%s\n\n",buf);
+ +        }
+ +        if (fp != NULL)
+ +        {
+ +            fprintf(fp,"%s\n\n",buf);
+ +        }
+ +        ir->rlist     = rlist_new;
+ +        ir->rlistlong = rlist_new;
+ +    }
+ +}
+ +
+ +static void prepare_verlet_scheme(FILE *fplog,
+ +                                  gmx_hw_info_t *hwinfo,
+ +                                  t_commrec *cr,
+ +                                  gmx_hw_opt_t *hw_opt,
+ +                                  const char *nbpu_opt,
+ +                                  t_inputrec *ir,
+ +                                  const gmx_mtop_t *mtop,
+ +                                  matrix box,
+ +                                  gmx_bool *bUseGPU)
+ +{
+ +    /* Here we only check for GPU usage on the MPI master process,
+ +     * as here we don't know how many GPUs we will use yet.
+ +     * We check for a GPU on all processes later.
+ +     */
+ +    *bUseGPU = hwinfo->bCanUseGPU || (getenv("GMX_EMULATE_GPU") != NULL);
+ +
+ +    if (ir->verletbuf_drift > 0)
+ +    {
+ +        /* Update the Verlet buffer size for the current run setup */
+ +        verletbuf_list_setup_t ls;
+ +        real rlist_new;
+ +
+ +        /* Here we assume CPU acceleration is on. But as currently
+ +         * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
+ +         * and 4x2 gives a larger buffer than 4x4, this is ok.
+ +         */
+ +        verletbuf_get_list_setup(*bUseGPU,&ls);
+ +
+ +        calc_verlet_buffer_size(mtop,det(box),ir,
+ +                                ir->verletbuf_drift,&ls,
+ +                                NULL,&rlist_new);
+ +        if (rlist_new != ir->rlist)
+ +        {
+ +            if (fplog != NULL)
+ +            {
+ +                fprintf(fplog,"\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
+ +                        ir->rlist,rlist_new,
+ +                        ls.cluster_size_i,ls.cluster_size_j);
+ +            }
+ +            ir->rlist     = rlist_new;
+ +            ir->rlistlong = rlist_new;
+ +        }
+ +    }
+ +
+ +    /* With GPU or emulation we should check nstlist for performance */
+ +    if ((EI_DYNAMICS(ir->eI) &&
+ +         *bUseGPU &&
+ +         ir->nstlist < NSTLIST_GPU_ENOUGH) ||
+ +        getenv(NSTLIST_ENVVAR) != NULL)
+ +    {
+ +        /* Choose a better nstlist */
+ +        increase_nstlist(fplog,cr,ir,mtop,box);
+ +    }
+ +}
+ +
+ +static void convert_to_verlet_scheme(FILE *fplog,
+ +                                     t_inputrec *ir,
+ +                                     gmx_mtop_t *mtop,real box_vol)
+ +{
+ +    char *conv_mesg="Converting input file with group cut-off scheme to the Verlet cut-off scheme";
+ +
+ +    md_print_warn(NULL,fplog,"%s\n",conv_mesg);
+ +
+ +    ir->cutoff_scheme   = ecutsVERLET;
+ +    ir->verletbuf_drift = 0.005;
+ +
+ +    if (ir->rcoulomb != ir->rvdw)
+ +    {
+ +        gmx_fatal(FARGS,"The VdW and Coulomb cut-offs are different, whereas the Verlet scheme only supports equal cut-offs");
+ +    }
+ +
+ +    if (ir->vdwtype == evdwUSER || EEL_USER(ir->coulombtype))
+ +    {
+ +        gmx_fatal(FARGS,"User non-bonded potentials are not (yet) supported with the Verlet scheme");
+ +    }
+ +    else if (EVDW_SWITCHED(ir->vdwtype) || EEL_SWITCHED(ir->coulombtype))
+ +    {
+ +        md_print_warn(NULL,fplog,"Converting switched or shifted interactions to a shifted potential (without force shift), this will lead to slightly different interaction potentials");
+ +
+ +        if (EVDW_SWITCHED(ir->vdwtype))
+ +        {
+ +            ir->vdwtype = evdwCUT;
+ +        }
+ +        if (EEL_SWITCHED(ir->coulombtype))
+ +        {
+ +            if (EEL_FULL(ir->coulombtype))
+ +            {
+ +                /* With full electrostatic only PME can be switched */
+ +                ir->coulombtype = eelPME;
+ +            }
+ +            else
+ +            {
+ +                md_print_warn(NULL,fplog,"NOTE: Replacing %s electrostatics with reaction-field with epsilon-rf=inf\n",eel_names[ir->coulombtype]);
+ +                ir->coulombtype = eelRF;
+ +                ir->epsilon_rf  = 0.0;
+ +            }
+ +        }
+ +
+ +        /* We set the target energy drift to a small number.
+ +         * Note that this is only for testing. For production the user
+ +         * should think about this and set the mdp options.
+ +         */
+ +        ir->verletbuf_drift = 1e-4;
+ +    }
+ +
+ +    if (inputrec2nboundeddim(ir) != 3)
+ +    {
+ +        gmx_fatal(FARGS,"Can only convert old tpr files to the Verlet cut-off scheme with 3D pbc");
+ +    }
+ +
+ +    if (ir->efep != efepNO || ir->implicit_solvent != eisNO)
+ +    {
+ +        gmx_fatal(FARGS,"Will not convert old tpr files to the Verlet cut-off scheme with free-energy calculations or implicit solvent");
+ +    }
+ +
+ +    if (EI_DYNAMICS(ir->eI) && !(EI_MD(ir->eI) && ir->etc == etcNO))
+ +    {
+ +        verletbuf_list_setup_t ls;
+ +
+ +        verletbuf_get_list_setup(FALSE,&ls);
+ +        calc_verlet_buffer_size(mtop,box_vol,ir,ir->verletbuf_drift,&ls,
+ +                                NULL,&ir->rlist);
+ +    }
+ +    else
+ +    {
+ +        ir->verletbuf_drift = -1;
+ +        ir->rlist           = 1.05*max(ir->rvdw,ir->rcoulomb);
+ +    }
+ +
+ +    gmx_mtop_remove_chargegroups(mtop);
+ +}
+ +
+ +
+ +/* Set CPU affinity. Can be important for performance.
+ +   On some systems (e.g. Cray) CPU Affinity is set by default.
+ +   But default assigning doesn't work (well) with only some ranks
+ +   having threads. This causes very low performance.
+ +   External tools have cumbersome syntax for setting affinity
+ +   in the case that only some ranks have threads.
+ +   Thus it is important that GROMACS sets the affinity internally
+ +   if only PME is using threads.
+ +*/
+ +static void set_cpu_affinity(FILE *fplog,
+ +                             const t_commrec *cr,
+ +                             const gmx_hw_opt_t *hw_opt,
+ +                             int nthreads_pme,
+ +                             const gmx_hw_info_t *hwinfo,
+ +                             const t_inputrec *inputrec)
+ +{
+ +#ifdef GMX_OPENMP /* TODO: actually we could do this even without OpenMP?! */
+ +#ifdef __linux /* TODO: only linux? why not everywhere if sched_setaffinity is available */
+ +    if (hw_opt->bThreadPinning)
+ +    {
+ +        int thread, nthread_local, nthread_node, nthread_hw_max, nphyscore;
+ +        int offset;
+ +        char *env;
+ +
+ +        /* threads on this MPI process or TMPI thread */
+ +        if (cr->duty & DUTY_PP)
+ +        {
+ +            nthread_local = gmx_omp_nthreads_get(emntNonbonded);
+ +        }
+ +        else
+ +        {
+ +            nthread_local = gmx_omp_nthreads_get(emntPME);
+ +        }
+ +
+ +        /* map the current process to cores */
+ +        thread = 0;
+ +        nthread_node = nthread_local;
+ +#ifdef GMX_MPI
+ +        if (PAR(cr) || MULTISIM(cr))
+ +        {
+ +            /* We need to determine a scan of the thread counts in this
+ +             * compute node.
+ +             */
+ +            MPI_Comm comm_intra;
+ +
+ +            MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),cr->nodeid_intra,
+ +                           &comm_intra);
+ +            MPI_Scan(&nthread_local,&thread,1,MPI_INT,MPI_SUM,comm_intra);
+ +            /* MPI_Scan is inclusive, but here we need exclusive */
+ +            thread -= nthread_local;
+ +            /* Get the total number of threads on this physical node */
+ +            MPI_Allreduce(&nthread_local,&nthread_node,1,MPI_INT,MPI_SUM,comm_intra);
+ +            MPI_Comm_free(&comm_intra);
+ +        }
+ +#endif
+ +
+ +        offset = 0;
+ +        if (hw_opt->core_pinning_offset > 0)
+ +        {
+ +            offset = hw_opt->core_pinning_offset;
+ +            if (SIMMASTER(cr))
+ +            {
+ +                fprintf(stderr, "Applying core pinning offset %d\n", offset);
+ +            }
+ +            if (fplog)
+ +            {
+ +                fprintf(fplog, "Applying core pinning offset %d\n", offset);
+ +            }
+ +        }
+ +
+ +        /* With Intel Hyper-Threading enabled, we want to pin consecutive
+ +         * threads to physical cores when using more threads than physical
+ +         * cores or when the user requests so.
+ +         */
+ +        nthread_hw_max = hwinfo->nthreads_hw_avail;
+ +        nphyscore = -1;
+ +        if (hw_opt->bPinHyperthreading ||
+ +            (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
+ +             nthread_node > nthread_hw_max/2 && getenv("GMX_DISABLE_PINHT") == NULL))
+ +        {
+ +            if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) != GMX_CPUID_X86_SMT_ENABLED)
+ +            {
+ +                /* We print to stderr on all processes, as we might have
+ +                 * different settings on different physical nodes.
+ +                 */
+ +                if (gmx_cpuid_vendor(hwinfo->cpuid_info) != GMX_CPUID_VENDOR_INTEL)
+ +                {
+ +                    md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
+ +                                  "but non-Intel CPU detected (vendor: %s)\n",
+ +                                  gmx_cpuid_vendor_string[gmx_cpuid_vendor(hwinfo->cpuid_info)]);
+ +                }
+ +                else
+ +                {
+ +                    md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
+ +                                  "but the CPU detected does not have Intel Hyper-Threading support "
+ +                                  "(or it is turned off)\n");
+ +                }
+ +            }
+ +            nphyscore = nthread_hw_max/2;
+ +
+ +            if (SIMMASTER(cr))
+ +            {
+ +                fprintf(stderr, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
+ +                        nphyscore);
+ +            }
+ +            if (fplog)
+ +            {
+ +                fprintf(fplog, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
+ +                        nphyscore);
+ +            }
+ +        }
+ +
+ +        /* set the per-thread affinity */
+ +#pragma omp parallel firstprivate(thread) num_threads(nthread_local)
+ +        {
+ +            cpu_set_t mask;
+ +            int core;
+ +
+ +            CPU_ZERO(&mask);
+ +            thread += gmx_omp_get_thread_num();
+ +            if (nphyscore <= 0)
+ +            {
+ +                core = offset + thread;
+ +            }
+ +            else
+ +            {
+ +                /* Lock pairs of threads to the same hyperthreaded core */
+ +                core = offset + thread/2 + (thread % 2)*nphyscore;
+ +            }
+ +            CPU_SET(core, &mask);
+ +            sched_setaffinity((pid_t) syscall (SYS_gettid), sizeof(cpu_set_t), &mask);
+ +        }
+ +    }
+ +#endif /* __linux    */
+ +#endif /* GMX_OPENMP */
+ +}
+ +
+ +
+ +static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt,
+ +                                    int cutoff_scheme)
+ +{
+ +    gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp);
+ +
+ +#ifndef GMX_THREAD_MPI
+ +    if (hw_opt->nthreads_tot > 0)
+ +    {
+ +        gmx_fatal(FARGS,"Setting the total number of threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
+ +    }
+ +    if (hw_opt->nthreads_tmpi > 0)
+ +    {
+ +        gmx_fatal(FARGS,"Setting the number of thread-MPI threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
+ +    }
+ +#endif
+ +
+ +    if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
+ +    {
+ +        /* We have the same number of OpenMP threads for PP and PME processes,
+ +         * thus we can perform several consistency checks.
+ +         */
+ +        if (hw_opt->nthreads_tmpi > 0 &&
+ +            hw_opt->nthreads_omp > 0 &&
+ +            hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
+ +        {
+ +            gmx_fatal(FARGS,"The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
+ +                      hw_opt->nthreads_tot,hw_opt->nthreads_tmpi,hw_opt->nthreads_omp);
+ +        }
+ +
+ +        if (hw_opt->nthreads_tmpi > 0 &&
+ +            hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
+ +        {
+ +            gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
+ +                      hw_opt->nthreads_tot,hw_opt->nthreads_tmpi);
+ +        }
+ +
+ +        if (hw_opt->nthreads_omp > 0 &&
+ +            hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
+ +        {
+ +            gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
+ +                      hw_opt->nthreads_tot,hw_opt->nthreads_omp);
+ +        }
+ +
+ +        if (hw_opt->nthreads_tmpi > 0 &&
+ +            hw_opt->nthreads_omp <= 0)
+ +        {
+ +            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
+ +        }
+ +    }
+ +
+ +#ifndef GMX_OPENMP
+ +    if (hw_opt->nthreads_omp > 1)
+ +    {
+ +        gmx_fatal(FARGS,"OpenMP threads are requested, but Gromacs was compiled without OpenMP support");
+ +    }
+ +#endif
+ +
+ +    if (cutoff_scheme == ecutsGROUP)
+ +    {
+ +        /* We only have OpenMP support for PME only nodes */
+ +        if (hw_opt->nthreads_omp > 1)
+ +        {
+ +            gmx_fatal(FARGS,"OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
+ +                      ecutscheme_names[cutoff_scheme],
+ +                      ecutscheme_names[ecutsVERLET]);
+ +        }
+ +        hw_opt->nthreads_omp = 1;
+ +    }
+ +
+ +    if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
+ +    {
+ +        gmx_fatal(FARGS,"You need to specify -ntomp in addition to -ntomp_pme");
+ +    }
+ +
+ +    if (hw_opt->nthreads_tot == 1)
+ +    {
+ +        hw_opt->nthreads_tmpi = 1;
+ +
+ +        if (hw_opt->nthreads_omp > 1)
+ +        {
+ +            gmx_fatal(FARGS,"You requested %d OpenMP threads with %d total threads",
+ +                      hw_opt->nthreads_tmpi,hw_opt->nthreads_tot);
+ +        }
+ +        hw_opt->nthreads_omp = 1;
+ +    }
+ +
+ +    if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
+ +    {
+ +        hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
+ +    }
+ +
+ +    if (debug)
+ +    {
+ +        fprintf(debug,"hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
+ +                hw_opt->nthreads_tot,
+ +                hw_opt->nthreads_tmpi,
+ +                hw_opt->nthreads_omp,
+ +                hw_opt->nthreads_omp_pme,
+ +                hw_opt->gpu_id!=NULL ? hw_opt->gpu_id : "");
+ +                
+ +    }
+ +}
+ +
+ +
+ +/* Override the value in inputrec with value passed on the command line (if any) */
+ +static void override_nsteps_cmdline(FILE *fplog,
+ +                                    int nsteps_cmdline,
+ +                                    t_inputrec *ir,
+ +                                    const t_commrec *cr)
+ +{
+ +    assert(ir);
+ +    assert(cr);
+ +
+ +    /* override with anything else than the default -2 */
+ +    if (nsteps_cmdline > -2)
+ +    {
+ +        char stmp[STRLEN];
+ +
+ +        ir->nsteps = nsteps_cmdline;
+ +        if (EI_DYNAMICS(ir->eI))
+ +        {
+ +            sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps, %.3f ps",
+ +                    nsteps_cmdline, nsteps_cmdline*ir->delta_t);
+ +        }
+ +        else
+ +        {
+ +            sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps",
+ +                    nsteps_cmdline);
+ +        }
+ +
+ +        md_print_warn(cr, fplog, "%s\n", stmp);
+ +    }
+ +}
+ +
+ +/* Data structure set by SIMMASTER which needs to be passed to all nodes
+ + * before the other nodes have read the tpx file and called gmx_detect_hardware.
+ + */
+ +typedef struct {
+ +    int      cutoff_scheme; /* The cutoff-scheme from inputrec_t */
+ +    gmx_bool bUseGPU;       /* Use GPU or GPU emulation          */
+ +} master_inf_t;
+ +
+ +int mdrunner(gmx_hw_opt_t *hw_opt,
+ +             FILE *fplog,t_commrec *cr,int nfile,
+ +             const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
+ +             gmx_bool bCompact, int nstglobalcomm,
+ +             ivec ddxyz,int dd_node_order,real rdd,real rconstr,
+ +             const char *dddlb_opt,real dlb_scale,
+ +             const char *ddcsx,const char *ddcsy,const char *ddcsz,
+ +             const char *nbpu_opt,
+ +             int nsteps_cmdline, int nstepout,int resetstep,
+ +             int nmultisim,int repl_ex_nst,int repl_ex_nex,
+ +             int repl_ex_seed, real pforce,real cpt_period,real max_hours,
+ +             const char *deviceOptions, unsigned long Flags)
+ +{
+ +    gmx_bool   bForceUseGPU,bTryUseGPU;
+ +    double     nodetime=0,realtime;
+ +    t_inputrec *inputrec;
+ +    t_state    *state=NULL;
+ +    matrix     box;
+ +    gmx_ddbox_t ddbox={0};
+ +    int        npme_major,npme_minor;
+ +    real       tmpr1,tmpr2;
+ +    t_nrnb     *nrnb;
+ +    gmx_mtop_t *mtop=NULL;
+ +    t_mdatoms  *mdatoms=NULL;
+ +    t_forcerec *fr=NULL;
+ +    t_fcdata   *fcd=NULL;
+ +    real       ewaldcoeff=0;
+ +    gmx_pme_t  *pmedata=NULL;
+ +    gmx_vsite_t *vsite=NULL;
+ +    gmx_constr_t constr;
+ +    int        i,m,nChargePerturbed=-1,status,nalloc;
+ +    char       *gro;
+ +    gmx_wallcycle_t wcycle;
+ +    gmx_bool       bReadRNG,bReadEkin;
+ +    int        list;
+ +    gmx_runtime_t runtime;
+ +    int        rc;
+ +    gmx_large_int_t reset_counters;
+ +    gmx_edsam_t ed=NULL;
+ +    t_commrec   *cr_old=cr; 
+ +    int         nthreads_pme=1;
+ +    int         nthreads_pp=1;
+ +    gmx_membed_t membed=NULL;
+ +    gmx_hw_info_t *hwinfo=NULL;
+ +    master_inf_t minf={-1,FALSE};
+ +
+ +    /* CAUTION: threads may be started later on in this function, so
+ +       cr doesn't reflect the final parallel state right now */
+ +    snew(inputrec,1);
+ +    snew(mtop,1);
+ +    
+ +    if (Flags & MD_APPENDFILES) 
+ +    {
+ +        fplog = NULL;
+ +    }
+ +
+ +    bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
+ +    bTryUseGPU   = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
+ +
+ +    snew(state,1);
+ +    if (SIMMASTER(cr)) 
+ +    {
+ +        /* Read (nearly) all data required for the simulation */
+ +        read_tpx_state(ftp2fn(efTPX,nfile,fnm),inputrec,state,NULL,mtop);
+ +
+ +        if (inputrec->cutoff_scheme != ecutsVERLET &&
+ +            ((Flags & MD_TESTVERLET) || getenv("GMX_VERLET_SCHEME") != NULL))
+ +        {
+ +            convert_to_verlet_scheme(fplog,inputrec,mtop,det(state->box));
+ +        }
+ +
+ +        /* Detect hardware, gather information. With tMPI only thread 0 does it
+ +         * and after threads are started broadcasts hwinfo around. */
+ +        snew(hwinfo, 1);
+ +        gmx_detect_hardware(fplog, hwinfo, cr,
+ +                            bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
+ +
+ +        minf.cutoff_scheme = inputrec->cutoff_scheme;
+ +        minf.bUseGPU       = FALSE;
+ +
+ +        if (inputrec->cutoff_scheme == ecutsVERLET)
+ +        {
+ +            prepare_verlet_scheme(fplog,hwinfo,cr,hw_opt,nbpu_opt,
+ +                                  inputrec,mtop,state->box,
+ +                                  &minf.bUseGPU);
+ +        }
+ +        else if (hwinfo->bCanUseGPU)
+ +        {
+ +            md_print_warn(cr,fplog,
+ +                          "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
+ +                          "      To use a GPU, set the mdp option: cutoff-scheme = Verlet\n"
+ +                          "      (for quick performance testing you can use the -testverlet option)\n");
+ +
+ +            if (bForceUseGPU)
+ +            {
+ +                gmx_fatal(FARGS,"GPU requested, but can't be used without cutoff-scheme=Verlet");
+ +            }
+ +        }
+ +    }
+ +#ifndef GMX_THREAD_MPI
+ +    if (PAR(cr))
+ +    {
+ +        gmx_bcast_sim(sizeof(minf),&minf,cr);
+ +    }
+ +#endif
+ +    if (minf.bUseGPU && cr->npmenodes == -1)
+ +    {
+ +        /* Don't automatically use PME-only nodes with GPUs */
+ +        cr->npmenodes = 0;
+ +    }
+ +
+ +#ifdef GMX_THREAD_MPI
+ +    /* With thread-MPI inputrec is only set here on the master thread */
+ +    if (SIMMASTER(cr))
+ +#endif
+ +    {
+ +        check_and_update_hw_opt(hw_opt,minf.cutoff_scheme);
+ +
+ +#ifdef GMX_THREAD_MPI
+ +        if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
+ +        {
+ +            gmx_fatal(FARGS,"You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME nodes");
+ +        }
+ +#endif
+ +
+ +        if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
+ +            cr->npmenodes <= 0)
+ +        {
+ +            gmx_fatal(FARGS,"You need to explicitly specify the number of PME nodes (-npme) when using different number of OpenMP threads for PP and PME nodes");
+ +        }
+ +    }
+ +
+ +#ifdef GMX_THREAD_MPI
+ +    if (SIMMASTER(cr))
+ +    {
+ +        /* NOW the threads will be started: */
+ +        hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
+ +                                                 hw_opt,
+ +                                                 inputrec, mtop,
+ +                                                 cr, fplog);
+ +        if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
+ +        {
+ +            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
+ +        }
+ +
+ +        if (hw_opt->nthreads_tmpi > 1)
+ +        {
+ +            /* now start the threads. */
+ +            cr=mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm, 
+ +                                      oenv, bVerbose, bCompact, nstglobalcomm, 
+ +                                      ddxyz, dd_node_order, rdd, rconstr, 
+ +                                      dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
+ +                                      nbpu_opt,
+ +                                      nsteps_cmdline, nstepout, resetstep, nmultisim, 
+ +                                      repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
+ +                                      cpt_period, max_hours, deviceOptions, 
+ +                                      Flags);
+ +            /* the main thread continues here with a new cr. We don't deallocate
+ +               the old cr because other threads may still be reading it. */
+ +            if (cr == NULL)
+ +            {
+ +                gmx_comm("Failed to spawn threads");
+ +            }
+ +        }
+ +    }
+ +#endif
+ +    /* END OF CAUTION: cr is now reliable */
+ +
+ +    /* g_membed initialisation *
+ +     * Because we change the mtop, init_membed is called before the init_parallel *
+ +     * (in case we ever want to make it run in parallel) */
+ +    if (opt2bSet("-membed",nfile,fnm))
+ +    {
+ +        if (MASTER(cr))
+ +        {
+ +            fprintf(stderr,"Initializing membed");
+ +        }
+ +        membed = init_membed(fplog,nfile,fnm,mtop,inputrec,state,cr,&cpt_period);
+ +    }
+ +
+ +    if (PAR(cr))
+ +    {
+ +        /* now broadcast everything to the non-master nodes/threads: */
+ +        init_parallel(fplog, cr, inputrec, mtop);
+ +
+ +        /* This check needs to happen after get_nthreads_mpi() */
+ +        if (inputrec->cutoff_scheme == ecutsVERLET && (Flags & MD_PARTDEC))
+ +        {
+ +            gmx_fatal_collective(FARGS,cr,NULL,
+ +                                 "The Verlet cut-off scheme is not supported with particle decomposition.\n"
+ +                                 "You can achieve the same effect as particle decomposition by running in parallel using only OpenMP threads.");
+ +        }
+ +    }
+ +    if (fplog != NULL)
+ +    {
+ +        pr_inputrec(fplog,0,"Input Parameters",inputrec,FALSE);
+ +    }
+ +
+ +#if defined GMX_THREAD_MPI
+ +    /* With tMPI we detected on thread 0 and we'll just pass the hwinfo pointer
+ +     * to the other threads  -- slightly uncool, but works fine, just need to
+ +     * make sure that the data doesn't get freed twice. */
+ +    if (cr->nnodes > 1)
+ +    {
+ +        if (!SIMMASTER(cr))
+ +        {
+ +            snew(hwinfo, 1);
+ +        }
+ +        gmx_bcast(sizeof(&hwinfo), &hwinfo, cr);
+ +    }
+ +#else
+ +    if (PAR(cr) && !SIMMASTER(cr))
+ +    {
+ +        /* now we have inputrec on all nodes, can run the detection */
+ +        /* TODO: perhaps it's better to propagate within a node instead? */
+ +        snew(hwinfo, 1);
+ +        gmx_detect_hardware(fplog, hwinfo, cr,
+ +                                 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
+ +    }
+ +#endif
+ +
+ +    /* now make sure the state is initialized and propagated */
+ +    set_state_entries(state,inputrec,cr->nnodes);
+ +
+ +    /* remove when vv and rerun works correctly! */
+ +    if (PAR(cr) && EI_VV(inputrec->eI) && ((Flags & MD_RERUN) || (Flags & MD_RERUN_VSITE)))
+ +    {
+ +        gmx_fatal(FARGS,
+ +                  "Currently can't do velocity verlet with rerun in parallel.");
+ +    }
+ +
+ +    /* A parallel command line option consistency check that we can
+ +       only do after any threads have started. */
+ +    if (!PAR(cr) &&
+ +        (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
+ +    {
+ +        gmx_fatal(FARGS,
+ +                  "The -dd or -npme option request a parallel simulation, "
+ +#ifndef GMX_MPI
+ +                  "but %s was compiled without threads or MPI enabled"
+ +#else
+ +#ifdef GMX_THREAD_MPI
+ +                  "but the number of threads (option -nt) is 1"
+ +#else
+ +                  "but %s was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec"
+ +#endif
+ +#endif
+ +                  , ShortProgram()
+ +            );
+ +    }
+ +
+ +    if ((Flags & MD_RERUN) &&
+ +        (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
+ +    {
+ +        gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
+ +    }
+ +
+ +    if (can_use_allvsall(inputrec,mtop,TRUE,cr,fplog) && PAR(cr))
+ +    {
+ +        /* All-vs-all loops do not work with domain decomposition */
+ +        Flags |= MD_PARTDEC;
+ +    }
+ +
+ +    if (!EEL_PME(inputrec->coulombtype) || (Flags & MD_PARTDEC))
+ +    {
+ +        if (cr->npmenodes > 0)
+ +        {
+ +            if (!EEL_PME(inputrec->coulombtype))
+ +            {
+ +                gmx_fatal_collective(FARGS,cr,NULL,
+ +                                     "PME nodes are requested, but the system does not use PME electrostatics");
+ +            }
+ +            if (Flags & MD_PARTDEC)
+ +            {
+ +                gmx_fatal_collective(FARGS,cr,NULL,
+ +                                     "PME nodes are requested, but particle decomposition does not support separate PME nodes");
+ +            }
+ +        }
+ +
+ +        cr->npmenodes = 0;
+ +    }
+ +
+ +#ifdef GMX_FAHCORE
+ +    fcRegisterSteps(inputrec->nsteps,inputrec->init_step);
+ +#endif
+ +
+ +    /* NMR restraints must be initialized before load_checkpoint,
+ +     * since with time averaging the history is added to t_state.
+ +     * For proper consistency check we therefore need to extend
+ +     * t_state here.
+ +     * So the PME-only nodes (if present) will also initialize
+ +     * the distance restraints.
+ +     */
+ +    snew(fcd,1);
+ +
+ +    /* This needs to be called before read_checkpoint to extend the state */
+ +    init_disres(fplog,mtop,inputrec,cr,Flags & MD_PARTDEC,fcd,state);
+ +
+ +    if (gmx_mtop_ftype_count(mtop,F_ORIRES) > 0)
+ +    {
+ +        if (PAR(cr) && !(Flags & MD_PARTDEC))
+ +        {
+ +            gmx_fatal(FARGS,"Orientation restraints do not work (yet) with domain decomposition, use particle decomposition (mdrun option -pd)");
+ +        }
+ +        /* Orientation restraints */
+ +        if (MASTER(cr))
+ +        {
+ +            init_orires(fplog,mtop,state->x,inputrec,cr->ms,&(fcd->orires),
+ +                        state);
+ +        }
+ +    }
+ +
+ +    if (DEFORM(*inputrec))
+ +    {
+ +        /* Store the deform reference box before reading the checkpoint */
+ +        if (SIMMASTER(cr))
+ +        {
+ +            copy_mat(state->box,box);
+ +        }
+ +        if (PAR(cr))
+ +        {
+ +            gmx_bcast(sizeof(box),box,cr);
+ +        }
+ +        /* Because we do not have the update struct available yet
+ +         * in which the reference values should be stored,
+ +         * we store them temporarily in static variables.
+ +         * This should be thread safe, since they are only written once
+ +         * and with identical values.
+ +         */
+ +#ifdef GMX_THREAD_MPI
+ +        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
+ +#endif
+ +        deform_init_init_step_tpx = inputrec->init_step;
+ +        copy_mat(box,deform_init_box_tpx);
+ +#ifdef GMX_THREAD_MPI
+ +        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
+ +#endif
+ +    }
+ +
+ +    if (opt2bSet("-cpi",nfile,fnm)) 
+ +    {
+ +        /* Check if checkpoint file exists before doing continuation.
+ +         * This way we can use identical input options for the first and subsequent runs...
+ +         */
+ +        if( gmx_fexist_master(opt2fn_master("-cpi",nfile,fnm,cr),cr) )
+ +        {
+ +            load_checkpoint(opt2fn_master("-cpi",nfile,fnm,cr),&fplog,
+ +                            cr,Flags & MD_PARTDEC,ddxyz,
+ +                            inputrec,state,&bReadRNG,&bReadEkin,
+ +                            (Flags & MD_APPENDFILES),
+ +                            (Flags & MD_APPENDFILESSET));
+ +            
+ +            if (bReadRNG)
+ +            {
+ +                Flags |= MD_READ_RNG;
+ +            }
+ +            if (bReadEkin)
+ +            {
+ +                Flags |= MD_READ_EKIN;
+ +            }
+ +        }
+ +    }
+ +
+ +    if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES))
+ +#ifdef GMX_THREAD_MPI
+ +        /* With thread MPI only the master node/thread exists in mdrun.c,
+ +         * therefore non-master nodes need to open the "seppot" log file here.
+ +         */
+ +        || (!MASTER(cr) && (Flags & MD_SEPPOT))
+ +#endif
+ +        )
+ +    {
+ +        gmx_log_open(ftp2fn(efLOG,nfile,fnm),cr,!(Flags & MD_SEPPOT),
+ +                             Flags,&fplog);
+ +    }
+ +
+ +    /* override nsteps with value from cmdline */
+ +    override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);
+ +
+ +    if (SIMMASTER(cr)) 
+ +    {
+ +        copy_mat(state->box,box);
+ +    }
+ +
+ +    if (PAR(cr)) 
+ +    {
+ +        gmx_bcast(sizeof(box),box,cr);
+ +    }
+ +
+ +    /* Essential dynamics */
+ +    if (opt2bSet("-ei",nfile,fnm))
+ +    {
+ +        /* Open input and output files, allocate space for ED data structure */
+ +        ed = ed_open(nfile,fnm,Flags,cr);
+ +    }
+ +
+ +    if (PAR(cr) && !((Flags & MD_PARTDEC) ||
+ +                     EI_TPI(inputrec->eI) ||
+ +                     inputrec->eI == eiNM))
+ +    {
+ +        cr->dd = init_domain_decomposition(fplog,cr,Flags,ddxyz,rdd,rconstr,
+ +                                           dddlb_opt,dlb_scale,
+ +                                           ddcsx,ddcsy,ddcsz,
+ +                                           mtop,inputrec,
+ +                                           box,state->x,
+ +                                           &ddbox,&npme_major,&npme_minor);
+ +
+ +        make_dd_communicators(fplog,cr,dd_node_order);
+ +
+ +        /* Set overallocation to avoid frequent reallocation of arrays */
+ +        set_over_alloc_dd(TRUE);
+ +    }
+ +    else
+ +    {
+ +        /* PME, if used, is done on all nodes with 1D decomposition */
+ +        cr->npmenodes = 0;
+ +        cr->duty = (DUTY_PP | DUTY_PME);
+ +        npme_major = 1;
+ +        npme_minor = 1;
+ +        if (!EI_TPI(inputrec->eI))
+ +        {
+ +            npme_major = cr->nnodes;
+ +        }
+ +        
+ +        if (inputrec->ePBC == epbcSCREW)
+ +        {
+ +            gmx_fatal(FARGS,
+ +                      "pbc=%s is only implemented with domain decomposition",
+ +                      epbc_names[inputrec->ePBC]);
+ +        }
+ +    }
+ +
+ +    if (PAR(cr))
+ +    {
+ +        /* After possible communicator splitting in make_dd_communicators.
+ +         * we can set up the intra/inter node communication.
+ +         */
+ +        gmx_setup_nodecomm(fplog,cr);
+ +    }
+ +
+ +    /* Initialize per-node process ID and counters. */
+ +    gmx_init_intra_counters(cr);
+ +
+ +#ifdef GMX_MPI
+ +    md_print_info(cr,fplog,"Using %d MPI %s\n",
+ +                  cr->nnodes,
+ +#ifdef GMX_THREAD_MPI
+ +                  cr->nnodes==1 ? "thread" : "threads"
+ +#else
+ +                  cr->nnodes==1 ? "process" : "processes"
+ +#endif
+ +                  );
+ +#endif
+ +
+ +    gmx_omp_nthreads_init(fplog, cr,
+ +                          hwinfo->nthreads_hw_avail,
+ +                          hw_opt->nthreads_omp,
+ +                          hw_opt->nthreads_omp_pme,
+ +                          (cr->duty & DUTY_PP) == 0,
+ +                          inputrec->cutoff_scheme == ecutsVERLET);
+ +
+ +    gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi, minf.bUseGPU);
+ +
+ +    /* getting number of PP/PME threads
+ +       PME: env variable should be read only on one node to make sure it is 
+ +       identical everywhere;
+ +     */
+ +    /* TODO nthreads_pp is only used for pinning threads.
+ +     * This is a temporary solution until we have a hw topology library.
+ +     */
+ +    nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
+ +    nthreads_pme = gmx_omp_nthreads_get(emntPME);
+ +
+ +    wcycle = wallcycle_init(fplog,resetstep,cr,nthreads_pp,nthreads_pme);
+ +
+ +    if (PAR(cr))
+ +    {
+ +        /* Master synchronizes its value of reset_counters with all nodes 
+ +         * including PME only nodes */
+ +        reset_counters = wcycle_get_reset_counters(wcycle);
+ +        gmx_bcast_sim(sizeof(reset_counters),&reset_counters,cr);
+ +        wcycle_set_reset_counters(wcycle, reset_counters);
+ +    }
+ +
+ +    snew(nrnb,1);
+ +    if (cr->duty & DUTY_PP)
+ +    {
+ +        /* For domain decomposition we allocate dynamically
+ +         * in dd_partition_system.
+ +         */
+ +        if (DOMAINDECOMP(cr))
+ +        {
+ +            bcast_state_setup(cr,state);
+ +        }
+ +        else
+ +        {
+ +            if (PAR(cr))
+ +            {
+ +                bcast_state(cr,state,TRUE);
+ +            }
+ +        }
+ +
+ +        /* Initiate forcerecord */
+ +        fr = mk_forcerec();
+ +        fr->hwinfo = hwinfo;
+ +        init_forcerec(fplog,oenv,fr,fcd,inputrec,mtop,cr,box,FALSE,
+ +                      opt2fn("-table",nfile,fnm),
+ +                      opt2fn("-tabletf",nfile,fnm),
+ +                      opt2fn("-tablep",nfile,fnm),
+ +                      opt2fn("-tableb",nfile,fnm),
+ +                      nbpu_opt,
+ +                      FALSE,pforce);
+ +
+ +        /* version for PCA_NOT_READ_NODE (see md.c) */
+ +        /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
+ +          "nofile","nofile","nofile","nofile",FALSE,pforce);
+ +          */        
+ +        fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);
+ +
+ +        /* Initialize QM-MM */
+ +        if(fr->bQMMM)
+ +        {
+ +            init_QMMMrec(cr,box,mtop,inputrec,fr);
+ +        }
+ +
+ +        /* Initialize the mdatoms structure.
+ +         * mdatoms is not filled with atom data,
+ +         * as this can not be done now with domain decomposition.
+ +         */
+ +        mdatoms = init_mdatoms(fplog,mtop,inputrec->efep!=efepNO);
+ +
+ +        /* Initialize the virtual site communication */
+ +        vsite = init_vsite(mtop,cr);
+ +
+ +        calc_shifts(box,fr->shift_vec);
+ +
+ +        /* With periodic molecules the charge groups should be whole at start up
+ +         * and the virtual sites should not be far from their proper positions.
+ +         */
+ +        if (!inputrec->bContinuation && MASTER(cr) &&
+ +            !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
+ +        {
+ +            /* Make molecules whole at start of run */
+ +            if (fr->ePBC != epbcNONE)
+ +            {
+ +                do_pbc_first_mtop(fplog,inputrec->ePBC,box,mtop,state->x);
+ +            }
+ +            if (vsite)
+ +            {
+ +                /* Correct initial vsite positions are required
+ +                 * for the initial distribution in the domain decomposition
+ +                 * and for the initial shell prediction.
+ +                 */
+ +                construct_vsites_mtop(fplog,vsite,mtop,state->x);
+ +            }
+ +        }
+ +
+ +        if (EEL_PME(fr->eeltype))
+ +        {
+ +            ewaldcoeff = fr->ewaldcoeff;
+ +            pmedata = &fr->pmedata;
+ +        }
+ +        else
+ +        {
+ +            pmedata = NULL;
+ +        }
+ +    }
+ +    else
+ +    {
+ +        /* This is a PME only node */
+ +
+ +        /* We don't need the state */
+ +        done_state(state);
+ +
+ +        ewaldcoeff = calc_ewaldcoeff(inputrec->rcoulomb, inputrec->ewald_rtol);
+ +        snew(pmedata,1);
+ +    }
+ +
+ +#if defined GMX_THREAD_MPI
+ +    /* With the number of TMPI threads equal to the number of cores
+ +     * we already pinned in thread-MPI, so don't pin again here.
+ +     */
+ +    if (hw_opt->nthreads_tmpi != tMPI_Thread_get_hw_number())
+ +#endif
+ +    {
+ +        /* Set the CPU affinity */
+ +        set_cpu_affinity(fplog,cr,hw_opt,nthreads_pme,hwinfo,inputrec);
+ +    }
+ +
+ +    /* Initiate PME if necessary,
+ +     * either on all nodes or on dedicated PME nodes only. */
+ +    if (EEL_PME(inputrec->coulombtype))
+ +    {
+ +        if (mdatoms)
+ +        {
+ +            nChargePerturbed = mdatoms->nChargePerturbed;
+ +        }
+ +        if (cr->npmenodes > 0)
+ +        {
+ +            /* The PME only nodes need to know nChargePerturbed */
+ +            gmx_bcast_sim(sizeof(nChargePerturbed),&nChargePerturbed,cr);
+ +        }
+ +
+ +        if (cr->duty & DUTY_PME)
+ +        {
+ +            status = gmx_pme_init(pmedata,cr,npme_major,npme_minor,inputrec,
+ +                                  mtop ? mtop->natoms : 0,nChargePerturbed,
+ +                                  (Flags & MD_REPRODUCIBLE),nthreads_pme);
+ +            if (status != 0) 
+ +            {
+ +                gmx_fatal(FARGS,"Error %d initializing PME",status);
+ +            }
+ +        }
+ +    }
+ +
+ +
+ +    if (integrator[inputrec->eI].func == do_md
+ +#ifdef GMX_OPENMM
+ +        ||
+ +        integrator[inputrec->eI].func == do_md_openmm
+ +#endif
+ +        )
+ +    {
+ +        /* Turn on signal handling on all nodes */
+ +        /*
+ +         * (A user signal from the PME nodes (if any)
+ +         * is communicated to the PP nodes.
+ +         */
+ +        signal_handler_install();
+ +    }
+ +
+ +    if (cr->duty & DUTY_PP)
+ +    {
+ +        if (inputrec->ePull != epullNO)
+ +        {
+ +            /* Initialize pull code */
+ +            init_pull(fplog,inputrec,nfile,fnm,mtop,cr,oenv, inputrec->fepvals->init_lambda,
+ +                      EI_DYNAMICS(inputrec->eI) && MASTER(cr),Flags);
+ +        }
+ +        
+ +        if (inputrec->bRot)
+ +        {
+ +           /* Initialize enforced rotation code */
+ +           init_rot(fplog,inputrec,nfile,fnm,cr,state->x,box,mtop,oenv,
+ +                    bVerbose,Flags);
+ +        }
+ +
+ +        constr = init_constraints(fplog,mtop,inputrec,ed,state,cr);
+ +
+ +        if (DOMAINDECOMP(cr))
+ +        {
+ +            dd_init_bondeds(fplog,cr->dd,mtop,vsite,constr,inputrec,
+ +                            Flags & MD_DDBONDCHECK,fr->cginfo_mb);
+ +
+ +            set_dd_parameters(fplog,cr->dd,dlb_scale,inputrec,fr,&ddbox);
+ +
+ +            setup_dd_grid(fplog,cr->dd);
+ +        }
+ +
+ +        /* Now do whatever the user wants us to do (how flexible...) */
+ +        integrator[inputrec->eI].func(fplog,cr,nfile,fnm,
+ +                                      oenv,bVerbose,bCompact,
+ +                                      nstglobalcomm,
+ +                                      vsite,constr,
+ +                                      nstepout,inputrec,mtop,
+ +                                      fcd,state,
+ +                                      mdatoms,nrnb,wcycle,ed,fr,
+ +                                      repl_ex_nst,repl_ex_nex,repl_ex_seed,
+ +                                      membed,
+ +                                      cpt_period,max_hours,
+ +                                      deviceOptions,
+ +                                      Flags,
+ +                                      &runtime);
+ +
+ +        if (inputrec->ePull != epullNO)
+ +        {
+ +            finish_pull(fplog,inputrec->pull);
+ +        }
+ +        
+ +        if (inputrec->bRot)
+ +        {
+ +            finish_rot(fplog,inputrec->rot);
+ +        }
+ +
+ +    } 
+ +    else 
+ +    {
+ +        /* do PME only */
+ +        gmx_pmeonly(*pmedata,cr,nrnb,wcycle,ewaldcoeff,FALSE,inputrec);
+ +    }
+ +
+ +    if (EI_DYNAMICS(inputrec->eI) || EI_TPI(inputrec->eI))
+ +    {
+ +        /* Some timing stats */  
+ +        if (SIMMASTER(cr))
+ +        {
+ +            if (runtime.proc == 0)
+ +            {
+ +                runtime.proc = runtime.real;
+ +            }
+ +        }
+ +        else
+ +        {
+ +            runtime.real = 0;
+ +        }
+ +    }
+ +
+ +    wallcycle_stop(wcycle,ewcRUN);
+ +
+ +    /* Finish up, write some stuff
+ +     * if rerunMD, don't write last frame again 
+ +     */
+ +    finish_run(fplog,cr,ftp2fn(efSTO,nfile,fnm),
+ +               inputrec,nrnb,wcycle,&runtime,
+ +               fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ?
+ +                 nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL,
+ +               nthreads_pp, 
+ +               EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
+ +
+ +    if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU)
+ +    {
+ +        char gpu_err_str[STRLEN];
+ +
+ +        /* free GPU memory and uninitialize GPU (by destroying the context) */
+ +        nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
+ +
+ +        if (!free_gpu(gpu_err_str))
+ +        {
+ +            gmx_warning("On node %d failed to free GPU #%d: %s",
+ +                        cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
+ +        }
+ +    }
+ +
+ +    if (opt2bSet("-membed",nfile,fnm))
+ +    {
+ +        sfree(membed);
+ +    }
+ +
+ +#ifdef GMX_THREAD_MPI
+ +    if (PAR(cr) && SIMMASTER(cr))
+ +#endif
+ +    {
+ +        gmx_hardware_info_free(hwinfo);
+ +    }
+ +
+ +    /* Does what it says */  
+ +    print_date_and_time(fplog,cr->nodeid,"Finished mdrun",&runtime);
+ +
+ +    /* Close logfile already here if we were appending to it */
+ +    if (MASTER(cr) && (Flags & MD_APPENDFILES))
+ +    {
+ +        gmx_log_close(fplog);
+ +    } 
+ +
+ +    rc=(int)gmx_get_stop_condition();
+ +
+ +#ifdef GMX_THREAD_MPI
+ +    /* we need to join all threads. The sub-threads join when they
+ +       exit this function, but the master thread needs to be told to 
+ +       wait for that. */
+ +    if (PAR(cr) && MASTER(cr))
+ +    {
+ +        tMPI_Finalize();
+ +    }
+ +#endif
+ +
+ +    return rc;
+ +}
author	Roland Schulz <roland@utk.edu>
	Fri, 19 Oct 2012 18:50:56 +0000 (14:50 -0400)
committer	Roland Schulz <roland@utk.edu>
	Fri, 19 Oct 2012 19:22:32 +0000 (15:22 -0400)
		1	2
src/gromacs/gmxlib/thread_mpi/pthreads.c	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/gmxlib/thread_mpi/pthreads.h	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/gmxlib/thread_mpi/tmpi_init.c	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/gmxlib/thread_mpi/winthreads.c	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/gmxlib/thread_mpi/winthreads.h	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/legacyheaders/nbnxn_cuda_data_mgmt.h	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/legacyheaders/thread_mpi/threads.h	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/legacyheaders/thread_mpi/tmpi.h	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdlib/clincs.c	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdlib/gmx_wallcycle.c	patch \|	diff1 \|	diff2 \|	blob \| history
src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu	patch \|	diff1 \|	diff2 \|	blob \| history
src/programs/mdrun/CMakeLists.txt	patch \|	diff1 \|	\|	blob \| history
src/programs/mdrun/md.c	patch \|	diff1 \|	\|	blob \| history
src/programs/mdrun/pme_loadbal.c	patch \|	\|	\|	blob
src/programs/mdrun/pme_loadbal.h	patch \|	\|	diff2 \|	blob \| history
src/programs/mdrun/runner.c	patch \|	diff1 \|	\|	blob \| history