--- /dev/null
- #include "pme_switch.h"
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * VERSION 3.2.0
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "typedefs.h"
+#include "smalloc.h"
+#include "sysstuff.h"
+#include "vec.h"
+#include "statutil.h"
+#include "vcm.h"
+#include "mdebin.h"
+#include "nrnb.h"
+#include "calcmu.h"
+#include "index.h"
+#include "vsite.h"
+#include "update.h"
+#include "ns.h"
+#include "trnio.h"
+#include "xtcio.h"
+#include "mdrun.h"
+#include "md_support.h"
+#include "confio.h"
+#include "network.h"
+#include "pull.h"
+#include "xvgr.h"
+#include "physics.h"
+#include "names.h"
+#include "xmdrun.h"
+#include "ionize.h"
+#include "disre.h"
+#include "orires.h"
+#include "pme.h"
+#include "mdatoms.h"
+#include "repl_ex.h"
+#include "qmmm.h"
+#include "domdec.h"
+#include "domdec_network.h"
+#include "partdec.h"
+#include "topsort.h"
+#include "coulomb.h"
+#include "constr.h"
+#include "shellfc.h"
+#include "compute_io.h"
+#include "mvdata.h"
+#include "checkpoint.h"
+#include "mtop_util.h"
+#include "sighandler.h"
+#include "txtdump.h"
+#include "string2.h"
- pme_switch_t pme_switch=NULL;
++#include "pme_loadbal.h"
+#include "bondf.h"
+#include "membed.h"
+#include "types/nlistheuristics.h"
+#include "types/iteratedconstraints.h"
+#include "nbnxn_cuda_data_mgmt.h"
+
+#ifdef GMX_LIB_MPI
+#include <mpi.h>
+#endif
+#ifdef GMX_THREAD_MPI
+#include "tmpi.h"
+#endif
+
+#ifdef GMX_FAHCORE
+#include "corewrap.h"
+#endif
+
+/* Reset all performance-accounting state mid-run so that timing statistics
+ * cover only the remainder of the simulation (used e.g. for benchmarking
+ * after load balancing has settled).  Resets wallcycle counters, flop
+ * counters (nrnb), GPU non-bonded kernel timings and domain-decomposition
+ * statistics, and shifts ir->init_step/ir->nsteps so that *step_rel
+ * restarts from zero while the absolute step count stays consistent.
+ */
+static void reset_all_counters(FILE *fplog,t_commrec *cr,
+ gmx_large_int_t step,
+ gmx_large_int_t *step_rel,t_inputrec *ir,
+ gmx_wallcycle_t wcycle,t_nrnb *nrnb,
+ gmx_runtime_t *runtime,
+ nbnxn_cuda_ptr_t cu_nbv)
+{
+ char sbuf[STEPSTRSIZE];
+
+ /* Reset all the counters related to performance over the run */
+ md_print_warn(cr,fplog,"step %s: resetting all time and cycle counters\n",
+ gmx_step_str(step,sbuf));
+
+ /* cu_nbv is NULL when not running non-bonded kernels on a GPU */
+ if (cu_nbv)
+ {
+ nbnxn_cuda_reset_timings(cu_nbv);
+ }
+
+ /* Stop the RUN counter before wiping, then restart it below so the
+ * run-level cycle count also measures only the remaining steps.
+ */
+ wallcycle_stop(wcycle,ewcRUN);
+ wallcycle_reset_all(wcycle);
+ if (DOMAINDECOMP(cr))
+ {
+ reset_dd_statistics_counters(cr->dd);
+ }
+ init_nrnb(nrnb);
+ /* Rebase the step counting: fold the steps done so far into init_step
+ * and shrink nsteps accordingly, so step_rel can restart at 0 without
+ * changing the absolute step numbers reported elsewhere.
+ */
+ ir->init_step += *step_rel;
+ ir->nsteps -= *step_rel;
+ *step_rel = 0;
+ wallcycle_start(wcycle,ewcRUN);
+ runtime_start(runtime);
+ print_date_and_time(fplog,cr->nodeid,"Restarted time",runtime);
+}
+
+double do_md(FILE *fplog,t_commrec *cr,int nfile,const t_filenm fnm[],
+ const output_env_t oenv, gmx_bool bVerbose,gmx_bool bCompact,
+ int nstglobalcomm,
+ gmx_vsite_t *vsite,gmx_constr_t constr,
+ int stepout,t_inputrec *ir,
+ gmx_mtop_t *top_global,
+ t_fcdata *fcd,
+ t_state *state_global,
+ t_mdatoms *mdatoms,
+ t_nrnb *nrnb,gmx_wallcycle_t wcycle,
+ gmx_edsam_t ed,t_forcerec *fr,
+ int repl_ex_nst,int repl_ex_nex,int repl_ex_seed,gmx_membed_t membed,
+ real cpt_period,real max_hours,
+ const char *deviceOptions,
+ unsigned long Flags,
+ gmx_runtime_t *runtime)
+{
+ gmx_mdoutf_t *outf;
+ gmx_large_int_t step,step_rel;
+ double run_time;
+ double t,t0,lam0[efptNR];
+ gmx_bool bGStatEveryStep,bGStat,bCalcVir,bCalcEner;
+ gmx_bool bNS,bNStList,bSimAnn,bStopCM,bRerunMD,bNotLastFrame=FALSE,
+ bFirstStep,bStateFromCP,bStateFromTPX,bInitStep,bLastStep,
+ bBornRadii,bStartingFromCpt;
+ gmx_bool bDoDHDL=FALSE,bDoFEP=FALSE,bDoExpanded=FALSE;
+ gmx_bool do_ene,do_log,do_verbose,bRerunWarnNoV=TRUE,
+ bForceUpdate=FALSE,bCPT;
+ int mdof_flags;
+ gmx_bool bMasterState;
+ int force_flags,cglo_flags;
+ tensor force_vir,shake_vir,total_vir,tmp_vir,pres;
+ int i,m;
+ t_trxstatus *status;
+ rvec mu_tot;
+ t_vcm *vcm;
+ t_state *bufstate=NULL;
+ matrix *scale_tot,pcoupl_mu,M,ebox;
+ gmx_nlheur_t nlh;
+ t_trxframe rerun_fr;
+ gmx_repl_ex_t repl_ex=NULL;
+ int nchkpt=1;
+ gmx_localtop_t *top;
+ t_mdebin *mdebin=NULL;
+ df_history_t df_history;
+ t_state *state=NULL;
+ rvec *f_global=NULL;
+ int n_xtc=-1;
+ rvec *x_xtc=NULL;
+ gmx_enerdata_t *enerd;
+ rvec *f=NULL;
+ gmx_global_stat_t gstat;
+ gmx_update_t upd=NULL;
+ t_graph *graph=NULL;
+ globsig_t gs;
+ gmx_rng_t mcrng=NULL;
+ gmx_bool bFFscan;
+ gmx_groups_t *groups;
+ gmx_ekindata_t *ekind, *ekind_save;
+ gmx_shellfc_t shellfc;
+ int count,nconverged=0;
+ real timestep=0;
+ double tcount=0;
+ gmx_bool bIonize=FALSE;
+ gmx_bool bTCR=FALSE,bConverged=TRUE,bOK,bSumEkinhOld,bExchanged;
+ gmx_bool bAppend;
+ gmx_bool bResetCountersHalfMaxH=FALSE;
+ gmx_bool bVV,bIterations,bFirstIterate,bTemp,bPres,bTrotter;
+ real mu_aver=0,dvdl;
+ int a0,a1,gnx=0,ii;
+ atom_id *grpindex=NULL;
+ char *grpname;
+ t_coupl_rec *tcr=NULL;
+ rvec *xcopy=NULL,*vcopy=NULL,*cbuf=NULL;
+ matrix boxcopy={{0}},lastbox;
+ tensor tmpvir;
+ real fom,oldfom,veta_save,pcurr,scalevir,tracevir;
+ real vetanew = 0;
+ int lamnew=0;
+ /* for FEP */
+ int fep_state=0;
+ int nstfep;
+ real rate;
+ double cycles;
+ real saved_conserved_quantity = 0;
+ real last_ekin = 0;
+ int iter_i;
+ t_extmass MassQ;
+ int **trotter_seq;
+ char sbuf[STEPSTRSIZE],sbuf2[STEPSTRSIZE];
+ int handled_stop_condition=gmx_stop_cond_none; /* compare to get_stop_condition*/
+ gmx_iterate_t iterate;
+ gmx_large_int_t multisim_nsteps=-1; /* number of steps to do before first multisim
+ simulation stops. If equal to zero, don't
+ communicate any more between multisims.*/
+ /* PME load balancing data for GPU kernels */
- switch_pme_init(&pme_switch,ir,state->box,fr->ic,fr->pmedata);
++ pme_load_balancing_t pme_loadbal=NULL;
+ double cycles_pmes;
+ gmx_bool bPMETuneTry=FALSE,bPMETuneRunning=FALSE;
+
+ if(MASTER(cr))
+ {
+ fprintf(stderr,
+ "\n* WARNING * WARNING * WARNING * WARNING * WARNING * WARNING *\n"
+ "We have just committed the new CPU detection code in this branch,\n"
+ "and will commit new SSE/AVX kernels in a few days. However, this\n"
+ "means that currently only the NxN kernels are accelerated!\n"
+ "In the mean time, you might want to avoid production runs in 4.6.\n\n");
+ }
+
+#ifdef GMX_FAHCORE
+ /* Temporary addition for FAHCORE checkpointing */
+ int chkpt_ret;
+#endif
+
+ /* Check for special mdrun options */
+ bRerunMD = (Flags & MD_RERUN);
+ bIonize = (Flags & MD_IONIZE);
+ bFFscan = (Flags & MD_FFSCAN);
+ bAppend = (Flags & MD_APPENDFILES);
+ if (Flags & MD_RESETCOUNTERSHALFWAY)
+ {
+ if (ir->nsteps > 0)
+ {
+ /* Signal to reset the counters half the simulation steps. */
+ wcycle_set_reset_counters(wcycle,ir->nsteps/2);
+ }
+ /* Signal to reset the counters halfway the simulation time. */
+ bResetCountersHalfMaxH = (max_hours > 0);
+ }
+
+ /* md-vv uses averaged full step velocities for T-control
+ md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
+ md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
+ bVV = EI_VV(ir->eI);
+ if (bVV) /* to store the initial velocities while computing virial */
+ {
+ snew(cbuf,top_global->natoms);
+ }
+ /* all the iteratative cases - only if there are constraints */
+ bIterations = ((IR_NPH_TROTTER(ir) || IR_NPT_TROTTER(ir)) && (constr) && (!bRerunMD));
+ bTrotter = (bVV && (IR_NPT_TROTTER(ir) || IR_NPH_TROTTER(ir) || IR_NVT_TROTTER(ir)));
+
+ if (bRerunMD)
+ {
+ /* Since we don't know if the frames read are related in any way,
+ * rebuild the neighborlist at every step.
+ */
+ ir->nstlist = 1;
+ ir->nstcalcenergy = 1;
+ nstglobalcomm = 1;
+ }
+
+ check_ir_old_tpx_versions(cr,fplog,ir,top_global);
+
+ nstglobalcomm = check_nstglobalcomm(fplog,cr,nstglobalcomm,ir);
+ bGStatEveryStep = (nstglobalcomm == 1);
+
+ if (!bGStatEveryStep && ir->nstlist == -1 && fplog != NULL)
+ {
+ fprintf(fplog,
+ "To reduce the energy communication with nstlist = -1\n"
+ "the neighbor list validity should not be checked at every step,\n"
+ "this means that exact integration is not guaranteed.\n"
+ "The neighbor list validity is checked after:\n"
+ " <n.list life time> - 2*std.dev.(n.list life time) steps.\n"
+ "In most cases this will result in exact integration.\n"
+ "This reduces the energy communication by a factor of 2 to 3.\n"
+ "If you want less energy communication, set nstlist > 3.\n\n");
+ }
+
+ if (bRerunMD || bFFscan)
+ {
+ ir->nstxtcout = 0;
+ }
+ groups = &top_global->groups;
+
+ /* Initial values */
+ init_md(fplog,cr,ir,oenv,&t,&t0,state_global->lambda,
+ &(state_global->fep_state),lam0,
+ nrnb,top_global,&upd,
+ nfile,fnm,&outf,&mdebin,
+ force_vir,shake_vir,mu_tot,&bSimAnn,&vcm,state_global,Flags);
+
+ clear_mat(total_vir);
+ clear_mat(pres);
+ /* Energy terms and groups */
+ snew(enerd,1);
+ init_enerdata(top_global->groups.grps[egcENER].nr,ir->fepvals->n_lambda,
+ enerd);
+ if (DOMAINDECOMP(cr))
+ {
+ f = NULL;
+ }
+ else
+ {
+ snew(f,top_global->natoms);
+ }
+
+ /* lambda Monte carlo random number generator */
+ if (ir->bExpanded)
+ {
+ mcrng = gmx_rng_init(ir->expandedvals->lmc_seed);
+ }
+ /* copy the state into df_history */
+ copy_df_history(&df_history,&state_global->dfhist);
+
+ /* Kinetic energy data */
+ snew(ekind,1);
+ init_ekindata(fplog,top_global,&(ir->opts),ekind);
+ /* needed for iteration of constraints */
+ snew(ekind_save,1);
+ init_ekindata(fplog,top_global,&(ir->opts),ekind_save);
+ /* Copy the cos acceleration to the groups struct */
+ ekind->cosacc.cos_accel = ir->cos_accel;
+
+ gstat = global_stat_init(ir);
+ debug_gmx();
+
+ /* Check for polarizable models and flexible constraints */
+ shellfc = init_shell_flexcon(fplog,
+ top_global,n_flexible_constraints(constr),
+ (ir->bContinuation ||
+ (DOMAINDECOMP(cr) && !MASTER(cr))) ?
+ NULL : state_global->x);
+
+ if (DEFORM(*ir))
+ {
+#ifdef GMX_THREAD_MPI
+ tMPI_Thread_mutex_lock(&deform_init_box_mutex);
+#endif
+ set_deform_reference_box(upd,
+ deform_init_init_step_tpx,
+ deform_init_box_tpx);
+#ifdef GMX_THREAD_MPI
+ tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
+#endif
+ }
+
+ {
+ double io = compute_io(ir,top_global->natoms,groups,mdebin->ebin->nener,1);
+ if ((io > 2000) && MASTER(cr))
+ fprintf(stderr,
+ "\nWARNING: This run will generate roughly %.0f Mb of data\n\n",
+ io);
+ }
+
+ if (DOMAINDECOMP(cr)) {
+ top = dd_init_local_top(top_global);
+
+ snew(state,1);
+ dd_init_local_state(cr->dd,state_global,state);
+
+ if (DDMASTER(cr->dd) && ir->nstfout) {
+ snew(f_global,state_global->natoms);
+ }
+ } else {
+ if (PAR(cr)) {
+ /* Initialize the particle decomposition and split the topology */
+ top = split_system(fplog,top_global,ir,cr);
+
+ pd_cg_range(cr,&fr->cg0,&fr->hcg);
+ pd_at_range(cr,&a0,&a1);
+ } else {
+ top = gmx_mtop_generate_local_top(top_global,ir);
+
+ a0 = 0;
+ a1 = top_global->natoms;
+ }
+
+ forcerec_set_excl_load(fr,top,cr);
+
+ state = partdec_init_local_state(cr,state_global);
+ f_global = f;
+
+ atoms2md(top_global,ir,0,NULL,a0,a1-a0,mdatoms);
+
+ if (vsite) {
+ set_vsite_top(vsite,top,mdatoms,cr);
+ }
+
+ if (ir->ePBC != epbcNONE && !fr->bMolPBC) {
+ graph = mk_graph(fplog,&(top->idef),0,top_global->natoms,FALSE,FALSE);
+ }
+
+ if (shellfc) {
+ make_local_shells(cr,mdatoms,shellfc);
+ }
+
+ init_bonded_thread_force_reduction(fr,&top->idef);
+
+ if (ir->pull && PAR(cr)) {
+ dd_make_local_pull_groups(NULL,ir->pull,mdatoms);
+ }
+ }
+
+ if (DOMAINDECOMP(cr))
+ {
+ /* Distribute the charge groups over the nodes from the master node */
+ dd_partition_system(fplog,ir->init_step,cr,TRUE,1,
+ state_global,top_global,ir,
+ state,&f,mdatoms,top,fr,
+ vsite,shellfc,constr,
+ nrnb,wcycle,FALSE);
+
+ }
+
+ update_mdatoms(mdatoms,state->lambda[efptMASS]);
+
+ if (opt2bSet("-cpi",nfile,fnm))
+ {
+ bStateFromCP = gmx_fexist_master(opt2fn_master("-cpi",nfile,fnm,cr),cr);
+ }
+ else
+ {
+ bStateFromCP = FALSE;
+ }
+
+ if (MASTER(cr))
+ {
+ if (bStateFromCP)
+ {
+ /* Update mdebin with energy history if appending to output files */
+ if ( Flags & MD_APPENDFILES )
+ {
+ restore_energyhistory_from_state(mdebin,&state_global->enerhist);
+ }
+ else
+ {
+ /* We might have read an energy history from checkpoint,
+ * free the allocated memory and reset the counts.
+ */
+ done_energyhistory(&state_global->enerhist);
+ init_energyhistory(&state_global->enerhist);
+ }
+ }
+ /* Set the initial energy history in state by updating once */
+ update_energyhistory(&state_global->enerhist,mdebin);
+ }
+
+ if ((state->flags & (1<<estLD_RNG)) && (Flags & MD_READ_RNG))
+ {
+ /* Set the random state if we read a checkpoint file */
+ set_stochd_state(upd,state);
+ }
+
+ if (state->flags & (1<<estMC_RNG))
+ {
+ set_mc_state(mcrng,state);
+ }
+
+ /* Initialize constraints */
+ if (constr) {
+ if (!DOMAINDECOMP(cr))
+ set_constraints(constr,top,ir,mdatoms,cr);
+ }
+
+ /* Check whether we have to GCT stuff */
+ bTCR = ftp2bSet(efGCT,nfile,fnm);
+ if (bTCR) {
+ if (MASTER(cr)) {
+ fprintf(stderr,"Will do General Coupling Theory!\n");
+ }
+ gnx = top_global->mols.nr;
+ snew(grpindex,gnx);
+ for(i=0; (i<gnx); i++) {
+ grpindex[i] = i;
+ }
+ }
+
+ if (repl_ex_nst > 0)
+ {
+ /* We need to be sure replica exchange can only occur
+ * when the energies are current */
+ check_nst_param(fplog,cr,"nstcalcenergy",ir->nstcalcenergy,
+ "repl_ex_nst",&repl_ex_nst);
+ /* This check needs to happen before inter-simulation
+ * signals are initialized, too */
+ }
+ if (repl_ex_nst > 0 && MASTER(cr))
+ {
+ repl_ex = init_replica_exchange(fplog,cr->ms,state_global,ir,
+ repl_ex_nst,repl_ex_nex,repl_ex_seed);
+ }
+
+ /* PME tuning is only supported with GPUs or PME nodes and not with rerun */
+ if ((Flags & MD_TUNEPME) &&
+ EEL_PME(fr->eeltype) &&
+ fr->cutoff_scheme == ecutsVERLET &&
+ (fr->nbv->bUseGPU || !(cr->duty & DUTY_PME)) &&
+ !bRerunMD)
+ {
- switch_pme(pme_switch,cr,
- (bVerbose && MASTER(cr)) ? stderr : NULL,
- fplog,
- ir,state,cycles_pmes,
- fr->ic,fr->nbv,&fr->pmedata,
- step);
++ pme_loadbal_init(&pme_loadbal,ir,state->box,fr->ic,fr->pmedata);
+ cycles_pmes = 0;
+ if (cr->duty & DUTY_PME)
+ {
+ /* Start tuning right away, as we can't measure the load */
+ bPMETuneRunning = TRUE;
+ }
+ else
+ {
+ /* Separate PME nodes, we can measure the PP/PME load balance */
+ bPMETuneTry = TRUE;
+ }
+ }
+
+ if (!ir->bContinuation && !bRerunMD)
+ {
+ if (mdatoms->cFREEZE && (state->flags & (1<<estV)))
+ {
+ /* Set the velocities of frozen particles to zero */
+ for(i=mdatoms->start; i<mdatoms->start+mdatoms->homenr; i++)
+ {
+ for(m=0; m<DIM; m++)
+ {
+ if (ir->opts.nFreeze[mdatoms->cFREEZE[i]][m])
+ {
+ state->v[i][m] = 0;
+ }
+ }
+ }
+ }
+
+ if (constr)
+ {
+ /* Constrain the initial coordinates and velocities */
+ do_constrain_first(fplog,constr,ir,mdatoms,state,f,
+ graph,cr,nrnb,fr,top,shake_vir);
+ }
+ if (vsite)
+ {
+ /* Construct the virtual sites for the initial configuration */
+ construct_vsites(fplog,vsite,state->x,nrnb,ir->delta_t,NULL,
+ top->idef.iparams,top->idef.il,
+ fr->ePBC,fr->bMolPBC,graph,cr,state->box);
+ }
+ }
+
+ debug_gmx();
+
+ /* set free energy calculation frequency as the minimum of nstdhdl, nstexpanded, and nstrepl_ex_nst*/
+ nstfep = ir->fepvals->nstdhdl;
+ if (ir->bExpanded && (nstfep > ir->expandedvals->nstexpanded))
+ {
+ nstfep = ir->expandedvals->nstexpanded;
+ }
+ if (repl_ex_nst > 0 && repl_ex_nst > nstfep)
+ {
+ nstfep = repl_ex_nst;
+ }
+
+ /* I'm assuming we need global communication the first time! MRS */
+ cglo_flags = (CGLO_TEMPERATURE | CGLO_GSTAT
+ | ((ir->comm_mode != ecmNO) ? CGLO_STOPCM:0)
+ | (bVV ? CGLO_PRESSURE:0)
+ | (bVV ? CGLO_CONSTRAINT:0)
+ | (bRerunMD ? CGLO_RERUNMD:0)
+ | ((Flags & MD_READ_EKIN) ? CGLO_READEKIN:0));
+
+ bSumEkinhOld = FALSE;
+ compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
+ NULL,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
+ constr,NULL,FALSE,state->box,
+ top_global,&pcurr,top_global->natoms,&bSumEkinhOld,cglo_flags);
+ if (ir->eI == eiVVAK) {
+ /* a second call to get the half step temperature initialized as well */
+ /* we do the same call as above, but turn the pressure off -- internally to
+ compute_globals, this is recognized as a velocity verlet half-step
+ kinetic energy calculation. This minimized excess variables, but
+ perhaps loses some logic?*/
+
+ compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
+ NULL,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
+ constr,NULL,FALSE,state->box,
+ top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
+ cglo_flags &~ (CGLO_STOPCM | CGLO_PRESSURE));
+ }
+
+ /* Calculate the initial half step temperature, and save the ekinh_old */
+ if (!(Flags & MD_STARTFROMCPT))
+ {
+ for(i=0; (i<ir->opts.ngtc); i++)
+ {
+ copy_mat(ekind->tcstat[i].ekinh,ekind->tcstat[i].ekinh_old);
+ }
+ }
+ if (ir->eI != eiVV)
+ {
+ enerd->term[F_TEMP] *= 2; /* result of averages being done over previous and current step,
+ and there is no previous step */
+ }
+
+ /* if using an iterative algorithm, we need to create a working directory for the state. */
+ if (bIterations)
+ {
+ bufstate = init_bufstate(state);
+ }
+ if (bFFscan)
+ {
+ snew(xcopy,state->natoms);
+ snew(vcopy,state->natoms);
+ copy_rvecn(state->x,xcopy,0,state->natoms);
+ copy_rvecn(state->v,vcopy,0,state->natoms);
+ copy_mat(state->box,boxcopy);
+ }
+
+ /* need to make an initiation call to get the Trotter variables set, as well as other constants for non-trotter
+ temperature control */
+ trotter_seq = init_npt_vars(ir,state,&MassQ,bTrotter);
+
+ if (MASTER(cr))
+ {
+ if (constr && !ir->bContinuation && ir->eConstrAlg == econtLINCS)
+ {
+ fprintf(fplog,
+ "RMS relative constraint deviation after constraining: %.2e\n",
+ constr_rmsd(constr,FALSE));
+ }
+ if (EI_STATE_VELOCITY(ir->eI))
+ {
+ fprintf(fplog,"Initial temperature: %g K\n",enerd->term[F_TEMP]);
+ }
+ if (bRerunMD)
+ {
+ fprintf(stderr,"starting md rerun '%s', reading coordinates from"
+ " input trajectory '%s'\n\n",
+ *(top_global->name),opt2fn("-rerun",nfile,fnm));
+ if (bVerbose)
+ {
+ fprintf(stderr,"Calculated time to finish depends on nsteps from "
+ "run input file,\nwhich may not correspond to the time "
+ "needed to process input trajectory.\n\n");
+ }
+ }
+ else
+ {
+ char tbuf[20];
+ fprintf(stderr,"starting mdrun '%s'\n",
+ *(top_global->name));
+ if (ir->nsteps >= 0)
+ {
+ sprintf(tbuf,"%8.1f",(ir->init_step+ir->nsteps)*ir->delta_t);
+ }
+ else
+ {
+ sprintf(tbuf,"%s","infinite");
+ }
+ if (ir->init_step > 0)
+ {
+ fprintf(stderr,"%s steps, %s ps (continuing from step %s, %8.1f ps).\n",
+ gmx_step_str(ir->init_step+ir->nsteps,sbuf),tbuf,
+ gmx_step_str(ir->init_step,sbuf2),
+ ir->init_step*ir->delta_t);
+ }
+ else
+ {
+ fprintf(stderr,"%s steps, %s ps.\n",
+ gmx_step_str(ir->nsteps,sbuf),tbuf);
+ }
+ }
+ fprintf(fplog,"\n");
+ }
+
+ /* Set and write start time */
+ runtime_start(runtime);
+ print_date_and_time(fplog,cr->nodeid,"Started mdrun",runtime);
+ wallcycle_start(wcycle,ewcRUN);
+ if (fplog)
+ {
+ fprintf(fplog,"\n");
+ }
+
+ /* safest point to do file checkpointing is here. More general point would be immediately before integrator call */
+#ifdef GMX_FAHCORE
+ chkpt_ret=fcCheckPointParallel( cr->nodeid,
+ NULL,0);
+ if ( chkpt_ret == 0 )
+ gmx_fatal( 3,__FILE__,__LINE__, "Checkpoint error on step %d\n", 0 );
+#endif
+
+ debug_gmx();
+ /***********************************************************
+ *
+ * Loop over MD steps
+ *
+ ************************************************************/
+
+ /* if rerunMD then read coordinates and velocities from input trajectory */
+ if (bRerunMD)
+ {
+ if (getenv("GMX_FORCE_UPDATE"))
+ {
+ bForceUpdate = TRUE;
+ }
+
+ rerun_fr.natoms = 0;
+ if (MASTER(cr))
+ {
+ bNotLastFrame = read_first_frame(oenv,&status,
+ opt2fn("-rerun",nfile,fnm),
+ &rerun_fr,TRX_NEED_X | TRX_READ_V);
+ if (rerun_fr.natoms != top_global->natoms)
+ {
+ gmx_fatal(FARGS,
+ "Number of atoms in trajectory (%d) does not match the "
+ "run input file (%d)\n",
+ rerun_fr.natoms,top_global->natoms);
+ }
+ if (ir->ePBC != epbcNONE)
+ {
+ if (!rerun_fr.bBox)
+ {
+ gmx_fatal(FARGS,"Rerun trajectory frame step %d time %f does not contain a box, while pbc is used",rerun_fr.step,rerun_fr.time);
+ }
+ if (max_cutoff2(ir->ePBC,rerun_fr.box) < sqr(fr->rlistlong))
+ {
+ gmx_fatal(FARGS,"Rerun trajectory frame step %d time %f has too small box dimensions",rerun_fr.step,rerun_fr.time);
+ }
+ }
+ }
+
+ if (PAR(cr))
+ {
+ rerun_parallel_comm(cr,&rerun_fr,&bNotLastFrame);
+ }
+
+ if (ir->ePBC != epbcNONE)
+ {
+ /* Set the shift vectors.
+ * Necessary here when have a static box different from the tpr box.
+ */
+ calc_shifts(rerun_fr.box,fr->shift_vec);
+ }
+ }
+
+ /* loop over MD steps or if rerunMD to end of input trajectory */
+ bFirstStep = TRUE;
+ /* Skip the first Nose-Hoover integration when we get the state from tpx */
+ bStateFromTPX = !bStateFromCP;
+ bInitStep = bFirstStep && (bStateFromTPX || bVV);
+ bStartingFromCpt = (Flags & MD_STARTFROMCPT) && bInitStep;
+ bLastStep = FALSE;
+ bSumEkinhOld = FALSE;
+ bExchanged = FALSE;
+
+ init_global_signals(&gs,cr,ir,repl_ex_nst);
+
+ step = ir->init_step;
+ step_rel = 0;
+
+ if (ir->nstlist == -1)
+ {
+ init_nlistheuristics(&nlh,bGStatEveryStep,step);
+ }
+
+ if (MULTISIM(cr) && (repl_ex_nst <=0 ))
+ {
+ /* check how many steps are left in other sims */
+ multisim_nsteps=get_multisim_nsteps(cr, ir->nsteps);
+ }
+
+
+ /* and stop now if we should */
+ bLastStep = (bRerunMD || (ir->nsteps >= 0 && step_rel > ir->nsteps) ||
+ ((multisim_nsteps >= 0) && (step_rel >= multisim_nsteps )));
+ while (!bLastStep || (bRerunMD && bNotLastFrame)) {
+
+ wallcycle_start(wcycle,ewcSTEP);
+
+ if (bRerunMD) {
+ if (rerun_fr.bStep) {
+ step = rerun_fr.step;
+ step_rel = step - ir->init_step;
+ }
+ if (rerun_fr.bTime) {
+ t = rerun_fr.time;
+ }
+ else
+ {
+ t = step;
+ }
+ }
+ else
+ {
+ bLastStep = (step_rel == ir->nsteps);
+ t = t0 + step*ir->delta_t;
+ }
+
+ if (ir->efep != efepNO || ir->bSimTemp)
+ {
+ /* find and set the current lambdas. If rerunning, we either read in a state, or a lambda value,
+ requiring different logic. */
+
+ set_current_lambdas(step,ir->fepvals,bRerunMD,&rerun_fr,state_global,state,lam0);
+ bDoDHDL = do_per_step(step,ir->fepvals->nstdhdl);
+ bDoFEP = (do_per_step(step,nstfep) && (ir->efep != efepNO));
+ bDoExpanded = (do_per_step(step,ir->expandedvals->nstexpanded) && (ir->bExpanded) && (step > 0));
+ }
+
+ if (bSimAnn)
+ {
+ update_annealing_target_temp(&(ir->opts),t);
+ }
+
+ if (bRerunMD)
+ {
+ if (!(DOMAINDECOMP(cr) && !MASTER(cr)))
+ {
+ for(i=0; i<state_global->natoms; i++)
+ {
+ copy_rvec(rerun_fr.x[i],state_global->x[i]);
+ }
+ if (rerun_fr.bV)
+ {
+ for(i=0; i<state_global->natoms; i++)
+ {
+ copy_rvec(rerun_fr.v[i],state_global->v[i]);
+ }
+ }
+ else
+ {
+ for(i=0; i<state_global->natoms; i++)
+ {
+ clear_rvec(state_global->v[i]);
+ }
+ if (bRerunWarnNoV)
+ {
+ fprintf(stderr,"\nWARNING: Some frames do not contain velocities.\n"
+ " Ekin, temperature and pressure are incorrect,\n"
+ " the virial will be incorrect when constraints are present.\n"
+ "\n");
+ bRerunWarnNoV = FALSE;
+ }
+ }
+ }
+ copy_mat(rerun_fr.box,state_global->box);
+ copy_mat(state_global->box,state->box);
+
+ if (vsite && (Flags & MD_RERUN_VSITE))
+ {
+ if (DOMAINDECOMP(cr))
+ {
+ gmx_fatal(FARGS,"Vsite recalculation with -rerun is not implemented for domain decomposition, use particle decomposition");
+ }
+ if (graph)
+ {
+ /* Following is necessary because the graph may get out of sync
+ * with the coordinates if we only have every N'th coordinate set
+ */
+ mk_mshift(fplog,graph,fr->ePBC,state->box,state->x);
+ shift_self(graph,state->box,state->x);
+ }
+ construct_vsites(fplog,vsite,state->x,nrnb,ir->delta_t,state->v,
+ top->idef.iparams,top->idef.il,
+ fr->ePBC,fr->bMolPBC,graph,cr,state->box);
+ if (graph)
+ {
+ unshift_self(graph,state->box,state->x);
+ }
+ }
+ }
+
+ /* Stop Center of Mass motion */
+ bStopCM = (ir->comm_mode != ecmNO && do_per_step(step,ir->nstcomm));
+
+ /* Copy back starting coordinates in case we're doing a forcefield scan */
+ if (bFFscan)
+ {
+ for(ii=0; (ii<state->natoms); ii++)
+ {
+ copy_rvec(xcopy[ii],state->x[ii]);
+ copy_rvec(vcopy[ii],state->v[ii]);
+ }
+ copy_mat(boxcopy,state->box);
+ }
+
+ if (bRerunMD)
+ {
+ /* for rerun MD always do Neighbour Searching */
+ bNS = (bFirstStep || ir->nstlist != 0);
+ bNStList = bNS;
+ }
+ else
+ {
+ /* Determine whether or not to do Neighbour Searching and LR */
+ bNStList = (ir->nstlist > 0 && step % ir->nstlist == 0);
+
+ bNS = (bFirstStep || bExchanged || bNStList || bDoFEP ||
+ (ir->nstlist == -1 && nlh.nabnsb > 0));
+
+ if (bNS && ir->nstlist == -1)
+ {
+ set_nlistheuristics(&nlh,bFirstStep || bExchanged || bDoFEP, step);
+ }
+ }
+
+ /* check whether we should stop because another simulation has
+ stopped. */
+ if (MULTISIM(cr))
+ {
+ if ( (multisim_nsteps >= 0) && (step_rel >= multisim_nsteps) &&
+ (multisim_nsteps != ir->nsteps) )
+ {
+ if (bNS)
+ {
+ if (MASTER(cr))
+ {
+ fprintf(stderr,
+ "Stopping simulation %d because another one has finished\n",
+ cr->ms->sim);
+ }
+ bLastStep=TRUE;
+ gs.sig[eglsCHKPT] = 1;
+ }
+ }
+ }
+
+ /* < 0 means stop at next step, > 0 means stop at next NS step */
+ if ( (gs.set[eglsSTOPCOND] < 0 ) ||
+ ( (gs.set[eglsSTOPCOND] > 0 ) && ( bNS || ir->nstlist==0)) )
+ {
+ bLastStep = TRUE;
+ }
+
+ /* Determine whether or not to update the Born radii if doing GB */
+ bBornRadii=bFirstStep;
+ if (ir->implicit_solvent && (step % ir->nstgbradii==0))
+ {
+ bBornRadii=TRUE;
+ }
+
+ do_log = do_per_step(step,ir->nstlog) || bFirstStep || bLastStep;
+ do_verbose = bVerbose &&
+ (step % stepout == 0 || bFirstStep || bLastStep);
+
+ if (bNS && !(bFirstStep && ir->bContinuation && !bRerunMD))
+ {
+ if (bRerunMD)
+ {
+ bMasterState = TRUE;
+ }
+ else
+ {
+ bMasterState = FALSE;
+ /* Correct the new box if it is too skewed */
+ if (DYNAMIC_BOX(*ir))
+ {
+ if (correct_box(fplog,step,state->box,graph))
+ {
+ bMasterState = TRUE;
+ }
+ }
+ if (DOMAINDECOMP(cr) && bMasterState)
+ {
+ dd_collect_state(cr->dd,state,state_global);
+ }
+ }
+
+ if (DOMAINDECOMP(cr))
+ {
+ /* Repartition the domain decomposition */
+ wallcycle_start(wcycle,ewcDOMDEC);
+ dd_partition_system(fplog,step,cr,
+ bMasterState,nstglobalcomm,
+ state_global,top_global,ir,
+ state,&f,mdatoms,top,fr,
+ vsite,shellfc,constr,
+ nrnb,wcycle,
+ do_verbose && !bPMETuneRunning);
+ wallcycle_stop(wcycle,ewcDOMDEC);
+ /* If using an iterative integrator, reallocate space to match the decomposition */
+ }
+ }
+
+ if (MASTER(cr) && do_log && !bFFscan)
+ {
+ print_ebin_header(fplog,step,t,state->lambda[efptFEP]); /* can we improve the information printed here? */
+ }
+
+ if (ir->efep != efepNO)
+ {
+ update_mdatoms(mdatoms,state->lambda[efptMASS]);
+ }
+
+ if ((bRerunMD && rerun_fr.bV) || bExchanged)
+ {
+
+ /* We need the kinetic energy at minus the half step for determining
+ * the full step kinetic energy and possibly for T-coupling.*/
+ /* This may not be quite working correctly yet . . . . */
+ compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
+ wcycle,enerd,NULL,NULL,NULL,NULL,mu_tot,
+ constr,NULL,FALSE,state->box,
+ top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
+ CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
+ }
+ clear_mat(force_vir);
+
+ /* Ionize the atoms if necessary */
+ if (bIonize)
+ {
+ ionize(fplog,oenv,mdatoms,top_global,t,ir,state->x,state->v,
+ mdatoms->start,mdatoms->start+mdatoms->homenr,state->box,cr);
+ }
+
+ /* Update force field in ffscan program */
+ if (bFFscan)
+ {
+ if (update_forcefield(fplog,
+ nfile,fnm,fr,
+ mdatoms->nr,state->x,state->box))
+ {
+ gmx_finalize_par();
+
+ exit(0);
+ }
+ }
+
+ /* We write a checkpoint at this MD step when:
+ * either at an NS step when we signalled through gs,
+ * or at the last step (but not when we do not want confout),
+ * but never at the first step or with rerun.
+ */
+ bCPT = (((gs.set[eglsCHKPT] && (bNS || ir->nstlist == 0)) ||
+ (bLastStep && (Flags & MD_CONFOUT))) &&
+ step > ir->init_step && !bRerunMD);
+ if (bCPT)
+ {
+ gs.set[eglsCHKPT] = 0;
+ }
+
+ /* Determine the energy and pressure:
+ * at nstcalcenergy steps and at energy output steps (set below).
+ */
+ if (EI_VV(ir->eI) && (!bInitStep))
+ {
+ /* for vv, the first half actually corresponds to the last step */
+ bCalcEner = do_per_step(step-1,ir->nstcalcenergy);
+ }
+ else
+ {
+ bCalcEner = do_per_step(step,ir->nstcalcenergy);
+ }
+ bCalcVir = bCalcEner ||
+ (ir->epc != epcNO && do_per_step(step,ir->nstpcouple));
+
+ /* Do we need global communication ? */
+ bGStat = (bCalcVir || bCalcEner || bStopCM ||
+ do_per_step(step,nstglobalcomm) ||
+ (ir->nstlist == -1 && !bRerunMD && step >= nlh.step_nscheck));
+
+ do_ene = (do_per_step(step,ir->nstenergy) || bLastStep);
+
+ if (do_ene || do_log)
+ {
+ bCalcVir = TRUE;
+ bCalcEner = TRUE;
+ bGStat = TRUE;
+ }
+
+ /* these CGLO_ options remain the same throughout the iteration */
+ cglo_flags = ((bRerunMD ? CGLO_RERUNMD : 0) |
+ (bGStat ? CGLO_GSTAT : 0)
+ );
+
+ force_flags = (GMX_FORCE_STATECHANGED |
+ ((DYNAMIC_BOX(*ir) || bRerunMD) ? GMX_FORCE_DYNAMICBOX : 0) |
+ GMX_FORCE_ALLFORCES |
+ (bNStList ? GMX_FORCE_DOLR : 0) |
+ GMX_FORCE_SEPLRF |
+ (bCalcVir ? GMX_FORCE_VIRIAL : 0) |
+ (bCalcEner ? GMX_FORCE_ENERGY : 0) |
+ (bDoFEP ? GMX_FORCE_DHDL : 0)
+ );
+
+ if (shellfc)
+ {
+ /* Now is the time to relax the shells */
+ count=relax_shell_flexcon(fplog,cr,bVerbose,bFFscan ? step+1 : step,
+ ir,bNS,force_flags,
+ bStopCM,top,top_global,
+ constr,enerd,fcd,
+ state,f,force_vir,mdatoms,
+ nrnb,wcycle,graph,groups,
+ shellfc,fr,bBornRadii,t,mu_tot,
+ state->natoms,&bConverged,vsite,
+ outf->fp_field);
+ tcount+=count;
+
+ if (bConverged)
+ {
+ nconverged++;
+ }
+ }
+ else
+ {
+ /* The coordinates (x) are shifted (to get whole molecules)
+ * in do_force.
+ * This is parallellized as well, and does communication too.
+ * Check comments in sim_util.c
+ */
+ do_force(fplog,cr,ir,step,nrnb,wcycle,top,top_global,groups,
+ state->box,state->x,&state->hist,
+ f,force_vir,mdatoms,enerd,fcd,
+ state->lambda,graph,
+ fr,vsite,mu_tot,t,outf->fp_field,ed,bBornRadii,
+ (bNS ? GMX_FORCE_NS : 0) | force_flags);
+ }
+
+ if (bTCR)
+ {
+ mu_aver = calc_mu_aver(cr,state->x,mdatoms->chargeA,
+ mu_tot,&top_global->mols,mdatoms,gnx,grpindex);
+ }
+
+ if (bTCR && bFirstStep)
+ {
+ tcr=init_coupling(fplog,nfile,fnm,cr,fr,mdatoms,&(top->idef));
+ fprintf(fplog,"Done init_coupling\n");
+ fflush(fplog);
+ }
+
+ if (bVV && !bStartingFromCpt && !bRerunMD)
+ /* ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
+ {
+ if (ir->eI==eiVV && bInitStep)
+ {
+ /* if using velocity verlet with full time step Ekin,
+ * take the first half step only to compute the
+ * virial for the first step. From there,
+ * revert back to the initial coordinates
+ * so that the input is actually the initial step.
+ */
+ copy_rvecn(state->v,cbuf,0,state->natoms); /* should make this better for parallelizing? */
+ } else {
+ /* this is for NHC in the Ekin(t+dt/2) version of vv */
+ trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ1);
+ }
+
+ update_coords(fplog,step,ir,mdatoms,state,fr->bMolPBC,
+ f,fr->bTwinRange && bNStList,fr->f_twin,fcd,
+ ekind,M,wcycle,upd,bInitStep,etrtVELOCITY1,
+ cr,nrnb,constr,&top->idef);
+
+ if (bIterations)
+ {
+ gmx_iterate_init(&iterate,bIterations && !bInitStep);
+ }
+ /* for iterations, we save these vectors, as we will be self-consistently iterating
+ the calculations */
+
+ /*#### UPDATE EXTENDED VARIABLES IN TROTTER FORMULATION */
+
+ /* save the state */
+ if (bIterations && iterate.bIterate) {
+ copy_coupling_state(state,bufstate,ekind,ekind_save,&(ir->opts));
+ }
+
+ bFirstIterate = TRUE;
+ while (bFirstIterate || (bIterations && iterate.bIterate))
+ {
+ if (bIterations && iterate.bIterate)
+ {
+ copy_coupling_state(bufstate,state,ekind_save,ekind,&(ir->opts));
+ if (bFirstIterate && bTrotter)
+ {
+ /* The first time through, we need a decent first estimate
+ of veta(t+dt) to compute the constraints. Do
+ this by computing the box volume part of the
+ trotter integration at this time. Nothing else
+ should be changed by this routine here. If
+ !(first time), we start with the previous value
+ of veta. */
+
+ veta_save = state->veta;
+ trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ0);
+ vetanew = state->veta;
+ state->veta = veta_save;
+ }
+ }
+
+ bOK = TRUE;
+ if ( !bRerunMD || rerun_fr.bV || bForceUpdate) { /* Why is rerun_fr.bV here? Unclear. */
+ dvdl = 0;
+
+ update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,
+ state,fr->bMolPBC,graph,f,
+ &top->idef,shake_vir,NULL,
+ cr,nrnb,wcycle,upd,constr,
+ bInitStep,TRUE,bCalcVir,vetanew);
+
+ if (!bOK && !bFFscan)
+ {
+ gmx_fatal(FARGS,"Constraint error: Shake, Lincs or Settle could not solve the constrains");
+ }
+
+ }
+ else if (graph)
+ { /* Need to unshift here if a do_force has been
+ called in the previous step */
+ unshift_self(graph,state->box,state->x);
+ }
+
+
+ /* if VV, compute the pressure and constraints */
+ /* For VV2, we strictly only need this if using pressure
+ * control, but we really would like to have accurate pressures
+ * printed out.
+ * Think about ways around this in the future?
+ * For now, keep this choice in comments.
+ */
+ /*bPres = (ir->eI==eiVV || IR_NPT_TROTTER(ir)); */
+ /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && IR_NPT_TROTTER(ir)));*/
+ bPres = TRUE;
+ bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK));
+ if (bCalcEner && ir->eI==eiVVAK) /*MRS: 7/9/2010 -- this still doesn't fix it?*/
+ {
+ bSumEkinhOld = TRUE;
+ }
+ compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
+ wcycle,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
+ constr,NULL,FALSE,state->box,
+ top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
+ cglo_flags
+ | CGLO_ENERGY
+ | (bStopCM ? CGLO_STOPCM : 0)
+ | (bTemp ? CGLO_TEMPERATURE:0)
+ | (bPres ? CGLO_PRESSURE : 0)
+ | (bPres ? CGLO_CONSTRAINT : 0)
+ | ((bIterations && iterate.bIterate) ? CGLO_ITERATE : 0)
+ | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
+ | CGLO_SCALEEKIN
+ );
+ /* explanation of above:
+ a) We compute Ekin at the full time step
+ if 1) we are using the AveVel Ekin, and it's not the
+ initial step, or 2) if we are using AveEkin, but need the full
+ time step kinetic energy for the pressure (always true now, since we want accurate statistics).
+ b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
+ EkinAveVel because it's needed for the pressure */
+
+ /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
+ if (!bInitStep)
+ {
+ if (bTrotter)
+ {
+ trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ2);
+ }
+ else
+ {
+ if (bExchanged)
+ {
+
+ /* We need the kinetic energy at minus the half step for determining
+ * the full step kinetic energy and possibly for T-coupling.*/
+ /* This may not be quite working correctly yet . . . . */
+ compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
+ wcycle,enerd,NULL,NULL,NULL,NULL,mu_tot,
+ constr,NULL,FALSE,state->box,
+ top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
+ CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
+ }
+
+
+ update_tcouple(fplog,step,ir,state,ekind,wcycle,upd,&MassQ,mdatoms);
+ }
+ }
+
+ if (bIterations &&
+ done_iterating(cr,fplog,step,&iterate,bFirstIterate,
+ state->veta,&vetanew))
+ {
+ break;
+ }
+ bFirstIterate = FALSE;
+ }
+
+ if (bTrotter && !bInitStep) {
+ enerd->term[F_DVDL_BONDED] += dvdl; /* only add after iterations */
+ copy_mat(shake_vir,state->svir_prev);
+ copy_mat(force_vir,state->fvir_prev);
+ if (IR_NVT_TROTTER(ir) && ir->eI==eiVV) {
+ /* update temperature and kinetic energy now that step is over - this is the v(t+dt) point */
+ enerd->term[F_TEMP] = sum_ekin(&(ir->opts),ekind,NULL,(ir->eI==eiVV),FALSE,FALSE);
+ enerd->term[F_EKIN] = trace(ekind->ekin);
+ }
+ }
+ /* if it's the initial step, we performed this first step just to get the constraint virial */
+ if (bInitStep && ir->eI==eiVV) {
+ copy_rvecn(cbuf,state->v,0,state->natoms);
+ }
+
+ if (fr->bSepDVDL && fplog && do_log)
+ {
+ fprintf(fplog,sepdvdlformat,"Constraint",0.0,dvdl);
+ }
+ enerd->term[F_DVDL_BONDED] += dvdl;
+ }
+
+ /* MRS -- now done iterating -- compute the conserved quantity */
+ if (bVV) {
+ saved_conserved_quantity = compute_conserved_from_auxiliary(ir,state,&MassQ);
+ if (ir->eI==eiVV)
+ {
+ last_ekin = enerd->term[F_EKIN];
+ }
+ if ((ir->eDispCorr != edispcEnerPres) && (ir->eDispCorr != edispcAllEnerPres))
+ {
+ saved_conserved_quantity -= enerd->term[F_DISPCORR];
+ }
+ /* sum up the foreign energy and dhdl terms for vv. currently done every step so that dhdl is correct in the .edr */
+ sum_dhdl(enerd,state->lambda,ir->fepvals);
+ }
+
+ /* ######## END FIRST UPDATE STEP ############## */
+ /* ######## If doing VV, we now have v(dt) ###### */
+ if (bDoExpanded) {
+ /* perform extended ensemble sampling in lambda - we don't
+ actually move to the new state before outputting
+ statistics, but if performing simulated tempering, we
+ do update the velocities and the tau_t. */
+
+ lamnew = ExpandedEnsembleDynamics(fplog,ir,enerd,state,&MassQ,&df_history,step,mcrng,state->v,mdatoms);
+ }
+ /* ################## START TRAJECTORY OUTPUT ################# */
+
+ /* Now we have the energies and forces corresponding to the
+ * coordinates at time t. We must output all of this before
+ * the update.
+ * for RerunMD t is read from input trajectory
+ */
+ mdof_flags = 0;
+ if (do_per_step(step,ir->nstxout)) { mdof_flags |= MDOF_X; }
+ if (do_per_step(step,ir->nstvout)) { mdof_flags |= MDOF_V; }
+ if (do_per_step(step,ir->nstfout)) { mdof_flags |= MDOF_F; }
+ if (do_per_step(step,ir->nstxtcout)) { mdof_flags |= MDOF_XTC; }
+ if (bCPT) { mdof_flags |= MDOF_CPT; };
+
+#if defined(GMX_FAHCORE) || defined(GMX_WRITELASTSTEP)
+ if (bLastStep)
+ {
+ /* Enforce writing positions and velocities at end of run */
+ mdof_flags |= (MDOF_X | MDOF_V);
+ }
+#endif
+#ifdef GMX_FAHCORE
+ if (MASTER(cr))
+ fcReportProgress( ir->nsteps, step );
+
+ /* sync bCPT and fc record-keeping */
+ if (bCPT && MASTER(cr))
+ fcRequestCheckPoint();
+#endif
+
+ if (mdof_flags != 0)
+ {
+ wallcycle_start(wcycle,ewcTRAJ);
+ if (bCPT)
+ {
+ if (state->flags & (1<<estLD_RNG))
+ {
+ get_stochd_state(upd,state);
+ }
+ if (state->flags & (1<<estMC_RNG))
+ {
+ get_mc_state(mcrng,state);
+ }
+ if (MASTER(cr))
+ {
+ if (bSumEkinhOld)
+ {
+ state_global->ekinstate.bUpToDate = FALSE;
+ }
+ else
+ {
+ update_ekinstate(&state_global->ekinstate,ekind);
+ state_global->ekinstate.bUpToDate = TRUE;
+ }
+ update_energyhistory(&state_global->enerhist,mdebin);
+ if (ir->efep!=efepNO || ir->bSimTemp)
+ {
+ state_global->fep_state = state->fep_state; /* MRS: seems kludgy. The code should be
+ structured so this isn't necessary.
+ Note this reassignment is only necessary
+ for single threads.*/
+ copy_df_history(&state_global->dfhist,&df_history);
+ }
+ }
+ }
+ write_traj(fplog,cr,outf,mdof_flags,top_global,
+ step,t,state,state_global,f,f_global,&n_xtc,&x_xtc);
+ if (bCPT)
+ {
+ nchkpt++;
+ bCPT = FALSE;
+ }
+ debug_gmx();
+ if (bLastStep && step_rel == ir->nsteps &&
+ (Flags & MD_CONFOUT) && MASTER(cr) &&
+ !bRerunMD && !bFFscan)
+ {
+ /* x and v have been collected in write_traj,
+ * because a checkpoint file will always be written
+ * at the last step.
+ */
+ fprintf(stderr,"\nWriting final coordinates.\n");
+ if (fr->bMolPBC)
+ {
+ /* Make molecules whole only for confout writing */
+ do_pbc_mtop(fplog,ir->ePBC,state->box,top_global,state_global->x);
+ }
+ write_sto_conf_mtop(ftp2fn(efSTO,nfile,fnm),
+ *top_global->name,top_global,
+ state_global->x,state_global->v,
+ ir->ePBC,state->box);
+ debug_gmx();
+ }
+ wallcycle_stop(wcycle,ewcTRAJ);
+ }
+
+ /* kludge -- virial is lost with restart for NPT control. Must restart */
+ if (bStartingFromCpt && bVV)
+ {
+ copy_mat(state->svir_prev,shake_vir);
+ copy_mat(state->fvir_prev,force_vir);
+ }
+ /* ################## END TRAJECTORY OUTPUT ################ */
+
+ /* Determine the wallclock run time up till now */
+ run_time = gmx_gettime() - (double)runtime->real;
+
+ /* Check whether everything is still allright */
+ if (((int)gmx_get_stop_condition() > handled_stop_condition)
+#ifdef GMX_THREAD_MPI
+ && MASTER(cr)
+#endif
+ )
+ {
+ /* this is just make gs.sig compatible with the hack
+ of sending signals around by MPI_Reduce with together with
+ other floats */
+ if ( gmx_get_stop_condition() == gmx_stop_cond_next_ns )
+ gs.sig[eglsSTOPCOND]=1;
+ if ( gmx_get_stop_condition() == gmx_stop_cond_next )
+ gs.sig[eglsSTOPCOND]=-1;
+ /* < 0 means stop at next step, > 0 means stop at next NS step */
+ if (fplog)
+ {
+ fprintf(fplog,
+ "\n\nReceived the %s signal, stopping at the next %sstep\n\n",
+ gmx_get_signal_name(),
+ gs.sig[eglsSTOPCOND]==1 ? "NS " : "");
+ fflush(fplog);
+ }
+ fprintf(stderr,
+ "\n\nReceived the %s signal, stopping at the next %sstep\n\n",
+ gmx_get_signal_name(),
+ gs.sig[eglsSTOPCOND]==1 ? "NS " : "");
+ fflush(stderr);
+ handled_stop_condition=(int)gmx_get_stop_condition();
+ }
+ else if (MASTER(cr) && (bNS || ir->nstlist <= 0) &&
+ (max_hours > 0 && run_time > max_hours*60.0*60.0*0.99) &&
+ gs.sig[eglsSTOPCOND] == 0 && gs.set[eglsSTOPCOND] == 0)
+ {
+ /* Signal to terminate the run */
+ gs.sig[eglsSTOPCOND] = 1;
+ if (fplog)
+ {
+ fprintf(fplog,"\nStep %s: Run time exceeded %.3f hours, will terminate the run\n",gmx_step_str(step,sbuf),max_hours*0.99);
+ }
+ fprintf(stderr, "\nStep %s: Run time exceeded %.3f hours, will terminate the run\n",gmx_step_str(step,sbuf),max_hours*0.99);
+ }
+
+ if (bResetCountersHalfMaxH && MASTER(cr) &&
+ run_time > max_hours*60.0*60.0*0.495)
+ {
+ gs.sig[eglsRESETCOUNTERS] = 1;
+ }
+
+ if (ir->nstlist == -1 && !bRerunMD)
+ {
+ /* When bGStatEveryStep=FALSE, global_stat is only called
+ * when we check the atom displacements, not at NS steps.
+ * This means that also the bonded interaction count check is not
+ * performed immediately after NS. Therefore a few MD steps could
+ * be performed with missing interactions.
+ * But wrong energies are never written to file,
+ * since energies are only written after global_stat
+ * has been called.
+ */
+ if (step >= nlh.step_nscheck)
+ {
+ nlh.nabnsb = natoms_beyond_ns_buffer(ir,fr,&top->cgs,
+ nlh.scale_tot,state->x);
+ }
+ else
+ {
+ /* This is not necessarily true,
+ * but step_nscheck is determined quite conservatively.
+ */
+ nlh.nabnsb = 0;
+ }
+ }
+
+ /* In parallel we only have to check for checkpointing in steps
+ * where we do global communication,
+ * otherwise the other nodes don't know.
+ */
+ if (MASTER(cr) && ((bGStat || !PAR(cr)) &&
+ cpt_period >= 0 &&
+ (cpt_period == 0 ||
+ run_time >= nchkpt*cpt_period*60.0)) &&
+ gs.set[eglsCHKPT] == 0)
+ {
+ gs.sig[eglsCHKPT] = 1;
+ }
+
+
+ /* at the start of step, randomize the velocities */
+ if (ETC_ANDERSEN(ir->etc) && EI_VV(ir->eI))
+ {
+ gmx_bool bDoAndersenConstr;
+ bDoAndersenConstr = (constr && update_randomize_velocities(ir,step,mdatoms,state,upd,&top->idef,constr));
+ /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
+ if (bDoAndersenConstr)
+ {
+ update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,
+ state,fr->bMolPBC,graph,f,
+ &top->idef,tmp_vir,NULL,
+ cr,nrnb,wcycle,upd,constr,
+ bInitStep,TRUE,bCalcVir,vetanew);
+ }
+ }
+
+ if (bIterations)
+ {
+ gmx_iterate_init(&iterate,bIterations);
+ }
+
+ /* for iterations, we save these vectors, as we will be redoing the calculations */
+ if (bIterations && iterate.bIterate)
+ {
+ copy_coupling_state(state,bufstate,ekind,ekind_save,&(ir->opts));
+ }
+ bFirstIterate = TRUE;
+ while (bFirstIterate || (bIterations && iterate.bIterate))
+ {
+ /* We now restore these vectors to redo the calculation with improved extended variables */
+ if (bIterations)
+ {
+ copy_coupling_state(bufstate,state,ekind_save,ekind,&(ir->opts));
+ }
+
+ /* We make the decision to break or not -after- the calculation of Ekin and Pressure,
+ so scroll down for that logic */
+
+ /* ######### START SECOND UPDATE STEP ################# */
+ /* Box is changed in update() when we do pressure coupling,
+ * but we should still use the old box for energy corrections and when
+ * writing it to the energy file, so it matches the trajectory files for
+ * the same timestep above. Make a copy in a separate array.
+ */
+ copy_mat(state->box,lastbox);
+
+ bOK = TRUE;
+ if (!(bRerunMD && !rerun_fr.bV && !bForceUpdate))
+ {
+ wallcycle_start(wcycle,ewcUPDATE);
+ dvdl = 0;
+ /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
+ if (bTrotter)
+ {
+ if (bIterations && iterate.bIterate)
+ {
+ if (bFirstIterate)
+ {
+ scalevir = 1;
+ }
+ else
+ {
+ /* we use a new value of scalevir to converge the iterations faster */
+ scalevir = tracevir/trace(shake_vir);
+ }
+ msmul(shake_vir,scalevir,shake_vir);
+ m_add(force_vir,shake_vir,total_vir);
+ clear_mat(shake_vir);
+ }
+ trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ3);
+ /* We can only do Berendsen coupling after we have summed
+ * the kinetic energy or virial. Since the happens
+ * in global_state after update, we should only do it at
+ * step % nstlist = 1 with bGStatEveryStep=FALSE.
+ */
+ }
+ else
+ {
+ update_tcouple(fplog,step,ir,state,ekind,wcycle,upd,&MassQ,mdatoms);
+ update_pcouple(fplog,step,ir,state,pcoupl_mu,M,wcycle,
+ upd,bInitStep);
+ }
+
+ if (bVV)
+ {
+ /* velocity half-step update */
+ update_coords(fplog,step,ir,mdatoms,state,fr->bMolPBC,f,
+ fr->bTwinRange && bNStList,fr->f_twin,fcd,
+ ekind,M,wcycle,upd,FALSE,etrtVELOCITY2,
+ cr,nrnb,constr,&top->idef);
+ }
+
+ /* Above, initialize just copies ekinh into ekin,
+ * it doesn't copy position (for VV),
+ * and entire integrator for MD.
+ */
+
+ if (ir->eI==eiVVAK)
+ {
+ copy_rvecn(state->x,cbuf,0,state->natoms);
+ }
+
+ update_coords(fplog,step,ir,mdatoms,state,fr->bMolPBC,f,
+ fr->bTwinRange && bNStList,fr->f_twin,fcd,
+ ekind,M,wcycle,upd,bInitStep,etrtPOSITION,cr,nrnb,constr,&top->idef);
+ wallcycle_stop(wcycle,ewcUPDATE);
+
+ update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,state,
+ fr->bMolPBC,graph,f,
+ &top->idef,shake_vir,force_vir,
+ cr,nrnb,wcycle,upd,constr,
+ bInitStep,FALSE,bCalcVir,state->veta);
+
+ if (ir->eI==eiVVAK)
+ {
+ /* erase F_EKIN and F_TEMP here? */
+ /* just compute the kinetic energy at the half step to perform a trotter step */
+ compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
+ wcycle,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
+ constr,NULL,FALSE,lastbox,
+ top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
+ cglo_flags | CGLO_TEMPERATURE
+ );
+ wallcycle_start(wcycle,ewcUPDATE);
+ trotter_update(ir,step,ekind,enerd,state,total_vir,mdatoms,&MassQ,trotter_seq,ettTSEQ4);
+ /* now we know the scaling, we can compute the positions once more */
+ copy_rvecn(cbuf,state->x,0,state->natoms);
+
+ update_coords(fplog,step,ir,mdatoms,state,fr->bMolPBC,f,
+ fr->bTwinRange && bNStList,fr->f_twin,fcd,
+ ekind,M,wcycle,upd,bInitStep,etrtPOSITION,cr,nrnb,constr,&top->idef);
+ wallcycle_stop(wcycle,ewcUPDATE);
+
+ /* do we need an extra constraint here? just need to copy out of state->v to upd->xp? */
+ /* are the small terms in the shake_vir here due
+ * to numerical errors, or are they important
+ * physically? I'm thinking they are just errors, but not completely sure.
+ * For now, will call without actually constraining, constr=NULL*/
+ update_constraints(fplog,step,&dvdl,ir,ekind,mdatoms,
+ state,fr->bMolPBC,graph,f,
+ &top->idef,tmp_vir,force_vir,
+ cr,nrnb,wcycle,upd,NULL,
+ bInitStep,FALSE,bCalcVir,
+ state->veta);
+ }
+ if (!bOK && !bFFscan)
+ {
+ gmx_fatal(FARGS,"Constraint error: Shake, Lincs or Settle could not solve the constrains");
+ }
+
+ if (fr->bSepDVDL && fplog && do_log)
+ {
+ fprintf(fplog,sepdvdlformat,"Constraint dV/dl",0.0,dvdl);
+ }
+ enerd->term[F_DVDL_BONDED] += dvdl;
+ }
+ else if (graph)
+ {
+ /* Need to unshift here */
+ unshift_self(graph,state->box,state->x);
+ }
+
+ if (vsite != NULL)
+ {
+ wallcycle_start(wcycle,ewcVSITECONSTR);
+ if (graph != NULL)
+ {
+ shift_self(graph,state->box,state->x);
+ }
+ construct_vsites(fplog,vsite,state->x,nrnb,ir->delta_t,state->v,
+ top->idef.iparams,top->idef.il,
+ fr->ePBC,fr->bMolPBC,graph,cr,state->box);
+
+ if (graph != NULL)
+ {
+ unshift_self(graph,state->box,state->x);
+ }
+ wallcycle_stop(wcycle,ewcVSITECONSTR);
+ }
+
+ /* ############## IF NOT VV, Calculate globals HERE, also iterate constraints ############ */
+ /* With Leap-Frog we can skip compute_globals at
+ * non-communication steps, but we need to calculate
+ * the kinetic energy one step before communication.
+ */
+ if (bGStat || do_per_step(step+1,nstglobalcomm) ||
+ EI_VV(ir->eI))
+ {
+ if (ir->nstlist == -1 && bFirstIterate)
+ {
+ gs.sig[eglsNABNSB] = nlh.nabnsb;
+ }
+ compute_globals(fplog,gstat,cr,ir,fr,ekind,state,state_global,mdatoms,nrnb,vcm,
+ wcycle,enerd,force_vir,shake_vir,total_vir,pres,mu_tot,
+ constr,
+ bFirstIterate ? &gs : NULL,
+ (step_rel % gs.nstms == 0) &&
+ (multisim_nsteps<0 || (step_rel<multisim_nsteps)),
+ lastbox,
+ top_global,&pcurr,top_global->natoms,&bSumEkinhOld,
+ cglo_flags
+ | (!EI_VV(ir->eI) ? CGLO_ENERGY : 0)
+ | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
+ | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
+ | (!EI_VV(ir->eI) || bRerunMD ? CGLO_PRESSURE : 0)
+ | (bIterations && iterate.bIterate ? CGLO_ITERATE : 0)
+ | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
+ | CGLO_CONSTRAINT
+ );
+ if (ir->nstlist == -1 && bFirstIterate)
+ {
+ nlh.nabnsb = gs.set[eglsNABNSB];
+ gs.set[eglsNABNSB] = 0;
+ }
+ }
+ /* bIterate is set to keep it from eliminating the old ekin kinetic energy terms */
+ /* ############# END CALC EKIN AND PRESSURE ################# */
+
+ /* Note: this is OK, but there are some numerical precision issues with using the convergence of
+ the virial that should probably be addressed eventually. state->veta has better properties,
+ but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
+ generate the new shake_vir, but test the veta value for convergence. This will take some thought. */
+
+ if (bIterations &&
+ done_iterating(cr,fplog,step,&iterate,bFirstIterate,
+ trace(shake_vir),&tracevir))
+ {
+ break;
+ }
+ bFirstIterate = FALSE;
+ }
+
+ /* only add constraint dvdl after constraints */
+ enerd->term[F_DVDL_BONDED] += dvdl;
+ if (!bVV)
+ {
+ /* sum up the foreign energy and dhdl terms for md and sd. currently done every step so that dhdl is correct in the .edr */
+ sum_dhdl(enerd,state->lambda,ir->fepvals);
+ }
+ update_box(fplog,step,ir,mdatoms,state,graph,f,
+ ir->nstlist==-1 ? &nlh.scale_tot : NULL,pcoupl_mu,nrnb,wcycle,upd,bInitStep,FALSE);
+
+ /* ################# END UPDATE STEP 2 ################# */
+ /* #### We now have r(t+dt) and v(t+dt/2) ############# */
+
+ /* The coordinates (x) were unshifted in update */
+ if (bFFscan && (shellfc==NULL || bConverged))
+ {
+ if (print_forcefield(fplog,enerd->term,mdatoms->homenr,
+ f,NULL,xcopy,
+ &(top_global->mols),mdatoms->massT,pres))
+ {
+ gmx_finalize_par();
+
+ fprintf(stderr,"\n");
+ exit(0);
+ }
+ }
+ if (!bGStat)
+ {
+ /* We will not sum ekinh_old,
+ * so signal that we still have to do it.
+ */
+ bSumEkinhOld = TRUE;
+ }
+
+ if (bTCR)
+ {
+ /* Only do GCT when the relaxation of shells (minimization) has converged,
+ * otherwise we might be coupling to bogus energies.
+ * In parallel we must always do this, because the other sims might
+ * update the FF.
+ */
+
+ /* Since this is called with the new coordinates state->x, I assume
+ * we want the new box state->box too. / EL 20040121
+ */
+ do_coupling(fplog,oenv,nfile,fnm,tcr,t,step,enerd->term,fr,
+ ir,MASTER(cr),
+ mdatoms,&(top->idef),mu_aver,
+ top_global->mols.nr,cr,
+ state->box,total_vir,pres,
+ mu_tot,state->x,f,bConverged);
+ debug_gmx();
+ }
+
+ /* ######### BEGIN PREPARING EDR OUTPUT ########### */
+
+ /* use the directly determined last velocity, not actually the averaged half steps */
+ if (bTrotter && ir->eI==eiVV)
+ {
+ enerd->term[F_EKIN] = last_ekin;
+ }
+ enerd->term[F_ETOT] = enerd->term[F_EPOT] + enerd->term[F_EKIN];
+
+ if (bVV)
+ {
+ enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + saved_conserved_quantity;
+ }
+ else
+ {
+ enerd->term[F_ECONSERVED] = enerd->term[F_ETOT] + compute_conserved_from_auxiliary(ir,state,&MassQ);
+ }
+ /* Check for excessively large energies */
+ if (bIonize)
+ {
+#ifdef GMX_DOUBLE
+ real etot_max = 1e200;
+#else
+ real etot_max = 1e30;
+#endif
+ if (fabs(enerd->term[F_ETOT]) > etot_max)
+ {
+ fprintf(stderr,"Energy too large (%g), giving up\n",
+ enerd->term[F_ETOT]);
+ }
+ }
+ /* ######### END PREPARING EDR OUTPUT ########### */
+
+ /* Time for performance */
+ if (((step % stepout) == 0) || bLastStep)
+ {
+ runtime_upd_proc(runtime);
+ }
+
+ /* Output stuff */
+ if (MASTER(cr))
+ {
+ gmx_bool do_dr,do_or;
+
+ if (fplog && do_log && bDoExpanded)
+ {
+ /* only needed if doing expanded ensemble */
+ PrintFreeEnergyInfoToFile(fplog,ir->fepvals,ir->expandedvals,ir->bSimTemp?ir->simtempvals:NULL,
+ &df_history,state->fep_state,ir->nstlog,step);
+ }
+ if (!(bStartingFromCpt && (EI_VV(ir->eI))))
+ {
+ if (bCalcEner)
+ {
+ upd_mdebin(mdebin,bDoDHDL, TRUE,
+ t,mdatoms->tmass,enerd,state,
+ ir->fepvals,ir->expandedvals,lastbox,
+ shake_vir,force_vir,total_vir,pres,
+ ekind,mu_tot,constr);
+ }
+ else
+ {
+ upd_mdebin_step(mdebin);
+ }
+
+ do_dr = do_per_step(step,ir->nstdisreout);
+ do_or = do_per_step(step,ir->nstorireout);
+
+ print_ebin(outf->fp_ene,do_ene,do_dr,do_or,do_log?fplog:NULL,
+ step,t,
+ eprNORMAL,bCompact,mdebin,fcd,groups,&(ir->opts));
+ }
+ if (ir->ePull != epullNO)
+ {
+ pull_print_output(ir->pull,step,t);
+ }
+
+ if (do_per_step(step,ir->nstlog))
+ {
+ if(fflush(fplog) != 0)
+ {
+ gmx_fatal(FARGS,"Cannot flush logfile - maybe you are out of disk space?");
+ }
+ }
+ }
+ if (bDoExpanded)
+ {
+ /* Have to do this part after outputting the logfile and the edr file */
+ state->fep_state = lamnew;
+ for (i=0;i<efptNR;i++)
+ {
+ state->lambda[i] = ir->fepvals->all_lambda[i][lamnew];
+ }
+ }
+ /* Remaining runtime */
+ if (MULTIMASTER(cr) && (do_verbose || gmx_got_usr_signal()) && !bPMETuneRunning)
+ {
+ if (shellfc)
+ {
+ fprintf(stderr,"\n");
+ }
+ print_time(stderr,runtime,step,ir,cr);
+ }
+
+ /* Replica exchange */
+ bExchanged = FALSE;
+ if ((repl_ex_nst > 0) && (step > 0) && !bLastStep &&
+ do_per_step(step,repl_ex_nst))
+ {
+ bExchanged = replica_exchange(fplog,cr,repl_ex,
+ state_global,enerd,
+ state,step,t);
+
+ if (bExchanged && DOMAINDECOMP(cr))
+ {
+ dd_partition_system(fplog,step,cr,TRUE,1,
+ state_global,top_global,ir,
+ state,&f,mdatoms,top,fr,
+ vsite,shellfc,constr,
+ nrnb,wcycle,FALSE);
+ }
+ }
+
+ bFirstStep = FALSE;
+ bInitStep = FALSE;
+ bStartingFromCpt = FALSE;
+
+ /* ####### SET VARIABLES FOR NEXT ITERATION IF THEY STILL NEED IT ###### */
+ /* With all integrators, except VV, we need to retain the pressure
+ * at the current step for coupling at the next step.
+ */
+ if ((state->flags & (1<<estPRES_PREV)) &&
+ (bGStatEveryStep ||
+ (ir->nstpcouple > 0 && step % ir->nstpcouple == 0)))
+ {
+ /* Store the pressure in t_state for pressure coupling
+ * at the next MD step.
+ */
+ copy_mat(pres,state->pres_prev);
+ }
+
+ /* ####### END SET VARIABLES FOR NEXT ITERATION ###### */
+
+ if ( (membed!=NULL) && (!bLastStep) )
+ {
+ rescale_membed(step_rel,membed,state_global->x);
+ }
+
+ if (bRerunMD)
+ {
+ if (MASTER(cr))
+ {
+ /* read next frame from input trajectory */
+ bNotLastFrame = read_next_frame(oenv,status,&rerun_fr);
+ }
+
+ if (PAR(cr))
+ {
+ rerun_parallel_comm(cr,&rerun_fr,&bNotLastFrame);
+ }
+ }
+
+ if (!bRerunMD || !rerun_fr.bStep)
+ {
+ /* increase the MD step number */
+ step++;
+ step_rel++;
+ }
+
+ cycles = wallcycle_stop(wcycle,ewcSTEP);
+ if (DOMAINDECOMP(cr) && wcycle)
+ {
+ dd_cycles_add(cr->dd,cycles,ddCyclStep);
+ }
+
+ if (bPMETuneRunning || bPMETuneTry)
+ {
+ /* PME grid + cut-off optimization with GPUs or PME nodes */
+
+ /* Count the total cycles over the last steps */
+ cycles_pmes += cycles;
+
+ /* We can only switch cut-off at NS steps */
+ if (step % ir->nstlist == 0)
+ {
+ /* PME grid + cut-off optimization with GPUs or PME nodes */
+ if (bPMETuneTry)
+ {
+ if (DDMASTER(cr->dd))
+ {
+ /* PME node load is too high, start tuning */
+ bPMETuneRunning = (dd_pme_f_ratio(cr->dd) >= 1.05);
+ }
+ dd_bcast(cr->dd,sizeof(gmx_bool),&bPMETuneRunning);
+
+ if (bPMETuneRunning || step_rel > ir->nstlist*50)
+ {
+ bPMETuneTry = FALSE;
+ }
+ }
+ if (bPMETuneRunning)
+ {
+ /* init_step might not be a multiple of nstlist,
+ * but the first cycle is always skipped anyhow.
+ */
+ bPMETuneRunning =
-
++ pme_load_balance(pme_loadbal,cr,
++ (bVerbose && MASTER(cr)) ? stderr : NULL,
++ fplog,
++ ir,state,cycles_pmes,
++ fr->ic,fr->nbv,&fr->pmedata,
++ step);
+
+ fr->ewaldcoeff = fr->ic->ewaldcoeff;
+ }
+
+ cycles_pmes = 0;
+ }
+ }
+
+ if (step_rel == wcycle_get_reset_counters(wcycle) ||
+ gs.set[eglsRESETCOUNTERS] != 0)
+ {
+ /* Reset all the counters related to performance over the run */
+ reset_all_counters(fplog,cr,step,&step_rel,ir,wcycle,nrnb,runtime,
+ fr->nbv != NULL && fr->nbv->bUseGPU ? fr->nbv->cu_nbv : NULL);
+ wcycle_set_reset_counters(wcycle,-1);
+ /* Correct max_hours for the elapsed time */
+ max_hours -= run_time/(60.0*60.0);
+ bResetCountersHalfMaxH = FALSE;
+ gs.set[eglsRESETCOUNTERS] = 0;
+ }
+
+ }
+ /* End of main MD loop */
+ debug_gmx();
+
+ /* Stop the time */
+ runtime_end(runtime);
+
+ if (bRerunMD && MASTER(cr))
+ {
+ close_trj(status);
+ }
+
+ if (!(cr->duty & DUTY_PME))
+ {
+ /* Tell the PME only node to finish */
+ gmx_pme_send_finish(cr);
+ }
+
+ if (MASTER(cr))
+ {
+ if (ir->nstcalcenergy > 0 && !bRerunMD)
+ {
+ print_ebin(outf->fp_ene,FALSE,FALSE,FALSE,fplog,step,t,
+ eprAVER,FALSE,mdebin,fcd,groups,&(ir->opts));
+ }
+ }
+
+ done_mdoutf(outf);
+
+ debug_gmx();
+
+ if (ir->nstlist == -1 && nlh.nns > 0 && fplog)
+ {
+ fprintf(fplog,"Average neighborlist lifetime: %.1f steps, std.dev.: %.1f steps\n",nlh.s1/nlh.nns,sqrt(nlh.s2/nlh.nns - sqr(nlh.s1/nlh.nns)));
+ fprintf(fplog,"Average number of atoms that crossed the half buffer length: %.1f\n\n",nlh.ab/nlh.nns);
+ }
++
++ if (pme_loadbal != NULL)
++ {
++ pme_loadbal_done(pme_loadbal,fplog);
++ }
++
+ if (shellfc && fplog)
+ {
+ fprintf(fplog,"Fraction of iterations that converged: %.2f %%\n",
+ (nconverged*100.0)/step_rel);
+ fprintf(fplog,"Average number of force evaluations per MD step: %.2f\n\n",
+ tcount/step_rel);
+ }
+
+ if (repl_ex_nst > 0 && MASTER(cr))
+ {
+ print_replica_exchange_statistics(fplog,repl_ex);
+ }
+
+ runtime->nsteps_done = step_rel;
+
+ return 0;
+}
--- /dev/null
--- /dev/null
++/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
++ *
++ *
++ * This source code is part of
++ *
++ * G R O M A C S
++ *
++ * GROningen MAchine for Chemical Simulations
++ *
++ * VERSION 4.6.0
++ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
++ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
++ * Copyright (c) 2001-2011, The GROMACS development team,
++ * check out http://www.gromacs.org for more information.
++
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version 2
++ * of the License, or (at your option) any later version.
++ *
++ * If you want to redistribute modifications, please consider that
++ * scientific software is very special. Version control is crucial -
++ * bugs must be traceable. We will be happy to consider code for
++ * inclusion in the official distribution, but derived work must not
++ * be called official GROMACS. Details are found in the README & COPYING
++ * files - if they are missing, get the official version at www.gromacs.org.
++ *
++ * To help us fund GROMACS development, we humbly ask that you cite
++ * the papers on the package - you can find them in the top README file.
++ *
++ * For more info, check our website at http://www.gromacs.org
++ *
++ * And Hey:
++ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
++ */
++#ifdef HAVE_CONFIG_H
++#include <config.h>
++#endif
++
++#include "smalloc.h"
++#include "network.h"
++#include "calcgrid.h"
++#include "pme.h"
++#include "vec.h"
++#include "domdec.h"
++#include "nbnxn_cuda_data_mgmt.h"
++#include "force.h"
++#include "macros.h"
++#include "pme_loadbal.h"
++
++/* Parameters and setting for one PP-PME setup */
++typedef struct {
++ real rcut; /* Coulomb cut-off */
++ real rlist; /* pair-list cut-off */
++ real spacing; /* (largest) PME grid spacing */
++ ivec grid; /* the PME grid dimensions */
++ real grid_efficiency; /* ineffiency factor for non-uniform grids <= 1 */
++ real ewaldcoeff; /* the Ewald coefficient */
++ gmx_pme_t pmedata; /* the data structure used in the PME code */
++
++ int count; /* number of times this setup has been timed */
++ double cycles; /* the fastest time for this setup in cycles */
++} pme_setup_t;
++
++/* In the initial scan, step by grids that are at least a factor 0.8 coarser */
++#define PME_LB_GRID_SCALE_FAC 0.8
++/* In the initial scan, try to skip grids with uneven x/y/z spacing,
++ * checking if the "efficiency" is more than 5% worse than the previous grid.
++ */
++#define PME_LB_GRID_EFFICIENCY_REL_FAC 1.05
++/* Rerun up till 12% slower setups than the fastest up till now */
++#define PME_LB_SLOW_FAC 1.12
++/* If setups get more than 2% faster, do another round to avoid
++ * choosing a slower setup due to acceleration or fluctuations.
++ */
++#define PME_LB_ACCEL_TOL 1.02
++
++/* What, if anything, limited the search for larger cut-offs;
++ * used as index into pmelblim_str below. */
++enum { epmelblimNO, epmelblimBOX, epmelblimDD, epmelblimNR };
++
++/* Human-readable names matching the epmelblim* enum, in order.
++ * NOTE(review): "decompostion" typo in this user-visible string —
++ * left unchanged here since it is runtime output; fix upstream. */
++const char *pmelblim_str[epmelblimNR] =
++{ "no", "box size", "domain decompostion" };
++
++/* State of the PME grid / cut-off load balancing over the run:
++ * the list of tried setups plus the bookkeeping of the search. */
++struct pme_load_balancing {
++ int nstage; /* the current maximum number of stages */
++
++ real cut_spacing; /* the minimum cutoff / PME grid spacing ratio */
++ real rbuf; /* the pair-list buffer size (rlist - rcoulomb) */
++ matrix box_start; /* the initial simulation box */
++ int n; /* the number of setups as well as the allocation size */
++ pme_setup_t *setup; /* the PME+cutoff setups */
++ int cur; /* index of the setup currently in use */
++ int fastest; /* index of the fastest setup up till now */
++ int start; /* start of setup range to consider in stage>0 */
++ int end; /* end of setup range to consider in stage>0 */
++ int elimited; /* was the balancing limited, uses enum above */
++
++ int stage; /* the current stage */
++};
++
++/* Allocate and initialize the PME load balancing state.
++ * The currently running cut-off/grid combination (taken from the
++ * interaction constants and the input record) is stored as setup 0.
++ * On return, *pme_lb_p owns the newly allocated structure.
++ */
++void pme_loadbal_init(pme_load_balancing_t *pme_lb_p,
++                      const t_inputrec *ir,matrix box,
++                      const interaction_const_t *ic,
++                      gmx_pme_t pmedata)
++{
++    pme_load_balancing_t lb;
++    real spc_max,spc;
++    int m;
++
++    snew(lb,1);
++
++    /* Any number of stages >= 2 is supported */
++    lb->nstage = 2;
++
++    /* The pair-list buffer is the gap between list and Coulomb cut-off */
++    lb->rbuf = ic->rlist - ic->rcoulomb;
++
++    copy_mat(box,lb->box_start);
++    if (ir->ePBC==epbcXY && ir->nwall==2)
++    {
++        /* With 2D PBC and two walls, the Ewald sum uses a z-scaled box */
++        svmul(ir->wall_ewald_zfac,lb->box_start[ZZ],lb->box_start[ZZ]);
++    }
++
++    /* Start the setup list with a single entry: the current settings */
++    lb->n = 1;
++    snew(lb->setup,lb->n);
++    lb->cur = 0;
++
++    lb->setup[0].rcut       = ic->rcoulomb;
++    lb->setup[0].rlist      = ic->rlist;
++    lb->setup[0].grid[XX]   = ir->nkx;
++    lb->setup[0].grid[YY]   = ir->nky;
++    lb->setup[0].grid[ZZ]   = ir->nkz;
++    lb->setup[0].ewaldcoeff = ic->ewaldcoeff;
++    lb->setup[0].pmedata    = pmedata;
++
++    /* The grid spacing of a setup is its largest per-dimension spacing */
++    spc_max = 0;
++    for(m=0; m<DIM; m++)
++    {
++        spc = norm(lb->box_start[m])/lb->setup[0].grid[m];
++        if (spc > spc_max)
++        {
++            spc_max = spc;
++        }
++    }
++    lb->setup[0].spacing = spc_max;
++
++    /* Ratio that couples the Coulomb cut-off to the grid spacing;
++     * prefer the user-requested Fourier spacing when one was given.
++     */
++    lb->cut_spacing = (ir->fourier_spacing > 0 ?
++                       ir->rcoulomb/ir->fourier_spacing :
++                       ir->rcoulomb/lb->setup[0].spacing);
++
++    lb->stage    = 0;
++    lb->fastest  = 0;
++    lb->start    = 0;
++    lb->end      = 0;
++    lb->elimited = epmelblimNO;
++
++    *pme_lb_p = lb;
++}
++
++/* Append a new setup with the next coarser PME grid (and correspondingly
++ * larger cut-off) to pme_lb->setup. Returns FALSE, and undoes the list
++ * growth, when no usable coarser grid exists (any dimension would drop
++ * to <= 2*pme_order, which parallel PME cannot handle).
++ */
++static gmx_bool pme_loadbal_increase_cutoff(pme_load_balancing_t pme_lb,
++ int pme_order)
++{
++ pme_setup_t *set;
++ real fac,sp;
++ int d;
++
++ /* Try to add a new setup with next larger cut-off to the list */
++ pme_lb->n++;
++ srenew(pme_lb->setup,pme_lb->n);
++ set = &pme_lb->setup[pme_lb->n-1];
++ set->pmedata = NULL;
++
++ /* Increase the target spacing in 1% steps until calc_grid actually
++ * returns a spacing coarser than the current setup's.
++ */
++ fac = 1;
++ do
++ {
++ fac *= 1.01;
++ clear_ivec(set->grid);
++ sp = calc_grid(NULL,pme_lb->box_start,
++ fac*pme_lb->setup[pme_lb->cur].spacing,
++ &set->grid[XX],
++ &set->grid[YY],
++ &set->grid[ZZ]);
++
++ /* In parallel we can't have grids smaller than 2*pme_order,
++ * and we would anyhow not gain much speed at these grid sizes.
++ */
++ for(d=0; d<DIM; d++)
++ {
++ if (set->grid[d] <= 2*pme_order)
++ {
++ pme_lb->n--;
++
++ return FALSE;
++ }
++ }
++ }
++ while (sp <= 1.001*pme_lb->setup[pme_lb->cur].spacing);
++
++ set->rcut = pme_lb->cut_spacing*sp;
++ set->rlist = set->rcut + pme_lb->rbuf;
++ set->spacing = sp;
++ /* The grid efficiency is the size wrt a grid with uniform x/y/z spacing */
++ set->grid_efficiency = 1;
++ for(d=0; d<DIM; d++)
++ {
++ set->grid_efficiency *= (set->grid[d]*sp)/norm(pme_lb->box_start[d]);
++ }
++ /* The Ewald coefficient is inversly proportional to the cut-off */
++ set->ewaldcoeff =
++ pme_lb->setup[0].ewaldcoeff*pme_lb->setup[0].rcut/set->rcut;
++
++ set->count = 0;
++ set->cycles = 0;
++
++ if (debug)
++ {
++ fprintf(debug,"PME loadbal: grid %d %d %d, cutoff %f\n",
++ set->grid[XX],set->grid[YY],set->grid[ZZ],set->rcut);
++ }
++
++ return TRUE;
++}
++
++/* Print a one-line description of a setup (grid, cut-off and, when
++ * cycles >= 0, its timing) to stderr (fp_err) and/or the log (fp_log).
++ * Either file pointer may be NULL to suppress that destination.
++ */
++static void print_grid(FILE *fp_err,FILE *fp_log,
++ const char *pre,
++ const char *desc,
++ const pme_setup_t *set,
++ double cycles)
++{
++ char buf[STRLEN],buft[STRLEN];
++
++ if (cycles >= 0)
++ {
++ sprintf(buft,": %.1f M-cycles",cycles*1e-6);
++ }
++ else
++ {
++ buft[0] = '\0';
++ }
++ sprintf(buf,"%-11s%10s pme grid %d %d %d, cutoff %.3f%s",
++ pre,
++ desc,set->grid[XX],set->grid[YY],set->grid[ZZ],set->rcut,
++ buft);
++ if (fp_err != NULL)
++ {
++ /* \r overwrites the dynamic load-reporting line on the terminal */
++ fprintf(fp_err,"\r%s\n",buf);
++ }
++ if (fp_log != NULL)
++ {
++ fprintf(fp_log,"%s\n",buf);
++ }
++}
++
++/* Return the exclusive end index of the setup range under consideration:
++ * pme_lb->end once stage 1 has set it, otherwise the full list length n.
++ */
++static int pme_loadbal_end(pme_load_balancing_t pme_lb)
++{
++ /* In the initial stage only n is set; end is not set yet */
++ if (pme_lb->end > 0)
++ {
++ return pme_lb->end;
++ }
++ else
++ {
++ return pme_lb->n;
++ }
++}
++
++/* Report that the cut-off scan was limited (by the box size or by the
++ * domain decomposition), together with the largest cut-off reached.
++ * Either file pointer may be NULL to suppress that destination.
++ */
++static void print_loadbal_limited(FILE *fp_err,FILE *fp_log,
++ gmx_large_int_t step,
++ pme_load_balancing_t pme_lb)
++{
++ char buf[STRLEN],sbuf[22];
++
++ /* pme_loadbal_end() returns an exclusive end index; the last setup
++ * actually in use is end-1. Indexing with end itself read one element
++ * past the considered range (and printed a rejected cut-off).
++ */
++ sprintf(buf,"step %4s: the %s limited the PME load balancing to a cut-off of %.3f",
++ gmx_step_str(step,sbuf),
++ pmelblim_str[pme_lb->elimited],
++ pme_lb->setup[pme_loadbal_end(pme_lb)-1].rcut);
++ if (fp_err != NULL)
++ {
++ fprintf(fp_err,"\r%s\n",buf);
++ }
++ if (fp_log != NULL)
++ {
++ fprintf(fp_log,"%s\n",buf);
++ }
++}
++
++/* Move from the grid scan (stage 0) to stage 1: restrict [start,end) to
++ * the setups worth re-timing (not much slower than the fastest), and
++ * position cur so the caller's cur++ lands on start.
++ */
++static void switch_to_stage1(pme_load_balancing_t pme_lb)
++{
++ /* Skip leading setups that were never timed or are too slow */
++ pme_lb->start = 0;
++ while (pme_lb->start+1 < pme_lb->n &&
++ (pme_lb->setup[pme_lb->start].count == 0 ||
++ pme_lb->setup[pme_lb->start].cycles >
++ pme_lb->setup[pme_lb->fastest].cycles*PME_LB_SLOW_FAC))
++ {
++ pme_lb->start++;
++ }
++ /* Include untimed setups just before start so they get timed too */
++ while (pme_lb->start > 0 && pme_lb->setup[pme_lb->start-1].cycles == 0)
++ {
++ pme_lb->start--;
++ }
++
++ pme_lb->end = pme_lb->n;
++ if (pme_lb->setup[pme_lb->end-1].count > 0 &&
++ pme_lb->setup[pme_lb->end-1].cycles >
++ pme_lb->setup[pme_lb->fastest].cycles*PME_LB_SLOW_FAC)
++ {
++ pme_lb->end--;
++ }
++
++ pme_lb->stage = 1;
++
++ /* Next we want to choose setup pme_lb->start, but as we will increase
++ * pme_ln->cur by one right after returning, we subtract 1 here.
++ */
++ pme_lb->cur = pme_lb->start - 1;
++}
++
++/* One step of the PP-PME load balancing: record the cycle count of the
++ * current setup, pick the next setup to try (growing the list in stage 0,
++ * rescanning the surviving range in later stages) and install its cut-off,
++ * pair list, interaction tables, DD cut-off and PME grid.
++ * Returns FALSE once all stages are done and the fastest setup is active.
++ * NOTE(review): intricate stage machine; statement order is significant.
++ */
++gmx_bool pme_load_balance(pme_load_balancing_t pme_lb,
++ t_commrec *cr,
++ FILE *fp_err,
++ FILE *fp_log,
++ t_inputrec *ir,
++ t_state *state,
++ double cycles,
++ interaction_const_t *ic,
++ nonbonded_verlet_t *nbv,
++ gmx_pme_t *pmedata,
++ gmx_large_int_t step)
++{
++ gmx_bool OK;
++ pme_setup_t *set;
++ double cycles_fast;
++ char buf[STRLEN],sbuf[22];
++
++ if (pme_lb->stage == pme_lb->nstage)
++ {
++ return FALSE;
++ }
++
++ if (PAR(cr))
++ {
++ /* Use the cycle count averaged over all ranks */
++ gmx_sumd(1,&cycles,cr);
++ cycles /= cr->nnodes;
++ }
++
++ set = &pme_lb->setup[pme_lb->cur];
++
++ set->count++;
++ if (set->count % 2 == 1)
++ {
++ /* Skip the first cycle, because the first step after a switch
++ * is much slower due to allocation and/or caching effects.
++ */
++ return TRUE;
++ }
++
++ sprintf(buf, "step %4s: ", gmx_step_str(step,sbuf));
++ print_grid(fp_err,fp_log,buf,"timed with",set,cycles);
++
++ if (set->count <= 2)
++ {
++ set->cycles = cycles;
++ }
++ else
++ {
++ if (cycles*PME_LB_ACCEL_TOL < set->cycles &&
++ pme_lb->stage == pme_lb->nstage - 1)
++ {
++ /* The performance went up a lot (due to e.g. DD load balancing).
++ * Add a stage, keep the minima, but rescan all setups.
++ */
++ pme_lb->nstage++;
++
++ if (debug)
++ {
++ fprintf(debug,"The performance for grid %d %d %d went from %.3f to %.1f M-cycles, this is more than %f\n"
++ "Increased the number stages to %d"
++ " and ignoring the previous performance\n",
++ set->grid[XX],set->grid[YY],set->grid[ZZ],
++ cycles*1e-6,set->cycles*1e-6,PME_LB_ACCEL_TOL,
++ pme_lb->nstage);
++ }
++ }
++ /* Keep the fastest timing seen for this setup */
++ set->cycles = min(set->cycles,cycles);
++ }
++
++ if (set->cycles < pme_lb->setup[pme_lb->fastest].cycles)
++ {
++ pme_lb->fastest = pme_lb->cur;
++ }
++ cycles_fast = pme_lb->setup[pme_lb->fastest].cycles;
++
++ /* Check in stage 0 if we should stop scanning grids.
++ * Stop when the time is more than SLOW_FAC longer than the fastest.
++ */
++ if (pme_lb->stage == 0 && pme_lb->cur > 0 &&
++ cycles > pme_lb->setup[pme_lb->fastest].cycles*PME_LB_SLOW_FAC)
++ {
++ pme_lb->n = pme_lb->cur + 1;
++ /* Done with scanning, go to stage 1 */
++ switch_to_stage1(pme_lb);
++ }
++
++ if (pme_lb->stage == 0)
++ {
++ int gridsize_start;
++
++ gridsize_start = set->grid[XX]*set->grid[YY]*set->grid[ZZ];
++
++ /* Advance to the next setup that is enough coarser than the
++ * current one (see PME_LB_GRID_SCALE_FAC/EFFICIENCY_REL_FAC).
++ */
++ do
++ {
++ if (pme_lb->cur+1 < pme_lb->n)
++ {
++ /* We had already generated the next setup */
++ OK = TRUE;
++ }
++ else
++ {
++ /* Find the next setup */
++ OK = pme_loadbal_increase_cutoff(pme_lb,ir->pme_order);
++ }
++
++ if (OK && ir->ePBC != epbcNONE)
++ {
++ OK = (sqr(pme_lb->setup[pme_lb->cur+1].rlist)
++ <= max_cutoff2(ir->ePBC,state->box));
++ if (!OK)
++ {
++ pme_lb->elimited = epmelblimBOX;
++ }
++ }
++
++ if (OK)
++ {
++ pme_lb->cur++;
++
++ if (DOMAINDECOMP(cr))
++ {
++ OK = change_dd_cutoff(cr,state,ir,
++ pme_lb->setup[pme_lb->cur].rlist);
++ if (!OK)
++ {
++ /* Failed: do not use this setup */
++ pme_lb->cur--;
++ pme_lb->elimited = epmelblimDD;
++ }
++ }
++ }
++ if (!OK)
++ {
++ /* We hit the upper limit for the cut-off,
++ * the setup should not go further than cur.
++ */
++ pme_lb->n = pme_lb->cur + 1;
++ print_loadbal_limited(fp_err,fp_log,step,pme_lb);
++ /* Switch to the next stage */
++ switch_to_stage1(pme_lb);
++ }
++ }
++ while (OK &&
++ !(pme_lb->setup[pme_lb->cur].grid[XX]*
++ pme_lb->setup[pme_lb->cur].grid[YY]*
++ pme_lb->setup[pme_lb->cur].grid[ZZ] <
++ gridsize_start*PME_LB_GRID_SCALE_FAC
++ &&
++ pme_lb->setup[pme_lb->cur].grid_efficiency <
++ pme_lb->setup[pme_lb->cur-1].grid_efficiency*PME_LB_GRID_EFFICIENCY_REL_FAC));
++ }
++
++ if (pme_lb->stage > 0 && pme_lb->end == 1)
++ {
++ /* Only one setup survived: use it and stop balancing */
++ pme_lb->cur = 0;
++ pme_lb->stage = pme_lb->nstage;
++ }
++ else if (pme_lb->stage > 0 && pme_lb->end > 1)
++ {
++ /* If stage = nstage-1:
++ * scan over all setups, rerunning only those setups
++ * which are not much slower than the fastest
++ * else:
++ * use the next setup
++ */
++ do
++ {
++ pme_lb->cur++;
++ if (pme_lb->cur == pme_lb->end)
++ {
++ pme_lb->stage++;
++ pme_lb->cur = pme_lb->start;
++ }
++ }
++ while (pme_lb->stage == pme_lb->nstage - 1 &&
++ pme_lb->setup[pme_lb->cur].count > 0 &&
++ pme_lb->setup[pme_lb->cur].cycles > cycles_fast*PME_LB_SLOW_FAC);
++
++ if (pme_lb->stage == pme_lb->nstage)
++ {
++ /* We are done optimizing, use the fastest setup we found */
++ pme_lb->cur = pme_lb->fastest;
++ }
++ }
++
++ if (DOMAINDECOMP(cr) && pme_lb->stage > 0)
++ {
++ OK = change_dd_cutoff(cr,state,ir,pme_lb->setup[pme_lb->cur].rlist);
++ if (!OK)
++ {
++ /* Failsafe solution */
++ if (pme_lb->cur > 1 && pme_lb->stage == pme_lb->nstage)
++ {
++ pme_lb->stage--;
++ }
++ pme_lb->fastest = 0;
++ pme_lb->start = 0;
++ pme_lb->end = pme_lb->cur;
++ pme_lb->cur = pme_lb->start;
++ pme_lb->elimited = epmelblimDD;
++ print_loadbal_limited(fp_err,fp_log,step,pme_lb);
++ }
++ }
++
++ /* Change the Coulomb cut-off and the PME grid */
++
++ set = &pme_lb->setup[pme_lb->cur];
++
++ ic->rcoulomb = set->rcut;
++ ic->rlist = set->rlist;
++ ic->ewaldcoeff = set->ewaldcoeff;
++
++ if (nbv->grp[0].kernel_type == nbk8x8x8_CUDA)
++ {
++ nbnxn_cuda_pme_loadbal_update_param(nbv->cu_nbv,ic);
++ }
++ else
++ {
++ init_interaction_const_tables(NULL,ic,nbv->grp[0].kernel_type);
++ }
++
++ if (nbv->ngrp > 1)
++ {
++ init_interaction_const_tables(NULL,ic,nbv->grp[1].kernel_type);
++ }
++
++ if (cr->duty & DUTY_PME)
++ {
++ if (pme_lb->setup[pme_lb->cur].pmedata == NULL)
++ {
++ /* Generate a new PME data structure,
++ * copying part of the old pointers.
++ */
++ gmx_pme_reinit(&set->pmedata,
++ cr,pme_lb->setup[0].pmedata,ir,
++ set->grid);
++ }
++ *pmedata = set->pmedata;
++ }
++ else
++ {
++ /* Tell our PME-only node to switch grid */
++ gmx_pme_send_switch(cr, set->grid, set->ewaldcoeff);
++ }
++
++ if (debug)
++ {
++ print_grid(NULL,debug,"","switched to",set,-1);
++ }
++
++ if (pme_lb->stage == pme_lb->nstage)
++ {
++ print_grid(fp_err,fp_log,"","optimal",set,-1);
++ }
++
++ return TRUE;
++}
++
++/* Add n extra balancing stages, effectively restarting the optimization
++ * (used e.g. after a change that invalidates earlier timings).
++ */
++void restart_pme_loadbal(pme_load_balancing_t pme_lb, int n)
++{
++ pme_lb->nstage += n;
++}
++
++/* Return the total number of PME grid points of a setup */
++static int pme_grid_points(const pme_setup_t *setup)
++{
++ return setup->grid[XX]*setup->grid[YY]*setup->grid[ZZ];
++}
++
++/* Print one row (label, rcut, rlist, grid, spacing, 1/beta) of the
++ * final load-balancing summary table.
++ * name is only read, so it is const-qualified; callers pass literals.
++ */
++static void print_pme_loadbal_setting(FILE *fplog,
++ const char *name,
++ const pme_setup_t *setup)
++{
++ fprintf(fplog,
++ " %-7s %6.3f nm %6.3f nm %3d %3d %3d %5.3f nm %5.3f nm\n",
++ name,
++ setup->rcut,setup->rlist,
++ setup->grid[XX],setup->grid[YY],setup->grid[ZZ],
++ setup->spacing,1/setup->ewaldcoeff);
++}
++
++/* Write the final PP-PME load balancing summary to the log: a possible
++ * limitation note, the initial vs final settings and their cost ratios.
++ */
++static void print_pme_loadbal_settings(pme_load_balancing_t pme_lb,
++ FILE *fplog)
++{
++ double pp_ratio,grid_ratio;
++
++ /* PP pair-search cost scales with the pair-list volume ~ rlist^3 */
++ pp_ratio = pow(pme_lb->setup[pme_lb->cur].rlist/pme_lb->setup[0].rlist,3.0);
++ grid_ratio = pme_grid_points(&pme_lb->setup[pme_lb->cur])/
++ (double)pme_grid_points(&pme_lb->setup[0]);
++
++ fprintf(fplog,"\n");
++ fprintf(fplog," P P - P M E L O A D B A L A N C I N G\n");
++ fprintf(fplog,"\n");
++ /* Here we only warn when the optimal setting is the last one */
++ if (pme_lb->elimited != epmelblimNO &&
++ pme_lb->cur == pme_loadbal_end(pme_lb)-1)
++ {
++ fprintf(fplog," NOTE: The PP/PME load balancing was limited by the %s,\n",
++ pmelblim_str[pme_lb->elimited]);
++ fprintf(fplog," you might not have reached a good load balance.\n");
++ if (pme_lb->elimited == epmelblimDD)
++ {
++ fprintf(fplog," Try different mdrun -dd settings or lower the -dds value.\n");
++ }
++ fprintf(fplog,"\n");
++ }
++ fprintf(fplog," PP/PME load balancing changed the cut-off and PME settings:\n");
++ fprintf(fplog," particle-particle PME\n");
++ fprintf(fplog," rcoulomb rlist grid spacing 1/beta\n");
++ print_pme_loadbal_setting(fplog,"initial",&pme_lb->setup[0]);
++ print_pme_loadbal_setting(fplog,"final" ,&pme_lb->setup[pme_lb->cur]);
++ fprintf(fplog," cost-ratio %4.2f %4.2f\n",
++ pp_ratio,grid_ratio);
++ fprintf(fplog," (note that these numbers concern only part of the total PP and PME load)\n");
++ fprintf(fplog,"\n");
++}
++
++/* Finish load balancing: print the summary (only when something actually
++ * changed or the scan was limited). Does not yet free the state, see the
++ * TODO below.
++ */
++void pme_loadbal_done(pme_load_balancing_t pme_lb, FILE *fplog)
++{
++ if (fplog != NULL && (pme_lb->cur > 0 || pme_lb->elimited != epmelblimNO))
++ {
++ print_pme_loadbal_settings(pme_lb,fplog);
++ }
++
++ /* TODO: Here we should free all pointers in pme_lb,
++ * but as it contains pme data structures,
++ * we need to first make pme.c free all data.
++ */
++}
--- /dev/null
- ret=tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi,
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+ *
+ *
+ * This source code is part of
+ *
+ * G R O M A C S
+ *
+ * GROningen MAchine for Chemical Simulations
+ *
+ * VERSION 3.2.0
+ * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2004, The GROMACS development team,
+ * check out http://www.gromacs.org for more information.
+
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * If you want to redistribute modifications, please consider that
+ * scientific software is very special. Version control is crucial -
+ * bugs must be traceable. We will be happy to consider code for
+ * inclusion in the official distribution, but derived work must not
+ * be called official GROMACS. Details are found in the README & COPYING
+ * files - if they are missing, get the official version at www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the papers on the package - you can find them in the top README file.
+ *
+ * For more info, check our website at http://www.gromacs.org
+ *
+ * And Hey:
+ * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#ifdef __linux
+#define _GNU_SOURCE
+#include <sched.h>
+#include <sys/syscall.h>
+#endif
+#include <signal.h>
+#include <stdlib.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <string.h>
+#include <assert.h>
+
+#include "typedefs.h"
+#include "smalloc.h"
+#include "sysstuff.h"
+#include "statutil.h"
+#include "mdrun.h"
+#include "md_logging.h"
+#include "md_support.h"
+#include "network.h"
+#include "pull.h"
+#include "pull_rotation.h"
+#include "names.h"
+#include "disre.h"
+#include "orires.h"
+#include "pme.h"
+#include "mdatoms.h"
+#include "repl_ex.h"
+#include "qmmm.h"
+#include "domdec.h"
+#include "partdec.h"
+#include "coulomb.h"
+#include "constr.h"
+#include "mvdata.h"
+#include "checkpoint.h"
+#include "mtop_util.h"
+#include "sighandler.h"
+#include "tpxio.h"
+#include "txtdump.h"
+#include "gmx_detect_hardware.h"
+#include "gmx_omp_nthreads.h"
+#include "pull_rotation.h"
+#include "calc_verletbuf.h"
+#include "nbnxn_search.h"
+#include "../mdlib/nbnxn_consts.h"
+#include "gmx_fatal_collective.h"
+#include "membed.h"
+#include "macros.h"
+#include "gmx_omp.h"
+
+#ifdef GMX_LIB_MPI
+#include <mpi.h>
+#endif
+#ifdef GMX_THREAD_MPI
+#include "tmpi.h"
+#endif
+
+#ifdef GMX_FAHCORE
+#include "corewrap.h"
+#endif
+
+#ifdef GMX_OPENMM
+#include "md_openmm.h"
+#endif
+
+#include "gpu_utils.h"
+#include "nbnxn_cuda_data_mgmt.h"
+
+/* Wrapper struct so we can build a table of integrator entry points,
+ * indexed by the eI integrator enum.
+ */
+typedef struct {
+ gmx_integrator_t *func;
+} gmx_intp_t;
+
+/* The array should match the eI array in include/types/enums.h */
+#ifdef GMX_OPENMM /* FIXME do_md_openmm needs fixing */
+const gmx_intp_t integrator[eiNR] = { {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm},{do_md_openmm}};
+#else
+const gmx_intp_t integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md},{do_md}};
+#endif
+
+gmx_large_int_t deform_init_init_step_tpx;
+matrix deform_init_box_tpx;
+#ifdef GMX_THREAD_MPI
+tMPI_Thread_mutex_t deform_init_box_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
+#endif
+
+
+#ifdef GMX_THREAD_MPI
+/* Argument bundle passed (as void*) to each thread-MPI thread; the fields
+ * mirror the parameter list of mdrunner(). Pointed-to data is shared,
+ * read-only between threads (see mdrunner_start_fn).
+ */
+struct mdrunner_arglist
+{
+ gmx_hw_opt_t *hw_opt;
+ FILE *fplog;
+ t_commrec *cr;
+ int nfile;
+ const t_filenm *fnm;
+ output_env_t oenv;
+ gmx_bool bVerbose;
+ gmx_bool bCompact;
+ int nstglobalcomm;
+ ivec ddxyz;
+ int dd_node_order;
+ real rdd;
+ real rconstr;
+ const char *dddlb_opt;
+ real dlb_scale;
+ const char *ddcsx;
+ const char *ddcsy;
+ const char *ddcsz;
+ const char *nbpu_opt;
+ /* NOTE(review): step counts are gmx_large_int_t elsewhere - confirm int
+ * is wide enough for nsteps_cmdline here.
+ */
+ int nsteps_cmdline;
+ int nstepout;
+ int resetstep;
+ int nmultisim;
+ int repl_ex_nst;
+ int repl_ex_nex;
+ int repl_ex_seed;
+ real pforce;
+ real cpt_period;
+ real max_hours;
+ const char *deviceOptions;
+ unsigned long Flags;
+ int ret; /* return value */
+};
+
+
+/* The function used for spawning threads. Extracts the mdrunner()
+ arguments from its one argument and calls mdrunner(), after making
+ a commrec. */
+/* Thread entry point for thread-MPI runs: unpacks the mdrunner_arglist,
+ * builds a thread-local commrec and file-name list, then calls mdrunner().
+ * The return value is stored in mda->ret (only the master writes fplog).
+ */
+static void mdrunner_start_fn(void *arg)
+{
+ struct mdrunner_arglist *mda=(struct mdrunner_arglist*)arg;
+ struct mdrunner_arglist mc=*mda; /* copy the arg list to make sure
+ that it's thread-local. This doesn't
+ copy pointed-to items, of course,
+ but those are all const. */
+ t_commrec *cr; /* we need a local version of this */
+ FILE *fplog=NULL;
+ t_filenm *fnm;
+
+ /* Each thread gets its own copy of the file names */
+ fnm = dup_tfn(mc.nfile, mc.fnm);
+
+ cr = init_par_threads(mc.cr);
+
+ if (MASTER(cr))
+ {
+ /* Only the master thread writes to the log file */
+ fplog=mc.fplog;
+ }
+
+ mda->ret=mdrunner(mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv,
+ mc.bVerbose, mc.bCompact, mc.nstglobalcomm,
+ mc.ddxyz, mc.dd_node_order, mc.rdd,
+ mc.rconstr, mc.dddlb_opt, mc.dlb_scale,
+ mc.ddcsx, mc.ddcsy, mc.ddcsz,
+ mc.nbpu_opt,
+ mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
+ mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce,
+ mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.Flags);
+}
+
+/* called by mdrunner() to start a specific number of threads (including
+ the main thread) for thread-parallel runs. This in turn calls mdrunner()
+ for each thread.
+ All options besides nthreads are the same as for mdrunner(). */
+/* Spawn hw_opt->nthreads_tmpi thread-MPI threads that each run mdrunner()
+ * via mdrunner_start_fn(); returns the new commrec for the calling thread,
+ * the unchanged cr when no threading is needed, or NULL on tMPI failure.
+ */
+static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
+ FILE *fplog,t_commrec *cr,int nfile,
+ const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
+ gmx_bool bCompact, int nstglobalcomm,
+ ivec ddxyz,int dd_node_order,real rdd,real rconstr,
+ const char *dddlb_opt,real dlb_scale,
+ const char *ddcsx,const char *ddcsy,const char *ddcsz,
+ const char *nbpu_opt,
+ int nsteps_cmdline, int nstepout,int resetstep,
+ int nmultisim,int repl_ex_nst,int repl_ex_nex, int repl_ex_seed,
+ real pforce,real cpt_period, real max_hours,
+ const char *deviceOptions, unsigned long Flags)
+{
+ int ret;
+ struct mdrunner_arglist *mda;
+ t_commrec *crn; /* the new commrec */
+ t_filenm *fnmn;
+
+ /* first check whether we even need to start tMPI */
+ if (hw_opt->nthreads_tmpi < 2)
+ {
+ return cr;
+ }
+
+ /* a few small, one-time, almost unavoidable memory leaks: */
+ snew(mda,1);
+ fnmn=dup_tfn(nfile, fnm);
+
+ /* fill the data structure to pass as void pointer to thread start fn */
+ mda->hw_opt=hw_opt;
+ mda->fplog=fplog;
+ mda->cr=cr;
+ mda->nfile=nfile;
+ mda->fnm=fnmn;
+ mda->oenv=oenv;
+ mda->bVerbose=bVerbose;
+ mda->bCompact=bCompact;
+ mda->nstglobalcomm=nstglobalcomm;
+ mda->ddxyz[XX]=ddxyz[XX];
+ mda->ddxyz[YY]=ddxyz[YY];
+ mda->ddxyz[ZZ]=ddxyz[ZZ];
+ mda->dd_node_order=dd_node_order;
+ mda->rdd=rdd;
+ mda->rconstr=rconstr;
+ mda->dddlb_opt=dddlb_opt;
+ mda->dlb_scale=dlb_scale;
+ mda->ddcsx=ddcsx;
+ mda->ddcsy=ddcsy;
+ mda->ddcsz=ddcsz;
+ mda->nbpu_opt=nbpu_opt;
+ mda->nsteps_cmdline=nsteps_cmdline;
+ mda->nstepout=nstepout;
+ mda->resetstep=resetstep;
+ mda->nmultisim=nmultisim;
+ mda->repl_ex_nst=repl_ex_nst;
+ mda->repl_ex_nex=repl_ex_nex;
+ mda->repl_ex_seed=repl_ex_seed;
+ mda->pforce=pforce;
+ mda->cpt_period=cpt_period;
+ mda->max_hours=max_hours;
+ mda->deviceOptions=deviceOptions;
+ mda->Flags=Flags;
+
+ fprintf(stderr, "Starting %d tMPI threads\n",hw_opt->nthreads_tmpi);
+ fflush(stderr);
+ /* now spawn new threads that start mdrunner_start_fn(), while
+ the main thread returns */
++ ret=tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi, TMPI_AFFINITY_ALL_CORES,
+ mdrunner_start_fn, (void*)(mda) );
+ if (ret!=TMPI_SUCCESS)
+ return NULL;
+
+ /* make a new comm_rec to reflect the new situation */
+ crn=init_par_threads(cr);
+ return crn;
+}
+
+
+/* Choose the number of thread-MPI ranks given the total thread budget and
+ * the number of usable GPUs: one rank per GPU, nthreads_tot/nthreads_omp
+ * when OpenMP threads were requested, otherwise one rank per thread.
+ */
+static int get_tmpi_omp_thread_distribution(const gmx_hw_opt_t *hw_opt,
+ int nthreads_tot,
+ int ngpu)
+{
+ int nthreads_tmpi;
+
+ /* There are no separate PME nodes here, as we ensured in
+ * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
+ * and a conditional ensures we would not have ended up here.
+ * Note that separate PME nodes might be switched on later.
+ */
+ if (ngpu > 0)
+ {
+ nthreads_tmpi = ngpu;
+ if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
+ {
+ nthreads_tmpi = nthreads_tot;
+ }
+ }
+ else if (hw_opt->nthreads_omp > 0)
+ {
+ if (hw_opt->nthreads_omp > nthreads_tot)
+ {
+ gmx_fatal(FARGS,"More OpenMP threads requested (%d) than the total number of threads requested (%d)",hw_opt->nthreads_omp,nthreads_tot);
+ }
+ nthreads_tmpi = nthreads_tot/hw_opt->nthreads_omp;
+ }
+ else
+ {
+ /* TODO choose nthreads_omp based on hardware topology
+ when we have a hardware topology detection library */
+ /* Don't use OpenMP parallelization */
+ nthreads_tmpi = nthreads_tot;
+ }
+
+ return nthreads_tmpi;
+}
+
+
+/* Get the number of threads to use for thread-MPI based on how many
+ * were requested, which algorithms we're using,
+ * and how many particles there are.
+ * At the point we have already called check_and_update_hw_opt.
+ * Thus all options should be internally consistent and consistent
+ * with the hardware, except that ntmpi could be larger than #GPU.
+ */
+/* Decide how many thread-MPI ranks to start, based on the user request,
+ * the detected hardware, the integrator and the system size; caps the
+ * rank count so each rank keeps a minimum number of atoms.
+ */
+static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
+ gmx_hw_opt_t *hw_opt,
+ t_inputrec *inputrec, gmx_mtop_t *mtop,
+ const t_commrec *cr,
+ FILE *fplog)
+{
+ int nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu;
+ int min_atoms_per_mpi_thread;
+ char *env;
+ char sbuf[STRLEN];
+ gmx_bool bCanUseGPU;
+
+ if (hw_opt->nthreads_tmpi > 0)
+ {
+ /* Trivial, return right away */
+ return hw_opt->nthreads_tmpi;
+ }
+
+ /* How many total (#tMPI*#OpenMP) threads can we start? */
+ if (hw_opt->nthreads_tot > 0)
+ {
+ nthreads_tot_max = hw_opt->nthreads_tot;
+ }
+ else
+ {
+ nthreads_tot_max = tMPI_Thread_get_hw_number();
+ }
+
+ /* GPUs are only used with the Verlet cut-off scheme */
+ bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU);
+ if (bCanUseGPU)
+ {
+ ngpu = hwinfo->gpu_info.ncuda_dev_use;
+ }
+ else
+ {
+ ngpu = 0;
+ }
+
+ nthreads_tmpi =
+ get_tmpi_omp_thread_distribution(hw_opt,nthreads_tot_max,ngpu);
+
+ if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
+ {
+ /* Steps are divided over the nodes iso splitting the atoms */
+ min_atoms_per_mpi_thread = 0;
+ }
+ else
+ {
+ if (bCanUseGPU)
+ {
+ min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU;
+ }
+ else
+ {
+ min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD;
+ }
+ }
+
+ /* Check if an algorithm does not support parallel simulation. */
+ if (nthreads_tmpi != 1 &&
+ ( inputrec->eI == eiLBFGS ||
+ inputrec->coulombtype == eelEWALD ) )
+ {
+ nthreads_tmpi = 1;
+
+ md_print_warn(cr,fplog,"The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
+ if (hw_opt->nthreads_tmpi > nthreads_tmpi)
+ {
+ gmx_fatal(FARGS,"You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
+ }
+ }
+ else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread)
+ {
+ /* the thread number was chosen automatically, but there are too many
+ threads (too few atoms per thread) */
+ nthreads_new = max(1,mtop->natoms/min_atoms_per_mpi_thread);
+
+ if (nthreads_new > 8 || (nthreads_tmpi == 8 && nthreads_new > 4))
+ {
+ /* TODO replace this once we have proper HT detection
+ * Use only multiples of 4 above 8 threads
+ * or with an 8-core processor
+ * (to avoid 6 threads on 8 core processors with 4 real cores).
+ */
+ nthreads_new = (nthreads_new/4)*4;
+ }
+ else if (nthreads_new > 4)
+ {
+ /* Avoid 5 or 7 threads */
+ nthreads_new = (nthreads_new/2)*2;
+ }
+
+ nthreads_tmpi = nthreads_new;
+
+ fprintf(stderr,"\n");
+ fprintf(stderr,"NOTE: Parallelization is limited by the small number of atoms,\n");
+ fprintf(stderr," only starting %d thread-MPI threads.\n",nthreads_tmpi);
+ fprintf(stderr," You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
+ }
+
+ return nthreads_tmpi;
+}
+#endif /* GMX_THREAD_MPI */
+
+
+/* Environment variable for setting nstlist */
+static const char* NSTLIST_ENVVAR = "GMX_NSTLIST";
+/* Try to increase nstlist when using a GPU with nstlist less than this */
+static const int NSTLIST_GPU_ENOUGH = 20;
+/* Increase nstlist until the non-bonded cost increases more than this factor */
+static const float NBNXN_GPU_LIST_OK_FAC = 1.25;
+/* Don't increase nstlist beyond a non-bonded cost increases of this factor */
+static const float NBNXN_GPU_LIST_MAX_FAC = 1.40;
+
+/* Try to increase nstlist when running on a GPU */
+/* Try to increase nstlist for GPU runs: step through candidate nstlist
+ * values (or take GMX_NSTLIST from the environment), recompute the Verlet
+ * buffer for each, and accept the largest nstlist whose rlist still fits
+ * the box/domain decomposition and stays below a cost bound.
+ * On success ir->nstlist, ir->rlist and ir->rlistlong are updated.
+ */
+static void increase_nstlist(FILE *fp,t_commrec *cr,
+ t_inputrec *ir,const gmx_mtop_t *mtop,matrix box)
+{
+ char *env;
+ int nstlist_orig,nstlist_prev;
+ verletbuf_list_setup_t ls;
+ real rlist_inc,rlist_ok,rlist_max,rlist_new,rlist_prev;
+ int i;
+ t_state state_tmp;
+ gmx_bool bBox,bDD,bCont;
+ const char *nstl_fmt="\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
+ const char *vbd_err="Can not increase nstlist for GPU run because verlet-buffer-drift is not set or used";
+ const char *box_err="Can not increase nstlist for GPU run because the box is too small";
+ const char *dd_err ="Can not increase nstlist for GPU run because of domain decomposition limitations";
+ char buf[STRLEN];
+
+ /* Number of + nstlist alternative values to try when switching */
+ const int nstl[]={ 20, 25, 40, 50 };
+#define NNSTL sizeof(nstl)/sizeof(nstl[0])
+
+ env = getenv(NSTLIST_ENVVAR);
+ if (env == NULL)
+ {
+ if (fp != NULL)
+ {
+ fprintf(fp,nstl_fmt,ir->nstlist);
+ }
+ }
+
+ if (ir->verletbuf_drift == 0)
+ {
+ gmx_fatal(FARGS,"You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
+ }
+
+ if (ir->verletbuf_drift < 0)
+ {
+ /* Fixed rlist requested: nothing we can tune, report and bail out */
+ if (MASTER(cr))
+ {
+ fprintf(stderr,"%s\n",vbd_err);
+ }
+ if (fp != NULL)
+ {
+ fprintf(fp,"%s\n",vbd_err);
+ }
+
+ return;
+ }
+
+ nstlist_orig = ir->nstlist;
+ if (env != NULL)
+ {
+ sprintf(buf,"Getting nstlist from environment variable GMX_NSTLIST=%s",env);
+ if (MASTER(cr))
+ {
+ fprintf(stderr,"%s\n",buf);
+ }
+ if (fp != NULL)
+ {
+ fprintf(fp,"%s\n",buf);
+ }
+ sscanf(env,"%d",&ir->nstlist);
+ }
+
+ verletbuf_get_list_setup(TRUE,&ls);
+
+ /* Allow rlist to make the list double the size of the cut-off sphere */
+ rlist_inc = nbnxn_get_rlist_effective_inc(NBNXN_GPU_CLUSTER_SIZE,mtop->natoms/det(box));
+ rlist_ok = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_OK_FAC,1.0/3.0) - rlist_inc;
+ rlist_max = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_MAX_FAC,1.0/3.0) - rlist_inc;
+ if (debug)
+ {
+ fprintf(debug,"GPU nstlist tuning: rlist_inc %.3f rlist_max %.3f\n",
+ rlist_inc,rlist_max);
+ }
+
+ i = 0;
+ nstlist_prev = nstlist_orig;
+ rlist_prev = ir->rlist;
+ do
+ {
+ if (env == NULL)
+ {
+ ir->nstlist = nstl[i];
+ }
+
+ /* Set the pair-list buffer size in ir */
+ calc_verlet_buffer_size(mtop,det(box),ir,ir->verletbuf_drift,&ls,
+ NULL,&rlist_new);
+
+ /* Does rlist fit in the box? */
+ bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC,box));
+ bDD = TRUE;
+ if (bBox && DOMAINDECOMP(cr))
+ {
+ /* Check if rlist fits in the domain decomposition */
+ if (inputrec2nboundeddim(ir) < DIM)
+ {
+ gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
+ }
+ copy_mat(box,state_tmp.box);
+ bDD = change_dd_cutoff(cr,&state_tmp,ir,rlist_new);
+ }
+
+ bCont = FALSE;
+
+ if (env == NULL)
+ {
+ if (bBox && bDD && rlist_new <= rlist_max)
+ {
+ /* Increase nstlist */
+ nstlist_prev = ir->nstlist;
+ rlist_prev = rlist_new;
+ bCont = (i+1 < NNSTL && rlist_new < rlist_ok);
+ }
+ else
+ {
+ /* Stick with the previous nstlist */
+ ir->nstlist = nstlist_prev;
+ rlist_new = rlist_prev;
+ bBox = TRUE;
+ bDD = TRUE;
+ }
+ }
+
+ i++;
+ }
+ while (bCont);
+
+ if (!bBox || !bDD)
+ {
+ gmx_warning(!bBox ? box_err : dd_err);
+ if (fp != NULL)
+ {
+ fprintf(fp,"\n%s\n",bBox ? box_err : dd_err);
+ }
+ ir->nstlist = nstlist_orig;
+ }
+ else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
+ {
+ sprintf(buf,"Changing nstlist from %d to %d, rlist from %g to %g",
+ nstlist_orig,ir->nstlist,
+ ir->rlist,rlist_new);
+ if (MASTER(cr))
+ {
+ fprintf(stderr,"%s\n\n",buf);
+ }
+ if (fp != NULL)
+ {
+ fprintf(fp,"%s\n\n",buf);
+ }
+ ir->rlist = rlist_new;
+ ir->rlistlong = rlist_new;
+ }
+}
+
+/* Prepare a Verlet-scheme run: decide (on the MPI master) whether a GPU
+ * will be used, recompute the pair-list buffer for the current setup and,
+ * for GPU/emulation runs with a short nstlist, try to increase nstlist.
+ */
+static void prepare_verlet_scheme(FILE *fplog,
+ gmx_hw_info_t *hwinfo,
+ t_commrec *cr,
+ gmx_hw_opt_t *hw_opt,
+ const char *nbpu_opt,
+ t_inputrec *ir,
+ const gmx_mtop_t *mtop,
+ matrix box,
+ gmx_bool *bUseGPU)
+{
+ /* Here we only check for GPU usage on the MPI master process,
+ * as here we don't know how many GPUs we will use yet.
+ * We check for a GPU on all processes later.
+ */
+ *bUseGPU = hwinfo->bCanUseGPU || (getenv("GMX_EMULATE_GPU") != NULL);
+
+ if (ir->verletbuf_drift > 0)
+ {
+ /* Update the Verlet buffer size for the current run setup */
+ verletbuf_list_setup_t ls;
+ real rlist_new;
+
+ /* Here we assume CPU acceleration is on. But as currently
+ * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
+ * and 4x2 gives a larger buffer than 4x4, this is ok.
+ */
+ verletbuf_get_list_setup(*bUseGPU,&ls);
+
+ calc_verlet_buffer_size(mtop,det(box),ir,
+ ir->verletbuf_drift,&ls,
+ NULL,&rlist_new);
+ if (rlist_new != ir->rlist)
+ {
+ if (fplog != NULL)
+ {
+ fprintf(fplog,"\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
+ ir->rlist,rlist_new,
+ ls.cluster_size_i,ls.cluster_size_j);
+ }
+ ir->rlist = rlist_new;
+ ir->rlistlong = rlist_new;
+ }
+ }
+
+ /* With GPU or emulation we should check nstlist for performance */
+ if ((EI_DYNAMICS(ir->eI) &&
+ *bUseGPU &&
+ ir->nstlist < NSTLIST_GPU_ENOUGH) ||
+ getenv(NSTLIST_ENVVAR) != NULL)
+ {
+ /* Choose a better nstlist */
+ increase_nstlist(fplog,cr,ir,mtop,box);
+ }
+}
+
+/* Convert an old group-scheme tpr setup to the Verlet cut-off scheme:
+ * rewrites cutoff_scheme, vdw/coulomb types, the buffer drift target and
+ * rlist, and removes charge groups. Fatal error for setups the Verlet
+ * scheme cannot represent (unequal cut-offs, user potentials, non-3D pbc,
+ * free energy, implicit solvent).
+ */
+static void convert_to_verlet_scheme(FILE *fplog,
+ t_inputrec *ir,
+ gmx_mtop_t *mtop,real box_vol)
+{
+ char *conv_mesg="Converting input file with group cut-off scheme to the Verlet cut-off scheme";
+
+ md_print_warn(NULL,fplog,"%s\n",conv_mesg);
+
+ ir->cutoff_scheme = ecutsVERLET;
+ ir->verletbuf_drift = 0.005;
+
+ if (ir->rcoulomb != ir->rvdw)
+ {
+ gmx_fatal(FARGS,"The VdW and Coulomb cut-offs are different, whereas the Verlet scheme only supports equal cut-offs");
+ }
+
+ if (ir->vdwtype == evdwUSER || EEL_USER(ir->coulombtype))
+ {
+ gmx_fatal(FARGS,"User non-bonded potentials are not (yet) supported with the Verlet scheme");
+ }
+ else if (EVDW_SWITCHED(ir->vdwtype) || EEL_SWITCHED(ir->coulombtype))
+ {
+ md_print_warn(NULL,fplog,"Converting switched or shifted interactions to a shifted potential (without force shift), this will lead to slightly different interaction potentials");
+
+ if (EVDW_SWITCHED(ir->vdwtype))
+ {
+ ir->vdwtype = evdwCUT;
+ }
+ if (EEL_SWITCHED(ir->coulombtype))
+ {
+ if (EEL_FULL(ir->coulombtype))
+ {
+ /* With full electrostatic only PME can be switched */
+ ir->coulombtype = eelPME;
+ }
+ else
+ {
+ md_print_warn(NULL,fplog,"NOTE: Replacing %s electrostatics with reaction-field with epsilon-rf=inf\n",eel_names[ir->coulombtype]);
+ ir->coulombtype = eelRF;
+ ir->epsilon_rf = 0.0;
+ }
+ }
+
+ /* We set the target energy drift to a small number.
+ * Note that this is only for testing. For production the user
+ * should think about this and set the mdp options.
+ */
+ ir->verletbuf_drift = 1e-4;
+ }
+
+ if (inputrec2nboundeddim(ir) != 3)
+ {
+ gmx_fatal(FARGS,"Can only convert old tpr files to the Verlet cut-off scheme with 3D pbc");
+ }
+
+ if (ir->efep != efepNO || ir->implicit_solvent != eisNO)
+ {
+ gmx_fatal(FARGS,"Will not convert old tpr files to the Verlet cut-off scheme with free-energy calculations or implicit solvent");
+ }
+
+ if (EI_DYNAMICS(ir->eI) && !(EI_MD(ir->eI) && ir->etc == etcNO))
+ {
+ verletbuf_list_setup_t ls;
+
+ /* Compute rlist from the requested buffer drift */
+ verletbuf_get_list_setup(FALSE,&ls);
+ calc_verlet_buffer_size(mtop,box_vol,ir,ir->verletbuf_drift,&ls,
+ NULL,&ir->rlist);
+ }
+ else
+ {
+ /* No drift estimate possible: use a 5% buffer on the cut-off */
+ ir->verletbuf_drift = -1;
+ ir->rlist = 1.05*max(ir->rvdw,ir->rcoulomb);
+ }
+
+ gmx_mtop_remove_chargegroups(mtop);
+}
+
+
+/* Set CPU affinity. Can be important for performance.
+ On some systems (e.g. Cray) CPU Affinity is set by default.
+ But default assigning doesn't work (well) with only some ranks
+ having threads. This causes very low performance.
+ External tools have cumbersome syntax for setting affinity
+ in the case that only some ranks have threads.
+ Thus it is important that GROMACS sets the affinity internally
+ if only PME is using threads.
+*/
+static void set_cpu_affinity(FILE *fplog,
+ const t_commrec *cr,
+ const gmx_hw_opt_t *hw_opt,
+ int nthreads_pme,
+ const gmx_hw_info_t *hwinfo,
+ const t_inputrec *inputrec)
+{
+#ifdef GMX_OPENMP /* TODO: actually we could do this even without OpenMP?! */
+#ifdef __linux /* TODO: only linux? why not everywhere if sched_setaffinity is available */
+ if (hw_opt->bThreadPinning)
+ {
+ int thread, nthread_local, nthread_node, nthread_hw_max, nphyscore;
+ int offset;
+ char *env;
+
+ /* threads on this MPI process or TMPI thread */
+ if (cr->duty & DUTY_PP)
+ {
+ nthread_local = gmx_omp_nthreads_get(emntNonbonded);
+ }
+ else
+ {
+ nthread_local = gmx_omp_nthreads_get(emntPME);
+ }
+
+ /* map the current process to cores */
+ thread = 0;
+ nthread_node = nthread_local;
+#ifdef GMX_MPI
+ if (PAR(cr) || MULTISIM(cr))
+ {
+ /* We need to determine a scan of the thread counts in this
+ * compute node.
+ */
+ MPI_Comm comm_intra;
+
+ MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),cr->nodeid_intra,
+ &comm_intra);
+ MPI_Scan(&nthread_local,&thread,1,MPI_INT,MPI_SUM,comm_intra);
+ /* MPI_Scan is inclusive, but here we need exclusive */
+ thread -= nthread_local;
+ /* Get the total number of threads on this physical node */
+ MPI_Allreduce(&nthread_local,&nthread_node,1,MPI_INT,MPI_SUM,comm_intra);
+ MPI_Comm_free(&comm_intra);
+ }
+#endif
+
+ offset = 0;
+ if (hw_opt->core_pinning_offset > 0)
+ {
+ offset = hw_opt->core_pinning_offset;
+ if (SIMMASTER(cr))
+ {
+ fprintf(stderr, "Applying core pinning offset %d\n", offset);
+ }
+ if (fplog)
+ {
+ fprintf(fplog, "Applying core pinning offset %d\n", offset);
+ }
+ }
+
+ /* With Intel Hyper-Threading enabled, we want to pin consecutive
+ * threads to physical cores when using more threads than physical
+ * cores or when the user requests so.
+ */
+ nthread_hw_max = hwinfo->nthreads_hw_avail;
+ nphyscore = -1;
+ if (hw_opt->bPinHyperthreading ||
+ (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
+ nthread_node > nthread_hw_max/2 && getenv("GMX_DISABLE_PINHT") == NULL))
+ {
+ if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) != GMX_CPUID_X86_SMT_ENABLED)
+ {
+ /* We print to stderr on all processes, as we might have
+ * different settings on different physical nodes.
+ */
+ if (gmx_cpuid_vendor(hwinfo->cpuid_info) != GMX_CPUID_VENDOR_INTEL)
+ {
+ md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
+ "but non-Intel CPU detected (vendor: %s)\n",
+ gmx_cpuid_vendor_string[gmx_cpuid_vendor(hwinfo->cpuid_info)]);
+ }
+ else
+ {
+ md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
+ "but the CPU detected does not have Intel Hyper-Threading support "
+ "(or it is turned off)\n");
+ }
+ }
+ nphyscore = nthread_hw_max/2;
+
+ if (SIMMASTER(cr))
+ {
+ fprintf(stderr, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
+ nphyscore);
+ }
+ if (fplog)
+ {
+ fprintf(fplog, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
+ nphyscore);
+ }
+ }
+
+ /* set the per-thread affinity */
+#pragma omp parallel firstprivate(thread) num_threads(nthread_local)
+ {
+ cpu_set_t mask;
+ int core;
+
+ CPU_ZERO(&mask);
+ thread += gmx_omp_get_thread_num();
+ if (nphyscore <= 0)
+ {
+ core = offset + thread;
+ }
+ else
+ {
+ /* Lock pairs of threads to the same hyperthreaded core */
+ core = offset + thread/2 + (thread % 2)*nphyscore;
+ }
+ CPU_SET(core, &mask);
+ sched_setaffinity((pid_t) syscall (SYS_gettid), sizeof(cpu_set_t), &mask);
+ }
+ }
+#endif /* __linux */
+#endif /* GMX_OPENMP */
+}
+
+
+static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt,
+ int cutoff_scheme)
+{
+ gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp);
+
+#ifndef GMX_THREAD_MPI
+ if (hw_opt->nthreads_tot > 0)
+ {
+ gmx_fatal(FARGS,"Setting the total number of threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
+ }
+ if (hw_opt->nthreads_tmpi > 0)
+ {
+ gmx_fatal(FARGS,"Setting the number of thread-MPI threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
+ }
+#endif
+
+ if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
+ {
+ /* We have the same number of OpenMP threads for PP and PME processes,
+ * thus we can perform several consistency checks.
+ */
+ if (hw_opt->nthreads_tmpi > 0 &&
+ hw_opt->nthreads_omp > 0 &&
+ hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
+ {
+ gmx_fatal(FARGS,"The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
+ hw_opt->nthreads_tot,hw_opt->nthreads_tmpi,hw_opt->nthreads_omp);
+ }
+
+ if (hw_opt->nthreads_tmpi > 0 &&
+ hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
+ {
+ gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
+ hw_opt->nthreads_tot,hw_opt->nthreads_tmpi);
+ }
+
+ if (hw_opt->nthreads_omp > 0 &&
+ hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
+ {
+ gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
+ hw_opt->nthreads_tot,hw_opt->nthreads_omp);
+ }
+
+ if (hw_opt->nthreads_tmpi > 0 &&
+ hw_opt->nthreads_omp <= 0)
+ {
+ hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
+ }
+ }
+
+#ifndef GMX_OPENMP
+ if (hw_opt->nthreads_omp > 1)
+ {
+ gmx_fatal(FARGS,"OpenMP threads are requested, but Gromacs was compiled without OpenMP support");
+ }
+#endif
+
+ if (cutoff_scheme == ecutsGROUP)
+ {
+ /* We only have OpenMP support for PME only nodes */
+ if (hw_opt->nthreads_omp > 1)
+ {
+ gmx_fatal(FARGS,"OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
+ ecutscheme_names[cutoff_scheme],
+ ecutscheme_names[ecutsVERLET]);
+ }
+ hw_opt->nthreads_omp = 1;
+ }
+
+ if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
+ {
+ gmx_fatal(FARGS,"You need to specify -ntomp in addition to -ntomp_pme");
+ }
+
+ if (hw_opt->nthreads_tot == 1)
+ {
+ hw_opt->nthreads_tmpi = 1;
+
+ if (hw_opt->nthreads_omp > 1)
+ {
+ gmx_fatal(FARGS,"You requested %d OpenMP threads with %d total threads",
+ hw_opt->nthreads_tmpi,hw_opt->nthreads_tot);
+ }
+ hw_opt->nthreads_omp = 1;
+ }
+
+ if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
+ {
+ hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
+ }
+
+ if (debug)
+ {
+ fprintf(debug,"hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
+ hw_opt->nthreads_tot,
+ hw_opt->nthreads_tmpi,
+ hw_opt->nthreads_omp,
+ hw_opt->nthreads_omp_pme,
+ hw_opt->gpu_id!=NULL ? hw_opt->gpu_id : "");
+
+ }
+}
+
+
+/* Override the value in inputrec with value passed on the command line (if any) */
+static void override_nsteps_cmdline(FILE *fplog,
+ int nsteps_cmdline,
+ t_inputrec *ir,
+ const t_commrec *cr)
+{
+ assert(ir);
+ assert(cr);
+
+ /* override with anything else than the default -2 */
+ if (nsteps_cmdline > -2)
+ {
+ char stmp[STRLEN];
+
+ ir->nsteps = nsteps_cmdline;
+ if (EI_DYNAMICS(ir->eI))
+ {
+ sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps, %.3f ps",
+ nsteps_cmdline, nsteps_cmdline*ir->delta_t);
+ }
+ else
+ {
+ sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps",
+ nsteps_cmdline);
+ }
+
+ md_print_warn(cr, fplog, "%s\n", stmp);
+ }
+}
+
/* Data structure set by SIMMASTER which needs to be passed to all nodes
 * before the other nodes have read the tpx file and called gmx_detect_hardware.
 */
typedef struct {
    int cutoff_scheme;    /* The cutoff-scheme from inputrec_t (ecutsVERLET/ecutsGROUP) */
    gmx_bool bUseGPU;     /* Use GPU or GPU emulation                                   */
} master_inf_t;
+
/* Top-level simulation driver.
 *
 * Reads the tpr file, detects hardware, (with thread-MPI) spawns the
 * worker threads, sets up domain decomposition and/or PME-only nodes,
 * initializes the force record, constraints, pulling, etc., runs the
 * integrator for PP duty or the PME-only loop for PME duty, and finally
 * collects timings and cleans up.
 *
 * Returns the stop condition (gmx_get_stop_condition()) as an int.
 *
 * CAUTION: with thread-MPI, cr is replaced part-way through by the
 * communication record of the spawned thread set; until the comment
 * "END OF CAUTION" below, cr does not reflect the final parallel state.
 */
int mdrunner(gmx_hw_opt_t *hw_opt,
             FILE *fplog,t_commrec *cr,int nfile,
             const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
             gmx_bool bCompact, int nstglobalcomm,
             ivec ddxyz,int dd_node_order,real rdd,real rconstr,
             const char *dddlb_opt,real dlb_scale,
             const char *ddcsx,const char *ddcsy,const char *ddcsz,
             const char *nbpu_opt,
             int nsteps_cmdline, int nstepout,int resetstep,
             int nmultisim,int repl_ex_nst,int repl_ex_nex,
             int repl_ex_seed, real pforce,real cpt_period,real max_hours,
             const char *deviceOptions, unsigned long Flags)
{
    /* NOTE(review): several locals below (nodetime, realtime, tmpr1, tmpr2,
     * gro, list, i, m, nalloc) appear unused in this function body. */
    gmx_bool bForceUseGPU,bTryUseGPU;
    double nodetime=0,realtime;
    t_inputrec *inputrec;
    t_state *state=NULL;
    matrix box;
    gmx_ddbox_t ddbox={0};
    int npme_major,npme_minor;
    real tmpr1,tmpr2;
    t_nrnb *nrnb;
    gmx_mtop_t *mtop=NULL;
    t_mdatoms *mdatoms=NULL;
    t_forcerec *fr=NULL;
    t_fcdata *fcd=NULL;
    real ewaldcoeff=0;
    gmx_pme_t *pmedata=NULL;
    gmx_vsite_t *vsite=NULL;
    gmx_constr_t constr;
    int i,m,nChargePerturbed=-1,status,nalloc;
    char *gro;
    gmx_wallcycle_t wcycle;
    gmx_bool bReadRNG,bReadEkin;
    int list;
    gmx_runtime_t runtime;
    int rc;
    gmx_large_int_t reset_counters;
    gmx_edsam_t ed=NULL;
    t_commrec *cr_old=cr;
    int nthreads_pme=1;
    int nthreads_pp=1;
    gmx_membed_t membed=NULL;
    gmx_hw_info_t *hwinfo=NULL;
    master_inf_t minf={-1,FALSE};

    /* CAUTION: threads may be started later on in this function, so
       cr doesn't reflect the final parallel state right now */
    snew(inputrec,1);
    snew(mtop,1);

    if (Flags & MD_APPENDFILES)
    {
        /* The log file is opened later (appending) by gmx_log_open */
        fplog = NULL;
    }

    /* -nb option: "gpu" forces GPU use, "auto" tries it */
    bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
    bTryUseGPU   = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;

    snew(state,1);
    if (SIMMASTER(cr))
    {
        /* Read (nearly) all data required for the simulation */
        read_tpx_state(ftp2fn(efTPX,nfile,fnm),inputrec,state,NULL,mtop);

        /* Optionally convert old group-scheme input to the Verlet scheme */
        if (inputrec->cutoff_scheme != ecutsVERLET &&
            ((Flags & MD_TESTVERLET) || getenv("GMX_VERLET_SCHEME") != NULL))
        {
            convert_to_verlet_scheme(fplog,inputrec,mtop,det(state->box));
        }

        /* Detect hardware, gather information. With tMPI only thread 0 does it
         * and after threads are started broadcasts hwinfo around. */
        snew(hwinfo, 1);
        gmx_detect_hardware(fplog, hwinfo, cr,
                            bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);

        minf.cutoff_scheme = inputrec->cutoff_scheme;
        minf.bUseGPU       = FALSE;

        if (inputrec->cutoff_scheme == ecutsVERLET)
        {
            prepare_verlet_scheme(fplog,hwinfo,cr,hw_opt,nbpu_opt,
                                  inputrec,mtop,state->box,
                                  &minf.bUseGPU);
        }
        else if (hwinfo->bCanUseGPU)
        {
            md_print_warn(cr,fplog,
                          "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
                          "      To use a GPU, set the mdp option: cutoff-scheme = Verlet\n"
                          "      (for quick performance testing you can use the -testverlet option)\n");

            if (bForceUseGPU)
            {
                gmx_fatal(FARGS,"GPU requested, but can't be used without cutoff-scheme=Verlet");
            }
        }
    }
#ifndef GMX_THREAD_MPI
    if (PAR(cr))
    {
        /* With real MPI the non-master ranks need the master's decisions now */
        gmx_bcast_sim(sizeof(minf),&minf,cr);
    }
#endif
    if (minf.bUseGPU && cr->npmenodes == -1)
    {
        /* Don't automatically use PME-only nodes with GPUs */
        cr->npmenodes = 0;
    }

#ifdef GMX_THREAD_MPI
    /* With thread-MPI inputrec is only set here on the master thread */
    if (SIMMASTER(cr))
#endif
    {
        check_and_update_hw_opt(hw_opt,minf.cutoff_scheme);

#ifdef GMX_THREAD_MPI
        if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
        {
            gmx_fatal(FARGS,"You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME nodes");
        }
#endif

        if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
            cr->npmenodes <= 0)
        {
            gmx_fatal(FARGS,"You need to explicitly specify the number of PME nodes (-npme) when using different number of OpenMP threads for PP and PME nodes");
        }
    }

#ifdef GMX_THREAD_MPI
    if (SIMMASTER(cr))
    {
        /* NOW the threads will be started: */
        hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
                                                 hw_opt,
                                                 inputrec, mtop,
                                                 cr, fplog);
        if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
        {
            hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
        }

        if (hw_opt->nthreads_tmpi > 1)
        {
            /* now start the threads. */
            cr=mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm,
                                      oenv, bVerbose, bCompact, nstglobalcomm,
                                      ddxyz, dd_node_order, rdd, rconstr,
                                      dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
                                      nbpu_opt,
                                      nsteps_cmdline, nstepout, resetstep, nmultisim,
                                      repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
                                      cpt_period, max_hours, deviceOptions,
                                      Flags);
            /* the main thread continues here with a new cr. We don't deallocate
               the old cr because other threads may still be reading it. */
            if (cr == NULL)
            {
                gmx_comm("Failed to spawn threads");
            }
        }
    }
#endif
    /* END OF CAUTION: cr is now reliable */

    /* g_membed initialisation *
     * Because we change the mtop, init_membed is called before the init_parallel *
     * (in case we ever want to make it run in parallel) */
    if (opt2bSet("-membed",nfile,fnm))
    {
        if (MASTER(cr))
        {
            fprintf(stderr,"Initializing membed");
        }
        membed = init_membed(fplog,nfile,fnm,mtop,inputrec,state,cr,&cpt_period);
    }

    if (PAR(cr))
    {
        /* now broadcast everything to the non-master nodes/threads: */
        init_parallel(fplog, cr, inputrec, mtop);

        /* This check needs to happen after get_nthreads_mpi() */
        if (inputrec->cutoff_scheme == ecutsVERLET && (Flags & MD_PARTDEC))
        {
            gmx_fatal_collective(FARGS,cr,NULL,
                                 "The Verlet cut-off scheme is not supported with particle decomposition.\n"
                                 "You can achieve the same effect as particle decomposition by running in parallel using only OpenMP threads.");
        }
    }
    if (fplog != NULL)
    {
        pr_inputrec(fplog,0,"Input Parameters",inputrec,FALSE);
    }

#if defined GMX_THREAD_MPI
    /* With tMPI we detected on thread 0 and we'll just pass the hwinfo pointer
     * to the other threads -- slightly uncool, but works fine, just need to
     * make sure that the data doesn't get freed twice. */
    if (cr->nnodes > 1)
    {
        if (!SIMMASTER(cr))
        {
            snew(hwinfo, 1);
        }
        /* Broadcasts the pointer value itself between the threads of one
         * process; sizeof(&hwinfo) is the size of a pointer. */
        gmx_bcast(sizeof(&hwinfo), &hwinfo, cr);
    }
#else
    if (PAR(cr) && !SIMMASTER(cr))
    {
        /* now we have inputrec on all nodes, can run the detection */
        /* TODO: perhaps it's better to propagate within a node instead? */
        snew(hwinfo, 1);
        gmx_detect_hardware(fplog, hwinfo, cr,
                            bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
    }
#endif

    /* now make sure the state is initialized and propagated */
    set_state_entries(state,inputrec,cr->nnodes);

    /* remove when vv and rerun works correctly! */
    if (PAR(cr) && EI_VV(inputrec->eI) && ((Flags & MD_RERUN) || (Flags & MD_RERUN_VSITE)))
    {
        gmx_fatal(FARGS,
                  "Currently can't do velocity verlet with rerun in parallel.");
    }

    /* A parallel command line option consistency check that we can
       only do after any threads have started. */
    if (!PAR(cr) &&
        (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
    {
        gmx_fatal(FARGS,
                  "The -dd or -npme option request a parallel simulation, "
#ifndef GMX_MPI
                  "but %s was compiled without threads or MPI enabled"
#else
#ifdef GMX_THREAD_MPI
                  "but the number of threads (option -nt) is 1"
#else
                  "but %s was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec"
#endif
#endif
                  , ShortProgram()
            );
    }

    if ((Flags & MD_RERUN) &&
        (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
    {
        gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
    }

    if (can_use_allvsall(inputrec,mtop,TRUE,cr,fplog) && PAR(cr))
    {
        /* All-vs-all loops do not work with domain decomposition */
        Flags |= MD_PARTDEC;
    }

    if (!EEL_PME(inputrec->coulombtype) || (Flags & MD_PARTDEC))
    {
        /* Separate PME nodes only make sense with PME electrostatics
         * and domain decomposition. */
        if (cr->npmenodes > 0)
        {
            if (!EEL_PME(inputrec->coulombtype))
            {
                gmx_fatal_collective(FARGS,cr,NULL,
                                     "PME nodes are requested, but the system does not use PME electrostatics");
            }
            if (Flags & MD_PARTDEC)
            {
                gmx_fatal_collective(FARGS,cr,NULL,
                                     "PME nodes are requested, but particle decomposition does not support separate PME nodes");
            }
        }

        cr->npmenodes = 0;
    }

#ifdef GMX_FAHCORE
    fcRegisterSteps(inputrec->nsteps,inputrec->init_step);
#endif

    /* NMR restraints must be initialized before load_checkpoint,
     * since with time averaging the history is added to t_state.
     * For proper consistency check we therefore need to extend
     * t_state here.
     * So the PME-only nodes (if present) will also initialize
     * the distance restraints.
     */
    snew(fcd,1);

    /* This needs to be called before read_checkpoint to extend the state */
    init_disres(fplog,mtop,inputrec,cr,Flags & MD_PARTDEC,fcd,state);

    if (gmx_mtop_ftype_count(mtop,F_ORIRES) > 0)
    {
        if (PAR(cr) && !(Flags & MD_PARTDEC))
        {
            gmx_fatal(FARGS,"Orientation restraints do not work (yet) with domain decomposition, use particle decomposition (mdrun option -pd)");
        }
        /* Orientation restraints */
        if (MASTER(cr))
        {
            init_orires(fplog,mtop,state->x,inputrec,cr->ms,&(fcd->orires),
                        state);
        }
    }

    if (DEFORM(*inputrec))
    {
        /* Store the deform reference box before reading the checkpoint */
        if (SIMMASTER(cr))
        {
            copy_mat(state->box,box);
        }
        if (PAR(cr))
        {
            gmx_bcast(sizeof(box),box,cr);
        }
        /* Because we do not have the update struct available yet
         * in which the reference values should be stored,
         * we store them temporarily in static variables.
         * This should be thread safe, since they are only written once
         * and with identical values.
         */
#ifdef GMX_THREAD_MPI
        tMPI_Thread_mutex_lock(&deform_init_box_mutex);
#endif
        deform_init_init_step_tpx = inputrec->init_step;
        copy_mat(box,deform_init_box_tpx);
#ifdef GMX_THREAD_MPI
        tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
#endif
    }

    if (opt2bSet("-cpi",nfile,fnm))
    {
        /* Check if checkpoint file exists before doing continuation.
         * This way we can use identical input options for the first and subsequent runs...
         */
        if( gmx_fexist_master(opt2fn_master("-cpi",nfile,fnm,cr),cr) )
        {
            load_checkpoint(opt2fn_master("-cpi",nfile,fnm,cr),&fplog,
                            cr,Flags & MD_PARTDEC,ddxyz,
                            inputrec,state,&bReadRNG,&bReadEkin,
                            (Flags & MD_APPENDFILES),
                            (Flags & MD_APPENDFILESSET));

            if (bReadRNG)
            {
                Flags |= MD_READ_RNG;
            }
            if (bReadEkin)
            {
                Flags |= MD_READ_EKIN;
            }
        }
    }

    if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES))
#ifdef GMX_THREAD_MPI
        /* With thread MPI only the master node/thread exists in mdrun.c,
         * therefore non-master nodes need to open the "seppot" log file here.
         */
        || (!MASTER(cr) && (Flags & MD_SEPPOT))
#endif
        )
    {
        gmx_log_open(ftp2fn(efLOG,nfile,fnm),cr,!(Flags & MD_SEPPOT),
                     Flags,&fplog);
    }

    /* override nsteps with value from cmdline */
    override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);

    if (SIMMASTER(cr))
    {
        copy_mat(state->box,box);
    }

    if (PAR(cr))
    {
        gmx_bcast(sizeof(box),box,cr);
    }

    /* Essential dynamics */
    if (opt2bSet("-ei",nfile,fnm))
    {
        /* Open input and output files, allocate space for ED data structure */
        ed = ed_open(nfile,fnm,Flags,cr);
    }

    if (PAR(cr) && !((Flags & MD_PARTDEC) ||
                     EI_TPI(inputrec->eI) ||
                     inputrec->eI == eiNM))
    {
        /* Domain decomposition: set up the DD grid and communicators */
        cr->dd = init_domain_decomposition(fplog,cr,Flags,ddxyz,rdd,rconstr,
                                           dddlb_opt,dlb_scale,
                                           ddcsx,ddcsy,ddcsz,
                                           mtop,inputrec,
                                           box,state->x,
                                           &ddbox,&npme_major,&npme_minor);

        make_dd_communicators(fplog,cr,dd_node_order);

        /* Set overallocation to avoid frequent reallocation of arrays */
        set_over_alloc_dd(TRUE);
    }
    else
    {
        /* PME, if used, is done on all nodes with 1D decomposition */
        cr->npmenodes = 0;
        cr->duty = (DUTY_PP | DUTY_PME);
        npme_major = 1;
        npme_minor = 1;
        if (!EI_TPI(inputrec->eI))
        {
            npme_major = cr->nnodes;
        }

        if (inputrec->ePBC == epbcSCREW)
        {
            gmx_fatal(FARGS,
                      "pbc=%s is only implemented with domain decomposition",
                      epbc_names[inputrec->ePBC]);
        }
    }

    if (PAR(cr))
    {
        /* After possible communicator splitting in make_dd_communicators.
         * we can set up the intra/inter node communication.
         */
        gmx_setup_nodecomm(fplog,cr);
    }

    /* Initialize per-node process ID and counters. */
    gmx_init_intra_counters(cr);

#ifdef GMX_MPI
    md_print_info(cr,fplog,"Using %d MPI %s\n",
                  cr->nnodes,
#ifdef GMX_THREAD_MPI
                  cr->nnodes==1 ? "thread" : "threads"
#else
                  cr->nnodes==1 ? "process" : "processes"
#endif
                  );
#endif

    /* Set the OpenMP thread counts for the different task types */
    gmx_omp_nthreads_init(fplog, cr,
                          hwinfo->nthreads_hw_avail,
                          hw_opt->nthreads_omp,
                          hw_opt->nthreads_omp_pme,
                          (cr->duty & DUTY_PP) == 0,
                          inputrec->cutoff_scheme == ecutsVERLET);

    gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi, minf.bUseGPU);

    /* getting number of PP/PME threads
       PME: env variable should be read only on one node to make sure it is
       identical everywhere;
     */
    /* TODO nthreads_pp is only used for pinning threads.
     * This is a temporary solution until we have a hw topology library.
     */
    nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
    nthreads_pme = gmx_omp_nthreads_get(emntPME);

    wcycle = wallcycle_init(fplog,resetstep,cr,nthreads_pp,nthreads_pme);

    if (PAR(cr))
    {
        /* Master synchronizes its value of reset_counters with all nodes
         * including PME only nodes */
        reset_counters = wcycle_get_reset_counters(wcycle);
        gmx_bcast_sim(sizeof(reset_counters),&reset_counters,cr);
        wcycle_set_reset_counters(wcycle, reset_counters);
    }

    snew(nrnb,1);
    if (cr->duty & DUTY_PP)
    {
        /* For domain decomposition we allocate dynamically
         * in dd_partition_system.
         */
        if (DOMAINDECOMP(cr))
        {
            bcast_state_setup(cr,state);
        }
        else
        {
            if (PAR(cr))
            {
                bcast_state(cr,state,TRUE);
            }
        }

        /* Initiate forcerecord */
        fr = mk_forcerec();
        fr->hwinfo = hwinfo;
        init_forcerec(fplog,oenv,fr,fcd,inputrec,mtop,cr,box,FALSE,
                      opt2fn("-table",nfile,fnm),
                      opt2fn("-tabletf",nfile,fnm),
                      opt2fn("-tablep",nfile,fnm),
                      opt2fn("-tableb",nfile,fnm),
                      nbpu_opt,
                      FALSE,pforce);

        /* version for PCA_NOT_READ_NODE (see md.c) */
        /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
          "nofile","nofile","nofile","nofile",FALSE,pforce);
        */
        fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);

        /* Initialize QM-MM */
        if(fr->bQMMM)
        {
            init_QMMMrec(cr,box,mtop,inputrec,fr);
        }

        /* Initialize the mdatoms structure.
         * mdatoms is not filled with atom data,
         * as this can not be done now with domain decomposition.
         */
        mdatoms = init_mdatoms(fplog,mtop,inputrec->efep!=efepNO);

        /* Initialize the virtual site communication */
        vsite = init_vsite(mtop,cr);

        calc_shifts(box,fr->shift_vec);

        /* With periodic molecules the charge groups should be whole at start up
         * and the virtual sites should not be far from their proper positions.
         */
        if (!inputrec->bContinuation && MASTER(cr) &&
            !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
        {
            /* Make molecules whole at start of run */
            if (fr->ePBC != epbcNONE)
            {
                do_pbc_first_mtop(fplog,inputrec->ePBC,box,mtop,state->x);
            }
            if (vsite)
            {
                /* Correct initial vsite positions are required
                 * for the initial distribution in the domain decomposition
                 * and for the initial shell prediction.
                 */
                construct_vsites_mtop(fplog,vsite,mtop,state->x);
            }
        }

        if (EEL_PME(fr->eeltype))
        {
            ewaldcoeff = fr->ewaldcoeff;
            pmedata = &fr->pmedata;
        }
        else
        {
            pmedata = NULL;
        }
    }
    else
    {
        /* This is a PME only node */

        /* We don't need the state */
        done_state(state);

        ewaldcoeff = calc_ewaldcoeff(inputrec->rcoulomb, inputrec->ewald_rtol);
        snew(pmedata,1);
    }

#if defined GMX_THREAD_MPI
    /* With the number of TMPI threads equal to the number of cores
     * we already pinned in thread-MPI, so don't pin again here.
     */
    if (hw_opt->nthreads_tmpi != tMPI_Thread_get_hw_number())
#endif
    {
        /* Set the CPU affinity */
        set_cpu_affinity(fplog,cr,hw_opt,nthreads_pme,hwinfo,inputrec);
    }

    /* Initiate PME if necessary,
     * either on all nodes or on dedicated PME nodes only. */
    if (EEL_PME(inputrec->coulombtype))
    {
        if (mdatoms)
        {
            nChargePerturbed = mdatoms->nChargePerturbed;
        }
        if (cr->npmenodes > 0)
        {
            /* The PME only nodes need to know nChargePerturbed */
            gmx_bcast_sim(sizeof(nChargePerturbed),&nChargePerturbed,cr);
        }

        if (cr->duty & DUTY_PME)
        {
            status = gmx_pme_init(pmedata,cr,npme_major,npme_minor,inputrec,
                                  mtop ? mtop->natoms : 0,nChargePerturbed,
                                  (Flags & MD_REPRODUCIBLE),nthreads_pme);
            if (status != 0)
            {
                gmx_fatal(FARGS,"Error %d initializing PME",status);
            }
        }
    }


    if (integrator[inputrec->eI].func == do_md
#ifdef GMX_OPENMM
        ||
        integrator[inputrec->eI].func == do_md_openmm
#endif
        )
    {
        /* Turn on signal handling on all nodes */
        /*
         * (A user signal from the PME nodes (if any)
         * is communicated to the PP nodes.
         */
        signal_handler_install();
    }

    if (cr->duty & DUTY_PP)
    {
        if (inputrec->ePull != epullNO)
        {
            /* Initialize pull code */
            init_pull(fplog,inputrec,nfile,fnm,mtop,cr,oenv, inputrec->fepvals->init_lambda,
                      EI_DYNAMICS(inputrec->eI) && MASTER(cr),Flags);
        }

        if (inputrec->bRot)
        {
            /* Initialize enforced rotation code */
            init_rot(fplog,inputrec,nfile,fnm,cr,state->x,box,mtop,oenv,
                     bVerbose,Flags);
        }

        constr = init_constraints(fplog,mtop,inputrec,ed,state,cr);

        if (DOMAINDECOMP(cr))
        {
            dd_init_bondeds(fplog,cr->dd,mtop,vsite,constr,inputrec,
                            Flags & MD_DDBONDCHECK,fr->cginfo_mb);

            set_dd_parameters(fplog,cr->dd,dlb_scale,inputrec,fr,&ddbox);

            setup_dd_grid(fplog,cr->dd);
        }

        /* Now do whatever the user wants us to do (how flexible...) */
        integrator[inputrec->eI].func(fplog,cr,nfile,fnm,
                                      oenv,bVerbose,bCompact,
                                      nstglobalcomm,
                                      vsite,constr,
                                      nstepout,inputrec,mtop,
                                      fcd,state,
                                      mdatoms,nrnb,wcycle,ed,fr,
                                      repl_ex_nst,repl_ex_nex,repl_ex_seed,
                                      membed,
                                      cpt_period,max_hours,
                                      deviceOptions,
                                      Flags,
                                      &runtime);

        if (inputrec->ePull != epullNO)
        {
            finish_pull(fplog,inputrec->pull);
        }

        if (inputrec->bRot)
        {
            finish_rot(fplog,inputrec->rot);
        }

    }
    else
    {
        /* do PME only */
        gmx_pmeonly(*pmedata,cr,nrnb,wcycle,ewaldcoeff,FALSE,inputrec);
    }

    if (EI_DYNAMICS(inputrec->eI) || EI_TPI(inputrec->eI))
    {
        /* Some timing stats */
        if (SIMMASTER(cr))
        {
            if (runtime.proc == 0)
            {
                runtime.proc = runtime.real;
            }
        }
        else
        {
            runtime.real = 0;
        }
    }

    wallcycle_stop(wcycle,ewcRUN);

    /* Finish up, write some stuff
     * if rerunMD, don't write last frame again
     */
    finish_run(fplog,cr,ftp2fn(efSTO,nfile,fnm),
               inputrec,nrnb,wcycle,&runtime,
               fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ?
                 nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL,
               nthreads_pp,
               EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));

    if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU)
    {
        char gpu_err_str[STRLEN];

        /* free GPU memory and uninitialize GPU (by destroying the context) */
        nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);

        if (!free_gpu(gpu_err_str))
        {
            gmx_warning("On node %d failed to free GPU #%d: %s",
                        cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
        }
    }

    if (opt2bSet("-membed",nfile,fnm))
    {
        sfree(membed);
    }

#ifdef GMX_THREAD_MPI
    /* With tMPI all threads share one hwinfo; only the master frees it.
     * With real MPI every rank owns (and frees) its own copy. */
    if (PAR(cr) && SIMMASTER(cr))
#endif
    {
        gmx_hardware_info_free(hwinfo);
    }

    /* Does what it says */
    print_date_and_time(fplog,cr->nodeid,"Finished mdrun",&runtime);

    /* Close logfile already here if we were appending to it */
    if (MASTER(cr) && (Flags & MD_APPENDFILES))
    {
        gmx_log_close(fplog);
    }

    rc=(int)gmx_get_stop_condition();

#ifdef GMX_THREAD_MPI
    /* we need to join all threads. The sub-threads join when they
       exit this function, but the master thread needs to be told to
       wait for that. */
    if (PAR(cr) && MASTER(cr))
    {
        tMPI_Finalize();
    }
#endif

    return rc;
}