src/programs/mdrun/runner.c

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *
   4  *                This source code is part of
   5  *
   6  *                 G   R   O   M   A   C   S
   7  *
   8  *          GROningen MAchine for Chemical Simulations
   9  *
  10  *                        VERSION 3.2.0
  11  * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  12  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  13  * Copyright (c) 2001-2004, The GROMACS development team,
  14  * check out http://www.gromacs.org for more information.
  15
  16  * This program is free software; you can redistribute it and/or
  17  * modify it under the terms of the GNU General Public License
  18  * as published by the Free Software Foundation; either version 2
  19  * of the License, or (at your option) any later version.
  20  *
  21  * If you want to redistribute modifications, please consider that
  22  * scientific software is very special. Version control is crucial -
  23  * bugs must be traceable. We will be happy to consider code for
  24  * inclusion in the official distribution, but derived work must not
  25  * be called official GROMACS. Details are found in the README & COPYING
  26  * files - if they are missing, get the official version at www.gromacs.org.
  27  *
  28  * To help us fund GROMACS development, we humbly ask that you cite
  29  * the papers on the package - you can find them in the top README file.
  30  *
  31  * For more info, check our website at http://www.gromacs.org
  32  *
  33  * And Hey:
  34  * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
  35  */
  36 #ifdef HAVE_CONFIG_H
  37 #include <config.h>
  38 #endif
  39 #ifdef __linux
  40 #define _GNU_SOURCE
  41 #include <sched.h>
  42 #include <sys/syscall.h>
  43 #endif
  44 #include <signal.h>
  45 #include <stdlib.h>
  46 #ifdef HAVE_UNISTD_H
  47 #include <unistd.h>
  48 #endif
  49 #include <string.h>
  50 #include <assert.h>
  51
  52 #include "typedefs.h"
  53 #include "smalloc.h"
  54 #include "sysstuff.h"
  55 #include "statutil.h"
  56 #include "mdrun.h"
  57 #include "md_logging.h"
  58 #include "md_support.h"
  59 #include "network.h"
  60 #include "pull.h"
  61 #include "pull_rotation.h"
  62 #include "names.h"
  63 #include "disre.h"
  64 #include "orires.h"
  65 #include "pme.h"
  66 #include "mdatoms.h"
  67 #include "repl_ex.h"
  68 #include "qmmm.h"
  69 #include "domdec.h"
  70 #include "partdec.h"
  71 #include "coulomb.h"
  72 #include "constr.h"
  73 #include "mvdata.h"
  74 #include "checkpoint.h"
  75 #include "mtop_util.h"
  76 #include "sighandler.h"
  77 #include "tpxio.h"
  78 #include "txtdump.h"
  79 #include "gmx_detect_hardware.h"
  80 #include "gmx_omp_nthreads.h"
  81 #include "pull_rotation.h"
  82 #include "calc_verletbuf.h"
  83 #include "../mdlib/nbnxn_search.h"
  84 #include "../mdlib/nbnxn_consts.h"
  85 #include "gmx_fatal_collective.h"
  86 #include "membed.h"
  87 #include "macros.h"
  88 #include "gmx_omp.h"
  89
  90 #ifdef GMX_LIB_MPI
  91 #include <mpi.h>
  92 #endif
  93 #ifdef GMX_THREAD_MPI
  94 #include "tmpi.h"
  95 #endif
  96
  97 #ifdef GMX_FAHCORE
  98 #include "corewrap.h"
  99 #endif
 100
 101 #ifdef GMX_OPENMM
 102 #include "md_openmm.h"
 103 #endif
 104
 105 #include "gpu_utils.h"
 106 #include "nbnxn_cuda_data_mgmt.h"
 107
 108 typedef struct {
 109     gmx_integrator_t *func;
 110 } gmx_intp_t;
 111
 112 /* The array should match the eI array in include/types/enums.h */
 113 #ifdef GMX_OPENMM  /* FIXME do_md_openmm needs fixing */
 114 const gmx_intp_t integrator[eiNR] = { {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm},{do_md_openmm}};
 115 #else
 116 const gmx_intp_t integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md},{do_md}};
 117 #endif
 118
 119 gmx_large_int_t     deform_init_init_step_tpx;
 120 matrix              deform_init_box_tpx;
 121 #ifdef GMX_THREAD_MPI
 122 tMPI_Thread_mutex_t deform_init_box_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
 123 #endif
 124
 125
 126 #ifdef GMX_THREAD_MPI
 127 struct mdrunner_arglist
 128 {
 129     gmx_hw_opt_t *hw_opt;
 130     FILE *fplog;
 131     t_commrec *cr;
 132     int nfile;
 133     const t_filenm *fnm;
 134     output_env_t oenv;
 135     gmx_bool bVerbose;
 136     gmx_bool bCompact;
 137     int nstglobalcomm;
 138     ivec ddxyz;
 139     int dd_node_order;
 140     real rdd;
 141     real rconstr;
 142     const char *dddlb_opt;
 143     real dlb_scale;
 144     const char *ddcsx;
 145     const char *ddcsy;
 146     const char *ddcsz;
 147     const char *nbpu_opt;
 148     int nsteps_cmdline;
 149     int nstepout;
 150     int resetstep;
 151     int nmultisim;
 152     int repl_ex_nst;
 153     int repl_ex_nex;
 154     int repl_ex_seed;
 155     real pforce;
 156     real cpt_period;
 157     real max_hours;
 158     const char *deviceOptions;
 159     unsigned long Flags;
 160     int ret; /* return value */
 161 };
 162
 163
 164 /* The function used for spawning threads. Extracts the mdrunner()
 165    arguments from its one argument and calls mdrunner(), after making
 166    a commrec. */
 167 static void mdrunner_start_fn(void *arg)
 168 {
 169     struct mdrunner_arglist *mda=(struct mdrunner_arglist*)arg;
 170     struct mdrunner_arglist mc=*mda; /* copy the arg list to make sure
 171                                         that it's thread-local. This doesn't
 172                                         copy pointed-to items, of course,
 173                                         but those are all const. */
 174     t_commrec *cr;  /* we need a local version of this */
 175     FILE *fplog=NULL;
 176     t_filenm *fnm;
 177
 178     fnm = dup_tfn(mc.nfile, mc.fnm);
 179
 180     cr = init_par_threads(mc.cr);
 181
 182     if (MASTER(cr))
 183     {
 184         fplog=mc.fplog;
 185     }
 186
 187     mda->ret=mdrunner(mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv,
 188                       mc.bVerbose, mc.bCompact, mc.nstglobalcomm,
 189                       mc.ddxyz, mc.dd_node_order, mc.rdd,
 190                       mc.rconstr, mc.dddlb_opt, mc.dlb_scale,
 191                       mc.ddcsx, mc.ddcsy, mc.ddcsz,
 192                       mc.nbpu_opt,
 193                       mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
 194                       mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce,
 195                       mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.Flags);
 196 }
 197
 198 /* called by mdrunner() to start a specific number of threads (including
 199    the main thread) for thread-parallel runs. This in turn calls mdrunner()
 200    for each thread.
 201    All options besides nthreads are the same as for mdrunner(). */
 202 static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
 203               FILE *fplog,t_commrec *cr,int nfile,
 204               const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
 205               gmx_bool bCompact, int nstglobalcomm,
 206               ivec ddxyz,int dd_node_order,real rdd,real rconstr,
 207               const char *dddlb_opt,real dlb_scale,
 208               const char *ddcsx,const char *ddcsy,const char *ddcsz,
 209               const char *nbpu_opt,
 210               int nsteps_cmdline, int nstepout,int resetstep,
 211               int nmultisim,int repl_ex_nst,int repl_ex_nex, int repl_ex_seed,
 212               real pforce,real cpt_period, real max_hours,
 213               const char *deviceOptions, unsigned long Flags)
 214 {
 215     int ret;
 216     struct mdrunner_arglist *mda;
 217     t_commrec *crn; /* the new commrec */
 218     t_filenm *fnmn;
 219
 220     /* first check whether we even need to start tMPI */
 221     if (hw_opt->nthreads_tmpi < 2)
 222     {
 223         return cr;
 224     }
 225
 226     /* a few small, one-time, almost unavoidable memory leaks: */
 227     snew(mda,1);
 228     fnmn=dup_tfn(nfile, fnm);
 229
 230     /* fill the data structure to pass as void pointer to thread start fn */
 231     mda->hw_opt=hw_opt;
 232     mda->fplog=fplog;
 233     mda->cr=cr;
 234     mda->nfile=nfile;
 235     mda->fnm=fnmn;
 236     mda->oenv=oenv;
 237     mda->bVerbose=bVerbose;
 238     mda->bCompact=bCompact;
 239     mda->nstglobalcomm=nstglobalcomm;
 240     mda->ddxyz[XX]=ddxyz[XX];
 241     mda->ddxyz[YY]=ddxyz[YY];
 242     mda->ddxyz[ZZ]=ddxyz[ZZ];
 243     mda->dd_node_order=dd_node_order;
 244     mda->rdd=rdd;
 245     mda->rconstr=rconstr;
 246     mda->dddlb_opt=dddlb_opt;
 247     mda->dlb_scale=dlb_scale;
 248     mda->ddcsx=ddcsx;
 249     mda->ddcsy=ddcsy;
 250     mda->ddcsz=ddcsz;
 251     mda->nbpu_opt=nbpu_opt;
 252     mda->nsteps_cmdline=nsteps_cmdline;
 253     mda->nstepout=nstepout;
 254     mda->resetstep=resetstep;
 255     mda->nmultisim=nmultisim;
 256     mda->repl_ex_nst=repl_ex_nst;
 257     mda->repl_ex_nex=repl_ex_nex;
 258     mda->repl_ex_seed=repl_ex_seed;
 259     mda->pforce=pforce;
 260     mda->cpt_period=cpt_period;
 261     mda->max_hours=max_hours;
 262     mda->deviceOptions=deviceOptions;
 263     mda->Flags=Flags;
 264
 265     fprintf(stderr, "Starting %d tMPI threads\n",hw_opt->nthreads_tmpi);
 266     fflush(stderr);
 267     /* now spawn new threads that start mdrunner_start_fn(), while
 268        the main thread returns */
 269     ret=tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi,
 270                      (hw_opt->bThreadPinning ? TMPI_AFFINITY_ALL_CORES : TMPI_AFFINITY_NONE),
 271                      mdrunner_start_fn, (void*)(mda) );
 272     if (ret!=TMPI_SUCCESS)
 273         return NULL;
 274
 275     /* make a new comm_rec to reflect the new situation */
 276     crn=init_par_threads(cr);
 277     return crn;
 278 }
 279
 280
 281 static int get_tmpi_omp_thread_distribution(const gmx_hw_opt_t *hw_opt,
 282                                             int nthreads_tot,
 283                                             int ngpu)
 284 {
 285     int nthreads_tmpi;
 286
 287     /* There are no separate PME nodes here, as we ensured in
 288      * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
 289      * and a conditional ensures we would not have ended up here.
 290      * Note that separate PME nodes might be switched on later.
 291      */
 292     if (ngpu > 0)
 293     {
 294         nthreads_tmpi = ngpu;
 295         if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
 296         {
 297             nthreads_tmpi = nthreads_tot;
 298         }
 299     }
 300     else if (hw_opt->nthreads_omp > 0)
 301     {
 302         if (hw_opt->nthreads_omp > nthreads_tot)
 303         {
 304             gmx_fatal(FARGS,"More OpenMP threads requested (%d) than the total number of threads requested (%d)",hw_opt->nthreads_omp,nthreads_tot);
 305         }
 306         nthreads_tmpi = nthreads_tot/hw_opt->nthreads_omp;
 307     }
 308     else
 309     {
 310         /* TODO choose nthreads_omp based on hardware topology
 311            when we have a hardware topology detection library */
 312         /* Don't use OpenMP parallelization */
 313         nthreads_tmpi = nthreads_tot;
 314     }
 315
 316     return nthreads_tmpi;
 317 }
 318
 319
 320 /* Get the number of threads to use for thread-MPI based on how many
 321  * were requested, which algorithms we're using,
 322  * and how many particles there are.
 323  * At the point we have already called check_and_update_hw_opt.
 324  * Thus all options should be internally consistent and consistent
 325  * with the hardware, except that ntmpi could be larger than #GPU.
 326  */
 327 static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
 328                             gmx_hw_opt_t *hw_opt,
 329                             t_inputrec *inputrec, gmx_mtop_t *mtop,
 330                             const t_commrec *cr,
 331                             FILE *fplog)
 332 {
 333     int nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu;
 334     int min_atoms_per_mpi_thread;
 335     char *env;
 336     char sbuf[STRLEN];
 337     gmx_bool bCanUseGPU;
 338
 339     if (hw_opt->nthreads_tmpi > 0)
 340     {
 341         /* Trivial, return right away */
 342         return hw_opt->nthreads_tmpi;
 343     }
 344
 345     /* How many total (#tMPI*#OpenMP) threads can we start? */
 346     if (hw_opt->nthreads_tot > 0)
 347     {
 348         nthreads_tot_max = hw_opt->nthreads_tot;
 349     }
 350     else
 351     {
 352         nthreads_tot_max = tMPI_Thread_get_hw_number();
 353     }
 354
 355     bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU);
 356     if (bCanUseGPU)
 357     {
 358         ngpu = hwinfo->gpu_info.ncuda_dev_use;
 359     }
 360     else
 361     {
 362         ngpu = 0;
 363     }
 364
 365     nthreads_tmpi =
 366         get_tmpi_omp_thread_distribution(hw_opt,nthreads_tot_max,ngpu);
 367
 368     if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
 369     {
 370         /* Steps are divided over the nodes iso splitting the atoms */
 371         min_atoms_per_mpi_thread = 0;
 372     }
 373     else
 374     {
 375         if (bCanUseGPU)
 376         {
 377             min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU;
 378         }
 379         else
 380         {
 381             min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD;
 382         }
 383     }
 384
 385     /* Check if an algorithm does not support parallel simulation.  */
 386     if (nthreads_tmpi != 1 &&
 387         ( inputrec->eI == eiLBFGS ||
 388           inputrec->coulombtype == eelEWALD ) )
 389     {
 390         nthreads_tmpi = 1;
 391
 392         md_print_warn(cr,fplog,"The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
 393         if (hw_opt->nthreads_tmpi > nthreads_tmpi)
 394         {
 395             gmx_fatal(FARGS,"You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
 396         }
 397     }
 398     else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread)
 399     {
 400         /* the thread number was chosen automatically, but there are too many
 401            threads (too few atoms per thread) */
 402         nthreads_new = max(1,mtop->natoms/min_atoms_per_mpi_thread);
 403
 404         if (nthreads_new > 8 || (nthreads_tmpi == 8 && nthreads_new > 4))
 405         {
 406             /* TODO replace this once we have proper HT detection
 407              * Use only multiples of 4 above 8 threads
 408              * or with an 8-core processor
 409              * (to avoid 6 threads on 8 core processors with 4 real cores).
 410              */
 411             nthreads_new = (nthreads_new/4)*4;
 412         }
 413         else if (nthreads_new > 4)
 414         {
 415             /* Avoid 5 or 7 threads */
 416             nthreads_new = (nthreads_new/2)*2;
 417         }
 418
 419         nthreads_tmpi = nthreads_new;
 420
 421         fprintf(stderr,"\n");
 422         fprintf(stderr,"NOTE: Parallelization is limited by the small number of atoms,\n");
 423         fprintf(stderr,"      only starting %d thread-MPI threads.\n",nthreads_tmpi);
 424         fprintf(stderr,"      You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
 425     }
 426
 427     return nthreads_tmpi;
 428 }
 429 #endif /* GMX_THREAD_MPI */
 430
 431
 432 /* Environment variable for setting nstlist */
 433 static const char*  NSTLIST_ENVVAR          =  "GMX_NSTLIST";
 434 /* Try to increase nstlist when using a GPU with nstlist less than this */
 435 static const int    NSTLIST_GPU_ENOUGH      = 20;
 436 /* Increase nstlist until the non-bonded cost increases more than this factor */
 437 static const float  NBNXN_GPU_LIST_OK_FAC   = 1.25;
 438 /* Don't increase nstlist beyond a non-bonded cost increases of this factor */
 439 static const float  NBNXN_GPU_LIST_MAX_FAC  = 1.40;
 440
 441 /* Try to increase nstlist when running on a GPU */
 442 static void increase_nstlist(FILE *fp,t_commrec *cr,
 443                              t_inputrec *ir,const gmx_mtop_t *mtop,matrix box)
 444 {
 445     char *env;
 446     int  nstlist_orig,nstlist_prev;
 447     verletbuf_list_setup_t ls;
 448     real rlist_inc,rlist_ok,rlist_max,rlist_new,rlist_prev;
 449     int  i;
 450     t_state state_tmp;
 451     gmx_bool bBox,bDD,bCont;
 452     const char *nstl_fmt="\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
 453     const char *vbd_err="Can not increase nstlist for GPU run because verlet-buffer-drift is not set or used";
 454     const char *box_err="Can not increase nstlist for GPU run because the box is too small";
 455     const char *dd_err ="Can not increase nstlist for GPU run because of domain decomposition limitations";
 456     char buf[STRLEN];
 457
 458     /* Number of + nstlist alternative values to try when switching  */
 459     const int nstl[]={ 20, 25, 40, 50 };
 460 #define NNSTL  sizeof(nstl)/sizeof(nstl[0])
 461
 462     env = getenv(NSTLIST_ENVVAR);
 463     if (env == NULL)
 464     {
 465         if (fp != NULL)
 466         {
 467             fprintf(fp,nstl_fmt,ir->nstlist);
 468         }
 469     }
 470
 471     if (ir->verletbuf_drift == 0)
 472     {
 473         gmx_fatal(FARGS,"You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
 474     }
 475
 476     if (ir->verletbuf_drift < 0)
 477     {
 478         if (MASTER(cr))
 479         {
 480             fprintf(stderr,"%s\n",vbd_err);
 481         }
 482         if (fp != NULL)
 483         {
 484             fprintf(fp,"%s\n",vbd_err);
 485         }
 486
 487         return;
 488     }
 489
 490     nstlist_orig = ir->nstlist;
 491     if (env != NULL)
 492     {
 493         sprintf(buf,"Getting nstlist from environment variable GMX_NSTLIST=%s",env);
 494         if (MASTER(cr))
 495         {
 496             fprintf(stderr,"%s\n",buf);
 497         }
 498         if (fp != NULL)
 499         {
 500             fprintf(fp,"%s\n",buf);
 501         }
 502         sscanf(env,"%d",&ir->nstlist);
 503     }
 504
 505     verletbuf_get_list_setup(TRUE,&ls);
 506
 507     /* Allow rlist to make the list double the size of the cut-off sphere */
 508     rlist_inc = nbnxn_get_rlist_effective_inc(NBNXN_GPU_CLUSTER_SIZE,mtop->natoms/det(box));
 509     rlist_ok  = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_OK_FAC,1.0/3.0) - rlist_inc;
 510     rlist_max = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_MAX_FAC,1.0/3.0) - rlist_inc;
 511     if (debug)
 512     {
 513         fprintf(debug,"GPU nstlist tuning: rlist_inc %.3f rlist_max %.3f\n",
 514                 rlist_inc,rlist_max);
 515     }
 516
 517     i = 0;
 518     nstlist_prev = nstlist_orig;
 519     rlist_prev   = ir->rlist;
 520     do
 521     {
 522         if (env == NULL)
 523         {
 524             ir->nstlist = nstl[i];
 525         }
 526
 527         /* Set the pair-list buffer size in ir */
 528         calc_verlet_buffer_size(mtop,det(box),ir,ir->verletbuf_drift,&ls,
 529                                 NULL,&rlist_new);
 530
 531         /* Does rlist fit in the box? */
 532         bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC,box));
 533         bDD  = TRUE;
 534         if (bBox && DOMAINDECOMP(cr))
 535         {
 536             /* Check if rlist fits in the domain decomposition */
 537             if (inputrec2nboundeddim(ir) < DIM)
 538             {
 539                 gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
 540             }
 541             copy_mat(box,state_tmp.box);
 542             bDD = change_dd_cutoff(cr,&state_tmp,ir,rlist_new);
 543         }
 544
 545         bCont = FALSE;
 546
 547         if (env == NULL)
 548         {
 549             if (bBox && bDD && rlist_new <= rlist_max)
 550             {
 551                 /* Increase nstlist */
 552                 nstlist_prev = ir->nstlist;
 553                 rlist_prev   = rlist_new;
 554                 bCont = (i+1 < NNSTL && rlist_new < rlist_ok);
 555             }
 556             else
 557             {
 558                 /* Stick with the previous nstlist */
 559                 ir->nstlist = nstlist_prev;
 560                 rlist_new   = rlist_prev;
 561                 bBox = TRUE;
 562                 bDD  = TRUE;
 563             }
 564         }
 565
 566         i++;
 567     }
 568     while (bCont);
 569
 570     if (!bBox || !bDD)
 571     {
 572         gmx_warning(!bBox ? box_err : dd_err);
 573         if (fp != NULL)
 574         {
 575             fprintf(fp,"\n%s\n",bBox ? box_err : dd_err);
 576         }
 577         ir->nstlist = nstlist_orig;
 578     }
 579     else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
 580     {
 581         sprintf(buf,"Changing nstlist from %d to %d, rlist from %g to %g",
 582                 nstlist_orig,ir->nstlist,
 583                 ir->rlist,rlist_new);
 584         if (MASTER(cr))
 585         {
 586             fprintf(stderr,"%s\n\n",buf);
 587         }
 588         if (fp != NULL)
 589         {
 590             fprintf(fp,"%s\n\n",buf);
 591         }
 592         ir->rlist     = rlist_new;
 593         ir->rlistlong = rlist_new;
 594     }
 595 }
 596
 597 static void prepare_verlet_scheme(FILE *fplog,
 598                                   gmx_hw_info_t *hwinfo,
 599                                   t_commrec *cr,
 600                                   gmx_hw_opt_t *hw_opt,
 601                                   const char *nbpu_opt,
 602                                   t_inputrec *ir,
 603                                   const gmx_mtop_t *mtop,
 604                                   matrix box,
 605                                   gmx_bool *bUseGPU)
 606 {
 607     /* Here we only check for GPU usage on the MPI master process,
 608      * as here we don't know how many GPUs we will use yet.
 609      * We check for a GPU on all processes later.
 610      */
 611     *bUseGPU = hwinfo->bCanUseGPU || (getenv("GMX_EMULATE_GPU") != NULL);
 612
 613     if (ir->verletbuf_drift > 0)
 614     {
 615         /* Update the Verlet buffer size for the current run setup */
 616         verletbuf_list_setup_t ls;
 617         real rlist_new;
 618
 619         /* Here we assume CPU acceleration is on. But as currently
 620          * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
 621          * and 4x2 gives a larger buffer than 4x4, this is ok.
 622          */
 623         verletbuf_get_list_setup(*bUseGPU,&ls);
 624
 625         calc_verlet_buffer_size(mtop,det(box),ir,
 626                                 ir->verletbuf_drift,&ls,
 627                                 NULL,&rlist_new);
 628         if (rlist_new != ir->rlist)
 629         {
 630             if (fplog != NULL)
 631             {
 632                 fprintf(fplog,"\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
 633                         ir->rlist,rlist_new,
 634                         ls.cluster_size_i,ls.cluster_size_j);
 635             }
 636             ir->rlist     = rlist_new;
 637             ir->rlistlong = rlist_new;
 638         }
 639     }
 640
 641     /* With GPU or emulation we should check nstlist for performance */
 642     if ((EI_DYNAMICS(ir->eI) &&
 643          *bUseGPU &&
 644          ir->nstlist < NSTLIST_GPU_ENOUGH) ||
 645         getenv(NSTLIST_ENVVAR) != NULL)
 646     {
 647         /* Choose a better nstlist */
 648         increase_nstlist(fplog,cr,ir,mtop,box);
 649     }
 650 }
 651
 652 static void convert_to_verlet_scheme(FILE *fplog,
 653                                      t_inputrec *ir,
 654                                      gmx_mtop_t *mtop,real box_vol)
 655 {
 656     char *conv_mesg="Converting input file with group cut-off scheme to the Verlet cut-off scheme";
 657
 658     md_print_warn(NULL,fplog,"%s\n",conv_mesg);
 659
 660     ir->cutoff_scheme   = ecutsVERLET;
 661     ir->verletbuf_drift = 0.005;
 662
 663     if (ir->rcoulomb != ir->rvdw)
 664     {
 665         gmx_fatal(FARGS,"The VdW and Coulomb cut-offs are different, whereas the Verlet scheme only supports equal cut-offs");
 666     }
 667
 668     if (ir->vdwtype == evdwUSER || EEL_USER(ir->coulombtype))
 669     {
 670         gmx_fatal(FARGS,"User non-bonded potentials are not (yet) supported with the Verlet scheme");
 671     }
 672     else if (EVDW_SWITCHED(ir->vdwtype) || EEL_SWITCHED(ir->coulombtype))
 673     {
 674         md_print_warn(NULL,fplog,"Converting switched or shifted interactions to a shifted potential (without force shift), this will lead to slightly different interaction potentials");
 675
 676         if (EVDW_SWITCHED(ir->vdwtype))
 677         {
 678             ir->vdwtype = evdwCUT;
 679         }
 680         if (EEL_SWITCHED(ir->coulombtype))
 681         {
 682             if (EEL_FULL(ir->coulombtype))
 683             {
 684                 /* With full electrostatic only PME can be switched */
 685                 ir->coulombtype = eelPME;
 686             }
 687             else
 688             {
 689                 md_print_warn(NULL,fplog,"NOTE: Replacing %s electrostatics with reaction-field with epsilon-rf=inf\n",eel_names[ir->coulombtype]);
 690                 ir->coulombtype = eelRF;
 691                 ir->epsilon_rf  = 0.0;
 692             }
 693         }
 694
 695         /* We set the target energy drift to a small number.
 696          * Note that this is only for testing. For production the user
 697          * should think about this and set the mdp options.
 698          */
 699         ir->verletbuf_drift = 1e-4;
 700     }
 701
 702     if (inputrec2nboundeddim(ir) != 3)
 703     {
 704         gmx_fatal(FARGS,"Can only convert old tpr files to the Verlet cut-off scheme with 3D pbc");
 705     }
 706
 707     if (ir->efep != efepNO || ir->implicit_solvent != eisNO)
 708     {
 709         gmx_fatal(FARGS,"Will not convert old tpr files to the Verlet cut-off scheme with free-energy calculations or implicit solvent");
 710     }
 711
 712     if (EI_DYNAMICS(ir->eI) && !(EI_MD(ir->eI) && ir->etc == etcNO))
 713     {
 714         verletbuf_list_setup_t ls;
 715
 716         verletbuf_get_list_setup(FALSE,&ls);
 717         calc_verlet_buffer_size(mtop,box_vol,ir,ir->verletbuf_drift,&ls,
 718                                 NULL,&ir->rlist);
 719     }
 720     else
 721     {
 722         ir->verletbuf_drift = -1;
 723         ir->rlist           = 1.05*max(ir->rvdw,ir->rcoulomb);
 724     }
 725
 726     gmx_mtop_remove_chargegroups(mtop);
 727 }
 728
 729
 730 /* Set CPU affinity. Can be important for performance.
 731    On some systems (e.g. Cray) CPU Affinity is set by default.
 732    But default assigning doesn't work (well) with only some ranks
 733    having threads. This causes very low performance.
 734    External tools have cumbersome syntax for setting affinity
 735    in the case that only some ranks have threads.
 736    Thus it is important that GROMACS sets the affinity internally
 737    if only PME is using threads.
 738 */
 739 static void set_cpu_affinity(FILE *fplog,
 740                              const t_commrec *cr,
 741                              gmx_hw_opt_t *hw_opt,
 742                              int nthreads_pme,
 743                              const gmx_hw_info_t *hwinfo,
 744                              const t_inputrec *inputrec)
 745 {
 746 #if defined GMX_THREAD_MPI
 747     /* With the number of TMPI threads equal to the number of cores
 748      * we already pinned in thread-MPI, so don't pin again here.
 749      */
 750     if (hw_opt->nthreads_tmpi == tMPI_Thread_get_hw_number())
 751     {
 752         return;
 753     }
 754 #endif
 755
 756 #ifdef GMX_OPENMP /* TODO: actually we could do this even without OpenMP?! */
 757 #ifdef __linux /* TODO: only linux? why not everywhere if sched_setaffinity is available */
 758     if (hw_opt->bThreadPinning)
 759     {
 760         int thread, nthread_local, nthread_node, nthread_hw_max, nphyscore;
 761         int offset;
 762         char *env;
 763
 764         /* threads on this MPI process or TMPI thread */
 765         if (cr->duty & DUTY_PP)
 766         {
 767             nthread_local = gmx_omp_nthreads_get(emntNonbonded);
 768         }
 769         else
 770         {
 771             nthread_local = gmx_omp_nthreads_get(emntPME);
 772         }
 773
 774         /* map the current process to cores */
 775         thread = 0;
 776         nthread_node = nthread_local;
 777 #ifdef GMX_MPI
 778         if (PAR(cr) || MULTISIM(cr))
 779         {
 780             /* We need to determine a scan of the thread counts in this
 781              * compute node.
 782              */
 783             MPI_Comm comm_intra;
 784
 785             MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),cr->nodeid_intra,
 786                            &comm_intra);
 787             MPI_Scan(&nthread_local,&thread,1,MPI_INT,MPI_SUM,comm_intra);
 788             /* MPI_Scan is inclusive, but here we need exclusive */
 789             thread -= nthread_local;
 790             /* Get the total number of threads on this physical node */
 791             MPI_Allreduce(&nthread_local,&nthread_node,1,MPI_INT,MPI_SUM,comm_intra);
 792             MPI_Comm_free(&comm_intra);
 793         }
 794 #endif
 795
 796         offset = 0;
 797         if (hw_opt->core_pinning_offset > 0)
 798         {
 799             offset = hw_opt->core_pinning_offset;
 800             if (SIMMASTER(cr))
 801             {
 802                 fprintf(stderr, "Applying core pinning offset %d\n", offset);
 803             }
 804             if (fplog)
 805             {
 806                 fprintf(fplog, "Applying core pinning offset %d\n", offset);
 807             }
 808         }
 809
 810         /* With Intel Hyper-Threading enabled, we want to pin consecutive
 811          * threads to physical cores when using more threads than physical
 812          * cores or when the user requests so.
 813          */
 814         nthread_hw_max = hwinfo->nthreads_hw_avail;
 815         nphyscore = -1;
 816         if (hw_opt->bPinHyperthreading ||
 817             (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
 818              nthread_node > nthread_hw_max/2 && getenv("GMX_DISABLE_PINHT") == NULL))
 819         {
 820             if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) != GMX_CPUID_X86_SMT_ENABLED)
 821             {
 822                 /* We print to stderr on all processes, as we might have
 823                  * different settings on different physical nodes.
 824                  */
 825                 if (gmx_cpuid_vendor(hwinfo->cpuid_info) != GMX_CPUID_VENDOR_INTEL)
 826                 {
 827                     md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
 828                                   "but non-Intel CPU detected (vendor: %s)\n",
 829                                   gmx_cpuid_vendor_string[gmx_cpuid_vendor(hwinfo->cpuid_info)]);
 830                 }
 831                 else
 832                 {
 833                     md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
 834                                   "but the CPU detected does not have Intel Hyper-Threading support "
 835                                   "(or it is turned off)\n");
 836                 }
 837             }
 838             nphyscore = nthread_hw_max/2;
 839
 840             if (SIMMASTER(cr))
 841             {
 842                 fprintf(stderr, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
 843                         nphyscore);
 844             }
 845             if (fplog)
 846             {
 847                 fprintf(fplog, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
 848                         nphyscore);
 849             }
 850         }
 851
 852         /* set the per-thread affinity */
 853 #pragma omp parallel firstprivate(thread) num_threads(nthread_local)
 854         {
 855             cpu_set_t mask;
 856             int core;
 857
 858             CPU_ZERO(&mask);
 859             thread += gmx_omp_get_thread_num();
 860             if (nphyscore <= 0)
 861             {
 862                 core = offset + thread;
 863             }
 864             else
 865             {
 866                 /* Lock pairs of threads to the same hyperthreaded core */
 867                 core = offset + thread/2 + (thread % 2)*nphyscore;
 868             }
 869             CPU_SET(core, &mask);
 870             sched_setaffinity((pid_t) syscall (SYS_gettid), sizeof(cpu_set_t), &mask);
 871         }
 872     }
 873 #endif /* __linux    */
 874 #endif /* GMX_OPENMP */
 875 }
 876
 877
 878 static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt,
 879                                     int cutoff_scheme)
 880 {
 881     gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp);
 882
 883 #ifndef GMX_THREAD_MPI
 884     if (hw_opt->nthreads_tot > 0)
 885     {
 886         gmx_fatal(FARGS,"Setting the total number of threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
 887     }
 888     if (hw_opt->nthreads_tmpi > 0)
 889     {
 890         gmx_fatal(FARGS,"Setting the number of thread-MPI threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
 891     }
 892 #endif
 893
 894     if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
 895     {
 896         /* We have the same number of OpenMP threads for PP and PME processes,
 897          * thus we can perform several consistency checks.
 898          */
 899         if (hw_opt->nthreads_tmpi > 0 &&
 900             hw_opt->nthreads_omp > 0 &&
 901             hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
 902         {
 903             gmx_fatal(FARGS,"The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
 904                       hw_opt->nthreads_tot,hw_opt->nthreads_tmpi,hw_opt->nthreads_omp);
 905         }
 906
 907         if (hw_opt->nthreads_tmpi > 0 &&
 908             hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
 909         {
 910             gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
 911                       hw_opt->nthreads_tot,hw_opt->nthreads_tmpi);
 912         }
 913
 914         if (hw_opt->nthreads_omp > 0 &&
 915             hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
 916         {
 917             gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
 918                       hw_opt->nthreads_tot,hw_opt->nthreads_omp);
 919         }
 920
 921         if (hw_opt->nthreads_tmpi > 0 &&
 922             hw_opt->nthreads_omp <= 0)
 923         {
 924             hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
 925         }
 926     }
 927
 928 #ifndef GMX_OPENMP
 929     if (hw_opt->nthreads_omp > 1)
 930     {
 931         gmx_fatal(FARGS,"OpenMP threads are requested, but Gromacs was compiled without OpenMP support");
 932     }
 933 #endif
 934
 935     if (cutoff_scheme == ecutsGROUP)
 936     {
 937         /* We only have OpenMP support for PME only nodes */
 938         if (hw_opt->nthreads_omp > 1)
 939         {
 940             gmx_fatal(FARGS,"OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
 941                       ecutscheme_names[cutoff_scheme],
 942                       ecutscheme_names[ecutsVERLET]);
 943         }
 944         hw_opt->nthreads_omp = 1;
 945     }
 946
 947     if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
 948     {
 949         gmx_fatal(FARGS,"You need to specify -ntomp in addition to -ntomp_pme");
 950     }
 951
 952     if (hw_opt->nthreads_tot == 1)
 953     {
 954         hw_opt->nthreads_tmpi = 1;
 955
 956         if (hw_opt->nthreads_omp > 1)
 957         {
 958             gmx_fatal(FARGS,"You requested %d OpenMP threads with %d total threads",
 959                       hw_opt->nthreads_tmpi,hw_opt->nthreads_tot);
 960         }
 961         hw_opt->nthreads_omp = 1;
 962     }
 963
 964     if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
 965     {
 966         hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
 967     }
 968
 969     if (debug)
 970     {
 971         fprintf(debug,"hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
 972                 hw_opt->nthreads_tot,
 973                 hw_opt->nthreads_tmpi,
 974                 hw_opt->nthreads_omp,
 975                 hw_opt->nthreads_omp_pme,
 976                 hw_opt->gpu_id!=NULL ? hw_opt->gpu_id : "");
 977
 978     }
 979 }
 980
 981
 982 /* Override the value in inputrec with value passed on the command line (if any) */
 983 static void override_nsteps_cmdline(FILE *fplog,
 984                                     int nsteps_cmdline,
 985                                     t_inputrec *ir,
 986                                     const t_commrec *cr)
 987 {
 988     assert(ir);
 989     assert(cr);
 990
 991     /* override with anything else than the default -2 */
 992     if (nsteps_cmdline > -2)
 993     {
 994         char stmp[STRLEN];
 995
 996         ir->nsteps = nsteps_cmdline;
 997         if (EI_DYNAMICS(ir->eI))
 998         {
 999             sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps, %.3f ps",
1000                     nsteps_cmdline, nsteps_cmdline*ir->delta_t);
1001         }
1002         else
1003         {
1004             sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps",
1005                     nsteps_cmdline);
1006         }
1007
1008         md_print_warn(cr, fplog, "%s\n", stmp);
1009     }
1010 }
1011
1012 /* Data structure set by SIMMASTER which needs to be passed to all nodes
1013  * before the other nodes have read the tpx file and called gmx_detect_hardware.
1014  */
1015 typedef struct {
1016     int      cutoff_scheme; /* The cutoff-scheme from inputrec_t */
1017     gmx_bool bUseGPU;       /* Use GPU or GPU emulation          */
1018 } master_inf_t;
1019
1020 int mdrunner(gmx_hw_opt_t *hw_opt,
1021              FILE *fplog,t_commrec *cr,int nfile,
1022              const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
1023              gmx_bool bCompact, int nstglobalcomm,
1024              ivec ddxyz,int dd_node_order,real rdd,real rconstr,
1025              const char *dddlb_opt,real dlb_scale,
1026              const char *ddcsx,const char *ddcsy,const char *ddcsz,
1027              const char *nbpu_opt,
1028              int nsteps_cmdline, int nstepout,int resetstep,
1029              int nmultisim,int repl_ex_nst,int repl_ex_nex,
1030              int repl_ex_seed, real pforce,real cpt_period,real max_hours,
1031              const char *deviceOptions, unsigned long Flags)
1032 {
1033     gmx_bool   bForceUseGPU,bTryUseGPU;
1034     double     nodetime=0,realtime;
1035     t_inputrec *inputrec;
1036     t_state    *state=NULL;
1037     matrix     box;
1038     gmx_ddbox_t ddbox={0};
1039     int        npme_major,npme_minor;
1040     real       tmpr1,tmpr2;
1041     t_nrnb     *nrnb;
1042     gmx_mtop_t *mtop=NULL;
1043     t_mdatoms  *mdatoms=NULL;
1044     t_forcerec *fr=NULL;
1045     t_fcdata   *fcd=NULL;
1046     real       ewaldcoeff=0;
1047     gmx_pme_t  *pmedata=NULL;
1048     gmx_vsite_t *vsite=NULL;
1049     gmx_constr_t constr;
1050     int        i,m,nChargePerturbed=-1,status,nalloc;
1051     char       *gro;
1052     gmx_wallcycle_t wcycle;
1053     gmx_bool       bReadRNG,bReadEkin;
1054     int        list;
1055     gmx_runtime_t runtime;
1056     int        rc;
1057     gmx_large_int_t reset_counters;
1058     gmx_edsam_t ed=NULL;
1059     t_commrec   *cr_old=cr;
1060     int         nthreads_pme=1;
1061     int         nthreads_pp=1;
1062     gmx_membed_t membed=NULL;
1063     gmx_hw_info_t *hwinfo=NULL;
1064     master_inf_t minf={-1,FALSE};
1065
1066     /* CAUTION: threads may be started later on in this function, so
1067        cr doesn't reflect the final parallel state right now */
1068     snew(inputrec,1);
1069     snew(mtop,1);
1070
1071     if (Flags & MD_APPENDFILES)
1072     {
1073         fplog = NULL;
1074     }
1075
1076     bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
1077     bTryUseGPU   = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
1078
1079     snew(state,1);
1080     if (SIMMASTER(cr))
1081     {
1082         /* Read (nearly) all data required for the simulation */
1083         read_tpx_state(ftp2fn(efTPX,nfile,fnm),inputrec,state,NULL,mtop);
1084
1085         if (inputrec->cutoff_scheme != ecutsVERLET &&
1086             ((Flags & MD_TESTVERLET) || getenv("GMX_VERLET_SCHEME") != NULL))
1087         {
1088             convert_to_verlet_scheme(fplog,inputrec,mtop,det(state->box));
1089         }
1090
1091         /* Detect hardware, gather information. With tMPI only thread 0 does it
1092          * and after threads are started broadcasts hwinfo around. */
1093         snew(hwinfo, 1);
1094         gmx_detect_hardware(fplog, hwinfo, cr,
1095                             bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
1096
1097         minf.cutoff_scheme = inputrec->cutoff_scheme;
1098         minf.bUseGPU       = FALSE;
1099
1100         if (inputrec->cutoff_scheme == ecutsVERLET)
1101         {
1102             prepare_verlet_scheme(fplog,hwinfo,cr,hw_opt,nbpu_opt,
1103                                   inputrec,mtop,state->box,
1104                                   &minf.bUseGPU);
1105         }
1106         else if (hwinfo->bCanUseGPU)
1107         {
1108             md_print_warn(cr,fplog,
1109                           "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
1110                           "      To use a GPU, set the mdp option: cutoff-scheme = Verlet\n"
1111                           "      (for quick performance testing you can use the -testverlet option)\n");
1112
1113             if (bForceUseGPU)
1114             {
1115                 gmx_fatal(FARGS,"GPU requested, but can't be used without cutoff-scheme=Verlet");
1116             }
1117         }
1118     }
1119 #ifndef GMX_THREAD_MPI
1120     if (PAR(cr))
1121     {
1122         gmx_bcast_sim(sizeof(minf),&minf,cr);
1123     }
1124 #endif
1125     if (minf.bUseGPU && cr->npmenodes == -1)
1126     {
1127         /* Don't automatically use PME-only nodes with GPUs */
1128         cr->npmenodes = 0;
1129     }
1130
1131 #ifdef GMX_THREAD_MPI
1132     /* With thread-MPI inputrec is only set here on the master thread */
1133     if (SIMMASTER(cr))
1134 #endif
1135     {
1136         check_and_update_hw_opt(hw_opt,minf.cutoff_scheme);
1137
1138 #ifdef GMX_THREAD_MPI
1139         if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
1140         {
1141             gmx_fatal(FARGS,"You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME nodes");
1142         }
1143 #endif
1144
1145         if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
1146             cr->npmenodes <= 0)
1147         {
1148             gmx_fatal(FARGS,"You need to explicitly specify the number of PME nodes (-npme) when using different number of OpenMP threads for PP and PME nodes");
1149         }
1150     }
1151
1152     /* Check for externally set OpenMP affinity and turn off internal
1153      * pinning if any is found. We need to do this check early to tell
1154      * thread-MPI whether it should do pinning when spawning threads.
1155      */
1156     gmx_omp_check_thread_affinity(fplog, cr, hw_opt);
1157
1158 #ifdef GMX_THREAD_MPI
1159     if (SIMMASTER(cr))
1160     {
1161         /* NOW the threads will be started: */
1162         hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
1163                                                  hw_opt,
1164                                                  inputrec, mtop,
1165                                                  cr, fplog);
1166         if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
1167         {
1168             hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
1169         }
1170
1171         if (hw_opt->nthreads_tmpi > 1)
1172         {
1173             /* now start the threads. */
1174             cr=mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm,
1175                                       oenv, bVerbose, bCompact, nstglobalcomm,
1176                                       ddxyz, dd_node_order, rdd, rconstr,
1177                                       dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
1178                                       nbpu_opt,
1179                                       nsteps_cmdline, nstepout, resetstep, nmultisim,
1180                                       repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
1181                                       cpt_period, max_hours, deviceOptions,
1182                                       Flags);
1183             /* the main thread continues here with a new cr. We don't deallocate
1184                the old cr because other threads may still be reading it. */
1185             if (cr == NULL)
1186             {
1187                 gmx_comm("Failed to spawn threads");
1188             }
1189         }
1190     }
1191 #endif
1192     /* END OF CAUTION: cr is now reliable */
1193
1194     /* g_membed initialisation *
1195      * Because we change the mtop, init_membed is called before the init_parallel *
1196      * (in case we ever want to make it run in parallel) */
1197     if (opt2bSet("-membed",nfile,fnm))
1198     {
1199         if (MASTER(cr))
1200         {
1201             fprintf(stderr,"Initializing membed");
1202         }
1203         membed = init_membed(fplog,nfile,fnm,mtop,inputrec,state,cr,&cpt_period);
1204     }
1205
1206     if (PAR(cr))
1207     {
1208         /* now broadcast everything to the non-master nodes/threads: */
1209         init_parallel(fplog, cr, inputrec, mtop);
1210
1211         /* This check needs to happen after get_nthreads_mpi() */
1212         if (inputrec->cutoff_scheme == ecutsVERLET && (Flags & MD_PARTDEC))
1213         {
1214             gmx_fatal_collective(FARGS,cr,NULL,
1215                                  "The Verlet cut-off scheme is not supported with particle decomposition.\n"
1216                                  "You can achieve the same effect as particle decomposition by running in parallel using only OpenMP threads.");
1217         }
1218     }
1219     if (fplog != NULL)
1220     {
1221         pr_inputrec(fplog,0,"Input Parameters",inputrec,FALSE);
1222     }
1223
1224 #if defined GMX_THREAD_MPI
1225     /* With tMPI we detected on thread 0 and we'll just pass the hwinfo pointer
1226      * to the other threads  -- slightly uncool, but works fine, just need to
1227      * make sure that the data doesn't get freed twice. */
1228     if (cr->nnodes > 1)
1229     {
1230         if (!SIMMASTER(cr))
1231         {
1232             snew(hwinfo, 1);
1233         }
1234         gmx_bcast(sizeof(&hwinfo), &hwinfo, cr);
1235     }
1236 #else
1237     if (PAR(cr) && !SIMMASTER(cr))
1238     {
1239         /* now we have inputrec on all nodes, can run the detection */
1240         /* TODO: perhaps it's better to propagate within a node instead? */
1241         snew(hwinfo, 1);
1242         gmx_detect_hardware(fplog, hwinfo, cr,
1243                                  bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
1244     }
1245 #endif
1246
1247     /* now make sure the state is initialized and propagated */
1248     set_state_entries(state,inputrec,cr->nnodes);
1249
1250     /* remove when vv and rerun works correctly! */
1251     if (PAR(cr) && EI_VV(inputrec->eI) && ((Flags & MD_RERUN) || (Flags & MD_RERUN_VSITE)))
1252     {
1253         gmx_fatal(FARGS,
1254                   "Currently can't do velocity verlet with rerun in parallel.");
1255     }
1256
1257     /* A parallel command line option consistency check that we can
1258        only do after any threads have started. */
1259     if (!PAR(cr) &&
1260         (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
1261     {
1262         gmx_fatal(FARGS,
1263                   "The -dd or -npme option request a parallel simulation, "
1264 #ifndef GMX_MPI
1265                   "but %s was compiled without threads or MPI enabled"
1266 #else
1267 #ifdef GMX_THREAD_MPI
1268                   "but the number of threads (option -nt) is 1"
1269 #else
1270                   "but %s was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec"
1271 #endif
1272 #endif
1273                   , ShortProgram()
1274             );
1275     }
1276
1277     if ((Flags & MD_RERUN) &&
1278         (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
1279     {
1280         gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
1281     }
1282
1283     if (can_use_allvsall(inputrec,mtop,TRUE,cr,fplog) && PAR(cr))
1284     {
1285         /* All-vs-all loops do not work with domain decomposition */
1286         Flags |= MD_PARTDEC;
1287     }
1288
1289     if (!EEL_PME(inputrec->coulombtype) || (Flags & MD_PARTDEC))
1290     {
1291         if (cr->npmenodes > 0)
1292         {
1293             if (!EEL_PME(inputrec->coulombtype))
1294             {
1295                 gmx_fatal_collective(FARGS,cr,NULL,
1296                                      "PME nodes are requested, but the system does not use PME electrostatics");
1297             }
1298             if (Flags & MD_PARTDEC)
1299             {
1300                 gmx_fatal_collective(FARGS,cr,NULL,
1301                                      "PME nodes are requested, but particle decomposition does not support separate PME nodes");
1302             }
1303         }
1304
1305         cr->npmenodes = 0;
1306     }
1307
1308 #ifdef GMX_FAHCORE
1309     fcRegisterSteps(inputrec->nsteps,inputrec->init_step);
1310 #endif
1311
1312     /* NMR restraints must be initialized before load_checkpoint,
1313      * since with time averaging the history is added to t_state.
1314      * For proper consistency check we therefore need to extend
1315      * t_state here.
1316      * So the PME-only nodes (if present) will also initialize
1317      * the distance restraints.
1318      */
1319     snew(fcd,1);
1320
1321     /* This needs to be called before read_checkpoint to extend the state */
1322     init_disres(fplog,mtop,inputrec,cr,Flags & MD_PARTDEC,fcd,state);
1323
1324     if (gmx_mtop_ftype_count(mtop,F_ORIRES) > 0)
1325     {
1326         if (PAR(cr) && !(Flags & MD_PARTDEC))
1327         {
1328             gmx_fatal(FARGS,"Orientation restraints do not work (yet) with domain decomposition, use particle decomposition (mdrun option -pd)");
1329         }
1330         /* Orientation restraints */
1331         if (MASTER(cr))
1332         {
1333             init_orires(fplog,mtop,state->x,inputrec,cr->ms,&(fcd->orires),
1334                         state);
1335         }
1336     }
1337
1338     if (DEFORM(*inputrec))
1339     {
1340         /* Store the deform reference box before reading the checkpoint */
1341         if (SIMMASTER(cr))
1342         {
1343             copy_mat(state->box,box);
1344         }
1345         if (PAR(cr))
1346         {
1347             gmx_bcast(sizeof(box),box,cr);
1348         }
1349         /* Because we do not have the update struct available yet
1350          * in which the reference values should be stored,
1351          * we store them temporarily in static variables.
1352          * This should be thread safe, since they are only written once
1353          * and with identical values.
1354          */
1355 #ifdef GMX_THREAD_MPI
1356         tMPI_Thread_mutex_lock(&deform_init_box_mutex);
1357 #endif
1358         deform_init_init_step_tpx = inputrec->init_step;
1359         copy_mat(box,deform_init_box_tpx);
1360 #ifdef GMX_THREAD_MPI
1361         tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
1362 #endif
1363     }
1364
1365     if (opt2bSet("-cpi",nfile,fnm))
1366     {
1367         /* Check if checkpoint file exists before doing continuation.
1368          * This way we can use identical input options for the first and subsequent runs...
1369          */
1370         if( gmx_fexist_master(opt2fn_master("-cpi",nfile,fnm,cr),cr) )
1371         {
1372             load_checkpoint(opt2fn_master("-cpi",nfile,fnm,cr),&fplog,
1373                             cr,Flags & MD_PARTDEC,ddxyz,
1374                             inputrec,state,&bReadRNG,&bReadEkin,
1375                             (Flags & MD_APPENDFILES),
1376                             (Flags & MD_APPENDFILESSET));
1377
1378             if (bReadRNG)
1379             {
1380                 Flags |= MD_READ_RNG;
1381             }
1382             if (bReadEkin)
1383             {
1384                 Flags |= MD_READ_EKIN;
1385             }
1386         }
1387     }
1388
1389     if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES))
1390 #ifdef GMX_THREAD_MPI
1391         /* With thread MPI only the master node/thread exists in mdrun.c,
1392          * therefore non-master nodes need to open the "seppot" log file here.
1393          */
1394         || (!MASTER(cr) && (Flags & MD_SEPPOT))
1395 #endif
1396         )
1397     {
1398         gmx_log_open(ftp2fn(efLOG,nfile,fnm),cr,!(Flags & MD_SEPPOT),
1399                              Flags,&fplog);
1400     }
1401
1402     /* override nsteps with value from cmdline */
1403     override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);
1404
1405     if (SIMMASTER(cr))
1406     {
1407         copy_mat(state->box,box);
1408     }
1409
1410     if (PAR(cr))
1411     {
1412         gmx_bcast(sizeof(box),box,cr);
1413     }
1414
1415     /* Essential dynamics */
1416     if (opt2bSet("-ei",nfile,fnm))
1417     {
1418         /* Open input and output files, allocate space for ED data structure */
1419         ed = ed_open(nfile,fnm,Flags,cr);
1420     }
1421
1422     if (PAR(cr) && !((Flags & MD_PARTDEC) ||
1423                      EI_TPI(inputrec->eI) ||
1424                      inputrec->eI == eiNM))
1425     {
1426         cr->dd = init_domain_decomposition(fplog,cr,Flags,ddxyz,rdd,rconstr,
1427                                            dddlb_opt,dlb_scale,
1428                                            ddcsx,ddcsy,ddcsz,
1429                                            mtop,inputrec,
1430                                            box,state->x,
1431                                            &ddbox,&npme_major,&npme_minor);
1432
1433         make_dd_communicators(fplog,cr,dd_node_order);
1434
1435         /* Set overallocation to avoid frequent reallocation of arrays */
1436         set_over_alloc_dd(TRUE);
1437     }
1438     else
1439     {
1440         /* PME, if used, is done on all nodes with 1D decomposition */
1441         cr->npmenodes = 0;
1442         cr->duty = (DUTY_PP | DUTY_PME);
1443         npme_major = 1;
1444         npme_minor = 1;
1445         if (!EI_TPI(inputrec->eI))
1446         {
1447             npme_major = cr->nnodes;
1448         }
1449
1450         if (inputrec->ePBC == epbcSCREW)
1451         {
1452             gmx_fatal(FARGS,
1453                       "pbc=%s is only implemented with domain decomposition",
1454                       epbc_names[inputrec->ePBC]);
1455         }
1456     }
1457
1458     if (PAR(cr))
1459     {
1460         /* After possible communicator splitting in make_dd_communicators.
1461          * we can set up the intra/inter node communication.
1462          */
1463         gmx_setup_nodecomm(fplog,cr);
1464     }
1465
1466     /* Initialize per-node process ID and counters. */
1467     gmx_init_intra_counters(cr);
1468
1469 #ifdef GMX_MPI
1470     md_print_info(cr,fplog,"Using %d MPI %s\n",
1471                   cr->nnodes,
1472 #ifdef GMX_THREAD_MPI
1473                   cr->nnodes==1 ? "thread" : "threads"
1474 #else
1475                   cr->nnodes==1 ? "process" : "processes"
1476 #endif
1477                   );
1478 #endif
1479
1480     gmx_omp_nthreads_init(fplog, cr,
1481                           hwinfo->nthreads_hw_avail,
1482                           hw_opt->nthreads_omp,
1483                           hw_opt->nthreads_omp_pme,
1484                           (cr->duty & DUTY_PP) == 0,
1485                           inputrec->cutoff_scheme == ecutsVERLET);
1486
1487     gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi, minf.bUseGPU);
1488
1489     /* getting number of PP/PME threads
1490        PME: env variable should be read only on one node to make sure it is
1491        identical everywhere;
1492      */
1493     /* TODO nthreads_pp is only used for pinning threads.
1494      * This is a temporary solution until we have a hw topology library.
1495      */
1496     nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
1497     nthreads_pme = gmx_omp_nthreads_get(emntPME);
1498
1499     wcycle = wallcycle_init(fplog,resetstep,cr,nthreads_pp,nthreads_pme);
1500
1501     if (PAR(cr))
1502     {
1503         /* Master synchronizes its value of reset_counters with all nodes
1504          * including PME only nodes */
1505         reset_counters = wcycle_get_reset_counters(wcycle);
1506         gmx_bcast_sim(sizeof(reset_counters),&reset_counters,cr);
1507         wcycle_set_reset_counters(wcycle, reset_counters);
1508     }
1509
1510     snew(nrnb,1);
1511     if (cr->duty & DUTY_PP)
1512     {
1513         /* For domain decomposition we allocate dynamically
1514          * in dd_partition_system.
1515          */
1516         if (DOMAINDECOMP(cr))
1517         {
1518             bcast_state_setup(cr,state);
1519         }
1520         else
1521         {
1522             if (PAR(cr))
1523             {
1524                 bcast_state(cr,state,TRUE);
1525             }
1526         }
1527
1528         /* Initiate forcerecord */
1529         fr = mk_forcerec();
1530         fr->hwinfo = hwinfo;
1531         init_forcerec(fplog,oenv,fr,fcd,inputrec,mtop,cr,box,FALSE,
1532                       opt2fn("-table",nfile,fnm),
1533                       opt2fn("-tabletf",nfile,fnm),
1534                       opt2fn("-tablep",nfile,fnm),
1535                       opt2fn("-tableb",nfile,fnm),
1536                       nbpu_opt,
1537                       FALSE,pforce);
1538
1539         /* version for PCA_NOT_READ_NODE (see md.c) */
1540         /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
1541           "nofile","nofile","nofile","nofile",FALSE,pforce);
1542           */
1543         fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);
1544
1545         /* Initialize QM-MM */
1546         if(fr->bQMMM)
1547         {
1548             init_QMMMrec(cr,box,mtop,inputrec,fr);
1549         }
1550
1551         /* Initialize the mdatoms structure.
1552          * mdatoms is not filled with atom data,
1553          * as this can not be done now with domain decomposition.
1554          */
1555         mdatoms = init_mdatoms(fplog,mtop,inputrec->efep!=efepNO);
1556
1557         /* Initialize the virtual site communication */
1558         vsite = init_vsite(mtop,cr,FALSE);
1559
1560         calc_shifts(box,fr->shift_vec);
1561
1562         /* With periodic molecules the charge groups should be whole at start up
1563          * and the virtual sites should not be far from their proper positions.
1564          */
1565         if (!inputrec->bContinuation && MASTER(cr) &&
1566             !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
1567         {
1568             /* Make molecules whole at start of run */
1569             if (fr->ePBC != epbcNONE)
1570             {
1571                 do_pbc_first_mtop(fplog,inputrec->ePBC,box,mtop,state->x);
1572             }
1573             if (vsite)
1574             {
1575                 /* Correct initial vsite positions are required
1576                  * for the initial distribution in the domain decomposition
1577                  * and for the initial shell prediction.
1578                  */
1579                 construct_vsites_mtop(fplog,vsite,mtop,state->x);
1580             }
1581         }
1582
1583         if (EEL_PME(fr->eeltype))
1584         {
1585             ewaldcoeff = fr->ewaldcoeff;
1586             pmedata = &fr->pmedata;
1587         }
1588         else
1589         {
1590             pmedata = NULL;
1591         }
1592     }
1593     else
1594     {
1595         /* This is a PME only node */
1596
1597         /* We don't need the state */
1598         done_state(state);
1599
1600         ewaldcoeff = calc_ewaldcoeff(inputrec->rcoulomb, inputrec->ewald_rtol);
1601         snew(pmedata,1);
1602     }
1603
1604     /* Set the CPU affinity */
1605     set_cpu_affinity(fplog,cr,hw_opt,nthreads_pme,hwinfo,inputrec);
1606
1607     /* Initiate PME if necessary,
1608      * either on all nodes or on dedicated PME nodes only. */
1609     if (EEL_PME(inputrec->coulombtype))
1610     {
1611         if (mdatoms)
1612         {
1613             nChargePerturbed = mdatoms->nChargePerturbed;
1614         }
1615         if (cr->npmenodes > 0)
1616         {
1617             /* The PME only nodes need to know nChargePerturbed */
1618             gmx_bcast_sim(sizeof(nChargePerturbed),&nChargePerturbed,cr);
1619         }
1620
1621         if (cr->duty & DUTY_PME)
1622         {
1623             status = gmx_pme_init(pmedata,cr,npme_major,npme_minor,inputrec,
1624                                   mtop ? mtop->natoms : 0,nChargePerturbed,
1625                                   (Flags & MD_REPRODUCIBLE),nthreads_pme);
1626             if (status != 0)
1627             {
1628                 gmx_fatal(FARGS,"Error %d initializing PME",status);
1629             }
1630         }
1631     }
1632
1633
1634     if (integrator[inputrec->eI].func == do_md
1635 #ifdef GMX_OPENMM
1636         ||
1637         integrator[inputrec->eI].func == do_md_openmm
1638 #endif
1639         )
1640     {
1641         /* Turn on signal handling on all nodes */
1642         /*
1643          * (A user signal from the PME nodes (if any)
1644          * is communicated to the PP nodes.
1645          */
1646         signal_handler_install();
1647     }
1648
1649     if (cr->duty & DUTY_PP)
1650     {
1651         if (inputrec->ePull != epullNO)
1652         {
1653             /* Initialize pull code */
1654             init_pull(fplog,inputrec,nfile,fnm,mtop,cr,oenv, inputrec->fepvals->init_lambda,
1655                       EI_DYNAMICS(inputrec->eI) && MASTER(cr),Flags);
1656         }
1657
1658         if (inputrec->bRot)
1659         {
1660            /* Initialize enforced rotation code */
1661            init_rot(fplog,inputrec,nfile,fnm,cr,state->x,box,mtop,oenv,
1662                     bVerbose,Flags);
1663         }
1664
1665         constr = init_constraints(fplog,mtop,inputrec,ed,state,cr);
1666
1667         if (DOMAINDECOMP(cr))
1668         {
1669             dd_init_bondeds(fplog,cr->dd,mtop,vsite,constr,inputrec,
1670                             Flags & MD_DDBONDCHECK,fr->cginfo_mb);
1671
1672             set_dd_parameters(fplog,cr->dd,dlb_scale,inputrec,fr,&ddbox);
1673
1674             setup_dd_grid(fplog,cr->dd);
1675         }
1676
1677         /* Now do whatever the user wants us to do (how flexible...) */
1678         integrator[inputrec->eI].func(fplog,cr,nfile,fnm,
1679                                       oenv,bVerbose,bCompact,
1680                                       nstglobalcomm,
1681                                       vsite,constr,
1682                                       nstepout,inputrec,mtop,
1683                                       fcd,state,
1684                                       mdatoms,nrnb,wcycle,ed,fr,
1685                                       repl_ex_nst,repl_ex_nex,repl_ex_seed,
1686                                       membed,
1687                                       cpt_period,max_hours,
1688                                       deviceOptions,
1689                                       Flags,
1690                                       &runtime);
1691
1692         if (inputrec->ePull != epullNO)
1693         {
1694             finish_pull(fplog,inputrec->pull);
1695         }
1696
1697         if (inputrec->bRot)
1698         {
1699             finish_rot(fplog,inputrec->rot);
1700         }
1701
1702     }
1703     else
1704     {
1705         /* do PME only */
1706         gmx_pmeonly(*pmedata,cr,nrnb,wcycle,ewaldcoeff,FALSE,inputrec);
1707     }
1708
1709     if (EI_DYNAMICS(inputrec->eI) || EI_TPI(inputrec->eI))
1710     {
1711         /* Some timing stats */
1712         if (SIMMASTER(cr))
1713         {
1714             if (runtime.proc == 0)
1715             {
1716                 runtime.proc = runtime.real;
1717             }
1718         }
1719         else
1720         {
1721             runtime.real = 0;
1722         }
1723     }
1724
1725     wallcycle_stop(wcycle,ewcRUN);
1726
1727     /* Finish up, write some stuff
1728      * if rerunMD, don't write last frame again
1729      */
1730     finish_run(fplog,cr,ftp2fn(efSTO,nfile,fnm),
1731                inputrec,nrnb,wcycle,&runtime,
1732                fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ?
1733                  nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL,
1734                nthreads_pp,
1735                EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
1736
1737     if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU)
1738     {
1739         char gpu_err_str[STRLEN];
1740
1741         /* free GPU memory and uninitialize GPU (by destroying the context) */
1742         nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
1743
1744         if (!free_gpu(gpu_err_str))
1745         {
1746             gmx_warning("On node %d failed to free GPU #%d: %s",
1747                         cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
1748         }
1749     }
1750
1751     if (opt2bSet("-membed",nfile,fnm))
1752     {
1753         sfree(membed);
1754     }
1755
1756 #ifdef GMX_THREAD_MPI
1757     if (PAR(cr) && SIMMASTER(cr))
1758 #endif
1759     {
1760         gmx_hardware_info_free(hwinfo);
1761     }
1762
1763     /* Does what it says */
1764     print_date_and_time(fplog,cr->nodeid,"Finished mdrun",&runtime);
1765
1766     /* Close logfile already here if we were appending to it */
1767     if (MASTER(cr) && (Flags & MD_APPENDFILES))
1768     {
1769         gmx_log_close(fplog);
1770     }
1771
1772     rc=(int)gmx_get_stop_condition();
1773
1774 #ifdef GMX_THREAD_MPI
1775     /* we need to join all threads. The sub-threads join when they
1776        exit this function, but the master thread needs to be told to
1777        wait for that. */
1778     if (PAR(cr) && MASTER(cr))
1779     {
1780         tMPI_Finalize();
1781     }
1782 #endif
1783
1784     return rc;
1785 }