1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
4 * This source code is part of
8 * GROningen MAchine for Chemical Simulations
11 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
12 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
13 * Copyright (c) 2001-2004, The GROMACS development team,
14 * check out http://www.gromacs.org for more information.
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version 2
19 * of the License, or (at your option) any later version.
21 * If you want to redistribute modifications, please consider that
22 * scientific software is very special. Version control is crucial -
23 * bugs must be traceable. We will be happy to consider code for
24 * inclusion in the official distribution, but derived work must not
25 * be called official GROMACS. Details are found in the README & COPYING
26 * files - if they are missing, get the official version at www.gromacs.org.
28 * To help us fund GROMACS development, we humbly ask that you cite
29 * the papers on the package - you can find them in the top README file.
31 * For more info, check our website at http://www.gromacs.org
34 * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
39 #if defined(HAVE_SCHED_H) && defined(HAVE_SCHED_GETAFFINITY)
42 #include <sys/syscall.h>
57 #include "md_logging.h"
58 #include "md_support.h"
61 #include "pull_rotation.h"
74 #include "checkpoint.h"
75 #include "mtop_util.h"
76 #include "sighandler.h"
79 #include "gmx_detect_hardware.h"
80 #include "gmx_omp_nthreads.h"
81 #include "pull_rotation.h"
82 #include "calc_verletbuf.h"
83 #include "../mdlib/nbnxn_search.h"
84 #include "../mdlib/nbnxn_consts.h"
85 #include "gmx_fatal_collective.h"
90 #include "thread_mpi/threads.h"
100 #include "corewrap.h"
103 #include "gpu_utils.h"
104 #include "nbnxn_cuda_data_mgmt.h"
/* NOTE(review): this listing is elided (embedded line numbers jump); 'func'
 * appears to be the tail of a struct definition (presumably gmx_intp_t)
 * whose opening lines are not visible here -- confirm against full source. */
107 gmx_integrator_t *func;
110 /* The array should match the eI array in include/types/enums.h */
/* Dispatch table from integrator enum value to its driver function; several
 * eI values share do_md/do_tpi (e.g. sd, bd, vv variants map to do_md). */
111 const gmx_intp_t integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md},{do_md}};
/* Globals carrying the deform reference step/box from the tpx file to the
 * box-deformation code; with thread-MPI access is serialized by the mutex. */
113 gmx_large_int_t deform_init_init_step_tpx;
114 matrix deform_init_box_tpx;
115 #ifdef GMX_THREAD_MPI
116 tMPI_Thread_mutex_t deform_init_box_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
120 #ifdef GMX_THREAD_MPI
/* Argument bundle passed through tMPI's void* thread-start interface to
 * mdrunner_start_fn(); it mirrors mdrunner()'s parameter list.  Only a
 * subset of the members is visible in this elided listing. */
121 struct mdrunner_arglist
123     gmx_hw_opt_t *hw_opt;
136     const char *dddlb_opt;
141     const char *nbpu_opt;
152     const char *deviceOptions;
154     int ret; /* return value */
/* Thread entry point used by tMPI: unpacks the mdrunner_arglist, builds a
 * thread-local commrec and filename table, then runs mdrunner() and stores
 * its return value in mda->ret.  (Listing elided: declarations of fnm/fplog
 * and other setup lines are missing from this view.) */
158 /* The function used for spawning threads. Extracts the mdrunner()
159    arguments from its one argument and calls mdrunner(), after making
161 static void mdrunner_start_fn(void *arg)
163     struct mdrunner_arglist *mda=(struct mdrunner_arglist*)arg;
164     struct mdrunner_arglist mc=*mda; /* copy the arg list to make sure
165                                         that it's thread-local. This doesn't
166                                         copy pointed-to items, of course,
167                                         but those are all const. */
168     t_commrec *cr;  /* we need a local version of this */
172     fnm = dup_tfn(mc.nfile, mc.fnm);
174     cr = init_par_threads(mc.cr);
181     mda->ret=mdrunner(mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv,
182                       mc.bVerbose, mc.bCompact, mc.nstglobalcomm,
183                       mc.ddxyz, mc.dd_node_order, mc.rdd,
184                       mc.rconstr, mc.dddlb_opt, mc.dlb_scale,
185                       mc.ddcsx, mc.ddcsy, mc.ddcsz,
187                       mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
188                       mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce,
189                       mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.Flags);
/* Spawns hw_opt->nthreads_tmpi thread-MPI threads (including the calling
 * one), each of which re-enters mdrunner() via mdrunner_start_fn().
 * Returns a new commrec reflecting the parallel state, or (per the early
 * nthreads_tmpi < 2 check) bails out without starting tMPI.  Listing is
 * elided: the snew/assignment lines for several mda members are missing. */
192 /* called by mdrunner() to start a specific number of threads (including
193    the main thread) for thread-parallel runs. This in turn calls mdrunner()
195    All options besides nthreads are the same as for mdrunner(). */
196 static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
197                                          FILE *fplog,t_commrec *cr,int nfile,
198                                          const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
199                                          gmx_bool bCompact, int nstglobalcomm,
200                                          ivec ddxyz,int dd_node_order,real rdd,real rconstr,
201                                          const char *dddlb_opt,real dlb_scale,
202                                          const char *ddcsx,const char *ddcsy,const char *ddcsz,
203                                          const char *nbpu_opt,
204                                          int nsteps_cmdline, int nstepout,int resetstep,
205                                          int nmultisim,int repl_ex_nst,int repl_ex_nex, int repl_ex_seed,
206                                          real pforce,real cpt_period, real max_hours,
207                                          const char *deviceOptions, unsigned long Flags)
210     struct mdrunner_arglist *mda;
211     t_commrec *crn; /* the new commrec */
214     /* first check whether we even need to start tMPI */
215     if (hw_opt->nthreads_tmpi < 2)
220     /* a few small, one-time, almost unavoidable memory leaks: */
222     fnmn=dup_tfn(nfile, fnm);
224     /* fill the data structure to pass as void pointer to thread start fn */
231     mda->bVerbose=bVerbose;
232     mda->bCompact=bCompact;
233     mda->nstglobalcomm=nstglobalcomm;
234     mda->ddxyz[XX]=ddxyz[XX];
235     mda->ddxyz[YY]=ddxyz[YY];
236     mda->ddxyz[ZZ]=ddxyz[ZZ];
237     mda->dd_node_order=dd_node_order;
239     mda->rconstr=rconstr;
240     mda->dddlb_opt=dddlb_opt;
241     mda->dlb_scale=dlb_scale;
245     mda->nbpu_opt=nbpu_opt;
246     mda->nsteps_cmdline=nsteps_cmdline;
247     mda->nstepout=nstepout;
248     mda->resetstep=resetstep;
249     mda->nmultisim=nmultisim;
250     mda->repl_ex_nst=repl_ex_nst;
251     mda->repl_ex_nex=repl_ex_nex;
252     mda->repl_ex_seed=repl_ex_seed;
254     mda->cpt_period=cpt_period;
255     mda->max_hours=max_hours;
256     mda->deviceOptions=deviceOptions;
259     /* now spawn new threads that start mdrunner_start_fn(), while
260        the main thread returns */
/* tMPI pins threads itself here when bThreadPinning is requested. */
261     ret=tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi,
262                      (hw_opt->bThreadPinning ? TMPI_AFFINITY_ALL_CORES : TMPI_AFFINITY_NONE),
263                      mdrunner_start_fn, (void*)(mda) );
264     if (ret!=TMPI_SUCCESS)
267     /* make a new comm_rec to reflect the new situation */
268     crn=init_par_threads(cr);
/* Decides how to split nthreads_tot over thread-MPI ranks vs OpenMP threads.
 * With GPUs: one tMPI rank per GPU (capped by nthreads_tot).  With a user
 * OpenMP count: nthreads_tot / nthreads_omp ranks.  Otherwise a CPU-model
 * heuristic (Intel family 6, Nehalem/Sandy Bridge model thresholds) picks
 * pure OpenMP for small thread counts and pure tMPI for large ones.
 * Listing elided: some branch/brace lines are missing from this view. */
273 static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
274                                         const gmx_hw_opt_t *hw_opt,
280     /* There are no separate PME nodes here, as we ensured in
281      * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
282      * and a conditional ensures we would not have ended up here.
283      * Note that separate PME nodes might be switched on later.
287     nthreads_tmpi = ngpu;
288     if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
290         nthreads_tmpi = nthreads_tot;
293     else if (hw_opt->nthreads_omp > 0)
295         /* Here we could oversubscribe, when we do, we issue a warning later */
296         nthreads_tmpi = max(1,nthreads_tot/hw_opt->nthreads_omp);
300         /* TODO choose nthreads_omp based on hardware topology
301            when we have a hardware topology detection library */
302         /* In general, when running up to 4 threads, OpenMP should be faster.
303          * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
304          * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
305          * even on two CPUs it's usually faster (but with many OpenMP threads
306          * it could be faster not to use HT, currently we always use HT).
307          * On Nehalem/Westmere we want to avoid running 16 threads over
308          * two CPUs with HT, so we need a limit<16; thus we use 12.
309          * A reasonable limit for Intel Sandy and Ivy bridge,
310          * not knowing the topology, is 16 threads.
312         const int nthreads_omp_always_faster             = 4;
313         const int nthreads_omp_always_faster_Nehalem     = 12;
314         const int nthreads_omp_always_faster_SandyBridge = 16;
/* First CPUID model numbers of the respective microarchitectures. */
315         const int first_model_Nehalem                    = 0x1A;
316         const int first_model_SandyBridge                = 0x2A;
317         gmx_bool bIntel_Family6;
320             (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
321              gmx_cpuid_family(hwinfo->cpuid_info) == 6);
/* NOTE(review): these comparisons use the *_always_faster thread-count
 * constants where the first_model_* constants look intended -- probable
 * copy/paste issue; confirm against the full source before changing. */
323         if (nthreads_tot <= nthreads_omp_always_faster ||
325              ((gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_Nehalem && nthreads_tot <= nthreads_omp_always_faster_Nehalem) ||
326               (gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_SandyBridge && nthreads_tot <= nthreads_omp_always_faster_SandyBridge))))
328             /* Use pure OpenMP parallelization */
333             /* Don't use OpenMP parallelization */
334             nthreads_tmpi = nthreads_tot;
338     return nthreads_tmpi;
/* Determines the number of thread-MPI ranks to start: honors an explicit
 * -ntmpi, otherwise divides available hardware threads via
 * get_tmpi_omp_thread_division(), then reduces the count if there would be
 * too few atoms per rank, avoiding partial hyper-threading use and awkward
 * prime rank counts.  Listing elided: some locals/braces are missing. */
342 /* Get the number of threads to use for thread-MPI based on how many
343  * were requested, which algorithms we're using,
344  * and how many particles there are.
345  * At the point we have already called check_and_update_hw_opt.
346  * Thus all options should be internally consistent and consistent
347  * with the hardware, except that ntmpi could be larger than #GPU.
349 static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
350                             gmx_hw_opt_t *hw_opt,
351                             t_inputrec *inputrec, gmx_mtop_t *mtop,
355     int nthreads_hw,nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu;
356     int min_atoms_per_mpi_thread;
361     if (hw_opt->nthreads_tmpi > 0)
363         /* Trivial, return right away */
364         return hw_opt->nthreads_tmpi;
367     nthreads_hw = hwinfo->nthreads_hw_avail;
369     /* How many total (#tMPI*#OpenMP) threads can we start? */
370     if (hw_opt->nthreads_tot > 0)
372         nthreads_tot_max = hw_opt->nthreads_tot;
376         nthreads_tot_max = nthreads_hw;
379     bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU);
382         ngpu = hwinfo->gpu_info.ncuda_dev_use;
390         get_tmpi_omp_thread_division(hwinfo,hw_opt,nthreads_tot_max,ngpu);
391 
392     if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
394         /* Steps are divided over the nodes iso splitting the atoms */
395         min_atoms_per_mpi_thread = 0;
401         min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU;
405         min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD;
409     /* Check if an algorithm does not support parallel simulation. */
410     if (nthreads_tmpi != 1 &&
411         ( inputrec->eI == eiLBFGS ||
412           inputrec->coulombtype == eelEWALD ) )
416         md_print_warn(cr,fplog,"The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
417         if (hw_opt->nthreads_tmpi > nthreads_tmpi)
419             gmx_fatal(FARGS,"You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
422     else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread)
424         /* the thread number was chosen automatically, but there are too many
425            threads (too few atoms per thread) */
426         nthreads_new = max(1,mtop->natoms/min_atoms_per_mpi_thread);
428         /* Avoid partial use of Hyper-Threading */
429         if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
430             nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw)
432             nthreads_new = nthreads_hw/2;
435         /* Avoid large prime numbers in the thread count */
436         if (nthreads_new >= 6)
438             /* Use only 6,8,10 with additional factors of 2 */
/* fac grows as 6*2^k; round nthreads_new down to a multiple of fac. */
442             while (3*fac*2 <= nthreads_new)
447             nthreads_new = (nthreads_new/fac)*fac;
452             if (nthreads_new == 5)
458         nthreads_tmpi = nthreads_new;
460         fprintf(stderr,"\n");
461         fprintf(stderr,"NOTE: Parallelization is limited by the small number of atoms,\n");
462         fprintf(stderr,"      only starting %d thread-MPI threads.\n",nthreads_tmpi);
463         fprintf(stderr,"      You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
466     return nthreads_tmpi;
468 #endif /* GMX_THREAD_MPI */
/* Tuning knobs for the GPU nstlist heuristic in increase_nstlist(). */
471 /* Environment variable for setting nstlist */
472 static const char*  NSTLIST_ENVVAR  =  "GMX_NSTLIST";
473 /* Try to increase nstlist when using a GPU with nstlist less than this */
474 static const int    NSTLIST_GPU_ENOUGH      = 20;
475 /* Increase nstlist until the non-bonded cost increases more than this factor */
476 static const float  NBNXN_GPU_LIST_OK_FAC   = 1.25;
477 /* Don't increase nstlist beyond a non-bonded cost increases of this factor */
478 static const float  NBNXN_GPU_LIST_MAX_FAC  = 1.40;
/* Tries progressively larger nstlist values (20/25/40/50, or a value from
 * GMX_NSTLIST) for GPU runs, recomputing the Verlet buffer each time and
 * accepting the largest value whose rlist still fits the box/DD and stays
 * below the NBNXN_GPU_LIST_* cost factors; updates ir->nstlist/rlist.
 * Listing elided: several locals, braces and loop control lines missing. */
480 /* Try to increase nstlist when running on a GPU */
481 static void increase_nstlist(FILE *fp,t_commrec *cr,
482                              t_inputrec *ir,const gmx_mtop_t *mtop,matrix box)
485     int nstlist_orig,nstlist_prev;
486     verletbuf_list_setup_t ls;
487     real rlist_inc,rlist_ok,rlist_max,rlist_new,rlist_prev;
490     gmx_bool bBox,bDD,bCont;
491     const char *nstl_fmt="\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
492     const char *vbd_err="Can not increase nstlist for GPU run because verlet-buffer-drift is not set or used";
493     const char *box_err="Can not increase nstlist for GPU run because the box is too small";
494     const char *dd_err ="Can not increase nstlist for GPU run because of domain decomposition limitations";
497     /* Number of + nstlist alternative values to try when switching  */
498     const int nstl[]={ 20, 25, 40, 50 };
/* NOTE(review): macro expansion is unparenthesized; harmless in the uses
 * visible here, but ((sizeof(nstl)/sizeof(nstl[0]))) would be safer. */
499 #define NNSTL  sizeof(nstl)/sizeof(nstl[0])
501     env = getenv(NSTLIST_ENVVAR);
506         fprintf(fp,nstl_fmt,ir->nstlist);
510     if (ir->verletbuf_drift == 0)
512         gmx_fatal(FARGS,"You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
515     if (ir->verletbuf_drift < 0)
519         fprintf(stderr,"%s\n",vbd_err);
523         fprintf(fp,"%s\n",vbd_err);
529     nstlist_orig = ir->nstlist;
532         sprintf(buf,"Getting nstlist from environment variable GMX_NSTLIST=%s",env);
535             fprintf(stderr,"%s\n",buf);
539             fprintf(fp,"%s\n",buf);
541         sscanf(env,"%d",&ir->nstlist);
544     verletbuf_get_list_setup(TRUE,&ls);
546     /* Allow rlist to make the list double the size of the cut-off sphere */
547     rlist_inc = nbnxn_get_rlist_effective_inc(NBNXN_GPU_CLUSTER_SIZE,mtop->natoms/det(box));
548     rlist_ok  = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_OK_FAC,1.0/3.0) - rlist_inc;
549     rlist_max = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_MAX_FAC,1.0/3.0) - rlist_inc;
552         fprintf(debug,"GPU nstlist tuning: rlist_inc %.3f rlist_max %.3f\n",
553                 rlist_inc,rlist_max);
557     nstlist_prev = nstlist_orig;
558     rlist_prev   = ir->rlist;
563         ir->nstlist = nstl[i];
566         /* Set the pair-list buffer size in ir */
567         calc_verlet_buffer_size(mtop,det(box),ir,ir->verletbuf_drift,&ls,
570         /* Does rlist fit in the box? */
571         bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC,box));
573         if (bBox && DOMAINDECOMP(cr))
575             /* Check if rlist fits in the domain decomposition */
576             if (inputrec2nboundeddim(ir) < DIM)
578                 gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
580             copy_mat(box,state_tmp.box);
581             bDD = change_dd_cutoff(cr,&state_tmp,ir,rlist_new);
588         if (bBox && bDD && rlist_new <= rlist_max)
590             /* Increase nstlist */
591             nstlist_prev = ir->nstlist;
592             rlist_prev   = rlist_new;
593             bCont = (i+1 < NNSTL && rlist_new < rlist_ok);
597             /* Stick with the previous nstlist */
598             ir->nstlist = nstlist_prev;
599             rlist_new   = rlist_prev;
611             gmx_warning(!bBox ? box_err : dd_err);
614             fprintf(fp,"\n%s\n",bBox ? box_err : dd_err);
616         ir->nstlist = nstlist_orig;
618     else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
620         sprintf(buf,"Changing nstlist from %d to %d, rlist from %g to %g",
621                 nstlist_orig,ir->nstlist,
622                 ir->rlist,rlist_new);
625             fprintf(stderr,"%s\n\n",buf);
629             fprintf(fp,"%s\n\n",buf);
631         ir->rlist     = rlist_new;
632         ir->rlistlong = rlist_new;
/* Setup for the Verlet cut-off scheme on the master process: decides whether
 * a GPU (or GPU emulation via GMX_EMULATE_GPU) will be used, recomputes the
 * Verlet buffer/rlist for the chosen kernel cluster setup, and calls
 * increase_nstlist() when GPU use makes a larger nstlist worthwhile.
 * Listing elided: some parameters and braces are missing from this view. */
636 static void prepare_verlet_scheme(FILE *fplog,
637                                   gmx_hw_info_t *hwinfo,
639                                   gmx_hw_opt_t *hw_opt,
640                                   const char *nbpu_opt,
642                                   const gmx_mtop_t *mtop,
646     /* Here we only check for GPU usage on the MPI master process,
647      * as here we don't know how many GPUs we will use yet.
648      * We check for a GPU on all processes later.
650     *bUseGPU = hwinfo->bCanUseGPU || (getenv("GMX_EMULATE_GPU") != NULL);
652     if (ir->verletbuf_drift > 0)
654         /* Update the Verlet buffer size for the current run setup */
655         verletbuf_list_setup_t ls;
658         /* Here we assume CPU acceleration is on. But as currently
659          * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
660          * and 4x2 gives a larger buffer than 4x4, this is ok.
662         verletbuf_get_list_setup(*bUseGPU,&ls);
664         calc_verlet_buffer_size(mtop,det(box),ir,
665                                 ir->verletbuf_drift,&ls,
667         if (rlist_new != ir->rlist)
671                 fprintf(fplog,"\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
673                         ls.cluster_size_i,ls.cluster_size_j);
675             ir->rlist     = rlist_new;
676             ir->rlistlong = rlist_new;
680     /* With GPU or emulation we should check nstlist for performance */
681     if ((EI_DYNAMICS(ir->eI) &&
683          ir->nstlist < NSTLIST_GPU_ENOUGH) ||
684         getenv(NSTLIST_ENVVAR) != NULL)
686         /* Choose a better nstlist */
687         increase_nstlist(fplog,cr,ir,mtop,box);
/* Converts an inputrec set up for the group cut-off scheme to the Verlet
 * scheme (for -testverlet / GMX_VERLET_SCHEME): forces equal cut-offs,
 * rejects user potentials, maps switched interactions to plain/RF/PME,
 * recomputes or estimates rlist, and strips charge groups from the topology.
 * Listing elided: some braces/else lines are missing from this view. */
691 static void convert_to_verlet_scheme(FILE *fplog,
693                                      gmx_mtop_t *mtop,real box_vol)
/* NOTE(review): a string literal bound to non-const char*; should be
 * 'const char *' (writing through it would be undefined behavior). */
695     char *conv_mesg="Converting input file with group cut-off scheme to the Verlet cut-off scheme";
697     md_print_warn(NULL,fplog,"%s\n",conv_mesg);
699     ir->cutoff_scheme   = ecutsVERLET;
700     ir->verletbuf_drift = 0.005;
702     if (ir->rcoulomb != ir->rvdw)
704         gmx_fatal(FARGS,"The VdW and Coulomb cut-offs are different, whereas the Verlet scheme only supports equal cut-offs");
707     if (ir->vdwtype == evdwUSER || EEL_USER(ir->coulombtype))
709         gmx_fatal(FARGS,"User non-bonded potentials are not (yet) supported with the Verlet scheme");
711     else if (EVDW_SWITCHED(ir->vdwtype) || EEL_SWITCHED(ir->coulombtype))
713         md_print_warn(NULL,fplog,"Converting switched or shifted interactions to a shifted potential (without force shift), this will lead to slightly different interaction potentials");
715         if (EVDW_SWITCHED(ir->vdwtype))
717             ir->vdwtype = evdwCUT;
719         if (EEL_SWITCHED(ir->coulombtype))
721             if (EEL_FULL(ir->coulombtype))
723                 /* With full electrostatic only PME can be switched */
724                 ir->coulombtype = eelPME;
728                 md_print_warn(NULL,fplog,"NOTE: Replacing %s electrostatics with reaction-field with epsilon-rf=inf\n",eel_names[ir->coulombtype]);
729                 ir->coulombtype = eelRF;
730                 ir->epsilon_rf  = 0.0;
734     /* We set the target energy drift to a small number.
735      * Note that this is only for testing. For production the user
736      * should think about this and set the mdp options.
738     ir->verletbuf_drift = 1e-4;
741     if (inputrec2nboundeddim(ir) != 3)
743         gmx_fatal(FARGS,"Can only convert old tpr files to the Verlet cut-off scheme with 3D pbc");
746     if (ir->efep != efepNO || ir->implicit_solvent != eisNO)
748         gmx_fatal(FARGS,"Will not convert old tpr files to the Verlet cut-off scheme with free-energy calculations or implicit solvent");
751     if (EI_DYNAMICS(ir->eI) && !(EI_MD(ir->eI) && ir->etc == etcNO))
753         verletbuf_list_setup_t ls;
755         verletbuf_get_list_setup(FALSE,&ls);
756         calc_verlet_buffer_size(mtop,box_vol,ir,ir->verletbuf_drift,&ls,
/* Fallback when the buffer can't be computed: disable drift targeting and
 * pad the cut-off by 5%. */
761         ir->verletbuf_drift = -1;
762         ir->rlist           = 1.05*max(ir->rvdw,ir->rcoulomb);
765     gmx_mtop_remove_chargegroups(mtop);
/* Detects an externally-set process affinity mask via sched_getaffinity();
 * if the mask is non-default (not all CPUs set), disables mdrun's internal
 * thread pinning (hw_opt->bThreadPinning = FALSE) and warns.  Linux-only
 * (compiled out without HAVE_SCHED_GETAFFINITY).  Listing elided: some
 * early returns/braces are missing from this view. */
768 /* Check the process affinity mask. If it is non-zero, something
769  * else has set the affinity, and mdrun should honor that and
770  * not attempt to do its own thread pinning.
772  * This function should be called twice. Once before the OpenMP
773  * library gets initialized with bAfterOpenMPInit=FALSE (which will
774  * detect affinity set by external tools like taskset), and again
775  * later, after the OpenMP initialization, with bAfterOpenMPInit=TRUE
776  * (which will detect affinity changes made by the OpenMP library).
778  * Note that this will only work on Linux, because we use a GNU
780 static void check_cpu_affinity_set(FILE *fplog, const t_commrec *cr,
781                                    gmx_hw_opt_t *hw_opt, int ncpus,
782                                    gmx_bool bAfterOpenmpInit)
784 #ifdef HAVE_SCHED_GETAFFINITY
785     cpu_set_t mask_current;
786     int       i, ret, cpu_count, cpu_set;
790     if (!hw_opt->bThreadPinning)
792         /* internal affinity setting is off, don't bother checking process affinity */
796     CPU_ZERO(&mask_current);
797     if ((ret = sched_getaffinity(0, sizeof(cpu_set_t), &mask_current)) != 0)
799         /* failed to query affinity mask, will just return */
802             fprintf(debug, "Failed to query affinity mask (error %d)", ret);
807     /* Before proceeding with the actual check, make sure that the number of
808      * detected CPUs is >= the CPUs in the current set.
809      * We need to check for CPU_COUNT as it was added only in glibc 2.6. */
811     if (ncpus < CPU_COUNT(&mask_current))
815             fprintf(debug, "%d CPUs detected, but %d was returned by CPU_COUNT",
816                     ncpus, CPU_COUNT(&mask_current));
820 #endif /* CPU_COUNT */
/* bAllSet ends up TRUE only if every detected CPU is in the current mask,
 * i.e. the affinity is still the default. */
823     for (i = 0; (i < ncpus && i < CPU_SETSIZE); i++)
825         bAllSet = bAllSet && (CPU_ISSET(i, &mask_current) != 0);
830         if (!bAfterOpenmpInit)
832             md_print_warn(cr, fplog,
833                           "%s detected a non-default process affinity, "
834                           "so it will not attempt to pin its threads", ShortProgram());
838             md_print_warn(cr, fplog,
839                           "%s detected a non-default process affinity, "
840                           "probably set by the OpenMP library, "
841                           "so it will not attempt to pin its threads", ShortProgram());
843         hw_opt->bThreadPinning = FALSE;
847             fprintf(debug, "Non-default affinity mask found, mdrun will not pin threads\n");
854             fprintf(debug, "Default affinity mask found\n");
857 #endif /* HAVE_SCHED_GETAFFINITY */
/* Pins each OpenMP thread of this rank to a core: computes a node-wide
 * thread index (via MPI_Scan over ranks on the same host), applies the user
 * pinning offset, optionally interleaves pairs onto hyper-threaded cores,
 * then sets per-thread affinity inside an omp parallel region and warns if
 * any thread's setaffinity call failed.  Skipped entirely when thread-MPI
 * already pinned all cores.  Listing elided: several braces, the comm_intra
 * declaration and some parameters are missing from this view. */
860 /* Set CPU affinity. Can be important for performance.
861    On some systems (e.g. Cray) CPU Affinity is set by default.
862    But default assigning doesn't work (well) with only some ranks
863    having threads. This causes very low performance.
864    External tools have cumbersome syntax for setting affinity
865    in the case that only some ranks have threads.
866    Thus it is important that GROMACS sets the affinity internally
867    if only PME is using threads.
869 static void set_cpu_affinity(FILE *fplog,
871                              gmx_hw_opt_t *hw_opt,
873                              const gmx_hw_info_t *hwinfo,
874                              const t_inputrec *inputrec)
876 #if defined GMX_THREAD_MPI
877     /* With the number of TMPI threads equal to the number of cores
878      * we already pinned in thread-MPI, so don't pin again here.
880     if (hw_opt->nthreads_tmpi == tMPI_Thread_get_hw_number())
887     /* If the tMPI thread affinity setting is not supported encourage the user
888      * to report it as it's either a bug or an exotic platform which we might
889      * want to support. */
890     if (tMPI_Thread_setaffinity_support() != TMPI_SETAFFINITY_SUPPORT_YES)
892         md_print_warn(NULL, fplog,
893                       "Can not set thread affinities on the current plarform. On NUMA systems this\n"
894                       "can cause performance degradation. If you think your platform should support\n"
895                       "setting affinities, contact the GROMACS developers.");
898 #endif /* __APPLE__ */
900     if (hw_opt->bThreadPinning)
902         int nth_affinity_set, thread_id_node, thread_id,
903             nthread_local, nthread_node, nthread_hw_max, nphyscore;
907         /* threads on this MPI process or TMPI thread */
908         if (cr->duty & DUTY_PP)
910             nthread_local = gmx_omp_nthreads_get(emntNonbonded);
914             nthread_local = gmx_omp_nthreads_get(emntPME);
917         /* map the current process to cores */
919         nthread_node = nthread_local;
921         if (PAR(cr) || MULTISIM(cr))
923             /* We need to determine a scan of the thread counts in this
/* Split ranks by physical host so the scan is per-node. */
928             MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),cr->rank_intranode,
930             MPI_Scan(&nthread_local,&thread_id_node,1,MPI_INT,MPI_SUM,comm_intra);
931             /* MPI_Scan is inclusive, but here we need exclusive */
932             thread_id_node -= nthread_local;
933             /* Get the total number of threads on this physical node */
934             MPI_Allreduce(&nthread_local,&nthread_node,1,MPI_INT,MPI_SUM,comm_intra);
935             MPI_Comm_free(&comm_intra);
940         if (hw_opt->core_pinning_offset > 0)
942             offset = hw_opt->core_pinning_offset;
945             fprintf(stderr, "Applying core pinning offset %d\n", offset);
949                 fprintf(fplog, "Applying core pinning offset %d\n", offset);
953         /* With Intel Hyper-Threading enabled, we want to pin consecutive
954          * threads to physical cores when using more threads than physical
955          * cores or when the user requests so.
957         nthread_hw_max = hwinfo->nthreads_hw_avail;
959         if (hw_opt->bPinHyperthreading ||
960             (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
961              nthread_node > nthread_hw_max/2 && getenv("GMX_DISABLE_PINHT") == NULL))
963             if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) != GMX_CPUID_X86_SMT_ENABLED)
965                 /* We print to stderr on all processes, as we might have
966                  * different settings on different physical nodes.
968                 if (gmx_cpuid_vendor(hwinfo->cpuid_info) != GMX_CPUID_VENDOR_INTEL)
970                     md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
971                                   "but non-Intel CPU detected (vendor: %s)\n",
972                                   gmx_cpuid_vendor_string[gmx_cpuid_vendor(hwinfo->cpuid_info)]);
976                     md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
977                                   "but the CPU detected does not have Intel Hyper-Threading support "
978                                   "(or it is turned off)\n");
981             nphyscore = nthread_hw_max/2;
985                 fprintf(stderr, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
990                 fprintf(fplog, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
995         /* Set the per-thread affinity. In order to be able to check the success
996          * of affinity settings, we will set nth_affinity_set to 1 on threads
997          * where the affinity setting succeded and to 0 where it failed.
998          * Reducing these 0/1 values over the threads will give the total number
999          * of threads on which we succeeded.
1001         nth_affinity_set = 0;
1002 #pragma omp parallel firstprivate(thread_id_node) num_threads(nthread_local) \
1003     reduction(+:nth_affinity_set)
1006         gmx_bool setaffinity_ret;
1008         thread_id       = gmx_omp_get_thread_num();
1009         thread_id_node += thread_id;
1012             core = offset + thread_id_node;
1016             /* Lock pairs of threads to the same hyperthreaded core */
1017             core = offset + thread_id_node/2 + (thread_id_node % 2)*nphyscore;
1020         setaffinity_ret = tMPI_Thread_setaffinity_single(tMPI_Thread_self(), core);
1022         /* store the per-thread success-values of the setaffinity */
1023         nth_affinity_set = (setaffinity_ret == 0);
1027             fprintf(debug, "On rank %2d, thread %2d, core %2d the affinity setting returned %d\n",
1028                     cr->nodeid, gmx_omp_get_thread_num(), core, setaffinity_ret);
/* Sanity check: the 0/1 reduction can never exceed the thread count. */
1032         if (nth_affinity_set > nthread_local)
1036             sprintf(msg, "Looks like we have set affinity for more threads than "
1037                     "we have (%d > %d)!\n", nth_affinity_set, nthread_local);
1042         /* check & warn if some threads failed to set their affinities */
1043         if (nth_affinity_set != nthread_local)
1045             char sbuf1[STRLEN], sbuf2[STRLEN];
1047             /* sbuf1 contains rank info, while sbuf2 OpenMP thread info */
1048             sbuf1[0] = sbuf2[0] = '\0';
1050 #ifdef GMX_THREAD_MPI
1051             sprintf(sbuf1, "In thread-MPI thread #%d: ", cr->nodeid);
1052 #else           /* GMX_LIB_MPI */
1053             sprintf(sbuf1, "In MPI process #%d: ", cr->nodeid);
1055 #endif          /* GMX_MPI */
1057             if (nthread_local > 1)
1059                 sprintf(sbuf2, "of %d/%d thread%s ",
1060                         nthread_local - nth_affinity_set, nthread_local,
1061                         (nthread_local - nth_affinity_set) > 1 ? "s" : "");
1064             md_print_warn(NULL, fplog,
1065                           "NOTE: %sAffinity setting %sfailed.\n"
1066                           "      This can cause performance degradation!",
/* Validates and completes the user thread-count options in hw_opt: rejects
 * thread-MPI-only options in non-tMPI builds, enforces consistency of
 * -nt / -ntmpi / -ntomp, rejects OpenMP where unsupported (non-OpenMP builds
 * or group cut-off scheme), and derives unset counts from the given ones.
 * Listing elided: some braces and #ifdef lines are missing from this view. */
1074 static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt,
1076                                     gmx_bool bIsSimMaster)
1078     gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp, bIsSimMaster);
1080 #ifndef GMX_THREAD_MPI
1081     if (hw_opt->nthreads_tot > 0)
1083         gmx_fatal(FARGS,"Setting the total number of threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
1085     if (hw_opt->nthreads_tmpi > 0)
1087         gmx_fatal(FARGS,"Setting the number of thread-MPI threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
1091     if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
1093         /* We have the same number of OpenMP threads for PP and PME processes,
1094          * thus we can perform several consistency checks.
1096         if (hw_opt->nthreads_tmpi > 0 &&
1097             hw_opt->nthreads_omp > 0 &&
1098             hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
1100             gmx_fatal(FARGS,"The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
1101                       hw_opt->nthreads_tot,hw_opt->nthreads_tmpi,hw_opt->nthreads_omp);
1104         if (hw_opt->nthreads_tmpi > 0 &&
1105             hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
1107             gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
1108                       hw_opt->nthreads_tot,hw_opt->nthreads_tmpi);
1111         if (hw_opt->nthreads_omp > 0 &&
1112             hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
1114             gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
1115                       hw_opt->nthreads_tot,hw_opt->nthreads_omp);
1118         if (hw_opt->nthreads_tmpi > 0 &&
1119             hw_opt->nthreads_omp <= 0)
/* Derive the OpenMP count from total / tMPI ranks. */
1121             hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
1126     if (hw_opt->nthreads_omp > 1)
1128         gmx_fatal(FARGS,"OpenMP threads are requested, but Gromacs was compiled without OpenMP support");
1132     if (cutoff_scheme == ecutsGROUP)
1134         /* We only have OpenMP support for PME only nodes */
1135         if (hw_opt->nthreads_omp > 1)
1137             gmx_fatal(FARGS,"OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
1138                       ecutscheme_names[cutoff_scheme],
1139                       ecutscheme_names[ecutsVERLET]);
1141         hw_opt->nthreads_omp = 1;
1144     if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
1146         gmx_fatal(FARGS,"You need to specify -ntomp in addition to -ntomp_pme");
1149     if (hw_opt->nthreads_tot == 1)
1151         hw_opt->nthreads_tmpi = 1;
1153         if (hw_opt->nthreads_omp > 1)
/* NOTE(review): the first %d prints nthreads_tmpi where nthreads_omp looks
 * intended (message talks about OpenMP threads) -- confirm in full source. */
1155             gmx_fatal(FARGS,"You requested %d OpenMP threads with %d total threads",
1156                       hw_opt->nthreads_tmpi,hw_opt->nthreads_tot);
1158         hw_opt->nthreads_omp = 1;
1161     if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
1163         hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
1168         fprintf(debug,"hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
1169                 hw_opt->nthreads_tot,
1170                 hw_opt->nthreads_tmpi,
1171                 hw_opt->nthreads_omp,
1172                 hw_opt->nthreads_omp_pme,
1173                 hw_opt->gpu_id!=NULL ? hw_opt->gpu_id : "");
/* If -nsteps was given on the command line (any value > the default -2),
 * overrides ir->nsteps and logs a warning; for dynamical integrators the
 * message also reports the resulting simulation time in ps.
 * Listing elided: the nsteps parameter declaration and some braces are
 * missing from this view. */
1179 /* Override the value in inputrec with value passed on the command line (if any) */
1180 static void override_nsteps_cmdline(FILE *fplog,
1183                                     const t_commrec *cr)
1188     /* override with anything else than the default -2 */
1189     if (nsteps_cmdline > -2)
1193         ir->nsteps = nsteps_cmdline;
1194         if (EI_DYNAMICS(ir->eI))
1196             sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps, %.3f ps",
1197                     nsteps_cmdline, nsteps_cmdline*ir->delta_t);
1201             sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps",
1205         md_print_warn(cr, fplog, "%s\n", stmp);
/* Broadcast payload from the simulation master to all nodes before they
 * have read the tpx file / detected hardware.  Listing elided: the struct
 * keyword and typedef lines are not visible here. */
1209 /* Data structure set by SIMMASTER which needs to be passed to all nodes
1210  * before the other nodes have read the tpx file and called gmx_detect_hardware.
1213     int      cutoff_scheme; /* The cutoff scheme from inputrec_t */
1214     gmx_bool bUseGPU;       /* Use GPU or GPU emulation          */
1217 int mdrunner(gmx_hw_opt_t *hw_opt,
1218 FILE *fplog,t_commrec *cr,int nfile,
1219 const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
1220 gmx_bool bCompact, int nstglobalcomm,
1221 ivec ddxyz,int dd_node_order,real rdd,real rconstr,
1222 const char *dddlb_opt,real dlb_scale,
1223 const char *ddcsx,const char *ddcsy,const char *ddcsz,
1224 const char *nbpu_opt,
1225 int nsteps_cmdline, int nstepout,int resetstep,
1226 int nmultisim,int repl_ex_nst,int repl_ex_nex,
1227 int repl_ex_seed, real pforce,real cpt_period,real max_hours,
1228 const char *deviceOptions, unsigned long Flags)
1230 gmx_bool bForceUseGPU,bTryUseGPU;
1231 double nodetime=0,realtime;
1232 t_inputrec *inputrec;
1233 t_state *state=NULL;
1235 gmx_ddbox_t ddbox={0};
1236 int npme_major,npme_minor;
1239 gmx_mtop_t *mtop=NULL;
1240 t_mdatoms *mdatoms=NULL;
1241 t_forcerec *fr=NULL;
1244 gmx_pme_t *pmedata=NULL;
1245 gmx_vsite_t *vsite=NULL;
1246 gmx_constr_t constr;
1247 int i,m,nChargePerturbed=-1,status,nalloc;
1249 gmx_wallcycle_t wcycle;
1250 gmx_bool bReadRNG,bReadEkin;
1252 gmx_runtime_t runtime;
1254 gmx_large_int_t reset_counters;
1255 gmx_edsam_t ed=NULL;
1256 t_commrec *cr_old=cr;
1259 gmx_membed_t membed=NULL;
1260 gmx_hw_info_t *hwinfo=NULL;
1261 master_inf_t minf={-1,FALSE};
1263 /* CAUTION: threads may be started later on in this function, so
1264 cr doesn't reflect the final parallel state right now */
1268 if (Flags & MD_APPENDFILES)
1273 bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
1274 bTryUseGPU = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
1279 /* Read (nearly) all data required for the simulation */
1280 read_tpx_state(ftp2fn(efTPX,nfile,fnm),inputrec,state,NULL,mtop);
1282 if (inputrec->cutoff_scheme != ecutsVERLET &&
1283 ((Flags & MD_TESTVERLET) || getenv("GMX_VERLET_SCHEME") != NULL))
1285 convert_to_verlet_scheme(fplog,inputrec,mtop,det(state->box));
1288 /* Detect hardware, gather information. With tMPI only thread 0 does it
1289 * and after threads are started broadcasts hwinfo around. */
1291 gmx_detect_hardware(fplog, hwinfo, cr,
1292 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
1294 minf.cutoff_scheme = inputrec->cutoff_scheme;
1295 minf.bUseGPU = FALSE;
1297 if (inputrec->cutoff_scheme == ecutsVERLET)
1299 prepare_verlet_scheme(fplog,hwinfo,cr,hw_opt,nbpu_opt,
1300 inputrec,mtop,state->box,
1303 else if (hwinfo->bCanUseGPU)
1305 md_print_warn(cr,fplog,
1306 "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
1307 " To use a GPU, set the mdp option: cutoff-scheme = Verlet\n"
1308 " (for quick performance testing you can use the -testverlet option)\n");
1312 gmx_fatal(FARGS,"GPU requested, but can't be used without cutoff-scheme=Verlet");
1316 #ifndef GMX_THREAD_MPI
1319 gmx_bcast_sim(sizeof(minf),&minf,cr);
1322 if (minf.bUseGPU && cr->npmenodes == -1)
1324 /* Don't automatically use PME-only nodes with GPUs */
1328 /* Check for externally set OpenMP affinity and turn off internal
1329 * pinning if any is found. We need to do this check early to tell
1330 * thread-MPI whether it should do pinning when spawning threads.
1332 gmx_omp_check_thread_affinity(fplog, cr, hw_opt);
1334 #ifdef GMX_THREAD_MPI
1335 /* With thread-MPI inputrec is only set here on the master thread */
1339 check_and_update_hw_opt(hw_opt,minf.cutoff_scheme,SIMMASTER(cr));
1341 #ifdef GMX_THREAD_MPI
1342 /* Early check for externally set process affinity. Can't do over all
1343 * MPI processes because hwinfo is not available everywhere, but with
1344 * thread-MPI it's needed as pinning might get turned off which needs
1345 * to be known before starting thread-MPI. */
1346 check_cpu_affinity_set(fplog,
1348 hw_opt, hwinfo->nthreads_hw_avail, FALSE);
1351 #ifdef GMX_THREAD_MPI
1352 if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
1354 gmx_fatal(FARGS,"You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME nodes");
1358 if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
1361 gmx_fatal(FARGS,"You need to explicitly specify the number of PME nodes (-npme) when using different number of OpenMP threads for PP and PME nodes");
1365 #ifdef GMX_THREAD_MPI
1368 /* NOW the threads will be started: */
1369 hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
1373 if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
1375 hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
1378 if (hw_opt->nthreads_tmpi > 1)
1380 /* now start the threads. */
1381 cr=mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm,
1382 oenv, bVerbose, bCompact, nstglobalcomm,
1383 ddxyz, dd_node_order, rdd, rconstr,
1384 dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
1386 nsteps_cmdline, nstepout, resetstep, nmultisim,
1387 repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
1388 cpt_period, max_hours, deviceOptions,
1390 /* the main thread continues here with a new cr. We don't deallocate
1391 the old cr because other threads may still be reading it. */
1394 gmx_comm("Failed to spawn threads");
1399 /* END OF CAUTION: cr is now reliable */
1401 /* g_membed initialisation *
1402 * Because we change the mtop, init_membed is called before the init_parallel *
1403 * (in case we ever want to make it run in parallel) */
1404 if (opt2bSet("-membed",nfile,fnm))
1408 fprintf(stderr,"Initializing membed");
1410 membed = init_membed(fplog,nfile,fnm,mtop,inputrec,state,cr,&cpt_period);
1415 /* now broadcast everything to the non-master nodes/threads: */
1416 init_parallel(fplog, cr, inputrec, mtop);
1418 /* This check needs to happen after get_nthreads_mpi() */
1419 if (inputrec->cutoff_scheme == ecutsVERLET && (Flags & MD_PARTDEC))
1421 gmx_fatal_collective(FARGS,cr,NULL,
1422 "The Verlet cut-off scheme is not supported with particle decomposition.\n"
1423 "You can achieve the same effect as particle decomposition by running in parallel using only OpenMP threads.");
1428 pr_inputrec(fplog,0,"Input Parameters",inputrec,FALSE);
1431 #if defined GMX_THREAD_MPI
1432 /* With tMPI we detected on thread 0 and we'll just pass the hwinfo pointer
1433 * to the other threads -- slightly uncool, but works fine, just need to
1434 * make sure that the data doesn't get freed twice. */
1441 gmx_bcast(sizeof(&hwinfo), &hwinfo, cr);
1444 if (PAR(cr) && !SIMMASTER(cr))
1446 /* now we have inputrec on all nodes, can run the detection */
1447 /* TODO: perhaps it's better to propagate within a node instead? */
1449 gmx_detect_hardware(fplog, hwinfo, cr,
1450 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
1453 /* Now do the affinity check with MPI/no-MPI (done earlier with thread-MPI). */
1454 check_cpu_affinity_set(fplog, cr,
1455 hw_opt, hwinfo->nthreads_hw_avail, FALSE);
1458 /* now make sure the state is initialized and propagated */
1459 set_state_entries(state,inputrec,cr->nnodes);
1461 /* A parallel command line option consistency check that we can
1462 only do after any threads have started. */
1464 (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
1467 "The -dd or -npme option request a parallel simulation, "
1469 "but %s was compiled without threads or MPI enabled"
1471 #ifdef GMX_THREAD_MPI
1472 "but the number of threads (option -nt) is 1"
1474 "but %s was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec"
1481 if ((Flags & MD_RERUN) &&
1482 (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
1484 gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
1487 if (can_use_allvsall(inputrec,mtop,TRUE,cr,fplog) && PAR(cr))
1489 /* All-vs-all loops do not work with domain decomposition */
1490 Flags |= MD_PARTDEC;
1493 if (!EEL_PME(inputrec->coulombtype) || (Flags & MD_PARTDEC))
1495 if (cr->npmenodes > 0)
1497 if (!EEL_PME(inputrec->coulombtype))
1499 gmx_fatal_collective(FARGS,cr,NULL,
1500 "PME nodes are requested, but the system does not use PME electrostatics");
1502 if (Flags & MD_PARTDEC)
1504 gmx_fatal_collective(FARGS,cr,NULL,
1505 "PME nodes are requested, but particle decomposition does not support separate PME nodes");
1513 fcRegisterSteps(inputrec->nsteps,inputrec->init_step);
1516 /* NMR restraints must be initialized before load_checkpoint,
1517 * since with time averaging the history is added to t_state.
1518 * For proper consistency check we therefore need to extend
1520 * So the PME-only nodes (if present) will also initialize
1521 * the distance restraints.
1525 /* This needs to be called before read_checkpoint to extend the state */
1526 init_disres(fplog,mtop,inputrec,cr,Flags & MD_PARTDEC,fcd,state, repl_ex_nst > 0);
1528 if (gmx_mtop_ftype_count(mtop,F_ORIRES) > 0)
1530 if (PAR(cr) && !(Flags & MD_PARTDEC))
1532 gmx_fatal(FARGS,"Orientation restraints do not work (yet) with domain decomposition, use particle decomposition (mdrun option -pd)");
1534 /* Orientation restraints */
1537 init_orires(fplog,mtop,state->x,inputrec,cr->ms,&(fcd->orires),
1542 if (DEFORM(*inputrec))
1544 /* Store the deform reference box before reading the checkpoint */
1547 copy_mat(state->box,box);
1551 gmx_bcast(sizeof(box),box,cr);
1553 /* Because we do not have the update struct available yet
1554 * in which the reference values should be stored,
1555 * we store them temporarily in static variables.
1556 * This should be thread safe, since they are only written once
1557 * and with identical values.
1559 #ifdef GMX_THREAD_MPI
1560 tMPI_Thread_mutex_lock(&deform_init_box_mutex);
1562 deform_init_init_step_tpx = inputrec->init_step;
1563 copy_mat(box,deform_init_box_tpx);
1564 #ifdef GMX_THREAD_MPI
1565 tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
1569 if (opt2bSet("-cpi",nfile,fnm))
1571 /* Check if checkpoint file exists before doing continuation.
1572 * This way we can use identical input options for the first and subsequent runs...
1574 if( gmx_fexist_master(opt2fn_master("-cpi",nfile,fnm,cr),cr) )
1576 load_checkpoint(opt2fn_master("-cpi",nfile,fnm,cr),&fplog,
1577 cr,Flags & MD_PARTDEC,ddxyz,
1578 inputrec,state,&bReadRNG,&bReadEkin,
1579 (Flags & MD_APPENDFILES),
1580 (Flags & MD_APPENDFILESSET));
1584 Flags |= MD_READ_RNG;
1588 Flags |= MD_READ_EKIN;
1593 if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES))
1594 #ifdef GMX_THREAD_MPI
1595 /* With thread MPI only the master node/thread exists in mdrun.c,
1596 * therefore non-master nodes need to open the "seppot" log file here.
1598 || (!MASTER(cr) && (Flags & MD_SEPPOT))
1602 gmx_log_open(ftp2fn(efLOG,nfile,fnm),cr,!(Flags & MD_SEPPOT),
1606 /* override nsteps with value from cmdline */
1607 override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);
1611 copy_mat(state->box,box);
1616 gmx_bcast(sizeof(box),box,cr);
1619 /* Essential dynamics */
1620 if (opt2bSet("-ei",nfile,fnm))
1622 /* Open input and output files, allocate space for ED data structure */
1623 ed = ed_open(mtop->natoms,&state->edsamstate,nfile,fnm,Flags,oenv,cr);
1626 if (PAR(cr) && !((Flags & MD_PARTDEC) ||
1627 EI_TPI(inputrec->eI) ||
1628 inputrec->eI == eiNM))
1630 cr->dd = init_domain_decomposition(fplog,cr,Flags,ddxyz,rdd,rconstr,
1631 dddlb_opt,dlb_scale,
1635 &ddbox,&npme_major,&npme_minor);
1637 make_dd_communicators(fplog,cr,dd_node_order);
1639 /* Set overallocation to avoid frequent reallocation of arrays */
1640 set_over_alloc_dd(TRUE);
1644 /* PME, if used, is done on all nodes with 1D decomposition */
1646 cr->duty = (DUTY_PP | DUTY_PME);
1649 if (!EI_TPI(inputrec->eI))
1651 npme_major = cr->nnodes;
1654 if (inputrec->ePBC == epbcSCREW)
1657 "pbc=%s is only implemented with domain decomposition",
1658 epbc_names[inputrec->ePBC]);
1664 /* After possible communicator splitting in make_dd_communicators.
1665 * we can set up the intra/inter node communication.
1667 gmx_setup_nodecomm(fplog,cr);
1670 /* Initialize per-physical-node MPI process/thread ID and counters. */
1671 gmx_init_intranode_counters(cr);
1674 md_print_info(cr,fplog,"Using %d MPI %s\n",
1676 #ifdef GMX_THREAD_MPI
1677 cr->nnodes==1 ? "thread" : "threads"
1679 cr->nnodes==1 ? "process" : "processes"
1685 gmx_omp_nthreads_init(fplog, cr,
1686 hwinfo->nthreads_hw_avail,
1687 hw_opt->nthreads_omp,
1688 hw_opt->nthreads_omp_pme,
1689 (cr->duty & DUTY_PP) == 0,
1690 inputrec->cutoff_scheme == ecutsVERLET);
1692 gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi, minf.bUseGPU);
1694 /* getting number of PP/PME threads
1695 PME: env variable should be read only on one node to make sure it is
1696 identical everywhere;
1698 /* TODO nthreads_pp is only used for pinning threads.
1699 * This is a temporary solution until we have a hw topology library.
1701 nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
1702 nthreads_pme = gmx_omp_nthreads_get(emntPME);
1704 wcycle = wallcycle_init(fplog,resetstep,cr,nthreads_pp,nthreads_pme);
1708 /* Master synchronizes its value of reset_counters with all nodes
1709 * including PME only nodes */
1710 reset_counters = wcycle_get_reset_counters(wcycle);
1711 gmx_bcast_sim(sizeof(reset_counters),&reset_counters,cr);
1712 wcycle_set_reset_counters(wcycle, reset_counters);
1716 if (cr->duty & DUTY_PP)
1718 /* For domain decomposition we allocate dynamically
1719 * in dd_partition_system.
1721 if (DOMAINDECOMP(cr))
1723 bcast_state_setup(cr,state);
1729 bcast_state(cr,state,TRUE);
1733 /* Initiate forcerecord */
1735 fr->hwinfo = hwinfo;
1736 init_forcerec(fplog,oenv,fr,fcd,inputrec,mtop,cr,box,FALSE,
1737 opt2fn("-table",nfile,fnm),
1738 opt2fn("-tabletf",nfile,fnm),
1739 opt2fn("-tablep",nfile,fnm),
1740 opt2fn("-tableb",nfile,fnm),
1744 /* version for PCA_NOT_READ_NODE (see md.c) */
1745 /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
1746 "nofile","nofile","nofile","nofile",FALSE,pforce);
1748 fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);
1750 /* Initialize QM-MM */
1753 init_QMMMrec(cr,box,mtop,inputrec,fr);
1756 /* Initialize the mdatoms structure.
1757 * mdatoms is not filled with atom data,
1758 * as this can not be done now with domain decomposition.
1760 mdatoms = init_mdatoms(fplog,mtop,inputrec->efep!=efepNO);
1762 /* Initialize the virtual site communication */
1763 vsite = init_vsite(mtop,cr,FALSE);
1765 calc_shifts(box,fr->shift_vec);
1767 /* With periodic molecules the charge groups should be whole at start up
1768 * and the virtual sites should not be far from their proper positions.
1770 if (!inputrec->bContinuation && MASTER(cr) &&
1771 !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
1773 /* Make molecules whole at start of run */
1774 if (fr->ePBC != epbcNONE)
1776 do_pbc_first_mtop(fplog,inputrec->ePBC,box,mtop,state->x);
1780 /* Correct initial vsite positions are required
1781 * for the initial distribution in the domain decomposition
1782 * and for the initial shell prediction.
1784 construct_vsites_mtop(fplog,vsite,mtop,state->x);
1788 if (EEL_PME(fr->eeltype))
1790 ewaldcoeff = fr->ewaldcoeff;
1791 pmedata = &fr->pmedata;
1800 /* This is a PME only node */
1802 /* We don't need the state */
1805 ewaldcoeff = calc_ewaldcoeff(inputrec->rcoulomb, inputrec->ewald_rtol);
1809 /* Before setting affinity, check whether the affinity has changed
1810 * - which indicates that probably the OpenMP library has changed it since
1811 * we first checked). */
1812 check_cpu_affinity_set(fplog, cr, hw_opt, hwinfo->nthreads_hw_avail, TRUE);
1814 /* Set the CPU affinity */
1815 set_cpu_affinity(fplog,cr,hw_opt,nthreads_pme,hwinfo,inputrec);
1817 /* Initiate PME if necessary,
1818 * either on all nodes or on dedicated PME nodes only. */
1819 if (EEL_PME(inputrec->coulombtype))
1823 nChargePerturbed = mdatoms->nChargePerturbed;
1825 if (cr->npmenodes > 0)
1827 /* The PME only nodes need to know nChargePerturbed */
1828 gmx_bcast_sim(sizeof(nChargePerturbed),&nChargePerturbed,cr);
1831 if (cr->duty & DUTY_PME)
1833 status = gmx_pme_init(pmedata,cr,npme_major,npme_minor,inputrec,
1834 mtop ? mtop->natoms : 0,nChargePerturbed,
1835 (Flags & MD_REPRODUCIBLE),nthreads_pme);
1838 gmx_fatal(FARGS,"Error %d initializing PME",status);
1844 if (integrator[inputrec->eI].func == do_md)
1846 /* Turn on signal handling on all nodes */
1848 * (A user signal from the PME nodes (if any)
1849 * is communicated to the PP nodes.
1851 signal_handler_install();
1854 if (cr->duty & DUTY_PP)
1856 if (inputrec->ePull != epullNO)
1858 /* Initialize pull code */
1859 init_pull(fplog,inputrec,nfile,fnm,mtop,cr,oenv, inputrec->fepvals->init_lambda,
1860 EI_DYNAMICS(inputrec->eI) && MASTER(cr),Flags);
1865 /* Initialize enforced rotation code */
1866 init_rot(fplog,inputrec,nfile,fnm,cr,state->x,box,mtop,oenv,
1870 constr = init_constraints(fplog,mtop,inputrec,ed,state,cr);
1872 if (DOMAINDECOMP(cr))
1874 dd_init_bondeds(fplog,cr->dd,mtop,vsite,constr,inputrec,
1875 Flags & MD_DDBONDCHECK,fr->cginfo_mb);
1877 set_dd_parameters(fplog,cr->dd,dlb_scale,inputrec,fr,&ddbox);
1879 setup_dd_grid(fplog,cr->dd);
1882 /* Now do whatever the user wants us to do (how flexible...) */
1883 integrator[inputrec->eI].func(fplog,cr,nfile,fnm,
1884 oenv,bVerbose,bCompact,
1887 nstepout,inputrec,mtop,
1889 mdatoms,nrnb,wcycle,ed,fr,
1890 repl_ex_nst,repl_ex_nex,repl_ex_seed,
1892 cpt_period,max_hours,
1897 if (inputrec->ePull != epullNO)
1899 finish_pull(fplog,inputrec->pull);
1904 finish_rot(fplog,inputrec->rot);
1911 gmx_pmeonly(*pmedata,cr,nrnb,wcycle,ewaldcoeff,FALSE,inputrec);
1914 if (EI_DYNAMICS(inputrec->eI) || EI_TPI(inputrec->eI))
1916 /* Some timing stats */
1919 if (runtime.proc == 0)
1921 runtime.proc = runtime.real;
1930 wallcycle_stop(wcycle,ewcRUN);
1932 /* Finish up, write some stuff
1933 * if rerunMD, don't write last frame again
1935 finish_run(fplog,cr,ftp2fn(efSTO,nfile,fnm),
1936 inputrec,nrnb,wcycle,&runtime,
1937 fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ?
1938 nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL,
1940 EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
1942 if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU)
1944 char gpu_err_str[STRLEN];
1946 /* free GPU memory and uninitialize GPU (by destroying the context) */
1947 nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
1949 if (!free_gpu(gpu_err_str))
1951 gmx_warning("On node %d failed to free GPU #%d: %s",
1952 cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
1956 if (opt2bSet("-membed",nfile,fnm))
1961 #ifdef GMX_THREAD_MPI
1962 if (PAR(cr) && SIMMASTER(cr))
1965 gmx_hardware_info_free(hwinfo);
1968 /* Does what it says */
1969 print_date_and_time(fplog,cr->nodeid,"Finished mdrun",&runtime);
1971 /* Close logfile already here if we were appending to it */
1972 if (MASTER(cr) && (Flags & MD_APPENDFILES))
1974 gmx_log_close(fplog);
1977 rc=(int)gmx_get_stop_condition();
1979 #ifdef GMX_THREAD_MPI
1980 /* we need to join all threads. The sub-threads join when they
1981 exit this function, but the master thread needs to be told to
1983 if (PAR(cr) && MASTER(cr))