2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team,
6 * check out http://www.gromacs.org for more information.
7 * Copyright (c) 2012,2013, by the GROMACS development team, led by
8 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
9 * others, as listed in the AUTHORS file in the top-level source
10 * directory and at http://www.gromacs.org.
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
41 #if defined(HAVE_SCHED_H) && defined(HAVE_SCHED_GETAFFINITY)
44 #include <sys/syscall.h>
56 #include "md_logging.h"
57 #include "md_support.h"
67 #include "mpelogging.h"
73 #include "checkpoint.h"
74 #include "mtop_util.h"
75 #include "sighandler.h"
78 #include "gmx_detect_hardware.h"
79 #include "gmx_omp_nthreads.h"
80 #include "pull_rotation.h"
81 #include "calc_verletbuf.h"
82 #include "../mdlib/nbnxn_search.h"
83 #include "../mdlib/nbnxn_consts.h"
84 #include "gmx_fatal_collective.h"
86 #include "md_openmm.h"
89 #include "thread_mpi/threads.h"
102 #include "gpu_utils.h"
103 #include "nbnxn_cuda_data_mgmt.h"
106 gmx_integrator_t *func;
109 /* The array should match the eI array in include/types/enums.h */
110 const gmx_intp_t integrator[eiNR] = { {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm},{do_md_openmm}};
112 gmx_large_int_t deform_init_init_step_tpx;
113 matrix deform_init_box_tpx;
114 #ifdef GMX_THREAD_MPI
115 tMPI_Thread_mutex_t deform_init_box_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
119 #ifdef GMX_THREAD_MPI
120 struct mdrunner_arglist
122 gmx_hw_opt_t *hw_opt;
135 const char *dddlb_opt;
140 const char *nbpu_opt;
151 const char *deviceOptions;
153 int ret; /* return value */
157 /* The function used for spawning threads. Extracts the mdrunner()
158 arguments from its one argument and calls mdrunner(), after making
160 static void mdrunner_start_fn(void *arg)
162 struct mdrunner_arglist *mda=(struct mdrunner_arglist*)arg;
163 struct mdrunner_arglist mc=*mda; /* copy the arg list to make sure
164 that it's thread-local. This doesn't
165 copy pointed-to items, of course,
166 but those are all const. */
167 t_commrec *cr; /* we need a local version of this */
171 fnm = dup_tfn(mc.nfile, mc.fnm);
173 cr = init_par_threads(mc.cr);
180 mda->ret=mdrunner(mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv,
181 mc.bVerbose, mc.bCompact, mc.nstglobalcomm,
182 mc.ddxyz, mc.dd_node_order, mc.rdd,
183 mc.rconstr, mc.dddlb_opt, mc.dlb_scale,
184 mc.ddcsx, mc.ddcsy, mc.ddcsz,
186 mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
187 mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce,
188 mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.Flags);
191 /* called by mdrunner() to start a specific number of threads (including
192 the main thread) for thread-parallel runs. This in turn calls mdrunner()
194 All options besides nthreads are the same as for mdrunner(). */
195 static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
196 FILE *fplog,t_commrec *cr,int nfile,
197 const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
198 gmx_bool bCompact, int nstglobalcomm,
199 ivec ddxyz,int dd_node_order,real rdd,real rconstr,
200 const char *dddlb_opt,real dlb_scale,
201 const char *ddcsx,const char *ddcsy,const char *ddcsz,
202 const char *nbpu_opt,
203 int nsteps_cmdline, int nstepout,int resetstep,
204 int nmultisim,int repl_ex_nst,int repl_ex_nex, int repl_ex_seed,
205 real pforce,real cpt_period, real max_hours,
206 const char *deviceOptions, unsigned long Flags)
209 struct mdrunner_arglist *mda;
210 t_commrec *crn; /* the new commrec */
213 /* first check whether we even need to start tMPI */
214 if (hw_opt->nthreads_tmpi < 2)
219 /* a few small, one-time, almost unavoidable memory leaks: */
221 fnmn=dup_tfn(nfile, fnm);
223 /* fill the data structure to pass as void pointer to thread start fn */
230 mda->bVerbose=bVerbose;
231 mda->bCompact=bCompact;
232 mda->nstglobalcomm=nstglobalcomm;
233 mda->ddxyz[XX]=ddxyz[XX];
234 mda->ddxyz[YY]=ddxyz[YY];
235 mda->ddxyz[ZZ]=ddxyz[ZZ];
236 mda->dd_node_order=dd_node_order;
238 mda->rconstr=rconstr;
239 mda->dddlb_opt=dddlb_opt;
240 mda->dlb_scale=dlb_scale;
244 mda->nbpu_opt=nbpu_opt;
245 mda->nsteps_cmdline=nsteps_cmdline;
246 mda->nstepout=nstepout;
247 mda->resetstep=resetstep;
248 mda->nmultisim=nmultisim;
249 mda->repl_ex_nst=repl_ex_nst;
250 mda->repl_ex_nex=repl_ex_nex;
251 mda->repl_ex_seed=repl_ex_seed;
253 mda->cpt_period=cpt_period;
254 mda->max_hours=max_hours;
255 mda->deviceOptions=deviceOptions;
258 /* now spawn new threads that start mdrunner_start_fn(), while
259 the main thread returns */
260 ret=tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi,
261 (hw_opt->bThreadPinning ? TMPI_AFFINITY_ALL_CORES : TMPI_AFFINITY_NONE),
262 mdrunner_start_fn, (void*)(mda) );
263 if (ret!=TMPI_SUCCESS)
266 /* make a new comm_rec to reflect the new situation */
267 crn=init_par_threads(cr);
272 static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
273 const gmx_hw_opt_t *hw_opt,
279 /* There are no separate PME nodes here, as we ensured in
280 * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
281 * and a conditional ensures we would not have ended up here.
282 * Note that separate PME nodes might be switched on later.
286 nthreads_tmpi = ngpu;
287 if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
289 nthreads_tmpi = nthreads_tot;
292 else if (hw_opt->nthreads_omp > 0)
294 /* Here we could oversubscribe, when we do, we issue a warning later */
295 nthreads_tmpi = max(1,nthreads_tot/hw_opt->nthreads_omp);
299 /* TODO choose nthreads_omp based on hardware topology
300 when we have a hardware topology detection library */
301 /* In general, when running up to 4 threads, OpenMP should be faster.
302 * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
303 * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
304 * even on two CPUs it's usually faster (but with many OpenMP threads
305 * it could be faster not to use HT, currently we always use HT).
306 * On Nehalem/Westmere we want to avoid running 16 threads over
307 * two CPUs with HT, so we need a limit<16; thus we use 12.
308 * A reasonable limit for Intel Sandy and Ivy bridge,
309 * not knowing the topology, is 16 threads.
311 const int nthreads_omp_always_faster = 4;
312 const int nthreads_omp_always_faster_Nehalem = 12;
313 const int nthreads_omp_always_faster_SandyBridge = 16;
314 const int first_model_Nehalem = 0x1A;
315 const int first_model_SandyBridge = 0x2A;
316 gmx_bool bIntel_Family6;
319 (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
320 gmx_cpuid_family(hwinfo->cpuid_info) == 6);
322 if (nthreads_tot <= nthreads_omp_always_faster ||
324 ((gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_Nehalem && nthreads_tot <= nthreads_omp_always_faster_Nehalem) ||
325 (gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_SandyBridge && nthreads_tot <= nthreads_omp_always_faster_SandyBridge))))
327 /* Use pure OpenMP parallelization */
332 /* Don't use OpenMP parallelization */
333 nthreads_tmpi = nthreads_tot;
337 return nthreads_tmpi;
341 /* Get the number of threads to use for thread-MPI based on how many
342 * were requested, which algorithms we're using,
343 * and how many particles there are.
344 * At the point we have already called check_and_update_hw_opt.
345 * Thus all options should be internally consistent and consistent
346 * with the hardware, except that ntmpi could be larger than #GPU.
348 static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
349 gmx_hw_opt_t *hw_opt,
350 t_inputrec *inputrec, gmx_mtop_t *mtop,
354 int nthreads_hw,nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu;
355 int min_atoms_per_mpi_thread;
360 if (hw_opt->nthreads_tmpi > 0)
362 /* Trivial, return right away */
363 return hw_opt->nthreads_tmpi;
366 nthreads_hw = hwinfo->nthreads_hw_avail;
368 /* How many total (#tMPI*#OpenMP) threads can we start? */
369 if (hw_opt->nthreads_tot > 0)
371 nthreads_tot_max = hw_opt->nthreads_tot;
375 nthreads_tot_max = nthreads_hw;
378 bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU);
381 ngpu = hwinfo->gpu_info.ncuda_dev_use;
389 get_tmpi_omp_thread_division(hwinfo,hw_opt,nthreads_tot_max,ngpu);
391 if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
393 /* Steps are divided over the nodes iso splitting the atoms */
394 min_atoms_per_mpi_thread = 0;
400 min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU;
404 min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD;
408 /* Check if an algorithm does not support parallel simulation. */
409 if (nthreads_tmpi != 1 &&
410 ( inputrec->eI == eiLBFGS ||
411 inputrec->coulombtype == eelEWALD ) )
415 md_print_warn(cr,fplog,"The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
416 if (hw_opt->nthreads_tmpi > nthreads_tmpi)
418 gmx_fatal(FARGS,"You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
421 else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread)
423 /* the thread number was chosen automatically, but there are too many
424 threads (too few atoms per thread) */
425 nthreads_new = max(1,mtop->natoms/min_atoms_per_mpi_thread);
427 /* Avoid partial use of Hyper-Threading */
428 if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
429 nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw)
431 nthreads_new = nthreads_hw/2;
434 /* Avoid large prime numbers in the thread count */
435 if (nthreads_new >= 6)
437 /* Use only 6,8,10 with additional factors of 2 */
441 while (3*fac*2 <= nthreads_new)
446 nthreads_new = (nthreads_new/fac)*fac;
451 if (nthreads_new == 5)
457 nthreads_tmpi = nthreads_new;
459 fprintf(stderr,"\n");
460 fprintf(stderr,"NOTE: Parallelization is limited by the small number of atoms,\n");
461 fprintf(stderr," only starting %d thread-MPI threads.\n",nthreads_tmpi);
462 fprintf(stderr," You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
465 return nthreads_tmpi;
467 #endif /* GMX_THREAD_MPI */
470 /* Environment variable for setting nstlist */
471 static const char* NSTLIST_ENVVAR = "GMX_NSTLIST";
472 /* Try to increase nstlist when using a GPU with nstlist less than this */
473 static const int NSTLIST_GPU_ENOUGH = 20;
474 /* Increase nstlist until the non-bonded cost increases more than this factor */
475 static const float NBNXN_GPU_LIST_OK_FAC = 1.25;
476 /* Don't increase nstlist beyond a non-bonded cost increases of this factor */
477 static const float NBNXN_GPU_LIST_MAX_FAC = 1.40;
479 /* Try to increase nstlist when running on a GPU */
480 static void increase_nstlist(FILE *fp,t_commrec *cr,
481 t_inputrec *ir,const gmx_mtop_t *mtop,matrix box)
484 int nstlist_orig,nstlist_prev;
485 verletbuf_list_setup_t ls;
486 real rlist_inc,rlist_ok,rlist_max,rlist_new,rlist_prev;
489 gmx_bool bBox,bDD,bCont;
490 const char *nstl_fmt="\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
491 const char *vbd_err="Can not increase nstlist for GPU run because verlet-buffer-drift is not set or used";
492 const char *box_err="Can not increase nstlist for GPU run because the box is too small";
493 const char *dd_err ="Can not increase nstlist for GPU run because of domain decomposition limitations";
496 /* Number of + nstlist alternative values to try when switching */
497 const int nstl[]={ 20, 25, 40, 50 };
498 #define NNSTL sizeof(nstl)/sizeof(nstl[0])
500 env = getenv(NSTLIST_ENVVAR);
505 fprintf(fp,nstl_fmt,ir->nstlist);
509 if (ir->verletbuf_drift == 0)
511 gmx_fatal(FARGS,"You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
514 if (ir->verletbuf_drift < 0)
518 fprintf(stderr,"%s\n",vbd_err);
522 fprintf(fp,"%s\n",vbd_err);
528 nstlist_orig = ir->nstlist;
531 sprintf(buf,"Getting nstlist from environment variable GMX_NSTLIST=%s",env);
534 fprintf(stderr,"%s\n",buf);
538 fprintf(fp,"%s\n",buf);
540 sscanf(env,"%d",&ir->nstlist);
543 verletbuf_get_list_setup(TRUE,&ls);
545 /* Allow rlist to make the list double the size of the cut-off sphere */
546 rlist_inc = nbnxn_get_rlist_effective_inc(NBNXN_GPU_CLUSTER_SIZE,mtop->natoms/det(box));
547 rlist_ok = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_OK_FAC,1.0/3.0) - rlist_inc;
548 rlist_max = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_MAX_FAC,1.0/3.0) - rlist_inc;
551 fprintf(debug,"GPU nstlist tuning: rlist_inc %.3f rlist_max %.3f\n",
552 rlist_inc,rlist_max);
556 nstlist_prev = nstlist_orig;
557 rlist_prev = ir->rlist;
562 ir->nstlist = nstl[i];
565 /* Set the pair-list buffer size in ir */
566 calc_verlet_buffer_size(mtop,det(box),ir,ir->verletbuf_drift,&ls,
569 /* Does rlist fit in the box? */
570 bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC,box));
572 if (bBox && DOMAINDECOMP(cr))
574 /* Check if rlist fits in the domain decomposition */
575 if (inputrec2nboundeddim(ir) < DIM)
577 gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
579 copy_mat(box,state_tmp.box);
580 bDD = change_dd_cutoff(cr,&state_tmp,ir,rlist_new);
587 if (bBox && bDD && rlist_new <= rlist_max)
589 /* Increase nstlist */
590 nstlist_prev = ir->nstlist;
591 rlist_prev = rlist_new;
592 bCont = (i+1 < NNSTL && rlist_new < rlist_ok);
596 /* Stick with the previous nstlist */
597 ir->nstlist = nstlist_prev;
598 rlist_new = rlist_prev;
610 gmx_warning(!bBox ? box_err : dd_err);
613 fprintf(fp,"\n%s\n",bBox ? box_err : dd_err);
615 ir->nstlist = nstlist_orig;
617 else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
619 sprintf(buf,"Changing nstlist from %d to %d, rlist from %g to %g",
620 nstlist_orig,ir->nstlist,
621 ir->rlist,rlist_new);
624 fprintf(stderr,"%s\n\n",buf);
628 fprintf(fp,"%s\n\n",buf);
630 ir->rlist = rlist_new;
631 ir->rlistlong = rlist_new;
635 static void prepare_verlet_scheme(FILE *fplog,
636 gmx_hw_info_t *hwinfo,
638 gmx_hw_opt_t *hw_opt,
639 const char *nbpu_opt,
641 const gmx_mtop_t *mtop,
645 /* Here we only check for GPU usage on the MPI master process,
646 * as here we don't know how many GPUs we will use yet.
647 * We check for a GPU on all processes later.
649 *bUseGPU = hwinfo->bCanUseGPU || (getenv("GMX_EMULATE_GPU") != NULL);
651 if (ir->verletbuf_drift > 0)
653 /* Update the Verlet buffer size for the current run setup */
654 verletbuf_list_setup_t ls;
657 /* Here we assume CPU acceleration is on. But as currently
658 * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
659 * and 4x2 gives a larger buffer than 4x4, this is ok.
661 verletbuf_get_list_setup(*bUseGPU,&ls);
663 calc_verlet_buffer_size(mtop,det(box),ir,
664 ir->verletbuf_drift,&ls,
666 if (rlist_new != ir->rlist)
670 fprintf(fplog,"\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
672 ls.cluster_size_i,ls.cluster_size_j);
674 ir->rlist = rlist_new;
675 ir->rlistlong = rlist_new;
679 /* With GPU or emulation we should check nstlist for performance */
680 if ((EI_DYNAMICS(ir->eI) &&
682 ir->nstlist < NSTLIST_GPU_ENOUGH) ||
683 getenv(NSTLIST_ENVVAR) != NULL)
685 /* Choose a better nstlist */
686 increase_nstlist(fplog,cr,ir,mtop,box);
690 static void convert_to_verlet_scheme(FILE *fplog,
692 gmx_mtop_t *mtop,real box_vol)
694 char *conv_mesg="Converting input file with group cut-off scheme to the Verlet cut-off scheme";
696 md_print_warn(NULL,fplog,"%s\n",conv_mesg);
698 ir->cutoff_scheme = ecutsVERLET;
699 ir->verletbuf_drift = 0.005;
701 if (ir->rcoulomb != ir->rvdw)
703 gmx_fatal(FARGS,"The VdW and Coulomb cut-offs are different, whereas the Verlet scheme only supports equal cut-offs");
706 if (ir->vdwtype == evdwUSER || EEL_USER(ir->coulombtype))
708 gmx_fatal(FARGS,"User non-bonded potentials are not (yet) supported with the Verlet scheme");
710 else if (EVDW_SWITCHED(ir->vdwtype) || EEL_SWITCHED(ir->coulombtype))
712 md_print_warn(NULL,fplog,"Converting switched or shifted interactions to a shifted potential (without force shift), this will lead to slightly different interaction potentials");
714 if (EVDW_SWITCHED(ir->vdwtype))
716 ir->vdwtype = evdwCUT;
718 if (EEL_SWITCHED(ir->coulombtype))
720 if (EEL_FULL(ir->coulombtype))
722 /* With full electrostatic only PME can be switched */
723 ir->coulombtype = eelPME;
727 md_print_warn(NULL,fplog,"NOTE: Replacing %s electrostatics with reaction-field with epsilon-rf=inf\n",eel_names[ir->coulombtype]);
728 ir->coulombtype = eelRF;
729 ir->epsilon_rf = 0.0;
733 /* We set the target energy drift to a small number.
734 * Note that this is only for testing. For production the user
735 * should think about this and set the mdp options.
737 ir->verletbuf_drift = 1e-4;
740 if (inputrec2nboundeddim(ir) != 3)
742 gmx_fatal(FARGS,"Can only convert old tpr files to the Verlet cut-off scheme with 3D pbc");
745 if (ir->efep != efepNO || ir->implicit_solvent != eisNO)
747 gmx_fatal(FARGS,"Will not convert old tpr files to the Verlet cut-off scheme with free-energy calculations or implicit solvent");
750 if (EI_DYNAMICS(ir->eI) && !(EI_MD(ir->eI) && ir->etc == etcNO))
752 verletbuf_list_setup_t ls;
754 verletbuf_get_list_setup(FALSE,&ls);
755 calc_verlet_buffer_size(mtop,box_vol,ir,ir->verletbuf_drift,&ls,
760 ir->verletbuf_drift = -1;
761 ir->rlist = 1.05*max(ir->rvdw,ir->rcoulomb);
764 gmx_mtop_remove_chargegroups(mtop);
767 /* Check the process affinity mask. If it is non-zero, something
768 * else has set the affinity, and mdrun should honor that and
769 * not attempt to do its own thread pinning.
771 * This function should be called twice. Once before the OpenMP
772 * library gets initialized with bAfterOpenMPInit=FALSE (which will
773 * detect affinity set by external tools like taskset), and again
774 * later, after the OpenMP initialization, with bAfterOpenMPInit=TRUE
775 * (which will detect affinity changes made by the OpenMP library).
777 * Note that this will only work on Linux, because we use a GNU
779 static void check_cpu_affinity_set(FILE *fplog, const t_commrec *cr,
780 gmx_hw_opt_t *hw_opt, int ncpus,
781 gmx_bool bAfterOpenmpInit)
783 #ifdef HAVE_SCHED_GETAFFINITY
784 cpu_set_t mask_current;
785 int i, ret, cpu_count, cpu_set;
789 if (!hw_opt->bThreadPinning)
791 /* internal affinity setting is off, don't bother checking process affinity */
795 CPU_ZERO(&mask_current);
796 if ((ret = sched_getaffinity(0, sizeof(cpu_set_t), &mask_current)) != 0)
798 /* failed to query affinity mask, will just return */
801 fprintf(debug, "Failed to query affinity mask (error %d)", ret);
806 /* Before proceeding with the actual check, make sure that the number of
807 * detected CPUs is >= the CPUs in the current set.
808 * We need to check for CPU_COUNT as it was added only in glibc 2.6. */
810 if (ncpus < CPU_COUNT(&mask_current))
814 fprintf(debug, "%d CPUs detected, but %d was returned by CPU_COUNT",
815 ncpus, CPU_COUNT(&mask_current));
819 #endif /* CPU_COUNT */
822 for (i = 0; (i < ncpus && i < CPU_SETSIZE); i++)
824 bAllSet = bAllSet && (CPU_ISSET(i, &mask_current) != 0);
829 if (!bAfterOpenmpInit)
831 md_print_warn(cr, fplog,
832 "%s detected a non-default process affinity, "
833 "so it will not attempt to pin its threads", ShortProgram());
837 md_print_warn(cr, fplog,
838 "%s detected a non-default process affinity, "
839 "probably set by the OpenMP library, "
840 "so it will not attempt to pin its threads", ShortProgram());
842 hw_opt->bThreadPinning = FALSE;
846 fprintf(debug, "Non-default affinity mask found, mdrun will not pin threads\n");
853 fprintf(debug, "Default affinity mask found\n");
856 #endif /* HAVE_SCHED_GETAFFINITY */
859 /* Set CPU affinity. Can be important for performance.
860 On some systems (e.g. Cray) CPU Affinity is set by default.
861 But default assigning doesn't work (well) with only some ranks
862 having threads. This causes very low performance.
863 External tools have cumbersome syntax for setting affinity
864 in the case that only some ranks have threads.
865 Thus it is important that GROMACS sets the affinity internally
866 if only PME is using threads.
868 static void set_cpu_affinity(FILE *fplog,
870 gmx_hw_opt_t *hw_opt,
872 const gmx_hw_info_t *hwinfo,
873 const t_inputrec *inputrec)
875 #if defined GMX_THREAD_MPI
876 /* With the number of TMPI threads equal to the number of cores
877 * we already pinned in thread-MPI, so don't pin again here.
879 if (hw_opt->nthreads_tmpi == tMPI_Thread_get_hw_number())
886 /* If the tMPI thread affinity setting is not supported encourage the user
887 * to report it as it's either a bug or an exotic platform which we might
888 * want to support. */
889 if (tMPI_Thread_setaffinity_support() != TMPI_SETAFFINITY_SUPPORT_YES)
891 md_print_warn(NULL, fplog,
892 "Can not set thread affinities on the current plarform. On NUMA systems this\n"
893 "can cause performance degradation. If you think your platform should support\n"
894 "setting affinities, contact the GROMACS developers.");
897 #endif /* __APPLE__ */
899 if (hw_opt->bThreadPinning)
901 int nth_affinity_set, thread_id_node, thread_id,
902 nthread_local, nthread_node, nthread_hw_max, nphyscore;
906 /* threads on this MPI process or TMPI thread */
907 if (cr->duty & DUTY_PP)
909 nthread_local = gmx_omp_nthreads_get(emntNonbonded);
913 nthread_local = gmx_omp_nthreads_get(emntPME);
916 /* map the current process to cores */
918 nthread_node = nthread_local;
920 if (PAR(cr) || MULTISIM(cr))
922 /* We need to determine a scan of the thread counts in this
927 MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),cr->rank_intranode,
929 MPI_Scan(&nthread_local,&thread_id_node,1,MPI_INT,MPI_SUM,comm_intra);
930 /* MPI_Scan is inclusive, but here we need exclusive */
931 thread_id_node -= nthread_local;
932 /* Get the total number of threads on this physical node */
933 MPI_Allreduce(&nthread_local,&nthread_node,1,MPI_INT,MPI_SUM,comm_intra);
934 MPI_Comm_free(&comm_intra);
939 if (hw_opt->core_pinning_offset > 0)
941 offset = hw_opt->core_pinning_offset;
944 fprintf(stderr, "Applying core pinning offset %d\n", offset);
948 fprintf(fplog, "Applying core pinning offset %d\n", offset);
952 /* With Intel Hyper-Threading enabled, we want to pin consecutive
953 * threads to physical cores when using more threads than physical
954 * cores or when the user requests so.
956 nthread_hw_max = hwinfo->nthreads_hw_avail;
958 if (hw_opt->bPinHyperthreading ||
959 (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
960 nthread_node > nthread_hw_max/2 && getenv("GMX_DISABLE_PINHT") == NULL))
962 if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) != GMX_CPUID_X86_SMT_ENABLED)
964 /* We print to stderr on all processes, as we might have
965 * different settings on different physical nodes.
967 if (gmx_cpuid_vendor(hwinfo->cpuid_info) != GMX_CPUID_VENDOR_INTEL)
969 md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
970 "but non-Intel CPU detected (vendor: %s)\n",
971 gmx_cpuid_vendor_string[gmx_cpuid_vendor(hwinfo->cpuid_info)]);
975 md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
976 "but the CPU detected does not have Intel Hyper-Threading support "
977 "(or it is turned off)\n");
980 nphyscore = nthread_hw_max/2;
984 fprintf(stderr, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
989 fprintf(fplog, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
994 /* Set the per-thread affinity. In order to be able to check the success
995 * of affinity settings, we will set nth_affinity_set to 1 on threads
996 * where the affinity setting succeded and to 0 where it failed.
997 * Reducing these 0/1 values over the threads will give the total number
998 * of threads on which we succeeded.
1000 nth_affinity_set = 0;
1001 #pragma omp parallel firstprivate(thread_id_node) num_threads(nthread_local) \
1002 reduction(+:nth_affinity_set)
1005 gmx_bool setaffinity_ret;
1007 thread_id = gmx_omp_get_thread_num();
1008 thread_id_node += thread_id;
1011 core = offset + thread_id_node;
1015 /* Lock pairs of threads to the same hyperthreaded core */
1016 core = offset + thread_id_node/2 + (thread_id_node % 2)*nphyscore;
1019 setaffinity_ret = tMPI_Thread_setaffinity_single(tMPI_Thread_self(), core);
1021 /* store the per-thread success-values of the setaffinity */
1022 nth_affinity_set = (setaffinity_ret == 0);
1026 fprintf(debug, "On rank %2d, thread %2d, core %2d the affinity setting returned %d\n",
1027 cr->nodeid, gmx_omp_get_thread_num(), core, setaffinity_ret);
1031 if (nth_affinity_set > nthread_local)
1035 sprintf(msg, "Looks like we have set affinity for more threads than "
1036 "we have (%d > %d)!\n", nth_affinity_set, nthread_local);
1041 /* check & warn if some threads failed to set their affinities */
1042 if (nth_affinity_set != nthread_local)
1044 char sbuf1[STRLEN], sbuf2[STRLEN];
1046 /* sbuf1 contains rank info, while sbuf2 OpenMP thread info */
1047 sbuf1[0] = sbuf2[0] = '\0';
1049 #ifdef GMX_THREAD_MPI
1050 sprintf(sbuf1, "In thread-MPI thread #%d: ", cr->nodeid);
1051 #else /* GMX_LIB_MPI */
1052 sprintf(sbuf1, "In MPI process #%d: ", cr->nodeid);
1054 #endif /* GMX_MPI */
1056 if (nthread_local > 1)
1058 sprintf(sbuf2, "of %d/%d thread%s ",
1059 nthread_local - nth_affinity_set, nthread_local,
1060 (nthread_local - nth_affinity_set) > 1 ? "s" : "");
1063 md_print_warn(NULL, fplog,
1064 "NOTE: %sAffinity setting %sfailed.\n"
1065 " This can cause performance degradation!",
1073 static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt,
1076 gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp);
1078 #ifndef GMX_THREAD_MPI
1079 if (hw_opt->nthreads_tot > 0)
1081 gmx_fatal(FARGS,"Setting the total number of threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
1083 if (hw_opt->nthreads_tmpi > 0)
1085 gmx_fatal(FARGS,"Setting the number of thread-MPI threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
1089 if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
1091 /* We have the same number of OpenMP threads for PP and PME processes,
1092 * thus we can perform several consistency checks.
1094 if (hw_opt->nthreads_tmpi > 0 &&
1095 hw_opt->nthreads_omp > 0 &&
1096 hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
1098 gmx_fatal(FARGS,"The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
1099 hw_opt->nthreads_tot,hw_opt->nthreads_tmpi,hw_opt->nthreads_omp);
1102 if (hw_opt->nthreads_tmpi > 0 &&
1103 hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
1105 gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
1106 hw_opt->nthreads_tot,hw_opt->nthreads_tmpi);
1109 if (hw_opt->nthreads_omp > 0 &&
1110 hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
1112 gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
1113 hw_opt->nthreads_tot,hw_opt->nthreads_omp);
1116 if (hw_opt->nthreads_tmpi > 0 &&
1117 hw_opt->nthreads_omp <= 0)
1119 hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
1124 if (hw_opt->nthreads_omp > 1)
1126 gmx_fatal(FARGS,"OpenMP threads are requested, but Gromacs was compiled without OpenMP support");
1130 if (cutoff_scheme == ecutsGROUP)
1132 /* We only have OpenMP support for PME only nodes */
1133 if (hw_opt->nthreads_omp > 1)
1135 gmx_fatal(FARGS,"OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
1136 ecutscheme_names[cutoff_scheme],
1137 ecutscheme_names[ecutsVERLET]);
1139 hw_opt->nthreads_omp = 1;
1142 if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
1144 gmx_fatal(FARGS,"You need to specify -ntomp in addition to -ntomp_pme");
1147 if (hw_opt->nthreads_tot == 1)
1149 hw_opt->nthreads_tmpi = 1;
1151 if (hw_opt->nthreads_omp > 1)
1153 gmx_fatal(FARGS,"You requested %d OpenMP threads with %d total threads",
1154 hw_opt->nthreads_tmpi,hw_opt->nthreads_tot);
1156 hw_opt->nthreads_omp = 1;
1159 if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
1161 hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
1166 fprintf(debug,"hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
1167 hw_opt->nthreads_tot,
1168 hw_opt->nthreads_tmpi,
1169 hw_opt->nthreads_omp,
1170 hw_opt->nthreads_omp_pme,
1171 hw_opt->gpu_id!=NULL ? hw_opt->gpu_id : "");
1177 /* Override the value in inputrec with value passed on the command line (if any) */
1178 static void override_nsteps_cmdline(FILE *fplog,
1181 const t_commrec *cr)
1186 /* override with anything else than the default -2 */
1187 if (nsteps_cmdline > -2)
1191 ir->nsteps = nsteps_cmdline;
1192 if (EI_DYNAMICS(ir->eI))
1194 sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps, %.3f ps",
1195 nsteps_cmdline, nsteps_cmdline*ir->delta_t);
1199 sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps",
1203 md_print_warn(cr, fplog, "%s\n", stmp);
1207 /* Data structure set by SIMMASTER which needs to be passed to all nodes
1208 * before the other nodes have read the tpx file and called gmx_detect_hardware.
1211 int cutoff_scheme; /* The cutoff scheme from inputrec_t */
1212 gmx_bool bUseGPU; /* Use GPU or GPU emulation */
1215 int mdrunner(gmx_hw_opt_t *hw_opt,
1216 FILE *fplog,t_commrec *cr,int nfile,
1217 const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
1218 gmx_bool bCompact, int nstglobalcomm,
1219 ivec ddxyz,int dd_node_order,real rdd,real rconstr,
1220 const char *dddlb_opt,real dlb_scale,
1221 const char *ddcsx,const char *ddcsy,const char *ddcsz,
1222 const char *nbpu_opt,
1223 int nsteps_cmdline, int nstepout,int resetstep,
1224 int nmultisim,int repl_ex_nst,int repl_ex_nex,
1225 int repl_ex_seed, real pforce,real cpt_period,real max_hours,
1226 const char *deviceOptions, unsigned long Flags)
1228 gmx_bool bForceUseGPU,bTryUseGPU;
1229 double nodetime=0,realtime;
1230 t_inputrec *inputrec;
1231 t_state *state=NULL;
1233 gmx_ddbox_t ddbox={0};
1234 int npme_major,npme_minor;
1237 gmx_mtop_t *mtop=NULL;
1238 t_mdatoms *mdatoms=NULL;
1239 t_forcerec *fr=NULL;
1242 gmx_pme_t *pmedata=NULL;
1243 gmx_vsite_t *vsite=NULL;
1244 gmx_constr_t constr;
1245 int i,m,nChargePerturbed=-1,status,nalloc;
1247 gmx_wallcycle_t wcycle;
1248 gmx_bool bReadRNG,bReadEkin;
1250 gmx_runtime_t runtime;
1252 gmx_large_int_t reset_counters;
1253 gmx_edsam_t ed=NULL;
1254 t_commrec *cr_old=cr;
1257 gmx_membed_t membed=NULL;
1258 gmx_hw_info_t *hwinfo=NULL;
1259 master_inf_t minf={-1,FALSE};
1261 /* CAUTION: threads may be started later on in this function, so
1262 cr doesn't reflect the final parallel state right now */
1266 if (Flags & MD_APPENDFILES)
1271 bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
1272 bTryUseGPU = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
1277 /* Read (nearly) all data required for the simulation */
1278 read_tpx_state(ftp2fn(efTPX,nfile,fnm),inputrec,state,NULL,mtop);
1280 if (inputrec->cutoff_scheme != ecutsVERLET &&
1281 ((Flags & MD_TESTVERLET) || getenv("GMX_VERLET_SCHEME") != NULL))
1283 convert_to_verlet_scheme(fplog,inputrec,mtop,det(state->box));
1286 /* Detect hardware, gather information. With tMPI only thread 0 does it
1287 * and after threads are started broadcasts hwinfo around. */
1289 gmx_detect_hardware(fplog, hwinfo, cr,
1290 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
1292 minf.cutoff_scheme = inputrec->cutoff_scheme;
1293 minf.bUseGPU = FALSE;
1295 if (inputrec->cutoff_scheme == ecutsVERLET)
1297 prepare_verlet_scheme(fplog,hwinfo,cr,hw_opt,nbpu_opt,
1298 inputrec,mtop,state->box,
1301 else if (hwinfo->bCanUseGPU)
1303 md_print_warn(cr,fplog,
1304 "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
1305 " To use a GPU, set the mdp option: cutoff-scheme = Verlet\n"
1306 " (for quick performance testing you can use the -testverlet option)\n");
1310 gmx_fatal(FARGS,"GPU requested, but can't be used without cutoff-scheme=Verlet");
1314 #ifndef GMX_THREAD_MPI
1317 gmx_bcast_sim(sizeof(minf),&minf,cr);
1320 if (minf.bUseGPU && cr->npmenodes == -1)
1322 /* Don't automatically use PME-only nodes with GPUs */
1326 /* Check for externally set OpenMP affinity and turn off internal
1327 * pinning if any is found. We need to do this check early to tell
1328 * thread-MPI whether it should do pinning when spawning threads.
1330 gmx_omp_check_thread_affinity(fplog, cr, hw_opt);
1332 #ifdef GMX_THREAD_MPI
1333 /* With thread-MPI inputrec is only set here on the master thread */
1337 check_and_update_hw_opt(hw_opt,minf.cutoff_scheme);
1339 #ifdef GMX_THREAD_MPI
1340 /* Early check for externally set process affinity. Can't do over all
1341 * MPI processes because hwinfo is not available everywhere, but with
1342 * thread-MPI it's needed as pinning might get turned off which needs
1343 * to be known before starting thread-MPI. */
1344 check_cpu_affinity_set(fplog,
1346 hw_opt, hwinfo->nthreads_hw_avail, FALSE);
1349 #ifdef GMX_THREAD_MPI
1350 if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
1352 gmx_fatal(FARGS,"You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME nodes");
1356 if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
1359 gmx_fatal(FARGS,"You need to explicitly specify the number of PME nodes (-npme) when using different number of OpenMP threads for PP and PME nodes");
1363 #ifdef GMX_THREAD_MPI
1366 /* NOW the threads will be started: */
1367 hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
1371 if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
1373 hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
1376 if (hw_opt->nthreads_tmpi > 1)
1378 /* now start the threads. */
1379 cr=mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm,
1380 oenv, bVerbose, bCompact, nstglobalcomm,
1381 ddxyz, dd_node_order, rdd, rconstr,
1382 dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
1384 nsteps_cmdline, nstepout, resetstep, nmultisim,
1385 repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
1386 cpt_period, max_hours, deviceOptions,
1388 /* the main thread continues here with a new cr. We don't deallocate
1389 the old cr because other threads may still be reading it. */
1392 gmx_comm("Failed to spawn threads");
1397 /* END OF CAUTION: cr is now reliable */
1399 /* g_membed initialisation *
1400 * Because we change the mtop, init_membed is called before the init_parallel *
1401 * (in case we ever want to make it run in parallel) */
1402 if (opt2bSet("-membed",nfile,fnm))
1406 fprintf(stderr,"Initializing membed");
1408 membed = init_membed(fplog,nfile,fnm,mtop,inputrec,state,cr,&cpt_period);
1413 /* now broadcast everything to the non-master nodes/threads: */
1414 init_parallel(fplog, cr, inputrec, mtop);
1416 /* This check needs to happen after get_nthreads_mpi() */
1417 if (inputrec->cutoff_scheme == ecutsVERLET && (Flags & MD_PARTDEC))
1419 gmx_fatal_collective(FARGS,cr,NULL,
1420 "The Verlet cut-off scheme is not supported with particle decomposition.\n"
1421 "You can achieve the same effect as particle decomposition by running in parallel using only OpenMP threads.");
1426 pr_inputrec(fplog,0,"Input Parameters",inputrec,FALSE);
1429 #if defined GMX_THREAD_MPI
1430 /* With tMPI we detected on thread 0 and we'll just pass the hwinfo pointer
1431 * to the other threads -- slightly uncool, but works fine, just need to
1432 * make sure that the data doesn't get freed twice. */
1439 gmx_bcast(sizeof(&hwinfo), &hwinfo, cr);
1442 if (PAR(cr) && !SIMMASTER(cr))
1444 /* now we have inputrec on all nodes, can run the detection */
1445 /* TODO: perhaps it's better to propagate within a node instead? */
1447 gmx_detect_hardware(fplog, hwinfo, cr,
1448 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
1451 /* Now do the affinity check with MPI/no-MPI (done earlier with thread-MPI). */
1452 check_cpu_affinity_set(fplog, cr,
1453 hw_opt, hwinfo->nthreads_hw_avail, FALSE);
1456 /* now make sure the state is initialized and propagated */
1457 set_state_entries(state,inputrec,cr->nnodes);
1459 /* remove when vv and rerun works correctly! */
1460 if (PAR(cr) && EI_VV(inputrec->eI) && ((Flags & MD_RERUN) || (Flags & MD_RERUN_VSITE)))
1463 "Currently can't do velocity verlet with rerun in parallel.");
1466 /* A parallel command line option consistency check that we can
1467 only do after any threads have started. */
1469 (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
1472 "The -dd or -npme option request a parallel simulation, "
1474 "but %s was compiled without threads or MPI enabled"
1476 #ifdef GMX_THREAD_MPI
1477 "but the number of threads (option -nt) is 1"
1479 "but %s was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec"
1486 if ((Flags & MD_RERUN) &&
1487 (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
1489 gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
1492 if (can_use_allvsall(inputrec,mtop,TRUE,cr,fplog) && PAR(cr))
1494 /* All-vs-all loops do not work with domain decomposition */
1495 Flags |= MD_PARTDEC;
1498 if (!EEL_PME(inputrec->coulombtype) || (Flags & MD_PARTDEC))
1500 if (cr->npmenodes > 0)
1502 if (!EEL_PME(inputrec->coulombtype))
1504 gmx_fatal_collective(FARGS,cr,NULL,
1505 "PME nodes are requested, but the system does not use PME electrostatics");
1507 if (Flags & MD_PARTDEC)
1509 gmx_fatal_collective(FARGS,cr,NULL,
1510 "PME nodes are requested, but particle decomposition does not support separate PME nodes");
1518 fcRegisterSteps(inputrec->nsteps,inputrec->init_step);
1521 /* NMR restraints must be initialized before load_checkpoint,
1522 * since with time averaging the history is added to t_state.
1523 * For proper consistency check we therefore need to extend
1525 * So the PME-only nodes (if present) will also initialize
1526 * the distance restraints.
1530 /* This needs to be called before read_checkpoint to extend the state */
1531 init_disres(fplog,mtop,inputrec,cr,Flags & MD_PARTDEC,fcd,state);
1533 if (gmx_mtop_ftype_count(mtop,F_ORIRES) > 0)
1535 if (PAR(cr) && !(Flags & MD_PARTDEC))
1537 gmx_fatal(FARGS,"Orientation restraints do not work (yet) with domain decomposition, use particle decomposition (mdrun option -pd)");
1539 /* Orientation restraints */
1542 init_orires(fplog,mtop,state->x,inputrec,cr->ms,&(fcd->orires),
1547 if (DEFORM(*inputrec))
1549 /* Store the deform reference box before reading the checkpoint */
1552 copy_mat(state->box,box);
1556 gmx_bcast(sizeof(box),box,cr);
1558 /* Because we do not have the update struct available yet
1559 * in which the reference values should be stored,
1560 * we store them temporarily in static variables.
1561 * This should be thread safe, since they are only written once
1562 * and with identical values.
1564 #ifdef GMX_THREAD_MPI
1565 tMPI_Thread_mutex_lock(&deform_init_box_mutex);
1567 deform_init_init_step_tpx = inputrec->init_step;
1568 copy_mat(box,deform_init_box_tpx);
1569 #ifdef GMX_THREAD_MPI
1570 tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
1574 if (opt2bSet("-cpi",nfile,fnm))
1576 /* Check if checkpoint file exists before doing continuation.
1577 * This way we can use identical input options for the first and subsequent runs...
1579 if( gmx_fexist_master(opt2fn_master("-cpi",nfile,fnm,cr),cr) )
1581 load_checkpoint(opt2fn_master("-cpi",nfile,fnm,cr),&fplog,
1582 cr,Flags & MD_PARTDEC,ddxyz,
1583 inputrec,state,&bReadRNG,&bReadEkin,
1584 (Flags & MD_APPENDFILES),
1585 (Flags & MD_APPENDFILESSET));
1589 Flags |= MD_READ_RNG;
1593 Flags |= MD_READ_EKIN;
1598 if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES))
1599 #ifdef GMX_THREAD_MPI
1600 /* With thread MPI only the master node/thread exists in mdrun.c,
1601 * therefore non-master nodes need to open the "seppot" log file here.
1603 || (!MASTER(cr) && (Flags & MD_SEPPOT))
1607 gmx_log_open(ftp2fn(efLOG,nfile,fnm),cr,!(Flags & MD_SEPPOT),
1611 /* override nsteps with value from cmdline */
1612 override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);
1616 copy_mat(state->box,box);
1621 gmx_bcast(sizeof(box),box,cr);
1624 /* Essential dynamics */
1625 if (opt2bSet("-ei",nfile,fnm))
1627 /* Open input and output files, allocate space for ED data structure */
1628 ed = ed_open(nfile,fnm,Flags,cr);
1631 if (PAR(cr) && !((Flags & MD_PARTDEC) ||
1632 EI_TPI(inputrec->eI) ||
1633 inputrec->eI == eiNM))
1635 cr->dd = init_domain_decomposition(fplog,cr,Flags,ddxyz,rdd,rconstr,
1636 dddlb_opt,dlb_scale,
1640 &ddbox,&npme_major,&npme_minor);
1642 make_dd_communicators(fplog,cr,dd_node_order);
1644 /* Set overallocation to avoid frequent reallocation of arrays */
1645 set_over_alloc_dd(TRUE);
1649 /* PME, if used, is done on all nodes with 1D decomposition */
1651 cr->duty = (DUTY_PP | DUTY_PME);
1654 if (!EI_TPI(inputrec->eI))
1656 npme_major = cr->nnodes;
1659 if (inputrec->ePBC == epbcSCREW)
1662 "pbc=%s is only implemented with domain decomposition",
1663 epbc_names[inputrec->ePBC]);
1669 /* After possible communicator splitting in make_dd_communicators.
1670 * we can set up the intra/inter node communication.
1672 gmx_setup_nodecomm(fplog,cr);
1675 /* Initialize per-physical-node MPI process/thread ID and counters. */
1676 gmx_init_intranode_counters(cr);
1679 md_print_info(cr,fplog,"Using %d MPI %s\n",
1681 #ifdef GMX_THREAD_MPI
1682 cr->nnodes==1 ? "thread" : "threads"
1684 cr->nnodes==1 ? "process" : "processes"
1690 gmx_omp_nthreads_init(fplog, cr,
1691 hwinfo->nthreads_hw_avail,
1692 hw_opt->nthreads_omp,
1693 hw_opt->nthreads_omp_pme,
1694 (cr->duty & DUTY_PP) == 0,
1695 inputrec->cutoff_scheme == ecutsVERLET);
1697 gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi, minf.bUseGPU);
1699 /* getting number of PP/PME threads
1700 PME: env variable should be read only on one node to make sure it is
1701 identical everywhere;
1703 /* TODO nthreads_pp is only used for pinning threads.
1704 * This is a temporary solution until we have a hw topology library.
1706 nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
1707 nthreads_pme = gmx_omp_nthreads_get(emntPME);
1709 wcycle = wallcycle_init(fplog,resetstep,cr,nthreads_pp,nthreads_pme);
1713 /* Master synchronizes its value of reset_counters with all nodes
1714 * including PME only nodes */
1715 reset_counters = wcycle_get_reset_counters(wcycle);
1716 gmx_bcast_sim(sizeof(reset_counters),&reset_counters,cr);
1717 wcycle_set_reset_counters(wcycle, reset_counters);
1721 if (cr->duty & DUTY_PP)
1723 /* For domain decomposition we allocate dynamically
1724 * in dd_partition_system.
1726 if (DOMAINDECOMP(cr))
1728 bcast_state_setup(cr,state);
1734 bcast_state(cr,state,TRUE);
1738 /* Initiate forcerecord */
1740 fr->hwinfo = hwinfo;
1741 init_forcerec(fplog,oenv,fr,fcd,inputrec,mtop,cr,box,FALSE,
1742 opt2fn("-table",nfile,fnm),
1743 opt2fn("-tabletf",nfile,fnm),
1744 opt2fn("-tablep",nfile,fnm),
1745 opt2fn("-tableb",nfile,fnm),
1749 /* version for PCA_NOT_READ_NODE (see md.c) */
1750 /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
1751 "nofile","nofile","nofile","nofile",FALSE,pforce);
1753 fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);
1755 /* Initialize QM-MM */
1758 init_QMMMrec(cr,box,mtop,inputrec,fr);
1761 /* Initialize the mdatoms structure.
1762 * mdatoms is not filled with atom data,
1763 * as this can not be done now with domain decomposition.
1765 mdatoms = init_mdatoms(fplog,mtop,inputrec->efep!=efepNO);
1767 /* Initialize the virtual site communication */
1768 vsite = init_vsite(mtop,cr,FALSE);
1770 calc_shifts(box,fr->shift_vec);
1772 /* With periodic molecules the charge groups should be whole at start up
1773 * and the virtual sites should not be far from their proper positions.
1775 if (!inputrec->bContinuation && MASTER(cr) &&
1776 !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
1778 /* Make molecules whole at start of run */
1779 if (fr->ePBC != epbcNONE)
1781 do_pbc_first_mtop(fplog,inputrec->ePBC,box,mtop,state->x);
1785 /* Correct initial vsite positions are required
1786 * for the initial distribution in the domain decomposition
1787 * and for the initial shell prediction.
1789 construct_vsites_mtop(fplog,vsite,mtop,state->x);
1793 if (EEL_PME(fr->eeltype))
1795 ewaldcoeff = fr->ewaldcoeff;
1796 pmedata = &fr->pmedata;
1805 /* This is a PME only node */
1807 /* We don't need the state */
1810 ewaldcoeff = calc_ewaldcoeff(inputrec->rcoulomb, inputrec->ewald_rtol);
1814 /* Before setting affinity, check whether the affinity has changed
1815 * - which indicates that probably the OpenMP library has changed it since
1816 * we first checked). */
1817 check_cpu_affinity_set(fplog, cr, hw_opt, hwinfo->nthreads_hw_avail, TRUE);
1819 /* Set the CPU affinity */
1820 set_cpu_affinity(fplog,cr,hw_opt,nthreads_pme,hwinfo,inputrec);
1822 /* Initiate PME if necessary,
1823 * either on all nodes or on dedicated PME nodes only. */
1824 if (EEL_PME(inputrec->coulombtype))
1828 nChargePerturbed = mdatoms->nChargePerturbed;
1830 if (cr->npmenodes > 0)
1832 /* The PME only nodes need to know nChargePerturbed */
1833 gmx_bcast_sim(sizeof(nChargePerturbed),&nChargePerturbed,cr);
1836 if (cr->duty & DUTY_PME)
1838 status = gmx_pme_init(pmedata,cr,npme_major,npme_minor,inputrec,
1839 mtop ? mtop->natoms : 0,nChargePerturbed,
1840 (Flags & MD_REPRODUCIBLE),nthreads_pme);
1843 gmx_fatal(FARGS,"Error %d initializing PME",status);
1849 if (integrator[inputrec->eI].func == do_md
1851 integrator[inputrec->eI].func == do_md_openmm
1854 /* Turn on signal handling on all nodes */
1856 * (A user signal from the PME nodes (if any)
1857 * is communicated to the PP nodes.
1859 signal_handler_install();
1862 if (cr->duty & DUTY_PP)
1864 if (inputrec->ePull != epullNO)
1866 /* Initialize pull code */
1867 init_pull(fplog,inputrec,nfile,fnm,mtop,cr,oenv, inputrec->fepvals->init_lambda,
1868 EI_DYNAMICS(inputrec->eI) && MASTER(cr),Flags);
1873 /* Initialize enforced rotation code */
1874 init_rot(fplog,inputrec,nfile,fnm,cr,state->x,box,mtop,oenv,
1878 constr = init_constraints(fplog,mtop,inputrec,ed,state,cr);
1880 if (DOMAINDECOMP(cr))
1882 dd_init_bondeds(fplog,cr->dd,mtop,vsite,constr,inputrec,
1883 Flags & MD_DDBONDCHECK,fr->cginfo_mb);
1885 set_dd_parameters(fplog,cr->dd,dlb_scale,inputrec,fr,&ddbox);
1887 setup_dd_grid(fplog,cr->dd);
1890 /* Now do whatever the user wants us to do (how flexible...) */
1891 integrator[inputrec->eI].func(fplog,cr,nfile,fnm,
1892 oenv,bVerbose,bCompact,
1895 nstepout,inputrec,mtop,
1897 mdatoms,nrnb,wcycle,ed,fr,
1898 repl_ex_nst,repl_ex_nex,repl_ex_seed,
1900 cpt_period,max_hours,
1905 if (inputrec->ePull != epullNO)
1907 finish_pull(fplog,inputrec->pull);
1912 finish_rot(fplog,inputrec->rot);
1919 gmx_pmeonly(*pmedata,cr,nrnb,wcycle,ewaldcoeff,FALSE,inputrec);
1922 if (EI_DYNAMICS(inputrec->eI) || EI_TPI(inputrec->eI))
1924 /* Some timing stats */
1927 if (runtime.proc == 0)
1929 runtime.proc = runtime.real;
1938 wallcycle_stop(wcycle,ewcRUN);
1940 /* Finish up, write some stuff
1941 * if rerunMD, don't write last frame again
1943 finish_run(fplog,cr,ftp2fn(efSTO,nfile,fnm),
1944 inputrec,nrnb,wcycle,&runtime,
1945 fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ?
1946 nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL,
1948 EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
1950 if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU)
1952 char gpu_err_str[STRLEN];
1954 /* free GPU memory and uninitialize GPU (by destroying the context) */
1955 nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
1957 if (!free_gpu(gpu_err_str))
1959 gmx_warning("On node %d failed to free GPU #%d: %s",
1960 cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
1964 if (opt2bSet("-membed",nfile,fnm))
1969 #ifdef GMX_THREAD_MPI
1970 if (PAR(cr) && SIMMASTER(cr))
1973 gmx_hardware_info_free(hwinfo);
1976 /* Does what it says */
1977 print_date_and_time(fplog,cr->nodeid,"Finished mdrun",&runtime);
1979 /* Close logfile already here if we were appending to it */
1980 if (MASTER(cr) && (Flags & MD_APPENDFILES))
1982 gmx_log_close(fplog);
1985 rc=(int)gmx_get_stop_condition();
1987 #ifdef GMX_THREAD_MPI
1988 /* we need to join all threads. The sub-threads join when they
1989 exit this function, but the master thread needs to be told to
1991 if (PAR(cr) && MASTER(cr))