1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
4 * This source code is part of
8 * GROningen MAchine for Chemical Simulations
11 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
12 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
13 * Copyright (c) 2001-2004, The GROMACS development team,
14 * check out http://www.gromacs.org for more information.
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version 2
19 * of the License, or (at your option) any later version.
21 * If you want to redistribute modifications, please consider that
22 * scientific software is very special. Version control is crucial -
23 * bugs must be traceable. We will be happy to consider code for
24 * inclusion in the official distribution, but derived work must not
25 * be called official GROMACS. Details are found in the README & COPYING
26 * files - if they are missing, get the official version at www.gromacs.org.
28 * To help us fund GROMACS development, we humbly ask that you cite
29 * the papers on the package - you can find them in the top README file.
31 * For more info, check our website at http://www.gromacs.org
34 * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
39 #if defined(HAVE_SCHED_H) && defined(HAVE_SCHED_GETAFFINITY)
42 #include <sys/syscall.h>
54 #include "md_logging.h"
55 #include "md_support.h"
65 #include "mpelogging.h"
71 #include "checkpoint.h"
72 #include "mtop_util.h"
73 #include "sighandler.h"
76 #include "gmx_detect_hardware.h"
77 #include "gmx_omp_nthreads.h"
78 #include "pull_rotation.h"
79 #include "calc_verletbuf.h"
80 #include "../mdlib/nbnxn_search.h"
81 #include "../mdlib/nbnxn_consts.h"
82 #include "gmx_fatal_collective.h"
84 #include "md_openmm.h"
87 #include "thread_mpi/threads.h"
101 #include "md_openmm.h"
104 #include "gpu_utils.h"
105 #include "nbnxn_cuda_data_mgmt.h"
108 gmx_integrator_t *func;
111 /* The array should match the eI array in include/types/enums.h */
112 #ifdef GMX_OPENMM /* FIXME do_md_openmm needs fixing */
113 const gmx_intp_t integrator[eiNR] = { {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm},{do_md_openmm}};
115 const gmx_intp_t integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md},{do_md}};
118 gmx_large_int_t deform_init_init_step_tpx;
119 matrix deform_init_box_tpx;
120 #ifdef GMX_THREAD_MPI
121 tMPI_Thread_mutex_t deform_init_box_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
125 #ifdef GMX_THREAD_MPI
126 struct mdrunner_arglist
128 gmx_hw_opt_t *hw_opt;
141 const char *dddlb_opt;
146 const char *nbpu_opt;
157 const char *deviceOptions;
159 int ret; /* return value */
163 /* The function used for spawning threads. Extracts the mdrunner()
164 arguments from its one argument and calls mdrunner(), after making
166 static void mdrunner_start_fn(void *arg)
168 struct mdrunner_arglist *mda=(struct mdrunner_arglist*)arg;
169 struct mdrunner_arglist mc=*mda; /* copy the arg list to make sure
170 that it's thread-local. This doesn't
171 copy pointed-to items, of course,
172 but those are all const. */
173 t_commrec *cr; /* we need a local version of this */
177 fnm = dup_tfn(mc.nfile, mc.fnm);
179 cr = init_par_threads(mc.cr);
186 mda->ret=mdrunner(mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv,
187 mc.bVerbose, mc.bCompact, mc.nstglobalcomm,
188 mc.ddxyz, mc.dd_node_order, mc.rdd,
189 mc.rconstr, mc.dddlb_opt, mc.dlb_scale,
190 mc.ddcsx, mc.ddcsy, mc.ddcsz,
192 mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
193 mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce,
194 mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.Flags);
197 /* called by mdrunner() to start a specific number of threads (including
198 the main thread) for thread-parallel runs. This in turn calls mdrunner()
200 All options besides nthreads are the same as for mdrunner(). */
201 static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
202 FILE *fplog,t_commrec *cr,int nfile,
203 const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
204 gmx_bool bCompact, int nstglobalcomm,
205 ivec ddxyz,int dd_node_order,real rdd,real rconstr,
206 const char *dddlb_opt,real dlb_scale,
207 const char *ddcsx,const char *ddcsy,const char *ddcsz,
208 const char *nbpu_opt,
209 int nsteps_cmdline, int nstepout,int resetstep,
210 int nmultisim,int repl_ex_nst,int repl_ex_nex, int repl_ex_seed,
211 real pforce,real cpt_period, real max_hours,
212 const char *deviceOptions, unsigned long Flags)
215 struct mdrunner_arglist *mda;
216 t_commrec *crn; /* the new commrec */
219 /* first check whether we even need to start tMPI */
220 if (hw_opt->nthreads_tmpi < 2)
225 /* a few small, one-time, almost unavoidable memory leaks: */
227 fnmn=dup_tfn(nfile, fnm);
229 /* fill the data structure to pass as void pointer to thread start fn */
236 mda->bVerbose=bVerbose;
237 mda->bCompact=bCompact;
238 mda->nstglobalcomm=nstglobalcomm;
239 mda->ddxyz[XX]=ddxyz[XX];
240 mda->ddxyz[YY]=ddxyz[YY];
241 mda->ddxyz[ZZ]=ddxyz[ZZ];
242 mda->dd_node_order=dd_node_order;
244 mda->rconstr=rconstr;
245 mda->dddlb_opt=dddlb_opt;
246 mda->dlb_scale=dlb_scale;
250 mda->nbpu_opt=nbpu_opt;
251 mda->nsteps_cmdline=nsteps_cmdline;
252 mda->nstepout=nstepout;
253 mda->resetstep=resetstep;
254 mda->nmultisim=nmultisim;
255 mda->repl_ex_nst=repl_ex_nst;
256 mda->repl_ex_nex=repl_ex_nex;
257 mda->repl_ex_seed=repl_ex_seed;
259 mda->cpt_period=cpt_period;
260 mda->max_hours=max_hours;
261 mda->deviceOptions=deviceOptions;
264 fprintf(stderr, "Starting %d tMPI threads\n",hw_opt->nthreads_tmpi);
266 /* now spawn new threads that start mdrunner_start_fn(), while
267 the main thread returns */
268 ret=tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi,
269 (hw_opt->bThreadPinning ? TMPI_AFFINITY_ALL_CORES : TMPI_AFFINITY_NONE),
270 mdrunner_start_fn, (void*)(mda) );
271 if (ret!=TMPI_SUCCESS)
274 /* make a new comm_rec to reflect the new situation */
275 crn=init_par_threads(cr);
280 static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
281 const gmx_hw_opt_t *hw_opt,
287 /* There are no separate PME nodes here, as we ensured in
288 * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
289 * and a conditional ensures we would not have ended up here.
290 * Note that separate PME nodes might be switched on later.
294 nthreads_tmpi = ngpu;
295 if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
297 nthreads_tmpi = nthreads_tot;
300 else if (hw_opt->nthreads_omp > 0)
302 /* Here we could oversubscribe, when we do, we issue a warning later */
303 nthreads_tmpi = max(1,nthreads_tot/hw_opt->nthreads_omp);
307 /* TODO choose nthreads_omp based on hardware topology
308 when we have a hardware topology detection library */
309 /* In general, when running up to 4 threads, OpenMP should be faster.
310 * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
311 * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
312 * even on two CPUs it's usually faster (but with many OpenMP threads
313 * it could be faster not to use HT, currently we always use HT).
314 * On Nehalem/Westmere we want to avoid running 16 threads over
315 * two CPUs with HT, so we need a limit<16; thus we use 12.
316 * A reasonable limit for Intel Sandy and Ivy bridge,
317 * not knowing the topology, is 16 threads.
319 const int nthreads_omp_always_faster = 4;
320 const int nthreads_omp_always_faster_Nehalem = 12;
321 const int nthreads_omp_always_faster_SandyBridge = 16;
322 const int first_model_Nehalem = 0x1A;
323 const int first_model_SandyBridge = 0x2A;
324 gmx_bool bIntel_Family6;
327 (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
328 gmx_cpuid_family(hwinfo->cpuid_info) == 6);
330 if (nthreads_tot <= nthreads_omp_always_faster ||
332 ((gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_Nehalem && nthreads_tot <= nthreads_omp_always_faster_Nehalem) ||
333 (gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_SandyBridge && nthreads_tot <= nthreads_omp_always_faster_SandyBridge))))
335 /* Use pure OpenMP parallelization */
340 /* Don't use OpenMP parallelization */
341 nthreads_tmpi = nthreads_tot;
345 return nthreads_tmpi;
349 /* Get the number of threads to use for thread-MPI based on how many
350 * were requested, which algorithms we're using,
351 * and how many particles there are.
352 * At the point we have already called check_and_update_hw_opt.
353 * Thus all options should be internally consistent and consistent
354 * with the hardware, except that ntmpi could be larger than #GPU.
356 static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
357 gmx_hw_opt_t *hw_opt,
358 t_inputrec *inputrec, gmx_mtop_t *mtop,
362 int nthreads_hw,nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu;
363 int min_atoms_per_mpi_thread;
368 if (hw_opt->nthreads_tmpi > 0)
370 /* Trivial, return right away */
371 return hw_opt->nthreads_tmpi;
374 nthreads_hw = hwinfo->nthreads_hw_avail;
376 /* How many total (#tMPI*#OpenMP) threads can we start? */
377 if (hw_opt->nthreads_tot > 0)
379 nthreads_tot_max = hw_opt->nthreads_tot;
383 nthreads_tot_max = nthreads_hw;
386 bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU);
389 ngpu = hwinfo->gpu_info.ncuda_dev_use;
397 get_tmpi_omp_thread_division(hwinfo,hw_opt,nthreads_tot_max,ngpu);
399 if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
401 /* Steps are divided over the nodes iso splitting the atoms */
402 min_atoms_per_mpi_thread = 0;
408 min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU;
412 min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD;
416 /* Check if an algorithm does not support parallel simulation. */
417 if (nthreads_tmpi != 1 &&
418 ( inputrec->eI == eiLBFGS ||
419 inputrec->coulombtype == eelEWALD ) )
423 md_print_warn(cr,fplog,"The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
424 if (hw_opt->nthreads_tmpi > nthreads_tmpi)
426 gmx_fatal(FARGS,"You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
429 else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread)
431 /* the thread number was chosen automatically, but there are too many
432 threads (too few atoms per thread) */
433 nthreads_new = max(1,mtop->natoms/min_atoms_per_mpi_thread);
435 /* Avoid partial use of Hyper-Threading */
436 if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
437 nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw)
439 nthreads_new = nthreads_hw/2;
442 /* Avoid large prime numbers in the thread count */
443 if (nthreads_new >= 6)
445 /* Use only 6,8,10 with additional factors of 2 */
449 while (3*fac*2 <= nthreads_new)
454 nthreads_new = (nthreads_new/fac)*fac;
459 if (nthreads_new == 5)
465 nthreads_tmpi = nthreads_new;
467 fprintf(stderr,"\n");
468 fprintf(stderr,"NOTE: Parallelization is limited by the small number of atoms,\n");
469 fprintf(stderr," only starting %d thread-MPI threads.\n",nthreads_tmpi);
470 fprintf(stderr," You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
473 return nthreads_tmpi;
475 #endif /* GMX_THREAD_MPI */
478 /* Environment variable for setting nstlist */
479 static const char* NSTLIST_ENVVAR = "GMX_NSTLIST";
480 /* Try to increase nstlist when using a GPU with nstlist less than this */
481 static const int NSTLIST_GPU_ENOUGH = 20;
482 /* Increase nstlist until the non-bonded cost increases more than this factor */
483 static const float NBNXN_GPU_LIST_OK_FAC = 1.25;
484 /* Don't increase nstlist beyond a non-bonded cost increases of this factor */
485 static const float NBNXN_GPU_LIST_MAX_FAC = 1.40;
487 /* Try to increase nstlist when running on a GPU */
488 static void increase_nstlist(FILE *fp,t_commrec *cr,
489 t_inputrec *ir,const gmx_mtop_t *mtop,matrix box)
492 int nstlist_orig,nstlist_prev;
493 verletbuf_list_setup_t ls;
494 real rlist_inc,rlist_ok,rlist_max,rlist_new,rlist_prev;
497 gmx_bool bBox,bDD,bCont;
498 const char *nstl_fmt="\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
499 const char *vbd_err="Can not increase nstlist for GPU run because verlet-buffer-drift is not set or used";
500 const char *box_err="Can not increase nstlist for GPU run because the box is too small";
501 const char *dd_err ="Can not increase nstlist for GPU run because of domain decomposition limitations";
504 /* Number of + nstlist alternative values to try when switching */
505 const int nstl[]={ 20, 25, 40, 50 };
506 #define NNSTL sizeof(nstl)/sizeof(nstl[0])
508 env = getenv(NSTLIST_ENVVAR);
513 fprintf(fp,nstl_fmt,ir->nstlist);
517 if (ir->verletbuf_drift == 0)
519 gmx_fatal(FARGS,"You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
522 if (ir->verletbuf_drift < 0)
526 fprintf(stderr,"%s\n",vbd_err);
530 fprintf(fp,"%s\n",vbd_err);
536 nstlist_orig = ir->nstlist;
539 sprintf(buf,"Getting nstlist from environment variable GMX_NSTLIST=%s",env);
542 fprintf(stderr,"%s\n",buf);
546 fprintf(fp,"%s\n",buf);
548 sscanf(env,"%d",&ir->nstlist);
551 verletbuf_get_list_setup(TRUE,&ls);
553 /* Allow rlist to make the list double the size of the cut-off sphere */
554 rlist_inc = nbnxn_get_rlist_effective_inc(NBNXN_GPU_CLUSTER_SIZE,mtop->natoms/det(box));
555 rlist_ok = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_OK_FAC,1.0/3.0) - rlist_inc;
556 rlist_max = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_MAX_FAC,1.0/3.0) - rlist_inc;
559 fprintf(debug,"GPU nstlist tuning: rlist_inc %.3f rlist_max %.3f\n",
560 rlist_inc,rlist_max);
564 nstlist_prev = nstlist_orig;
565 rlist_prev = ir->rlist;
570 ir->nstlist = nstl[i];
573 /* Set the pair-list buffer size in ir */
574 calc_verlet_buffer_size(mtop,det(box),ir,ir->verletbuf_drift,&ls,
577 /* Does rlist fit in the box? */
578 bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC,box));
580 if (bBox && DOMAINDECOMP(cr))
582 /* Check if rlist fits in the domain decomposition */
583 if (inputrec2nboundeddim(ir) < DIM)
585 gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
587 copy_mat(box,state_tmp.box);
588 bDD = change_dd_cutoff(cr,&state_tmp,ir,rlist_new);
595 if (bBox && bDD && rlist_new <= rlist_max)
597 /* Increase nstlist */
598 nstlist_prev = ir->nstlist;
599 rlist_prev = rlist_new;
600 bCont = (i+1 < NNSTL && rlist_new < rlist_ok);
604 /* Stick with the previous nstlist */
605 ir->nstlist = nstlist_prev;
606 rlist_new = rlist_prev;
618 gmx_warning(!bBox ? box_err : dd_err);
621 fprintf(fp,"\n%s\n",bBox ? box_err : dd_err);
623 ir->nstlist = nstlist_orig;
625 else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
627 sprintf(buf,"Changing nstlist from %d to %d, rlist from %g to %g",
628 nstlist_orig,ir->nstlist,
629 ir->rlist,rlist_new);
632 fprintf(stderr,"%s\n\n",buf);
636 fprintf(fp,"%s\n\n",buf);
638 ir->rlist = rlist_new;
639 ir->rlistlong = rlist_new;
643 static void prepare_verlet_scheme(FILE *fplog,
644 gmx_hw_info_t *hwinfo,
646 gmx_hw_opt_t *hw_opt,
647 const char *nbpu_opt,
649 const gmx_mtop_t *mtop,
653 /* Here we only check for GPU usage on the MPI master process,
654 * as here we don't know how many GPUs we will use yet.
655 * We check for a GPU on all processes later.
657 *bUseGPU = hwinfo->bCanUseGPU || (getenv("GMX_EMULATE_GPU") != NULL);
659 if (ir->verletbuf_drift > 0)
661 /* Update the Verlet buffer size for the current run setup */
662 verletbuf_list_setup_t ls;
665 /* Here we assume CPU acceleration is on. But as currently
666 * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
667 * and 4x2 gives a larger buffer than 4x4, this is ok.
669 verletbuf_get_list_setup(*bUseGPU,&ls);
671 calc_verlet_buffer_size(mtop,det(box),ir,
672 ir->verletbuf_drift,&ls,
674 if (rlist_new != ir->rlist)
678 fprintf(fplog,"\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
680 ls.cluster_size_i,ls.cluster_size_j);
682 ir->rlist = rlist_new;
683 ir->rlistlong = rlist_new;
687 /* With GPU or emulation we should check nstlist for performance */
688 if ((EI_DYNAMICS(ir->eI) &&
690 ir->nstlist < NSTLIST_GPU_ENOUGH) ||
691 getenv(NSTLIST_ENVVAR) != NULL)
693 /* Choose a better nstlist */
694 increase_nstlist(fplog,cr,ir,mtop,box);
698 static void convert_to_verlet_scheme(FILE *fplog,
700 gmx_mtop_t *mtop,real box_vol)
702 char *conv_mesg="Converting input file with group cut-off scheme to the Verlet cut-off scheme";
704 md_print_warn(NULL,fplog,"%s\n",conv_mesg);
706 ir->cutoff_scheme = ecutsVERLET;
707 ir->verletbuf_drift = 0.005;
709 if (ir->rcoulomb != ir->rvdw)
711 gmx_fatal(FARGS,"The VdW and Coulomb cut-offs are different, whereas the Verlet scheme only supports equal cut-offs");
714 if (ir->vdwtype == evdwUSER || EEL_USER(ir->coulombtype))
716 gmx_fatal(FARGS,"User non-bonded potentials are not (yet) supported with the Verlet scheme");
718 else if (EVDW_SWITCHED(ir->vdwtype) || EEL_SWITCHED(ir->coulombtype))
720 md_print_warn(NULL,fplog,"Converting switched or shifted interactions to a shifted potential (without force shift), this will lead to slightly different interaction potentials");
722 if (EVDW_SWITCHED(ir->vdwtype))
724 ir->vdwtype = evdwCUT;
726 if (EEL_SWITCHED(ir->coulombtype))
728 if (EEL_FULL(ir->coulombtype))
730 /* With full electrostatic only PME can be switched */
731 ir->coulombtype = eelPME;
735 md_print_warn(NULL,fplog,"NOTE: Replacing %s electrostatics with reaction-field with epsilon-rf=inf\n",eel_names[ir->coulombtype]);
736 ir->coulombtype = eelRF;
737 ir->epsilon_rf = 0.0;
741 /* We set the target energy drift to a small number.
742 * Note that this is only for testing. For production the user
743 * should think about this and set the mdp options.
745 ir->verletbuf_drift = 1e-4;
748 if (inputrec2nboundeddim(ir) != 3)
750 gmx_fatal(FARGS,"Can only convert old tpr files to the Verlet cut-off scheme with 3D pbc");
753 if (ir->efep != efepNO || ir->implicit_solvent != eisNO)
755 gmx_fatal(FARGS,"Will not convert old tpr files to the Verlet cut-off scheme with free-energy calculations or implicit solvent");
758 if (EI_DYNAMICS(ir->eI) && !(EI_MD(ir->eI) && ir->etc == etcNO))
760 verletbuf_list_setup_t ls;
762 verletbuf_get_list_setup(FALSE,&ls);
763 calc_verlet_buffer_size(mtop,box_vol,ir,ir->verletbuf_drift,&ls,
768 ir->verletbuf_drift = -1;
769 ir->rlist = 1.05*max(ir->rvdw,ir->rcoulomb);
772 gmx_mtop_remove_chargegroups(mtop);
775 /* Check the process affinity mask and if it is found to be non-zero,
776 * will honor it and disable mdrun internal affinity setting.
777 * This function should be called first before the OpenMP library gets
778 * initialized with the last argument FALSE (which will detect affinity
779 * set by external tools like taskset), and later, after the OpenMP
780 * initialization, with the last argument TRUE to detect affinity changes
781 * made by the OpenMP library.
783 * Note that this will only work on Linux as we use a GNU feature. */
784 static void check_cpu_affinity_set(FILE *fplog, const t_commrec *cr,
785 gmx_hw_opt_t *hw_opt, int ncpus,
786 gmx_bool bAfterOpenmpInit)
788 #ifdef HAVE_SCHED_GETAFFINITY
789 cpu_set_t mask_current;
790 int i, ret, cpu_count, cpu_set;
794 if (!hw_opt->bThreadPinning)
796 /* internal affinity setting is off, don't bother checking process affinity */
800 CPU_ZERO(&mask_current);
801 if ((ret = sched_getaffinity(0, sizeof(cpu_set_t), &mask_current)) != 0)
803 /* failed to query affinity mask, will just return */
806 fprintf(debug, "Failed to query affinity mask (error %d)", ret);
811 /* Before proceeding with the actual check, make sure that the number of
812 * detected CPUs is >= the CPUs in the current set.
813 * We need to check for CPU_COUNT as it was added only in glibc 2.6. */
815 if (ncpus < CPU_COUNT(&mask_current))
819 fprintf(debug, "%d CPUs detected, but %d was returned by CPU_COUNT",
820 ncpus, CPU_COUNT(&mask_current));
824 #endif /* CPU_COUNT */
827 for (i = 0; (i < ncpus && i < CPU_SETSIZE); i++)
829 bAllSet = bAllSet && (CPU_ISSET(i, &mask_current) != 0);
834 if (!bAfterOpenmpInit)
836 md_print_warn(cr, fplog,
837 "Non-default process affinity set, disabling internal affinity");
841 md_print_warn(cr, fplog,
842 "Non-default process affinity set probably by the OpenMP library, "
843 "disabling internal affinity");
845 hw_opt->bThreadPinning = FALSE;
849 fprintf(debug, "Non-default affinity mask found\n");
856 fprintf(debug, "Default affinity mask found\n");
859 #endif /* HAVE_SCHED_GETAFFINITY */
862 /* Set CPU affinity. Can be important for performance.
863 On some systems (e.g. Cray) CPU Affinity is set by default.
864 But default assigning doesn't work (well) with only some ranks
865 having threads. This causes very low performance.
866 External tools have cumbersome syntax for setting affinity
867 in the case that only some ranks have threads.
868 Thus it is important that GROMACS sets the affinity internally
869 if only PME is using threads.
871 static void set_cpu_affinity(FILE *fplog,
873 gmx_hw_opt_t *hw_opt,
875 const gmx_hw_info_t *hwinfo,
876 const t_inputrec *inputrec)
878 #if defined GMX_THREAD_MPI
879 /* With the number of TMPI threads equal to the number of cores
880 * we already pinned in thread-MPI, so don't pin again here.
882 if (hw_opt->nthreads_tmpi == tMPI_Thread_get_hw_number())
889 /* If the tMPI thread affinity setting is not supported encourage the user
890 * to report it as it's either a bug or an exotic platform which we might
891 * want to support. */
892 if (tMPI_Thread_setaffinity_support() != TMPI_SETAFFINITY_SUPPORT_YES)
894 md_print_warn(NULL, fplog,
895 "Can not set thread affinities on the current plarform. On NUMA systems this\n"
896 "can cause performance degradation. If you think your platform should support\n"
897 "setting affinities, contact the GROMACS developers.");
900 #endif /* __APPLE__ */
902 if (hw_opt->bThreadPinning)
904 int nth_affinity_set, thread_id_node, thread_id,
905 nthread_local, nthread_node, nthread_hw_max, nphyscore;
909 /* threads on this MPI process or TMPI thread */
910 if (cr->duty & DUTY_PP)
912 nthread_local = gmx_omp_nthreads_get(emntNonbonded);
916 nthread_local = gmx_omp_nthreads_get(emntPME);
919 /* map the current process to cores */
921 nthread_node = nthread_local;
923 if (PAR(cr) || MULTISIM(cr))
925 /* We need to determine a scan of the thread counts in this
931 process_index = cr->nodeid_intra;
934 /* To simplify the code, we shift process indices by nnodes.
935 * There might be far less processes, but that doesn't matter.
937 process_index += cr->ms->sim*cr->nnodes;
939 MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),process_index,
941 MPI_Scan(&nthread_local,&thread_id_node,1,MPI_INT,MPI_SUM,comm_intra);
942 /* MPI_Scan is inclusive, but here we need exclusive */
943 thread_id_node -= nthread_local;
944 /* Get the total number of threads on this physical node */
945 MPI_Allreduce(&nthread_local,&nthread_node,1,MPI_INT,MPI_SUM,comm_intra);
946 MPI_Comm_free(&comm_intra);
951 if (hw_opt->core_pinning_offset > 0)
953 offset = hw_opt->core_pinning_offset;
956 fprintf(stderr, "Applying core pinning offset %d\n", offset);
960 fprintf(fplog, "Applying core pinning offset %d\n", offset);
964 /* With Intel Hyper-Threading enabled, we want to pin consecutive
965 * threads to physical cores when using more threads than physical
966 * cores or when the user requests so.
968 nthread_hw_max = hwinfo->nthreads_hw_avail;
970 if (hw_opt->bPinHyperthreading ||
971 (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
972 nthread_node > nthread_hw_max/2 && getenv("GMX_DISABLE_PINHT") == NULL))
974 if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) != GMX_CPUID_X86_SMT_ENABLED)
976 /* We print to stderr on all processes, as we might have
977 * different settings on different physical nodes.
979 if (gmx_cpuid_vendor(hwinfo->cpuid_info) != GMX_CPUID_VENDOR_INTEL)
981 md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
982 "but non-Intel CPU detected (vendor: %s)\n",
983 gmx_cpuid_vendor_string[gmx_cpuid_vendor(hwinfo->cpuid_info)]);
987 md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
988 "but the CPU detected does not have Intel Hyper-Threading support "
989 "(or it is turned off)\n");
992 nphyscore = nthread_hw_max/2;
996 fprintf(stderr, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
1001 fprintf(fplog, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
1006 /* Set the per-thread affinity. In order to be able to check the success
1007 * of affinity settings, we will set nth_affinity_set to 1 on threads
1008 * where the affinity setting succeded and to 0 where it failed.
1009 * Reducing these 0/1 values over the threads will give the total number
1010 * of threads on which we succeeded.
1012 nth_affinity_set = 0;
1013 #pragma omp parallel firstprivate(thread_id_node) num_threads(nthread_local) \
1014 reduction(+:nth_affinity_set)
1017 gmx_bool setaffinity_ret;
1019 thread_id = gmx_omp_get_thread_num();
1020 thread_id_node += thread_id;
1023 core = offset + thread_id_node;
1027 /* Lock pairs of threads to the same hyperthreaded core */
1028 core = offset + thread_id_node/2 + (thread_id_node % 2)*nphyscore;
1031 setaffinity_ret = tMPI_Thread_setaffinity_single(tMPI_Thread_self(), core);
1033 /* store the per-thread success-values of the setaffinity */
1034 nth_affinity_set = (setaffinity_ret == 0);
1038 fprintf(debug, "On node %d, thread %d the affinity setting returned %d\n",
1039 cr->nodeid, gmx_omp_get_thread_num(), setaffinity_ret);
1043 if (nth_affinity_set > nthread_local)
1047 sprintf(msg, "Looks like we have set affinity for more threads than "
1048 "we have (%d > %d)!\n", nth_affinity_set, nthread_local);
1053 /* check if some threads failed to set their affinities */
1054 if (nth_affinity_set != nthread_local)
1059 #ifdef GMX_THREAD_MPI
1060 sprintf(sbuf, "In thread-MPI thread #%d", cr->nodeid);
1061 #else /* GMX_LIB_MPI */
1063 sprintf(sbuf, "In MPI process #%d", cr->nodeid);
1064 #endif /* GMX_MPI */
1065 md_print_warn(NULL, fplog,
1066 "%s%d/%d thread%s failed to set their affinities. "
1067 "This can cause performance degradation!",
1068 sbuf, nthread_local - nth_affinity_set, nthread_local,
1069 (nthread_local - nth_affinity_set) > 1 ? "s" : "");
1076 static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt,
1079 gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp);
1081 #ifndef GMX_THREAD_MPI
1082 if (hw_opt->nthreads_tot > 0)
1084 gmx_fatal(FARGS,"Setting the total number of threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
1086 if (hw_opt->nthreads_tmpi > 0)
1088 gmx_fatal(FARGS,"Setting the number of thread-MPI threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
1092 if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
1094 /* We have the same number of OpenMP threads for PP and PME processes,
1095 * thus we can perform several consistency checks.
1097 if (hw_opt->nthreads_tmpi > 0 &&
1098 hw_opt->nthreads_omp > 0 &&
1099 hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
1101 gmx_fatal(FARGS,"The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
1102 hw_opt->nthreads_tot,hw_opt->nthreads_tmpi,hw_opt->nthreads_omp);
1105 if (hw_opt->nthreads_tmpi > 0 &&
1106 hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
1108 gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
1109 hw_opt->nthreads_tot,hw_opt->nthreads_tmpi);
1112 if (hw_opt->nthreads_omp > 0 &&
1113 hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
1115 gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
1116 hw_opt->nthreads_tot,hw_opt->nthreads_omp);
1119 if (hw_opt->nthreads_tmpi > 0 &&
1120 hw_opt->nthreads_omp <= 0)
1122 hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
1127 if (hw_opt->nthreads_omp > 1)
1129 gmx_fatal(FARGS,"OpenMP threads are requested, but Gromacs was compiled without OpenMP support");
1133 if (cutoff_scheme == ecutsGROUP)
1135 /* We only have OpenMP support for PME only nodes */
1136 if (hw_opt->nthreads_omp > 1)
1138 gmx_fatal(FARGS,"OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
1139 ecutscheme_names[cutoff_scheme],
1140 ecutscheme_names[ecutsVERLET]);
1142 hw_opt->nthreads_omp = 1;
1145 if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
1147 gmx_fatal(FARGS,"You need to specify -ntomp in addition to -ntomp_pme");
1150 if (hw_opt->nthreads_tot == 1)
1152 hw_opt->nthreads_tmpi = 1;
1154 if (hw_opt->nthreads_omp > 1)
1156 gmx_fatal(FARGS,"You requested %d OpenMP threads with %d total threads",
1157 hw_opt->nthreads_tmpi,hw_opt->nthreads_tot);
1159 hw_opt->nthreads_omp = 1;
1162 if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
1164 hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
1169 fprintf(debug,"hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
1170 hw_opt->nthreads_tot,
1171 hw_opt->nthreads_tmpi,
1172 hw_opt->nthreads_omp,
1173 hw_opt->nthreads_omp_pme,
1174 hw_opt->gpu_id!=NULL ? hw_opt->gpu_id : "");
1180 /* Override the value in inputrec with value passed on the command line (if any) */
1181 static void override_nsteps_cmdline(FILE *fplog,
1184 const t_commrec *cr)
1189 /* override with anything else than the default -2 */
1190 if (nsteps_cmdline > -2)
1194 ir->nsteps = nsteps_cmdline;
1195 if (EI_DYNAMICS(ir->eI))
1197 sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps, %.3f ps",
1198 nsteps_cmdline, nsteps_cmdline*ir->delta_t);
1202 sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps",
1206 md_print_warn(cr, fplog, "%s\n", stmp);
1210 /* Data structure set by SIMMASTER which needs to be passed to all nodes
1211 * before the other nodes have read the tpx file and called gmx_detect_hardware.
1214 int cutoff_scheme; /* The cutoff scheme from inputrec_t */
1215 gmx_bool bUseGPU; /* Use GPU or GPU emulation */
1218 int mdrunner(gmx_hw_opt_t *hw_opt,
1219 FILE *fplog,t_commrec *cr,int nfile,
1220 const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
1221 gmx_bool bCompact, int nstglobalcomm,
1222 ivec ddxyz,int dd_node_order,real rdd,real rconstr,
1223 const char *dddlb_opt,real dlb_scale,
1224 const char *ddcsx,const char *ddcsy,const char *ddcsz,
1225 const char *nbpu_opt,
1226 int nsteps_cmdline, int nstepout,int resetstep,
1227 int nmultisim,int repl_ex_nst,int repl_ex_nex,
1228 int repl_ex_seed, real pforce,real cpt_period,real max_hours,
1229 const char *deviceOptions, unsigned long Flags)
1231 gmx_bool bForceUseGPU,bTryUseGPU;
1232 double nodetime=0,realtime;
1233 t_inputrec *inputrec;
1234 t_state *state=NULL;
1236 gmx_ddbox_t ddbox={0};
1237 int npme_major,npme_minor;
1240 gmx_mtop_t *mtop=NULL;
1241 t_mdatoms *mdatoms=NULL;
1242 t_forcerec *fr=NULL;
1245 gmx_pme_t *pmedata=NULL;
1246 gmx_vsite_t *vsite=NULL;
1247 gmx_constr_t constr;
1248 int i,m,nChargePerturbed=-1,status,nalloc;
1250 gmx_wallcycle_t wcycle;
1251 gmx_bool bReadRNG,bReadEkin;
1253 gmx_runtime_t runtime;
1255 gmx_large_int_t reset_counters;
1256 gmx_edsam_t ed=NULL;
1257 t_commrec *cr_old=cr;
1260 gmx_membed_t membed=NULL;
1261 gmx_hw_info_t *hwinfo=NULL;
1262 master_inf_t minf={-1,FALSE};
1264 /* CAUTION: threads may be started later on in this function, so
1265 cr doesn't reflect the final parallel state right now */
1269 if (Flags & MD_APPENDFILES)
1274 bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
1275 bTryUseGPU = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
1280 /* Read (nearly) all data required for the simulation */
1281 read_tpx_state(ftp2fn(efTPX,nfile,fnm),inputrec,state,NULL,mtop);
1283 if (inputrec->cutoff_scheme != ecutsVERLET &&
1284 ((Flags & MD_TESTVERLET) || getenv("GMX_VERLET_SCHEME") != NULL))
1286 convert_to_verlet_scheme(fplog,inputrec,mtop,det(state->box));
1289 /* Detect hardware, gather information. With tMPI only thread 0 does it
1290 * and after threads are started broadcasts hwinfo around. */
1292 gmx_detect_hardware(fplog, hwinfo, cr,
1293 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
1295 minf.cutoff_scheme = inputrec->cutoff_scheme;
1296 minf.bUseGPU = FALSE;
1298 if (inputrec->cutoff_scheme == ecutsVERLET)
1300 prepare_verlet_scheme(fplog,hwinfo,cr,hw_opt,nbpu_opt,
1301 inputrec,mtop,state->box,
1304 else if (hwinfo->bCanUseGPU)
1306 md_print_warn(cr,fplog,
1307 "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
1308 " To use a GPU, set the mdp option: cutoff-scheme = Verlet\n"
1309 " (for quick performance testing you can use the -testverlet option)\n");
1313 gmx_fatal(FARGS,"GPU requested, but can't be used without cutoff-scheme=Verlet");
1317 #ifndef GMX_THREAD_MPI
1320 gmx_bcast_sim(sizeof(minf),&minf,cr);
1323 if (minf.bUseGPU && cr->npmenodes == -1)
1325 /* Don't automatically use PME-only nodes with GPUs */
1329 /* Check for externally set OpenMP affinity and turn off internal
1330 * pinning if any is found. We need to do this check early to tell
1331 * thread-MPI whether it should do pinning when spawning threads.
1333 gmx_omp_check_thread_affinity(fplog, cr, hw_opt);
1335 #ifdef GMX_THREAD_MPI
1336 /* With thread-MPI inputrec is only set here on the master thread */
1340 check_and_update_hw_opt(hw_opt,minf.cutoff_scheme);
1342 #ifdef GMX_THREAD_MPI
1343 /* Early check for externally set process affinity. Can't do over all
1344 * MPI processes because hwinfo is not available everywhere, but with
1345 * thread-MPI it's needed as pinning might get turned off which needs
1346 * to be known before starting thread-MPI. */
1347 check_cpu_affinity_set(fplog,
1349 hw_opt, hwinfo->nthreads_hw_avail, FALSE);
1352 #ifdef GMX_THREAD_MPI
1353 if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
1355 gmx_fatal(FARGS,"You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME nodes");
1359 if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
1362 gmx_fatal(FARGS,"You need to explicitly specify the number of PME nodes (-npme) when using different number of OpenMP threads for PP and PME nodes");
1366 #ifdef GMX_THREAD_MPI
1369 /* NOW the threads will be started: */
1370 hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
1374 if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
1376 hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
1379 if (hw_opt->nthreads_tmpi > 1)
1381 /* now start the threads. */
1382 cr=mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm,
1383 oenv, bVerbose, bCompact, nstglobalcomm,
1384 ddxyz, dd_node_order, rdd, rconstr,
1385 dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
1387 nsteps_cmdline, nstepout, resetstep, nmultisim,
1388 repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
1389 cpt_period, max_hours, deviceOptions,
1391 /* the main thread continues here with a new cr. We don't deallocate
1392 the old cr because other threads may still be reading it. */
1395 gmx_comm("Failed to spawn threads");
1400 /* END OF CAUTION: cr is now reliable */
1402 /* g_membed initialisation *
1403 * Because we change the mtop, init_membed is called before the init_parallel *
1404 * (in case we ever want to make it run in parallel) */
1405 if (opt2bSet("-membed",nfile,fnm))
1409 fprintf(stderr,"Initializing membed");
1411 membed = init_membed(fplog,nfile,fnm,mtop,inputrec,state,cr,&cpt_period);
1416 /* now broadcast everything to the non-master nodes/threads: */
1417 init_parallel(fplog, cr, inputrec, mtop);
1419 /* This check needs to happen after get_nthreads_mpi() */
1420 if (inputrec->cutoff_scheme == ecutsVERLET && (Flags & MD_PARTDEC))
1422 gmx_fatal_collective(FARGS,cr,NULL,
1423 "The Verlet cut-off scheme is not supported with particle decomposition.\n"
1424 "You can achieve the same effect as particle decomposition by running in parallel using only OpenMP threads.");
1429 pr_inputrec(fplog,0,"Input Parameters",inputrec,FALSE);
1432 #if defined GMX_THREAD_MPI
1433 /* With tMPI we detected on thread 0 and we'll just pass the hwinfo pointer
1434 * to the other threads -- slightly uncool, but works fine, just need to
1435 * make sure that the data doesn't get freed twice. */
1442 gmx_bcast(sizeof(&hwinfo), &hwinfo, cr);
1445 if (PAR(cr) && !SIMMASTER(cr))
1447 /* now we have inputrec on all nodes, can run the detection */
1448 /* TODO: perhaps it's better to propagate within a node instead? */
1450 gmx_detect_hardware(fplog, hwinfo, cr,
1451 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
1454 /* Now do the affinity check with MPI/no-MPI (done earlier with thread-MPI). */
1455 check_cpu_affinity_set(fplog, cr,
1456 hw_opt, hwinfo->nthreads_hw_avail, FALSE);
1459 /* now make sure the state is initialized and propagated */
1460 set_state_entries(state,inputrec,cr->nnodes);
1462 /* remove when vv and rerun works correctly! */
1463 if (PAR(cr) && EI_VV(inputrec->eI) && ((Flags & MD_RERUN) || (Flags & MD_RERUN_VSITE)))
1466 "Currently can't do velocity verlet with rerun in parallel.");
1469 /* A parallel command line option consistency check that we can
1470 only do after any threads have started. */
1472 (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
1475 "The -dd or -npme option request a parallel simulation, "
1477 "but %s was compiled without threads or MPI enabled"
1479 #ifdef GMX_THREAD_MPI
1480 "but the number of threads (option -nt) is 1"
1482 "but %s was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec"
1489 if ((Flags & MD_RERUN) &&
1490 (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
1492 gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
1495 if (can_use_allvsall(inputrec,mtop,TRUE,cr,fplog) && PAR(cr))
1497 /* All-vs-all loops do not work with domain decomposition */
1498 Flags |= MD_PARTDEC;
1501 if (!EEL_PME(inputrec->coulombtype) || (Flags & MD_PARTDEC))
1503 if (cr->npmenodes > 0)
1505 if (!EEL_PME(inputrec->coulombtype))
1507 gmx_fatal_collective(FARGS,cr,NULL,
1508 "PME nodes are requested, but the system does not use PME electrostatics");
1510 if (Flags & MD_PARTDEC)
1512 gmx_fatal_collective(FARGS,cr,NULL,
1513 "PME nodes are requested, but particle decomposition does not support separate PME nodes");
1521 fcRegisterSteps(inputrec->nsteps,inputrec->init_step);
1524 /* NMR restraints must be initialized before load_checkpoint,
1525 * since with time averaging the history is added to t_state.
1526 * For proper consistency check we therefore need to extend
1528 * So the PME-only nodes (if present) will also initialize
1529 * the distance restraints.
1533 /* This needs to be called before read_checkpoint to extend the state */
1534 init_disres(fplog,mtop,inputrec,cr,Flags & MD_PARTDEC,fcd,state);
1536 if (gmx_mtop_ftype_count(mtop,F_ORIRES) > 0)
1538 if (PAR(cr) && !(Flags & MD_PARTDEC))
1540 gmx_fatal(FARGS,"Orientation restraints do not work (yet) with domain decomposition, use particle decomposition (mdrun option -pd)");
1542 /* Orientation restraints */
1545 init_orires(fplog,mtop,state->x,inputrec,cr->ms,&(fcd->orires),
1550 if (DEFORM(*inputrec))
1552 /* Store the deform reference box before reading the checkpoint */
1555 copy_mat(state->box,box);
1559 gmx_bcast(sizeof(box),box,cr);
1561 /* Because we do not have the update struct available yet
1562 * in which the reference values should be stored,
1563 * we store them temporarily in static variables.
1564 * This should be thread safe, since they are only written once
1565 * and with identical values.
1567 #ifdef GMX_THREAD_MPI
1568 tMPI_Thread_mutex_lock(&deform_init_box_mutex);
1570 deform_init_init_step_tpx = inputrec->init_step;
1571 copy_mat(box,deform_init_box_tpx);
1572 #ifdef GMX_THREAD_MPI
1573 tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
1577 if (opt2bSet("-cpi",nfile,fnm))
1579 /* Check if checkpoint file exists before doing continuation.
1580 * This way we can use identical input options for the first and subsequent runs...
1582 if( gmx_fexist_master(opt2fn_master("-cpi",nfile,fnm,cr),cr) )
1584 load_checkpoint(opt2fn_master("-cpi",nfile,fnm,cr),&fplog,
1585 cr,Flags & MD_PARTDEC,ddxyz,
1586 inputrec,state,&bReadRNG,&bReadEkin,
1587 (Flags & MD_APPENDFILES),
1588 (Flags & MD_APPENDFILESSET));
1592 Flags |= MD_READ_RNG;
1596 Flags |= MD_READ_EKIN;
1601 if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES))
1602 #ifdef GMX_THREAD_MPI
1603 /* With thread MPI only the master node/thread exists in mdrun.c,
1604 * therefore non-master nodes need to open the "seppot" log file here.
1606 || (!MASTER(cr) && (Flags & MD_SEPPOT))
1610 gmx_log_open(ftp2fn(efLOG,nfile,fnm),cr,!(Flags & MD_SEPPOT),
1614 /* override nsteps with value from cmdline */
1615 override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);
1619 copy_mat(state->box,box);
1624 gmx_bcast(sizeof(box),box,cr);
1627 /* Essential dynamics */
1628 if (opt2bSet("-ei",nfile,fnm))
1630 /* Open input and output files, allocate space for ED data structure */
1631 ed = ed_open(nfile,fnm,Flags,cr);
1634 if (PAR(cr) && !((Flags & MD_PARTDEC) ||
1635 EI_TPI(inputrec->eI) ||
1636 inputrec->eI == eiNM))
1638 cr->dd = init_domain_decomposition(fplog,cr,Flags,ddxyz,rdd,rconstr,
1639 dddlb_opt,dlb_scale,
1643 &ddbox,&npme_major,&npme_minor);
1645 make_dd_communicators(fplog,cr,dd_node_order);
1647 /* Set overallocation to avoid frequent reallocation of arrays */
1648 set_over_alloc_dd(TRUE);
1652 /* PME, if used, is done on all nodes with 1D decomposition */
1654 cr->duty = (DUTY_PP | DUTY_PME);
1657 if (!EI_TPI(inputrec->eI))
1659 npme_major = cr->nnodes;
1662 if (inputrec->ePBC == epbcSCREW)
1665 "pbc=%s is only implemented with domain decomposition",
1666 epbc_names[inputrec->ePBC]);
1672 /* After possible communicator splitting in make_dd_communicators.
1673 * we can set up the intra/inter node communication.
1675 gmx_setup_nodecomm(fplog,cr);
1678 /* Initialize per-node process ID and counters. */
1679 gmx_init_intra_counters(cr);
1682 md_print_info(cr,fplog,"Using %d MPI %s\n",
1684 #ifdef GMX_THREAD_MPI
1685 cr->nnodes==1 ? "thread" : "threads"
1687 cr->nnodes==1 ? "process" : "processes"
1692 gmx_omp_nthreads_init(fplog, cr,
1693 hwinfo->nthreads_hw_avail,
1694 hw_opt->nthreads_omp,
1695 hw_opt->nthreads_omp_pme,
1696 (cr->duty & DUTY_PP) == 0,
1697 inputrec->cutoff_scheme == ecutsVERLET);
1699 gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi, minf.bUseGPU);
1701 /* getting number of PP/PME threads
1702 PME: env variable should be read only on one node to make sure it is
1703 identical everywhere;
1705 /* TODO nthreads_pp is only used for pinning threads.
1706 * This is a temporary solution until we have a hw topology library.
1708 nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
1709 nthreads_pme = gmx_omp_nthreads_get(emntPME);
1711 wcycle = wallcycle_init(fplog,resetstep,cr,nthreads_pp,nthreads_pme);
1715 /* Master synchronizes its value of reset_counters with all nodes
1716 * including PME only nodes */
1717 reset_counters = wcycle_get_reset_counters(wcycle);
1718 gmx_bcast_sim(sizeof(reset_counters),&reset_counters,cr);
1719 wcycle_set_reset_counters(wcycle, reset_counters);
1723 if (cr->duty & DUTY_PP)
1725 /* For domain decomposition we allocate dynamically
1726 * in dd_partition_system.
1728 if (DOMAINDECOMP(cr))
1730 bcast_state_setup(cr,state);
1736 bcast_state(cr,state,TRUE);
1740 /* Initiate forcerecord */
1742 fr->hwinfo = hwinfo;
1743 init_forcerec(fplog,oenv,fr,fcd,inputrec,mtop,cr,box,FALSE,
1744 opt2fn("-table",nfile,fnm),
1745 opt2fn("-tabletf",nfile,fnm),
1746 opt2fn("-tablep",nfile,fnm),
1747 opt2fn("-tableb",nfile,fnm),
1751 /* version for PCA_NOT_READ_NODE (see md.c) */
1752 /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
1753 "nofile","nofile","nofile","nofile",FALSE,pforce);
1755 fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);
1757 /* Initialize QM-MM */
1760 init_QMMMrec(cr,box,mtop,inputrec,fr);
1763 /* Initialize the mdatoms structure.
1764 * mdatoms is not filled with atom data,
1765 * as this can not be done now with domain decomposition.
1767 mdatoms = init_mdatoms(fplog,mtop,inputrec->efep!=efepNO);
1769 /* Initialize the virtual site communication */
1770 vsite = init_vsite(mtop,cr,FALSE);
1772 calc_shifts(box,fr->shift_vec);
1774 /* With periodic molecules the charge groups should be whole at start up
1775 * and the virtual sites should not be far from their proper positions.
1777 if (!inputrec->bContinuation && MASTER(cr) &&
1778 !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
1780 /* Make molecules whole at start of run */
1781 if (fr->ePBC != epbcNONE)
1783 do_pbc_first_mtop(fplog,inputrec->ePBC,box,mtop,state->x);
1787 /* Correct initial vsite positions are required
1788 * for the initial distribution in the domain decomposition
1789 * and for the initial shell prediction.
1791 construct_vsites_mtop(fplog,vsite,mtop,state->x);
1795 if (EEL_PME(fr->eeltype))
1797 ewaldcoeff = fr->ewaldcoeff;
1798 pmedata = &fr->pmedata;
1807 /* This is a PME only node */
1809 /* We don't need the state */
1812 ewaldcoeff = calc_ewaldcoeff(inputrec->rcoulomb, inputrec->ewald_rtol);
1816 /* Before setting affinity, check whether the affinity has changed
1817 * - which indicates that probably the OpenMP library has changed it since
1818 * we first checked). */
1819 check_cpu_affinity_set(fplog, cr, hw_opt, hwinfo->nthreads_hw_avail, TRUE);
1821 /* Set the CPU affinity */
1822 set_cpu_affinity(fplog,cr,hw_opt,nthreads_pme,hwinfo,inputrec);
1824 /* Initiate PME if necessary,
1825 * either on all nodes or on dedicated PME nodes only. */
1826 if (EEL_PME(inputrec->coulombtype))
1830 nChargePerturbed = mdatoms->nChargePerturbed;
1832 if (cr->npmenodes > 0)
1834 /* The PME only nodes need to know nChargePerturbed */
1835 gmx_bcast_sim(sizeof(nChargePerturbed),&nChargePerturbed,cr);
1838 if (cr->duty & DUTY_PME)
1840 status = gmx_pme_init(pmedata,cr,npme_major,npme_minor,inputrec,
1841 mtop ? mtop->natoms : 0,nChargePerturbed,
1842 (Flags & MD_REPRODUCIBLE),nthreads_pme);
1845 gmx_fatal(FARGS,"Error %d initializing PME",status);
1851 if (integrator[inputrec->eI].func == do_md
1854 integrator[inputrec->eI].func == do_md_openmm
1858 /* Turn on signal handling on all nodes */
1860 * (A user signal from the PME nodes (if any)
1861 * is communicated to the PP nodes.
1863 signal_handler_install();
1866 if (cr->duty & DUTY_PP)
1868 if (inputrec->ePull != epullNO)
1870 /* Initialize pull code */
1871 init_pull(fplog,inputrec,nfile,fnm,mtop,cr,oenv, inputrec->fepvals->init_lambda,
1872 EI_DYNAMICS(inputrec->eI) && MASTER(cr),Flags);
1877 /* Initialize enforced rotation code */
1878 init_rot(fplog,inputrec,nfile,fnm,cr,state->x,box,mtop,oenv,
1882 constr = init_constraints(fplog,mtop,inputrec,ed,state,cr);
1884 if (DOMAINDECOMP(cr))
1886 dd_init_bondeds(fplog,cr->dd,mtop,vsite,constr,inputrec,
1887 Flags & MD_DDBONDCHECK,fr->cginfo_mb);
1889 set_dd_parameters(fplog,cr->dd,dlb_scale,inputrec,fr,&ddbox);
1891 setup_dd_grid(fplog,cr->dd);
1894 /* Now do whatever the user wants us to do (how flexible...) */
1895 integrator[inputrec->eI].func(fplog,cr,nfile,fnm,
1896 oenv,bVerbose,bCompact,
1899 nstepout,inputrec,mtop,
1901 mdatoms,nrnb,wcycle,ed,fr,
1902 repl_ex_nst,repl_ex_nex,repl_ex_seed,
1904 cpt_period,max_hours,
1909 if (inputrec->ePull != epullNO)
1911 finish_pull(fplog,inputrec->pull);
1916 finish_rot(fplog,inputrec->rot);
1923 gmx_pmeonly(*pmedata,cr,nrnb,wcycle,ewaldcoeff,FALSE,inputrec);
1926 if (EI_DYNAMICS(inputrec->eI) || EI_TPI(inputrec->eI))
1928 /* Some timing stats */
1931 if (runtime.proc == 0)
1933 runtime.proc = runtime.real;
1942 wallcycle_stop(wcycle,ewcRUN);
1944 /* Finish up, write some stuff
1945 * if rerunMD, don't write last frame again
1947 finish_run(fplog,cr,ftp2fn(efSTO,nfile,fnm),
1948 inputrec,nrnb,wcycle,&runtime,
1949 fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ?
1950 nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL,
1952 EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
1954 if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU)
1956 char gpu_err_str[STRLEN];
1958 /* free GPU memory and uninitialize GPU (by destroying the context) */
1959 nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
1961 if (!free_gpu(gpu_err_str))
1963 gmx_warning("On node %d failed to free GPU #%d: %s",
1964 cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
1968 if (opt2bSet("-membed",nfile,fnm))
1973 #ifdef GMX_THREAD_MPI
1974 if (PAR(cr) && SIMMASTER(cr))
1977 gmx_hardware_info_free(hwinfo);
1980 /* Does what it says */
1981 print_date_and_time(fplog,cr->nodeid,"Finished mdrun",&runtime);
1983 /* Close logfile already here if we were appending to it */
1984 if (MASTER(cr) && (Flags & MD_APPENDFILES))
1986 gmx_log_close(fplog);
1989 rc=(int)gmx_get_stop_condition();
1991 #ifdef GMX_THREAD_MPI
1992 /* we need to join all threads. The sub-threads join when they
1993 exit this function, but the master thread needs to be told to
1995 if (PAR(cr) && MASTER(cr))