1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
4 * This source code is part of
8 * GROningen MAchine for Chemical Simulations
11 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
12 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
13 * Copyright (c) 2001-2004, The GROMACS development team,
14 * check out http://www.gromacs.org for more information.
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version 2
19 * of the License, or (at your option) any later version.
21 * If you want to redistribute modifications, please consider that
22 * scientific software is very special. Version control is crucial -
23 * bugs must be traceable. We will be happy to consider code for
24 * inclusion in the official distribution, but derived work must not
25 * be called official GROMACS. Details are found in the README & COPYING
26 * files - if they are missing, get the official version at www.gromacs.org.
28 * To help us fund GROMACS development, we humbly ask that you cite
29 * the papers on the package - you can find them in the top README file.
31 * For more info, check our website at http://www.gromacs.org
34 * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
39 #if defined(HAVE_SCHED_H) && defined(HAVE_SCHED_GETAFFINITY)
42 #include <sys/syscall.h>
54 #include "md_logging.h"
55 #include "md_support.h"
65 #include "mpelogging.h"
71 #include "checkpoint.h"
72 #include "mtop_util.h"
73 #include "sighandler.h"
76 #include "gmx_detect_hardware.h"
77 #include "gmx_omp_nthreads.h"
78 #include "pull_rotation.h"
79 #include "calc_verletbuf.h"
80 #include "../mdlib/nbnxn_search.h"
81 #include "../mdlib/nbnxn_consts.h"
82 #include "gmx_fatal_collective.h"
84 #include "md_openmm.h"
87 #include "thread_mpi/threads.h"
101 #include "md_openmm.h"
104 #include "gpu_utils.h"
105 #include "nbnxn_cuda_data_mgmt.h"
108 gmx_integrator_t *func;
111 /* The array should match the eI array in include/types/enums.h */
112 #ifdef GMX_OPENMM /* FIXME do_md_openmm needs fixing */
113 const gmx_intp_t integrator[eiNR] = { {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm},{do_md_openmm}};
115 const gmx_intp_t integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md},{do_md}};
118 gmx_large_int_t deform_init_init_step_tpx;
119 matrix deform_init_box_tpx;
120 #ifdef GMX_THREAD_MPI
121 tMPI_Thread_mutex_t deform_init_box_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
125 #ifdef GMX_THREAD_MPI
126 struct mdrunner_arglist
128 gmx_hw_opt_t *hw_opt;
141 const char *dddlb_opt;
146 const char *nbpu_opt;
157 const char *deviceOptions;
159 int ret; /* return value */
163 /* The function used for spawning threads. Extracts the mdrunner()
164 arguments from its one argument and calls mdrunner(), after making
166 static void mdrunner_start_fn(void *arg)
168 struct mdrunner_arglist *mda=(struct mdrunner_arglist*)arg;
169 struct mdrunner_arglist mc=*mda; /* copy the arg list to make sure
170 that it's thread-local. This doesn't
171 copy pointed-to items, of course,
172 but those are all const. */
173 t_commrec *cr; /* we need a local version of this */
177 fnm = dup_tfn(mc.nfile, mc.fnm);
179 cr = init_par_threads(mc.cr);
186 mda->ret=mdrunner(mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv,
187 mc.bVerbose, mc.bCompact, mc.nstglobalcomm,
188 mc.ddxyz, mc.dd_node_order, mc.rdd,
189 mc.rconstr, mc.dddlb_opt, mc.dlb_scale,
190 mc.ddcsx, mc.ddcsy, mc.ddcsz,
192 mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
193 mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce,
194 mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.Flags);
197 /* called by mdrunner() to start a specific number of threads (including
198 the main thread) for thread-parallel runs. This in turn calls mdrunner()
200 All options besides nthreads are the same as for mdrunner(). */
201 static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
202 FILE *fplog,t_commrec *cr,int nfile,
203 const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
204 gmx_bool bCompact, int nstglobalcomm,
205 ivec ddxyz,int dd_node_order,real rdd,real rconstr,
206 const char *dddlb_opt,real dlb_scale,
207 const char *ddcsx,const char *ddcsy,const char *ddcsz,
208 const char *nbpu_opt,
209 int nsteps_cmdline, int nstepout,int resetstep,
210 int nmultisim,int repl_ex_nst,int repl_ex_nex, int repl_ex_seed,
211 real pforce,real cpt_period, real max_hours,
212 const char *deviceOptions, unsigned long Flags)
215 struct mdrunner_arglist *mda;
216 t_commrec *crn; /* the new commrec */
219 /* first check whether we even need to start tMPI */
220 if (hw_opt->nthreads_tmpi < 2)
225 /* a few small, one-time, almost unavoidable memory leaks: */
227 fnmn=dup_tfn(nfile, fnm);
229 /* fill the data structure to pass as void pointer to thread start fn */
236 mda->bVerbose=bVerbose;
237 mda->bCompact=bCompact;
238 mda->nstglobalcomm=nstglobalcomm;
239 mda->ddxyz[XX]=ddxyz[XX];
240 mda->ddxyz[YY]=ddxyz[YY];
241 mda->ddxyz[ZZ]=ddxyz[ZZ];
242 mda->dd_node_order=dd_node_order;
244 mda->rconstr=rconstr;
245 mda->dddlb_opt=dddlb_opt;
246 mda->dlb_scale=dlb_scale;
250 mda->nbpu_opt=nbpu_opt;
251 mda->nsteps_cmdline=nsteps_cmdline;
252 mda->nstepout=nstepout;
253 mda->resetstep=resetstep;
254 mda->nmultisim=nmultisim;
255 mda->repl_ex_nst=repl_ex_nst;
256 mda->repl_ex_nex=repl_ex_nex;
257 mda->repl_ex_seed=repl_ex_seed;
259 mda->cpt_period=cpt_period;
260 mda->max_hours=max_hours;
261 mda->deviceOptions=deviceOptions;
264 fprintf(stderr, "Starting %d tMPI threads\n",hw_opt->nthreads_tmpi);
266 /* now spawn new threads that start mdrunner_start_fn(), while
267 the main thread returns */
268 ret=tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi,
269 (hw_opt->bThreadPinning ? TMPI_AFFINITY_ALL_CORES : TMPI_AFFINITY_NONE),
270 mdrunner_start_fn, (void*)(mda) );
271 if (ret!=TMPI_SUCCESS)
274 /* make a new comm_rec to reflect the new situation */
275 crn=init_par_threads(cr);
280 static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
281 const gmx_hw_opt_t *hw_opt,
287 /* There are no separate PME nodes here, as we ensured in
288 * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
289 * and a conditional ensures we would not have ended up here.
290 * Note that separate PME nodes might be switched on later.
294 nthreads_tmpi = ngpu;
295 if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
297 nthreads_tmpi = nthreads_tot;
300 else if (hw_opt->nthreads_omp > 0)
302 /* Here we could oversubscribe, when we do, we issue a warning later */
303 nthreads_tmpi = max(1,nthreads_tot/hw_opt->nthreads_omp);
307 /* TODO choose nthreads_omp based on hardware topology
308 when we have a hardware topology detection library */
309 /* In general, when running up to 4 threads, OpenMP should be faster.
310 * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
311 * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
312 * even on two CPUs it's usually faster (but with many OpenMP threads
313 * it could be faster not to use HT, currently we always use HT).
314 * On Nehalem/Westmere we want to avoid running 16 threads over
315 * two CPUs with HT, so we need a limit<16; thus we use 12.
316 * A reasonable limit for Intel Sandy and Ivy bridge,
317 * not knowing the topology, is 16 threads.
319 const int nthreads_omp_always_faster = 4;
320 const int nthreads_omp_always_faster_Nehalem = 12;
321 const int nthreads_omp_always_faster_SandyBridge = 16;
322 const int first_model_Nehalem = 0x1A;
323 const int first_model_SandyBridge = 0x2A;
324 gmx_bool bIntel_Family6;
327 (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
328 gmx_cpuid_family(hwinfo->cpuid_info) == 6);
330 if (nthreads_tot <= nthreads_omp_always_faster ||
332 ((gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_Nehalem && nthreads_tot <= nthreads_omp_always_faster_Nehalem) ||
333 (gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_SandyBridge && nthreads_tot <= nthreads_omp_always_faster_SandyBridge))))
335 /* Use pure OpenMP parallelization */
340 /* Don't use OpenMP parallelization */
341 nthreads_tmpi = nthreads_tot;
345 return nthreads_tmpi;
349 /* Get the number of threads to use for thread-MPI based on how many
350 * were requested, which algorithms we're using,
351 * and how many particles there are.
352 * At the point we have already called check_and_update_hw_opt.
353 * Thus all options should be internally consistent and consistent
354 * with the hardware, except that ntmpi could be larger than #GPU.
356 static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
357 gmx_hw_opt_t *hw_opt,
358 t_inputrec *inputrec, gmx_mtop_t *mtop,
362 int nthreads_hw,nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu;
363 int min_atoms_per_mpi_thread;
368 if (hw_opt->nthreads_tmpi > 0)
370 /* Trivial, return right away */
371 return hw_opt->nthreads_tmpi;
374 nthreads_hw = hwinfo->nthreads_hw_avail;
376 /* How many total (#tMPI*#OpenMP) threads can we start? */
377 if (hw_opt->nthreads_tot > 0)
379 nthreads_tot_max = hw_opt->nthreads_tot;
383 nthreads_tot_max = nthreads_hw;
386 bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU);
389 ngpu = hwinfo->gpu_info.ncuda_dev_use;
397 get_tmpi_omp_thread_division(hwinfo,hw_opt,nthreads_tot_max,ngpu);
399 if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
401 /* Steps are divided over the nodes iso splitting the atoms */
402 min_atoms_per_mpi_thread = 0;
408 min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU;
412 min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD;
416 /* Check if an algorithm does not support parallel simulation. */
417 if (nthreads_tmpi != 1 &&
418 ( inputrec->eI == eiLBFGS ||
419 inputrec->coulombtype == eelEWALD ) )
423 md_print_warn(cr,fplog,"The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
424 if (hw_opt->nthreads_tmpi > nthreads_tmpi)
426 gmx_fatal(FARGS,"You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
429 else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread)
431 /* the thread number was chosen automatically, but there are too many
432 threads (too few atoms per thread) */
433 nthreads_new = max(1,mtop->natoms/min_atoms_per_mpi_thread);
435 /* Avoid partial use of Hyper-Threading */
436 if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
437 nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw)
439 nthreads_new = nthreads_hw/2;
442 /* Avoid large prime numbers in the thread count */
443 if (nthreads_new >= 6)
445 /* Use only 6,8,10 with additional factors of 2 */
449 while (3*fac*2 <= nthreads_new)
454 nthreads_new = (nthreads_new/fac)*fac;
459 if (nthreads_new == 5)
465 nthreads_tmpi = nthreads_new;
467 fprintf(stderr,"\n");
468 fprintf(stderr,"NOTE: Parallelization is limited by the small number of atoms,\n");
469 fprintf(stderr," only starting %d thread-MPI threads.\n",nthreads_tmpi);
470 fprintf(stderr," You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
473 return nthreads_tmpi;
475 #endif /* GMX_THREAD_MPI */
478 /* Environment variable for setting nstlist */
479 static const char* NSTLIST_ENVVAR = "GMX_NSTLIST";
480 /* Try to increase nstlist when using a GPU with nstlist less than this */
481 static const int NSTLIST_GPU_ENOUGH = 20;
482 /* Increase nstlist until the non-bonded cost increases more than this factor */
483 static const float NBNXN_GPU_LIST_OK_FAC = 1.25;
484 /* Don't increase nstlist beyond a non-bonded cost increases of this factor */
485 static const float NBNXN_GPU_LIST_MAX_FAC = 1.40;
487 /* Try to increase nstlist when running on a GPU */
488 static void increase_nstlist(FILE *fp,t_commrec *cr,
489 t_inputrec *ir,const gmx_mtop_t *mtop,matrix box)
492 int nstlist_orig,nstlist_prev;
493 verletbuf_list_setup_t ls;
494 real rlist_inc,rlist_ok,rlist_max,rlist_new,rlist_prev;
497 gmx_bool bBox,bDD,bCont;
498 const char *nstl_fmt="\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
499 const char *vbd_err="Can not increase nstlist for GPU run because verlet-buffer-drift is not set or used";
500 const char *box_err="Can not increase nstlist for GPU run because the box is too small";
501 const char *dd_err ="Can not increase nstlist for GPU run because of domain decomposition limitations";
504 /* Number of + nstlist alternative values to try when switching */
505 const int nstl[]={ 20, 25, 40, 50 };
506 #define NNSTL sizeof(nstl)/sizeof(nstl[0])
508 env = getenv(NSTLIST_ENVVAR);
513 fprintf(fp,nstl_fmt,ir->nstlist);
517 if (ir->verletbuf_drift == 0)
519 gmx_fatal(FARGS,"You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
522 if (ir->verletbuf_drift < 0)
526 fprintf(stderr,"%s\n",vbd_err);
530 fprintf(fp,"%s\n",vbd_err);
536 nstlist_orig = ir->nstlist;
539 sprintf(buf,"Getting nstlist from environment variable GMX_NSTLIST=%s",env);
542 fprintf(stderr,"%s\n",buf);
546 fprintf(fp,"%s\n",buf);
548 sscanf(env,"%d",&ir->nstlist);
551 verletbuf_get_list_setup(TRUE,&ls);
553 /* Allow rlist to make the list double the size of the cut-off sphere */
554 rlist_inc = nbnxn_get_rlist_effective_inc(NBNXN_GPU_CLUSTER_SIZE,mtop->natoms/det(box));
555 rlist_ok = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_OK_FAC,1.0/3.0) - rlist_inc;
556 rlist_max = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_MAX_FAC,1.0/3.0) - rlist_inc;
559 fprintf(debug,"GPU nstlist tuning: rlist_inc %.3f rlist_max %.3f\n",
560 rlist_inc,rlist_max);
564 nstlist_prev = nstlist_orig;
565 rlist_prev = ir->rlist;
570 ir->nstlist = nstl[i];
573 /* Set the pair-list buffer size in ir */
574 calc_verlet_buffer_size(mtop,det(box),ir,ir->verletbuf_drift,&ls,
577 /* Does rlist fit in the box? */
578 bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC,box));
580 if (bBox && DOMAINDECOMP(cr))
582 /* Check if rlist fits in the domain decomposition */
583 if (inputrec2nboundeddim(ir) < DIM)
585 gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
587 copy_mat(box,state_tmp.box);
588 bDD = change_dd_cutoff(cr,&state_tmp,ir,rlist_new);
595 if (bBox && bDD && rlist_new <= rlist_max)
597 /* Increase nstlist */
598 nstlist_prev = ir->nstlist;
599 rlist_prev = rlist_new;
600 bCont = (i+1 < NNSTL && rlist_new < rlist_ok);
604 /* Stick with the previous nstlist */
605 ir->nstlist = nstlist_prev;
606 rlist_new = rlist_prev;
618 gmx_warning(!bBox ? box_err : dd_err);
621 fprintf(fp,"\n%s\n",bBox ? box_err : dd_err);
623 ir->nstlist = nstlist_orig;
625 else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
627 sprintf(buf,"Changing nstlist from %d to %d, rlist from %g to %g",
628 nstlist_orig,ir->nstlist,
629 ir->rlist,rlist_new);
632 fprintf(stderr,"%s\n\n",buf);
636 fprintf(fp,"%s\n\n",buf);
638 ir->rlist = rlist_new;
639 ir->rlistlong = rlist_new;
643 static void prepare_verlet_scheme(FILE *fplog,
644 gmx_hw_info_t *hwinfo,
646 gmx_hw_opt_t *hw_opt,
647 const char *nbpu_opt,
649 const gmx_mtop_t *mtop,
653 /* Here we only check for GPU usage on the MPI master process,
654 * as here we don't know how many GPUs we will use yet.
655 * We check for a GPU on all processes later.
657 *bUseGPU = hwinfo->bCanUseGPU || (getenv("GMX_EMULATE_GPU") != NULL);
659 if (ir->verletbuf_drift > 0)
661 /* Update the Verlet buffer size for the current run setup */
662 verletbuf_list_setup_t ls;
665 /* Here we assume CPU acceleration is on. But as currently
666 * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
667 * and 4x2 gives a larger buffer than 4x4, this is ok.
669 verletbuf_get_list_setup(*bUseGPU,&ls);
671 calc_verlet_buffer_size(mtop,det(box),ir,
672 ir->verletbuf_drift,&ls,
674 if (rlist_new != ir->rlist)
678 fprintf(fplog,"\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
680 ls.cluster_size_i,ls.cluster_size_j);
682 ir->rlist = rlist_new;
683 ir->rlistlong = rlist_new;
687 /* With GPU or emulation we should check nstlist for performance */
688 if ((EI_DYNAMICS(ir->eI) &&
690 ir->nstlist < NSTLIST_GPU_ENOUGH) ||
691 getenv(NSTLIST_ENVVAR) != NULL)
693 /* Choose a better nstlist */
694 increase_nstlist(fplog,cr,ir,mtop,box);
698 static void convert_to_verlet_scheme(FILE *fplog,
700 gmx_mtop_t *mtop,real box_vol)
702 char *conv_mesg="Converting input file with group cut-off scheme to the Verlet cut-off scheme";
704 md_print_warn(NULL,fplog,"%s\n",conv_mesg);
706 ir->cutoff_scheme = ecutsVERLET;
707 ir->verletbuf_drift = 0.005;
709 if (ir->rcoulomb != ir->rvdw)
711 gmx_fatal(FARGS,"The VdW and Coulomb cut-offs are different, whereas the Verlet scheme only supports equal cut-offs");
714 if (ir->vdwtype == evdwUSER || EEL_USER(ir->coulombtype))
716 gmx_fatal(FARGS,"User non-bonded potentials are not (yet) supported with the Verlet scheme");
718 else if (EVDW_SWITCHED(ir->vdwtype) || EEL_SWITCHED(ir->coulombtype))
720 md_print_warn(NULL,fplog,"Converting switched or shifted interactions to a shifted potential (without force shift), this will lead to slightly different interaction potentials");
722 if (EVDW_SWITCHED(ir->vdwtype))
724 ir->vdwtype = evdwCUT;
726 if (EEL_SWITCHED(ir->coulombtype))
728 if (EEL_FULL(ir->coulombtype))
730 /* With full electrostatic only PME can be switched */
731 ir->coulombtype = eelPME;
735 md_print_warn(NULL,fplog,"NOTE: Replacing %s electrostatics with reaction-field with epsilon-rf=inf\n",eel_names[ir->coulombtype]);
736 ir->coulombtype = eelRF;
737 ir->epsilon_rf = 0.0;
741 /* We set the target energy drift to a small number.
742 * Note that this is only for testing. For production the user
743 * should think about this and set the mdp options.
745 ir->verletbuf_drift = 1e-4;
748 if (inputrec2nboundeddim(ir) != 3)
750 gmx_fatal(FARGS,"Can only convert old tpr files to the Verlet cut-off scheme with 3D pbc");
753 if (ir->efep != efepNO || ir->implicit_solvent != eisNO)
755 gmx_fatal(FARGS,"Will not convert old tpr files to the Verlet cut-off scheme with free-energy calculations or implicit solvent");
758 if (EI_DYNAMICS(ir->eI) && !(EI_MD(ir->eI) && ir->etc == etcNO))
760 verletbuf_list_setup_t ls;
762 verletbuf_get_list_setup(FALSE,&ls);
763 calc_verlet_buffer_size(mtop,box_vol,ir,ir->verletbuf_drift,&ls,
768 ir->verletbuf_drift = -1;
769 ir->rlist = 1.05*max(ir->rvdw,ir->rcoulomb);
772 gmx_mtop_remove_chargegroups(mtop);
775 /* Check the process affinity mask and if it is found to be non-zero,
776 * will honor it and disable mdrun internal affinity setting.
777 * This function should be called first before the OpenMP library gets
778 * initialized with the last argument FALSE (which will detect affinity
779 * set by external tools like taskset), and later, after the OpenMP
780 * initialization, with the last argument TRUE to detect affinity changes
781 * made by the OpenMP library.
783 * Note that this will only work on Linux as we use a GNU feature. */
784 static void check_cpu_affinity_set(FILE *fplog, const t_commrec *cr,
785 gmx_hw_opt_t *hw_opt, int ncpus,
786 gmx_bool bAfterOpenmpInit)
788 #ifdef HAVE_SCHED_GETAFFINITY
789 cpu_set_t mask_current;
790 int i, ret, cpu_count, cpu_set;
794 if (!hw_opt->bThreadPinning)
796 /* internal affinity setting is off, don't bother checking process affinity */
800 CPU_ZERO(&mask_current);
801 if ((ret = sched_getaffinity(0, sizeof(cpu_set_t), &mask_current)) != 0)
803 /* failed to query affinity mask, will just return */
806 fprintf(debug, "Failed to query affinity mask (error %d)", ret);
811 /* Before proceeding with the actual check, make sure that the number of
812 * detected CPUs is >= the CPUs in the current set.
813 * We need to check for CPU_COUNT as it was added only in glibc 2.6. */
815 if (ncpus < CPU_COUNT(&mask_current))
819 fprintf(debug, "%d CPUs detected, but %d was returned by CPU_COUNT",
820 ncpus, CPU_COUNT(&mask_current));
824 #endif /* CPU_COUNT */
827 for (i = 0; (i < ncpus && i < CPU_SETSIZE); i++)
829 bAllSet = bAllSet && (CPU_ISSET(i, &mask_current) != 0);
834 if (!bAfterOpenmpInit)
836 md_print_warn(cr, fplog,
837 "Non-default process affinity set, disabling internal affinity");
841 md_print_warn(cr, fplog,
842 "Non-default process affinity set probably by the OpenMP library, "
843 "disabling internal affinity");
845 hw_opt->bThreadPinning = FALSE;
849 fprintf(debug, "Non-default affinity mask found\n");
856 fprintf(debug, "Default affinity mask found\n");
859 #endif /* HAVE_SCHED_GETAFFINITY */
862 /* Set CPU affinity. Can be important for performance.
863 On some systems (e.g. Cray) CPU Affinity is set by default.
864 But default assigning doesn't work (well) with only some ranks
865 having threads. This causes very low performance.
866 External tools have cumbersome syntax for setting affinity
867 in the case that only some ranks have threads.
868 Thus it is important that GROMACS sets the affinity internally
869 if only PME is using threads.
871 static void set_cpu_affinity(FILE *fplog,
873 gmx_hw_opt_t *hw_opt,
875 const gmx_hw_info_t *hwinfo,
876 const t_inputrec *inputrec)
878 #if defined GMX_THREAD_MPI
879 /* With the number of TMPI threads equal to the number of cores
880 * we already pinned in thread-MPI, so don't pin again here.
882 if (hw_opt->nthreads_tmpi == tMPI_Thread_get_hw_number())
889 /* If the tMPI thread affinity setting is not supported encourage the user
890 * to report it as it's either a bug or an exotic platform which we might
891 * want to support. */
892 if (tMPI_Thread_setaffinity_support() != TMPI_SETAFFINITY_SUPPORT_YES)
894 md_print_warn(NULL, fplog,
895 "Can not set thread affinities on the current plarform. On NUMA systems this\n"
896 "can cause performance degradation. If you think your platform should support\n"
897 "setting affinities, contact the GROMACS developers.");
900 #endif /* __APPLE__ */
902 if (hw_opt->bThreadPinning)
904 int nth_affinity_set, thread_id_node, thread_id,
905 nthread_local, nthread_node, nthread_hw_max, nphyscore;
909 /* threads on this MPI process or TMPI thread */
910 if (cr->duty & DUTY_PP)
912 nthread_local = gmx_omp_nthreads_get(emntNonbonded);
916 nthread_local = gmx_omp_nthreads_get(emntPME);
919 /* map the current process to cores */
921 nthread_node = nthread_local;
923 if (PAR(cr) || MULTISIM(cr))
925 /* We need to determine a scan of the thread counts in this
930 MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),cr->nodeid_intra,
932 MPI_Scan(&nthread_local,&thread_id_node,1,MPI_INT,MPI_SUM,comm_intra);
933 /* MPI_Scan is inclusive, but here we need exclusive */
934 thread_id_node -= nthread_local;
935 /* Get the total number of threads on this physical node */
936 MPI_Allreduce(&nthread_local,&nthread_node,1,MPI_INT,MPI_SUM,comm_intra);
937 MPI_Comm_free(&comm_intra);
942 if (hw_opt->core_pinning_offset > 0)
944 offset = hw_opt->core_pinning_offset;
947 fprintf(stderr, "Applying core pinning offset %d\n", offset);
951 fprintf(fplog, "Applying core pinning offset %d\n", offset);
955 /* With Intel Hyper-Threading enabled, we want to pin consecutive
956 * threads to physical cores when using more threads than physical
957 * cores or when the user requests so.
959 nthread_hw_max = hwinfo->nthreads_hw_avail;
961 if (hw_opt->bPinHyperthreading ||
962 (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
963 nthread_node > nthread_hw_max/2 && getenv("GMX_DISABLE_PINHT") == NULL))
965 if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) != GMX_CPUID_X86_SMT_ENABLED)
967 /* We print to stderr on all processes, as we might have
968 * different settings on different physical nodes.
970 if (gmx_cpuid_vendor(hwinfo->cpuid_info) != GMX_CPUID_VENDOR_INTEL)
972 md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
973 "but non-Intel CPU detected (vendor: %s)\n",
974 gmx_cpuid_vendor_string[gmx_cpuid_vendor(hwinfo->cpuid_info)]);
978 md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
979 "but the CPU detected does not have Intel Hyper-Threading support "
980 "(or it is turned off)\n");
983 nphyscore = nthread_hw_max/2;
987 fprintf(stderr, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
992 fprintf(fplog, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
997 /* Set the per-thread affinity. In order to be able to check the success
998 * of affinity settings, we will set nth_affinity_set to 1 on threads
999 * where the affinity setting succeded and to 0 where it failed.
1000 * Reducing these 0/1 values over the threads will give the total number
1001 * of threads on which we succeeded.
1003 nth_affinity_set = 0;
1004 #pragma omp parallel firstprivate(thread_id_node) num_threads(nthread_local) \
1005 reduction(+:nth_affinity_set)
1008 gmx_bool setaffinity_ret;
1010 thread_id = gmx_omp_get_thread_num();
1011 thread_id_node += thread_id;
1014 core = offset + thread_id_node;
1018 /* Lock pairs of threads to the same hyperthreaded core */
1019 core = offset + thread_id_node/2 + (thread_id_node % 2)*nphyscore;
1022 setaffinity_ret = tMPI_Thread_setaffinity_single(tMPI_Thread_self(), core);
1024 /* store the per-thread success-values of the setaffinity */
1025 nth_affinity_set = (setaffinity_ret == 0);
1029 fprintf(debug, "On node %d, thread %d the affinity setting returned %d\n",
1030 cr->nodeid, gmx_omp_get_thread_num(), setaffinity_ret);
1034 if (nth_affinity_set > nthread_local)
1038 sprintf(msg, "Looks like we have set affinity for more threads than "
1039 "we have (%d > %d)!\n", nth_affinity_set, nthread_local);
1044 /* check if some threads failed to set their affinities */
1045 if (nth_affinity_set != nthread_local)
1050 #ifdef GMX_THREAD_MPI
1051 sprintf(sbuf, "In thread-MPI thread #%d", cr->nodeid);
1052 #else /* GMX_LIB_MPI */
1054 sprintf(sbuf, "In MPI process #%d", cr->nodeid);
1055 #endif /* GMX_MPI */
1056 md_print_warn(NULL, fplog,
1057 "%s%d/%d thread%s failed to set their affinities. "
1058 "This can cause performance degradation!",
1059 sbuf, nthread_local - nth_affinity_set, nthread_local,
1060 (nthread_local - nth_affinity_set) > 1 ? "s" : "");
1067 static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt,
1070 gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp);
1072 #ifndef GMX_THREAD_MPI
1073 if (hw_opt->nthreads_tot > 0)
1075 gmx_fatal(FARGS,"Setting the total number of threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
1077 if (hw_opt->nthreads_tmpi > 0)
1079 gmx_fatal(FARGS,"Setting the number of thread-MPI threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
1083 if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
1085 /* We have the same number of OpenMP threads for PP and PME processes,
1086 * thus we can perform several consistency checks.
1088 if (hw_opt->nthreads_tmpi > 0 &&
1089 hw_opt->nthreads_omp > 0 &&
1090 hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
1092 gmx_fatal(FARGS,"The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
1093 hw_opt->nthreads_tot,hw_opt->nthreads_tmpi,hw_opt->nthreads_omp);
1096 if (hw_opt->nthreads_tmpi > 0 &&
1097 hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
1099 gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
1100 hw_opt->nthreads_tot,hw_opt->nthreads_tmpi);
1103 if (hw_opt->nthreads_omp > 0 &&
1104 hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
1106 gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
1107 hw_opt->nthreads_tot,hw_opt->nthreads_omp);
1110 if (hw_opt->nthreads_tmpi > 0 &&
1111 hw_opt->nthreads_omp <= 0)
1113 hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
1118 if (hw_opt->nthreads_omp > 1)
1120 gmx_fatal(FARGS,"OpenMP threads are requested, but Gromacs was compiled without OpenMP support");
1124 if (cutoff_scheme == ecutsGROUP)
1126 /* We only have OpenMP support for PME only nodes */
1127 if (hw_opt->nthreads_omp > 1)
1129 gmx_fatal(FARGS,"OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
1130 ecutscheme_names[cutoff_scheme],
1131 ecutscheme_names[ecutsVERLET]);
1133 hw_opt->nthreads_omp = 1;
1136 if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
1138 gmx_fatal(FARGS,"You need to specify -ntomp in addition to -ntomp_pme");
1141 if (hw_opt->nthreads_tot == 1)
1143 hw_opt->nthreads_tmpi = 1;
1145 if (hw_opt->nthreads_omp > 1)
1147 gmx_fatal(FARGS,"You requested %d OpenMP threads with %d total threads",
1148 hw_opt->nthreads_tmpi,hw_opt->nthreads_tot);
1150 hw_opt->nthreads_omp = 1;
1153 if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
1155 hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
1160 fprintf(debug,"hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
1161 hw_opt->nthreads_tot,
1162 hw_opt->nthreads_tmpi,
1163 hw_opt->nthreads_omp,
1164 hw_opt->nthreads_omp_pme,
1165 hw_opt->gpu_id!=NULL ? hw_opt->gpu_id : "");
1171 /* Override the value in inputrec with value passed on the command line (if any) */
1172 static void override_nsteps_cmdline(FILE *fplog,
1175 const t_commrec *cr)
1180 /* override with anything else than the default -2 */
1181 if (nsteps_cmdline > -2)
1185 ir->nsteps = nsteps_cmdline;
1186 if (EI_DYNAMICS(ir->eI))
1188 sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps, %.3f ps",
1189 nsteps_cmdline, nsteps_cmdline*ir->delta_t);
1193 sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps",
1197 md_print_warn(cr, fplog, "%s\n", stmp);
1201 /* Data structure set by SIMMASTER which needs to be passed to all nodes
1202 * before the other nodes have read the tpx file and called gmx_detect_hardware.
1205 int cutoff_scheme; /* The cutoff scheme from inputrec_t */
1206 gmx_bool bUseGPU; /* Use GPU or GPU emulation */
1209 int mdrunner(gmx_hw_opt_t *hw_opt,
1210 FILE *fplog,t_commrec *cr,int nfile,
1211 const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
1212 gmx_bool bCompact, int nstglobalcomm,
1213 ivec ddxyz,int dd_node_order,real rdd,real rconstr,
1214 const char *dddlb_opt,real dlb_scale,
1215 const char *ddcsx,const char *ddcsy,const char *ddcsz,
1216 const char *nbpu_opt,
1217 int nsteps_cmdline, int nstepout,int resetstep,
1218 int nmultisim,int repl_ex_nst,int repl_ex_nex,
1219 int repl_ex_seed, real pforce,real cpt_period,real max_hours,
1220 const char *deviceOptions, unsigned long Flags)
1222 gmx_bool bForceUseGPU,bTryUseGPU;
1223 double nodetime=0,realtime;
1224 t_inputrec *inputrec;
1225 t_state *state=NULL;
1227 gmx_ddbox_t ddbox={0};
1228 int npme_major,npme_minor;
1231 gmx_mtop_t *mtop=NULL;
1232 t_mdatoms *mdatoms=NULL;
1233 t_forcerec *fr=NULL;
1236 gmx_pme_t *pmedata=NULL;
1237 gmx_vsite_t *vsite=NULL;
1238 gmx_constr_t constr;
1239 int i,m,nChargePerturbed=-1,status,nalloc;
1241 gmx_wallcycle_t wcycle;
1242 gmx_bool bReadRNG,bReadEkin;
1244 gmx_runtime_t runtime;
1246 gmx_large_int_t reset_counters;
1247 gmx_edsam_t ed=NULL;
1248 t_commrec *cr_old=cr;
1251 gmx_membed_t membed=NULL;
1252 gmx_hw_info_t *hwinfo=NULL;
1253 master_inf_t minf={-1,FALSE};
1255 /* CAUTION: threads may be started later on in this function, so
1256 cr doesn't reflect the final parallel state right now */
1260 if (Flags & MD_APPENDFILES)
1265 bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
1266 bTryUseGPU = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
1271 /* Read (nearly) all data required for the simulation */
1272 read_tpx_state(ftp2fn(efTPX,nfile,fnm),inputrec,state,NULL,mtop);
1274 if (inputrec->cutoff_scheme != ecutsVERLET &&
1275 ((Flags & MD_TESTVERLET) || getenv("GMX_VERLET_SCHEME") != NULL))
1277 convert_to_verlet_scheme(fplog,inputrec,mtop,det(state->box));
1280 /* Detect hardware, gather information. With tMPI only thread 0 does it
1281 * and after threads are started broadcasts hwinfo around. */
1283 gmx_detect_hardware(fplog, hwinfo, cr,
1284 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
1286 minf.cutoff_scheme = inputrec->cutoff_scheme;
1287 minf.bUseGPU = FALSE;
1289 if (inputrec->cutoff_scheme == ecutsVERLET)
1291 prepare_verlet_scheme(fplog,hwinfo,cr,hw_opt,nbpu_opt,
1292 inputrec,mtop,state->box,
1295 else if (hwinfo->bCanUseGPU)
1297 md_print_warn(cr,fplog,
1298 "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
1299 " To use a GPU, set the mdp option: cutoff-scheme = Verlet\n"
1300 " (for quick performance testing you can use the -testverlet option)\n");
1304 gmx_fatal(FARGS,"GPU requested, but can't be used without cutoff-scheme=Verlet");
1308 #ifndef GMX_THREAD_MPI
1311 gmx_bcast_sim(sizeof(minf),&minf,cr);
1314 if (minf.bUseGPU && cr->npmenodes == -1)
1316 /* Don't automatically use PME-only nodes with GPUs */
1320 /* Check for externally set OpenMP affinity and turn off internal
1321 * pinning if any is found. We need to do this check early to tell
1322 * thread-MPI whether it should do pinning when spawning threads.
1324 gmx_omp_check_thread_affinity(fplog, cr, hw_opt);
1326 #ifdef GMX_THREAD_MPI
1327 /* With thread-MPI inputrec is only set here on the master thread */
1331 check_and_update_hw_opt(hw_opt,minf.cutoff_scheme);
1333 #ifdef GMX_THREAD_MPI
1334 /* Early check for externally set process affinity. Can't do over all
1335 * MPI processes because hwinfo is not available everywhere, but with
1336 * thread-MPI it's needed as pinning might get turned off which needs
1337 * to be known before starting thread-MPI. */
1338 check_cpu_affinity_set(fplog,
1340 hw_opt, hwinfo->nthreads_hw_avail, FALSE);
1343 #ifdef GMX_THREAD_MPI
1344 if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
1346 gmx_fatal(FARGS,"You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME nodes");
1350 if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
1353 gmx_fatal(FARGS,"You need to explicitly specify the number of PME nodes (-npme) when using different number of OpenMP threads for PP and PME nodes");
1357 #ifdef GMX_THREAD_MPI
1360 /* NOW the threads will be started: */
1361 hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
1365 if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
1367 hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
1370 if (hw_opt->nthreads_tmpi > 1)
1372 /* now start the threads. */
1373 cr=mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm,
1374 oenv, bVerbose, bCompact, nstglobalcomm,
1375 ddxyz, dd_node_order, rdd, rconstr,
1376 dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
1378 nsteps_cmdline, nstepout, resetstep, nmultisim,
1379 repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
1380 cpt_period, max_hours, deviceOptions,
1382 /* the main thread continues here with a new cr. We don't deallocate
1383 the old cr because other threads may still be reading it. */
1386 gmx_comm("Failed to spawn threads");
1391 /* END OF CAUTION: cr is now reliable */
1393 /* g_membed initialisation *
1394 * Because we change the mtop, init_membed is called before the init_parallel *
1395 * (in case we ever want to make it run in parallel) */
1396 if (opt2bSet("-membed",nfile,fnm))
1400 fprintf(stderr,"Initializing membed");
1402 membed = init_membed(fplog,nfile,fnm,mtop,inputrec,state,cr,&cpt_period);
1407 /* now broadcast everything to the non-master nodes/threads: */
1408 init_parallel(fplog, cr, inputrec, mtop);
1410 /* This check needs to happen after get_nthreads_mpi() */
1411 if (inputrec->cutoff_scheme == ecutsVERLET && (Flags & MD_PARTDEC))
1413 gmx_fatal_collective(FARGS,cr,NULL,
1414 "The Verlet cut-off scheme is not supported with particle decomposition.\n"
1415 "You can achieve the same effect as particle decomposition by running in parallel using only OpenMP threads.");
1420 pr_inputrec(fplog,0,"Input Parameters",inputrec,FALSE);
1423 #if defined GMX_THREAD_MPI
1424 /* With tMPI we detected on thread 0 and we'll just pass the hwinfo pointer
1425 * to the other threads -- slightly uncool, but works fine, just need to
1426 * make sure that the data doesn't get freed twice. */
1433 gmx_bcast(sizeof(&hwinfo), &hwinfo, cr);
1436 if (PAR(cr) && !SIMMASTER(cr))
1438 /* now we have inputrec on all nodes, can run the detection */
1439 /* TODO: perhaps it's better to propagate within a node instead? */
1441 gmx_detect_hardware(fplog, hwinfo, cr,
1442 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
1445 /* Now do the affinity check with MPI/no-MPI (done earlier with thread-MPI). */
1446 check_cpu_affinity_set(fplog, cr,
1447 hw_opt, hwinfo->nthreads_hw_avail, FALSE);
1450 /* now make sure the state is initialized and propagated */
1451 set_state_entries(state,inputrec,cr->nnodes);
1453 /* remove when vv and rerun works correctly! */
1454 if (PAR(cr) && EI_VV(inputrec->eI) && ((Flags & MD_RERUN) || (Flags & MD_RERUN_VSITE)))
1457 "Currently can't do velocity verlet with rerun in parallel.");
1460 /* A parallel command line option consistency check that we can
1461 only do after any threads have started. */
1463 (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
1466 "The -dd or -npme option request a parallel simulation, "
1468 "but %s was compiled without threads or MPI enabled"
1470 #ifdef GMX_THREAD_MPI
1471 "but the number of threads (option -nt) is 1"
1473 "but %s was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec"
1480 if ((Flags & MD_RERUN) &&
1481 (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
1483 gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
1486 if (can_use_allvsall(inputrec,mtop,TRUE,cr,fplog) && PAR(cr))
1488 /* All-vs-all loops do not work with domain decomposition */
1489 Flags |= MD_PARTDEC;
1492 if (!EEL_PME(inputrec->coulombtype) || (Flags & MD_PARTDEC))
1494 if (cr->npmenodes > 0)
1496 if (!EEL_PME(inputrec->coulombtype))
1498 gmx_fatal_collective(FARGS,cr,NULL,
1499 "PME nodes are requested, but the system does not use PME electrostatics");
1501 if (Flags & MD_PARTDEC)
1503 gmx_fatal_collective(FARGS,cr,NULL,
1504 "PME nodes are requested, but particle decomposition does not support separate PME nodes");
1512 fcRegisterSteps(inputrec->nsteps,inputrec->init_step);
1515 /* NMR restraints must be initialized before load_checkpoint,
1516 * since with time averaging the history is added to t_state.
1517 * For proper consistency check we therefore need to extend
1519 * So the PME-only nodes (if present) will also initialize
1520 * the distance restraints.
1524 /* This needs to be called before read_checkpoint to extend the state */
1525 init_disres(fplog,mtop,inputrec,cr,Flags & MD_PARTDEC,fcd,state);
1527 if (gmx_mtop_ftype_count(mtop,F_ORIRES) > 0)
1529 if (PAR(cr) && !(Flags & MD_PARTDEC))
1531 gmx_fatal(FARGS,"Orientation restraints do not work (yet) with domain decomposition, use particle decomposition (mdrun option -pd)");
1533 /* Orientation restraints */
1536 init_orires(fplog,mtop,state->x,inputrec,cr->ms,&(fcd->orires),
1541 if (DEFORM(*inputrec))
1543 /* Store the deform reference box before reading the checkpoint */
1546 copy_mat(state->box,box);
1550 gmx_bcast(sizeof(box),box,cr);
1552 /* Because we do not have the update struct available yet
1553 * in which the reference values should be stored,
1554 * we store them temporarily in static variables.
1555 * This should be thread safe, since they are only written once
1556 * and with identical values.
1558 #ifdef GMX_THREAD_MPI
1559 tMPI_Thread_mutex_lock(&deform_init_box_mutex);
1561 deform_init_init_step_tpx = inputrec->init_step;
1562 copy_mat(box,deform_init_box_tpx);
1563 #ifdef GMX_THREAD_MPI
1564 tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
1568 if (opt2bSet("-cpi",nfile,fnm))
1570 /* Check if checkpoint file exists before doing continuation.
1571 * This way we can use identical input options for the first and subsequent runs...
1573 if( gmx_fexist_master(opt2fn_master("-cpi",nfile,fnm,cr),cr) )
1575 load_checkpoint(opt2fn_master("-cpi",nfile,fnm,cr),&fplog,
1576 cr,Flags & MD_PARTDEC,ddxyz,
1577 inputrec,state,&bReadRNG,&bReadEkin,
1578 (Flags & MD_APPENDFILES),
1579 (Flags & MD_APPENDFILESSET));
1583 Flags |= MD_READ_RNG;
1587 Flags |= MD_READ_EKIN;
1592 if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES))
1593 #ifdef GMX_THREAD_MPI
1594 /* With thread MPI only the master node/thread exists in mdrun.c,
1595 * therefore non-master nodes need to open the "seppot" log file here.
1597 || (!MASTER(cr) && (Flags & MD_SEPPOT))
1601 gmx_log_open(ftp2fn(efLOG,nfile,fnm),cr,!(Flags & MD_SEPPOT),
1605 /* override nsteps with value from cmdline */
1606 override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);
1610 copy_mat(state->box,box);
1615 gmx_bcast(sizeof(box),box,cr);
1618 /* Essential dynamics */
1619 if (opt2bSet("-ei",nfile,fnm))
1621 /* Open input and output files, allocate space for ED data structure */
1622 ed = ed_open(nfile,fnm,Flags,cr);
1625 if (PAR(cr) && !((Flags & MD_PARTDEC) ||
1626 EI_TPI(inputrec->eI) ||
1627 inputrec->eI == eiNM))
1629 cr->dd = init_domain_decomposition(fplog,cr,Flags,ddxyz,rdd,rconstr,
1630 dddlb_opt,dlb_scale,
1634 &ddbox,&npme_major,&npme_minor);
1636 make_dd_communicators(fplog,cr,dd_node_order);
1638 /* Set overallocation to avoid frequent reallocation of arrays */
1639 set_over_alloc_dd(TRUE);
1643 /* PME, if used, is done on all nodes with 1D decomposition */
1645 cr->duty = (DUTY_PP | DUTY_PME);
1648 if (!EI_TPI(inputrec->eI))
1650 npme_major = cr->nnodes;
1653 if (inputrec->ePBC == epbcSCREW)
1656 "pbc=%s is only implemented with domain decomposition",
1657 epbc_names[inputrec->ePBC]);
1663 /* After possible communicator splitting in make_dd_communicators.
1664 * we can set up the intra/inter node communication.
1666 gmx_setup_nodecomm(fplog,cr);
1669 /* Initialize per-node process ID and counters. */
1670 gmx_init_intra_counters(cr);
1673 md_print_info(cr,fplog,"Using %d MPI %s\n",
1675 #ifdef GMX_THREAD_MPI
1676 cr->nnodes==1 ? "thread" : "threads"
1678 cr->nnodes==1 ? "process" : "processes"
1683 gmx_omp_nthreads_init(fplog, cr,
1684 hwinfo->nthreads_hw_avail,
1685 hw_opt->nthreads_omp,
1686 hw_opt->nthreads_omp_pme,
1687 (cr->duty & DUTY_PP) == 0,
1688 inputrec->cutoff_scheme == ecutsVERLET);
1690 gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi, minf.bUseGPU);
1692 /* getting number of PP/PME threads
1693 PME: env variable should be read only on one node to make sure it is
1694 identical everywhere;
1696 /* TODO nthreads_pp is only used for pinning threads.
1697 * This is a temporary solution until we have a hw topology library.
1699 nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
1700 nthreads_pme = gmx_omp_nthreads_get(emntPME);
1702 wcycle = wallcycle_init(fplog,resetstep,cr,nthreads_pp,nthreads_pme);
1706 /* Master synchronizes its value of reset_counters with all nodes
1707 * including PME only nodes */
1708 reset_counters = wcycle_get_reset_counters(wcycle);
1709 gmx_bcast_sim(sizeof(reset_counters),&reset_counters,cr);
1710 wcycle_set_reset_counters(wcycle, reset_counters);
1714 if (cr->duty & DUTY_PP)
1716 /* For domain decomposition we allocate dynamically
1717 * in dd_partition_system.
1719 if (DOMAINDECOMP(cr))
1721 bcast_state_setup(cr,state);
1727 bcast_state(cr,state,TRUE);
1731 /* Initiate forcerecord */
1733 fr->hwinfo = hwinfo;
1734 init_forcerec(fplog,oenv,fr,fcd,inputrec,mtop,cr,box,FALSE,
1735 opt2fn("-table",nfile,fnm),
1736 opt2fn("-tabletf",nfile,fnm),
1737 opt2fn("-tablep",nfile,fnm),
1738 opt2fn("-tableb",nfile,fnm),
1742 /* version for PCA_NOT_READ_NODE (see md.c) */
1743 /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
1744 "nofile","nofile","nofile","nofile",FALSE,pforce);
1746 fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);
1748 /* Initialize QM-MM */
1751 init_QMMMrec(cr,box,mtop,inputrec,fr);
1754 /* Initialize the mdatoms structure.
1755 * mdatoms is not filled with atom data,
1756 * as this can not be done now with domain decomposition.
1758 mdatoms = init_mdatoms(fplog,mtop,inputrec->efep!=efepNO);
1760 /* Initialize the virtual site communication */
1761 vsite = init_vsite(mtop,cr,FALSE);
1763 calc_shifts(box,fr->shift_vec);
1765 /* With periodic molecules the charge groups should be whole at start up
1766 * and the virtual sites should not be far from their proper positions.
1768 if (!inputrec->bContinuation && MASTER(cr) &&
1769 !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
1771 /* Make molecules whole at start of run */
1772 if (fr->ePBC != epbcNONE)
1774 do_pbc_first_mtop(fplog,inputrec->ePBC,box,mtop,state->x);
1778 /* Correct initial vsite positions are required
1779 * for the initial distribution in the domain decomposition
1780 * and for the initial shell prediction.
1782 construct_vsites_mtop(fplog,vsite,mtop,state->x);
1786 if (EEL_PME(fr->eeltype))
1788 ewaldcoeff = fr->ewaldcoeff;
1789 pmedata = &fr->pmedata;
1798 /* This is a PME only node */
1800 /* We don't need the state */
1803 ewaldcoeff = calc_ewaldcoeff(inputrec->rcoulomb, inputrec->ewald_rtol);
1807 /* Before setting affinity, check whether the affinity has changed
1808 * - which indicates that probably the OpenMP library has changed it since
1809 * we first checked). */
1810 check_cpu_affinity_set(fplog, cr, hw_opt, hwinfo->nthreads_hw_avail, TRUE);
1812 /* Set the CPU affinity */
1813 set_cpu_affinity(fplog,cr,hw_opt,nthreads_pme,hwinfo,inputrec);
1815 /* Initiate PME if necessary,
1816 * either on all nodes or on dedicated PME nodes only. */
1817 if (EEL_PME(inputrec->coulombtype))
1821 nChargePerturbed = mdatoms->nChargePerturbed;
1823 if (cr->npmenodes > 0)
1825 /* The PME only nodes need to know nChargePerturbed */
1826 gmx_bcast_sim(sizeof(nChargePerturbed),&nChargePerturbed,cr);
1829 if (cr->duty & DUTY_PME)
1831 status = gmx_pme_init(pmedata,cr,npme_major,npme_minor,inputrec,
1832 mtop ? mtop->natoms : 0,nChargePerturbed,
1833 (Flags & MD_REPRODUCIBLE),nthreads_pme);
1836 gmx_fatal(FARGS,"Error %d initializing PME",status);
1842 if (integrator[inputrec->eI].func == do_md
1845 integrator[inputrec->eI].func == do_md_openmm
1849 /* Turn on signal handling on all nodes */
1851 * (A user signal from the PME nodes (if any)
1852 * is communicated to the PP nodes.
1854 signal_handler_install();
1857 if (cr->duty & DUTY_PP)
1859 if (inputrec->ePull != epullNO)
1861 /* Initialize pull code */
1862 init_pull(fplog,inputrec,nfile,fnm,mtop,cr,oenv, inputrec->fepvals->init_lambda,
1863 EI_DYNAMICS(inputrec->eI) && MASTER(cr),Flags);
1868 /* Initialize enforced rotation code */
1869 init_rot(fplog,inputrec,nfile,fnm,cr,state->x,box,mtop,oenv,
1873 constr = init_constraints(fplog,mtop,inputrec,ed,state,cr);
1875 if (DOMAINDECOMP(cr))
1877 dd_init_bondeds(fplog,cr->dd,mtop,vsite,constr,inputrec,
1878 Flags & MD_DDBONDCHECK,fr->cginfo_mb);
1880 set_dd_parameters(fplog,cr->dd,dlb_scale,inputrec,fr,&ddbox);
1882 setup_dd_grid(fplog,cr->dd);
1885 /* Now do whatever the user wants us to do (how flexible...) */
1886 integrator[inputrec->eI].func(fplog,cr,nfile,fnm,
1887 oenv,bVerbose,bCompact,
1890 nstepout,inputrec,mtop,
1892 mdatoms,nrnb,wcycle,ed,fr,
1893 repl_ex_nst,repl_ex_nex,repl_ex_seed,
1895 cpt_period,max_hours,
1900 if (inputrec->ePull != epullNO)
1902 finish_pull(fplog,inputrec->pull);
1907 finish_rot(fplog,inputrec->rot);
1914 gmx_pmeonly(*pmedata,cr,nrnb,wcycle,ewaldcoeff,FALSE,inputrec);
1917 if (EI_DYNAMICS(inputrec->eI) || EI_TPI(inputrec->eI))
1919 /* Some timing stats */
1922 if (runtime.proc == 0)
1924 runtime.proc = runtime.real;
1933 wallcycle_stop(wcycle,ewcRUN);
1935 /* Finish up, write some stuff
1936 * if rerunMD, don't write last frame again
1938 finish_run(fplog,cr,ftp2fn(efSTO,nfile,fnm),
1939 inputrec,nrnb,wcycle,&runtime,
1940 fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ?
1941 nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL,
1943 EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
1945 if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU)
1947 char gpu_err_str[STRLEN];
1949 /* free GPU memory and uninitialize GPU (by destroying the context) */
1950 nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
1952 if (!free_gpu(gpu_err_str))
1954 gmx_warning("On node %d failed to free GPU #%d: %s",
1955 cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
1959 if (opt2bSet("-membed",nfile,fnm))
1964 #ifdef GMX_THREAD_MPI
1965 if (PAR(cr) && SIMMASTER(cr))
1968 gmx_hardware_info_free(hwinfo);
1971 /* Does what it says */
1972 print_date_and_time(fplog,cr->nodeid,"Finished mdrun",&runtime);
1974 /* Close logfile already here if we were appending to it */
1975 if (MASTER(cr) && (Flags & MD_APPENDFILES))
1977 gmx_log_close(fplog);
1980 rc=(int)gmx_get_stop_condition();
1982 #ifdef GMX_THREAD_MPI
1983 /* we need to join all threads. The sub-threads join when they
1984 exit this function, but the master thread needs to be told to
1986 if (PAR(cr) && MASTER(cr))