2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team,
6 * check out http://www.gromacs.org for more information.
7 * Copyright (c) 2012, by the GROMACS development team, led by
8 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
9 * others, as listed in the AUTHORS file in the top-level source
10 * directory and at http://www.gromacs.org.
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
41 #if defined(HAVE_SCHED_H) && defined(HAVE_SCHED_GETAFFINITY)
44 #include <sys/syscall.h>
56 #include "md_logging.h"
57 #include "md_support.h"
67 #include "mpelogging.h"
73 #include "checkpoint.h"
74 #include "mtop_util.h"
75 #include "sighandler.h"
78 #include "gmx_detect_hardware.h"
79 #include "gmx_omp_nthreads.h"
80 #include "pull_rotation.h"
81 #include "calc_verletbuf.h"
82 #include "../mdlib/nbnxn_search.h"
83 #include "../mdlib/nbnxn_consts.h"
84 #include "gmx_fatal_collective.h"
86 #include "md_openmm.h"
89 #include "thread_mpi/threads.h"
91 #include "gromacs/utility/gmxmpi.h"
97 #include "gpu_utils.h"
98 #include "nbnxn_cuda_data_mgmt.h"
/* Fragment of the gmx_intp_t wrapper (the enclosing typedef struct is not
 * visible in this view): a single integrator function pointer. */
101 gmx_integrator_t *func;
104 /* The array should match the eI array in include/types/enums.h */
/* One entry per integrator (eiNR of them); in this OpenMM build every
 * integrator dispatches to do_md_openmm. */
105 const gmx_intp_t integrator[eiNR] = { {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm},{do_md_openmm}};
/* Globals used to communicate the tpr deform reference box between ranks;
 * with thread-MPI access is serialized through deform_init_box_mutex. */
107 gmx_large_int_t deform_init_init_step_tpx;
108 matrix deform_init_box_tpx;
109 #ifdef GMX_THREAD_MPI
110 tMPI_Thread_mutex_t deform_init_box_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
114 #ifdef GMX_THREAD_MPI
/* Argument bundle passed through the void* thread-start interface of tMPI
 * to mdrunner_start_fn(). Only a subset of the members is visible here;
 * they mirror the mdrunner() parameter list. */
115 struct mdrunner_arglist
117 gmx_hw_opt_t *hw_opt;
130 const char *dddlb_opt;
135 const char *nbpu_opt;
146 const char *deviceOptions;
148 int ret; /* return value */
152 /* The function used for spawning threads. Extracts the mdrunner()
153 arguments from its one argument and calls mdrunner(), after making
/* Thread entry point handed to tMPI_Init_fn(); runs in each spawned
 * thread-MPI thread. The mdrunner() return value is stored in mda->ret.
 * NOTE(review): fplog and fnm setup lines are not visible in this view —
 * presumably fplog is re-opened/duplicated per thread; verify upstream. */
155 static void mdrunner_start_fn(void *arg)
157 struct mdrunner_arglist *mda=(struct mdrunner_arglist*)arg;
158 struct mdrunner_arglist mc=*mda; /* copy the arg list to make sure
159 that it's thread-local. This doesn't
160 copy pointed-to items, of course,
161 but those are all const. */
162 t_commrec *cr; /* we need a local version of this */
/* Duplicate the filename table so each thread owns its copy. */
166 fnm = dup_tfn(mc.nfile, mc.fnm);
/* Build a per-thread communication record from the parent's. */
168 cr = init_par_threads(mc.cr);
175 mda->ret=mdrunner(mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv,
176 mc.bVerbose, mc.bCompact, mc.nstglobalcomm,
177 mc.ddxyz, mc.dd_node_order, mc.rdd,
178 mc.rconstr, mc.dddlb_opt, mc.dlb_scale,
179 mc.ddcsx, mc.ddcsy, mc.ddcsz,
181 mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
182 mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce,
183 mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.Flags)
186 /* called by mdrunner() to start a specific number of threads (including
187 the main thread) for thread-parallel runs. This in turn calls mdrunner()
189 All options besides nthreads are the same as for mdrunner(). */
/* Returns the new t_commrec for the master thread, or (presumably) NULL on
 * failure / when no threads are started — the early-return path after the
 * nthreads_tmpi < 2 check is not visible here; verify upstream. */
190 static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
191 FILE *fplog,t_commrec *cr,int nfile,
192 const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
193 gmx_bool bCompact, int nstglobalcomm,
194 ivec ddxyz,int dd_node_order,real rdd,real rconstr,
195 const char *dddlb_opt,real dlb_scale,
196 const char *ddcsx,const char *ddcsy,const char *ddcsz,
197 const char *nbpu_opt,
198 int nsteps_cmdline, int nstepout,int resetstep,
199 int nmultisim,int repl_ex_nst,int repl_ex_nex, int repl_ex_seed,
200 real pforce,real cpt_period, real max_hours,
201 const char *deviceOptions, unsigned long Flags)
204 struct mdrunner_arglist *mda;
205 t_commrec *crn; /* the new commrec */
208 /* first check whether we even need to start tMPI */
209 if (hw_opt->nthreads_tmpi < 2)
214 /* a few small, one-time, almost unavoidable memory leaks: */
216 fnmn=dup_tfn(nfile, fnm);
218 /* fill the data structure to pass as void pointer to thread start fn */
/* Scalar/pointer arguments are copied member-by-member; pointed-to data is
 * shared (const) between threads. */
225 mda->bVerbose=bVerbose;
226 mda->bCompact=bCompact;
227 mda->nstglobalcomm=nstglobalcomm;
228 mda->ddxyz[XX]=ddxyz[XX];
229 mda->ddxyz[YY]=ddxyz[YY];
230 mda->ddxyz[ZZ]=ddxyz[ZZ];
231 mda->dd_node_order=dd_node_order;
233 mda->rconstr=rconstr;
234 mda->dddlb_opt=dddlb_opt;
235 mda->dlb_scale=dlb_scale;
239 mda->nbpu_opt=nbpu_opt;
240 mda->nsteps_cmdline=nsteps_cmdline;
241 mda->nstepout=nstepout;
242 mda->resetstep=resetstep;
243 mda->nmultisim=nmultisim;
244 mda->repl_ex_nst=repl_ex_nst;
245 mda->repl_ex_nex=repl_ex_nex;
246 mda->repl_ex_seed=repl_ex_seed;
248 mda->cpt_period=cpt_period;
249 mda->max_hours=max_hours;
250 mda->deviceOptions=deviceOptions;
253 /* now spawn new threads that start mdrunner_start_fn(), while
254 the main thread returns */
255 ret=tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi,
256 (hw_opt->bThreadPinning ? TMPI_AFFINITY_ALL_CORES : TMPI_AFFINITY_NONE),
257 mdrunner_start_fn, (void*)(mda) );
258 if (ret!=TMPI_SUCCESS)
261 /* make a new comm_rec to reflect the new situation */
262 crn=init_par_threads(cr);
/* Decide how to divide nthreads_tot between thread-MPI ranks and OpenMP
 * threads. With GPUs present, one tMPI rank per GPU; otherwise the split
 * is driven by a user-set -ntomp or by CPU-generation heuristics below.
 * Returns the chosen thread-MPI rank count. */
267 static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
268 const gmx_hw_opt_t *hw_opt,
274 /* There are no separate PME nodes here, as we ensured in
275 * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
276 * and a conditional ensures we would not have ended up here.
277 * Note that separate PME nodes might be switched on later.
/* One thread-MPI rank per GPU, capped by the total thread budget. */
281 nthreads_tmpi = ngpu;
282 if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
284 nthreads_tmpi = nthreads_tot;
287 else if (hw_opt->nthreads_omp > 0)
289 /* Here we could oversubscribe, when we do, we issue a warning later */
290 nthreads_tmpi = max(1,nthreads_tot/hw_opt->nthreads_omp);
294 /* TODO choose nthreads_omp based on hardware topology
295 when we have a hardware topology detection library */
296 /* In general, when running up to 4 threads, OpenMP should be faster.
306 const int nthreads_omp_always_faster = 4;
307 const int nthreads_omp_always_faster_Nehalem = 12;
308 const int nthreads_omp_always_faster_SandyBridge = 16;
309 const int first_model_Nehalem = 0x1A;
310 const int first_model_SandyBridge = 0x2A;
311 gmx_bool bIntel_Family6;
314 (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
315 gmx_cpuid_family(hwinfo->cpuid_info) == 6);
317 if (nthreads_tot <= nthreads_omp_always_faster ||
/* NOTE(review): likely bug — the CPUID model is compared against the
 * thread-count limits nthreads_omp_always_faster_Nehalem/SandyBridge
 * (12 and 16) instead of the model constants first_model_Nehalem (0x1A)
 * and first_model_SandyBridge (0x2A) declared above, which are otherwise
 * unused here. The intended tests are presumably
 *   gmx_cpuid_model(...) >= first_model_Nehalem     && nthreads_tot <= 12
 *   gmx_cpuid_model(...) >= first_model_SandyBridge && nthreads_tot <= 16
 * Verify against upstream before changing. */
319 ((gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_Nehalem && nthreads_tot <= nthreads_omp_always_faster_Nehalem) ||
320 (gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_SandyBridge && nthreads_tot <= nthreads_omp_always_faster_SandyBridge))))
322 /* Use pure OpenMP parallelization */
327 /* Don't use OpenMP parallelization */
328 nthreads_tmpi = nthreads_tot;
332 return nthreads_tmpi;
336 /* Get the number of threads to use for thread-MPI based on how many
337 * were requested, which algorithms we're using,
338 * and how many particles there are.
339 * At the point we have already called check_and_update_hw_opt.
340 * Thus all options should be internally consistent and consistent
341 * with the hardware, except that ntmpi could be larger than #GPU.
343 static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
344 gmx_hw_opt_t *hw_opt,
345 t_inputrec *inputrec, gmx_mtop_t *mtop,
349 int nthreads_hw,nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu;
350 int min_atoms_per_mpi_thread;
/* User explicitly requested a thread-MPI count: honor it unchanged. */
355 if (hw_opt->nthreads_tmpi > 0)
357 /* Trivial, return right away */
358 return hw_opt->nthreads_tmpi;
361 nthreads_hw = hwinfo->nthreads_hw_avail;
363 /* How many total (#tMPI*#OpenMP) threads can we start? */
364 if (hw_opt->nthreads_tot > 0)
366 nthreads_tot_max = hw_opt->nthreads_tot;
370 nthreads_tot_max = nthreads_hw;
/* GPUs are only usable with the Verlet cut-off scheme. */
373 bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU);
376 ngpu = hwinfo->gpu_info.ncuda_dev_use;
384 get_tmpi_omp_thread_division(hwinfo,hw_opt,nthreads_tot_max,ngpu);
386 if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
388 /* Steps are divided over the nodes iso splitting the atoms */
389 min_atoms_per_mpi_thread = 0;
/* GPU runs need more atoms per rank to stay efficient than CPU runs. */
395 min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU;
399 min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD;
403 /* Check if an algorithm does not support parallel simulation. */
404 if (nthreads_tmpi != 1 &&
405 ( inputrec->eI == eiLBFGS ||
406 inputrec->coulombtype == eelEWALD ) )
410 md_print_warn(cr,fplog,"The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
411 if (hw_opt->nthreads_tmpi > nthreads_tmpi)
413 gmx_fatal(FARGS,"You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
416 else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread)
418 /* the thread number was chosen automatically, but there are too many
419 threads (too few atoms per thread) */
420 nthreads_new = max(1,mtop->natoms/min_atoms_per_mpi_thread);
422 /* Avoid partial use of Hyper-Threading */
423 if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
424 nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw)
426 nthreads_new = nthreads_hw/2;
429 /* Avoid large prime numbers in the thread count */
430 if (nthreads_new >= 6)
432 /* Use only 6,8,10 with additional factors of 2 */
436 while (3*fac*2 <= nthreads_new)
/* Round down to a multiple of fac to avoid awkward DD grids. */
441 nthreads_new = (nthreads_new/fac)*fac;
446 if (nthreads_new == 5)
452 nthreads_tmpi = nthreads_new;
454 fprintf(stderr,"\n");
455 fprintf(stderr,"NOTE: Parallelization is limited by the small number of atoms,\n");
456 fprintf(stderr," only starting %d thread-MPI threads.\n",nthreads_tmpi);
457 fprintf(stderr," You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
460 return nthreads_tmpi;
462 #endif /* GMX_THREAD_MPI */
/* Tuning knobs for the GPU nstlist heuristic in increase_nstlist() below. */
465 /* Environment variable for setting nstlist */
466 static const char* NSTLIST_ENVVAR = "GMX_NSTLIST";
467 /* Try to increase nstlist when using a GPU with nstlist less than this */
468 static const int NSTLIST_GPU_ENOUGH = 20;
469 /* Increase nstlist until the non-bonded cost increases more than this factor */
470 static const float NBNXN_GPU_LIST_OK_FAC = 1.25;
471 /* Don't increase nstlist beyond a non-bonded cost increases of this factor */
472 static const float NBNXN_GPU_LIST_MAX_FAC = 1.40;
474 /* Try to increase nstlist when running on a GPU */
/* Walks the candidate values in nstl[], recomputing the Verlet pair-list
 * buffer (rlist) for each, and keeps the largest nstlist whose rlist still
 * fits the box / domain decomposition and whose non-bonded cost increase
 * stays within NBNXN_GPU_LIST_*_FAC. GMX_NSTLIST overrides the search.
 * Mutates ir->nstlist, ir->rlist and ir->rlistlong in place. */
475 static void increase_nstlist(FILE *fp,t_commrec *cr,
476 t_inputrec *ir,const gmx_mtop_t *mtop,matrix box)
479 int nstlist_orig,nstlist_prev;
480 verletbuf_list_setup_t ls;
481 real rlist_inc,rlist_ok,rlist_max,rlist_new,rlist_prev;
484 gmx_bool bBox,bDD,bCont;
485 const char *nstl_fmt="\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
486 const char *vbd_err="Can not increase nstlist for GPU run because verlet-buffer-drift is not set or used";
487 const char *box_err="Can not increase nstlist for GPU run because the box is too small";
488 const char *dd_err ="Can not increase nstlist for GPU run because of domain decomposition limitations";
491 /* Number of + nstlist alternative values to try when switching  */
492 const int nstl[]={ 20, 25, 40, 50 };
493 #define NNSTL sizeof(nstl)/sizeof(nstl[0])
495 env = getenv(NSTLIST_ENVVAR);
500 fprintf(fp,nstl_fmt,ir->nstlist);
/* drift == 0 only occurs in pre-Verlet-scheme tpr files. */
504 if (ir->verletbuf_drift == 0)
506 gmx_fatal(FARGS,"You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
/* drift < 0 means the buffer-drift tuning is disabled: bail out. */
509 if (ir->verletbuf_drift < 0)
513 fprintf(stderr,"%s\n",vbd_err);
517 fprintf(fp,"%s\n",vbd_err);
523 nstlist_orig = ir->nstlist;
526 sprintf(buf,"Getting nstlist from environment variable GMX_NSTLIST=%s",env);
529 fprintf(stderr,"%s\n",buf);
533 fprintf(fp,"%s\n",buf);
/* NOTE(review): sscanf return value is unchecked — a non-numeric
 * GMX_NSTLIST silently leaves ir->nstlist unchanged; consider strtol
 * with validation. */
535 sscanf(env,"%d",&ir->nstlist);
538 verletbuf_get_list_setup(TRUE,&ls);
540 /* Allow rlist to make the list double the size of the cut-off sphere */
541 rlist_inc = nbnxn_get_rlist_effective_inc(NBNXN_GPU_CLUSTER_SIZE,mtop->natoms/det(box));
/* Cube-root scaling: list cost grows with the volume of the rlist sphere. */
542 rlist_ok = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_OK_FAC,1.0/3.0) - rlist_inc;
543 rlist_max = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_MAX_FAC,1.0/3.0) - rlist_inc;
546 fprintf(debug,"GPU nstlist tuning: rlist_inc %.3f rlist_max %.3f\n",
547 rlist_inc,rlist_max);
551 nstlist_prev = nstlist_orig;
552 rlist_prev = ir->rlist;
557 ir->nstlist = nstl[i];
560 /* Set the pair-list buffer size in ir */
561 calc_verlet_buffer_size(mtop,det(box),ir,ir->verletbuf_drift,&ls,
564 /* Does rlist fit in the box? */
565 bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC,box));
567 if (bBox && DOMAINDECOMP(cr))
569 /* Check if rlist fits in the domain decomposition */
570 if (inputrec2nboundeddim(ir) < DIM)
572 gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
574 copy_mat(box,state_tmp.box);
575 bDD = change_dd_cutoff(cr,&state_tmp,ir,rlist_new);
582 if (bBox && bDD && rlist_new <= rlist_max)
584 /* Increase nstlist */
585 nstlist_prev = ir->nstlist;
586 rlist_prev = rlist_new;
587 bCont = (i+1 < NNSTL && rlist_new < rlist_ok);
591 /* Stick with the previous nstlist */
592 ir->nstlist = nstlist_prev;
593 rlist_new = rlist_prev;
605 gmx_warning(!bBox ? box_err : dd_err);
608 fprintf(fp,"\n%s\n",bBox ? box_err : dd_err);
610 ir->nstlist = nstlist_orig;
612 else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
614 sprintf(buf,"Changing nstlist from %d to %d, rlist from %g to %g",
615 nstlist_orig,ir->nstlist,
616 ir->rlist,rlist_new);
619 fprintf(stderr,"%s\n\n",buf);
623 fprintf(fp,"%s\n\n",buf);
625 ir->rlist = rlist_new;
626 ir->rlistlong = rlist_new;
/* Prepare a Verlet-scheme run: decide (on the master) whether a GPU will
 * be used, update the pair-list buffer size (rlist) for the chosen kernel
 * cluster setup, and possibly tune nstlist for GPU performance. */
630 static void prepare_verlet_scheme(FILE *fplog,
631 gmx_hw_info_t *hwinfo,
633 gmx_hw_opt_t *hw_opt,
634 const char *nbpu_opt,
636 const gmx_mtop_t *mtop,
640 /* Here we only check for GPU usage on the MPI master process,
641 * as here we don't know how many GPUs we will use yet.
642 * We check for a GPU on all processes later.
/* GMX_EMULATE_GPU forces the GPU code path in emulation mode. */
644 *bUseGPU = hwinfo->bCanUseGPU || (getenv("GMX_EMULATE_GPU") != NULL);
646 if (ir->verletbuf_drift > 0)
648 /* Update the Verlet buffer size for the current run setup */
649 verletbuf_list_setup_t ls;
652 /* Here we assume CPU acceleration is on. But as currently
653 * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
654 * and 4x2 gives a larger buffer than 4x4, this is ok.
656 verletbuf_get_list_setup(*bUseGPU,&ls);
658 calc_verlet_buffer_size(mtop,det(box),ir,
659 ir->verletbuf_drift,&ls,
661 if (rlist_new != ir->rlist)
665 fprintf(fplog,"\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
667 ls.cluster_size_i,ls.cluster_size_j);
669 ir->rlist = rlist_new;
670 ir->rlistlong = rlist_new;
674 /* With GPU or emulation we should check nstlist for performance */
675 if ((EI_DYNAMICS(ir->eI) &&
677 ir->nstlist < NSTLIST_GPU_ENOUGH) ||
678 getenv(NSTLIST_ENVVAR) != NULL)
680 /* Choose a better nstlist */
681 increase_nstlist(fplog,cr,ir,mtop,box);
/* Convert an old group-scheme tpr setup to the Verlet cut-off scheme
 * in place (testing aid, triggered by -testverlet / GMX_VERLET_SCHEME).
 * Rejects inputs the Verlet scheme cannot represent (unequal cut-offs,
 * user tables, free energy, implicit solvent, non-3D pbc) and rewrites
 * switched/shifted interactions to plain cut-off or RF/PME. */
685 static void convert_to_verlet_scheme(FILE *fplog,
687 gmx_mtop_t *mtop,real box_vol)
689 char *conv_mesg="Converting input file with group cut-off scheme to the Verlet cut-off scheme";
691 md_print_warn(NULL,fplog,"%s\n",conv_mesg);
693 ir->cutoff_scheme = ecutsVERLET;
/* Provisional buffer-drift target; overridden below depending on the
 * integrator/thermostat combination. */
694 ir->verletbuf_drift = 0.005;
696 if (ir->rcoulomb != ir->rvdw)
698 gmx_fatal(FARGS,"The VdW and Coulomb cut-offs are different, whereas the Verlet scheme only supports equal cut-offs");
701 if (ir->vdwtype == evdwUSER || EEL_USER(ir->coulombtype))
703 gmx_fatal(FARGS,"User non-bonded potentials are not (yet) supported with the Verlet scheme");
705 else if (EVDW_SWITCHED(ir->vdwtype) || EEL_SWITCHED(ir->coulombtype))
707 md_print_warn(NULL,fplog,"Converting switched or shifted interactions to a shifted potential (without force shift), this will lead to slightly different interaction potentials");
709 if (EVDW_SWITCHED(ir->vdwtype))
711 ir->vdwtype = evdwCUT;
713 if (EEL_SWITCHED(ir->coulombtype))
715 if (EEL_FULL(ir->coulombtype))
717 /* With full electrostatic only PME can be switched */
718 ir->coulombtype = eelPME;
722 md_print_warn(NULL,fplog,"NOTE: Replacing %s electrostatics with reaction-field with epsilon-rf=inf\n",eel_names[ir->coulombtype]);
723 ir->coulombtype = eelRF;
/* epsilon_rf = 0 encodes epsilon-rf = infinity. */
724 ir->epsilon_rf = 0.0;
728 /* We set the target energy drift to a small number.
729 * Note that this is only for testing. For production the user
730 * should think about this and set the mdp options.
732 ir->verletbuf_drift = 1e-4;
735 if (inputrec2nboundeddim(ir) != 3)
737 gmx_fatal(FARGS,"Can only convert old tpr files to the Verlet cut-off scheme with 3D pbc");
740 if (ir->efep != efepNO || ir->implicit_solvent != eisNO)
742 gmx_fatal(FARGS,"Will not convert old tpr files to the Verlet cut-off scheme with free-energy calculations or implicit solvent");
745 if (EI_DYNAMICS(ir->eI) && !(EI_MD(ir->eI) && ir->etc == etcNO))
747 verletbuf_list_setup_t ls;
749 verletbuf_get_list_setup(FALSE,&ls);
750 calc_verlet_buffer_size(mtop,box_vol,ir,ir->verletbuf_drift,&ls,
/* Fallback branch: disable drift tuning and use a fixed 5% margin. */
755 ir->verletbuf_drift = -1;
756 ir->rlist = 1.05*max(ir->rvdw,ir->rcoulomb);
/* The Verlet scheme has no charge groups; strip them from the topology. */
759 gmx_mtop_remove_chargegroups(mtop);
762 /* Check the process affinity mask. If it is non-zero, something
763 * else has set the affinity, and mdrun should honor that and
764 * not attempt to do its own thread pinning.
766 * This function should be called twice. Once before the OpenMP
767 * library gets initialized with bAfterOpenMPInit=FALSE (which will
768 * detect affinity set by external tools like taskset), and again
769 * later, after the OpenMP initialization, with bAfterOpenMPInit=TRUE
770 * (which will detect affinity changes made by the OpenMP library).
772 * Note that this will only work on Linux, because we use a GNU
/* Side effect: clears hw_opt->bThreadPinning when a non-default mask is
 * found, so later pinning code becomes a no-op. Compiles to nothing when
 * HAVE_SCHED_GETAFFINITY is not defined. */
774 static void check_cpu_affinity_set(FILE *fplog, const t_commrec *cr,
775 gmx_hw_opt_t *hw_opt, int ncpus,
776 gmx_bool bAfterOpenmpInit)
778 #ifdef HAVE_SCHED_GETAFFINITY
779 cpu_set_t mask_current;
780 int i, ret, cpu_count, cpu_set;
784 if (!hw_opt->bThreadPinning)
786 /* internal affinity setting is off, don't bother checking process affinity */
790 CPU_ZERO(&mask_current);
791 if ((ret = sched_getaffinity(0, sizeof(cpu_set_t), &mask_current)) != 0)
793 /* failed to query affinity mask, will just return */
796 fprintf(debug, "Failed to query affinity mask (error %d)", ret);
801 /* Before proceeding with the actual check, make sure that the number of
802 * detected CPUs is >= the CPUs in the current set.
803 * We need to check for CPU_COUNT as it was added only in glibc 2.6. */
805 if (ncpus < CPU_COUNT(&mask_current))
809 fprintf(debug, "%d CPUs detected, but %d was returned by CPU_COUNT",
810 ncpus, CPU_COUNT(&mask_current));
814 #endif /* CPU_COUNT */
/* Default mask == every detected CPU set; any cleared bit means an
 * external tool (or OpenMP) restricted the affinity. */
817 for (i = 0; (i < ncpus && i < CPU_SETSIZE); i++)
819 bAllSet = bAllSet && (CPU_ISSET(i, &mask_current) != 0);
824 if (!bAfterOpenmpInit)
826 md_print_warn(cr, fplog,
827 "%s detected a non-default process affinity, "
828 "so it will not attempt to pin its threads", ShortProgram());
832 md_print_warn(cr, fplog,
833 "%s detected a non-default process affinity, "
834 "probably set by the OpenMP library, "
835 "so it will not attempt to pin its threads", ShortProgram());
837 hw_opt->bThreadPinning = FALSE;
841 fprintf(debug, "Non-default affinity mask found, mdrun will not pin threads\n");
848 fprintf(debug, "Default affinity mask found\n");
851 #endif /* HAVE_SCHED_GETAFFINITY */
854 /* Set CPU affinity. Can be important for performance.
855 On some systems (e.g. Cray) CPU Affinity is set by default.
856 But default assigning doesn't work (well) with only some ranks
857 having threads. This causes very low performance.
858 External tools have cumbersome syntax for setting affinity
859 in the case that only some ranks have threads.
860 Thus it is important that GROMACS sets the affinity internally
861 if only PME is using threads.
/* Pins each OpenMP thread of this rank to a specific core, assigning
 * consecutive cores across all ranks on the same physical node (computed
 * via an MPI exclusive scan of per-rank thread counts). Handles an
 * optional user core offset and an Intel Hyper-Threading layout where
 * thread pairs share a physical core. Warns (and counts) when individual
 * setaffinity calls fail. */
863 static void set_cpu_affinity(FILE *fplog,
865 gmx_hw_opt_t *hw_opt,
867 const gmx_hw_info_t *hwinfo,
868 const t_inputrec *inputrec)
870 #if defined GMX_THREAD_MPI
871 /* With the number of TMPI threads equal to the number of cores
872 * we already pinned in thread-MPI, so don't pin again here.
874 if (hw_opt->nthreads_tmpi == tMPI_Thread_get_hw_number())
881 /* If the tMPI thread affinity setting is not supported encourage the user
882 * to report it as it's either a bug or an exotic platform which we might
883 * want to support. */
884 if (tMPI_Thread_setaffinity_support() != TMPI_SETAFFINITY_SUPPORT_YES)
886 md_print_warn(NULL, fplog,
/* NOTE(review): typo in the user-facing string below: "plarform" should
 * be "platform". Left unchanged here (string literals are runtime
 * behavior); fix in a code change. */
887 "Can not set thread affinities on the current plarform. On NUMA systems this\n"
888 "can cause performance degradation. If you think your platform should support\n"
889 "setting affinities, contact the GROMACS developers.");
892 #endif /* __APPLE__ */
894 if (hw_opt->bThreadPinning)
896 int nth_affinity_set, thread_id_node, thread_id,
897 nthread_local, nthread_node, nthread_hw_max, nphyscore;
901 /* threads on this MPI process or TMPI thread */
902 if (cr->duty & DUTY_PP)
904 nthread_local = gmx_omp_nthreads_get(emntNonbonded);
908 nthread_local = gmx_omp_nthreads_get(emntPME);
911 /* map the current process to cores */
913 nthread_node = nthread_local;
915 if (PAR(cr) || MULTISIM(cr))
917 /* We need to determine a scan of the thread counts in this
/* Split MPI_COMM_WORLD by physical host so the scan only covers ranks
 * sharing this node. */
922 MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),cr->rank_intranode,
924 MPI_Scan(&nthread_local,&thread_id_node,1,MPI_INT,MPI_SUM,comm_intra);
925 /* MPI_Scan is inclusive, but here we need exclusive */
926 thread_id_node -= nthread_local;
927 /* Get the total number of threads on this physical node */
928 MPI_Allreduce(&nthread_local,&nthread_node,1,MPI_INT,MPI_SUM,comm_intra);
929 MPI_Comm_free(&comm_intra);
934 if (hw_opt->core_pinning_offset > 0)
936 offset = hw_opt->core_pinning_offset;
939 fprintf(stderr, "Applying core pinning offset %d\n", offset);
943 fprintf(fplog, "Applying core pinning offset %d\n", offset);
947 /* With Intel Hyper-Threading enabled, we want to pin consecutive
948 * threads to physical cores when using more threads than physical
949 * cores or when the user requests so.
951 nthread_hw_max = hwinfo->nthreads_hw_avail;
953 if (hw_opt->bPinHyperthreading ||
954 (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
955 nthread_node > nthread_hw_max/2 && getenv("GMX_DISABLE_PINHT") == NULL))
957 if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) != GMX_CPUID_X86_SMT_ENABLED)
959 /* We print to stderr on all processes, as we might have
960 * different settings on different physical nodes.
962 if (gmx_cpuid_vendor(hwinfo->cpuid_info) != GMX_CPUID_VENDOR_INTEL)
964 md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
965 "but non-Intel CPU detected (vendor: %s)\n",
966 gmx_cpuid_vendor_string[gmx_cpuid_vendor(hwinfo->cpuid_info)]);
970 md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
971 "but the CPU detected does not have Intel Hyper-Threading support "
972 "(or it is turned off)\n");
975 nphyscore = nthread_hw_max/2;
979 fprintf(stderr, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
984 fprintf(fplog, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
989 /* Set the per-thread affinity. In order to be able to check the success
990 * of affinity settings, we will set nth_affinity_set to 1 on threads
991 * where the affinity setting succeded and to 0 where it failed.
992 * Reducing these 0/1 values over the threads will give the total number
993 * of threads on which we succeeded.
995 nth_affinity_set = 0;
996 #pragma omp parallel firstprivate(thread_id_node) num_threads(nthread_local) \
997 reduction(+:nth_affinity_set)
1000 gmx_bool setaffinity_ret;
1002 thread_id = gmx_omp_get_thread_num();
/* Each thread offsets its node-wide id by its local OpenMP id
 * (thread_id_node is firstprivate, so this is per-thread). */
1003 thread_id_node += thread_id;
1006 core = offset + thread_id_node;
1010 /* Lock pairs of threads to the same hyperthreaded core */
1011 core = offset + thread_id_node/2 + (thread_id_node % 2)*nphyscore;
1014 setaffinity_ret = tMPI_Thread_setaffinity_single(tMPI_Thread_self(), core);
1016 /* store the per-thread success-values of the setaffinity */
1017 nth_affinity_set = (setaffinity_ret == 0);
1021 fprintf(debug, "On rank %2d, thread %2d, core %2d the affinity setting returned %d\n",
1022 cr->nodeid, gmx_omp_get_thread_num(), core, setaffinity_ret);
/* Sanity check: the reduction can never legitimately exceed the
 * number of local threads. */
1026 if (nth_affinity_set > nthread_local)
1030 sprintf(msg, "Looks like we have set affinity for more threads than "
1031 "we have (%d > %d)!\n", nth_affinity_set, nthread_local);
1036 /* check & warn if some threads failed to set their affinities */
1037 if (nth_affinity_set != nthread_local)
1039 char sbuf1[STRLEN], sbuf2[STRLEN];
1041 /* sbuf1 contains rank info, while sbuf2 OpenMP thread info */
1042 sbuf1[0] = sbuf2[0] = '\0';
1044 #ifdef GMX_THREAD_MPI
1045 sprintf(sbuf1, "In thread-MPI thread #%d: ", cr->nodeid);
1046 #else /* GMX_LIB_MPI */
1047 sprintf(sbuf1, "In MPI process #%d: ", cr->nodeid);
1049 #endif /* GMX_MPI */
1051 if (nthread_local > 1)
1053 sprintf(sbuf2, "of %d/%d thread%s ",
1054 nthread_local - nth_affinity_set, nthread_local,
1055 (nthread_local - nth_affinity_set) > 1 ? "s" : "");
1058 md_print_warn(NULL, fplog,
1059 "NOTE: %sAffinity setting %sfailed.\n"
1060 " This can cause performance degradation!",
/* Validate the -nt/-ntmpi/-ntomp/-ntomp_pme command-line thread options
 * against each other and the build configuration, and fill in derived
 * values (e.g. ntomp from nt/ntmpi). Fatal-errors on inconsistent input.
 * Mutates hw_opt in place. */
1068 static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt,
1071 gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp);
1073 #ifndef GMX_THREAD_MPI
1074 if (hw_opt->nthreads_tot > 0)
1076 gmx_fatal(FARGS,"Setting the total number of threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
1078 if (hw_opt->nthreads_tmpi > 0)
1080 gmx_fatal(FARGS,"Setting the number of thread-MPI threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
1084 if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
1086 /* We have the same number of OpenMP threads for PP and PME processes,
1087 * thus we can perform several consistency checks.
1089 if (hw_opt->nthreads_tmpi > 0 &&
1090 hw_opt->nthreads_omp > 0 &&
1091 hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
1093 gmx_fatal(FARGS,"The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
1094 hw_opt->nthreads_tot,hw_opt->nthreads_tmpi,hw_opt->nthreads_omp);
1097 if (hw_opt->nthreads_tmpi > 0 &&
1098 hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
1100 gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
1101 hw_opt->nthreads_tot,hw_opt->nthreads_tmpi);
1104 if (hw_opt->nthreads_omp > 0 &&
1105 hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
1107 gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
1108 hw_opt->nthreads_tot,hw_opt->nthreads_omp);
1111 if (hw_opt->nthreads_tmpi > 0 &&
1112 hw_opt->nthreads_omp <= 0)
/* Derive the OpenMP count from total / thread-MPI when unset. */
1114 hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
1119 if (hw_opt->nthreads_omp > 1)
1121 gmx_fatal(FARGS,"OpenMP threads are requested, but Gromacs was compiled without OpenMP support");
1125 if (cutoff_scheme == ecutsGROUP)
1127 /* We only have OpenMP support for PME only nodes */
1128 if (hw_opt->nthreads_omp > 1)
1130 gmx_fatal(FARGS,"OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
1131 ecutscheme_names[cutoff_scheme],
1132 ecutscheme_names[ecutsVERLET]);
1134 hw_opt->nthreads_omp = 1;
1137 if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
1139 gmx_fatal(FARGS,"You need to specify -ntomp in addition to -ntomp_pme");
1142 if (hw_opt->nthreads_tot == 1)
1144 hw_opt->nthreads_tmpi = 1;
1146 if (hw_opt->nthreads_omp > 1)
/* NOTE(review): likely bug — the condition tests nthreads_omp > 1, but
 * the message prints nthreads_tmpi (which was just forced to 1 above)
 * as the "OpenMP threads" value. The first argument should presumably
 * be hw_opt->nthreads_omp. Verify against upstream before changing. */
1148 gmx_fatal(FARGS,"You requested %d OpenMP threads with %d total threads",
1149 hw_opt->nthreads_tmpi,hw_opt->nthreads_tot);
1151 hw_opt->nthreads_omp = 1;
1154 if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
1156 hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
1161 fprintf(debug,"hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
1162 hw_opt->nthreads_tot,
1163 hw_opt->nthreads_tmpi,
1164 hw_opt->nthreads_omp,
1165 hw_opt->nthreads_omp_pme,
1166 hw_opt->gpu_id!=NULL ? hw_opt->gpu_id : "");
1172 /* Override the value in inputrec with value passed on the command line (if any) */
/* nsteps_cmdline is an int here (see the mdrunner() signature); -2 is the
 * "not set" sentinel, so -1 (run forever) and any >= 0 value override
 * ir->nsteps. A warning describing the override is logged. */
1173 static void override_nsteps_cmdline(FILE *fplog,
1176 const t_commrec *cr)
1181 /* override with anything else than the default -2 */
1182 if (nsteps_cmdline > -2)
1186 ir->nsteps = nsteps_cmdline;
1187 if (EI_DYNAMICS(ir->eI))
/* For dynamics also report the simulated time span (steps * delta_t). */
1189 sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps, %.3f ps",
1190 nsteps_cmdline, nsteps_cmdline*ir->delta_t);
1194 sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps",
1198 md_print_warn(cr, fplog, "%s\n", stmp);
1202 /* Data structure set by SIMMASTER which needs to be passed to all nodes
1203 * before the other nodes have read the tpx file and called gmx_detect_hardware.
/* Fields broadcast from the master (see gmx_bcast_sim(sizeof(minf),...)
 * later in mdrunner()); the struct/typedef framing lines are not visible
 * in this view. */
1206 int cutoff_scheme; /* The cutoff scheme from inputrec_t */
1207 gmx_bool bUseGPU; /* Use GPU or GPU emulation */
1210 int mdrunner(gmx_hw_opt_t *hw_opt,
1211 FILE *fplog,t_commrec *cr,int nfile,
1212 const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
1213 gmx_bool bCompact, int nstglobalcomm,
1214 ivec ddxyz,int dd_node_order,real rdd,real rconstr,
1215 const char *dddlb_opt,real dlb_scale,
1216 const char *ddcsx,const char *ddcsy,const char *ddcsz,
1217 const char *nbpu_opt,
1218 int nsteps_cmdline, int nstepout,int resetstep,
1219 int nmultisim,int repl_ex_nst,int repl_ex_nex,
1220 int repl_ex_seed, real pforce,real cpt_period,real max_hours,
1221 const char *deviceOptions, unsigned long Flags)
1223 gmx_bool bForceUseGPU,bTryUseGPU;
1224 double nodetime=0,realtime;
1225 t_inputrec *inputrec;
1226 t_state *state=NULL;
1228 gmx_ddbox_t ddbox={0};
1229 int npme_major,npme_minor;
1232 gmx_mtop_t *mtop=NULL;
1233 t_mdatoms *mdatoms=NULL;
1234 t_forcerec *fr=NULL;
1237 gmx_pme_t *pmedata=NULL;
1238 gmx_vsite_t *vsite=NULL;
1239 gmx_constr_t constr;
1240 int i,m,nChargePerturbed=-1,status,nalloc;
1242 gmx_wallcycle_t wcycle;
1243 gmx_bool bReadRNG,bReadEkin;
1245 gmx_runtime_t runtime;
1247 gmx_large_int_t reset_counters;
1248 gmx_edsam_t ed=NULL;
1249 t_commrec *cr_old=cr;
1252 gmx_membed_t membed=NULL;
1253 gmx_hw_info_t *hwinfo=NULL;
1254 master_inf_t minf={-1,FALSE};
1256 /* CAUTION: threads may be started later on in this function, so
1257 cr doesn't reflect the final parallel state right now */
1261 if (Flags & MD_APPENDFILES)
1266 bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
1267 bTryUseGPU = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
1272 /* Read (nearly) all data required for the simulation */
1273 read_tpx_state(ftp2fn(efTPX,nfile,fnm),inputrec,state,NULL,mtop);
1275 if (inputrec->cutoff_scheme != ecutsVERLET &&
1276 ((Flags & MD_TESTVERLET) || getenv("GMX_VERLET_SCHEME") != NULL))
1278 convert_to_verlet_scheme(fplog,inputrec,mtop,det(state->box));
1281 /* Detect hardware, gather information. With tMPI only thread 0 does it
1282 * and after threads are started broadcasts hwinfo around. */
1284 gmx_detect_hardware(fplog, hwinfo, cr,
1285 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
1287 minf.cutoff_scheme = inputrec->cutoff_scheme;
1288 minf.bUseGPU = FALSE;
1290 if (inputrec->cutoff_scheme == ecutsVERLET)
1292 prepare_verlet_scheme(fplog,hwinfo,cr,hw_opt,nbpu_opt,
1293 inputrec,mtop,state->box,
1296 else if (hwinfo->bCanUseGPU)
1298 md_print_warn(cr,fplog,
1299 "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
1300 " To use a GPU, set the mdp option: cutoff-scheme = Verlet\n"
1301 " (for quick performance testing you can use the -testverlet option)\n");
1305 gmx_fatal(FARGS,"GPU requested, but can't be used without cutoff-scheme=Verlet");
1309 #ifndef GMX_THREAD_MPI
1312 gmx_bcast_sim(sizeof(minf),&minf,cr);
1315 if (minf.bUseGPU && cr->npmenodes == -1)
1317 /* Don't automatically use PME-only nodes with GPUs */
1321 /* Check for externally set OpenMP affinity and turn off internal
1322 * pinning if any is found. We need to do this check early to tell
1323 * thread-MPI whether it should do pinning when spawning threads.
1325 gmx_omp_check_thread_affinity(fplog, cr, hw_opt);
1327 #ifdef GMX_THREAD_MPI
1328 /* With thread-MPI inputrec is only set here on the master thread */
1332 check_and_update_hw_opt(hw_opt,minf.cutoff_scheme);
1334 #ifdef GMX_THREAD_MPI
1335 /* Early check for externally set process affinity. Can't do over all
1336 * MPI processes because hwinfo is not available everywhere, but with
1337 * thread-MPI it's needed as pinning might get turned off which needs
1338 * to be known before starting thread-MPI. */
1339 check_cpu_affinity_set(fplog,
1341 hw_opt, hwinfo->nthreads_hw_avail, FALSE);
1344 #ifdef GMX_THREAD_MPI
1345 if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
1347 gmx_fatal(FARGS,"You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME nodes");
1351 if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
1354 gmx_fatal(FARGS,"You need to explicitly specify the number of PME nodes (-npme) when using different number of OpenMP threads for PP and PME nodes");
1358 #ifdef GMX_THREAD_MPI
1361 /* NOW the threads will be started: */
1362 hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
1366 if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
1368 hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
1371 if (hw_opt->nthreads_tmpi > 1)
1373 /* now start the threads. */
1374 cr=mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm,
1375 oenv, bVerbose, bCompact, nstglobalcomm,
1376 ddxyz, dd_node_order, rdd, rconstr,
1377 dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
1379 nsteps_cmdline, nstepout, resetstep, nmultisim,
1380 repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
1381 cpt_period, max_hours, deviceOptions,
1383 /* the main thread continues here with a new cr. We don't deallocate
1384 the old cr because other threads may still be reading it. */
1387 gmx_comm("Failed to spawn threads");
1392 /* END OF CAUTION: cr is now reliable */
1394 /* g_membed initialisation *
1395 * Because we change the mtop, init_membed is called before the init_parallel *
1396 * (in case we ever want to make it run in parallel) */
1397 if (opt2bSet("-membed",nfile,fnm))
1401 fprintf(stderr,"Initializing membed");
1403 membed = init_membed(fplog,nfile,fnm,mtop,inputrec,state,cr,&cpt_period);
1408 /* now broadcast everything to the non-master nodes/threads: */
1409 init_parallel(fplog, cr, inputrec, mtop);
1411 /* This check needs to happen after get_nthreads_mpi() */
1412 if (inputrec->cutoff_scheme == ecutsVERLET && (Flags & MD_PARTDEC))
1414 gmx_fatal_collective(FARGS,cr,NULL,
1415 "The Verlet cut-off scheme is not supported with particle decomposition.\n"
1416 "You can achieve the same effect as particle decomposition by running in parallel using only OpenMP threads.");
1421 pr_inputrec(fplog,0,"Input Parameters",inputrec,FALSE);
1424 #if defined GMX_THREAD_MPI
1425 /* With tMPI we detected on thread 0 and we'll just pass the hwinfo pointer
1426 * to the other threads -- slightly uncool, but works fine, just need to
1427 * make sure that the data doesn't get freed twice. */
1434 gmx_bcast(sizeof(&hwinfo), &hwinfo, cr);
1437 if (PAR(cr) && !SIMMASTER(cr))
1439 /* now we have inputrec on all nodes, can run the detection */
1440 /* TODO: perhaps it's better to propagate within a node instead? */
1442 gmx_detect_hardware(fplog, hwinfo, cr,
1443 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
1446 /* Now do the affinity check with MPI/no-MPI (done earlier with thread-MPI). */
1447 check_cpu_affinity_set(fplog, cr,
1448 hw_opt, hwinfo->nthreads_hw_avail, FALSE);
1451 /* now make sure the state is initialized and propagated */
1452 set_state_entries(state,inputrec,cr->nnodes);
1454 /* remove when vv and rerun works correctly! */
1455 if (PAR(cr) && EI_VV(inputrec->eI) && ((Flags & MD_RERUN) || (Flags & MD_RERUN_VSITE)))
1458 "Currently can't do velocity verlet with rerun in parallel.");
1461 /* A parallel command line option consistency check that we can
1462 only do after any threads have started. */
1464 (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
1467 "The -dd or -npme option request a parallel simulation, "
1469 "but %s was compiled without threads or MPI enabled"
1471 #ifdef GMX_THREAD_MPI
1472 "but the number of threads (option -nt) is 1"
1474 "but %s was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec"
1481 if ((Flags & MD_RERUN) &&
1482 (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
1484 gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
1487 if (can_use_allvsall(inputrec,mtop,TRUE,cr,fplog) && PAR(cr))
1489 /* All-vs-all loops do not work with domain decomposition */
1490 Flags |= MD_PARTDEC;
1493 if (!EEL_PME(inputrec->coulombtype) || (Flags & MD_PARTDEC))
1495 if (cr->npmenodes > 0)
1497 if (!EEL_PME(inputrec->coulombtype))
1499 gmx_fatal_collective(FARGS,cr,NULL,
1500 "PME nodes are requested, but the system does not use PME electrostatics");
1502 if (Flags & MD_PARTDEC)
1504 gmx_fatal_collective(FARGS,cr,NULL,
1505 "PME nodes are requested, but particle decomposition does not support separate PME nodes");
1513 fcRegisterSteps(inputrec->nsteps,inputrec->init_step);
1516 /* NMR restraints must be initialized before load_checkpoint,
1517 * since with time averaging the history is added to t_state.
1518 * For proper consistency check we therefore need to extend
1520 * So the PME-only nodes (if present) will also initialize
1521 * the distance restraints.
1525 /* This needs to be called before read_checkpoint to extend the state */
1526 init_disres(fplog,mtop,inputrec,cr,Flags & MD_PARTDEC,fcd,state);
1528 if (gmx_mtop_ftype_count(mtop,F_ORIRES) > 0)
1530 if (PAR(cr) && !(Flags & MD_PARTDEC))
1532 gmx_fatal(FARGS,"Orientation restraints do not work (yet) with domain decomposition, use particle decomposition (mdrun option -pd)");
1534 /* Orientation restraints */
1537 init_orires(fplog,mtop,state->x,inputrec,cr->ms,&(fcd->orires),
1542 if (DEFORM(*inputrec))
1544 /* Store the deform reference box before reading the checkpoint */
1547 copy_mat(state->box,box);
1551 gmx_bcast(sizeof(box),box,cr);
1553 /* Because we do not have the update struct available yet
1554 * in which the reference values should be stored,
1555 * we store them temporarily in static variables.
1556 * This should be thread safe, since they are only written once
1557 * and with identical values.
1559 #ifdef GMX_THREAD_MPI
1560 tMPI_Thread_mutex_lock(&deform_init_box_mutex);
1562 deform_init_init_step_tpx = inputrec->init_step;
1563 copy_mat(box,deform_init_box_tpx);
1564 #ifdef GMX_THREAD_MPI
1565 tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
1569 if (opt2bSet("-cpi",nfile,fnm))
1571 /* Check if checkpoint file exists before doing continuation.
1572 * This way we can use identical input options for the first and subsequent runs...
1574 if( gmx_fexist_master(opt2fn_master("-cpi",nfile,fnm,cr),cr) )
1576 load_checkpoint(opt2fn_master("-cpi",nfile,fnm,cr),&fplog,
1577 cr,Flags & MD_PARTDEC,ddxyz,
1578 inputrec,state,&bReadRNG,&bReadEkin,
1579 (Flags & MD_APPENDFILES),
1580 (Flags & MD_APPENDFILESSET));
1584 Flags |= MD_READ_RNG;
1588 Flags |= MD_READ_EKIN;
1593 if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES))
1594 #ifdef GMX_THREAD_MPI
1595 /* With thread MPI only the master node/thread exists in mdrun.c,
1596 * therefore non-master nodes need to open the "seppot" log file here.
1598 || (!MASTER(cr) && (Flags & MD_SEPPOT))
1602 gmx_log_open(ftp2fn(efLOG,nfile,fnm),cr,!(Flags & MD_SEPPOT),
1606 /* override nsteps with value from cmdline */
1607 override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);
1611 copy_mat(state->box,box);
1616 gmx_bcast(sizeof(box),box,cr);
1619 /* Essential dynamics */
1620 if (opt2bSet("-ei",nfile,fnm))
1622 /* Open input and output files, allocate space for ED data structure */
1623 ed = ed_open(nfile,fnm,Flags,cr);
1626 if (PAR(cr) && !((Flags & MD_PARTDEC) ||
1627 EI_TPI(inputrec->eI) ||
1628 inputrec->eI == eiNM))
1630 cr->dd = init_domain_decomposition(fplog,cr,Flags,ddxyz,rdd,rconstr,
1631 dddlb_opt,dlb_scale,
1635 &ddbox,&npme_major,&npme_minor);
1637 make_dd_communicators(fplog,cr,dd_node_order);
1639 /* Set overallocation to avoid frequent reallocation of arrays */
1640 set_over_alloc_dd(TRUE);
1644 /* PME, if used, is done on all nodes with 1D decomposition */
1646 cr->duty = (DUTY_PP | DUTY_PME);
1649 if (!EI_TPI(inputrec->eI))
1651 npme_major = cr->nnodes;
1654 if (inputrec->ePBC == epbcSCREW)
1657 "pbc=%s is only implemented with domain decomposition",
1658 epbc_names[inputrec->ePBC]);
1664 /* After possible communicator splitting in make_dd_communicators.
1665 * we can set up the intra/inter node communication.
1667 gmx_setup_nodecomm(fplog,cr);
1670 /* Initialize per-physical-node MPI process/thread ID and counters. */
1671 gmx_init_intranode_counters(cr);
1674 md_print_info(cr,fplog,"Using %d MPI %s\n",
1676 #ifdef GMX_THREAD_MPI
1677 cr->nnodes==1 ? "thread" : "threads"
1679 cr->nnodes==1 ? "process" : "processes"
1685 gmx_omp_nthreads_init(fplog, cr,
1686 hwinfo->nthreads_hw_avail,
1687 hw_opt->nthreads_omp,
1688 hw_opt->nthreads_omp_pme,
1689 (cr->duty & DUTY_PP) == 0,
1690 inputrec->cutoff_scheme == ecutsVERLET);
1692 gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi, minf.bUseGPU);
1694 /* getting number of PP/PME threads
1695 PME: env variable should be read only on one node to make sure it is
1696 identical everywhere;
1698 /* TODO nthreads_pp is only used for pinning threads.
1699 * This is a temporary solution until we have a hw topology library.
1701 nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
1702 nthreads_pme = gmx_omp_nthreads_get(emntPME);
1704 wcycle = wallcycle_init(fplog,resetstep,cr,nthreads_pp,nthreads_pme);
1708 /* Master synchronizes its value of reset_counters with all nodes
1709 * including PME only nodes */
1710 reset_counters = wcycle_get_reset_counters(wcycle);
1711 gmx_bcast_sim(sizeof(reset_counters),&reset_counters,cr);
1712 wcycle_set_reset_counters(wcycle, reset_counters);
1716 if (cr->duty & DUTY_PP)
1718 /* For domain decomposition we allocate dynamically
1719 * in dd_partition_system.
1721 if (DOMAINDECOMP(cr))
1723 bcast_state_setup(cr,state);
1729 bcast_state(cr,state,TRUE);
1733 /* Initiate forcerecord */
1735 fr->hwinfo = hwinfo;
1736 init_forcerec(fplog,oenv,fr,fcd,inputrec,mtop,cr,box,FALSE,
1737 opt2fn("-table",nfile,fnm),
1738 opt2fn("-tabletf",nfile,fnm),
1739 opt2fn("-tablep",nfile,fnm),
1740 opt2fn("-tableb",nfile,fnm),
1744 /* version for PCA_NOT_READ_NODE (see md.c) */
1745 /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
1746 "nofile","nofile","nofile","nofile",FALSE,pforce);
1748 fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);
1750 /* Initialize QM-MM */
1753 init_QMMMrec(cr,box,mtop,inputrec,fr);
1756 /* Initialize the mdatoms structure.
1757 * mdatoms is not filled with atom data,
1758 * as this can not be done now with domain decomposition.
1760 mdatoms = init_mdatoms(fplog,mtop,inputrec->efep!=efepNO);
1762 /* Initialize the virtual site communication */
1763 vsite = init_vsite(mtop,cr,FALSE);
1765 calc_shifts(box,fr->shift_vec);
1767 /* With periodic molecules the charge groups should be whole at start up
1768 * and the virtual sites should not be far from their proper positions.
1770 if (!inputrec->bContinuation && MASTER(cr) &&
1771 !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
1773 /* Make molecules whole at start of run */
1774 if (fr->ePBC != epbcNONE)
1776 do_pbc_first_mtop(fplog,inputrec->ePBC,box,mtop,state->x);
1780 /* Correct initial vsite positions are required
1781 * for the initial distribution in the domain decomposition
1782 * and for the initial shell prediction.
1784 construct_vsites_mtop(fplog,vsite,mtop,state->x);
1788 if (EEL_PME(fr->eeltype))
1790 ewaldcoeff = fr->ewaldcoeff;
1791 pmedata = &fr->pmedata;
1800 /* This is a PME only node */
1802 /* We don't need the state */
1805 ewaldcoeff = calc_ewaldcoeff(inputrec->rcoulomb, inputrec->ewald_rtol);
1809 /* Before setting affinity, check whether the affinity has changed
1810 * - which indicates that probably the OpenMP library has changed it since
1811 * we first checked). */
1812 check_cpu_affinity_set(fplog, cr, hw_opt, hwinfo->nthreads_hw_avail, TRUE);
1814 /* Set the CPU affinity */
1815 set_cpu_affinity(fplog,cr,hw_opt,nthreads_pme,hwinfo,inputrec);
1817 /* Initiate PME if necessary,
1818 * either on all nodes or on dedicated PME nodes only. */
1819 if (EEL_PME(inputrec->coulombtype))
1823 nChargePerturbed = mdatoms->nChargePerturbed;
1825 if (cr->npmenodes > 0)
1827 /* The PME only nodes need to know nChargePerturbed */
1828 gmx_bcast_sim(sizeof(nChargePerturbed),&nChargePerturbed,cr);
1831 if (cr->duty & DUTY_PME)
1833 status = gmx_pme_init(pmedata,cr,npme_major,npme_minor,inputrec,
1834 mtop ? mtop->natoms : 0,nChargePerturbed,
1835 (Flags & MD_REPRODUCIBLE),nthreads_pme);
1838 gmx_fatal(FARGS,"Error %d initializing PME",status);
1844 if (integrator[inputrec->eI].func == do_md
1846 integrator[inputrec->eI].func == do_md_openmm
1849 /* Turn on signal handling on all nodes */
1851 * (A user signal from the PME nodes (if any)
1852 * is communicated to the PP nodes.
1854 signal_handler_install();
1857 if (cr->duty & DUTY_PP)
1859 if (inputrec->ePull != epullNO)
1861 /* Initialize pull code */
1862 init_pull(fplog,inputrec,nfile,fnm,mtop,cr,oenv, inputrec->fepvals->init_lambda,
1863 EI_DYNAMICS(inputrec->eI) && MASTER(cr),Flags);
1868 /* Initialize enforced rotation code */
1869 init_rot(fplog,inputrec,nfile,fnm,cr,state->x,box,mtop,oenv,
1873 constr = init_constraints(fplog,mtop,inputrec,ed,state,cr);
1875 if (DOMAINDECOMP(cr))
1877 dd_init_bondeds(fplog,cr->dd,mtop,vsite,constr,inputrec,
1878 Flags & MD_DDBONDCHECK,fr->cginfo_mb);
1880 set_dd_parameters(fplog,cr->dd,dlb_scale,inputrec,fr,&ddbox);
1882 setup_dd_grid(fplog,cr->dd);
1885 /* Now do whatever the user wants us to do (how flexible...) */
1886 integrator[inputrec->eI].func(fplog,cr,nfile,fnm,
1887 oenv,bVerbose,bCompact,
1890 nstepout,inputrec,mtop,
1892 mdatoms,nrnb,wcycle,ed,fr,
1893 repl_ex_nst,repl_ex_nex,repl_ex_seed,
1895 cpt_period,max_hours,
1900 if (inputrec->ePull != epullNO)
1902 finish_pull(fplog,inputrec->pull);
1907 finish_rot(inputrec->rot);
1914 gmx_pmeonly(*pmedata,cr,nrnb,wcycle,ewaldcoeff,FALSE,inputrec);
1917 if (EI_DYNAMICS(inputrec->eI) || EI_TPI(inputrec->eI))
1919 /* Some timing stats */
1922 if (runtime.proc == 0)
1924 runtime.proc = runtime.real;
1933 wallcycle_stop(wcycle,ewcRUN);
1935 /* Finish up, write some stuff
1936 * if rerunMD, don't write last frame again
1938 finish_run(fplog,cr,ftp2fn(efSTO,nfile,fnm),
1939 inputrec,nrnb,wcycle,&runtime,
1940 fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ?
1941 nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL,
1943 EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
1945 if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU)
1947 char gpu_err_str[STRLEN];
1949 /* free GPU memory and uninitialize GPU (by destroying the context) */
1950 nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
1952 if (!free_gpu(gpu_err_str))
1954 gmx_warning("On node %d failed to free GPU #%d: %s",
1955 cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
1959 if (opt2bSet("-membed",nfile,fnm))
1964 #ifdef GMX_THREAD_MPI
1965 if (PAR(cr) && SIMMASTER(cr))
1968 gmx_hardware_info_free(hwinfo);
1971 /* Does what it says */
1972 print_date_and_time(fplog,cr->nodeid,"Finished mdrun",&runtime);
1974 /* Close logfile already here if we were appending to it */
1975 if (MASTER(cr) && (Flags & MD_APPENDFILES))
1977 gmx_log_close(fplog);
1980 rc=(int)gmx_get_stop_condition();
1982 #ifdef GMX_THREAD_MPI
1983 /* we need to join all threads. The sub-threads join when they
1984 exit this function, but the master thread needs to be told to
1986 if (PAR(cr) && MASTER(cr))