2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team,
6 * check out http://www.gromacs.org for more information.
7 * Copyright (c) 2012, by the GROMACS development team, led by
8 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
9 * others, as listed in the AUTHORS file in the top-level source
10 * directory and at http://www.gromacs.org.
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
41 #if defined(HAVE_SCHED_H) && defined(HAVE_SCHED_GETAFFINITY)
44 #include <sys/syscall.h>
56 #include "md_logging.h"
57 #include "md_support.h"
67 #include "mpelogging.h"
73 #include "checkpoint.h"
74 #include "mtop_util.h"
75 #include "sighandler.h"
78 #include "gmx_detect_hardware.h"
79 #include "gmx_omp_nthreads.h"
80 #include "pull_rotation.h"
81 #include "calc_verletbuf.h"
82 #include "../mdlib/nbnxn_search.h"
83 #include "../mdlib/nbnxn_consts.h"
84 #include "gmx_fatal_collective.h"
86 #include "md_openmm.h"
89 #include "thread_mpi/threads.h"
103 #include "md_openmm.h"
106 #include "gpu_utils.h"
107 #include "nbnxn_cuda_data_mgmt.h"
110 gmx_integrator_t *func;
113 /* The array should match the eI array in include/types/enums.h */
114 #ifdef GMX_OPENMM /* FIXME do_md_openmm needs fixing */
115 const gmx_intp_t integrator[eiNR] = { {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm},{do_md_openmm}};
117 const gmx_intp_t integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md},{do_md}};
120 gmx_large_int_t deform_init_init_step_tpx;
121 matrix deform_init_box_tpx;
122 #ifdef GMX_THREAD_MPI
123 tMPI_Thread_mutex_t deform_init_box_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
127 #ifdef GMX_THREAD_MPI
128 struct mdrunner_arglist
130 gmx_hw_opt_t *hw_opt;
143 const char *dddlb_opt;
148 const char *nbpu_opt;
159 const char *deviceOptions;
161 int ret; /* return value */
165 /* The function used for spawning threads. Extracts the mdrunner()
166 arguments from its one argument and calls mdrunner(), after making
168 static void mdrunner_start_fn(void *arg)
170 struct mdrunner_arglist *mda=(struct mdrunner_arglist*)arg;
171 struct mdrunner_arglist mc=*mda; /* copy the arg list to make sure
172 that it's thread-local. This doesn't
173 copy pointed-to items, of course,
174 but those are all const. */
175 t_commrec *cr; /* we need a local version of this */
179 fnm = dup_tfn(mc.nfile, mc.fnm);
181 cr = init_par_threads(mc.cr);
188 mda->ret=mdrunner(mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv,
189 mc.bVerbose, mc.bCompact, mc.nstglobalcomm,
190 mc.ddxyz, mc.dd_node_order, mc.rdd,
191 mc.rconstr, mc.dddlb_opt, mc.dlb_scale,
192 mc.ddcsx, mc.ddcsy, mc.ddcsz,
194 mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
195 mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce,
196 mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.Flags);
199 /* called by mdrunner() to start a specific number of threads (including
200 the main thread) for thread-parallel runs. This in turn calls mdrunner()
202 All options besides nthreads are the same as for mdrunner(). */
203 static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
204 FILE *fplog,t_commrec *cr,int nfile,
205 const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
206 gmx_bool bCompact, int nstglobalcomm,
207 ivec ddxyz,int dd_node_order,real rdd,real rconstr,
208 const char *dddlb_opt,real dlb_scale,
209 const char *ddcsx,const char *ddcsy,const char *ddcsz,
210 const char *nbpu_opt,
211 int nsteps_cmdline, int nstepout,int resetstep,
212 int nmultisim,int repl_ex_nst,int repl_ex_nex, int repl_ex_seed,
213 real pforce,real cpt_period, real max_hours,
214 const char *deviceOptions, unsigned long Flags)
217 struct mdrunner_arglist *mda;
218 t_commrec *crn; /* the new commrec */
221 /* first check whether we even need to start tMPI */
222 if (hw_opt->nthreads_tmpi < 2)
227 /* a few small, one-time, almost unavoidable memory leaks: */
229 fnmn=dup_tfn(nfile, fnm);
231 /* fill the data structure to pass as void pointer to thread start fn */
238 mda->bVerbose=bVerbose;
239 mda->bCompact=bCompact;
240 mda->nstglobalcomm=nstglobalcomm;
241 mda->ddxyz[XX]=ddxyz[XX];
242 mda->ddxyz[YY]=ddxyz[YY];
243 mda->ddxyz[ZZ]=ddxyz[ZZ];
244 mda->dd_node_order=dd_node_order;
246 mda->rconstr=rconstr;
247 mda->dddlb_opt=dddlb_opt;
248 mda->dlb_scale=dlb_scale;
252 mda->nbpu_opt=nbpu_opt;
253 mda->nsteps_cmdline=nsteps_cmdline;
254 mda->nstepout=nstepout;
255 mda->resetstep=resetstep;
256 mda->nmultisim=nmultisim;
257 mda->repl_ex_nst=repl_ex_nst;
258 mda->repl_ex_nex=repl_ex_nex;
259 mda->repl_ex_seed=repl_ex_seed;
261 mda->cpt_period=cpt_period;
262 mda->max_hours=max_hours;
263 mda->deviceOptions=deviceOptions;
266 /* now spawn new threads that start mdrunner_start_fn(), while
267 the main thread returns */
268 ret=tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi,
269 (hw_opt->bThreadPinning ? TMPI_AFFINITY_ALL_CORES : TMPI_AFFINITY_NONE),
270 mdrunner_start_fn, (void*)(mda) );
271 if (ret!=TMPI_SUCCESS)
274 /* make a new comm_rec to reflect the new situation */
275 crn=init_par_threads(cr);
280 static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
281 const gmx_hw_opt_t *hw_opt,
287 /* There are no separate PME nodes here, as we ensured in
288 * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
289 * and a conditional ensures we would not have ended up here.
290 * Note that separate PME nodes might be switched on later.
294 nthreads_tmpi = ngpu;
295 if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
297 nthreads_tmpi = nthreads_tot;
300 else if (hw_opt->nthreads_omp > 0)
302 /* Here we could oversubscribe, when we do, we issue a warning later */
303 nthreads_tmpi = max(1,nthreads_tot/hw_opt->nthreads_omp);
307 /* TODO choose nthreads_omp based on hardware topology
308 when we have a hardware topology detection library */
309 /* In general, when running up to 4 threads, OpenMP should be faster.
310 * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
311 * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
312 * even on two CPUs it's usually faster (but with many OpenMP threads
313 * it could be faster not to use HT, currently we always use HT).
314 * On Nehalem/Westmere we want to avoid running 16 threads over
315 * two CPUs with HT, so we need a limit<16; thus we use 12.
316 * A reasonable limit for Intel Sandy and Ivy bridge,
317 * not knowing the topology, is 16 threads.
319 const int nthreads_omp_always_faster = 4;
320 const int nthreads_omp_always_faster_Nehalem = 12;
321 const int nthreads_omp_always_faster_SandyBridge = 16;
322 const int first_model_Nehalem = 0x1A;
323 const int first_model_SandyBridge = 0x2A;
324 gmx_bool bIntel_Family6;
327 (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
328 gmx_cpuid_family(hwinfo->cpuid_info) == 6);
330 if (nthreads_tot <= nthreads_omp_always_faster ||
332 ((gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_Nehalem && nthreads_tot <= nthreads_omp_always_faster_Nehalem) ||
333 (gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_SandyBridge && nthreads_tot <= nthreads_omp_always_faster_SandyBridge))))
335 /* Use pure OpenMP parallelization */
340 /* Don't use OpenMP parallelization */
341 nthreads_tmpi = nthreads_tot;
345 return nthreads_tmpi;
349 /* Get the number of threads to use for thread-MPI based on how many
350 * were requested, which algorithms we're using,
351 * and how many particles there are.
352 * At the point we have already called check_and_update_hw_opt.
353 * Thus all options should be internally consistent and consistent
354 * with the hardware, except that ntmpi could be larger than #GPU.
356 static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
357 gmx_hw_opt_t *hw_opt,
358 t_inputrec *inputrec, gmx_mtop_t *mtop,
362 int nthreads_hw,nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu;
363 int min_atoms_per_mpi_thread;
368 if (hw_opt->nthreads_tmpi > 0)
370 /* Trivial, return right away */
371 return hw_opt->nthreads_tmpi;
374 nthreads_hw = hwinfo->nthreads_hw_avail;
376 /* How many total (#tMPI*#OpenMP) threads can we start? */
377 if (hw_opt->nthreads_tot > 0)
379 nthreads_tot_max = hw_opt->nthreads_tot;
383 nthreads_tot_max = nthreads_hw;
386 bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU);
389 ngpu = hwinfo->gpu_info.ncuda_dev_use;
397 get_tmpi_omp_thread_division(hwinfo,hw_opt,nthreads_tot_max,ngpu);
399 if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
401 /* Steps are divided over the nodes iso splitting the atoms */
402 min_atoms_per_mpi_thread = 0;
408 min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU;
412 min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD;
416 /* Check if an algorithm does not support parallel simulation. */
417 if (nthreads_tmpi != 1 &&
418 ( inputrec->eI == eiLBFGS ||
419 inputrec->coulombtype == eelEWALD ) )
423 md_print_warn(cr,fplog,"The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
424 if (hw_opt->nthreads_tmpi > nthreads_tmpi)
426 gmx_fatal(FARGS,"You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
429 else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread)
431 /* the thread number was chosen automatically, but there are too many
432 threads (too few atoms per thread) */
433 nthreads_new = max(1,mtop->natoms/min_atoms_per_mpi_thread);
435 /* Avoid partial use of Hyper-Threading */
436 if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
437 nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw)
439 nthreads_new = nthreads_hw/2;
442 /* Avoid large prime numbers in the thread count */
443 if (nthreads_new >= 6)
445 /* Use only 6,8,10 with additional factors of 2 */
449 while (3*fac*2 <= nthreads_new)
454 nthreads_new = (nthreads_new/fac)*fac;
459 if (nthreads_new == 5)
465 nthreads_tmpi = nthreads_new;
467 fprintf(stderr,"\n");
468 fprintf(stderr,"NOTE: Parallelization is limited by the small number of atoms,\n");
469 fprintf(stderr," only starting %d thread-MPI threads.\n",nthreads_tmpi);
470 fprintf(stderr," You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
473 return nthreads_tmpi;
475 #endif /* GMX_THREAD_MPI */
478 /* Environment variable for setting nstlist */
479 static const char* NSTLIST_ENVVAR = "GMX_NSTLIST";
480 /* Try to increase nstlist when using a GPU with nstlist less than this */
481 static const int NSTLIST_GPU_ENOUGH = 20;
482 /* Increase nstlist until the non-bonded cost increases more than this factor */
483 static const float NBNXN_GPU_LIST_OK_FAC = 1.25;
484 /* Don't increase nstlist beyond a non-bonded cost increases of this factor */
485 static const float NBNXN_GPU_LIST_MAX_FAC = 1.40;
487 /* Try to increase nstlist when running on a GPU */
488 static void increase_nstlist(FILE *fp,t_commrec *cr,
489 t_inputrec *ir,const gmx_mtop_t *mtop,matrix box)
492 int nstlist_orig,nstlist_prev;
493 verletbuf_list_setup_t ls;
494 real rlist_inc,rlist_ok,rlist_max,rlist_new,rlist_prev;
497 gmx_bool bBox,bDD,bCont;
498 const char *nstl_fmt="\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
499 const char *vbd_err="Can not increase nstlist for GPU run because verlet-buffer-drift is not set or used";
500 const char *box_err="Can not increase nstlist for GPU run because the box is too small";
501 const char *dd_err ="Can not increase nstlist for GPU run because of domain decomposition limitations";
504 /* Number of + nstlist alternative values to try when switching */
505 const int nstl[]={ 20, 25, 40, 50 };
506 #define NNSTL sizeof(nstl)/sizeof(nstl[0])
508 env = getenv(NSTLIST_ENVVAR);
513 fprintf(fp,nstl_fmt,ir->nstlist);
517 if (ir->verletbuf_drift == 0)
519 gmx_fatal(FARGS,"You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
522 if (ir->verletbuf_drift < 0)
526 fprintf(stderr,"%s\n",vbd_err);
530 fprintf(fp,"%s\n",vbd_err);
536 nstlist_orig = ir->nstlist;
539 sprintf(buf,"Getting nstlist from environment variable GMX_NSTLIST=%s",env);
542 fprintf(stderr,"%s\n",buf);
546 fprintf(fp,"%s\n",buf);
548 sscanf(env,"%d",&ir->nstlist);
551 verletbuf_get_list_setup(TRUE,&ls);
553 /* Allow rlist to make the list double the size of the cut-off sphere */
554 rlist_inc = nbnxn_get_rlist_effective_inc(NBNXN_GPU_CLUSTER_SIZE,mtop->natoms/det(box));
555 rlist_ok = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_OK_FAC,1.0/3.0) - rlist_inc;
556 rlist_max = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_MAX_FAC,1.0/3.0) - rlist_inc;
559 fprintf(debug,"GPU nstlist tuning: rlist_inc %.3f rlist_max %.3f\n",
560 rlist_inc,rlist_max);
564 nstlist_prev = nstlist_orig;
565 rlist_prev = ir->rlist;
570 ir->nstlist = nstl[i];
573 /* Set the pair-list buffer size in ir */
574 calc_verlet_buffer_size(mtop,det(box),ir,ir->verletbuf_drift,&ls,
577 /* Does rlist fit in the box? */
578 bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC,box));
580 if (bBox && DOMAINDECOMP(cr))
582 /* Check if rlist fits in the domain decomposition */
583 if (inputrec2nboundeddim(ir) < DIM)
585 gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
587 copy_mat(box,state_tmp.box);
588 bDD = change_dd_cutoff(cr,&state_tmp,ir,rlist_new);
595 if (bBox && bDD && rlist_new <= rlist_max)
597 /* Increase nstlist */
598 nstlist_prev = ir->nstlist;
599 rlist_prev = rlist_new;
600 bCont = (i+1 < NNSTL && rlist_new < rlist_ok);
604 /* Stick with the previous nstlist */
605 ir->nstlist = nstlist_prev;
606 rlist_new = rlist_prev;
618 gmx_warning(!bBox ? box_err : dd_err);
621 fprintf(fp,"\n%s\n",bBox ? box_err : dd_err);
623 ir->nstlist = nstlist_orig;
625 else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
627 sprintf(buf,"Changing nstlist from %d to %d, rlist from %g to %g",
628 nstlist_orig,ir->nstlist,
629 ir->rlist,rlist_new);
632 fprintf(stderr,"%s\n\n",buf);
636 fprintf(fp,"%s\n\n",buf);
638 ir->rlist = rlist_new;
639 ir->rlistlong = rlist_new;
643 static void prepare_verlet_scheme(FILE *fplog,
644 gmx_hw_info_t *hwinfo,
646 gmx_hw_opt_t *hw_opt,
647 const char *nbpu_opt,
649 const gmx_mtop_t *mtop,
653 /* Here we only check for GPU usage on the MPI master process,
654 * as here we don't know how many GPUs we will use yet.
655 * We check for a GPU on all processes later.
657 *bUseGPU = hwinfo->bCanUseGPU || (getenv("GMX_EMULATE_GPU") != NULL);
659 if (ir->verletbuf_drift > 0)
661 /* Update the Verlet buffer size for the current run setup */
662 verletbuf_list_setup_t ls;
665 /* Here we assume CPU acceleration is on. But as currently
666 * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
667 * and 4x2 gives a larger buffer than 4x4, this is ok.
669 verletbuf_get_list_setup(*bUseGPU,&ls);
671 calc_verlet_buffer_size(mtop,det(box),ir,
672 ir->verletbuf_drift,&ls,
674 if (rlist_new != ir->rlist)
678 fprintf(fplog,"\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
680 ls.cluster_size_i,ls.cluster_size_j);
682 ir->rlist = rlist_new;
683 ir->rlistlong = rlist_new;
687 /* With GPU or emulation we should check nstlist for performance */
688 if ((EI_DYNAMICS(ir->eI) &&
690 ir->nstlist < NSTLIST_GPU_ENOUGH) ||
691 getenv(NSTLIST_ENVVAR) != NULL)
693 /* Choose a better nstlist */
694 increase_nstlist(fplog,cr,ir,mtop,box);
698 static void convert_to_verlet_scheme(FILE *fplog,
700 gmx_mtop_t *mtop,real box_vol)
702 char *conv_mesg="Converting input file with group cut-off scheme to the Verlet cut-off scheme";
704 md_print_warn(NULL,fplog,"%s\n",conv_mesg);
706 ir->cutoff_scheme = ecutsVERLET;
707 ir->verletbuf_drift = 0.005;
709 if (ir->rcoulomb != ir->rvdw)
711 gmx_fatal(FARGS,"The VdW and Coulomb cut-offs are different, whereas the Verlet scheme only supports equal cut-offs");
714 if (ir->vdwtype == evdwUSER || EEL_USER(ir->coulombtype))
716 gmx_fatal(FARGS,"User non-bonded potentials are not (yet) supported with the Verlet scheme");
718 else if (EVDW_SWITCHED(ir->vdwtype) || EEL_SWITCHED(ir->coulombtype))
720 md_print_warn(NULL,fplog,"Converting switched or shifted interactions to a shifted potential (without force shift), this will lead to slightly different interaction potentials");
722 if (EVDW_SWITCHED(ir->vdwtype))
724 ir->vdwtype = evdwCUT;
726 if (EEL_SWITCHED(ir->coulombtype))
728 if (EEL_FULL(ir->coulombtype))
730 /* With full electrostatic only PME can be switched */
731 ir->coulombtype = eelPME;
735 md_print_warn(NULL,fplog,"NOTE: Replacing %s electrostatics with reaction-field with epsilon-rf=inf\n",eel_names[ir->coulombtype]);
736 ir->coulombtype = eelRF;
737 ir->epsilon_rf = 0.0;
741 /* We set the target energy drift to a small number.
742 * Note that this is only for testing. For production the user
743 * should think about this and set the mdp options.
745 ir->verletbuf_drift = 1e-4;
748 if (inputrec2nboundeddim(ir) != 3)
750 gmx_fatal(FARGS,"Can only convert old tpr files to the Verlet cut-off scheme with 3D pbc");
753 if (ir->efep != efepNO || ir->implicit_solvent != eisNO)
755 gmx_fatal(FARGS,"Will not convert old tpr files to the Verlet cut-off scheme with free-energy calculations or implicit solvent");
758 if (EI_DYNAMICS(ir->eI) && !(EI_MD(ir->eI) && ir->etc == etcNO))
760 verletbuf_list_setup_t ls;
762 verletbuf_get_list_setup(FALSE,&ls);
763 calc_verlet_buffer_size(mtop,box_vol,ir,ir->verletbuf_drift,&ls,
768 ir->verletbuf_drift = -1;
769 ir->rlist = 1.05*max(ir->rvdw,ir->rcoulomb);
772 gmx_mtop_remove_chargegroups(mtop);
775 /* Check the process affinity mask and if it is found to be non-zero,
776 * will honor it and disable mdrun internal affinity setting.
777 * This function should be called first before the OpenMP library gets
778 * initialized with the last argument FALSE (which will detect affinity
779 * set by external tools like taskset), and later, after the OpenMP
780 * initialization, with the last argument TRUE to detect affinity changes
781 * made by the OpenMP library.
783 * Note that this will only work on Linux as we use a GNU feature. */
784 static void check_cpu_affinity_set(FILE *fplog, const t_commrec *cr,
785 gmx_hw_opt_t *hw_opt, int ncpus,
786 gmx_bool bAfterOpenmpInit)
788 #ifdef HAVE_SCHED_GETAFFINITY
789 cpu_set_t mask_current;
790 int i, ret, cpu_count, cpu_set;
794 if (!hw_opt->bThreadPinning)
796 /* internal affinity setting is off, don't bother checking process affinity */
800 CPU_ZERO(&mask_current);
801 if ((ret = sched_getaffinity(0, sizeof(cpu_set_t), &mask_current)) != 0)
803 /* failed to query affinity mask, will just return */
806 fprintf(debug, "Failed to query affinity mask (error %d)", ret);
811 /* Before proceeding with the actual check, make sure that the number of
812 * detected CPUs is >= the CPUs in the current set.
813 * We need to check for CPU_COUNT as it was added only in glibc 2.6. */
815 if (ncpus < CPU_COUNT(&mask_current))
819 fprintf(debug, "%d CPUs detected, but %d was returned by CPU_COUNT",
820 ncpus, CPU_COUNT(&mask_current));
824 #endif /* CPU_COUNT */
827 for (i = 0; (i < ncpus && i < CPU_SETSIZE); i++)
829 bAllSet = bAllSet && (CPU_ISSET(i, &mask_current) != 0);
834 if (!bAfterOpenmpInit)
836 md_print_warn(cr, fplog,
837 "Non-default process affinity set, disabling internal affinity");
841 md_print_warn(cr, fplog,
842 "Non-default process affinity set probably by the OpenMP library, "
843 "disabling internal affinity");
845 hw_opt->bThreadPinning = FALSE;
849 fprintf(debug, "Non-default affinity mask found\n");
856 fprintf(debug, "Default affinity mask found\n");
859 #endif /* HAVE_SCHED_GETAFFINITY */
862 /* Set CPU affinity. Can be important for performance.
863 On some systems (e.g. Cray) CPU Affinity is set by default.
864 But default assigning doesn't work (well) with only some ranks
865 having threads. This causes very low performance.
866 External tools have cumbersome syntax for setting affinity
867 in the case that only some ranks have threads.
868 Thus it is important that GROMACS sets the affinity internally
869 if only PME is using threads.
871 static void set_cpu_affinity(FILE *fplog,
873 gmx_hw_opt_t *hw_opt,
875 const gmx_hw_info_t *hwinfo,
876 const t_inputrec *inputrec)
878 #if defined GMX_THREAD_MPI
879 /* With the number of TMPI threads equal to the number of cores
880 * we already pinned in thread-MPI, so don't pin again here.
882 if (hw_opt->nthreads_tmpi == tMPI_Thread_get_hw_number())
889 /* If the tMPI thread affinity setting is not supported encourage the user
890 * to report it as it's either a bug or an exotic platform which we might
891 * want to support. */
892 if (tMPI_Thread_setaffinity_support() != TMPI_SETAFFINITY_SUPPORT_YES)
894 md_print_warn(NULL, fplog,
895 "Can not set thread affinities on the current plarform. On NUMA systems this\n"
896 "can cause performance degradation. If you think your platform should support\n"
897 "setting affinities, contact the GROMACS developers.");
900 #endif /* __APPLE__ */
902 if (hw_opt->bThreadPinning)
904 int nth_affinity_set, thread_id_node, thread_id,
905 nthread_local, nthread_node, nthread_hw_max, nphyscore;
909 /* threads on this MPI process or TMPI thread */
910 if (cr->duty & DUTY_PP)
912 nthread_local = gmx_omp_nthreads_get(emntNonbonded);
916 nthread_local = gmx_omp_nthreads_get(emntPME);
919 /* map the current process to cores */
921 nthread_node = nthread_local;
923 if (PAR(cr) || MULTISIM(cr))
925 /* We need to determine a scan of the thread counts in this
930 MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),cr->rank_intranode,
932 MPI_Scan(&nthread_local,&thread_id_node,1,MPI_INT,MPI_SUM,comm_intra);
933 /* MPI_Scan is inclusive, but here we need exclusive */
934 thread_id_node -= nthread_local;
935 /* Get the total number of threads on this physical node */
936 MPI_Allreduce(&nthread_local,&nthread_node,1,MPI_INT,MPI_SUM,comm_intra);
937 MPI_Comm_free(&comm_intra);
942 if (hw_opt->core_pinning_offset > 0)
944 offset = hw_opt->core_pinning_offset;
947 fprintf(stderr, "Applying core pinning offset %d\n", offset);
951 fprintf(fplog, "Applying core pinning offset %d\n", offset);
955 /* With Intel Hyper-Threading enabled, we want to pin consecutive
956 * threads to physical cores when using more threads than physical
957 * cores or when the user requests so.
959 nthread_hw_max = hwinfo->nthreads_hw_avail;
961 if (hw_opt->bPinHyperthreading ||
962 (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
963 nthread_node > nthread_hw_max/2 && getenv("GMX_DISABLE_PINHT") == NULL))
965 if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) != GMX_CPUID_X86_SMT_ENABLED)
967 /* We print to stderr on all processes, as we might have
968 * different settings on different physical nodes.
970 if (gmx_cpuid_vendor(hwinfo->cpuid_info) != GMX_CPUID_VENDOR_INTEL)
972 md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
973 "but non-Intel CPU detected (vendor: %s)\n",
974 gmx_cpuid_vendor_string[gmx_cpuid_vendor(hwinfo->cpuid_info)]);
978 md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
979 "but the CPU detected does not have Intel Hyper-Threading support "
980 "(or it is turned off)\n");
983 nphyscore = nthread_hw_max/2;
987 fprintf(stderr, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
992 fprintf(fplog, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
997 /* Set the per-thread affinity. In order to be able to check the success
998 * of affinity settings, we will set nth_affinity_set to 1 on threads
999 * where the affinity setting succeded and to 0 where it failed.
1000 * Reducing these 0/1 values over the threads will give the total number
1001 * of threads on which we succeeded.
1003 nth_affinity_set = 0;
1004 #pragma omp parallel firstprivate(thread_id_node) num_threads(nthread_local) \
1005 reduction(+:nth_affinity_set)
1008 gmx_bool setaffinity_ret;
1010 thread_id = gmx_omp_get_thread_num();
1011 thread_id_node += thread_id;
1014 core = offset + thread_id_node;
1018 /* Lock pairs of threads to the same hyperthreaded core */
1019 core = offset + thread_id_node/2 + (thread_id_node % 2)*nphyscore;
1022 setaffinity_ret = tMPI_Thread_setaffinity_single(tMPI_Thread_self(), core);
1024 /* store the per-thread success-values of the setaffinity */
1025 nth_affinity_set = (setaffinity_ret == 0);
1029 fprintf(debug, "On rank %2d, thread %2d, core %2d the affinity setting returned %d\n",
1030 cr->nodeid, gmx_omp_get_thread_num(), core, setaffinity_ret);
1034 if (nth_affinity_set > nthread_local)
1038 sprintf(msg, "Looks like we have set affinity for more threads than "
1039 "we have (%d > %d)!\n", nth_affinity_set, nthread_local);
1044 /* check & warn if some threads failed to set their affinities */
1045 if (nth_affinity_set != nthread_local)
1047 char sbuf1[STRLEN], sbuf2[STRLEN];
1049 /* sbuf1 contains rank info, while sbuf2 OpenMP thread info */
1050 sbuf1[0] = sbuf2[0] = '\0';
1052 #ifdef GMX_THREAD_MPI
1053 sprintf(sbuf1, "In thread-MPI thread #%d: ", cr->nodeid);
1054 #else /* GMX_LIB_MPI */
1055 sprintf(sbuf1, "In MPI process #%d: ", cr->nodeid);
1057 #endif /* GMX_MPI */
1059 if (nthread_local > 1)
1061 sprintf(sbuf2, "of %d/%d thread%s ",
1062 nthread_local - nth_affinity_set, nthread_local,
1063 (nthread_local - nth_affinity_set) > 1 ? "s" : "");
1066 md_print_warn(NULL, fplog,
1067 "NOTE: %sAffinity setting %sfailed.\n"
1068 " This can cause performance degradation!",
1076 static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt,
1079 gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp);
1081 #ifndef GMX_THREAD_MPI
1082 if (hw_opt->nthreads_tot > 0)
1084 gmx_fatal(FARGS,"Setting the total number of threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
1086 if (hw_opt->nthreads_tmpi > 0)
1088 gmx_fatal(FARGS,"Setting the number of thread-MPI threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
1092 if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
1094 /* We have the same number of OpenMP threads for PP and PME processes,
1095 * thus we can perform several consistency checks.
1097 if (hw_opt->nthreads_tmpi > 0 &&
1098 hw_opt->nthreads_omp > 0 &&
1099 hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
1101 gmx_fatal(FARGS,"The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
1102 hw_opt->nthreads_tot,hw_opt->nthreads_tmpi,hw_opt->nthreads_omp);
1105 if (hw_opt->nthreads_tmpi > 0 &&
1106 hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
1108 gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
1109 hw_opt->nthreads_tot,hw_opt->nthreads_tmpi);
1112 if (hw_opt->nthreads_omp > 0 &&
1113 hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
1115 gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
1116 hw_opt->nthreads_tot,hw_opt->nthreads_omp);
1119 if (hw_opt->nthreads_tmpi > 0 &&
1120 hw_opt->nthreads_omp <= 0)
1122 hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
1127 if (hw_opt->nthreads_omp > 1)
1129 gmx_fatal(FARGS,"OpenMP threads are requested, but Gromacs was compiled without OpenMP support");
1133 if (cutoff_scheme == ecutsGROUP)
1135 /* We only have OpenMP support for PME only nodes */
1136 if (hw_opt->nthreads_omp > 1)
1138 gmx_fatal(FARGS,"OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
1139 ecutscheme_names[cutoff_scheme],
1140 ecutscheme_names[ecutsVERLET]);
1142 hw_opt->nthreads_omp = 1;
1145 if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
1147 gmx_fatal(FARGS,"You need to specify -ntomp in addition to -ntomp_pme");
1150 if (hw_opt->nthreads_tot == 1)
1152 hw_opt->nthreads_tmpi = 1;
1154 if (hw_opt->nthreads_omp > 1)
1156 gmx_fatal(FARGS,"You requested %d OpenMP threads with %d total threads",
1157 hw_opt->nthreads_tmpi,hw_opt->nthreads_tot);
1159 hw_opt->nthreads_omp = 1;
1162 if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
1164 hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
1169 fprintf(debug,"hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
1170 hw_opt->nthreads_tot,
1171 hw_opt->nthreads_tmpi,
1172 hw_opt->nthreads_omp,
1173 hw_opt->nthreads_omp_pme,
1174 hw_opt->gpu_id!=NULL ? hw_opt->gpu_id : "");
1180 /* Override the value in inputrec with value passed on the command line (if any) */
1181 static void override_nsteps_cmdline(FILE *fplog,
1184 const t_commrec *cr)
1189 /* override with anything else than the default -2 */
1190 if (nsteps_cmdline > -2)
1194 ir->nsteps = nsteps_cmdline;
1195 if (EI_DYNAMICS(ir->eI))
1197 sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps, %.3f ps",
1198 nsteps_cmdline, nsteps_cmdline*ir->delta_t);
1202 sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps",
1206 md_print_warn(cr, fplog, "%s\n", stmp);
1210 /* Data structure set by SIMMASTER which needs to be passed to all nodes
1211 * before the other nodes have read the tpx file and called gmx_detect_hardware.
1214 int cutoff_scheme; /* The cutoff scheme from inputrec_t */
1215 gmx_bool bUseGPU; /* Use GPU or GPU emulation */
1218 int mdrunner(gmx_hw_opt_t *hw_opt,
1219 FILE *fplog,t_commrec *cr,int nfile,
1220 const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
1221 gmx_bool bCompact, int nstglobalcomm,
1222 ivec ddxyz,int dd_node_order,real rdd,real rconstr,
1223 const char *dddlb_opt,real dlb_scale,
1224 const char *ddcsx,const char *ddcsy,const char *ddcsz,
1225 const char *nbpu_opt,
1226 int nsteps_cmdline, int nstepout,int resetstep,
1227 int nmultisim,int repl_ex_nst,int repl_ex_nex,
1228 int repl_ex_seed, real pforce,real cpt_period,real max_hours,
1229 const char *deviceOptions, unsigned long Flags)
1231 gmx_bool bForceUseGPU,bTryUseGPU;
1232 double nodetime=0,realtime;
1233 t_inputrec *inputrec;
1234 t_state *state=NULL;
1236 gmx_ddbox_t ddbox={0};
1237 int npme_major,npme_minor;
1240 gmx_mtop_t *mtop=NULL;
1241 t_mdatoms *mdatoms=NULL;
1242 t_forcerec *fr=NULL;
1245 gmx_pme_t *pmedata=NULL;
1246 gmx_vsite_t *vsite=NULL;
1247 gmx_constr_t constr;
1248 int i,m,nChargePerturbed=-1,status,nalloc;
1250 gmx_wallcycle_t wcycle;
1251 gmx_bool bReadRNG,bReadEkin;
1253 gmx_runtime_t runtime;
1255 gmx_large_int_t reset_counters;
1256 gmx_edsam_t ed=NULL;
1257 t_commrec *cr_old=cr;
1260 gmx_membed_t membed=NULL;
1261 gmx_hw_info_t *hwinfo=NULL;
1262 master_inf_t minf={-1,FALSE};
1264 /* CAUTION: threads may be started later on in this function, so
1265 cr doesn't reflect the final parallel state right now */
1269 if (Flags & MD_APPENDFILES)
1274 bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
1275 bTryUseGPU = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
1280 /* Read (nearly) all data required for the simulation */
1281 read_tpx_state(ftp2fn(efTPX,nfile,fnm),inputrec,state,NULL,mtop);
1283 if (inputrec->cutoff_scheme != ecutsVERLET &&
1284 ((Flags & MD_TESTVERLET) || getenv("GMX_VERLET_SCHEME") != NULL))
1286 convert_to_verlet_scheme(fplog,inputrec,mtop,det(state->box));
1289 /* Detect hardware, gather information. With tMPI only thread 0 does it
1290 * and after threads are started broadcasts hwinfo around. */
1292 gmx_detect_hardware(fplog, hwinfo, cr,
1293 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
1295 minf.cutoff_scheme = inputrec->cutoff_scheme;
1296 minf.bUseGPU = FALSE;
1298 if (inputrec->cutoff_scheme == ecutsVERLET)
1300 prepare_verlet_scheme(fplog,hwinfo,cr,hw_opt,nbpu_opt,
1301 inputrec,mtop,state->box,
1304 else if (hwinfo->bCanUseGPU)
1306 md_print_warn(cr,fplog,
1307 "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
1308 " To use a GPU, set the mdp option: cutoff-scheme = Verlet\n"
1309 " (for quick performance testing you can use the -testverlet option)\n");
1313 gmx_fatal(FARGS,"GPU requested, but can't be used without cutoff-scheme=Verlet");
1317 #ifndef GMX_THREAD_MPI
1320 gmx_bcast_sim(sizeof(minf),&minf,cr);
1323 if (minf.bUseGPU && cr->npmenodes == -1)
1325 /* Don't automatically use PME-only nodes with GPUs */
1329 /* Check for externally set OpenMP affinity and turn off internal
1330 * pinning if any is found. We need to do this check early to tell
1331 * thread-MPI whether it should do pinning when spawning threads.
1333 gmx_omp_check_thread_affinity(fplog, cr, hw_opt);
1335 #ifdef GMX_THREAD_MPI
1336 /* With thread-MPI inputrec is only set here on the master thread */
1340 check_and_update_hw_opt(hw_opt,minf.cutoff_scheme);
1342 #ifdef GMX_THREAD_MPI
1343 /* Early check for externally set process affinity. Can't do over all
1344 * MPI processes because hwinfo is not available everywhere, but with
1345 * thread-MPI it's needed as pinning might get turned off which needs
1346 * to be known before starting thread-MPI. */
1347 check_cpu_affinity_set(fplog,
1349 hw_opt, hwinfo->nthreads_hw_avail, FALSE);
1352 #ifdef GMX_THREAD_MPI
1353 if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
1355 gmx_fatal(FARGS,"You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME nodes");
1359 if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
1362 gmx_fatal(FARGS,"You need to explicitly specify the number of PME nodes (-npme) when using different number of OpenMP threads for PP and PME nodes");
1366 #ifdef GMX_THREAD_MPI
1369 /* NOW the threads will be started: */
1370 hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
1374 if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
1376 hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
1379 if (hw_opt->nthreads_tmpi > 1)
1381 /* now start the threads. */
1382 cr=mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm,
1383 oenv, bVerbose, bCompact, nstglobalcomm,
1384 ddxyz, dd_node_order, rdd, rconstr,
1385 dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
1387 nsteps_cmdline, nstepout, resetstep, nmultisim,
1388 repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
1389 cpt_period, max_hours, deviceOptions,
1391 /* the main thread continues here with a new cr. We don't deallocate
1392 the old cr because other threads may still be reading it. */
1395 gmx_comm("Failed to spawn threads");
1400 /* END OF CAUTION: cr is now reliable */
1402 /* g_membed initialisation *
1403 * Because we change the mtop, init_membed is called before the init_parallel *
1404 * (in case we ever want to make it run in parallel) */
1405 if (opt2bSet("-membed",nfile,fnm))
1409 fprintf(stderr,"Initializing membed");
1411 membed = init_membed(fplog,nfile,fnm,mtop,inputrec,state,cr,&cpt_period);
1416 /* now broadcast everything to the non-master nodes/threads: */
1417 init_parallel(fplog, cr, inputrec, mtop);
1419 /* This check needs to happen after get_nthreads_mpi() */
1420 if (inputrec->cutoff_scheme == ecutsVERLET && (Flags & MD_PARTDEC))
1422 gmx_fatal_collective(FARGS,cr,NULL,
1423 "The Verlet cut-off scheme is not supported with particle decomposition.\n"
1424 "You can achieve the same effect as particle decomposition by running in parallel using only OpenMP threads.");
1429 pr_inputrec(fplog,0,"Input Parameters",inputrec,FALSE);
1432 #if defined GMX_THREAD_MPI
1433 /* With tMPI we detected on thread 0 and we'll just pass the hwinfo pointer
1434 * to the other threads -- slightly uncool, but works fine, just need to
1435 * make sure that the data doesn't get freed twice. */
1442 gmx_bcast(sizeof(&hwinfo), &hwinfo, cr);
1445 if (PAR(cr) && !SIMMASTER(cr))
1447 /* now we have inputrec on all nodes, can run the detection */
1448 /* TODO: perhaps it's better to propagate within a node instead? */
1450 gmx_detect_hardware(fplog, hwinfo, cr,
1451 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
1454 /* Now do the affinity check with MPI/no-MPI (done earlier with thread-MPI). */
1455 check_cpu_affinity_set(fplog, cr,
1456 hw_opt, hwinfo->nthreads_hw_avail, FALSE);
1459 /* now make sure the state is initialized and propagated */
1460 set_state_entries(state,inputrec,cr->nnodes);
1462 /* remove when vv and rerun works correctly! */
1463 if (PAR(cr) && EI_VV(inputrec->eI) && ((Flags & MD_RERUN) || (Flags & MD_RERUN_VSITE)))
1466 "Currently can't do velocity verlet with rerun in parallel.");
1469 /* A parallel command line option consistency check that we can
1470 only do after any threads have started. */
1472 (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
1475 "The -dd or -npme option request a parallel simulation, "
1477 "but %s was compiled without threads or MPI enabled"
1479 #ifdef GMX_THREAD_MPI
1480 "but the number of threads (option -nt) is 1"
1482 "but %s was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec"
1489 if ((Flags & MD_RERUN) &&
1490 (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
1492 gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
1495 if (can_use_allvsall(inputrec,mtop,TRUE,cr,fplog) && PAR(cr))
1497 /* All-vs-all loops do not work with domain decomposition */
1498 Flags |= MD_PARTDEC;
1501 if (!EEL_PME(inputrec->coulombtype) || (Flags & MD_PARTDEC))
1503 if (cr->npmenodes > 0)
1505 if (!EEL_PME(inputrec->coulombtype))
1507 gmx_fatal_collective(FARGS,cr,NULL,
1508 "PME nodes are requested, but the system does not use PME electrostatics");
1510 if (Flags & MD_PARTDEC)
1512 gmx_fatal_collective(FARGS,cr,NULL,
1513 "PME nodes are requested, but particle decomposition does not support separate PME nodes");
1521 fcRegisterSteps(inputrec->nsteps,inputrec->init_step);
1524 /* NMR restraints must be initialized before load_checkpoint,
1525 * since with time averaging the history is added to t_state.
1526 * For proper consistency check we therefore need to extend
1528 * So the PME-only nodes (if present) will also initialize
1529 * the distance restraints.
1533 /* This needs to be called before read_checkpoint to extend the state */
1534 init_disres(fplog,mtop,inputrec,cr,Flags & MD_PARTDEC,fcd,state);
1536 if (gmx_mtop_ftype_count(mtop,F_ORIRES) > 0)
1538 if (PAR(cr) && !(Flags & MD_PARTDEC))
1540 gmx_fatal(FARGS,"Orientation restraints do not work (yet) with domain decomposition, use particle decomposition (mdrun option -pd)");
1542 /* Orientation restraints */
1545 init_orires(fplog,mtop,state->x,inputrec,cr->ms,&(fcd->orires),
1550 if (DEFORM(*inputrec))
1552 /* Store the deform reference box before reading the checkpoint */
1555 copy_mat(state->box,box);
1559 gmx_bcast(sizeof(box),box,cr);
1561 /* Because we do not have the update struct available yet
1562 * in which the reference values should be stored,
1563 * we store them temporarily in static variables.
1564 * This should be thread safe, since they are only written once
1565 * and with identical values.
1567 #ifdef GMX_THREAD_MPI
1568 tMPI_Thread_mutex_lock(&deform_init_box_mutex);
1570 deform_init_init_step_tpx = inputrec->init_step;
1571 copy_mat(box,deform_init_box_tpx);
1572 #ifdef GMX_THREAD_MPI
1573 tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
1577 if (opt2bSet("-cpi",nfile,fnm))
1579 /* Check if checkpoint file exists before doing continuation.
1580 * This way we can use identical input options for the first and subsequent runs...
1582 if( gmx_fexist_master(opt2fn_master("-cpi",nfile,fnm,cr),cr) )
1584 load_checkpoint(opt2fn_master("-cpi",nfile,fnm,cr),&fplog,
1585 cr,Flags & MD_PARTDEC,ddxyz,
1586 inputrec,state,&bReadRNG,&bReadEkin,
1587 (Flags & MD_APPENDFILES),
1588 (Flags & MD_APPENDFILESSET));
1592 Flags |= MD_READ_RNG;
1596 Flags |= MD_READ_EKIN;
1601 if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES))
1602 #ifdef GMX_THREAD_MPI
1603 /* With thread MPI only the master node/thread exists in mdrun.c,
1604 * therefore non-master nodes need to open the "seppot" log file here.
1606 || (!MASTER(cr) && (Flags & MD_SEPPOT))
1610 gmx_log_open(ftp2fn(efLOG,nfile,fnm),cr,!(Flags & MD_SEPPOT),
1614 /* override nsteps with value from cmdline */
1615 override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);
1619 copy_mat(state->box,box);
1624 gmx_bcast(sizeof(box),box,cr);
1627 /* Essential dynamics */
1628 if (opt2bSet("-ei",nfile,fnm))
1630 /* Open input and output files, allocate space for ED data structure */
1631 ed = ed_open(nfile,fnm,Flags,cr);
1634 if (PAR(cr) && !((Flags & MD_PARTDEC) ||
1635 EI_TPI(inputrec->eI) ||
1636 inputrec->eI == eiNM))
1638 cr->dd = init_domain_decomposition(fplog,cr,Flags,ddxyz,rdd,rconstr,
1639 dddlb_opt,dlb_scale,
1643 &ddbox,&npme_major,&npme_minor);
1645 make_dd_communicators(fplog,cr,dd_node_order);
1647 /* Set overallocation to avoid frequent reallocation of arrays */
1648 set_over_alloc_dd(TRUE);
1652 /* PME, if used, is done on all nodes with 1D decomposition */
1654 cr->duty = (DUTY_PP | DUTY_PME);
1657 if (!EI_TPI(inputrec->eI))
1659 npme_major = cr->nnodes;
1662 if (inputrec->ePBC == epbcSCREW)
1665 "pbc=%s is only implemented with domain decomposition",
1666 epbc_names[inputrec->ePBC]);
1672 /* After possible communicator splitting in make_dd_communicators.
1673 * we can set up the intra/inter node communication.
1675 gmx_setup_nodecomm(fplog,cr);
1678 /* Initialize per-physical-node MPI process/thread ID and counters. */
1679 gmx_init_intranode_counters(cr);
1682 md_print_info(cr,fplog,"Using %d MPI %s\n",
1684 #ifdef GMX_THREAD_MPI
1685 cr->nnodes==1 ? "thread" : "threads"
1687 cr->nnodes==1 ? "process" : "processes"
1693 gmx_omp_nthreads_init(fplog, cr,
1694 hwinfo->nthreads_hw_avail,
1695 hw_opt->nthreads_omp,
1696 hw_opt->nthreads_omp_pme,
1697 (cr->duty & DUTY_PP) == 0,
1698 inputrec->cutoff_scheme == ecutsVERLET);
1700 gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi, minf.bUseGPU);
1702 /* getting number of PP/PME threads
1703 PME: env variable should be read only on one node to make sure it is
1704 identical everywhere;
1706 /* TODO nthreads_pp is only used for pinning threads.
1707 * This is a temporary solution until we have a hw topology library.
1709 nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
1710 nthreads_pme = gmx_omp_nthreads_get(emntPME);
1712 wcycle = wallcycle_init(fplog,resetstep,cr,nthreads_pp,nthreads_pme);
1716 /* Master synchronizes its value of reset_counters with all nodes
1717 * including PME only nodes */
1718 reset_counters = wcycle_get_reset_counters(wcycle);
1719 gmx_bcast_sim(sizeof(reset_counters),&reset_counters,cr);
1720 wcycle_set_reset_counters(wcycle, reset_counters);
1724 if (cr->duty & DUTY_PP)
1726 /* For domain decomposition we allocate dynamically
1727 * in dd_partition_system.
1729 if (DOMAINDECOMP(cr))
1731 bcast_state_setup(cr,state);
1737 bcast_state(cr,state,TRUE);
1741 /* Initiate forcerecord */
1743 fr->hwinfo = hwinfo;
1744 init_forcerec(fplog,oenv,fr,fcd,inputrec,mtop,cr,box,FALSE,
1745 opt2fn("-table",nfile,fnm),
1746 opt2fn("-tabletf",nfile,fnm),
1747 opt2fn("-tablep",nfile,fnm),
1748 opt2fn("-tableb",nfile,fnm),
1752 /* version for PCA_NOT_READ_NODE (see md.c) */
1753 /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
1754 "nofile","nofile","nofile","nofile",FALSE,pforce);
1756 fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);
1758 /* Initialize QM-MM */
1761 init_QMMMrec(cr,box,mtop,inputrec,fr);
1764 /* Initialize the mdatoms structure.
1765 * mdatoms is not filled with atom data,
1766 * as this can not be done now with domain decomposition.
1768 mdatoms = init_mdatoms(fplog,mtop,inputrec->efep!=efepNO);
1770 /* Initialize the virtual site communication */
1771 vsite = init_vsite(mtop,cr,FALSE);
1773 calc_shifts(box,fr->shift_vec);
1775 /* With periodic molecules the charge groups should be whole at start up
1776 * and the virtual sites should not be far from their proper positions.
1778 if (!inputrec->bContinuation && MASTER(cr) &&
1779 !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
1781 /* Make molecules whole at start of run */
1782 if (fr->ePBC != epbcNONE)
1784 do_pbc_first_mtop(fplog,inputrec->ePBC,box,mtop,state->x);
1788 /* Correct initial vsite positions are required
1789 * for the initial distribution in the domain decomposition
1790 * and for the initial shell prediction.
1792 construct_vsites_mtop(fplog,vsite,mtop,state->x);
1796 if (EEL_PME(fr->eeltype))
1798 ewaldcoeff = fr->ewaldcoeff;
1799 pmedata = &fr->pmedata;
1808 /* This is a PME only node */
1810 /* We don't need the state */
1813 ewaldcoeff = calc_ewaldcoeff(inputrec->rcoulomb, inputrec->ewald_rtol);
1817 /* Before setting affinity, check whether the affinity has changed
1818 * - which indicates that probably the OpenMP library has changed it since
1819 * we first checked). */
1820 check_cpu_affinity_set(fplog, cr, hw_opt, hwinfo->nthreads_hw_avail, TRUE);
1822 /* Set the CPU affinity */
1823 set_cpu_affinity(fplog,cr,hw_opt,nthreads_pme,hwinfo,inputrec);
1825 /* Initiate PME if necessary,
1826 * either on all nodes or on dedicated PME nodes only. */
1827 if (EEL_PME(inputrec->coulombtype))
1831 nChargePerturbed = mdatoms->nChargePerturbed;
1833 if (cr->npmenodes > 0)
1835 /* The PME only nodes need to know nChargePerturbed */
1836 gmx_bcast_sim(sizeof(nChargePerturbed),&nChargePerturbed,cr);
1839 if (cr->duty & DUTY_PME)
1841 status = gmx_pme_init(pmedata,cr,npme_major,npme_minor,inputrec,
1842 mtop ? mtop->natoms : 0,nChargePerturbed,
1843 (Flags & MD_REPRODUCIBLE),nthreads_pme);
1846 gmx_fatal(FARGS,"Error %d initializing PME",status);
1852 if (integrator[inputrec->eI].func == do_md
1855 integrator[inputrec->eI].func == do_md_openmm
1859 /* Turn on signal handling on all nodes */
1861 * (A user signal from the PME nodes (if any)
1862 * is communicated to the PP nodes.
1864 signal_handler_install();
1867 if (cr->duty & DUTY_PP)
1869 if (inputrec->ePull != epullNO)
1871 /* Initialize pull code */
1872 init_pull(fplog,inputrec,nfile,fnm,mtop,cr,oenv, inputrec->fepvals->init_lambda,
1873 EI_DYNAMICS(inputrec->eI) && MASTER(cr),Flags);
1878 /* Initialize enforced rotation code */
1879 init_rot(fplog,inputrec,nfile,fnm,cr,state->x,box,mtop,oenv,
1883 constr = init_constraints(fplog,mtop,inputrec,ed,state,cr);
1885 if (DOMAINDECOMP(cr))
1887 dd_init_bondeds(fplog,cr->dd,mtop,vsite,constr,inputrec,
1888 Flags & MD_DDBONDCHECK,fr->cginfo_mb);
1890 set_dd_parameters(fplog,cr->dd,dlb_scale,inputrec,fr,&ddbox);
1892 setup_dd_grid(fplog,cr->dd);
1895 /* Now do whatever the user wants us to do (how flexible...) */
1896 integrator[inputrec->eI].func(fplog,cr,nfile,fnm,
1897 oenv,bVerbose,bCompact,
1900 nstepout,inputrec,mtop,
1902 mdatoms,nrnb,wcycle,ed,fr,
1903 repl_ex_nst,repl_ex_nex,repl_ex_seed,
1905 cpt_period,max_hours,
1910 if (inputrec->ePull != epullNO)
1912 finish_pull(fplog,inputrec->pull);
1917 finish_rot(fplog,inputrec->rot);
1924 gmx_pmeonly(*pmedata,cr,nrnb,wcycle,ewaldcoeff,FALSE,inputrec);
1927 if (EI_DYNAMICS(inputrec->eI) || EI_TPI(inputrec->eI))
1929 /* Some timing stats */
1932 if (runtime.proc == 0)
1934 runtime.proc = runtime.real;
1943 wallcycle_stop(wcycle,ewcRUN);
1945 /* Finish up, write some stuff
1946 * if rerunMD, don't write last frame again
1948 finish_run(fplog,cr,ftp2fn(efSTO,nfile,fnm),
1949 inputrec,nrnb,wcycle,&runtime,
1950 fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ?
1951 nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL,
1953 EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
1955 if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU)
1957 char gpu_err_str[STRLEN];
1959 /* free GPU memory and uninitialize GPU (by destroying the context) */
1960 nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
1962 if (!free_gpu(gpu_err_str))
1964 gmx_warning("On node %d failed to free GPU #%d: %s",
1965 cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
1969 if (opt2bSet("-membed",nfile,fnm))
1974 #ifdef GMX_THREAD_MPI
1975 if (PAR(cr) && SIMMASTER(cr))
1978 gmx_hardware_info_free(hwinfo);
1981 /* Does what it says */
1982 print_date_and_time(fplog,cr->nodeid,"Finished mdrun",&runtime);
1984 /* Close logfile already here if we were appending to it */
1985 if (MASTER(cr) && (Flags & MD_APPENDFILES))
1987 gmx_log_close(fplog);
1990 rc=(int)gmx_get_stop_condition();
1992 #ifdef GMX_THREAD_MPI
1993 /* we need to join all threads. The sub-threads join when they
1994 exit this function, but the master thread needs to be told to
1996 if (PAR(cr) && MASTER(cr))