1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
4 * This source code is part of
8 * GROningen MAchine for Chemical Simulations
11 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
12 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
13 * Copyright (c) 2001-2004, The GROMACS development team,
14 * check out http://www.gromacs.org for more information.
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version 2
19 * of the License, or (at your option) any later version.
21 * If you want to redistribute modifications, please consider that
22 * scientific software is very special. Version control is crucial -
23 * bugs must be traceable. We will be happy to consider code for
24 * inclusion in the official distribution, but derived work must not
25 * be called official GROMACS. Details are found in the README & COPYING
26 * files - if they are missing, get the official version at www.gromacs.org.
28 * To help us fund GROMACS development, we humbly ask that you cite
29 * the papers on the package - you can find them in the top README file.
31 * For more info, check our website at http://www.gromacs.org
34 * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
39 #if defined(HAVE_SCHED_H) && (defined(HAVE_SCHED_GETAFFINITY) || defined(HAVE_SCHED_SETAFFINITY))
42 #include <sys/syscall.h>
54 #include "md_logging.h"
55 #include "md_support.h"
65 #include "mpelogging.h"
71 #include "checkpoint.h"
72 #include "mtop_util.h"
73 #include "sighandler.h"
76 #include "gmx_detect_hardware.h"
77 #include "gmx_omp_nthreads.h"
78 #include "pull_rotation.h"
79 #include "calc_verletbuf.h"
80 #include "../mdlib/nbnxn_search.h"
81 #include "../mdlib/nbnxn_consts.h"
82 #include "gmx_fatal_collective.h"
84 #include "md_openmm.h"
99 #include "md_openmm.h"
102 #include "gpu_utils.h"
103 #include "nbnxn_cuda_data_mgmt.h"
/* NOTE(review): 'func' is a member of the gmx_intp_t wrapper struct; the
 * enclosing typedef lines are not visible in this excerpt — confirm against
 * the full file. */
106 gmx_integrator_t *func;
109 /* The array should match the eI array in include/types/enums.h */
110 #ifdef GMX_OPENMM /* FIXME do_md_openmm needs fixing */
/* OpenMM build: every integrator/minimizer slot is routed to do_md_openmm. */
111 const gmx_intp_t integrator[eiNR] = { {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm},{do_md_openmm}};
/* Native build: one driver per eI value (the #else/#endif lines are elided
 * from this excerpt). Order must stay in sync with the eI enum. */
113 const gmx_intp_t integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md},{do_md}};
/* Globals recording the reference step and box from the tpx file for
 * continuous box deformation; with thread-MPI they are shared between
 * simulation threads, hence the mutex below. */
116 gmx_large_int_t deform_init_init_step_tpx;
117 matrix deform_init_box_tpx;
118 #ifdef GMX_THREAD_MPI
/* Protects the two deform_init_* globals above during threaded startup. */
119 tMPI_Thread_mutex_t deform_init_box_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
123 #ifdef GMX_THREAD_MPI
/* Bundle of mdrunner() arguments handed as a single void* to each spawned
 * thread-MPI thread (see mdrunner_start_fn). Most members are elided from
 * this excerpt; only a few representative fields are visible. */
124 struct mdrunner_arglist
126 gmx_hw_opt_t *hw_opt;
139 const char *dddlb_opt;
144 const char *nbpu_opt;
155 const char *deviceOptions;
157 int ret; /* return value */
161 /* The function used for spawning threads. Extracts the mdrunner()
162 arguments from its one argument and calls mdrunner(), after making
/* Thread entry point for tMPI_Init_fn(): each spawned thread unpacks the
 * shared mdrunner_arglist, makes thread-local copies where needed, and runs
 * a full mdrunner(). Lines are elided from this excerpt. */
164 static void mdrunner_start_fn(void *arg)
166 struct mdrunner_arglist *mda=(struct mdrunner_arglist*)arg;
167 struct mdrunner_arglist mc=*mda; /* copy the arg list to make sure
168 that it's thread-local. This doesn't
169 copy pointed-to items, of course,
170 but those are all const. */
171 t_commrec *cr; /* we need a local version of this */
/* Duplicate the filename table so each thread owns its copy. */
175 fnm = dup_tfn(mc.nfile, mc.fnm);
/* Per-thread communication record derived from the master's. */
177 cr = init_par_threads(mc.cr);
/* Return value is stored back through the shared pointer so the master can
 * pick it up after the threads join. */
184 mda->ret=mdrunner(mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv,
185 mc.bVerbose, mc.bCompact, mc.nstglobalcomm,
186 mc.ddxyz, mc.dd_node_order, mc.rdd,
187 mc.rconstr, mc.dddlb_opt, mc.dlb_scale,
188 mc.ddcsx, mc.ddcsy, mc.ddcsz,
190 mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
191 mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce,
192 mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.Flags);
195 /* called by mdrunner() to start a specific number of threads (including
196 the main thread) for thread-parallel runs. This in turn calls mdrunner()
198 All options besides nthreads are the same as for mdrunner(). */
/* Fills an mdrunner_arglist from its parameters, spawns
 * hw_opt->nthreads_tmpi threads via tMPI_Init_fn() with mdrunner_start_fn
 * as the entry point, and returns a new commrec for the master thread.
 * Body lines are elided from this excerpt. */
199 static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
200 FILE *fplog,t_commrec *cr,int nfile,
201 const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
202 gmx_bool bCompact, int nstglobalcomm,
203 ivec ddxyz,int dd_node_order,real rdd,real rconstr,
204 const char *dddlb_opt,real dlb_scale,
205 const char *ddcsx,const char *ddcsy,const char *ddcsz,
206 const char *nbpu_opt,
207 int nsteps_cmdline, int nstepout,int resetstep,
208 int nmultisim,int repl_ex_nst,int repl_ex_nex, int repl_ex_seed,
209 real pforce,real cpt_period, real max_hours,
210 const char *deviceOptions, unsigned long Flags)
213 struct mdrunner_arglist *mda;
214 t_commrec *crn; /* the new commrec */
217 /* first check whether we even need to start tMPI */
/* Single-thread case: no spawning needed (return path elided here). */
218 if (hw_opt->nthreads_tmpi < 2)
223 /* a few small, one-time, almost unavoidable memory leaks: */
225 fnmn=dup_tfn(nfile, fnm);
227 /* fill the data structure to pass as void pointer to thread start fn */
234 mda->bVerbose=bVerbose;
235 mda->bCompact=bCompact;
236 mda->nstglobalcomm=nstglobalcomm;
237 mda->ddxyz[XX]=ddxyz[XX];
238 mda->ddxyz[YY]=ddxyz[YY];
239 mda->ddxyz[ZZ]=ddxyz[ZZ];
240 mda->dd_node_order=dd_node_order;
242 mda->rconstr=rconstr;
243 mda->dddlb_opt=dddlb_opt;
244 mda->dlb_scale=dlb_scale;
248 mda->nbpu_opt=nbpu_opt;
249 mda->nsteps_cmdline=nsteps_cmdline;
250 mda->nstepout=nstepout;
251 mda->resetstep=resetstep;
252 mda->nmultisim=nmultisim;
253 mda->repl_ex_nst=repl_ex_nst;
254 mda->repl_ex_nex=repl_ex_nex;
255 mda->repl_ex_seed=repl_ex_seed;
257 mda->cpt_period=cpt_period;
258 mda->max_hours=max_hours;
259 mda->deviceOptions=deviceOptions;
262 fprintf(stderr, "Starting %d tMPI threads\n",hw_opt->nthreads_tmpi);
264 /* now spawn new threads that start mdrunner_start_fn(), while
265 the main thread returns */
/* tMPI pins threads to cores only when internal pinning is enabled. */
266 ret=tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi,
267 (hw_opt->bThreadPinning ? TMPI_AFFINITY_ALL_CORES : TMPI_AFFINITY_NONE),
268 mdrunner_start_fn, (void*)(mda) );
269 if (ret!=TMPI_SUCCESS)
272 /* make a new comm_rec to reflect the new situation */
273 crn=init_par_threads(cr);
/* Heuristically splits nthreads_tot into thread-MPI ranks x OpenMP threads.
 * Returns the chosen number of thread-MPI ranks. With GPUs present one rank
 * per GPU is used; otherwise the split is based on CPU vendor/model
 * heuristics below. Several branch/brace lines are elided in this excerpt. */
278 static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
279 const gmx_hw_opt_t *hw_opt,
285 /* There are no separate PME nodes here, as we ensured in
286 * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
287 * and a conditional ensures we would not have ended up here.
288 * Note that separate PME nodes might be switched on later.
/* GPU branch: one thread-MPI rank per GPU, capped by the total. */
292 nthreads_tmpi = ngpu;
293 if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
295 nthreads_tmpi = nthreads_tot;
298 else if (hw_opt->nthreads_omp > 0)
300 /* Here we could oversubscribe, when we do, we issue a warning later */
301 nthreads_tmpi = max(1,nthreads_tot/hw_opt->nthreads_omp);
305 /* TODO choose nthreads_omp based on hardware topology
306 when we have a hardware topology detection library */
307 /* In general, when running up to 4 threads, OpenMP should be faster.
308 * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
309 * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
310 * even on two CPUs it's usually faster (but with many OpenMP threads
311 * it could be faster not to use HT, currently we always use HT).
312 * On Nehalem/Westmere we want to avoid running 16 threads over
313 * two CPUs with HT, so we need a limit<16; thus we use 12.
314 * A reasonable limit for Intel Sandy and Ivy bridge,
315 * not knowing the topology, is 16 threads.
317 const int nthreads_omp_always_faster = 4;
318 const int nthreads_omp_always_faster_Nehalem = 12;
319 const int nthreads_omp_always_faster_SandyBridge = 16;
/* x86 family-6 model numbers marking the first Nehalem / Sandy Bridge. */
320 const int first_model_Nehalem = 0x1A;
321 const int first_model_SandyBridge = 0x2A;
322 gmx_bool bIntel_Family6;
325 (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
326 gmx_cpuid_family(hwinfo->cpuid_info) == 6);
/* NOTE(review): the model comparisons below appear to use the thread-count
 * limits (12/16) rather than first_model_* constants as thresholds — the
 * intent is presumably model >= first_model_*; verify against the full
 * file before changing. */
328 if (nthreads_tot <= nthreads_omp_always_faster ||
330 ((gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_Nehalem && nthreads_tot <= nthreads_omp_always_faster_Nehalem) ||
331 (gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_SandyBridge && nthreads_tot <= nthreads_omp_always_faster_SandyBridge))))
333 /* Use pure OpenMP parallelization */
338 /* Don't use OpenMP parallelization */
339 nthreads_tmpi = nthreads_tot;
343 return nthreads_tmpi;
347 /* Get the number of threads to use for thread-MPI based on how many
348 * were requested, which algorithms we're using,
349 * and how many particles there are.
350 * At the point we have already called check_and_update_hw_opt.
351 * Thus all options should be internally consistent and consistent
352 * with the hardware, except that ntmpi could be larger than #GPU.
/* Returns the number of thread-MPI ranks to start. Honors an explicit
 * user request, otherwise divides hardware threads via
 * get_tmpi_omp_thread_division() and then caps the count by atoms/thread,
 * hyper-threading layout and prime-factor considerations.
 * Several lines are elided in this excerpt. */
354 static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
355 gmx_hw_opt_t *hw_opt,
356 t_inputrec *inputrec, gmx_mtop_t *mtop,
360 int nthreads_hw,nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu;
361 int min_atoms_per_mpi_thread;
366 if (hw_opt->nthreads_tmpi > 0)
368 /* Trivial, return right away */
369 return hw_opt->nthreads_tmpi;
372 nthreads_hw = hwinfo->nthreads_hw_avail;
374 /* How many total (#tMPI*#OpenMP) threads can we start? */
375 if (hw_opt->nthreads_tot > 0)
377 nthreads_tot_max = hw_opt->nthreads_tot;
381 nthreads_tot_max = nthreads_hw;
/* GPUs are only usable with the Verlet cut-off scheme. */
384 bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU);
387 ngpu = hwinfo->gpu_info.ncuda_dev_use;
395 get_tmpi_omp_thread_division(hwinfo,hw_opt,nthreads_tot_max,ngpu);
397 if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
399 /* Steps are divided over the nodes iso splitting the atoms */
400 min_atoms_per_mpi_thread = 0;
406 min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU;
410 min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD;
414 /* Check if an algorithm does not support parallel simulation. */
415 if (nthreads_tmpi != 1 &&
416 ( inputrec->eI == eiLBFGS ||
417 inputrec->coulombtype == eelEWALD ) )
421 md_print_warn(cr,fplog,"The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
/* Only fatal when the user explicitly asked for >1 thread-MPI rank. */
422 if (hw_opt->nthreads_tmpi > nthreads_tmpi)
424 gmx_fatal(FARGS,"You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
427 else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread)
429 /* the thread number was chosen automatically, but there are too many
430 threads (too few atoms per thread) */
431 nthreads_new = max(1,mtop->natoms/min_atoms_per_mpi_thread);
433 /* Avoid partial use of Hyper-Threading */
434 if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
435 nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw)
437 nthreads_new = nthreads_hw/2;
440 /* Avoid large prime numbers in the thread count */
441 if (nthreads_new >= 6)
443 /* Use only 6,8,10 with additional factors of 2 */
447 while (3*fac*2 <= nthreads_new)
/* Round down to the largest multiple of fac (fac doubling elided). */
452 nthreads_new = (nthreads_new/fac)*fac;
457 if (nthreads_new == 5)
463 nthreads_tmpi = nthreads_new;
465 fprintf(stderr,"\n");
466 fprintf(stderr,"NOTE: Parallelization is limited by the small number of atoms,\n");
467 fprintf(stderr," only starting %d thread-MPI threads.\n",nthreads_tmpi);
468 fprintf(stderr," You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
471 return nthreads_tmpi;
473 #endif /* GMX_THREAD_MPI */
/* Tunables for the GPU nstlist auto-increase heuristic (increase_nstlist). */
476 /* Environment variable for setting nstlist */
477 static const char* NSTLIST_ENVVAR = "GMX_NSTLIST";
478 /* Try to increase nstlist when using a GPU with nstlist less than this */
479 static const int NSTLIST_GPU_ENOUGH = 20;
480 /* Increase nstlist until the non-bonded cost increases more than this factor */
481 static const float NBNXN_GPU_LIST_OK_FAC = 1.25;
482 /* Don't increase nstlist beyond a non-bonded cost increases of this factor */
483 static const float NBNXN_GPU_LIST_MAX_FAC = 1.40;
485 /* Try to increase nstlist when running on a GPU */
/* Walks the candidate nstlist values in nstl[], recomputing the Verlet
 * buffer (rlist) for each, and keeps the largest nstlist whose rlist still
 * fits in the box / domain decomposition and whose estimated non-bonded
 * cost increase stays within NBNXN_GPU_LIST_*_FAC. GMX_NSTLIST overrides
 * the search. Several lines are elided in this excerpt. */
486 static void increase_nstlist(FILE *fp,t_commrec *cr,
487 t_inputrec *ir,const gmx_mtop_t *mtop,matrix box)
490 int nstlist_orig,nstlist_prev;
491 verletbuf_list_setup_t ls;
492 real rlist_inc,rlist_ok,rlist_max,rlist_new,rlist_prev;
495 gmx_bool bBox,bDD,bCont;
496 const char *nstl_fmt="\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
497 const char *vbd_err="Can not increase nstlist for GPU run because verlet-buffer-drift is not set or used";
498 const char *box_err="Can not increase nstlist for GPU run because the box is too small";
499 const char *dd_err ="Can not increase nstlist for GPU run because of domain decomposition limitations";
502 /* Number of + nstlist alternative values to try when switching */
503 const int nstl[]={ 20, 25, 40, 50 };
504 #define NNSTL sizeof(nstl)/sizeof(nstl[0])
506 env = getenv(NSTLIST_ENVVAR);
511 fprintf(fp,nstl_fmt,ir->nstlist);
/* verletbuf_drift == 0: old tpr files lack the drift setting entirely. */
515 if (ir->verletbuf_drift == 0)
517 gmx_fatal(FARGS,"You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
/* verletbuf_drift < 0: drift-based buffering disabled; bail out with a note. */
520 if (ir->verletbuf_drift < 0)
524 fprintf(stderr,"%s\n",vbd_err);
528 fprintf(fp,"%s\n",vbd_err);
534 nstlist_orig = ir->nstlist;
537 sprintf(buf,"Getting nstlist from environment variable GMX_NSTLIST=%s",env);
540 fprintf(stderr,"%s\n",buf);
544 fprintf(fp,"%s\n",buf);
546 sscanf(env,"%d",&ir->nstlist);
/* TRUE: set up list parameters for the GPU (8x8) kernel layout. */
549 verletbuf_get_list_setup(TRUE,&ls);
551 /* Allow rlist to make the list double the size of the cut-off sphere */
552 rlist_inc = nbnxn_get_rlist_effective_inc(NBNXN_GPU_CLUSTER_SIZE,mtop->natoms/det(box));
/* Cube-root scaling: list cost grows roughly with the list volume. */
553 rlist_ok = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_OK_FAC,1.0/3.0) - rlist_inc;
554 rlist_max = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_MAX_FAC,1.0/3.0) - rlist_inc;
557 fprintf(debug,"GPU nstlist tuning: rlist_inc %.3f rlist_max %.3f\n",
558 rlist_inc,rlist_max);
562 nstlist_prev = nstlist_orig;
563 rlist_prev = ir->rlist;
/* Candidate loop (loop header elided): try each nstl[i] in turn. */
568 ir->nstlist = nstl[i];
571 /* Set the pair-list buffer size in ir */
572 calc_verlet_buffer_size(mtop,det(box),ir,ir->verletbuf_drift,&ls,
575 /* Does rlist fit in the box? */
576 bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC,box));
578 if (bBox && DOMAINDECOMP(cr))
580 /* Check if rlist fits in the domain decomposition */
581 if (inputrec2nboundeddim(ir) < DIM)
583 gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
585 copy_mat(box,state_tmp.box);
586 bDD = change_dd_cutoff(cr,&state_tmp,ir,rlist_new);
593 if (bBox && bDD && rlist_new <= rlist_max)
595 /* Increase nstlist */
596 nstlist_prev = ir->nstlist;
597 rlist_prev = rlist_new;
598 bCont = (i+1 < NNSTL && rlist_new < rlist_ok);
602 /* Stick with the previous nstlist */
603 ir->nstlist = nstlist_prev;
604 rlist_new = rlist_prev;
/* Post-loop reporting: failure restores the original nstlist, success
 * reports the change and commits the new rlist. */
616 gmx_warning(!bBox ? box_err : dd_err);
619 fprintf(fp,"\n%s\n",bBox ? box_err : dd_err);
621 ir->nstlist = nstlist_orig;
623 else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
625 sprintf(buf,"Changing nstlist from %d to %d, rlist from %g to %g",
626 nstlist_orig,ir->nstlist,
627 ir->rlist,rlist_new);
630 fprintf(stderr,"%s\n\n",buf);
634 fprintf(fp,"%s\n\n",buf);
636 ir->rlist = rlist_new;
637 ir->rlistlong = rlist_new;
/* Prepares a Verlet-scheme run: decides preliminary GPU use (output via
 * *bUseGPU), recomputes the Verlet buffer size for the chosen kernel
 * cluster layout, and — for GPU/emulation dynamics — tunes nstlist via
 * increase_nstlist(). Several lines are elided in this excerpt. */
641 static void prepare_verlet_scheme(FILE *fplog,
642 gmx_hw_info_t *hwinfo,
644 gmx_hw_opt_t *hw_opt,
645 const char *nbpu_opt,
647 const gmx_mtop_t *mtop,
651 /* Here we only check for GPU usage on the MPI master process,
652 * as here we don't know how many GPUs we will use yet.
653 * We check for a GPU on all processes later.
/* GMX_EMULATE_GPU forces the GPU code path in emulation mode. */
655 *bUseGPU = hwinfo->bCanUseGPU || (getenv("GMX_EMULATE_GPU") != NULL);
657 if (ir->verletbuf_drift > 0)
659 /* Update the Verlet buffer size for the current run setup */
660 verletbuf_list_setup_t ls;
663 /* Here we assume CPU acceleration is on. But as currently
664 * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
665 * and 4x2 gives a larger buffer than 4x4, this is ok.
667 verletbuf_get_list_setup(*bUseGPU,&ls);
669 calc_verlet_buffer_size(mtop,det(box),ir,
670 ir->verletbuf_drift,&ls,
672 if (rlist_new != ir->rlist)
676 fprintf(fplog,"\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
678 ls.cluster_size_i,ls.cluster_size_j);
680 ir->rlist = rlist_new;
681 ir->rlistlong = rlist_new;
685 /* With GPU or emulation we should check nstlist for performance */
686 if ((EI_DYNAMICS(ir->eI) &&
688 ir->nstlist < NSTLIST_GPU_ENOUGH) ||
689 getenv(NSTLIST_ENVVAR) != NULL)
691 /* Choose a better nstlist */
692 increase_nstlist(fplog,cr,ir,mtop,box);
/* In-place conversion of a group cut-off scheme input record to the Verlet
 * scheme: rewrites cutoff/interaction settings, rejects unsupported
 * combinations with gmx_fatal, recomputes the pair-list buffer, and strips
 * charge groups from the topology. Several lines are elided here. */
696 static void convert_to_verlet_scheme(FILE *fplog,
698 gmx_mtop_t *mtop,real box_vol)
700 char *conv_mesg="Converting input file with group cut-off scheme to the Verlet cut-off scheme";
702 md_print_warn(NULL,fplog,"%s\n",conv_mesg);
704 ir->cutoff_scheme = ecutsVERLET;
705 ir->verletbuf_drift = 0.005;
706 if (ir->rcoulomb != ir->rvdw)
Wait
773 /* Check the process affinity mask and if it is found to be non-zero,
774 * will honor it and disable mdrun internal affinity setting.
775 * This function should be called first before the OpenMP library gets
776 * initialized with the last argument FALSE (which will detect affinity
777 * set by external tools like taskset), and later, after the OpenMP
778 * initialization, with the last argument TRUE to detect affinity changes
779 * made by the OpenMP library.
781 * Note that this will only work on Linux as we use a GNU feature. */
782 static void check_cpu_affinity_set(FILE *fplog, const t_commrec *cr,
783 gmx_hw_opt_t *hw_opt, int ncpus,
784 gmx_bool bAfterOpenmpInit)
786 #ifdef HAVE_SCHED_GETAFFINITY
787 cpu_set_t mask_current;
788 int i, ret, cpu_count, cpu_set;
/* Nothing to do if internal pinning was already disabled. */
792 if (!hw_opt->bThreadPinning)
794 /* internal affinity setting is off, don't bother checking process affinity */
798 CPU_ZERO(&mask_current);
/* pid 0 queries the calling process's own affinity mask. */
799 if ((ret = sched_getaffinity(0, sizeof(cpu_set_t), &mask_current)) != 0)
801 /* failed to query affinity mask, will just return */
804 fprintf(debug, "Failed to query affinity mask (error %d)", ret);
809 /* Before proceeding with the actual check, make sure that the number of
810 * detected CPUs is >= the CPUs in the current set.
811 * We need to check for CPU_COUNT as it was added only in glibc 2.6. */
813 if (ncpus < CPU_COUNT(&mask_current))
817 fprintf(debug, "%d CPUs detected, but %d was returned by CPU_COUNT",
818 ncpus, CPU_COUNT(&mask_current));
822 #endif /* CPU_COUNT */
/* The mask is "default" only if every detected CPU is in the set. */
825 for (i = 0; (i < ncpus && i < CPU_SETSIZE); i++)
827 bAllSet = bAllSet && (CPU_ISSET(i, &mask_current) != 0);
/* A restricted mask means an external tool (or OpenMP runtime) set
 * affinity; honor it and turn off mdrun-internal pinning. */
832 if (!bAfterOpenmpInit)
834 md_print_warn(cr, fplog,
835 "Non-default process affinity set, disabling internal affinity");
839 md_print_warn(cr, fplog,
840 "Non-default process affinity set probably by the OpenMP library, "
841 "disabling internal affinity");
843 hw_opt->bThreadPinning = FALSE;
847 fprintf(debug, "Non-default affinity mask found\n");
854 fprintf(debug, "Default affinity mask found\n");
857 #endif /* HAVE_SCHED_GETAFFINITY */
860 /* Set CPU affinity. Can be important for performance.
861 On some systems (e.g. Cray) CPU Affinity is set by default.
862 But default assigning doesn't work (well) with only some ranks
863 having threads. This causes very low performance.
864 External tools have cumbersome syntax for setting affinity
865 in the case that only some ranks have threads.
866 Thus it is important that GROMACS sets the affinity internally
867 if only PME is using threads.
/* Pins each OpenMP thread of this rank to a core via sched_setaffinity on
 * its Linux TID. Core indices account for other ranks on the same physical
 * node (via an MPI scan), a user pinning offset, and an interleaved layout
 * for Intel Hyper-Threading. Several lines are elided in this excerpt. */
869 static void set_cpu_affinity(FILE *fplog,
871 gmx_hw_opt_t *hw_opt,
873 const gmx_hw_info_t *hwinfo,
874 const t_inputrec *inputrec)
876 #if defined GMX_THREAD_MPI
877 /* With the number of TMPI threads equal to the number of cores
878 * we already pinned in thread-MPI, so don't pin again here.
880 if (hw_opt->nthreads_tmpi == tMPI_Thread_get_hw_number())
886 #ifdef GMX_OPENMP /* TODO: actually we could do this even without OpenMP?! */
887 #ifdef HAVE_SCHED_SETAFFINITY
888 if (hw_opt->bThreadPinning)
890 int thread, nthread_local, nthread_node, nthread_hw_max, nphyscore;
894 /* threads on this MPI process or TMPI thread */
895 if (cr->duty & DUTY_PP)
897 nthread_local = gmx_omp_nthreads_get(emntNonbonded);
901 nthread_local = gmx_omp_nthreads_get(emntPME);
904 /* map the current process to cores */
906 nthread_node = nthread_local;
908 if (PAR(cr) || MULTISIM(cr))
910 /* We need to determine a scan of the thread counts in this
/* Split MPI_COMM_WORLD into per-physical-node communicators keyed on
 * the hostname hash, then scan thread counts to find this rank's
 * starting core index on the node. */
915 MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),cr->nodeid_intra,
917 MPI_Scan(&nthread_local,&thread,1,MPI_INT,MPI_SUM,comm_intra);
918 /* MPI_Scan is inclusive, but here we need exclusive */
919 thread -= nthread_local;
920 /* Get the total number of threads on this physical node */
921 MPI_Allreduce(&nthread_local,&nthread_node,1,MPI_INT,MPI_SUM,comm_intra);
922 MPI_Comm_free(&comm_intra);
927 if (hw_opt->core_pinning_offset > 0)
929 offset = hw_opt->core_pinning_offset;
932 fprintf(stderr, "Applying core pinning offset %d\n", offset);
936 fprintf(fplog, "Applying core pinning offset %d\n", offset);
940 /* With Intel Hyper-Threading enabled, we want to pin consecutive
941 * threads to physical cores when using more threads than physical
942 * cores or when the user requests so.
944 nthread_hw_max = hwinfo->nthreads_hw_avail;
946 if (hw_opt->bPinHyperthreading ||
947 (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
948 nthread_node > nthread_hw_max/2 && getenv("GMX_DISABLE_PINHT") == NULL))
950 if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) != GMX_CPUID_X86_SMT_ENABLED)
952 /* We print to stderr on all processes, as we might have
953 * different settings on different physical nodes.
955 if (gmx_cpuid_vendor(hwinfo->cpuid_info) != GMX_CPUID_VENDOR_INTEL)
957 md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
958 "but non-Intel CPU detected (vendor: %s)\n",
959 gmx_cpuid_vendor_string[gmx_cpuid_vendor(hwinfo->cpuid_info)]);
963 md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
964 "but the CPU detected does not have Intel Hyper-Threading support "
965 "(or it is turned off)\n");
968 nphyscore = nthread_hw_max/2;
972 fprintf(stderr, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
977 fprintf(fplog, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
982 /* set the per-thread affinity */
983 #pragma omp parallel firstprivate(thread) num_threads(nthread_local)
/* Each OpenMP thread computes its own global core index. */
989 thread += gmx_omp_get_thread_num();
992 core = offset + thread;
996 /* Lock pairs of threads to the same hyperthreaded core */
997 core = offset + thread/2 + (thread % 2)*nphyscore;
999 CPU_SET(core, &mask);
/* SYS_gettid gives the Linux thread id needed to pin one thread only. */
1000 sched_setaffinity((pid_t) syscall (SYS_gettid), sizeof(cpu_set_t), &mask);
1003 #endif /* HAVE_SCHED_SETAFFINITY */
1004 #endif /* GMX_OPENMP */
/* Validates and normalizes the user-supplied thread options in hw_opt:
 * rejects thread settings unsupported by the build, cross-checks
 * nthreads_tot vs nthreads_tmpi*nthreads_omp, derives unset values, and
 * restricts OpenMP use under the group cut-off scheme.
 * Several lines are elided in this excerpt. */
1008 static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt,
1011 gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp);
1013 #ifndef GMX_THREAD_MPI
1014 if (hw_opt->nthreads_tot > 0)
1016 gmx_fatal(FARGS,"Setting the total number of threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
1018 if (hw_opt->nthreads_tmpi > 0)
1020 gmx_fatal(FARGS,"Setting the number of thread-MPI threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
1024 if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
1026 /* We have the same number of OpenMP threads for PP and PME processes,
1027 * thus we can perform several consistency checks.
1029 if (hw_opt->nthreads_tmpi > 0 &&
1030 hw_opt->nthreads_omp > 0 &&
1031 hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
1033 gmx_fatal(FARGS,"The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
1034 hw_opt->nthreads_tot,hw_opt->nthreads_tmpi,hw_opt->nthreads_omp);
1037 if (hw_opt->nthreads_tmpi > 0 &&
1038 hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
1040 gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
1041 hw_opt->nthreads_tot,hw_opt->nthreads_tmpi);
1044 if (hw_opt->nthreads_omp > 0 &&
1045 hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
1047 gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
1048 hw_opt->nthreads_tot,hw_opt->nthreads_omp);
/* Derive nthreads_omp from tot/tmpi when only those two were given. */
1051 if (hw_opt->nthreads_tmpi > 0 &&
1052 hw_opt->nthreads_omp <= 0)
1054 hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
/* Non-OpenMP build (the surrounding #ifdef lines are elided here). */
1059 if (hw_opt->nthreads_omp > 1)
1061 gmx_fatal(FARGS,"OpenMP threads are requested, but Gromacs was compiled without OpenMP support");
1065 if (cutoff_scheme == ecutsGROUP)
1067 /* We only have OpenMP support for PME only nodes */
1068 if (hw_opt->nthreads_omp > 1)
1070 gmx_fatal(FARGS,"OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
1071 ecutscheme_names[cutoff_scheme],
1072 ecutscheme_names[ecutsVERLET]);
1074 hw_opt->nthreads_omp = 1;
1077 if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
1079 gmx_fatal(FARGS,"You need to specify -ntomp in addition to -ntomp_pme");
/* A single total thread forces exactly one tMPI rank and one OpenMP thread. */
1082 if (hw_opt->nthreads_tot == 1)
1084 hw_opt->nthreads_tmpi = 1;
1086 if (hw_opt->nthreads_omp > 1)
1088 gmx_fatal(FARGS,"You requested %d OpenMP threads with %d total threads",
1089 hw_opt->nthreads_tmpi,hw_opt->nthreads_tot);
1091 hw_opt->nthreads_omp = 1;
/* PME ranks default to the PP OpenMP thread count. */
1094 if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
1096 hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
1101 fprintf(debug,"hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
1102 hw_opt->nthreads_tot,
1103 hw_opt->nthreads_tmpi,
1104 hw_opt->nthreads_omp,
1105 hw_opt->nthreads_omp_pme,
1106 hw_opt->gpu_id!=NULL ? hw_opt->gpu_id : "");
1112 /* Override the value in inputrec with value passed on the command line (if any) */
/* -2 is the "not set" sentinel for -nsteps; any larger value (including -1,
 * meaning run indefinitely) replaces ir->nsteps and is reported. */
1113 static void override_nsteps_cmdline(FILE *fplog,
1116 const t_commrec *cr)
1121 /* override with anything else than the default -2 */
1122 if (nsteps_cmdline > -2)
1126 ir->nsteps = nsteps_cmdline;
1127 if (EI_DYNAMICS(ir->eI))
1129 sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps, %.3f ps",
1130 nsteps_cmdline, nsteps_cmdline*ir->delta_t);
1134 sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps",
1138 md_print_warn(cr, fplog, "%s\n", stmp);
1142 /* Data structure set by SIMMASTER which needs to be passed to all nodes
1143 * before the other nodes have read the tpx file and called gmx_detect_hardware.
/* NOTE(review): the typedef/struct header lines are elided in this excerpt;
 * presumably this is the master_inf_t used by mdrunner() — confirm against
 * the full file. */
1146 int cutoff_scheme; /* The cutoff scheme from inputrec_t */
1147 gmx_bool bUseGPU; /* Use GPU or GPU emulation */
1150 int mdrunner(gmx_hw_opt_t *hw_opt,
1151 FILE *fplog,t_commrec *cr,int nfile,
1152 const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
1153 gmx_bool bCompact, int nstglobalcomm,
1154 ivec ddxyz,int dd_node_order,real rdd,real rconstr,
1155 const char *dddlb_opt,real dlb_scale,
1156 const char *ddcsx,const char *ddcsy,const char *ddcsz,
1157 const char *nbpu_opt,
1158 int nsteps_cmdline, int nstepout,int resetstep,
1159 int nmultisim,int repl_ex_nst,int repl_ex_nex,
1160 int repl_ex_seed, real pforce,real cpt_period,real max_hours,
1161 const char *deviceOptions, unsigned long Flags)
1163 gmx_bool bForceUseGPU,bTryUseGPU;
1164 double nodetime=0,realtime;
1165 t_inputrec *inputrec;
1166 t_state *state=NULL;
1168 gmx_ddbox_t ddbox={0};
1169 int npme_major,npme_minor;
1172 gmx_mtop_t *mtop=NULL;
1173 t_mdatoms *mdatoms=NULL;
1174 t_forcerec *fr=NULL;
1177 gmx_pme_t *pmedata=NULL;
1178 gmx_vsite_t *vsite=NULL;
1179 gmx_constr_t constr;
1180 int i,m,nChargePerturbed=-1,status,nalloc;
1182 gmx_wallcycle_t wcycle;
1183 gmx_bool bReadRNG,bReadEkin;
1185 gmx_runtime_t runtime;
1187 gmx_large_int_t reset_counters;
1188 gmx_edsam_t ed=NULL;
1189 t_commrec *cr_old=cr;
1192 gmx_membed_t membed=NULL;
1193 gmx_hw_info_t *hwinfo=NULL;
1194 master_inf_t minf={-1,FALSE};
1196 /* CAUTION: threads may be started later on in this function, so
1197 cr doesn't reflect the final parallel state right now */
1201 if (Flags & MD_APPENDFILES)
1206 bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
1207 bTryUseGPU = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
1212 /* Read (nearly) all data required for the simulation */
1213 read_tpx_state(ftp2fn(efTPX,nfile,fnm),inputrec,state,NULL,mtop);
1215 if (inputrec->cutoff_scheme != ecutsVERLET &&
1216 ((Flags & MD_TESTVERLET) || getenv("GMX_VERLET_SCHEME") != NULL))
1218 convert_to_verlet_scheme(fplog,inputrec,mtop,det(state->box));
1221 /* Detect hardware, gather information. With tMPI only thread 0 does it
1222 * and after threads are started broadcasts hwinfo around. */
1224 gmx_detect_hardware(fplog, hwinfo, cr,
1225 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
1227 minf.cutoff_scheme = inputrec->cutoff_scheme;
1228 minf.bUseGPU = FALSE;
1230 if (inputrec->cutoff_scheme == ecutsVERLET)
1232 prepare_verlet_scheme(fplog,hwinfo,cr,hw_opt,nbpu_opt,
1233 inputrec,mtop,state->box,
1236 else if (hwinfo->bCanUseGPU)
1238 md_print_warn(cr,fplog,
1239 "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
1240 " To use a GPU, set the mdp option: cutoff-scheme = Verlet\n"
1241 " (for quick performance testing you can use the -testverlet option)\n");
1245 gmx_fatal(FARGS,"GPU requested, but can't be used without cutoff-scheme=Verlet");
1249 #ifndef GMX_THREAD_MPI
1252 gmx_bcast_sim(sizeof(minf),&minf,cr);
1255 if (minf.bUseGPU && cr->npmenodes == -1)
1257 /* Don't automatically use PME-only nodes with GPUs */
1261 /* Check for externally set OpenMP affinity and turn off internal
1262 * pinning if any is found. We need to do this check early to tell
1263 * thread-MPI whether it should do pinning when spawning threads.
1265 gmx_omp_check_thread_affinity(fplog, cr, hw_opt);
1267 #ifdef GMX_THREAD_MPI
1268 /* With thread-MPI inputrec is only set here on the master thread */
1272 check_and_update_hw_opt(hw_opt,minf.cutoff_scheme);
1274 #ifdef GMX_THREAD_MPI
1275 /* Early check for externally set process affinity. Can't do over all
1276 * MPI processes because hwinfo is not available everywhere, but with
1277 * thread-MPI it's needed as pinning might get turned off which needs
1278 * to be known before starting thread-MPI. */
1279 check_cpu_affinity_set(fplog,
1281 hw_opt, hwinfo->nthreads_hw_avail, FALSE);
1284 #ifdef GMX_THREAD_MPI
1285 if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
1287 gmx_fatal(FARGS,"You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME nodes");
1291 if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
1294 gmx_fatal(FARGS,"You need to explicitly specify the number of PME nodes (-npme) when using different number of OpenMP threads for PP and PME nodes");
1298 #ifdef GMX_THREAD_MPI
1301 /* NOW the threads will be started: */
1302 hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
1306 if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
1308 hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
1311 if (hw_opt->nthreads_tmpi > 1)
1313 /* now start the threads. */
1314 cr=mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm,
1315 oenv, bVerbose, bCompact, nstglobalcomm,
1316 ddxyz, dd_node_order, rdd, rconstr,
1317 dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
1319 nsteps_cmdline, nstepout, resetstep, nmultisim,
1320 repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
1321 cpt_period, max_hours, deviceOptions,
1323 /* the main thread continues here with a new cr. We don't deallocate
1324 the old cr because other threads may still be reading it. */
1327 gmx_comm("Failed to spawn threads");
1332 /* END OF CAUTION: cr is now reliable */
1334 /* g_membed initialisation *
1335 * Because we change the mtop, init_membed is called before the init_parallel *
1336 * (in case we ever want to make it run in parallel) */
1337 if (opt2bSet("-membed",nfile,fnm))
1341 fprintf(stderr,"Initializing membed");
1343 membed = init_membed(fplog,nfile,fnm,mtop,inputrec,state,cr,&cpt_period);
1348 /* now broadcast everything to the non-master nodes/threads: */
1349 init_parallel(fplog, cr, inputrec, mtop);
1351 /* This check needs to happen after get_nthreads_mpi() */
1352 if (inputrec->cutoff_scheme == ecutsVERLET && (Flags & MD_PARTDEC))
1354 gmx_fatal_collective(FARGS,cr,NULL,
1355 "The Verlet cut-off scheme is not supported with particle decomposition.\n"
1356 "You can achieve the same effect as particle decomposition by running in parallel using only OpenMP threads.");
1361 pr_inputrec(fplog,0,"Input Parameters",inputrec,FALSE);
1364 #if defined GMX_THREAD_MPI
1365 /* With tMPI we detected on thread 0 and we'll just pass the hwinfo pointer
1366 * to the other threads -- slightly uncool, but works fine, just need to
1367 * make sure that the data doesn't get freed twice. */
1374 gmx_bcast(sizeof(&hwinfo), &hwinfo, cr);
1377 if (PAR(cr) && !SIMMASTER(cr))
1379 /* now we have inputrec on all nodes, can run the detection */
1380 /* TODO: perhaps it's better to propagate within a node instead? */
1382 gmx_detect_hardware(fplog, hwinfo, cr,
1383 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
1386 /* Now do the affinity check with MPI/no-MPI (done earlier with thread-MPI). */
1387 check_cpu_affinity_set(fplog, cr,
1388 hw_opt, hwinfo->nthreads_hw_avail, FALSE);
1391 /* now make sure the state is initialized and propagated */
1392 set_state_entries(state,inputrec,cr->nnodes);
1394 /* remove when vv and rerun works correctly! */
1395 if (PAR(cr) && EI_VV(inputrec->eI) && ((Flags & MD_RERUN) || (Flags & MD_RERUN_VSITE)))
1398 "Currently can't do velocity verlet with rerun in parallel.");
1401 /* A parallel command line option consistency check that we can
1402 only do after any threads have started. */
1404 (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
1407 "The -dd or -npme option request a parallel simulation, "
1409 "but %s was compiled without threads or MPI enabled"
1411 #ifdef GMX_THREAD_MPI
1412 "but the number of threads (option -nt) is 1"
1414 "but %s was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec"
1421 if ((Flags & MD_RERUN) &&
1422 (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
1424 gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
1427 if (can_use_allvsall(inputrec,mtop,TRUE,cr,fplog) && PAR(cr))
1429 /* All-vs-all loops do not work with domain decomposition */
1430 Flags |= MD_PARTDEC;
1433 if (!EEL_PME(inputrec->coulombtype) || (Flags & MD_PARTDEC))
1435 if (cr->npmenodes > 0)
1437 if (!EEL_PME(inputrec->coulombtype))
1439 gmx_fatal_collective(FARGS,cr,NULL,
1440 "PME nodes are requested, but the system does not use PME electrostatics");
1442 if (Flags & MD_PARTDEC)
1444 gmx_fatal_collective(FARGS,cr,NULL,
1445 "PME nodes are requested, but particle decomposition does not support separate PME nodes");
1453 fcRegisterSteps(inputrec->nsteps,inputrec->init_step);
1456 /* NMR restraints must be initialized before load_checkpoint,
1457 * since with time averaging the history is added to t_state.
1458 * For proper consistency check we therefore need to extend
1460 * So the PME-only nodes (if present) will also initialize
1461 * the distance restraints.
1465 /* This needs to be called before read_checkpoint to extend the state */
1466 init_disres(fplog,mtop,inputrec,cr,Flags & MD_PARTDEC,fcd,state);
1468 if (gmx_mtop_ftype_count(mtop,F_ORIRES) > 0)
1470 if (PAR(cr) && !(Flags & MD_PARTDEC))
1472 gmx_fatal(FARGS,"Orientation restraints do not work (yet) with domain decomposition, use particle decomposition (mdrun option -pd)");
1474 /* Orientation restraints */
1477 init_orires(fplog,mtop,state->x,inputrec,cr->ms,&(fcd->orires),
1482 if (DEFORM(*inputrec))
1484 /* Store the deform reference box before reading the checkpoint */
1487 copy_mat(state->box,box);
1491 gmx_bcast(sizeof(box),box,cr);
1493 /* Because we do not have the update struct available yet
1494 * in which the reference values should be stored,
1495 * we store them temporarily in static variables.
1496 * This should be thread safe, since they are only written once
1497 * and with identical values.
1499 #ifdef GMX_THREAD_MPI
1500 tMPI_Thread_mutex_lock(&deform_init_box_mutex);
1502 deform_init_init_step_tpx = inputrec->init_step;
1503 copy_mat(box,deform_init_box_tpx);
1504 #ifdef GMX_THREAD_MPI
1505 tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
1509 if (opt2bSet("-cpi",nfile,fnm))
1511 /* Check if checkpoint file exists before doing continuation.
1512 * This way we can use identical input options for the first and subsequent runs...
1514 if( gmx_fexist_master(opt2fn_master("-cpi",nfile,fnm,cr),cr) )
1516 load_checkpoint(opt2fn_master("-cpi",nfile,fnm,cr),&fplog,
1517 cr,Flags & MD_PARTDEC,ddxyz,
1518 inputrec,state,&bReadRNG,&bReadEkin,
1519 (Flags & MD_APPENDFILES),
1520 (Flags & MD_APPENDFILESSET));
1524 Flags |= MD_READ_RNG;
1528 Flags |= MD_READ_EKIN;
1533 if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES))
1534 #ifdef GMX_THREAD_MPI
1535 /* With thread MPI only the master node/thread exists in mdrun.c,
1536 * therefore non-master nodes need to open the "seppot" log file here.
1538 || (!MASTER(cr) && (Flags & MD_SEPPOT))
1542 gmx_log_open(ftp2fn(efLOG,nfile,fnm),cr,!(Flags & MD_SEPPOT),
1546 /* override nsteps with value from cmdline */
1547 override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);
1551 copy_mat(state->box,box);
1556 gmx_bcast(sizeof(box),box,cr);
1559 /* Essential dynamics */
1560 if (opt2bSet("-ei",nfile,fnm))
1562 /* Open input and output files, allocate space for ED data structure */
1563 ed = ed_open(nfile,fnm,Flags,cr);
1566 if (PAR(cr) && !((Flags & MD_PARTDEC) ||
1567 EI_TPI(inputrec->eI) ||
1568 inputrec->eI == eiNM))
1570 cr->dd = init_domain_decomposition(fplog,cr,Flags,ddxyz,rdd,rconstr,
1571 dddlb_opt,dlb_scale,
1575 &ddbox,&npme_major,&npme_minor);
1577 make_dd_communicators(fplog,cr,dd_node_order);
1579 /* Set overallocation to avoid frequent reallocation of arrays */
1580 set_over_alloc_dd(TRUE);
1584 /* PME, if used, is done on all nodes with 1D decomposition */
1586 cr->duty = (DUTY_PP | DUTY_PME);
1589 if (!EI_TPI(inputrec->eI))
1591 npme_major = cr->nnodes;
1594 if (inputrec->ePBC == epbcSCREW)
1597 "pbc=%s is only implemented with domain decomposition",
1598 epbc_names[inputrec->ePBC]);
1604 /* After possible communicator splitting in make_dd_communicators.
1605 * we can set up the intra/inter node communication.
1607 gmx_setup_nodecomm(fplog,cr);
1610 /* Initialize per-node process ID and counters. */
1611 gmx_init_intra_counters(cr);
1614 md_print_info(cr,fplog,"Using %d MPI %s\n",
1616 #ifdef GMX_THREAD_MPI
1617 cr->nnodes==1 ? "thread" : "threads"
1619 cr->nnodes==1 ? "process" : "processes"
1624 gmx_omp_nthreads_init(fplog, cr,
1625 hwinfo->nthreads_hw_avail,
1626 hw_opt->nthreads_omp,
1627 hw_opt->nthreads_omp_pme,
1628 (cr->duty & DUTY_PP) == 0,
1629 inputrec->cutoff_scheme == ecutsVERLET);
1631 gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi, minf.bUseGPU);
1633 /* getting number of PP/PME threads
1634 PME: env variable should be read only on one node to make sure it is
1635 identical everywhere;
1637 /* TODO nthreads_pp is only used for pinning threads.
1638 * This is a temporary solution until we have a hw topology library.
1640 nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
1641 nthreads_pme = gmx_omp_nthreads_get(emntPME);
1643 wcycle = wallcycle_init(fplog,resetstep,cr,nthreads_pp,nthreads_pme);
1647 /* Master synchronizes its value of reset_counters with all nodes
1648 * including PME only nodes */
1649 reset_counters = wcycle_get_reset_counters(wcycle);
1650 gmx_bcast_sim(sizeof(reset_counters),&reset_counters,cr);
1651 wcycle_set_reset_counters(wcycle, reset_counters);
1655 if (cr->duty & DUTY_PP)
1657 /* For domain decomposition we allocate dynamically
1658 * in dd_partition_system.
1660 if (DOMAINDECOMP(cr))
1662 bcast_state_setup(cr,state);
1668 bcast_state(cr,state,TRUE);
1672 /* Initiate forcerecord */
1674 fr->hwinfo = hwinfo;
1675 init_forcerec(fplog,oenv,fr,fcd,inputrec,mtop,cr,box,FALSE,
1676 opt2fn("-table",nfile,fnm),
1677 opt2fn("-tabletf",nfile,fnm),
1678 opt2fn("-tablep",nfile,fnm),
1679 opt2fn("-tableb",nfile,fnm),
1683 /* version for PCA_NOT_READ_NODE (see md.c) */
1684 /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
1685 "nofile","nofile","nofile","nofile",FALSE,pforce);
1687 fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);
1689 /* Initialize QM-MM */
1692 init_QMMMrec(cr,box,mtop,inputrec,fr);
1695 /* Initialize the mdatoms structure.
1696 * mdatoms is not filled with atom data,
1697 * as this can not be done now with domain decomposition.
1699 mdatoms = init_mdatoms(fplog,mtop,inputrec->efep!=efepNO);
1701 /* Initialize the virtual site communication */
1702 vsite = init_vsite(mtop,cr,FALSE);
1704 calc_shifts(box,fr->shift_vec);
1706 /* With periodic molecules the charge groups should be whole at start up
1707 * and the virtual sites should not be far from their proper positions.
1709 if (!inputrec->bContinuation && MASTER(cr) &&
1710 !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
1712 /* Make molecules whole at start of run */
1713 if (fr->ePBC != epbcNONE)
1715 do_pbc_first_mtop(fplog,inputrec->ePBC,box,mtop,state->x);
1719 /* Correct initial vsite positions are required
1720 * for the initial distribution in the domain decomposition
1721 * and for the initial shell prediction.
1723 construct_vsites_mtop(fplog,vsite,mtop,state->x);
1727 if (EEL_PME(fr->eeltype))
1729 ewaldcoeff = fr->ewaldcoeff;
1730 pmedata = &fr->pmedata;
1739 /* This is a PME only node */
1741 /* We don't need the state */
1744 ewaldcoeff = calc_ewaldcoeff(inputrec->rcoulomb, inputrec->ewald_rtol);
1748 /* Before setting affinity, check whether the affinity has changed
1749 * - which indicates that probably the OpenMP library has changed it since
1750 * we first checked). */
1751 check_cpu_affinity_set(fplog, cr, hw_opt, hwinfo->nthreads_hw_avail, TRUE);
1753 /* Set the CPU affinity */
1754 set_cpu_affinity(fplog,cr,hw_opt,nthreads_pme,hwinfo,inputrec);
1756 /* Initiate PME if necessary,
1757 * either on all nodes or on dedicated PME nodes only. */
1758 if (EEL_PME(inputrec->coulombtype))
1762 nChargePerturbed = mdatoms->nChargePerturbed;
1764 if (cr->npmenodes > 0)
1766 /* The PME only nodes need to know nChargePerturbed */
1767 gmx_bcast_sim(sizeof(nChargePerturbed),&nChargePerturbed,cr);
1770 if (cr->duty & DUTY_PME)
1772 status = gmx_pme_init(pmedata,cr,npme_major,npme_minor,inputrec,
1773 mtop ? mtop->natoms : 0,nChargePerturbed,
1774 (Flags & MD_REPRODUCIBLE),nthreads_pme);
1777 gmx_fatal(FARGS,"Error %d initializing PME",status);
1783 if (integrator[inputrec->eI].func == do_md
1786 integrator[inputrec->eI].func == do_md_openmm
1790 /* Turn on signal handling on all nodes */
1792 * (A user signal from the PME nodes (if any)
1793 * is communicated to the PP nodes.
1795 signal_handler_install();
1798 if (cr->duty & DUTY_PP)
1800 if (inputrec->ePull != epullNO)
1802 /* Initialize pull code */
1803 init_pull(fplog,inputrec,nfile,fnm,mtop,cr,oenv, inputrec->fepvals->init_lambda,
1804 EI_DYNAMICS(inputrec->eI) && MASTER(cr),Flags);
1809 /* Initialize enforced rotation code */
1810 init_rot(fplog,inputrec,nfile,fnm,cr,state->x,box,mtop,oenv,
1814 constr = init_constraints(fplog,mtop,inputrec,ed,state,cr);
1816 if (DOMAINDECOMP(cr))
1818 dd_init_bondeds(fplog,cr->dd,mtop,vsite,constr,inputrec,
1819 Flags & MD_DDBONDCHECK,fr->cginfo_mb);
1821 set_dd_parameters(fplog,cr->dd,dlb_scale,inputrec,fr,&ddbox);
1823 setup_dd_grid(fplog,cr->dd);
1826 /* Now do whatever the user wants us to do (how flexible...) */
1827 integrator[inputrec->eI].func(fplog,cr,nfile,fnm,
1828 oenv,bVerbose,bCompact,
1831 nstepout,inputrec,mtop,
1833 mdatoms,nrnb,wcycle,ed,fr,
1834 repl_ex_nst,repl_ex_nex,repl_ex_seed,
1836 cpt_period,max_hours,
1841 if (inputrec->ePull != epullNO)
1843 finish_pull(fplog,inputrec->pull);
1848 finish_rot(fplog,inputrec->rot);
1855 gmx_pmeonly(*pmedata,cr,nrnb,wcycle,ewaldcoeff,FALSE,inputrec);
1858 if (EI_DYNAMICS(inputrec->eI) || EI_TPI(inputrec->eI))
1860 /* Some timing stats */
1863 if (runtime.proc == 0)
1865 runtime.proc = runtime.real;
1874 wallcycle_stop(wcycle,ewcRUN);
1876 /* Finish up, write some stuff
1877 * if rerunMD, don't write last frame again
1879 finish_run(fplog,cr,ftp2fn(efSTO,nfile,fnm),
1880 inputrec,nrnb,wcycle,&runtime,
1881 fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ?
1882 nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL,
1884 EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
1886 if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU)
1888 char gpu_err_str[STRLEN];
1890 /* free GPU memory and uninitialize GPU (by destroying the context) */
1891 nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
1893 if (!free_gpu(gpu_err_str))
1895 gmx_warning("On node %d failed to free GPU #%d: %s",
1896 cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
1900 if (opt2bSet("-membed",nfile,fnm))
1905 #ifdef GMX_THREAD_MPI
1906 if (PAR(cr) && SIMMASTER(cr))
1909 gmx_hardware_info_free(hwinfo);
1912 /* Does what it says */
1913 print_date_and_time(fplog,cr->nodeid,"Finished mdrun",&runtime);
1915 /* Close logfile already here if we were appending to it */
1916 if (MASTER(cr) && (Flags & MD_APPENDFILES))
1918 gmx_log_close(fplog);
1921 rc=(int)gmx_get_stop_condition();
1923 #ifdef GMX_THREAD_MPI
1924 /* we need to join all threads. The sub-threads join when they
1925 exit this function, but the master thread needs to be told to
1927 if (PAR(cr) && MASTER(cr))