2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5 * Copyright (c) 2001-2004, The GROMACS development team,
6 * check out http://www.gromacs.org for more information.
7 * Copyright (c) 2012, by the GROMACS development team, led by
8 * David van der Spoel, Berk Hess, Erik Lindahl, and including many
9 * others, as listed in the AUTHORS file in the top-level source
10 * directory and at http://www.gromacs.org.
12 * GROMACS is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public License
14 * as published by the Free Software Foundation; either version 2.1
15 * of the License, or (at your option) any later version.
17 * GROMACS is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 * Lesser General Public License for more details.
22 * You should have received a copy of the GNU Lesser General Public
23 * License along with GROMACS; if not, see
24 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 * If you want to redistribute modifications to GROMACS, please
28 * consider that scientific software is very special. Version
29 * control is crucial - bugs must be traceable. We will be happy to
30 * consider code for inclusion in the official distribution, but
31 * derived work must not be called official GROMACS. Details are found
32 * in the README & COPYING files - if they are missing, get the
33 * official version at http://www.gromacs.org.
35 * To help us fund GROMACS development, we humbly ask that you cite
36 * the research papers on the package. Check out http://www.gromacs.org.
41 #if defined(HAVE_SCHED_H) && defined(HAVE_SCHED_GETAFFINITY)
44 #include <sys/syscall.h>
56 #include "md_logging.h"
57 #include "md_support.h"
67 #include "mpelogging.h"
73 #include "checkpoint.h"
74 #include "mtop_util.h"
75 #include "sighandler.h"
78 #include "gmx_detect_hardware.h"
79 #include "gmx_omp_nthreads.h"
80 #include "pull_rotation.h"
81 #include "calc_verletbuf.h"
82 #include "../mdlib/nbnxn_search.h"
83 #include "../mdlib/nbnxn_consts.h"
84 #include "gmx_fatal_collective.h"
86 #include "md_openmm.h"
89 #include "thread_mpi/threads.h"
103 #include "md_openmm.h"
106 #include "gpu_utils.h"
107 #include "nbnxn_cuda_data_mgmt.h"
110 gmx_integrator_t *func;
113 /* The array should match the eI array in include/types/enums.h */
114 #ifdef GMX_OPENMM /* FIXME do_md_openmm needs fixing */
115 const gmx_intp_t integrator[eiNR] = { {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm},{do_md_openmm}};
117 const gmx_intp_t integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md},{do_md}};
120 gmx_large_int_t deform_init_init_step_tpx;
121 matrix deform_init_box_tpx;
122 #ifdef GMX_THREAD_MPI
123 tMPI_Thread_mutex_t deform_init_box_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
127 #ifdef GMX_THREAD_MPI
128 struct mdrunner_arglist
130 gmx_hw_opt_t *hw_opt;
143 const char *dddlb_opt;
148 const char *nbpu_opt;
159 const char *deviceOptions;
161 int ret; /* return value */
165 /* The function used for spawning threads. Extracts the mdrunner()
166 arguments from its one argument and calls mdrunner(), after making
168 static void mdrunner_start_fn(void *arg)
170 struct mdrunner_arglist *mda=(struct mdrunner_arglist*)arg;
171 struct mdrunner_arglist mc=*mda; /* copy the arg list to make sure
172 that it's thread-local. This doesn't
173 copy pointed-to items, of course,
174 but those are all const. */
175 t_commrec *cr; /* we need a local version of this */
179 fnm = dup_tfn(mc.nfile, mc.fnm);
181 cr = init_par_threads(mc.cr);
188 mda->ret=mdrunner(mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv,
189 mc.bVerbose, mc.bCompact, mc.nstglobalcomm,
190 mc.ddxyz, mc.dd_node_order, mc.rdd,
191 mc.rconstr, mc.dddlb_opt, mc.dlb_scale,
192 mc.ddcsx, mc.ddcsy, mc.ddcsz,
194 mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
195 mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce,
196 mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.Flags);
199 /* called by mdrunner() to start a specific number of threads (including
200 the main thread) for thread-parallel runs. This in turn calls mdrunner()
202 All options besides nthreads are the same as for mdrunner(). */
203 static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
204 FILE *fplog,t_commrec *cr,int nfile,
205 const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
206 gmx_bool bCompact, int nstglobalcomm,
207 ivec ddxyz,int dd_node_order,real rdd,real rconstr,
208 const char *dddlb_opt,real dlb_scale,
209 const char *ddcsx,const char *ddcsy,const char *ddcsz,
210 const char *nbpu_opt,
211 int nsteps_cmdline, int nstepout,int resetstep,
212 int nmultisim,int repl_ex_nst,int repl_ex_nex, int repl_ex_seed,
213 real pforce,real cpt_period, real max_hours,
214 const char *deviceOptions, unsigned long Flags)
217 struct mdrunner_arglist *mda;
218 t_commrec *crn; /* the new commrec */
221 /* first check whether we even need to start tMPI */
222 if (hw_opt->nthreads_tmpi < 2)
227 /* a few small, one-time, almost unavoidable memory leaks: */
229 fnmn=dup_tfn(nfile, fnm);
231 /* fill the data structure to pass as void pointer to thread start fn */
238 mda->bVerbose=bVerbose;
239 mda->bCompact=bCompact;
240 mda->nstglobalcomm=nstglobalcomm;
241 mda->ddxyz[XX]=ddxyz[XX];
242 mda->ddxyz[YY]=ddxyz[YY];
243 mda->ddxyz[ZZ]=ddxyz[ZZ];
244 mda->dd_node_order=dd_node_order;
246 mda->rconstr=rconstr;
247 mda->dddlb_opt=dddlb_opt;
248 mda->dlb_scale=dlb_scale;
252 mda->nbpu_opt=nbpu_opt;
253 mda->nsteps_cmdline=nsteps_cmdline;
254 mda->nstepout=nstepout;
255 mda->resetstep=resetstep;
256 mda->nmultisim=nmultisim;
257 mda->repl_ex_nst=repl_ex_nst;
258 mda->repl_ex_nex=repl_ex_nex;
259 mda->repl_ex_seed=repl_ex_seed;
261 mda->cpt_period=cpt_period;
262 mda->max_hours=max_hours;
263 mda->deviceOptions=deviceOptions;
266 fprintf(stderr, "Starting %d tMPI threads\n",hw_opt->nthreads_tmpi);
268 /* now spawn new threads that start mdrunner_start_fn(), while
269 the main thread returns */
270 ret=tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi,
271 (hw_opt->bThreadPinning ? TMPI_AFFINITY_ALL_CORES : TMPI_AFFINITY_NONE),
272 mdrunner_start_fn, (void*)(mda) );
273 if (ret!=TMPI_SUCCESS)
276 /* make a new comm_rec to reflect the new situation */
277 crn=init_par_threads(cr);
282 static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
283 const gmx_hw_opt_t *hw_opt,
289 /* There are no separate PME nodes here, as we ensured in
290 * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
291 * and a conditional ensures we would not have ended up here.
292 * Note that separate PME nodes might be switched on later.
296 nthreads_tmpi = ngpu;
297 if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
299 nthreads_tmpi = nthreads_tot;
302 else if (hw_opt->nthreads_omp > 0)
304 /* Here we could oversubscribe, when we do, we issue a warning later */
305 nthreads_tmpi = max(1,nthreads_tot/hw_opt->nthreads_omp);
309 /* TODO choose nthreads_omp based on hardware topology
310 when we have a hardware topology detection library */
311 /* In general, when running up to 4 threads, OpenMP should be faster.
312 * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
313 * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
314 * even on two CPUs it's usually faster (but with many OpenMP threads
315 * it could be faster not to use HT, currently we always use HT).
316 * On Nehalem/Westmere we want to avoid running 16 threads over
317 * two CPUs with HT, so we need a limit<16; thus we use 12.
318 * A reasonable limit for Intel Sandy and Ivy bridge,
319 * not knowing the topology, is 16 threads.
321 const int nthreads_omp_always_faster = 4;
322 const int nthreads_omp_always_faster_Nehalem = 12;
323 const int nthreads_omp_always_faster_SandyBridge = 16;
324 const int first_model_Nehalem = 0x1A;
325 const int first_model_SandyBridge = 0x2A;
326 gmx_bool bIntel_Family6;
329 (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
330 gmx_cpuid_family(hwinfo->cpuid_info) == 6);
332 if (nthreads_tot <= nthreads_omp_always_faster ||
334 ((gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_Nehalem && nthreads_tot <= nthreads_omp_always_faster_Nehalem) ||
335 (gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_SandyBridge && nthreads_tot <= nthreads_omp_always_faster_SandyBridge))))
337 /* Use pure OpenMP parallelization */
342 /* Don't use OpenMP parallelization */
343 nthreads_tmpi = nthreads_tot;
347 return nthreads_tmpi;
351 /* Get the number of threads to use for thread-MPI based on how many
352 * were requested, which algorithms we're using,
353 * and how many particles there are.
354 * At the point we have already called check_and_update_hw_opt.
355 * Thus all options should be internally consistent and consistent
356 * with the hardware, except that ntmpi could be larger than #GPU.
358 static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
359 gmx_hw_opt_t *hw_opt,
360 t_inputrec *inputrec, gmx_mtop_t *mtop,
364 int nthreads_hw,nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu;
365 int min_atoms_per_mpi_thread;
370 if (hw_opt->nthreads_tmpi > 0)
372 /* Trivial, return right away */
373 return hw_opt->nthreads_tmpi;
376 nthreads_hw = hwinfo->nthreads_hw_avail;
378 /* How many total (#tMPI*#OpenMP) threads can we start? */
379 if (hw_opt->nthreads_tot > 0)
381 nthreads_tot_max = hw_opt->nthreads_tot;
385 nthreads_tot_max = nthreads_hw;
388 bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU);
391 ngpu = hwinfo->gpu_info.ncuda_dev_use;
399 get_tmpi_omp_thread_division(hwinfo,hw_opt,nthreads_tot_max,ngpu);
401 if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
403 /* Steps are divided over the nodes iso splitting the atoms */
404 min_atoms_per_mpi_thread = 0;
410 min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU;
414 min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD;
418 /* Check if an algorithm does not support parallel simulation. */
419 if (nthreads_tmpi != 1 &&
420 ( inputrec->eI == eiLBFGS ||
421 inputrec->coulombtype == eelEWALD ) )
425 md_print_warn(cr,fplog,"The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
426 if (hw_opt->nthreads_tmpi > nthreads_tmpi)
428 gmx_fatal(FARGS,"You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
431 else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread)
433 /* the thread number was chosen automatically, but there are too many
434 threads (too few atoms per thread) */
435 nthreads_new = max(1,mtop->natoms/min_atoms_per_mpi_thread);
437 /* Avoid partial use of Hyper-Threading */
438 if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
439 nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw)
441 nthreads_new = nthreads_hw/2;
444 /* Avoid large prime numbers in the thread count */
445 if (nthreads_new >= 6)
447 /* Use only 6,8,10 with additional factors of 2 */
451 while (3*fac*2 <= nthreads_new)
456 nthreads_new = (nthreads_new/fac)*fac;
461 if (nthreads_new == 5)
467 nthreads_tmpi = nthreads_new;
469 fprintf(stderr,"\n");
470 fprintf(stderr,"NOTE: Parallelization is limited by the small number of atoms,\n");
471 fprintf(stderr," only starting %d thread-MPI threads.\n",nthreads_tmpi);
472 fprintf(stderr," You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
475 return nthreads_tmpi;
477 #endif /* GMX_THREAD_MPI */
480 /* Environment variable for setting nstlist */
481 static const char* NSTLIST_ENVVAR = "GMX_NSTLIST";
482 /* Try to increase nstlist when using a GPU with nstlist less than this */
483 static const int NSTLIST_GPU_ENOUGH = 20;
484 /* Increase nstlist until the non-bonded cost increases more than this factor */
485 static const float NBNXN_GPU_LIST_OK_FAC = 1.25;
486 /* Don't increase nstlist beyond a non-bonded cost increases of this factor */
487 static const float NBNXN_GPU_LIST_MAX_FAC = 1.40;
489 /* Try to increase nstlist when running on a GPU */
490 static void increase_nstlist(FILE *fp,t_commrec *cr,
491 t_inputrec *ir,const gmx_mtop_t *mtop,matrix box)
494 int nstlist_orig,nstlist_prev;
495 verletbuf_list_setup_t ls;
496 real rlist_inc,rlist_ok,rlist_max,rlist_new,rlist_prev;
499 gmx_bool bBox,bDD,bCont;
500 const char *nstl_fmt="\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
501 const char *vbd_err="Can not increase nstlist for GPU run because verlet-buffer-drift is not set or used";
502 const char *box_err="Can not increase nstlist for GPU run because the box is too small";
503 const char *dd_err ="Can not increase nstlist for GPU run because of domain decomposition limitations";
506 /* Number of + nstlist alternative values to try when switching */
507 const int nstl[]={ 20, 25, 40, 50 };
508 #define NNSTL sizeof(nstl)/sizeof(nstl[0])
510 env = getenv(NSTLIST_ENVVAR);
515 fprintf(fp,nstl_fmt,ir->nstlist);
519 if (ir->verletbuf_drift == 0)
521 gmx_fatal(FARGS,"You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
524 if (ir->verletbuf_drift < 0)
528 fprintf(stderr,"%s\n",vbd_err);
532 fprintf(fp,"%s\n",vbd_err);
538 nstlist_orig = ir->nstlist;
541 sprintf(buf,"Getting nstlist from environment variable GMX_NSTLIST=%s",env);
544 fprintf(stderr,"%s\n",buf);
548 fprintf(fp,"%s\n",buf);
550 sscanf(env,"%d",&ir->nstlist);
553 verletbuf_get_list_setup(TRUE,&ls);
555 /* Allow rlist to make the list double the size of the cut-off sphere */
556 rlist_inc = nbnxn_get_rlist_effective_inc(NBNXN_GPU_CLUSTER_SIZE,mtop->natoms/det(box));
557 rlist_ok = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_OK_FAC,1.0/3.0) - rlist_inc;
558 rlist_max = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_MAX_FAC,1.0/3.0) - rlist_inc;
561 fprintf(debug,"GPU nstlist tuning: rlist_inc %.3f rlist_max %.3f\n",
562 rlist_inc,rlist_max);
566 nstlist_prev = nstlist_orig;
567 rlist_prev = ir->rlist;
572 ir->nstlist = nstl[i];
575 /* Set the pair-list buffer size in ir */
576 calc_verlet_buffer_size(mtop,det(box),ir,ir->verletbuf_drift,&ls,
579 /* Does rlist fit in the box? */
580 bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC,box));
582 if (bBox && DOMAINDECOMP(cr))
584 /* Check if rlist fits in the domain decomposition */
585 if (inputrec2nboundeddim(ir) < DIM)
587 gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
589 copy_mat(box,state_tmp.box);
590 bDD = change_dd_cutoff(cr,&state_tmp,ir,rlist_new);
597 if (bBox && bDD && rlist_new <= rlist_max)
599 /* Increase nstlist */
600 nstlist_prev = ir->nstlist;
601 rlist_prev = rlist_new;
602 bCont = (i+1 < NNSTL && rlist_new < rlist_ok);
606 /* Stick with the previous nstlist */
607 ir->nstlist = nstlist_prev;
608 rlist_new = rlist_prev;
620 gmx_warning(!bBox ? box_err : dd_err);
623 fprintf(fp,"\n%s\n",bBox ? box_err : dd_err);
625 ir->nstlist = nstlist_orig;
627 else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
629 sprintf(buf,"Changing nstlist from %d to %d, rlist from %g to %g",
630 nstlist_orig,ir->nstlist,
631 ir->rlist,rlist_new);
634 fprintf(stderr,"%s\n\n",buf);
638 fprintf(fp,"%s\n\n",buf);
640 ir->rlist = rlist_new;
641 ir->rlistlong = rlist_new;
645 static void prepare_verlet_scheme(FILE *fplog,
646 gmx_hw_info_t *hwinfo,
648 gmx_hw_opt_t *hw_opt,
649 const char *nbpu_opt,
651 const gmx_mtop_t *mtop,
655 /* Here we only check for GPU usage on the MPI master process,
656 * as here we don't know how many GPUs we will use yet.
657 * We check for a GPU on all processes later.
659 *bUseGPU = hwinfo->bCanUseGPU || (getenv("GMX_EMULATE_GPU") != NULL);
661 if (ir->verletbuf_drift > 0)
663 /* Update the Verlet buffer size for the current run setup */
664 verletbuf_list_setup_t ls;
667 /* Here we assume CPU acceleration is on. But as currently
668 * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
669 * and 4x2 gives a larger buffer than 4x4, this is ok.
671 verletbuf_get_list_setup(*bUseGPU,&ls);
673 calc_verlet_buffer_size(mtop,det(box),ir,
674 ir->verletbuf_drift,&ls,
676 if (rlist_new != ir->rlist)
680 fprintf(fplog,"\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
682 ls.cluster_size_i,ls.cluster_size_j);
684 ir->rlist = rlist_new;
685 ir->rlistlong = rlist_new;
689 /* With GPU or emulation we should check nstlist for performance */
690 if ((EI_DYNAMICS(ir->eI) &&
692 ir->nstlist < NSTLIST_GPU_ENOUGH) ||
693 getenv(NSTLIST_ENVVAR) != NULL)
695 /* Choose a better nstlist */
696 increase_nstlist(fplog,cr,ir,mtop,box);
700 static void convert_to_verlet_scheme(FILE *fplog,
702 gmx_mtop_t *mtop,real box_vol)
704 char *conv_mesg="Converting input file with group cut-off scheme to the Verlet cut-off scheme";
706 md_print_warn(NULL,fplog,"%s\n",conv_mesg);
708 ir->cutoff_scheme = ecutsVERLET;
709 ir->verletbuf_drift = 0.005;
711 if (ir->rcoulomb != ir->rvdw)
713 gmx_fatal(FARGS,"The VdW and Coulomb cut-offs are different, whereas the Verlet scheme only supports equal cut-offs");
716 if (ir->vdwtype == evdwUSER || EEL_USER(ir->coulombtype))
718 gmx_fatal(FARGS,"User non-bonded potentials are not (yet) supported with the Verlet scheme");
720 else if (EVDW_SWITCHED(ir->vdwtype) || EEL_SWITCHED(ir->coulombtype))
722 md_print_warn(NULL,fplog,"Converting switched or shifted interactions to a shifted potential (without force shift), this will lead to slightly different interaction potentials");
724 if (EVDW_SWITCHED(ir->vdwtype))
726 ir->vdwtype = evdwCUT;
728 if (EEL_SWITCHED(ir->coulombtype))
730 if (EEL_FULL(ir->coulombtype))
732 /* With full electrostatic only PME can be switched */
733 ir->coulombtype = eelPME;
737 md_print_warn(NULL,fplog,"NOTE: Replacing %s electrostatics with reaction-field with epsilon-rf=inf\n",eel_names[ir->coulombtype]);
738 ir->coulombtype = eelRF;
739 ir->epsilon_rf = 0.0;
743 /* We set the target energy drift to a small number.
744 * Note that this is only for testing. For production the user
745 * should think about this and set the mdp options.
747 ir->verletbuf_drift = 1e-4;
750 if (inputrec2nboundeddim(ir) != 3)
752 gmx_fatal(FARGS,"Can only convert old tpr files to the Verlet cut-off scheme with 3D pbc");
755 if (ir->efep != efepNO || ir->implicit_solvent != eisNO)
757 gmx_fatal(FARGS,"Will not convert old tpr files to the Verlet cut-off scheme with free-energy calculations or implicit solvent");
760 if (EI_DYNAMICS(ir->eI) && !(EI_MD(ir->eI) && ir->etc == etcNO))
762 verletbuf_list_setup_t ls;
764 verletbuf_get_list_setup(FALSE,&ls);
765 calc_verlet_buffer_size(mtop,box_vol,ir,ir->verletbuf_drift,&ls,
770 ir->verletbuf_drift = -1;
771 ir->rlist = 1.05*max(ir->rvdw,ir->rcoulomb);
774 gmx_mtop_remove_chargegroups(mtop);
777 /* Check the process affinity mask and if it is found to be non-zero,
778 * will honor it and disable mdrun internal affinity setting.
779 * This function should be called first before the OpenMP library gets
780 * initialized with the last argument FALSE (which will detect affinity
781 * set by external tools like taskset), and later, after the OpenMP
782 * initialization, with the last argument TRUE to detect affinity changes
783 * made by the OpenMP library.
785 * Note that this will only work on Linux as we use a GNU feature. */
786 static void check_cpu_affinity_set(FILE *fplog, const t_commrec *cr,
787 gmx_hw_opt_t *hw_opt, int ncpus,
788 gmx_bool bAfterOpenmpInit)
790 #ifdef HAVE_SCHED_GETAFFINITY
791 cpu_set_t mask_current;
792 int i, ret, cpu_count, cpu_set;
796 if (!hw_opt->bThreadPinning)
798 /* internal affinity setting is off, don't bother checking process affinity */
802 CPU_ZERO(&mask_current);
803 if ((ret = sched_getaffinity(0, sizeof(cpu_set_t), &mask_current)) != 0)
805 /* failed to query affinity mask, will just return */
808 fprintf(debug, "Failed to query affinity mask (error %d)", ret);
813 /* Before proceeding with the actual check, make sure that the number of
814 * detected CPUs is >= the CPUs in the current set.
815 * We need to check for CPU_COUNT as it was added only in glibc 2.6. */
817 if (ncpus < CPU_COUNT(&mask_current))
821 fprintf(debug, "%d CPUs detected, but %d was returned by CPU_COUNT",
822 ncpus, CPU_COUNT(&mask_current));
826 #endif /* CPU_COUNT */
829 for (i = 0; (i < ncpus && i < CPU_SETSIZE); i++)
831 bAllSet = bAllSet && (CPU_ISSET(i, &mask_current) != 0);
836 if (!bAfterOpenmpInit)
838 md_print_warn(cr, fplog,
839 "Non-default process affinity set, disabling internal affinity");
843 md_print_warn(cr, fplog,
844 "Non-default process affinity set probably by the OpenMP library, "
845 "disabling internal affinity");
847 hw_opt->bThreadPinning = FALSE;
851 fprintf(debug, "Non-default affinity mask found\n");
858 fprintf(debug, "Default affinity mask found\n");
861 #endif /* HAVE_SCHED_GETAFFINITY */
864 /* Set CPU affinity. Can be important for performance.
865 On some systems (e.g. Cray) CPU Affinity is set by default.
866 But default assigning doesn't work (well) with only some ranks
867 having threads. This causes very low performance.
868 External tools have cumbersome syntax for setting affinity
869 in the case that only some ranks have threads.
870 Thus it is important that GROMACS sets the affinity internally
871 if only PME is using threads.
873 static void set_cpu_affinity(FILE *fplog,
875 gmx_hw_opt_t *hw_opt,
877 const gmx_hw_info_t *hwinfo,
878 const t_inputrec *inputrec)
880 #if defined GMX_THREAD_MPI
881 /* With the number of TMPI threads equal to the number of cores
882 * we already pinned in thread-MPI, so don't pin again here.
884 if (hw_opt->nthreads_tmpi == tMPI_Thread_get_hw_number())
891 /* If the tMPI thread affinity setting is not supported encourage the user
892 * to report it as it's either a bug or an exotic platform which we might
893 * want to support. */
894 if (tMPI_Thread_setaffinity_support() != TMPI_SETAFFINITY_SUPPORT_YES)
896 md_print_warn(NULL, fplog,
897 "Can not set thread affinities on the current plarform. On NUMA systems this\n"
898 "can cause performance degradation. If you think your platform should support\n"
899 "setting affinities, contact the GROMACS developers.");
902 #endif /* __APPLE__ */
904 if (hw_opt->bThreadPinning)
906 int nth_affinity_set, thread_id_node, thread_id,
907 nthread_local, nthread_node, nthread_hw_max, nphyscore;
911 /* threads on this MPI process or TMPI thread */
912 if (cr->duty & DUTY_PP)
914 nthread_local = gmx_omp_nthreads_get(emntNonbonded);
918 nthread_local = gmx_omp_nthreads_get(emntPME);
921 /* map the current process to cores */
923 nthread_node = nthread_local;
925 if (PAR(cr) || MULTISIM(cr))
927 /* We need to determine a scan of the thread counts in this
933 process_index = cr->nodeid_intra;
936 /* To simplify the code, we shift process indices by nnodes.
937 * There might be far less processes, but that doesn't matter.
939 process_index += cr->ms->sim*cr->nnodes;
941 MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),process_index,
943 MPI_Scan(&nthread_local,&thread_id_node,1,MPI_INT,MPI_SUM,comm_intra);
944 /* MPI_Scan is inclusive, but here we need exclusive */
945 thread_id_node -= nthread_local;
946 /* Get the total number of threads on this physical node */
947 MPI_Allreduce(&nthread_local,&nthread_node,1,MPI_INT,MPI_SUM,comm_intra);
948 MPI_Comm_free(&comm_intra);
953 if (hw_opt->core_pinning_offset > 0)
955 offset = hw_opt->core_pinning_offset;
958 fprintf(stderr, "Applying core pinning offset %d\n", offset);
962 fprintf(fplog, "Applying core pinning offset %d\n", offset);
966 /* With Intel Hyper-Threading enabled, we want to pin consecutive
967 * threads to physical cores when using more threads than physical
968 * cores or when the user requests so.
970 nthread_hw_max = hwinfo->nthreads_hw_avail;
972 if (hw_opt->bPinHyperthreading ||
973 (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
974 nthread_node > nthread_hw_max/2 && getenv("GMX_DISABLE_PINHT") == NULL))
976 if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) != GMX_CPUID_X86_SMT_ENABLED)
978 /* We print to stderr on all processes, as we might have
979 * different settings on different physical nodes.
981 if (gmx_cpuid_vendor(hwinfo->cpuid_info) != GMX_CPUID_VENDOR_INTEL)
983 md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
984 "but non-Intel CPU detected (vendor: %s)\n",
985 gmx_cpuid_vendor_string[gmx_cpuid_vendor(hwinfo->cpuid_info)]);
989 md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
990 "but the CPU detected does not have Intel Hyper-Threading support "
991 "(or it is turned off)\n");
994 nphyscore = nthread_hw_max/2;
998 fprintf(stderr, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
1003 fprintf(fplog, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
1008 /* Set the per-thread affinity. In order to be able to check the success
1009 * of affinity settings, we will set nth_affinity_set to 1 on threads
1010 * where the affinity setting succeded and to 0 where it failed.
1011 * Reducing these 0/1 values over the threads will give the total number
1012 * of threads on which we succeeded.
1014 nth_affinity_set = 0;
1015 #pragma omp parallel firstprivate(thread_id_node) num_threads(nthread_local) \
1016 reduction(+:nth_affinity_set)
1019 gmx_bool setaffinity_ret;
1021 thread_id = gmx_omp_get_thread_num();
1022 thread_id_node += thread_id;
1025 core = offset + thread_id_node;
1029 /* Lock pairs of threads to the same hyperthreaded core */
1030 core = offset + thread_id_node/2 + (thread_id_node % 2)*nphyscore;
1033 setaffinity_ret = tMPI_Thread_setaffinity_single(tMPI_Thread_self(), core);
1035 /* store the per-thread success-values of the setaffinity */
1036 nth_affinity_set = (setaffinity_ret == 0);
1040 fprintf(debug, "On node %d, thread %d the affinity setting returned %d\n",
1041 cr->nodeid, gmx_omp_get_thread_num(), setaffinity_ret);
1045 if (nth_affinity_set > nthread_local)
1049 sprintf(msg, "Looks like we have set affinity for more threads than "
1050 "we have (%d > %d)!\n", nth_affinity_set, nthread_local);
1055 /* check if some threads failed to set their affinities */
1056 if (nth_affinity_set != nthread_local)
1061 #ifdef GMX_THREAD_MPI
1062 sprintf(sbuf, "In thread-MPI thread #%d", cr->nodeid);
1063 #else /* GMX_LIB_MPI */
1065 sprintf(sbuf, "In MPI process #%d", cr->nodeid);
1066 #endif /* GMX_MPI */
1067 md_print_warn(NULL, fplog,
1068 "%s%d/%d thread%s failed to set their affinities. "
1069 "This can cause performance degradation!",
1070 sbuf, nthread_local - nth_affinity_set, nthread_local,
1071 (nthread_local - nth_affinity_set) > 1 ? "s" : "");
1078 static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt,
1081 gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp);
1083 #ifndef GMX_THREAD_MPI
1084 if (hw_opt->nthreads_tot > 0)
1086 gmx_fatal(FARGS,"Setting the total number of threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
1088 if (hw_opt->nthreads_tmpi > 0)
1090 gmx_fatal(FARGS,"Setting the number of thread-MPI threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
1094 if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
1096 /* We have the same number of OpenMP threads for PP and PME processes,
1097 * thus we can perform several consistency checks.
1099 if (hw_opt->nthreads_tmpi > 0 &&
1100 hw_opt->nthreads_omp > 0 &&
1101 hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
1103 gmx_fatal(FARGS,"The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
1104 hw_opt->nthreads_tot,hw_opt->nthreads_tmpi,hw_opt->nthreads_omp);
1107 if (hw_opt->nthreads_tmpi > 0 &&
1108 hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
1110 gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
1111 hw_opt->nthreads_tot,hw_opt->nthreads_tmpi);
1114 if (hw_opt->nthreads_omp > 0 &&
1115 hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
1117 gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
1118 hw_opt->nthreads_tot,hw_opt->nthreads_omp);
1121 if (hw_opt->nthreads_tmpi > 0 &&
1122 hw_opt->nthreads_omp <= 0)
1124 hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
1129 if (hw_opt->nthreads_omp > 1)
1131 gmx_fatal(FARGS,"OpenMP threads are requested, but Gromacs was compiled without OpenMP support");
1135 if (cutoff_scheme == ecutsGROUP)
1137 /* We only have OpenMP support for PME only nodes */
1138 if (hw_opt->nthreads_omp > 1)
1140 gmx_fatal(FARGS,"OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
1141 ecutscheme_names[cutoff_scheme],
1142 ecutscheme_names[ecutsVERLET]);
1144 hw_opt->nthreads_omp = 1;
1147 if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
1149 gmx_fatal(FARGS,"You need to specify -ntomp in addition to -ntomp_pme");
1152 if (hw_opt->nthreads_tot == 1)
1154 hw_opt->nthreads_tmpi = 1;
1156 if (hw_opt->nthreads_omp > 1)
1158 gmx_fatal(FARGS,"You requested %d OpenMP threads with %d total threads",
1159 hw_opt->nthreads_tmpi,hw_opt->nthreads_tot);
1161 hw_opt->nthreads_omp = 1;
1164 if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
1166 hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
1171 fprintf(debug,"hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
1172 hw_opt->nthreads_tot,
1173 hw_opt->nthreads_tmpi,
1174 hw_opt->nthreads_omp,
1175 hw_opt->nthreads_omp_pme,
1176 hw_opt->gpu_id!=NULL ? hw_opt->gpu_id : "");
1182 /* Override the value in inputrec with value passed on the command line (if any) */
1183 static void override_nsteps_cmdline(FILE *fplog,
1186 const t_commrec *cr)
1191 /* override with anything else than the default -2 */
1192 if (nsteps_cmdline > -2)
1196 ir->nsteps = nsteps_cmdline;
1197 if (EI_DYNAMICS(ir->eI))
1199 sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps, %.3f ps",
1200 nsteps_cmdline, nsteps_cmdline*ir->delta_t);
1204 sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps",
1208 md_print_warn(cr, fplog, "%s\n", stmp);
1212 /* Data structure set by SIMMASTER which needs to be passed to all nodes
1213 * before the other nodes have read the tpx file and called gmx_detect_hardware.
1216 int cutoff_scheme; /* The cutoff scheme from inputrec_t */
1217 gmx_bool bUseGPU; /* Use GPU or GPU emulation */
1220 int mdrunner(gmx_hw_opt_t *hw_opt,
1221 FILE *fplog,t_commrec *cr,int nfile,
1222 const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
1223 gmx_bool bCompact, int nstglobalcomm,
1224 ivec ddxyz,int dd_node_order,real rdd,real rconstr,
1225 const char *dddlb_opt,real dlb_scale,
1226 const char *ddcsx,const char *ddcsy,const char *ddcsz,
1227 const char *nbpu_opt,
1228 int nsteps_cmdline, int nstepout,int resetstep,
1229 int nmultisim,int repl_ex_nst,int repl_ex_nex,
1230 int repl_ex_seed, real pforce,real cpt_period,real max_hours,
1231 const char *deviceOptions, unsigned long Flags)
1233 gmx_bool bForceUseGPU,bTryUseGPU;
1234 double nodetime=0,realtime;
1235 t_inputrec *inputrec;
1236 t_state *state=NULL;
1238 gmx_ddbox_t ddbox={0};
1239 int npme_major,npme_minor;
1242 gmx_mtop_t *mtop=NULL;
1243 t_mdatoms *mdatoms=NULL;
1244 t_forcerec *fr=NULL;
1247 gmx_pme_t *pmedata=NULL;
1248 gmx_vsite_t *vsite=NULL;
1249 gmx_constr_t constr;
1250 int i,m,nChargePerturbed=-1,status,nalloc;
1252 gmx_wallcycle_t wcycle;
1253 gmx_bool bReadRNG,bReadEkin;
1255 gmx_runtime_t runtime;
1257 gmx_large_int_t reset_counters;
1258 gmx_edsam_t ed=NULL;
1259 t_commrec *cr_old=cr;
1262 gmx_membed_t membed=NULL;
1263 gmx_hw_info_t *hwinfo=NULL;
1264 master_inf_t minf={-1,FALSE};
1266 /* CAUTION: threads may be started later on in this function, so
1267 cr doesn't reflect the final parallel state right now */
1271 if (Flags & MD_APPENDFILES)
1276 bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
1277 bTryUseGPU = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
1282 /* Read (nearly) all data required for the simulation */
1283 read_tpx_state(ftp2fn(efTPX,nfile,fnm),inputrec,state,NULL,mtop);
1285 if (inputrec->cutoff_scheme != ecutsVERLET &&
1286 ((Flags & MD_TESTVERLET) || getenv("GMX_VERLET_SCHEME") != NULL))
1288 convert_to_verlet_scheme(fplog,inputrec,mtop,det(state->box));
1291 /* Detect hardware, gather information. With tMPI only thread 0 does it
1292 * and after threads are started broadcasts hwinfo around. */
1294 gmx_detect_hardware(fplog, hwinfo, cr,
1295 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
1297 minf.cutoff_scheme = inputrec->cutoff_scheme;
1298 minf.bUseGPU = FALSE;
1300 if (inputrec->cutoff_scheme == ecutsVERLET)
1302 prepare_verlet_scheme(fplog,hwinfo,cr,hw_opt,nbpu_opt,
1303 inputrec,mtop,state->box,
1306 else if (hwinfo->bCanUseGPU)
1308 md_print_warn(cr,fplog,
1309 "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
1310 " To use a GPU, set the mdp option: cutoff-scheme = Verlet\n"
1311 " (for quick performance testing you can use the -testverlet option)\n");
1315 gmx_fatal(FARGS,"GPU requested, but can't be used without cutoff-scheme=Verlet");
1319 #ifndef GMX_THREAD_MPI
1322 gmx_bcast_sim(sizeof(minf),&minf,cr);
1325 if (minf.bUseGPU && cr->npmenodes == -1)
1327 /* Don't automatically use PME-only nodes with GPUs */
1331 /* Check for externally set OpenMP affinity and turn off internal
1332 * pinning if any is found. We need to do this check early to tell
1333 * thread-MPI whether it should do pinning when spawning threads.
1335 gmx_omp_check_thread_affinity(fplog, cr, hw_opt);
1337 #ifdef GMX_THREAD_MPI
1338 /* With thread-MPI inputrec is only set here on the master thread */
1342 check_and_update_hw_opt(hw_opt,minf.cutoff_scheme);
1344 #ifdef GMX_THREAD_MPI
1345 /* Early check for externally set process affinity. Can't do over all
1346 * MPI processes because hwinfo is not available everywhere, but with
1347 * thread-MPI it's needed as pinning might get turned off which needs
1348 * to be known before starting thread-MPI. */
1349 check_cpu_affinity_set(fplog,
1351 hw_opt, hwinfo->nthreads_hw_avail, FALSE);
1354 #ifdef GMX_THREAD_MPI
1355 if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
1357 gmx_fatal(FARGS,"You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME nodes");
1361 if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
1364 gmx_fatal(FARGS,"You need to explicitly specify the number of PME nodes (-npme) when using different number of OpenMP threads for PP and PME nodes");
1368 #ifdef GMX_THREAD_MPI
1371 /* NOW the threads will be started: */
1372 hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
1376 if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
1378 hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
1381 if (hw_opt->nthreads_tmpi > 1)
1383 /* now start the threads. */
1384 cr=mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm,
1385 oenv, bVerbose, bCompact, nstglobalcomm,
1386 ddxyz, dd_node_order, rdd, rconstr,
1387 dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
1389 nsteps_cmdline, nstepout, resetstep, nmultisim,
1390 repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
1391 cpt_period, max_hours, deviceOptions,
1393 /* the main thread continues here with a new cr. We don't deallocate
1394 the old cr because other threads may still be reading it. */
1397 gmx_comm("Failed to spawn threads");
1402 /* END OF CAUTION: cr is now reliable */
1404 /* g_membed initialisation *
1405 * Because we change the mtop, init_membed is called before the init_parallel *
1406 * (in case we ever want to make it run in parallel) */
1407 if (opt2bSet("-membed",nfile,fnm))
1411 fprintf(stderr,"Initializing membed");
1413 membed = init_membed(fplog,nfile,fnm,mtop,inputrec,state,cr,&cpt_period);
1418 /* now broadcast everything to the non-master nodes/threads: */
1419 init_parallel(fplog, cr, inputrec, mtop);
1421 /* This check needs to happen after get_nthreads_mpi() */
1422 if (inputrec->cutoff_scheme == ecutsVERLET && (Flags & MD_PARTDEC))
1424 gmx_fatal_collective(FARGS,cr,NULL,
1425 "The Verlet cut-off scheme is not supported with particle decomposition.\n"
1426 "You can achieve the same effect as particle decomposition by running in parallel using only OpenMP threads.");
1431 pr_inputrec(fplog,0,"Input Parameters",inputrec,FALSE);
1434 #if defined GMX_THREAD_MPI
1435 /* With tMPI we detected on thread 0 and we'll just pass the hwinfo pointer
1436 * to the other threads -- slightly uncool, but works fine, just need to
1437 * make sure that the data doesn't get freed twice. */
1444 gmx_bcast(sizeof(&hwinfo), &hwinfo, cr);
1447 if (PAR(cr) && !SIMMASTER(cr))
1449 /* now we have inputrec on all nodes, can run the detection */
1450 /* TODO: perhaps it's better to propagate within a node instead? */
1452 gmx_detect_hardware(fplog, hwinfo, cr,
1453 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
1456 /* Now do the affinity check with MPI/no-MPI (done earlier with thread-MPI). */
1457 check_cpu_affinity_set(fplog, cr,
1458 hw_opt, hwinfo->nthreads_hw_avail, FALSE);
1461 /* now make sure the state is initialized and propagated */
1462 set_state_entries(state,inputrec,cr->nnodes);
1464 /* remove when vv and rerun works correctly! */
1465 if (PAR(cr) && EI_VV(inputrec->eI) && ((Flags & MD_RERUN) || (Flags & MD_RERUN_VSITE)))
1468 "Currently can't do velocity verlet with rerun in parallel.");
1471 /* A parallel command line option consistency check that we can
1472 only do after any threads have started. */
1474 (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
1477 "The -dd or -npme option request a parallel simulation, "
1479 "but %s was compiled without threads or MPI enabled"
1481 #ifdef GMX_THREAD_MPI
1482 "but the number of threads (option -nt) is 1"
1484 "but %s was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec"
1491 if ((Flags & MD_RERUN) &&
1492 (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
1494 gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
1497 if (can_use_allvsall(inputrec,mtop,TRUE,cr,fplog) && PAR(cr))
1499 /* All-vs-all loops do not work with domain decomposition */
1500 Flags |= MD_PARTDEC;
1503 if (!EEL_PME(inputrec->coulombtype) || (Flags & MD_PARTDEC))
1505 if (cr->npmenodes > 0)
1507 if (!EEL_PME(inputrec->coulombtype))
1509 gmx_fatal_collective(FARGS,cr,NULL,
1510 "PME nodes are requested, but the system does not use PME electrostatics");
1512 if (Flags & MD_PARTDEC)
1514 gmx_fatal_collective(FARGS,cr,NULL,
1515 "PME nodes are requested, but particle decomposition does not support separate PME nodes");
1523 fcRegisterSteps(inputrec->nsteps,inputrec->init_step);
1526 /* NMR restraints must be initialized before load_checkpoint,
1527 * since with time averaging the history is added to t_state.
1528 * For proper consistency check we therefore need to extend
1530 * So the PME-only nodes (if present) will also initialize
1531 * the distance restraints.
1535 /* This needs to be called before read_checkpoint to extend the state */
1536 init_disres(fplog,mtop,inputrec,cr,Flags & MD_PARTDEC,fcd,state);
1538 if (gmx_mtop_ftype_count(mtop,F_ORIRES) > 0)
1540 if (PAR(cr) && !(Flags & MD_PARTDEC))
1542 gmx_fatal(FARGS,"Orientation restraints do not work (yet) with domain decomposition, use particle decomposition (mdrun option -pd)");
1544 /* Orientation restraints */
1547 init_orires(fplog,mtop,state->x,inputrec,cr->ms,&(fcd->orires),
1552 if (DEFORM(*inputrec))
1554 /* Store the deform reference box before reading the checkpoint */
1557 copy_mat(state->box,box);
1561 gmx_bcast(sizeof(box),box,cr);
1563 /* Because we do not have the update struct available yet
1564 * in which the reference values should be stored,
1565 * we store them temporarily in static variables.
1566 * This should be thread safe, since they are only written once
1567 * and with identical values.
1569 #ifdef GMX_THREAD_MPI
1570 tMPI_Thread_mutex_lock(&deform_init_box_mutex);
1572 deform_init_init_step_tpx = inputrec->init_step;
1573 copy_mat(box,deform_init_box_tpx);
1574 #ifdef GMX_THREAD_MPI
1575 tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
1579 if (opt2bSet("-cpi",nfile,fnm))
1581 /* Check if checkpoint file exists before doing continuation.
1582 * This way we can use identical input options for the first and subsequent runs...
1584 if( gmx_fexist_master(opt2fn_master("-cpi",nfile,fnm,cr),cr) )
1586 load_checkpoint(opt2fn_master("-cpi",nfile,fnm,cr),&fplog,
1587 cr,Flags & MD_PARTDEC,ddxyz,
1588 inputrec,state,&bReadRNG,&bReadEkin,
1589 (Flags & MD_APPENDFILES),
1590 (Flags & MD_APPENDFILESSET));
1594 Flags |= MD_READ_RNG;
1598 Flags |= MD_READ_EKIN;
1603 if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES))
1604 #ifdef GMX_THREAD_MPI
1605 /* With thread MPI only the master node/thread exists in mdrun.c,
1606 * therefore non-master nodes need to open the "seppot" log file here.
1608 || (!MASTER(cr) && (Flags & MD_SEPPOT))
1612 gmx_log_open(ftp2fn(efLOG,nfile,fnm),cr,!(Flags & MD_SEPPOT),
1616 /* override nsteps with value from cmdline */
1617 override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);
1621 copy_mat(state->box,box);
1626 gmx_bcast(sizeof(box),box,cr);
1629 /* Essential dynamics */
1630 if (opt2bSet("-ei",nfile,fnm))
1632 /* Open input and output files, allocate space for ED data structure */
1633 ed = ed_open(nfile,fnm,Flags,cr);
1636 if (PAR(cr) && !((Flags & MD_PARTDEC) ||
1637 EI_TPI(inputrec->eI) ||
1638 inputrec->eI == eiNM))
1640 cr->dd = init_domain_decomposition(fplog,cr,Flags,ddxyz,rdd,rconstr,
1641 dddlb_opt,dlb_scale,
1645 &ddbox,&npme_major,&npme_minor);
1647 make_dd_communicators(fplog,cr,dd_node_order);
1649 /* Set overallocation to avoid frequent reallocation of arrays */
1650 set_over_alloc_dd(TRUE);
1654 /* PME, if used, is done on all nodes with 1D decomposition */
1656 cr->duty = (DUTY_PP | DUTY_PME);
1659 if (!EI_TPI(inputrec->eI))
1661 npme_major = cr->nnodes;
1664 if (inputrec->ePBC == epbcSCREW)
1667 "pbc=%s is only implemented with domain decomposition",
1668 epbc_names[inputrec->ePBC]);
1674 /* After possible communicator splitting in make_dd_communicators.
1675 * we can set up the intra/inter node communication.
1677 gmx_setup_nodecomm(fplog,cr);
1680 /* Initialize per-node process ID and counters. */
1681 gmx_init_intra_counters(cr);
1684 md_print_info(cr,fplog,"Using %d MPI %s\n",
1686 #ifdef GMX_THREAD_MPI
1687 cr->nnodes==1 ? "thread" : "threads"
1689 cr->nnodes==1 ? "process" : "processes"
1694 gmx_omp_nthreads_init(fplog, cr,
1695 hwinfo->nthreads_hw_avail,
1696 hw_opt->nthreads_omp,
1697 hw_opt->nthreads_omp_pme,
1698 (cr->duty & DUTY_PP) == 0,
1699 inputrec->cutoff_scheme == ecutsVERLET);
1701 gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi, minf.bUseGPU);
1703 /* getting number of PP/PME threads
1704 PME: env variable should be read only on one node to make sure it is
1705 identical everywhere;
1707 /* TODO nthreads_pp is only used for pinning threads.
1708 * This is a temporary solution until we have a hw topology library.
1710 nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
1711 nthreads_pme = gmx_omp_nthreads_get(emntPME);
1713 wcycle = wallcycle_init(fplog,resetstep,cr,nthreads_pp,nthreads_pme);
1717 /* Master synchronizes its value of reset_counters with all nodes
1718 * including PME only nodes */
1719 reset_counters = wcycle_get_reset_counters(wcycle);
1720 gmx_bcast_sim(sizeof(reset_counters),&reset_counters,cr);
1721 wcycle_set_reset_counters(wcycle, reset_counters);
1725 if (cr->duty & DUTY_PP)
1727 /* For domain decomposition we allocate dynamically
1728 * in dd_partition_system.
1730 if (DOMAINDECOMP(cr))
1732 bcast_state_setup(cr,state);
1738 bcast_state(cr,state,TRUE);
1742 /* Initiate forcerecord */
1744 fr->hwinfo = hwinfo;
1745 init_forcerec(fplog,oenv,fr,fcd,inputrec,mtop,cr,box,FALSE,
1746 opt2fn("-table",nfile,fnm),
1747 opt2fn("-tabletf",nfile,fnm),
1748 opt2fn("-tablep",nfile,fnm),
1749 opt2fn("-tableb",nfile,fnm),
1753 /* version for PCA_NOT_READ_NODE (see md.c) */
1754 /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
1755 "nofile","nofile","nofile","nofile",FALSE,pforce);
1757 fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);
1759 /* Initialize QM-MM */
1762 init_QMMMrec(cr,box,mtop,inputrec,fr);
1765 /* Initialize the mdatoms structure.
1766 * mdatoms is not filled with atom data,
1767 * as this can not be done now with domain decomposition.
1769 mdatoms = init_mdatoms(fplog,mtop,inputrec->efep!=efepNO);
1771 /* Initialize the virtual site communication */
1772 vsite = init_vsite(mtop,cr,FALSE);
1774 calc_shifts(box,fr->shift_vec);
1776 /* With periodic molecules the charge groups should be whole at start up
1777 * and the virtual sites should not be far from their proper positions.
1779 if (!inputrec->bContinuation && MASTER(cr) &&
1780 !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
1782 /* Make molecules whole at start of run */
1783 if (fr->ePBC != epbcNONE)
1785 do_pbc_first_mtop(fplog,inputrec->ePBC,box,mtop,state->x);
1789 /* Correct initial vsite positions are required
1790 * for the initial distribution in the domain decomposition
1791 * and for the initial shell prediction.
1793 construct_vsites_mtop(fplog,vsite,mtop,state->x);
1797 if (EEL_PME(fr->eeltype))
1799 ewaldcoeff = fr->ewaldcoeff;
1800 pmedata = &fr->pmedata;
1809 /* This is a PME only node */
1811 /* We don't need the state */
1814 ewaldcoeff = calc_ewaldcoeff(inputrec->rcoulomb, inputrec->ewald_rtol);
1818 /* Before setting affinity, check whether the affinity has changed
1819 * - which indicates that probably the OpenMP library has changed it since
1820 * we first checked). */
1821 check_cpu_affinity_set(fplog, cr, hw_opt, hwinfo->nthreads_hw_avail, TRUE);
1823 /* Set the CPU affinity */
1824 set_cpu_affinity(fplog,cr,hw_opt,nthreads_pme,hwinfo,inputrec);
1826 /* Initiate PME if necessary,
1827 * either on all nodes or on dedicated PME nodes only. */
1828 if (EEL_PME(inputrec->coulombtype))
1832 nChargePerturbed = mdatoms->nChargePerturbed;
1834 if (cr->npmenodes > 0)
1836 /* The PME only nodes need to know nChargePerturbed */
1837 gmx_bcast_sim(sizeof(nChargePerturbed),&nChargePerturbed,cr);
1840 if (cr->duty & DUTY_PME)
1842 status = gmx_pme_init(pmedata,cr,npme_major,npme_minor,inputrec,
1843 mtop ? mtop->natoms : 0,nChargePerturbed,
1844 (Flags & MD_REPRODUCIBLE),nthreads_pme);
1847 gmx_fatal(FARGS,"Error %d initializing PME",status);
1853 if (integrator[inputrec->eI].func == do_md
1856 integrator[inputrec->eI].func == do_md_openmm
1860 /* Turn on signal handling on all nodes */
1862 * (A user signal from the PME nodes (if any)
1863 * is communicated to the PP nodes.
1865 signal_handler_install();
1868 if (cr->duty & DUTY_PP)
1870 if (inputrec->ePull != epullNO)
1872 /* Initialize pull code */
1873 init_pull(fplog,inputrec,nfile,fnm,mtop,cr,oenv, inputrec->fepvals->init_lambda,
1874 EI_DYNAMICS(inputrec->eI) && MASTER(cr),Flags);
1879 /* Initialize enforced rotation code */
1880 init_rot(fplog,inputrec,nfile,fnm,cr,state->x,box,mtop,oenv,
1884 constr = init_constraints(fplog,mtop,inputrec,ed,state,cr);
1886 if (DOMAINDECOMP(cr))
1888 dd_init_bondeds(fplog,cr->dd,mtop,vsite,constr,inputrec,
1889 Flags & MD_DDBONDCHECK,fr->cginfo_mb);
1891 set_dd_parameters(fplog,cr->dd,dlb_scale,inputrec,fr,&ddbox);
1893 setup_dd_grid(fplog,cr->dd);
1896 /* Now do whatever the user wants us to do (how flexible...) */
1897 integrator[inputrec->eI].func(fplog,cr,nfile,fnm,
1898 oenv,bVerbose,bCompact,
1901 nstepout,inputrec,mtop,
1903 mdatoms,nrnb,wcycle,ed,fr,
1904 repl_ex_nst,repl_ex_nex,repl_ex_seed,
1906 cpt_period,max_hours,
1911 if (inputrec->ePull != epullNO)
1913 finish_pull(fplog,inputrec->pull);
1918 finish_rot(fplog,inputrec->rot);
1925 gmx_pmeonly(*pmedata,cr,nrnb,wcycle,ewaldcoeff,FALSE,inputrec);
1928 if (EI_DYNAMICS(inputrec->eI) || EI_TPI(inputrec->eI))
1930 /* Some timing stats */
1933 if (runtime.proc == 0)
1935 runtime.proc = runtime.real;
1944 wallcycle_stop(wcycle,ewcRUN);
1946 /* Finish up, write some stuff
1947 * if rerunMD, don't write last frame again
1949 finish_run(fplog,cr,ftp2fn(efSTO,nfile,fnm),
1950 inputrec,nrnb,wcycle,&runtime,
1951 fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ?
1952 nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL,
1954 EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
1956 if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU)
1958 char gpu_err_str[STRLEN];
1960 /* free GPU memory and uninitialize GPU (by destroying the context) */
1961 nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
1963 if (!free_gpu(gpu_err_str))
1965 gmx_warning("On node %d failed to free GPU #%d: %s",
1966 cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
1970 if (opt2bSet("-membed",nfile,fnm))
1975 #ifdef GMX_THREAD_MPI
1976 if (PAR(cr) && SIMMASTER(cr))
1979 gmx_hardware_info_free(hwinfo);
1982 /* Does what it says */
1983 print_date_and_time(fplog,cr->nodeid,"Finished mdrun",&runtime);
1985 /* Close logfile already here if we were appending to it */
1986 if (MASTER(cr) && (Flags & MD_APPENDFILES))
1988 gmx_log_close(fplog);
1991 rc=(int)gmx_get_stop_condition();
1993 #ifdef GMX_THREAD_MPI
1994 /* we need to join all threads. The sub-threads join when they
1995 exit this function, but the master thread needs to be told to
1997 if (PAR(cr) && MASTER(cr))