1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
4 * This source code is part of
8 * GROningen MAchine for Chemical Simulations
11 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
12 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
13 * Copyright (c) 2001-2004, The GROMACS development team,
14 * check out http://www.gromacs.org for more information.
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version 2
19 * of the License, or (at your option) any later version.
21 * If you want to redistribute modifications, please consider that
22 * scientific software is very special. Version control is crucial -
23 * bugs must be traceable. We will be happy to consider code for
24 * inclusion in the official distribution, but derived work must not
25 * be called official GROMACS. Details are found in the README & COPYING
26 * files - if they are missing, get the official version at www.gromacs.org.
28 * To help us fund GROMACS development, we humbly ask that you cite
29 * the papers on the package - you can find them in the top README file.
31 * For more info, check our website at http://www.gromacs.org
34 * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
39 #if defined(HAVE_SCHED_H) && (defined(HAVE_SCHED_GETAFFINITY) || defined(HAVE_SCHED_SETAFFINITY))
42 #include <sys/syscall.h>
57 #include "md_logging.h"
58 #include "md_support.h"
61 #include "pull_rotation.h"
74 #include "checkpoint.h"
75 #include "mtop_util.h"
76 #include "sighandler.h"
79 #include "gmx_detect_hardware.h"
80 #include "gmx_omp_nthreads.h"
81 #include "pull_rotation.h"
82 #include "calc_verletbuf.h"
83 #include "../mdlib/nbnxn_search.h"
84 #include "../mdlib/nbnxn_consts.h"
85 #include "gmx_fatal_collective.h"
102 #include "md_openmm.h"
105 #include "gpu_utils.h"
106 #include "nbnxn_cuda_data_mgmt.h"
109 gmx_integrator_t *func;
112 /* The array should match the eI array in include/types/enums.h */
113 #ifdef GMX_OPENMM /* FIXME do_md_openmm needs fixing */
114 const gmx_intp_t integrator[eiNR] = { {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm},{do_md_openmm}};
116 const gmx_intp_t integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md},{do_md}};
119 gmx_large_int_t deform_init_init_step_tpx;
120 matrix deform_init_box_tpx;
121 #ifdef GMX_THREAD_MPI
122 tMPI_Thread_mutex_t deform_init_box_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
126 #ifdef GMX_THREAD_MPI
127 struct mdrunner_arglist
129 gmx_hw_opt_t *hw_opt;
142 const char *dddlb_opt;
147 const char *nbpu_opt;
158 const char *deviceOptions;
160 int ret; /* return value */
164 /* The function used for spawning threads. Extracts the mdrunner()
165 arguments from its one argument and calls mdrunner(), after making
167 static void mdrunner_start_fn(void *arg)
169 struct mdrunner_arglist *mda=(struct mdrunner_arglist*)arg;
170 struct mdrunner_arglist mc=*mda; /* copy the arg list to make sure
171 that it's thread-local. This doesn't
172 copy pointed-to items, of course,
173 but those are all const. */
174 t_commrec *cr; /* we need a local version of this */
178 fnm = dup_tfn(mc.nfile, mc.fnm);
180 cr = init_par_threads(mc.cr);
187 mda->ret=mdrunner(mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv,
188 mc.bVerbose, mc.bCompact, mc.nstglobalcomm,
189 mc.ddxyz, mc.dd_node_order, mc.rdd,
190 mc.rconstr, mc.dddlb_opt, mc.dlb_scale,
191 mc.ddcsx, mc.ddcsy, mc.ddcsz,
193 mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
194 mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce,
195 mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.Flags);
198 /* called by mdrunner() to start a specific number of threads (including
199 the main thread) for thread-parallel runs. This in turn calls mdrunner()
201 All options besides nthreads are the same as for mdrunner(). */
202 static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
203 FILE *fplog,t_commrec *cr,int nfile,
204 const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
205 gmx_bool bCompact, int nstglobalcomm,
206 ivec ddxyz,int dd_node_order,real rdd,real rconstr,
207 const char *dddlb_opt,real dlb_scale,
208 const char *ddcsx,const char *ddcsy,const char *ddcsz,
209 const char *nbpu_opt,
210 int nsteps_cmdline, int nstepout,int resetstep,
211 int nmultisim,int repl_ex_nst,int repl_ex_nex, int repl_ex_seed,
212 real pforce,real cpt_period, real max_hours,
213 const char *deviceOptions, unsigned long Flags)
216 struct mdrunner_arglist *mda;
217 t_commrec *crn; /* the new commrec */
220 /* first check whether we even need to start tMPI */
221 if (hw_opt->nthreads_tmpi < 2)
226 /* a few small, one-time, almost unavoidable memory leaks: */
228 fnmn=dup_tfn(nfile, fnm);
230 /* fill the data structure to pass as void pointer to thread start fn */
237 mda->bVerbose=bVerbose;
238 mda->bCompact=bCompact;
239 mda->nstglobalcomm=nstglobalcomm;
240 mda->ddxyz[XX]=ddxyz[XX];
241 mda->ddxyz[YY]=ddxyz[YY];
242 mda->ddxyz[ZZ]=ddxyz[ZZ];
243 mda->dd_node_order=dd_node_order;
245 mda->rconstr=rconstr;
246 mda->dddlb_opt=dddlb_opt;
247 mda->dlb_scale=dlb_scale;
251 mda->nbpu_opt=nbpu_opt;
252 mda->nsteps_cmdline=nsteps_cmdline;
253 mda->nstepout=nstepout;
254 mda->resetstep=resetstep;
255 mda->nmultisim=nmultisim;
256 mda->repl_ex_nst=repl_ex_nst;
257 mda->repl_ex_nex=repl_ex_nex;
258 mda->repl_ex_seed=repl_ex_seed;
260 mda->cpt_period=cpt_period;
261 mda->max_hours=max_hours;
262 mda->deviceOptions=deviceOptions;
265 fprintf(stderr, "Starting %d tMPI threads\n",hw_opt->nthreads_tmpi);
267 /* now spawn new threads that start mdrunner_start_fn(), while
268 the main thread returns */
269 ret=tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi,
270 (hw_opt->bThreadPinning ? TMPI_AFFINITY_ALL_CORES : TMPI_AFFINITY_NONE),
271 mdrunner_start_fn, (void*)(mda) );
272 if (ret!=TMPI_SUCCESS)
275 /* make a new comm_rec to reflect the new situation */
276 crn=init_par_threads(cr);
281 static int get_tmpi_omp_thread_division(const gmx_hw_info_t *hwinfo,
282 const gmx_hw_opt_t *hw_opt,
288 /* There are no separate PME nodes here, as we ensured in
289 * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
290 * and a conditional ensures we would not have ended up here.
291 * Note that separate PME nodes might be switched on later.
295 nthreads_tmpi = ngpu;
296 if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
298 nthreads_tmpi = nthreads_tot;
301 else if (hw_opt->nthreads_omp > 0)
303 /* Here we could oversubscribe, when we do, we issue a warning later */
304 nthreads_tmpi = max(1,nthreads_tot/hw_opt->nthreads_omp);
308 /* TODO choose nthreads_omp based on hardware topology
309 when we have a hardware topology detection library */
310 /* In general, when running up to 4 threads, OpenMP should be faster.
311 * Note: on AMD Bulldozer we should avoid running OpenMP over two dies.
312 * On Intel>=Nehalem running OpenMP on a single CPU is always faster,
313 * even on two CPUs it's usually faster (but with many OpenMP threads
314 * it could be faster not to use HT, currently we always use HT).
315 * On Nehalem/Westmere we want to avoid running 16 threads over
316 * two CPUs with HT, so we need a limit<16; thus we use 12.
317 * A reasonable limit for Intel Sandy and Ivy bridge,
318 * not knowing the topology, is 16 threads.
320 const int nthreads_omp_always_faster = 4;
321 const int nthreads_omp_always_faster_Nehalem = 12;
322 const int nthreads_omp_always_faster_SandyBridge = 16;
323 const int first_model_Nehalem = 0x1A;
324 const int first_model_SandyBridge = 0x2A;
325 gmx_bool bIntel_Family6;
328 (gmx_cpuid_vendor(hwinfo->cpuid_info) == GMX_CPUID_VENDOR_INTEL &&
329 gmx_cpuid_family(hwinfo->cpuid_info) == 6);
331 if (nthreads_tot <= nthreads_omp_always_faster ||
333 ((gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_Nehalem && nthreads_tot <= nthreads_omp_always_faster_Nehalem) ||
334 (gmx_cpuid_model(hwinfo->cpuid_info) >= nthreads_omp_always_faster_SandyBridge && nthreads_tot <= nthreads_omp_always_faster_SandyBridge))))
336 /* Use pure OpenMP parallelization */
341 /* Don't use OpenMP parallelization */
342 nthreads_tmpi = nthreads_tot;
346 return nthreads_tmpi;
350 /* Get the number of threads to use for thread-MPI based on how many
351 * were requested, which algorithms we're using,
352 * and how many particles there are.
353 * At the point we have already called check_and_update_hw_opt.
354 * Thus all options should be internally consistent and consistent
355 * with the hardware, except that ntmpi could be larger than #GPU.
357 static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
358 gmx_hw_opt_t *hw_opt,
359 t_inputrec *inputrec, gmx_mtop_t *mtop,
363 int nthreads_hw,nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu;
364 int min_atoms_per_mpi_thread;
369 if (hw_opt->nthreads_tmpi > 0)
371 /* Trivial, return right away */
372 return hw_opt->nthreads_tmpi;
375 nthreads_hw = hwinfo->nthreads_hw_avail;
377 /* How many total (#tMPI*#OpenMP) threads can we start? */
378 if (hw_opt->nthreads_tot > 0)
380 nthreads_tot_max = hw_opt->nthreads_tot;
384 nthreads_tot_max = nthreads_hw;
387 bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU);
390 ngpu = hwinfo->gpu_info.ncuda_dev_use;
398 get_tmpi_omp_thread_division(hwinfo,hw_opt,nthreads_tot_max,ngpu);
400 if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
402 /* Steps are divided over the nodes iso splitting the atoms */
403 min_atoms_per_mpi_thread = 0;
409 min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU;
413 min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD;
417 /* Check if an algorithm does not support parallel simulation. */
418 if (nthreads_tmpi != 1 &&
419 ( inputrec->eI == eiLBFGS ||
420 inputrec->coulombtype == eelEWALD ) )
424 md_print_warn(cr,fplog,"The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
425 if (hw_opt->nthreads_tmpi > nthreads_tmpi)
427 gmx_fatal(FARGS,"You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
430 else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread)
432 /* the thread number was chosen automatically, but there are too many
433 threads (too few atoms per thread) */
434 nthreads_new = max(1,mtop->natoms/min_atoms_per_mpi_thread);
436 /* Avoid partial use of Hyper-Threading */
437 if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
438 nthreads_new > nthreads_hw/2 && nthreads_new < nthreads_hw)
440 nthreads_new = nthreads_hw/2;
443 /* Avoid large prime numbers in the thread count */
444 if (nthreads_new >= 6)
446 /* Use only 6,8,10 with additional factors of 2 */
450 while (3*fac*2 <= nthreads_new)
455 nthreads_new = (nthreads_new/fac)*fac;
460 if (nthreads_new == 5)
466 nthreads_tmpi = nthreads_new;
468 fprintf(stderr,"\n");
469 fprintf(stderr,"NOTE: Parallelization is limited by the small number of atoms,\n");
470 fprintf(stderr," only starting %d thread-MPI threads.\n",nthreads_tmpi);
471 fprintf(stderr," You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
474 return nthreads_tmpi;
476 #endif /* GMX_THREAD_MPI */
479 /* Environment variable for setting nstlist */
480 static const char* NSTLIST_ENVVAR = "GMX_NSTLIST";
481 /* Try to increase nstlist when using a GPU with nstlist less than this */
482 static const int NSTLIST_GPU_ENOUGH = 20;
483 /* Increase nstlist until the non-bonded cost increases more than this factor */
484 static const float NBNXN_GPU_LIST_OK_FAC = 1.25;
485 /* Don't increase nstlist beyond a non-bonded cost increases of this factor */
486 static const float NBNXN_GPU_LIST_MAX_FAC = 1.40;
488 /* Try to increase nstlist when running on a GPU */
489 static void increase_nstlist(FILE *fp,t_commrec *cr,
490 t_inputrec *ir,const gmx_mtop_t *mtop,matrix box)
493 int nstlist_orig,nstlist_prev;
494 verletbuf_list_setup_t ls;
495 real rlist_inc,rlist_ok,rlist_max,rlist_new,rlist_prev;
498 gmx_bool bBox,bDD,bCont;
499 const char *nstl_fmt="\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
500 const char *vbd_err="Can not increase nstlist for GPU run because verlet-buffer-drift is not set or used";
501 const char *box_err="Can not increase nstlist for GPU run because the box is too small";
502 const char *dd_err ="Can not increase nstlist for GPU run because of domain decomposition limitations";
505 /* Number of + nstlist alternative values to try when switching */
506 const int nstl[]={ 20, 25, 40, 50 };
507 #define NNSTL sizeof(nstl)/sizeof(nstl[0])
509 env = getenv(NSTLIST_ENVVAR);
514 fprintf(fp,nstl_fmt,ir->nstlist);
518 if (ir->verletbuf_drift == 0)
520 gmx_fatal(FARGS,"You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
523 if (ir->verletbuf_drift < 0)
527 fprintf(stderr,"%s\n",vbd_err);
531 fprintf(fp,"%s\n",vbd_err);
537 nstlist_orig = ir->nstlist;
540 sprintf(buf,"Getting nstlist from environment variable GMX_NSTLIST=%s",env);
543 fprintf(stderr,"%s\n",buf);
547 fprintf(fp,"%s\n",buf);
549 sscanf(env,"%d",&ir->nstlist);
552 verletbuf_get_list_setup(TRUE,&ls);
554 /* Allow rlist to make the list double the size of the cut-off sphere */
555 rlist_inc = nbnxn_get_rlist_effective_inc(NBNXN_GPU_CLUSTER_SIZE,mtop->natoms/det(box));
556 rlist_ok = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_OK_FAC,1.0/3.0) - rlist_inc;
557 rlist_max = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_MAX_FAC,1.0/3.0) - rlist_inc;
560 fprintf(debug,"GPU nstlist tuning: rlist_inc %.3f rlist_max %.3f\n",
561 rlist_inc,rlist_max);
565 nstlist_prev = nstlist_orig;
566 rlist_prev = ir->rlist;
571 ir->nstlist = nstl[i];
574 /* Set the pair-list buffer size in ir */
575 calc_verlet_buffer_size(mtop,det(box),ir,ir->verletbuf_drift,&ls,
578 /* Does rlist fit in the box? */
579 bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC,box));
581 if (bBox && DOMAINDECOMP(cr))
583 /* Check if rlist fits in the domain decomposition */
584 if (inputrec2nboundeddim(ir) < DIM)
586 gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
588 copy_mat(box,state_tmp.box);
589 bDD = change_dd_cutoff(cr,&state_tmp,ir,rlist_new);
596 if (bBox && bDD && rlist_new <= rlist_max)
598 /* Increase nstlist */
599 nstlist_prev = ir->nstlist;
600 rlist_prev = rlist_new;
601 bCont = (i+1 < NNSTL && rlist_new < rlist_ok);
605 /* Stick with the previous nstlist */
606 ir->nstlist = nstlist_prev;
607 rlist_new = rlist_prev;
619 gmx_warning(!bBox ? box_err : dd_err);
622 fprintf(fp,"\n%s\n",bBox ? box_err : dd_err);
624 ir->nstlist = nstlist_orig;
626 else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
628 sprintf(buf,"Changing nstlist from %d to %d, rlist from %g to %g",
629 nstlist_orig,ir->nstlist,
630 ir->rlist,rlist_new);
633 fprintf(stderr,"%s\n\n",buf);
637 fprintf(fp,"%s\n\n",buf);
639 ir->rlist = rlist_new;
640 ir->rlistlong = rlist_new;
644 static void prepare_verlet_scheme(FILE *fplog,
645 gmx_hw_info_t *hwinfo,
647 gmx_hw_opt_t *hw_opt,
648 const char *nbpu_opt,
650 const gmx_mtop_t *mtop,
654 /* Here we only check for GPU usage on the MPI master process,
655 * as here we don't know how many GPUs we will use yet.
656 * We check for a GPU on all processes later.
658 *bUseGPU = hwinfo->bCanUseGPU || (getenv("GMX_EMULATE_GPU") != NULL);
660 if (ir->verletbuf_drift > 0)
662 /* Update the Verlet buffer size for the current run setup */
663 verletbuf_list_setup_t ls;
666 /* Here we assume CPU acceleration is on. But as currently
667 * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
668 * and 4x2 gives a larger buffer than 4x4, this is ok.
670 verletbuf_get_list_setup(*bUseGPU,&ls);
672 calc_verlet_buffer_size(mtop,det(box),ir,
673 ir->verletbuf_drift,&ls,
675 if (rlist_new != ir->rlist)
679 fprintf(fplog,"\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
681 ls.cluster_size_i,ls.cluster_size_j);
683 ir->rlist = rlist_new;
684 ir->rlistlong = rlist_new;
688 /* With GPU or emulation we should check nstlist for performance */
689 if ((EI_DYNAMICS(ir->eI) &&
691 ir->nstlist < NSTLIST_GPU_ENOUGH) ||
692 getenv(NSTLIST_ENVVAR) != NULL)
694 /* Choose a better nstlist */
695 increase_nstlist(fplog,cr,ir,mtop,box);
699 static void convert_to_verlet_scheme(FILE *fplog,
701 gmx_mtop_t *mtop,real box_vol)
703 char *conv_mesg="Converting input file with group cut-off scheme to the Verlet cut-off scheme";
705 md_print_warn(NULL,fplog,"%s\n",conv_mesg);
707 ir->cutoff_scheme = ecutsVERLET;
708 ir->verletbuf_drift = 0.005;
710 if (ir->rcoulomb != ir->rvdw)
712 gmx_fatal(FARGS,"The VdW and Coulomb cut-offs are different, whereas the Verlet scheme only supports equal cut-offs");
715 if (ir->vdwtype == evdwUSER || EEL_USER(ir->coulombtype))
717 gmx_fatal(FARGS,"User non-bonded potentials are not (yet) supported with the Verlet scheme");
719 else if (EVDW_SWITCHED(ir->vdwtype) || EEL_SWITCHED(ir->coulombtype))
721 md_print_warn(NULL,fplog,"Converting switched or shifted interactions to a shifted potential (without force shift), this will lead to slightly different interaction potentials");
723 if (EVDW_SWITCHED(ir->vdwtype))
725 ir->vdwtype = evdwCUT;
727 if (EEL_SWITCHED(ir->coulombtype))
729 if (EEL_FULL(ir->coulombtype))
731 /* With full electrostatic only PME can be switched */
732 ir->coulombtype = eelPME;
736 md_print_warn(NULL,fplog,"NOTE: Replacing %s electrostatics with reaction-field with epsilon-rf=inf\n",eel_names[ir->coulombtype]);
737 ir->coulombtype = eelRF;
738 ir->epsilon_rf = 0.0;
742 /* We set the target energy drift to a small number.
743 * Note that this is only for testing. For production the user
744 * should think about this and set the mdp options.
746 ir->verletbuf_drift = 1e-4;
749 if (inputrec2nboundeddim(ir) != 3)
751 gmx_fatal(FARGS,"Can only convert old tpr files to the Verlet cut-off scheme with 3D pbc");
754 if (ir->efep != efepNO || ir->implicit_solvent != eisNO)
756 gmx_fatal(FARGS,"Will not convert old tpr files to the Verlet cut-off scheme with free-energy calculations or implicit solvent");
759 if (EI_DYNAMICS(ir->eI) && !(EI_MD(ir->eI) && ir->etc == etcNO))
761 verletbuf_list_setup_t ls;
763 verletbuf_get_list_setup(FALSE,&ls);
764 calc_verlet_buffer_size(mtop,box_vol,ir,ir->verletbuf_drift,&ls,
769 ir->verletbuf_drift = -1;
770 ir->rlist = 1.05*max(ir->rvdw,ir->rcoulomb);
773 gmx_mtop_remove_chargegroups(mtop);
776 /* Check the process affinity mask and if it is found to be non-zero,
777 * will honor it and disable mdrun internal affinity setting.
778 * This function should be called first before the OpenMP library gets
779 * initialized with the last argument FALSE (which will detect affinity
780 * set by external tools like taskset), and later, after the OpenMP
781 * initialization, with the last argument TRUE to detect affinity changes
782 * made by the OpenMP library.
784 * Note that this will only work on Linux as we use a GNU feature. */
785 static void check_cpu_affinity_set(FILE *fplog, const t_commrec *cr,
786 gmx_hw_opt_t *hw_opt, int ncpus,
787 gmx_bool bAfterOpenmpInit)
789 #ifdef HAVE_SCHED_GETAFFINITY
790 cpu_set_t mask_current;
791 int i, ret, cpu_count, cpu_set;
795 if (!hw_opt->bThreadPinning)
797 /* internal affinity setting is off, don't bother checking process affinity */
801 CPU_ZERO(&mask_current);
802 if ((ret = sched_getaffinity(0, sizeof(cpu_set_t), &mask_current)) != 0)
804 /* failed to query affinity mask, will just return */
807 fprintf(debug, "Failed to query affinity mask (error %d)", ret);
812 /* Before proceeding with the actual check, make sure that the number of
813 * detected CPUs is >= the CPUs in the current set.
814 * We need to check for CPU_COUNT as it was added only in glibc 2.6. */
816 if (ncpus < CPU_COUNT(&mask_current))
820 fprintf(debug, "%d CPUs detected, but %d was returned by CPU_COUNT",
821 ncpus, CPU_COUNT(&mask_current));
825 #endif /* CPU_COUNT */
828 for (i = 0; (i < ncpus && i < CPU_SETSIZE); i++)
830 bAllSet = bAllSet && (CPU_ISSET(i, &mask_current) != 0);
835 if (!bAfterOpenmpInit)
837 md_print_warn(cr, fplog,
838 "Non-default process affinity set, disabling internal affinity");
842 md_print_warn(cr, fplog,
843 "Non-default process affinity set probably by the OpenMP library, "
844 "disabling internal affinity");
846 hw_opt->bThreadPinning = FALSE;
850 fprintf(debug, "Non-default affinity mask found\n");
857 fprintf(debug, "Default affinity mask found\n");
860 #endif /* HAVE_SCHED_GETAFFINITY */
863 /* Set CPU affinity. Can be important for performance.
864 On some systems (e.g. Cray) CPU Affinity is set by default.
865 But default assigning doesn't work (well) with only some ranks
866 having threads. This causes very low performance.
867 External tools have cumbersome syntax for setting affinity
868 in the case that only some ranks have threads.
869 Thus it is important that GROMACS sets the affinity internally
870 if only PME is using threads.
872 static void set_cpu_affinity(FILE *fplog,
874 gmx_hw_opt_t *hw_opt,
876 const gmx_hw_info_t *hwinfo,
877 const t_inputrec *inputrec)
879 #if defined GMX_THREAD_MPI
880 /* With the number of TMPI threads equal to the number of cores
881 * we already pinned in thread-MPI, so don't pin again here.
883 if (hw_opt->nthreads_tmpi == tMPI_Thread_get_hw_number())
889 #ifdef GMX_OPENMP /* TODO: actually we could do this even without OpenMP?! */
890 #ifdef HAVE_SCHED_SETAFFINITY
891 if (hw_opt->bThreadPinning)
893 int thread, nthread_local, nthread_node, nthread_hw_max, nphyscore;
897 /* threads on this MPI process or TMPI thread */
898 if (cr->duty & DUTY_PP)
900 nthread_local = gmx_omp_nthreads_get(emntNonbonded);
904 nthread_local = gmx_omp_nthreads_get(emntPME);
907 /* map the current process to cores */
909 nthread_node = nthread_local;
911 if (PAR(cr) || MULTISIM(cr))
913 /* We need to determine a scan of the thread counts in this
918 MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),cr->nodeid_intra,
920 MPI_Scan(&nthread_local,&thread,1,MPI_INT,MPI_SUM,comm_intra);
921 /* MPI_Scan is inclusive, but here we need exclusive */
922 thread -= nthread_local;
923 /* Get the total number of threads on this physical node */
924 MPI_Allreduce(&nthread_local,&nthread_node,1,MPI_INT,MPI_SUM,comm_intra);
925 MPI_Comm_free(&comm_intra);
930 if (hw_opt->core_pinning_offset > 0)
932 offset = hw_opt->core_pinning_offset;
935 fprintf(stderr, "Applying core pinning offset %d\n", offset);
939 fprintf(fplog, "Applying core pinning offset %d\n", offset);
943 /* With Intel Hyper-Threading enabled, we want to pin consecutive
944 * threads to physical cores when using more threads than physical
945 * cores or when the user requests so.
947 nthread_hw_max = hwinfo->nthreads_hw_avail;
949 if (hw_opt->bPinHyperthreading ||
950 (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
951 nthread_node > nthread_hw_max/2 && getenv("GMX_DISABLE_PINHT") == NULL))
953 if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) != GMX_CPUID_X86_SMT_ENABLED)
955 /* We print to stderr on all processes, as we might have
956 * different settings on different physical nodes.
958 if (gmx_cpuid_vendor(hwinfo->cpuid_info) != GMX_CPUID_VENDOR_INTEL)
960 md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
961 "but non-Intel CPU detected (vendor: %s)\n",
962 gmx_cpuid_vendor_string[gmx_cpuid_vendor(hwinfo->cpuid_info)]);
966 md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
967 "but the CPU detected does not have Intel Hyper-Threading support "
968 "(or it is turned off)\n");
971 nphyscore = nthread_hw_max/2;
975 fprintf(stderr, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
980 fprintf(fplog, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
985 /* set the per-thread affinity */
986 #pragma omp parallel firstprivate(thread) num_threads(nthread_local)
992 thread += gmx_omp_get_thread_num();
995 core = offset + thread;
999 /* Lock pairs of threads to the same hyperthreaded core */
1000 core = offset + thread/2 + (thread % 2)*nphyscore;
1002 CPU_SET(core, &mask);
1003 sched_setaffinity((pid_t) syscall (SYS_gettid), sizeof(cpu_set_t), &mask);
1006 #endif /* HAVE_SCHED_SETAFFINITY */
1007 #endif /* GMX_OPENMP */
1011 static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt,
1014 gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp);
1016 #ifndef GMX_THREAD_MPI
1017 if (hw_opt->nthreads_tot > 0)
1019 gmx_fatal(FARGS,"Setting the total number of threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
1021 if (hw_opt->nthreads_tmpi > 0)
1023 gmx_fatal(FARGS,"Setting the number of thread-MPI threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
1027 if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
1029 /* We have the same number of OpenMP threads for PP and PME processes,
1030 * thus we can perform several consistency checks.
1032 if (hw_opt->nthreads_tmpi > 0 &&
1033 hw_opt->nthreads_omp > 0 &&
1034 hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
1036 gmx_fatal(FARGS,"The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
1037 hw_opt->nthreads_tot,hw_opt->nthreads_tmpi,hw_opt->nthreads_omp);
1040 if (hw_opt->nthreads_tmpi > 0 &&
1041 hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
1043 gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
1044 hw_opt->nthreads_tot,hw_opt->nthreads_tmpi);
1047 if (hw_opt->nthreads_omp > 0 &&
1048 hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
1050 gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
1051 hw_opt->nthreads_tot,hw_opt->nthreads_omp);
1054 if (hw_opt->nthreads_tmpi > 0 &&
1055 hw_opt->nthreads_omp <= 0)
1057 hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
1062 if (hw_opt->nthreads_omp > 1)
1064 gmx_fatal(FARGS,"OpenMP threads are requested, but Gromacs was compiled without OpenMP support");
1068 if (cutoff_scheme == ecutsGROUP)
1070 /* We only have OpenMP support for PME only nodes */
1071 if (hw_opt->nthreads_omp > 1)
1073 gmx_fatal(FARGS,"OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
1074 ecutscheme_names[cutoff_scheme],
1075 ecutscheme_names[ecutsVERLET]);
1077 hw_opt->nthreads_omp = 1;
1080 if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
1082 gmx_fatal(FARGS,"You need to specify -ntomp in addition to -ntomp_pme");
1085 if (hw_opt->nthreads_tot == 1)
1087 hw_opt->nthreads_tmpi = 1;
1089 if (hw_opt->nthreads_omp > 1)
1091 gmx_fatal(FARGS,"You requested %d OpenMP threads with %d total threads",
1092 hw_opt->nthreads_tmpi,hw_opt->nthreads_tot);
1094 hw_opt->nthreads_omp = 1;
1097 if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
1099 hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
1104 fprintf(debug,"hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
1105 hw_opt->nthreads_tot,
1106 hw_opt->nthreads_tmpi,
1107 hw_opt->nthreads_omp,
1108 hw_opt->nthreads_omp_pme,
1109 hw_opt->gpu_id!=NULL ? hw_opt->gpu_id : "");
1115 /* Override the value in inputrec with value passed on the command line (if any) */
1116 static void override_nsteps_cmdline(FILE *fplog,
1119 const t_commrec *cr)
1124 /* override with anything else than the default -2 */
1125 if (nsteps_cmdline > -2)
1129 ir->nsteps = nsteps_cmdline;
1130 if (EI_DYNAMICS(ir->eI))
1132 sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps, %.3f ps",
1133 nsteps_cmdline, nsteps_cmdline*ir->delta_t);
1137 sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps",
1141 md_print_warn(cr, fplog, "%s\n", stmp);
1145 /* Data structure set by SIMMASTER which needs to be passed to all nodes
1146 * before the other nodes have read the tpx file and called gmx_detect_hardware.
1149 int cutoff_scheme; /* The cutoff scheme from inputrec_t */
1150 gmx_bool bUseGPU; /* Use GPU or GPU emulation */
1153 int mdrunner(gmx_hw_opt_t *hw_opt,
1154 FILE *fplog,t_commrec *cr,int nfile,
1155 const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
1156 gmx_bool bCompact, int nstglobalcomm,
1157 ivec ddxyz,int dd_node_order,real rdd,real rconstr,
1158 const char *dddlb_opt,real dlb_scale,
1159 const char *ddcsx,const char *ddcsy,const char *ddcsz,
1160 const char *nbpu_opt,
1161 int nsteps_cmdline, int nstepout,int resetstep,
1162 int nmultisim,int repl_ex_nst,int repl_ex_nex,
1163 int repl_ex_seed, real pforce,real cpt_period,real max_hours,
1164 const char *deviceOptions, unsigned long Flags)
1166 gmx_bool bForceUseGPU,bTryUseGPU;
1167 double nodetime=0,realtime;
1168 t_inputrec *inputrec;
1169 t_state *state=NULL;
1171 gmx_ddbox_t ddbox={0};
1172 int npme_major,npme_minor;
1175 gmx_mtop_t *mtop=NULL;
1176 t_mdatoms *mdatoms=NULL;
1177 t_forcerec *fr=NULL;
1180 gmx_pme_t *pmedata=NULL;
1181 gmx_vsite_t *vsite=NULL;
1182 gmx_constr_t constr;
1183 int i,m,nChargePerturbed=-1,status,nalloc;
1185 gmx_wallcycle_t wcycle;
1186 gmx_bool bReadRNG,bReadEkin;
1188 gmx_runtime_t runtime;
1190 gmx_large_int_t reset_counters;
1191 gmx_edsam_t ed=NULL;
1192 t_commrec *cr_old=cr;
1195 gmx_membed_t membed=NULL;
1196 gmx_hw_info_t *hwinfo=NULL;
1197 master_inf_t minf={-1,FALSE};
1199 /* CAUTION: threads may be started later on in this function, so
1200 cr doesn't reflect the final parallel state right now */
1204 if (Flags & MD_APPENDFILES)
1209 bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
1210 bTryUseGPU = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
1215 /* Read (nearly) all data required for the simulation */
1216 read_tpx_state(ftp2fn(efTPX,nfile,fnm),inputrec,state,NULL,mtop);
1218 if (inputrec->cutoff_scheme != ecutsVERLET &&
1219 ((Flags & MD_TESTVERLET) || getenv("GMX_VERLET_SCHEME") != NULL))
1221 convert_to_verlet_scheme(fplog,inputrec,mtop,det(state->box));
1224 /* Detect hardware, gather information. With tMPI only thread 0 does it
1225 * and after threads are started broadcasts hwinfo around. */
1227 gmx_detect_hardware(fplog, hwinfo, cr,
1228 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
1230 minf.cutoff_scheme = inputrec->cutoff_scheme;
1231 minf.bUseGPU = FALSE;
1233 if (inputrec->cutoff_scheme == ecutsVERLET)
1235 prepare_verlet_scheme(fplog,hwinfo,cr,hw_opt,nbpu_opt,
1236 inputrec,mtop,state->box,
1239 else if (hwinfo->bCanUseGPU)
1241 md_print_warn(cr,fplog,
1242 "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
1243 " To use a GPU, set the mdp option: cutoff-scheme = Verlet\n"
1244 " (for quick performance testing you can use the -testverlet option)\n");
1248 gmx_fatal(FARGS,"GPU requested, but can't be used without cutoff-scheme=Verlet");
1252 #ifndef GMX_THREAD_MPI
1255 gmx_bcast_sim(sizeof(minf),&minf,cr);
1258 if (minf.bUseGPU && cr->npmenodes == -1)
1260 /* Don't automatically use PME-only nodes with GPUs */
1264 /* Check for externally set OpenMP affinity and turn off internal
1265 * pinning if any is found. We need to do this check early to tell
1266 * thread-MPI whether it should do pinning when spawning threads.
1268 gmx_omp_check_thread_affinity(fplog, cr, hw_opt);
1270 #ifdef GMX_THREAD_MPI
1271 /* With thread-MPI inputrec is only set here on the master thread */
1275 check_and_update_hw_opt(hw_opt,minf.cutoff_scheme);
1277 #ifdef GMX_THREAD_MPI
1278 /* Early check for externally set process affinity. Can't do over all
1279 * MPI processes because hwinfo is not available everywhere, but with
1280 * thread-MPI it's needed as pinning might get turned off which needs
1281 * to be known before starting thread-MPI. */
1282 check_cpu_affinity_set(fplog,
1284 hw_opt, hwinfo->nthreads_hw_avail, FALSE);
1287 #ifdef GMX_THREAD_MPI
1288 if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
1290 gmx_fatal(FARGS,"You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME nodes");
1294 if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
1297 gmx_fatal(FARGS,"You need to explicitly specify the number of PME nodes (-npme) when using different number of OpenMP threads for PP and PME nodes");
1301 #ifdef GMX_THREAD_MPI
1304 /* NOW the threads will be started: */
1305 hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
1309 if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
1311 hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
1314 if (hw_opt->nthreads_tmpi > 1)
1316 /* now start the threads. */
1317 cr=mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm,
1318 oenv, bVerbose, bCompact, nstglobalcomm,
1319 ddxyz, dd_node_order, rdd, rconstr,
1320 dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
1322 nsteps_cmdline, nstepout, resetstep, nmultisim,
1323 repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
1324 cpt_period, max_hours, deviceOptions,
1326 /* the main thread continues here with a new cr. We don't deallocate
1327 the old cr because other threads may still be reading it. */
1330 gmx_comm("Failed to spawn threads");
1335 /* END OF CAUTION: cr is now reliable */
1337 /* g_membed initialisation *
1338 * Because we change the mtop, init_membed is called before the init_parallel *
1339 * (in case we ever want to make it run in parallel) */
1340 if (opt2bSet("-membed",nfile,fnm))
1344 fprintf(stderr,"Initializing membed");
1346 membed = init_membed(fplog,nfile,fnm,mtop,inputrec,state,cr,&cpt_period);
1351 /* now broadcast everything to the non-master nodes/threads: */
1352 init_parallel(fplog, cr, inputrec, mtop);
1354 /* This check needs to happen after get_nthreads_mpi() */
1355 if (inputrec->cutoff_scheme == ecutsVERLET && (Flags & MD_PARTDEC))
1357 gmx_fatal_collective(FARGS,cr,NULL,
1358 "The Verlet cut-off scheme is not supported with particle decomposition.\n"
1359 "You can achieve the same effect as particle decomposition by running in parallel using only OpenMP threads.");
1364 pr_inputrec(fplog,0,"Input Parameters",inputrec,FALSE);
1367 #if defined GMX_THREAD_MPI
1368 /* With tMPI we detected on thread 0 and we'll just pass the hwinfo pointer
1369 * to the other threads -- slightly uncool, but works fine, just need to
1370 * make sure that the data doesn't get freed twice. */
1377 gmx_bcast(sizeof(&hwinfo), &hwinfo, cr);
1380 if (PAR(cr) && !SIMMASTER(cr))
1382 /* now we have inputrec on all nodes, can run the detection */
1383 /* TODO: perhaps it's better to propagate within a node instead? */
1385 gmx_detect_hardware(fplog, hwinfo, cr,
1386 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
1389 /* Now do the affinity check with MPI/no-MPI (done earlier with thread-MPI). */
1390 check_cpu_affinity_set(fplog, cr,
1391 hw_opt, hwinfo->nthreads_hw_avail, FALSE);
1394 /* now make sure the state is initialized and propagated */
1395 set_state_entries(state,inputrec,cr->nnodes);
1397 /* remove when vv and rerun works correctly! */
1398 if (PAR(cr) && EI_VV(inputrec->eI) && ((Flags & MD_RERUN) || (Flags & MD_RERUN_VSITE)))
1401 "Currently can't do velocity verlet with rerun in parallel.");
1404 /* A parallel command line option consistency check that we can
1405 only do after any threads have started. */
1407 (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
1410 "The -dd or -npme option request a parallel simulation, "
1412 "but %s was compiled without threads or MPI enabled"
1414 #ifdef GMX_THREAD_MPI
1415 "but the number of threads (option -nt) is 1"
1417 "but %s was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec"
1424 if ((Flags & MD_RERUN) &&
1425 (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
1427 gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
1430 if (can_use_allvsall(inputrec,mtop,TRUE,cr,fplog) && PAR(cr))
1432 /* All-vs-all loops do not work with domain decomposition */
1433 Flags |= MD_PARTDEC;
1436 if (!EEL_PME(inputrec->coulombtype) || (Flags & MD_PARTDEC))
1438 if (cr->npmenodes > 0)
1440 if (!EEL_PME(inputrec->coulombtype))
1442 gmx_fatal_collective(FARGS,cr,NULL,
1443 "PME nodes are requested, but the system does not use PME electrostatics");
1445 if (Flags & MD_PARTDEC)
1447 gmx_fatal_collective(FARGS,cr,NULL,
1448 "PME nodes are requested, but particle decomposition does not support separate PME nodes");
1456 fcRegisterSteps(inputrec->nsteps,inputrec->init_step);
1459 /* NMR restraints must be initialized before load_checkpoint,
1460 * since with time averaging the history is added to t_state.
1461 * For proper consistency check we therefore need to extend
1463 * So the PME-only nodes (if present) will also initialize
1464 * the distance restraints.
1468 /* This needs to be called before read_checkpoint to extend the state */
1469 init_disres(fplog,mtop,inputrec,cr,Flags & MD_PARTDEC,fcd,state);
1471 if (gmx_mtop_ftype_count(mtop,F_ORIRES) > 0)
1473 if (PAR(cr) && !(Flags & MD_PARTDEC))
1475 gmx_fatal(FARGS,"Orientation restraints do not work (yet) with domain decomposition, use particle decomposition (mdrun option -pd)");
1477 /* Orientation restraints */
1480 init_orires(fplog,mtop,state->x,inputrec,cr->ms,&(fcd->orires),
1485 if (DEFORM(*inputrec))
1487 /* Store the deform reference box before reading the checkpoint */
1490 copy_mat(state->box,box);
1494 gmx_bcast(sizeof(box),box,cr);
1496 /* Because we do not have the update struct available yet
1497 * in which the reference values should be stored,
1498 * we store them temporarily in static variables.
1499 * This should be thread safe, since they are only written once
1500 * and with identical values.
1502 #ifdef GMX_THREAD_MPI
1503 tMPI_Thread_mutex_lock(&deform_init_box_mutex);
1505 deform_init_init_step_tpx = inputrec->init_step;
1506 copy_mat(box,deform_init_box_tpx);
1507 #ifdef GMX_THREAD_MPI
1508 tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
1512 if (opt2bSet("-cpi",nfile,fnm))
1514 /* Check if checkpoint file exists before doing continuation.
1515 * This way we can use identical input options for the first and subsequent runs...
1517 if( gmx_fexist_master(opt2fn_master("-cpi",nfile,fnm,cr),cr) )
1519 load_checkpoint(opt2fn_master("-cpi",nfile,fnm,cr),&fplog,
1520 cr,Flags & MD_PARTDEC,ddxyz,
1521 inputrec,state,&bReadRNG,&bReadEkin,
1522 (Flags & MD_APPENDFILES),
1523 (Flags & MD_APPENDFILESSET));
1527 Flags |= MD_READ_RNG;
1531 Flags |= MD_READ_EKIN;
1536 if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES))
1537 #ifdef GMX_THREAD_MPI
1538 /* With thread MPI only the master node/thread exists in mdrun.c,
1539 * therefore non-master nodes need to open the "seppot" log file here.
1541 || (!MASTER(cr) && (Flags & MD_SEPPOT))
1545 gmx_log_open(ftp2fn(efLOG,nfile,fnm),cr,!(Flags & MD_SEPPOT),
1549 /* override nsteps with value from cmdline */
1550 override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);
1554 copy_mat(state->box,box);
1559 gmx_bcast(sizeof(box),box,cr);
1562 /* Essential dynamics */
1563 if (opt2bSet("-ei",nfile,fnm))
1565 /* Open input and output files, allocate space for ED data structure */
1566 ed = ed_open(nfile,fnm,Flags,cr);
1569 if (PAR(cr) && !((Flags & MD_PARTDEC) ||
1570 EI_TPI(inputrec->eI) ||
1571 inputrec->eI == eiNM))
1573 cr->dd = init_domain_decomposition(fplog,cr,Flags,ddxyz,rdd,rconstr,
1574 dddlb_opt,dlb_scale,
1578 &ddbox,&npme_major,&npme_minor);
1580 make_dd_communicators(fplog,cr,dd_node_order);
1582 /* Set overallocation to avoid frequent reallocation of arrays */
1583 set_over_alloc_dd(TRUE);
1587 /* PME, if used, is done on all nodes with 1D decomposition */
1589 cr->duty = (DUTY_PP | DUTY_PME);
1592 if (!EI_TPI(inputrec->eI))
1594 npme_major = cr->nnodes;
1597 if (inputrec->ePBC == epbcSCREW)
1600 "pbc=%s is only implemented with domain decomposition",
1601 epbc_names[inputrec->ePBC]);
1607 /* After possible communicator splitting in make_dd_communicators.
1608 * we can set up the intra/inter node communication.
1610 gmx_setup_nodecomm(fplog,cr);
1613 /* Initialize per-node process ID and counters. */
1614 gmx_init_intra_counters(cr);
1617 md_print_info(cr,fplog,"Using %d MPI %s\n",
1619 #ifdef GMX_THREAD_MPI
1620 cr->nnodes==1 ? "thread" : "threads"
1622 cr->nnodes==1 ? "process" : "processes"
1627 gmx_omp_nthreads_init(fplog, cr,
1628 hwinfo->nthreads_hw_avail,
1629 hw_opt->nthreads_omp,
1630 hw_opt->nthreads_omp_pme,
1631 (cr->duty & DUTY_PP) == 0,
1632 inputrec->cutoff_scheme == ecutsVERLET);
1634 gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi, minf.bUseGPU);
1636 /* getting number of PP/PME threads
1637 PME: env variable should be read only on one node to make sure it is
1638 identical everywhere;
1640 /* TODO nthreads_pp is only used for pinning threads.
1641 * This is a temporary solution until we have a hw topology library.
1643 nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
1644 nthreads_pme = gmx_omp_nthreads_get(emntPME);
1646 wcycle = wallcycle_init(fplog,resetstep,cr,nthreads_pp,nthreads_pme);
1650 /* Master synchronizes its value of reset_counters with all nodes
1651 * including PME only nodes */
1652 reset_counters = wcycle_get_reset_counters(wcycle);
1653 gmx_bcast_sim(sizeof(reset_counters),&reset_counters,cr);
1654 wcycle_set_reset_counters(wcycle, reset_counters);
1658 if (cr->duty & DUTY_PP)
1660 /* For domain decomposition we allocate dynamically
1661 * in dd_partition_system.
1663 if (DOMAINDECOMP(cr))
1665 bcast_state_setup(cr,state);
1671 bcast_state(cr,state,TRUE);
1675 /* Initiate forcerecord */
1677 fr->hwinfo = hwinfo;
1678 init_forcerec(fplog,oenv,fr,fcd,inputrec,mtop,cr,box,FALSE,
1679 opt2fn("-table",nfile,fnm),
1680 opt2fn("-tabletf",nfile,fnm),
1681 opt2fn("-tablep",nfile,fnm),
1682 opt2fn("-tableb",nfile,fnm),
1686 /* version for PCA_NOT_READ_NODE (see md.c) */
1687 /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
1688 "nofile","nofile","nofile","nofile",FALSE,pforce);
1690 fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);
1692 /* Initialize QM-MM */
1695 init_QMMMrec(cr,box,mtop,inputrec,fr);
1698 /* Initialize the mdatoms structure.
1699 * mdatoms is not filled with atom data,
1700 * as this can not be done now with domain decomposition.
1702 mdatoms = init_mdatoms(fplog,mtop,inputrec->efep!=efepNO);
1704 /* Initialize the virtual site communication */
1705 vsite = init_vsite(mtop,cr,FALSE);
1707 calc_shifts(box,fr->shift_vec);
1709 /* With periodic molecules the charge groups should be whole at start up
1710 * and the virtual sites should not be far from their proper positions.
1712 if (!inputrec->bContinuation && MASTER(cr) &&
1713 !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
1715 /* Make molecules whole at start of run */
1716 if (fr->ePBC != epbcNONE)
1718 do_pbc_first_mtop(fplog,inputrec->ePBC,box,mtop,state->x);
1722 /* Correct initial vsite positions are required
1723 * for the initial distribution in the domain decomposition
1724 * and for the initial shell prediction.
1726 construct_vsites_mtop(fplog,vsite,mtop,state->x);
1730 if (EEL_PME(fr->eeltype))
1732 ewaldcoeff = fr->ewaldcoeff;
1733 pmedata = &fr->pmedata;
1742 /* This is a PME only node */
1744 /* We don't need the state */
1747 ewaldcoeff = calc_ewaldcoeff(inputrec->rcoulomb, inputrec->ewald_rtol);
1751 /* Before setting affinity, check whether the affinity has changed
1752 * - which indicates that probably the OpenMP library has changed it since
1753 * we first checked). */
1754 check_cpu_affinity_set(fplog, cr, hw_opt, hwinfo->nthreads_hw_avail, TRUE);
1756 /* Set the CPU affinity */
1757 set_cpu_affinity(fplog,cr,hw_opt,nthreads_pme,hwinfo,inputrec);
1759 /* Initiate PME if necessary,
1760 * either on all nodes or on dedicated PME nodes only. */
1761 if (EEL_PME(inputrec->coulombtype))
1765 nChargePerturbed = mdatoms->nChargePerturbed;
1767 if (cr->npmenodes > 0)
1769 /* The PME only nodes need to know nChargePerturbed */
1770 gmx_bcast_sim(sizeof(nChargePerturbed),&nChargePerturbed,cr);
1773 if (cr->duty & DUTY_PME)
1775 status = gmx_pme_init(pmedata,cr,npme_major,npme_minor,inputrec,
1776 mtop ? mtop->natoms : 0,nChargePerturbed,
1777 (Flags & MD_REPRODUCIBLE),nthreads_pme);
1780 gmx_fatal(FARGS,"Error %d initializing PME",status);
1786 if (integrator[inputrec->eI].func == do_md
1789 integrator[inputrec->eI].func == do_md_openmm
1793 /* Turn on signal handling on all nodes */
1795 * (A user signal from the PME nodes (if any)
1796 * is communicated to the PP nodes.
1798 signal_handler_install();
1801 if (cr->duty & DUTY_PP)
1803 if (inputrec->ePull != epullNO)
1805 /* Initialize pull code */
1806 init_pull(fplog,inputrec,nfile,fnm,mtop,cr,oenv, inputrec->fepvals->init_lambda,
1807 EI_DYNAMICS(inputrec->eI) && MASTER(cr),Flags);
1812 /* Initialize enforced rotation code */
1813 init_rot(fplog,inputrec,nfile,fnm,cr,state->x,box,mtop,oenv,
1817 constr = init_constraints(fplog,mtop,inputrec,ed,state,cr);
1819 if (DOMAINDECOMP(cr))
1821 dd_init_bondeds(fplog,cr->dd,mtop,vsite,constr,inputrec,
1822 Flags & MD_DDBONDCHECK,fr->cginfo_mb);
1824 set_dd_parameters(fplog,cr->dd,dlb_scale,inputrec,fr,&ddbox);
1826 setup_dd_grid(fplog,cr->dd);
1829 /* Now do whatever the user wants us to do (how flexible...) */
1830 integrator[inputrec->eI].func(fplog,cr,nfile,fnm,
1831 oenv,bVerbose,bCompact,
1834 nstepout,inputrec,mtop,
1836 mdatoms,nrnb,wcycle,ed,fr,
1837 repl_ex_nst,repl_ex_nex,repl_ex_seed,
1839 cpt_period,max_hours,
1844 if (inputrec->ePull != epullNO)
1846 finish_pull(fplog,inputrec->pull);
1851 finish_rot(fplog,inputrec->rot);
1858 gmx_pmeonly(*pmedata,cr,nrnb,wcycle,ewaldcoeff,FALSE,inputrec);
1861 if (EI_DYNAMICS(inputrec->eI) || EI_TPI(inputrec->eI))
1863 /* Some timing stats */
1866 if (runtime.proc == 0)
1868 runtime.proc = runtime.real;
1877 wallcycle_stop(wcycle,ewcRUN);
1879 /* Finish up, write some stuff
1880 * if rerunMD, don't write last frame again
1882 finish_run(fplog,cr,ftp2fn(efSTO,nfile,fnm),
1883 inputrec,nrnb,wcycle,&runtime,
1884 fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ?
1885 nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL,
1887 EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
1889 if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU)
1891 char gpu_err_str[STRLEN];
1893 /* free GPU memory and uninitialize GPU (by destroying the context) */
1894 nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
1896 if (!free_gpu(gpu_err_str))
1898 gmx_warning("On node %d failed to free GPU #%d: %s",
1899 cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
1903 if (opt2bSet("-membed",nfile,fnm))
1908 #ifdef GMX_THREAD_MPI
1909 if (PAR(cr) && SIMMASTER(cr))
1912 gmx_hardware_info_free(hwinfo);
1915 /* Does what it says */
1916 print_date_and_time(fplog,cr->nodeid,"Finished mdrun",&runtime);
1918 /* Close logfile already here if we were appending to it */
1919 if (MASTER(cr) && (Flags & MD_APPENDFILES))
1921 gmx_log_close(fplog);
1924 rc=(int)gmx_get_stop_condition();
1926 #ifdef GMX_THREAD_MPI
1927 /* we need to join all threads. The sub-threads join when they
1928 exit this function, but the master thread needs to be told to
1930 if (PAR(cr) && MASTER(cr))