1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
4 * This source code is part of
8 * GROningen MAchine for Chemical Simulations
11 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
12 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
13 * Copyright (c) 2001-2004, The GROMACS development team,
14 * check out http://www.gromacs.org for more information.
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version 2
19 * of the License, or (at your option) any later version.
21 * If you want to redistribute modifications, please consider that
22 * scientific software is very special. Version control is crucial -
23 * bugs must be traceable. We will be happy to consider code for
24 * inclusion in the official distribution, but derived work must not
25 * be called official GROMACS. Details are found in the README & COPYING
26 * files - if they are missing, get the official version at www.gromacs.org.
28 * To help us fund GROMACS development, we humbly ask that you cite
29 * the papers on the package - you can find them in the top README file.
31 * For more info, check our website at http://www.gromacs.org
34 * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
42 #include <sys/syscall.h>
57 #include "md_logging.h"
58 #include "md_support.h"
61 #include "pull_rotation.h"
74 #include "checkpoint.h"
75 #include "mtop_util.h"
76 #include "sighandler.h"
79 #include "gmx_detect_hardware.h"
80 #include "gmx_omp_nthreads.h"
81 #include "pull_rotation.h"
82 #include "calc_verletbuf.h"
83 #include "nbnxn_search.h"
84 #include "../mdlib/nbnxn_consts.h"
85 #include "gmx_fatal_collective.h"
102 #include "md_openmm.h"
105 #include "gpu_utils.h"
106 #include "nbnxn_cuda_data_mgmt.h"
/* NOTE(review): excerpted dump — embedded original line numbers jump, so
 * interior lines (struct/#else/#endif) are missing; code kept verbatim. */
/* Member of the gmx_intp_t wrapper: pointer to an integrator driver. */
109 gmx_integrator_t *func;
112 /* The array should match the eI array in include/types/enums.h */
/* Dispatch table: one driver per eI integrator enum value. With OpenMM every
 * slot points at do_md_openmm; otherwise MD/minimizers/NM/TPI get their own. */
113 #ifdef GMX_OPENMM /* FIXME do_md_openmm needs fixing */
114 const gmx_intp_t integrator[eiNR] = { {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm},{do_md_openmm}};
116 const gmx_intp_t integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md},{do_md}};
/* Static storage for the deform reference box read from the tpr; written once
 * under deform_init_box_mutex (see DEFORM handling further down in the file). */
119 gmx_large_int_t deform_init_init_step_tpx;
120 matrix deform_init_box_tpx;
121 #ifdef GMX_THREAD_MPI
122 tMPI_Thread_mutex_t deform_init_box_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
126 #ifdef GMX_THREAD_MPI
/* Bundle of mdrunner() arguments passed (as void*) to each spawned tMPI
 * thread's start function; 'ret' carries mdrunner's return value back.
 * NOTE(review): most fields are elided in this excerpt — the full struct
 * mirrors the mdrunner() parameter list (see mdrunner_start_fn below). */
127 struct mdrunner_arglist
129 gmx_hw_opt_t *hw_opt;
142 const char *dddlb_opt;
147 const char *nbpu_opt;
158 const char *deviceOptions;
160 int ret; /* return value */
164 /* The function used for spawning threads. Extracts the mdrunner()
165 arguments from its one argument and calls mdrunner(), after making
167 static void mdrunner_start_fn(void *arg)
169 struct mdrunner_arglist *mda=(struct mdrunner_arglist*)arg;
170 struct mdrunner_arglist mc=*mda; /* copy the arg list to make sure
171 that it's thread-local. This doesn't
172 copy pointed-to items, of course,
173 but those are all const. */
174 t_commrec *cr; /* we need a local version of this */
/* Per-thread copy of the filename table; presumably freed by mdrunner or
 * leaked once per run — TODO confirm against the full source. */
178 fnm = dup_tfn(mc.nfile, mc.fnm);
/* Thread-local communication record derived from the master's cr. */
180 cr = init_par_threads(mc.cr);
/* Re-enter mdrunner() on this thread; the return value is stored back into
 * the shared arglist so the spawner can retrieve it. */
187 mda->ret=mdrunner(mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv,
188 mc.bVerbose, mc.bCompact, mc.nstglobalcomm,
189 mc.ddxyz, mc.dd_node_order, mc.rdd,
190 mc.rconstr, mc.dddlb_opt, mc.dlb_scale,
191 mc.ddcsx, mc.ddcsy, mc.ddcsz,
193 mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
194 mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce,
195 mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.Flags);
198 /* called by mdrunner() to start a specific number of threads (including
199 the main thread) for thread-parallel runs. This in turn calls mdrunner()
201 All options besides nthreads are the same as for mdrunner(). */
/* Returns the new t_commrec for the main thread (crn), or presumably NULL on
 * failure — the return path is elided in this excerpt; TODO confirm. */
202 static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
203 FILE *fplog,t_commrec *cr,int nfile,
204 const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
205 gmx_bool bCompact, int nstglobalcomm,
206 ivec ddxyz,int dd_node_order,real rdd,real rconstr,
207 const char *dddlb_opt,real dlb_scale,
208 const char *ddcsx,const char *ddcsy,const char *ddcsz,
209 const char *nbpu_opt,
210 int nsteps_cmdline, int nstepout,int resetstep,
211 int nmultisim,int repl_ex_nst,int repl_ex_nex, int repl_ex_seed,
212 real pforce,real cpt_period, real max_hours,
213 const char *deviceOptions, unsigned long Flags)
216 struct mdrunner_arglist *mda;
217 t_commrec *crn; /* the new commrec */
220 /* first check whether we even need to start tMPI */
221 if (hw_opt->nthreads_tmpi < 2)
226 /* a few small, one-time, almost unavoidable memory leaks: */
228 fnmn=dup_tfn(nfile, fnm);
230 /* fill the data structure to pass as void pointer to thread start fn */
/* mda is heap-allocated so it outlives this call; the spawned threads copy it
 * (see mdrunner_start_fn). Assignments below mirror the parameter list. */
237 mda->bVerbose=bVerbose;
238 mda->bCompact=bCompact;
239 mda->nstglobalcomm=nstglobalcomm;
240 mda->ddxyz[XX]=ddxyz[XX];
241 mda->ddxyz[YY]=ddxyz[YY];
242 mda->ddxyz[ZZ]=ddxyz[ZZ];
243 mda->dd_node_order=dd_node_order;
245 mda->rconstr=rconstr;
246 mda->dddlb_opt=dddlb_opt;
247 mda->dlb_scale=dlb_scale;
251 mda->nbpu_opt=nbpu_opt;
252 mda->nsteps_cmdline=nsteps_cmdline;
253 mda->nstepout=nstepout;
254 mda->resetstep=resetstep;
255 mda->nmultisim=nmultisim;
256 mda->repl_ex_nst=repl_ex_nst;
257 mda->repl_ex_nex=repl_ex_nex;
258 mda->repl_ex_seed=repl_ex_seed;
260 mda->cpt_period=cpt_period;
261 mda->max_hours=max_hours;
262 mda->deviceOptions=deviceOptions;
265 fprintf(stderr, "Starting %d tMPI threads\n",hw_opt->nthreads_tmpi);
267 /* now spawn new threads that start mdrunner_start_fn(), while
268 the main thread returns */
269 ret=tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi,
270 mdrunner_start_fn, (void*)(mda) );
271 if (ret!=TMPI_SUCCESS)
274 /* make a new comm_rec to reflect the new situation */
275 crn=init_par_threads(cr);
/* Decide how many thread-MPI ranks to use given the total thread budget,
 * the GPU count and the requested OpenMP threads per rank.
 * Branches (visible here): with GPUs -> one tMPI rank per GPU, capped by
 * nthreads_tot; with -ntomp set -> nthreads_tot/nthreads_omp; otherwise all
 * threads become tMPI ranks (no OpenMP). Returns the chosen rank count. */
280 static int get_tmpi_omp_thread_distribution(const gmx_hw_opt_t *hw_opt,
286 /* There are no separate PME nodes here, as we ensured in
287 * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
288 * and a conditional ensures we would not have ended up here.
289 * Note that separate PME nodes might be switched on later.
293 nthreads_tmpi = ngpu;
294 if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
296 nthreads_tmpi = nthreads_tot;
299 else if (hw_opt->nthreads_omp > 0)
301 if (hw_opt->nthreads_omp > nthreads_tot)
303 gmx_fatal(FARGS,"More OpenMP threads requested (%d) than the total number of threads requested (%d)",hw_opt->nthreads_omp,nthreads_tot);
/* Integer division: any remainder threads are silently dropped here;
 * divisibility is checked earlier in check_and_update_hw_opt. */
305 nthreads_tmpi = nthreads_tot/hw_opt->nthreads_omp;
309 /* TODO choose nthreads_omp based on hardware topology
310 when we have a hardware topology detection library */
311 /* Don't use OpenMP parallelization */
312 nthreads_tmpi = nthreads_tot;
315 return nthreads_tmpi;
319 /* Get the number of threads to use for thread-MPI based on how many
320 * were requested, which algorithms we're using,
321 * and how many particles there are.
322 * At the point we have already called check_and_update_hw_opt.
323 * Thus all options should be internally consistent and consistent
324 * with the hardware, except that ntmpi could be larger than #GPU.
326 static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
327 gmx_hw_opt_t *hw_opt,
328 t_inputrec *inputrec, gmx_mtop_t *mtop,
332 int nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu;
333 int min_atoms_per_mpi_thread;
/* User pinned -ntmpi explicitly: honor it without further heuristics. */
338 if (hw_opt->nthreads_tmpi > 0)
340 /* Trivial, return right away */
341 return hw_opt->nthreads_tmpi;
344 /* How many total (#tMPI*#OpenMP) threads can we start? */
345 if (hw_opt->nthreads_tot > 0)
347 nthreads_tot_max = hw_opt->nthreads_tot;
/* No user limit: fall back to the hardware thread count tMPI detects. */
351 nthreads_tot_max = tMPI_Thread_get_hw_number();
354 bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU);
357 ngpu = hwinfo->gpu_info.ncuda_dev_use;
365 get_tmpi_omp_thread_distribution(hw_opt,nthreads_tot_max,ngpu);
366 if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
369 /* Steps are divided over the nodes iso splitting the atoms */
370 min_atoms_per_mpi_thread = 0;
/* GPU runs need more atoms per rank to amortize overhead; elided branches
 * select between the GPU and plain-MPI minima. */
376 min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU;
380 min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD;
384 /* Check if an algorithm does not support parallel simulation. */
385 if (nthreads_tmpi != 1 &&
386 ( inputrec->eI == eiLBFGS ||
387 inputrec->coulombtype == eelEWALD ) )
391 md_print_warn(cr,fplog,"The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
392 if (hw_opt->nthreads_tmpi > nthreads_tmpi)
394 gmx_fatal(FARGS,"You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
397 else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread)
399 /* the thread number was chosen automatically, but there are too many
400 threads (too few atoms per thread) */
401 nthreads_new = max(1,mtop->natoms/min_atoms_per_mpi_thread);
403 if (nthreads_new > 8 || (nthreads_tmpi == 8 && nthreads_new > 4))
405 /* TODO replace this once we have proper HT detection
406 * Use only multiples of 4 above 8 threads
407 * or with an 8-core processor
408 * (to avoid 6 threads on 8 core processors with 4 real cores).
410 nthreads_new = (nthreads_new/4)*4;
412 else if (nthreads_new > 4)
414 /* Avoid 5 or 7 threads */
415 nthreads_new = (nthreads_new/2)*2;
418 nthreads_tmpi = nthreads_new;
420 fprintf(stderr,"\n");
421 fprintf(stderr,"NOTE: Parallelization is limited by the small number of atoms,\n");
422 fprintf(stderr," only starting %d thread-MPI threads.\n",nthreads_tmpi);
423 fprintf(stderr," You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
426 return nthreads_tmpi;
428 #endif /* GMX_THREAD_MPI */
/* Tunables for the GPU nstlist heuristic in increase_nstlist() below. */
431 /* Environment variable for setting nstlist */
432 static const char* NSTLIST_ENVVAR = "GMX_NSTLIST";
433 /* Try to increase nstlist when using a GPU with nstlist less than this */
434 static const int NSTLIST_GPU_ENOUGH = 20;
435 /* Increase nstlist until the non-bonded cost increases more than this factor */
436 static const float NBNXN_GPU_LIST_OK_FAC = 1.25;
437 /* Don't increase nstlist beyond a non-bonded cost increases of this factor */
438 static const float NBNXN_GPU_LIST_MAX_FAC = 1.40;
440 /* Try to increase nstlist when running on a GPU */
/* Walks the candidate list nstl[] (or takes GMX_NSTLIST from the environment),
 * recomputing the Verlet buffer for each nstlist and accepting the largest
 * value whose rlist still fits the box / DD cutoff and whose cost growth stays
 * under NBNXN_GPU_LIST_MAX_FAC. Mutates ir->nstlist, ir->rlist, ir->rlistlong. */
441 static void increase_nstlist(FILE *fp,t_commrec *cr,
442 t_inputrec *ir,const gmx_mtop_t *mtop,matrix box)
445 int nstlist_orig,nstlist_prev;
446 verletbuf_list_setup_t ls;
447 real rlist_inc,rlist_ok,rlist_max,rlist_new,rlist_prev;
450 gmx_bool bBox,bDD,bCont;
451 const char *nstl_fmt="\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
452 const char *vbd_err="Can not increase nstlist for GPU run because verlet-buffer-drift is not set or used";
453 const char *box_err="Can not increase nstlist for GPU run because the box is too small";
454 const char *dd_err ="Can not increase nstlist for GPU run because of domain decomposition limitations";
457 /* Number of + nstlist alternative values to try when switching */
458 const int nstl[]={ 20, 25, 40, 50 };
459 #define NNSTL sizeof(nstl)/sizeof(nstl[0])
461 env = getenv(NSTLIST_ENVVAR);
466 fprintf(fp,nstl_fmt,ir->nstlist);
/* verletbuf_drift == 0 can only come from an old tpr; bail out early. */
470 if (ir->verletbuf_drift == 0)
472 gmx_fatal(FARGS,"You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
/* drift < 0 means the buffer-drift mechanism is disabled: warn and return. */
475 if (ir->verletbuf_drift < 0)
479 fprintf(stderr,"%s\n",vbd_err);
483 fprintf(fp,"%s\n",vbd_err);
489 nstlist_orig = ir->nstlist;
492 sprintf(buf,"Getting nstlist from environment variable GMX_NSTLIST=%s",env);
495 fprintf(stderr,"%s\n",buf);
499 fprintf(fp,"%s\n",buf);
/* NOTE(review): env value parsed with sscanf without checking the return;
 * a non-numeric GMX_NSTLIST would leave ir->nstlist unchanged silently. */
501 sscanf(env,"%d",&ir->nstlist);
504 verletbuf_get_list_setup(TRUE,&ls);
506 /* Allow rlist to make the list double the size of the cut-off sphere */
507 rlist_inc = nbnxn_get_rlist_effective_inc(NBNXN_GPU_CLUSTER_SIZE,mtop->natoms/det(box));
/* Cube roots: the cost factors bound list *volume*, so convert to radius. */
508 rlist_ok = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_OK_FAC,1.0/3.0) - rlist_inc;
509 rlist_max = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_MAX_FAC,1.0/3.0) - rlist_inc;
512 fprintf(debug,"GPU nstlist tuning: rlist_inc %.3f rlist_max %.3f\n",
513 rlist_inc,rlist_max);
517 nstlist_prev = nstlist_orig;
518 rlist_prev = ir->rlist;
/* Trial loop body (loop header elided in this excerpt). */
523 ir->nstlist = nstl[i];
526 /* Set the pair-list buffer size in ir */
527 calc_verlet_buffer_size(mtop,det(box),ir,ir->verletbuf_drift,&ls,
530 /* Does rlist fit in the box? */
531 bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC,box));
533 if (bBox && DOMAINDECOMP(cr))
535 /* Check if rlist fits in the domain decomposition */
536 if (inputrec2nboundeddim(ir) < DIM)
538 gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
540 copy_mat(box,state_tmp.box);
541 bDD = change_dd_cutoff(cr,&state_tmp,ir,rlist_new);
548 if (bBox && bDD && rlist_new <= rlist_max)
550 /* Increase nstlist */
551 nstlist_prev = ir->nstlist;
552 rlist_prev = rlist_new;
553 bCont = (i+1 < NNSTL && rlist_new < rlist_ok);
557 /* Stick with the previous nstlist */
558 ir->nstlist = nstlist_prev;
559 rlist_new = rlist_prev;
/* Failure path: report which constraint (box vs DD) blocked the increase. */
571 gmx_warning(!bBox ? box_err : dd_err);
574 fprintf(fp,"\n%s\n",bBox ? box_err : dd_err);
576 ir->nstlist = nstlist_orig;
578 else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
580 sprintf(buf,"Changing nstlist from %d to %d, rlist from %g to %g",
581 nstlist_orig,ir->nstlist,
582 ir->rlist,rlist_new);
585 fprintf(stderr,"%s\n\n",buf);
589 fprintf(fp,"%s\n\n",buf);
591 ir->rlist = rlist_new;
592 ir->rlistlong = rlist_new;
/* Set up the Verlet cut-off scheme for this run: decide GPU use (via *bUseGPU),
 * recompute the Verlet buffer / rlist for the current drift target, and, when
 * a GPU (or GMX_NSTLIST) is in play, try to enlarge nstlist for performance.
 * Mutates ir->rlist and ir->rlistlong. */
596 static void prepare_verlet_scheme(FILE *fplog,
597 gmx_hw_info_t *hwinfo,
599 gmx_hw_opt_t *hw_opt,
600 const char *nbpu_opt,
602 const gmx_mtop_t *mtop,
606 /* Here we only check for GPU usage on the MPI master process,
607 * as here we don't know how many GPUs we will use yet.
608 * We check for a GPU on all processes later.
610 *bUseGPU = hwinfo->bCanUseGPU || (getenv("GMX_EMULATE_GPU") != NULL);
612 if (ir->verletbuf_drift > 0)
614 /* Update the Verlet buffer size for the current run setup */
615 verletbuf_list_setup_t ls;
618 /* Here we assume CPU acceleration is on. But as currently
619 * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
620 * and 4x2 gives a larger buffer than 4x4, this is ok.
622 verletbuf_get_list_setup(*bUseGPU,&ls);
624 calc_verlet_buffer_size(mtop,det(box),ir,
625 ir->verletbuf_drift,&ls,
626 if (rlist_new != ir->rlist)
631 fprintf(fplog,"\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
633 ls.cluster_size_i,ls.cluster_size_j);
635 ir->rlist = rlist_new;
636 ir->rlistlong = rlist_new;
640 /* With GPU or emulation we should check nstlist for performance */
641 if ((EI_DYNAMICS(ir->eI) &&
643 ir->nstlist < NSTLIST_GPU_ENOUGH) ||
644 getenv(NSTLIST_ENVVAR) != NULL)
646 /* Choose a better nstlist */
647 increase_nstlist(fplog,cr,ir,mtop,box);
/* Convert an old group-scheme tpr to the Verlet cut-off scheme in place:
 * rejects unsupported setups (unequal cut-offs, user potentials, non-3D pbc,
 * free energy, implicit solvent), rewrites switched/shifted interactions to
 * plain cut-off or RF/PME, sets a test-only drift target, recomputes rlist,
 * and strips charge groups from the topology. */
651 static void convert_to_verlet_scheme(FILE *fplog,
653 gmx_mtop_t *mtop,real box_vol)
655 char *conv_mesg="Converting input file with group cut-off scheme to the Verlet cut-off scheme";
657 md_print_warn(NULL,fplog,"%s\n",conv_mesg);
659 ir->cutoff_scheme = ecutsVERLET;
660 ir->verletbuf_drift = 0.005;
662 if (ir->rcoulomb != ir->rvdw)
664 gmx_fatal(FARGS,"The VdW and Coulomb cut-offs are different, whereas the Verlet scheme only supports equal cut-offs");
667 if (ir->vdwtype == evdwUSER || EEL_USER(ir->coulombtype))
669 gmx_fatal(FARGS,"User non-bonded potentials are not (yet) supported with the Verlet scheme");
671 else if (EVDW_SWITCHED(ir->vdwtype) || EEL_SWITCHED(ir->coulombtype))
673 md_print_warn(NULL,fplog,"Converting switched or shifted interactions to a shifted potential (without force shift), this will lead to slightly different interaction potentials");
675 if (EVDW_SWITCHED(ir->vdwtype))
677 ir->vdwtype = evdwCUT;
679 if (EEL_SWITCHED(ir->coulombtype))
681 if (EEL_FULL(ir->coulombtype))
683 /* With full electrostatic only PME can be switched */
684 ir->coulombtype = eelPME;
688 md_print_warn(NULL,fplog,"NOTE: Replacing %s electrostatics with reaction-field with epsilon-rf=inf\n",eel_names[ir->coulombtype]);
689 ir->coulombtype = eelRF;
/* epsilon_rf = 0 encodes epsilon-rf = infinity for reaction-field. */
690 ir->epsilon_rf = 0.0;
694 /* We set the target energy drift to a small number.
695 * Note that this is only for testing. For production the user
696 * should think about this and set the mdp options.
698 ir->verletbuf_drift = 1e-4;
701 if (inputrec2nboundeddim(ir) != 3)
703 gmx_fatal(FARGS,"Can only convert old tpr files to the Verlet cut-off scheme with 3D pbc");
706 if (ir->efep != efepNO || ir->implicit_solvent != eisNO)
708 gmx_fatal(FARGS,"Will not convert old tpr files to the Verlet cut-off scheme with free-energy calculations or implicit solvent");
/* Dynamic run without plain-MD+no-thermostat: use the drift-based buffer. */
711 if (EI_DYNAMICS(ir->eI) && !(EI_MD(ir->eI) && ir->etc == etcNO))
713 verletbuf_list_setup_t ls;
715 verletbuf_get_list_setup(FALSE,&ls);
716 calc_verlet_buffer_size(mtop,box_vol,ir,ir->verletbuf_drift,&ls,
/* Fallback branch: disable drift targeting, use a fixed 5% rlist margin. */
721 ir->verletbuf_drift = -1;
722 ir->rlist = 1.05*max(ir->rvdw,ir->rcoulomb);
725 gmx_mtop_remove_chargegroups(mtop);
729 /* Set CPU affinity. Can be important for performance.
730 On some systems (e.g. Cray) CPU Affinity is set by default.
731 But default assigning doesn't work (well) with only some ranks
732 having threads. This causes very low performance.
733 External tools have cumbersome syntax for setting affinity
734 in the case that only some ranks have threads.
735 Thus it is important that GROMACS sets the affinity internally
736 if only PME is using threads.
/* Linux+OpenMP only (guarded below): pins each OpenMP thread of this rank to
 * a core, computing a node-global thread index via an MPI exclusive scan when
 * several ranks share the node. Honors -pinoffset and a Hyper-Threading
 * layout (pairs of threads on the same physical core). */
738 static void set_cpu_affinity(FILE *fplog,
740 const gmx_hw_opt_t *hw_opt,
742 const gmx_hw_info_t *hwinfo,
743 const t_inputrec *inputrec)
745 #ifdef GMX_OPENMP /* TODO: actually we could do this even without OpenMP?! */
746 #ifdef __linux /* TODO: only linux? why not everywhere if sched_setaffinity is available */
747 if (hw_opt->bThreadPinning)
749 int thread, nthread_local, nthread_node, nthread_hw_max, nphyscore;
753 /* threads on this MPI process or TMPI thread */
754 if (cr->duty & DUTY_PP)
756 nthread_local = gmx_omp_nthreads_get(emntNonbonded);
760 nthread_local = gmx_omp_nthreads_get(emntPME);
763 /* map the current process to cores */
765 nthread_node = nthread_local;
767 if (PAR(cr) || MULTISIM(cr))
769 /* We need to determine a scan of the thread counts in this
/* Split COMM_WORLD by host so the scan runs only over ranks on this node. */
774 MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),cr->nodeid_intra,
776 MPI_Scan(&nthread_local,&thread,1,MPI_INT,MPI_SUM,comm_intra);
777 /* MPI_Scan is inclusive, but here we need exclusive */
778 thread -= nthread_local;
779 /* Get the total number of threads on this physical node */
780 MPI_Allreduce(&nthread_local,&nthread_node,1,MPI_INT,MPI_SUM,comm_intra);
781 MPI_Comm_free(&comm_intra);
786 if (hw_opt->core_pinning_offset > 0)
788 offset = hw_opt->core_pinning_offset;
791 fprintf(stderr, "Applying core pinning offset %d\n", offset);
795 fprintf(fplog, "Applying core pinning offset %d\n", offset);
799 /* With Intel Hyper-Threading enabled, we want to pin consecutive
800 * threads to physical cores when using more threads than physical
801 * cores or when the user requests so.
803 nthread_hw_max = hwinfo->nthreads_hw_avail;
805 if (hw_opt->bPinHyperthreading ||
806 (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
807 nthread_node > nthread_hw_max/2 && getenv("GMX_DISABLE_PINHT") == NULL))
809 if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) != GMX_CPUID_X86_SMT_ENABLED)
811 /* We print to stderr on all processes, as we might have
812 * different settings on different physical nodes.
814 if (gmx_cpuid_vendor(hwinfo->cpuid_info) != GMX_CPUID_VENDOR_INTEL)
816 md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
817 "but non-Intel CPU detected (vendor: %s)\n",
818 gmx_cpuid_vendor_string[gmx_cpuid_vendor(hwinfo->cpuid_info)]);
822 md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
823 "but the CPU detected does not have Intel Hyper-Threading support "
824 "(or it is turned off)\n");
/* Assumes exactly 2 hardware threads per physical core — TODO confirm. */
827 nphyscore = nthread_hw_max/2;
831 fprintf(stderr, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
836 fprintf(fplog, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
841 /* set the per-thread affinity */
842 #pragma omp parallel firstprivate(thread) num_threads(nthread_local)
848 thread += gmx_omp_get_thread_num();
851 core = offset + thread;
855 /* Lock pairs of threads to the same hyperthreaded core */
856 core = offset + thread/2 + (thread % 2)*nphyscore;
858 CPU_SET(core, &mask);
/* Pin the calling OS thread (gettid), not the whole process (getpid). */
859 sched_setaffinity((pid_t) syscall (SYS_gettid), sizeof(cpu_set_t), &mask);
863 #endif /* GMX_OPENMP */
/* Validate and normalize the user's thread-count options (-nt/-ntmpi/-ntomp/
 * -ntomp_pme) against the build configuration and cut-off scheme, filling in
 * derived values (e.g. ntomp from nt/ntmpi) and fataling on inconsistencies.
 * Mutates hw_opt in place. */
867 static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt,
/* GMX_OMP_NUM_THREADS-style environment override applied first. */
870 gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp);
872 #ifndef GMX_THREAD_MPI
873 if (hw_opt->nthreads_tot > 0)
875 gmx_fatal(FARGS,"Setting the total number of threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
877 if (hw_opt->nthreads_tmpi > 0)
879 gmx_fatal(FARGS,"Setting the number of thread-MPI threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
883 if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
885 /* We have the same number of OpenMP threads for PP and PME processes,
886 * thus we can perform several consistency checks.
888 if (hw_opt->nthreads_tmpi > 0 &&
889 hw_opt->nthreads_omp > 0 &&
890 hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
892 gmx_fatal(FARGS,"The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
893 hw_opt->nthreads_tot,hw_opt->nthreads_tmpi,hw_opt->nthreads_omp);
896 if (hw_opt->nthreads_tmpi > 0 &&
897 hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
899 gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
900 hw_opt->nthreads_tot,hw_opt->nthreads_tmpi);
903 if (hw_opt->nthreads_omp > 0 &&
904 hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
906 gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
907 hw_opt->nthreads_tot,hw_opt->nthreads_omp);
/* Only -nt and -ntmpi given: derive -ntomp (divisibility checked above). */
910 if (hw_opt->nthreads_tmpi > 0 &&
911 hw_opt->nthreads_omp <= 0)
913 hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
918 if (hw_opt->nthreads_omp > 1)
920 gmx_fatal(FARGS,"OpenMP threads are requested, but Gromacs was compiled without OpenMP support");
924 if (cutoff_scheme == ecutsGROUP)
926 /* We only have OpenMP support for PME only nodes */
927 if (hw_opt->nthreads_omp > 1)
929 gmx_fatal(FARGS,"OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
930 ecutscheme_names[cutoff_scheme],
931 ecutscheme_names[ecutsVERLET]);
933 hw_opt->nthreads_omp = 1;
936 if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
938 gmx_fatal(FARGS,"You need to specify -ntomp in addition to -ntomp_pme");
941 if (hw_opt->nthreads_tot == 1)
943 hw_opt->nthreads_tmpi = 1;
945 if (hw_opt->nthreads_omp > 1)
947 gmx_fatal(FARGS,"You requested %d OpenMP threads with %d total threads",
948 hw_opt->nthreads_tmpi,hw_opt->nthreads_tot);
950 hw_opt->nthreads_omp = 1;
/* Default the PME OpenMP count to the PP count when not set explicitly. */
953 if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
955 hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
960 fprintf(debug,"hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
961 hw_opt->nthreads_tot,
962 hw_opt->nthreads_tmpi,
963 hw_opt->nthreads_omp,
964 hw_opt->nthreads_omp_pme,
965 hw_opt->gpu_id!=NULL ? hw_opt->gpu_id : "");
971 /* Override the value in inputrec with value passed on the command line (if any) */
/* -2 is the "not set" sentinel for -nsteps; any other value (including -1,
 * meaning "run forever") replaces ir->nsteps, with a warning that includes
 * the time in ps for dynamic integrators. */
972 static void override_nsteps_cmdline(FILE *fplog,
980 /* override with anything else than the default -2 */
981 if (nsteps_cmdline > -2)
985 ir->nsteps = nsteps_cmdline;
986 if (EI_DYNAMICS(ir->eI))
988 sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps, %.3f ps",
989 nsteps_cmdline, nsteps_cmdline*ir->delta_t);
993 sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps",
997 md_print_warn(cr, fplog, "%s\n", stmp);
1001 /* Data structure set by SIMMASTER which needs to be passed to all nodes
1002 * before the other nodes have read the tpx file and called gmx_detect_hardware.
/* NOTE(review): the struct header/typedef is elided in this excerpt; these are
 * its fields (used as 'minf' in mdrunner below, broadcast via gmx_bcast_sim). */
1005 int cutoff_scheme; /* The cutoff-scheme from inputrec_t */
1006 gmx_bool bUseGPU; /* Use GPU or GPU emulation */
1009 int mdrunner(gmx_hw_opt_t *hw_opt,
1010 FILE *fplog,t_commrec *cr,int nfile,
1011 const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
1012 gmx_bool bCompact, int nstglobalcomm,
1013 ivec ddxyz,int dd_node_order,real rdd,real rconstr,
1014 const char *dddlb_opt,real dlb_scale,
1015 const char *ddcsx,const char *ddcsy,const char *ddcsz,
1016 const char *nbpu_opt,
1017 int nsteps_cmdline, int nstepout,int resetstep,
1018 int nmultisim,int repl_ex_nst,int repl_ex_nex,
1019 int repl_ex_seed, real pforce,real cpt_period,real max_hours,
1020 const char *deviceOptions, unsigned long Flags)
1022 gmx_bool bForceUseGPU,bTryUseGPU;
1023 double nodetime=0,realtime;
1024 t_inputrec *inputrec;
1025 t_state *state=NULL;
1027 gmx_ddbox_t ddbox={0};
1028 int npme_major,npme_minor;
1031 gmx_mtop_t *mtop=NULL;
1032 t_mdatoms *mdatoms=NULL;
1033 t_forcerec *fr=NULL;
1036 gmx_pme_t *pmedata=NULL;
1037 gmx_vsite_t *vsite=NULL;
1038 gmx_constr_t constr;
1039 int i,m,nChargePerturbed=-1,status,nalloc;
1041 gmx_wallcycle_t wcycle;
1042 gmx_bool bReadRNG,bReadEkin;
1044 gmx_runtime_t runtime;
1046 gmx_large_int_t reset_counters;
1047 gmx_edsam_t ed=NULL;
1048 t_commrec *cr_old=cr;
1051 gmx_membed_t membed=NULL;
1052 gmx_hw_info_t *hwinfo=NULL;
1053 master_inf_t minf={-1,FALSE};
1055 /* CAUTION: threads may be started later on in this function, so
1056 cr doesn't reflect the final parallel state right now */
1060 if (Flags & MD_APPENDFILES)
1065 bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
1066 bTryUseGPU = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
1071 /* Read (nearly) all data required for the simulation */
1072 read_tpx_state(ftp2fn(efTPX,nfile,fnm),inputrec,state,NULL,mtop);
1074 if (inputrec->cutoff_scheme != ecutsVERLET &&
1075 ((Flags & MD_TESTVERLET) || getenv("GMX_VERLET_SCHEME") != NULL))
1077 convert_to_verlet_scheme(fplog,inputrec,mtop,det(state->box));
1080 /* Detect hardware, gather information. With tMPI only thread 0 does it
1081 * and after threads are started broadcasts hwinfo around. */
1083 gmx_detect_hardware(fplog, hwinfo, cr,
1084 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
1086 minf.cutoff_scheme = inputrec->cutoff_scheme;
1087 minf.bUseGPU = FALSE;
1089 if (inputrec->cutoff_scheme == ecutsVERLET)
1091 prepare_verlet_scheme(fplog,hwinfo,cr,hw_opt,nbpu_opt,
1092 inputrec,mtop,state->box,
1095 else if (hwinfo->bCanUseGPU)
1097 md_print_warn(cr,fplog,
1098 "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
1099 " To use a GPU, set the mdp option: cutoff-scheme = Verlet\n"
1100 " (for quick performance testing you can use the -testverlet option)\n");
1104 gmx_fatal(FARGS,"GPU requested, but can't be used without cutoff-scheme=Verlet");
1108 #ifndef GMX_THREAD_MPI
1111 gmx_bcast_sim(sizeof(minf),&minf,cr);
1114 if (minf.bUseGPU && cr->npmenodes == -1)
1116 /* Don't automatically use PME-only nodes with GPUs */
1120 #ifdef GMX_THREAD_MPI
1121 /* With thread-MPI inputrec is only set here on the master thread */
1125 check_and_update_hw_opt(hw_opt,minf.cutoff_scheme);
1127 #ifdef GMX_THREAD_MPI
1128 if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
1130 gmx_fatal(FARGS,"You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME nodes");
1134 if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
1137 gmx_fatal(FARGS,"You need to explicitly specify the number of PME nodes (-npme) when using different number of OpenMP threads for PP and PME nodes");
1141 #ifdef GMX_THREAD_MPI
1144 /* NOW the threads will be started: */
1145 hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
1149 if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
1151 hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
1154 if (hw_opt->nthreads_tmpi > 1)
1156 /* now start the threads. */
1157 cr=mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm,
1158 oenv, bVerbose, bCompact, nstglobalcomm,
1159 ddxyz, dd_node_order, rdd, rconstr,
1160 dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
1162 nsteps_cmdline, nstepout, resetstep, nmultisim,
1163 repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
1164 cpt_period, max_hours, deviceOptions,
1166 /* the main thread continues here with a new cr. We don't deallocate
1167 the old cr because other threads may still be reading it. */
1170 gmx_comm("Failed to spawn threads");
1175 /* END OF CAUTION: cr is now reliable */
1177 /* g_membed initialisation *
1178 * Because we change the mtop, init_membed is called before the init_parallel *
1179 * (in case we ever want to make it run in parallel) */
1180 if (opt2bSet("-membed",nfile,fnm))
1184 fprintf(stderr,"Initializing membed");
1186 membed = init_membed(fplog,nfile,fnm,mtop,inputrec,state,cr,&cpt_period);
1191 /* now broadcast everything to the non-master nodes/threads: */
1192 init_parallel(fplog, cr, inputrec, mtop);
1194 /* This check needs to happen after get_nthreads_mpi() */
1195 if (inputrec->cutoff_scheme == ecutsVERLET && (Flags & MD_PARTDEC))
1197 gmx_fatal_collective(FARGS,cr,NULL,
1198 "The Verlet cut-off scheme is not supported with particle decomposition.\n"
1199 "You can achieve the same effect as particle decomposition by running in parallel using only OpenMP threads.");
1204 pr_inputrec(fplog,0,"Input Parameters",inputrec,FALSE);
1207 #if defined GMX_THREAD_MPI
1208 /* With tMPI we detected on thread 0 and we'll just pass the hwinfo pointer
1209 * to the other threads -- slightly uncool, but works fine, just need to
1210 * make sure that the data doesn't get freed twice. */
1217 gmx_bcast(sizeof(&hwinfo), &hwinfo, cr);
1220 if (PAR(cr) && !SIMMASTER(cr))
1222 /* now we have inputrec on all nodes, can run the detection */
1223 /* TODO: perhaps it's better to propagate within a node instead? */
1225 gmx_detect_hardware(fplog, hwinfo, cr,
1226 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
1230 /* now make sure the state is initialized and propagated */
1231 set_state_entries(state,inputrec,cr->nnodes);
1233 /* remove when vv and rerun works correctly! */
1234 if (PAR(cr) && EI_VV(inputrec->eI) && ((Flags & MD_RERUN) || (Flags & MD_RERUN_VSITE)))
1237 "Currently can't do velocity verlet with rerun in parallel.");
1240 /* A parallel command line option consistency check that we can
1241 only do after any threads have started. */
1243 (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
1246 "The -dd or -npme option request a parallel simulation, "
1248 "but %s was compiled without threads or MPI enabled"
1250 #ifdef GMX_THREAD_MPI
1251 "but the number of threads (option -nt) is 1"
1253 "but %s was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec"
1260 if ((Flags & MD_RERUN) &&
1261 (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
1263 gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
1266 if (can_use_allvsall(inputrec,mtop,TRUE,cr,fplog) && PAR(cr))
1268 /* All-vs-all loops do not work with domain decomposition */
1269 Flags |= MD_PARTDEC;
1272 if (!EEL_PME(inputrec->coulombtype) || (Flags & MD_PARTDEC))
1274 if (cr->npmenodes > 0)
1276 if (!EEL_PME(inputrec->coulombtype))
1278 gmx_fatal_collective(FARGS,cr,NULL,
1279 "PME nodes are requested, but the system does not use PME electrostatics");
1281 if (Flags & MD_PARTDEC)
1283 gmx_fatal_collective(FARGS,cr,NULL,
1284 "PME nodes are requested, but particle decomposition does not support separate PME nodes");
1292 fcRegisterSteps(inputrec->nsteps,inputrec->init_step);
1295 /* NMR restraints must be initialized before load_checkpoint,
1296 * since with time averaging the history is added to t_state.
1297 * For proper consistency check we therefore need to extend
1299 * So the PME-only nodes (if present) will also initialize
1300 * the distance restraints.
1304 /* This needs to be called before read_checkpoint to extend the state */
1305 init_disres(fplog,mtop,inputrec,cr,Flags & MD_PARTDEC,fcd,state);
1307 if (gmx_mtop_ftype_count(mtop,F_ORIRES) > 0)
1309 if (PAR(cr) && !(Flags & MD_PARTDEC))
1311 gmx_fatal(FARGS,"Orientation restraints do not work (yet) with domain decomposition, use particle decomposition (mdrun option -pd)");
1313 /* Orientation restraints */
1316 init_orires(fplog,mtop,state->x,inputrec,cr->ms,&(fcd->orires),
1321 if (DEFORM(*inputrec))
1323 /* Store the deform reference box before reading the checkpoint */
1326 copy_mat(state->box,box);
1330 gmx_bcast(sizeof(box),box,cr);
1332 /* Because we do not have the update struct available yet
1333 * in which the reference values should be stored,
1334 * we store them temporarily in static variables.
1335 * This should be thread safe, since they are only written once
1336 * and with identical values.
1338 #ifdef GMX_THREAD_MPI
1339 tMPI_Thread_mutex_lock(&deform_init_box_mutex);
1341 deform_init_init_step_tpx = inputrec->init_step;
1342 copy_mat(box,deform_init_box_tpx);
1343 #ifdef GMX_THREAD_MPI
1344 tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
1348 if (opt2bSet("-cpi",nfile,fnm))
1350 /* Check if checkpoint file exists before doing continuation.
1351 * This way we can use identical input options for the first and subsequent runs...
1353 if( gmx_fexist_master(opt2fn_master("-cpi",nfile,fnm,cr),cr) )
1355 load_checkpoint(opt2fn_master("-cpi",nfile,fnm,cr),&fplog,
1356 cr,Flags & MD_PARTDEC,ddxyz,
1357 inputrec,state,&bReadRNG,&bReadEkin,
1358 (Flags & MD_APPENDFILES),
1359 (Flags & MD_APPENDFILESSET));
1363 Flags |= MD_READ_RNG;
1367 Flags |= MD_READ_EKIN;
1372 if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES))
1373 #ifdef GMX_THREAD_MPI
1374 /* With thread MPI only the master node/thread exists in mdrun.c,
1375 * therefore non-master nodes need to open the "seppot" log file here.
1377 || (!MASTER(cr) && (Flags & MD_SEPPOT))
1381 gmx_log_open(ftp2fn(efLOG,nfile,fnm),cr,!(Flags & MD_SEPPOT),
1385 /* override nsteps with value from cmdline */
1386 override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);
1390 copy_mat(state->box,box);
1395 gmx_bcast(sizeof(box),box,cr);
1398 /* Essential dynamics */
1399 if (opt2bSet("-ei",nfile,fnm))
1401 /* Open input and output files, allocate space for ED data structure */
1402 ed = ed_open(nfile,fnm,Flags,cr);
1405 if (PAR(cr) && !((Flags & MD_PARTDEC) ||
1406 EI_TPI(inputrec->eI) ||
1407 inputrec->eI == eiNM))
1409 cr->dd = init_domain_decomposition(fplog,cr,Flags,ddxyz,rdd,rconstr,
1410 dddlb_opt,dlb_scale,
1414 &ddbox,&npme_major,&npme_minor);
1416 make_dd_communicators(fplog,cr,dd_node_order);
1418 /* Set overallocation to avoid frequent reallocation of arrays */
1419 set_over_alloc_dd(TRUE);
1423 /* PME, if used, is done on all nodes with 1D decomposition */
1425 cr->duty = (DUTY_PP | DUTY_PME);
1428 if (!EI_TPI(inputrec->eI))
1430 npme_major = cr->nnodes;
1433 if (inputrec->ePBC == epbcSCREW)
1436 "pbc=%s is only implemented with domain decomposition",
1437 epbc_names[inputrec->ePBC]);
1443 /* After possible communicator splitting in make_dd_communicators.
1444 * we can set up the intra/inter node communication.
1446 gmx_setup_nodecomm(fplog,cr);
1449 /* Initialize per-node process ID and counters. */
1450 gmx_init_intra_counters(cr);
1453 md_print_info(cr,fplog,"Using %d MPI %s\n",
1455 #ifdef GMX_THREAD_MPI
1456 cr->nnodes==1 ? "thread" : "threads"
1458 cr->nnodes==1 ? "process" : "processes"
1463 gmx_omp_nthreads_init(fplog, cr,
1464 hwinfo->nthreads_hw_avail,
1465 hw_opt->nthreads_omp,
1466 hw_opt->nthreads_omp_pme,
1467 (cr->duty & DUTY_PP) == 0,
1468 inputrec->cutoff_scheme == ecutsVERLET);
1470 gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi, minf.bUseGPU);
1472 /* getting number of PP/PME threads
1473 PME: env variable should be read only on one node to make sure it is
1474 identical everywhere;
1476 /* TODO nthreads_pp is only used for pinning threads.
1477 * This is a temporary solution until we have a hw topology library.
1479 nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
1480 nthreads_pme = gmx_omp_nthreads_get(emntPME);
1482 wcycle = wallcycle_init(fplog,resetstep,cr,nthreads_pp,nthreads_pme);
1486 /* Master synchronizes its value of reset_counters with all nodes
1487 * including PME only nodes */
1488 reset_counters = wcycle_get_reset_counters(wcycle);
1489 gmx_bcast_sim(sizeof(reset_counters),&reset_counters,cr);
1490 wcycle_set_reset_counters(wcycle, reset_counters);
1494 if (cr->duty & DUTY_PP)
1496 /* For domain decomposition we allocate dynamically
1497 * in dd_partition_system.
1499 if (DOMAINDECOMP(cr))
1501 bcast_state_setup(cr,state);
1507 bcast_state(cr,state,TRUE);
1511 /* Initiate forcerecord */
1513 fr->hwinfo = hwinfo;
1514 init_forcerec(fplog,oenv,fr,fcd,inputrec,mtop,cr,box,FALSE,
1515 opt2fn("-table",nfile,fnm),
1516 opt2fn("-tabletf",nfile,fnm),
1517 opt2fn("-tablep",nfile,fnm),
1518 opt2fn("-tableb",nfile,fnm),
1522 /* version for PCA_NOT_READ_NODE (see md.c) */
1523 /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
1524 "nofile","nofile","nofile","nofile",FALSE,pforce);
1526 fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);
1528 /* Initialize QM-MM */
1531 init_QMMMrec(cr,box,mtop,inputrec,fr);
1534 /* Initialize the mdatoms structure.
1535 * mdatoms is not filled with atom data,
1536 * as this can not be done now with domain decomposition.
1538 mdatoms = init_mdatoms(fplog,mtop,inputrec->efep!=efepNO);
1540 /* Initialize the virtual site communication */
1541 vsite = init_vsite(mtop,cr);
1543 calc_shifts(box,fr->shift_vec);
1545 /* With periodic molecules the charge groups should be whole at start up
1546 * and the virtual sites should not be far from their proper positions.
1548 if (!inputrec->bContinuation && MASTER(cr) &&
1549 !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
1551 /* Make molecules whole at start of run */
1552 if (fr->ePBC != epbcNONE)
1554 do_pbc_first_mtop(fplog,inputrec->ePBC,box,mtop,state->x);
1558 /* Correct initial vsite positions are required
1559 * for the initial distribution in the domain decomposition
1560 * and for the initial shell prediction.
1562 construct_vsites_mtop(fplog,vsite,mtop,state->x);
1566 if (EEL_PME(fr->eeltype))
1568 ewaldcoeff = fr->ewaldcoeff;
1569 pmedata = &fr->pmedata;
1578 /* This is a PME only node */
1580 /* We don't need the state */
1583 ewaldcoeff = calc_ewaldcoeff(inputrec->rcoulomb, inputrec->ewald_rtol);
1587 #if defined GMX_THREAD_MPI
1588 /* With the number of TMPI threads equal to the number of cores
1589 * we already pinned in thread-MPI, so don't pin again here.
1591 if (hw_opt->nthreads_tmpi != tMPI_Thread_get_hw_number())
1594 /* Set the CPU affinity */
1595 set_cpu_affinity(fplog,cr,hw_opt,nthreads_pme,hwinfo,inputrec);
1598 /* Initiate PME if necessary,
1599 * either on all nodes or on dedicated PME nodes only. */
1600 if (EEL_PME(inputrec->coulombtype))
1604 nChargePerturbed = mdatoms->nChargePerturbed;
1606 if (cr->npmenodes > 0)
1608 /* The PME only nodes need to know nChargePerturbed */
1609 gmx_bcast_sim(sizeof(nChargePerturbed),&nChargePerturbed,cr);
1612 if (cr->duty & DUTY_PME)
1614 status = gmx_pme_init(pmedata,cr,npme_major,npme_minor,inputrec,
1615 mtop ? mtop->natoms : 0,nChargePerturbed,
1616 (Flags & MD_REPRODUCIBLE),nthreads_pme);
1619 gmx_fatal(FARGS,"Error %d initializing PME",status);
1625 if (integrator[inputrec->eI].func == do_md
1628 integrator[inputrec->eI].func == do_md_openmm
1632 /* Turn on signal handling on all nodes */
1634 * (A user signal from the PME nodes (if any)
1635 * is communicated to the PP nodes.
1637 signal_handler_install();
1640 if (cr->duty & DUTY_PP)
1642 if (inputrec->ePull != epullNO)
1644 /* Initialize pull code */
1645 init_pull(fplog,inputrec,nfile,fnm,mtop,cr,oenv, inputrec->fepvals->init_lambda,
1646 EI_DYNAMICS(inputrec->eI) && MASTER(cr),Flags);
1651 /* Initialize enforced rotation code */
1652 init_rot(fplog,inputrec,nfile,fnm,cr,state->x,box,mtop,oenv,
1656 constr = init_constraints(fplog,mtop,inputrec,ed,state,cr);
1658 if (DOMAINDECOMP(cr))
1660 dd_init_bondeds(fplog,cr->dd,mtop,vsite,constr,inputrec,
1661 Flags & MD_DDBONDCHECK,fr->cginfo_mb);
1663 set_dd_parameters(fplog,cr->dd,dlb_scale,inputrec,fr,&ddbox);
1665 setup_dd_grid(fplog,cr->dd);
1668 /* Now do whatever the user wants us to do (how flexible...) */
1669 integrator[inputrec->eI].func(fplog,cr,nfile,fnm,
1670 oenv,bVerbose,bCompact,
1673 nstepout,inputrec,mtop,
1675 mdatoms,nrnb,wcycle,ed,fr,
1676 repl_ex_nst,repl_ex_nex,repl_ex_seed,
1678 cpt_period,max_hours,
1683 if (inputrec->ePull != epullNO)
1685 finish_pull(fplog,inputrec->pull);
1690 finish_rot(fplog,inputrec->rot);
1697 gmx_pmeonly(*pmedata,cr,nrnb,wcycle,ewaldcoeff,FALSE,inputrec);
1700 if (EI_DYNAMICS(inputrec->eI) || EI_TPI(inputrec->eI))
1702 /* Some timing stats */
1705 if (runtime.proc == 0)
1707 runtime.proc = runtime.real;
1716 wallcycle_stop(wcycle,ewcRUN);
1718 /* Finish up, write some stuff
1719 * if rerunMD, don't write last frame again
1721 finish_run(fplog,cr,ftp2fn(efSTO,nfile,fnm),
1722 inputrec,nrnb,wcycle,&runtime,
1723 fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ?
1724 nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL,
1726 EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
1728 if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU)
1730 char gpu_err_str[STRLEN];
1732 /* free GPU memory and uninitialize GPU (by destroying the context) */
1733 nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
1735 if (!free_gpu(gpu_err_str))
1737 gmx_warning("On node %d failed to free GPU #%d: %s",
1738 cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
1742 if (opt2bSet("-membed",nfile,fnm))
1747 #ifdef GMX_THREAD_MPI
1748 if (PAR(cr) && SIMMASTER(cr))
1751 gmx_hardware_info_free(hwinfo);
1754 /* Does what it says */
1755 print_date_and_time(fplog,cr->nodeid,"Finished mdrun",&runtime);
1757 /* Close logfile already here if we were appending to it */
1758 if (MASTER(cr) && (Flags & MD_APPENDFILES))
1760 gmx_log_close(fplog);
1763 rc=(int)gmx_get_stop_condition();
1765 #ifdef GMX_THREAD_MPI
1766 /* we need to join all threads. The sub-threads join when they
1767 exit this function, but the master thread needs to be told to
1769 if (PAR(cr) && MASTER(cr))