1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
4 * This source code is part of
8 * GROningen MAchine for Chemical Simulations
11 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
12 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
13 * Copyright (c) 2001-2004, The GROMACS development team,
14 * check out http://www.gromacs.org for more information.
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version 2
19 * of the License, or (at your option) any later version.
21 * If you want to redistribute modifications, please consider that
22 * scientific software is very special. Version control is crucial -
23 * bugs must be traceable. We will be happy to consider code for
24 * inclusion in the official distribution, but derived work must not
25 * be called official GROMACS. Details are found in the README & COPYING
26 * files - if they are missing, get the official version at www.gromacs.org.
28 * To help us fund GROMACS development, we humbly ask that you cite
29 * the papers on the package - you can find them in the top README file.
31 * For more info, check our website at http://www.gromacs.org
34 * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
42 #include <sys/syscall.h>
57 #include "md_logging.h"
58 #include "md_support.h"
61 #include "pull_rotation.h"
74 #include "checkpoint.h"
75 #include "mtop_util.h"
76 #include "sighandler.h"
79 #include "gmx_detect_hardware.h"
80 #include "gmx_omp_nthreads.h"
81 #include "pull_rotation.h"
82 #include "calc_verletbuf.h"
83 #include "../mdlib/nbnxn_search.h"
84 #include "../mdlib/nbnxn_consts.h"
85 #include "gmx_fatal_collective.h"
102 #include "md_openmm.h"
105 #include "gpu_utils.h"
106 #include "nbnxn_cuda_data_mgmt.h"
/* NOTE(review): this excerpt is elided; 'func' appears to be a member of the
 * struct wrapped by gmx_intp_t, whose opening lines are not visible here --
 * confirm against the full source. */
109 gmx_integrator_t *func;
112 /* The array should match the eI array in include/types/enums.h */
113 #ifdef GMX_OPENMM /* FIXME do_md_openmm needs fixing */
/* With OpenMM builds every integrator slot dispatches to do_md_openmm */
114 const gmx_intp_t integrator[eiNR] = { {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm},{do_md_openmm}};
/* Standard build: one entry per eI integrator (md, steep, cg, nm, l-bfgs, tpi, ...) */
116 const gmx_intp_t integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md},{do_md}};
/* Deform reference values from the tpx file, stored in file-scope statics
 * (see the comment near the end of this file: written once, then read). */
119 gmx_large_int_t deform_init_init_step_tpx;
120 matrix deform_init_box_tpx;
121 #ifdef GMX_THREAD_MPI
/* Protects the deform_init_* statics above when thread-MPI threads share them */
122 tMPI_Thread_mutex_t deform_init_box_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
126 #ifdef GMX_THREAD_MPI
/* Argument bundle for spawning thread-MPI threads: mirrors the parameter
 * list of mdrunner() so mdrunner_start_fn() can unpack and forward it.
 * (Many members are elided from this excerpt.) */
127 struct mdrunner_arglist
129 gmx_hw_opt_t *hw_opt;
142 const char *dddlb_opt;
147 const char *nbpu_opt;
158 const char *deviceOptions;
160 int ret; /* return value */
164 /* The function used for spawning threads. Extracts the mdrunner()
165 arguments from its one argument and calls mdrunner(), after making
/* Thread entry point for tMPI_Init_fn(); runs in every spawned thread. */
167 static void mdrunner_start_fn(void *arg)
169 struct mdrunner_arglist *mda=(struct mdrunner_arglist*)arg;
170 struct mdrunner_arglist mc=*mda; /* copy the arg list to make sure
171 that it's thread-local. This doesn't
172 copy pointed-to items, of course,
173 but those are all const. */
174 t_commrec *cr; /* we need a local version of this */
/* Duplicate the filename table so each thread owns its copy */
178 fnm = dup_tfn(mc.nfile, mc.fnm);
/* Per-thread communication record derived from the parent's */
180 cr = init_par_threads(mc.cr);
/* Forward everything to mdrunner(); the result is stored in the shared
 * arglist so the spawning code can report failures. */
187 mda->ret=mdrunner(mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv,
188 mc.bVerbose, mc.bCompact, mc.nstglobalcomm,
189 mc.ddxyz, mc.dd_node_order, mc.rdd,
190 mc.rconstr, mc.dddlb_opt, mc.dlb_scale,
191 mc.ddcsx, mc.ddcsy, mc.ddcsz,
193 mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
194 mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce,
195 mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.Flags);
198 /* called by mdrunner() to start a specific number of threads (including
199 the main thread) for thread-parallel runs. This in turn calls mdrunner()
201 All options besides nthreads are the same as for mdrunner(). */
/* Returns the new commrec for the main thread, or exits early when
 * hw_opt->nthreads_tmpi < 2 (no spawning needed). */
202 static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
203 FILE *fplog,t_commrec *cr,int nfile,
204 const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
205 gmx_bool bCompact, int nstglobalcomm,
206 ivec ddxyz,int dd_node_order,real rdd,real rconstr,
207 const char *dddlb_opt,real dlb_scale,
208 const char *ddcsx,const char *ddcsy,const char *ddcsz,
209 const char *nbpu_opt,
210 int nsteps_cmdline, int nstepout,int resetstep,
211 int nmultisim,int repl_ex_nst,int repl_ex_nex, int repl_ex_seed,
212 real pforce,real cpt_period, real max_hours,
213 const char *deviceOptions, unsigned long Flags)
216 struct mdrunner_arglist *mda;
217 t_commrec *crn; /* the new commrec */
220 /* first check whether we even need to start tMPI */
221 if (hw_opt->nthreads_tmpi < 2)
226 /* a few small, one-time, almost unavoidable memory leaks: */
228 fnmn=dup_tfn(nfile, fnm);
230 /* fill the data structure to pass as void pointer to thread start fn */
237 mda->bVerbose=bVerbose;
238 mda->bCompact=bCompact;
239 mda->nstglobalcomm=nstglobalcomm;
240 mda->ddxyz[XX]=ddxyz[XX];
241 mda->ddxyz[YY]=ddxyz[YY];
242 mda->ddxyz[ZZ]=ddxyz[ZZ];
243 mda->dd_node_order=dd_node_order;
245 mda->rconstr=rconstr;
246 mda->dddlb_opt=dddlb_opt;
247 mda->dlb_scale=dlb_scale;
251 mda->nbpu_opt=nbpu_opt;
252 mda->nsteps_cmdline=nsteps_cmdline;
253 mda->nstepout=nstepout;
254 mda->resetstep=resetstep;
255 mda->nmultisim=nmultisim;
256 mda->repl_ex_nst=repl_ex_nst;
257 mda->repl_ex_nex=repl_ex_nex;
258 mda->repl_ex_seed=repl_ex_seed;
260 mda->cpt_period=cpt_period;
261 mda->max_hours=max_hours;
262 mda->deviceOptions=deviceOptions;
265 fprintf(stderr, "Starting %d tMPI threads\n",hw_opt->nthreads_tmpi);
267 /* now spawn new threads that start mdrunner_start_fn(), while
268 the main thread returns */
/* tMPI pins the spawned threads to cores here when the user asked for it */
269 ret=tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi,
270 (hw_opt->bThreadPinning ? TMPI_AFFINITY_ALL_CORES : TMPI_AFFINITY_NONE),
271 mdrunner_start_fn, (void*)(mda) );
272 if (ret!=TMPI_SUCCESS)
275 /* make a new comm_rec to reflect the new situation */
276 crn=init_par_threads(cr);
/* Decide how many thread-MPI ranks to use for a given total thread budget
 * and GPU count; the remainder of the budget goes to OpenMP threads.
 * (Parameter list partially elided in this excerpt.) */
281 static int get_tmpi_omp_thread_distribution(const gmx_hw_opt_t *hw_opt,
287 /* There are no separate PME nodes here, as we ensured in
288 * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
289 * and a conditional ensures we would not have ended up here.
290 * Note that separate PME nodes might be switched on later.
/* With GPUs: one tMPI rank per GPU, capped by the total thread budget */
294 nthreads_tmpi = ngpu;
295 if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
297 nthreads_tmpi = nthreads_tot;
/* Explicit -ntomp: derive the rank count from the requested split */
300 else if (hw_opt->nthreads_omp > 0)
302 if (hw_opt->nthreads_omp > nthreads_tot)
304 gmx_fatal(FARGS,"More OpenMP threads requested (%d) than the total number of threads requested (%d)",hw_opt->nthreads_omp,nthreads_tot);
306 nthreads_tmpi = nthreads_tot/hw_opt->nthreads_omp;
310 /* TODO choose nthreads_omp based on hardware topology
311 when we have a hardware topology detection library */
312 /* Don't use OpenMP parallelization */
313 nthreads_tmpi = nthreads_tot;
316 return nthreads_tmpi;
320 /* Get the number of threads to use for thread-MPI based on how many
321 * were requested, which algorithms we're using,
322 * and how many particles there are.
323 * At the point we have already called check_and_update_hw_opt.
324 * Thus all options should be internally consistent and consistent
325 * with the hardware, except that ntmpi could be larger than #GPU.
327 static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
328 gmx_hw_opt_t *hw_opt,
329 t_inputrec *inputrec, gmx_mtop_t *mtop,
333 int nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu;
334 int min_atoms_per_mpi_thread;
/* Explicit -ntmpi overrides all heuristics below */
339 if (hw_opt->nthreads_tmpi > 0)
341 /* Trivial, return right away */
342 return hw_opt->nthreads_tmpi;
345 /* How many total (#tMPI*#OpenMP) threads can we start? */
346 if (hw_opt->nthreads_tot > 0)
348 nthreads_tot_max = hw_opt->nthreads_tot;
/* No explicit limit: use all hardware threads the OS reports */
352 nthreads_tot_max = tMPI_Thread_get_hw_number();
355 bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU);
358 ngpu = hwinfo->gpu_info.ncuda_dev_use;
366 get_tmpi_omp_thread_distribution(hw_opt,nthreads_tot_max,ngpu);
368 if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
370 /* Steps are divided over the nodes iso splitting the atoms */
371 min_atoms_per_mpi_thread = 0;
/* GPU runs need more atoms per rank to stay efficient */
377 min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU;
381 min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD;
385 /* Check if an algorithm does not support parallel simulation. */
386 if (nthreads_tmpi != 1 &&
387 ( inputrec->eI == eiLBFGS ||
388 inputrec->coulombtype == eelEWALD ) )
392 md_print_warn(cr,fplog,"The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
393 if (hw_opt->nthreads_tmpi > nthreads_tmpi)
395 gmx_fatal(FARGS,"You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
398 else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread)
400 /* the thread number was chosen automatically, but there are too many
401 threads (too few atoms per thread) */
402 nthreads_new = max(1,mtop->natoms/min_atoms_per_mpi_thread);
404 if (nthreads_new > 8 || (nthreads_tmpi == 8 && nthreads_new > 4))
406 /* TODO replace this once we have proper HT detection
407 * Use only multiples of 4 above 8 threads
408 * or with an 8-core processor
409 * (to avoid 6 threads on 8 core processors with 4 real cores).
411 nthreads_new = (nthreads_new/4)*4;
413 else if (nthreads_new > 4)
415 /* Avoid 5 or 7 threads */
416 nthreads_new = (nthreads_new/2)*2;
419 nthreads_tmpi = nthreads_new;
421 fprintf(stderr,"\n");
422 fprintf(stderr,"NOTE: Parallelization is limited by the small number of atoms,\n");
423 fprintf(stderr,"      only starting %d thread-MPI threads.\n",nthreads_tmpi);
424 fprintf(stderr,"      You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
427 return nthreads_tmpi;
429 #endif /* GMX_THREAD_MPI */
432 /* Environment variable for setting nstlist */
433 static const char* NSTLIST_ENVVAR = "GMX_NSTLIST";
434 /* Try to increase nstlist when using a GPU with nstlist less than this */
435 static const int NSTLIST_GPU_ENOUGH = 20;
436 /* Increase nstlist until the non-bonded cost increases more than this factor */
437 static const float NBNXN_GPU_LIST_OK_FAC = 1.25;
438 /* Don't increase nstlist beyond a non-bonded cost increases of this factor */
/* Hard cap used by increase_nstlist(): candidate rlist values beyond this
 * pair-search cost factor are rejected. */
439 static const float NBNXN_GPU_LIST_MAX_FAC = 1.40;
441 /* Try to increase nstlist when running on a GPU */
/* Tries successively larger nstlist values (or the GMX_NSTLIST override),
 * recomputing the Verlet buffer each time, and accepts the largest value
 * whose rlist still fits the box/DD and stays under the cost caps above.
 * On success ir->nstlist, ir->rlist and ir->rlistlong are updated. */
442 static void increase_nstlist(FILE *fp,t_commrec *cr,
443 t_inputrec *ir,const gmx_mtop_t *mtop,matrix box)
446 int nstlist_orig,nstlist_prev;
447 verletbuf_list_setup_t ls;
448 real rlist_inc,rlist_ok,rlist_max,rlist_new,rlist_prev;
451 gmx_bool bBox,bDD,bCont;
452 const char *nstl_fmt="\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
453 const char *vbd_err="Can not increase nstlist for GPU run because verlet-buffer-drift is not set or used";
454 const char *box_err="Can not increase nstlist for GPU run because the box is too small";
455 const char *dd_err ="Can not increase nstlist for GPU run because of domain decomposition limitations";
458 /* Number of + nstlist alternative values to try when switching  */
459 const int nstl[]={ 20, 25, 40, 50 };
/* NOTE(review): macro expansion is unparenthesized; safe in the current
 * uses but fragile -- consider (sizeof(nstl)/sizeof(nstl[0])). */
460 #define NNSTL sizeof(nstl)/sizeof(nstl[0])
462 env = getenv(NSTLIST_ENVVAR);
467 fprintf(fp,nstl_fmt,ir->nstlist);
/* verletbuf_drift == 0: tpr predates the buffer-drift option */
471 if (ir->verletbuf_drift == 0)
473 gmx_fatal(FARGS,"You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
/* verletbuf_drift < 0: drift-based buffering disabled, can't retune */
476 if (ir->verletbuf_drift < 0)
480 fprintf(stderr,"%s\n",vbd_err);
484 fprintf(fp,"%s\n",vbd_err);
490 nstlist_orig = ir->nstlist;
/* GMX_NSTLIST in the environment forces a specific value */
493 sprintf(buf,"Getting nstlist from environment variable GMX_NSTLIST=%s",env);
496 fprintf(stderr,"%s\n",buf);
500 fprintf(fp,"%s\n",buf);
502 sscanf(env,"%d",&ir->nstlist);
505 verletbuf_get_list_setup(TRUE,&ls);
507 /* Allow rlist to make the list double the size of the cut-off sphere */
508 rlist_inc = nbnxn_get_rlist_effective_inc(NBNXN_GPU_CLUSTER_SIZE,mtop->natoms/det(box));
/* rlist_ok/rlist_max: cut-offs scaled by the cost-factor caps (volume
 * scales with the cube of the radius, hence the 1/3 power) */
509 rlist_ok  = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_OK_FAC,1.0/3.0) - rlist_inc;
510 rlist_max = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_MAX_FAC,1.0/3.0) - rlist_inc;
513 fprintf(debug,"GPU nstlist tuning: rlist_inc %.3f rlist_max %.3f\n",
514 rlist_inc,rlist_max);
518 nstlist_prev = nstlist_orig;
519 rlist_prev   = ir->rlist;
/* Trial loop over candidate nstlist values (loop header elided) */
524 ir->nstlist = nstl[i];
527 /* Set the pair-list buffer size in ir */
528 calc_verlet_buffer_size(mtop,det(box),ir,ir->verletbuf_drift,&ls,
531 /* Does rlist fit in the box? */
532 bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC,box));
534 if (bBox && DOMAINDECOMP(cr))
536 /* Check if rlist fits in the domain decomposition */
537 if (inputrec2nboundeddim(ir) < DIM)
539 gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
541 copy_mat(box,state_tmp.box);
542 bDD = change_dd_cutoff(cr,&state_tmp,ir,rlist_new);
549 if (bBox && bDD && rlist_new <= rlist_max)
551 /* Increase nstlist */
552 nstlist_prev = ir->nstlist;
553 rlist_prev   = rlist_new;
554 bCont = (i+1 < NNSTL && rlist_new < rlist_ok);
558 /* Stick with the previous nstlist */
559 ir->nstlist = nstlist_prev;
560 rlist_new = rlist_prev;
/* Report why tuning stopped, or the final change that was applied */
572 gmx_warning(!bBox ? box_err : dd_err);
575 fprintf(fp,"\n%s\n",bBox ? box_err : dd_err);
577 ir->nstlist = nstlist_orig;
579 else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
581 sprintf(buf,"Changing nstlist from %d to %d, rlist from %g to %g",
582 nstlist_orig,ir->nstlist,
583 ir->rlist,rlist_new);
586 fprintf(stderr,"%s\n\n",buf);
590 fprintf(fp,"%s\n\n",buf);
592 ir->rlist     = rlist_new;
593 ir->rlistlong = rlist_new;
/* Finalize Verlet-scheme settings on the master: decide GPU use (or
 * emulation via GMX_EMULATE_GPU), recompute the Verlet buffer for the
 * actual run setup, and possibly retune nstlist for GPU runs. */
597 static void prepare_verlet_scheme(FILE *fplog,
598 gmx_hw_info_t *hwinfo,
600 gmx_hw_opt_t *hw_opt,
601 const char *nbpu_opt,
603 const gmx_mtop_t *mtop,
607 /* Here we only check for GPU usage on the MPI master process,
608 * as here we don't know how many GPUs we will use yet.
609 * We check for a GPU on all processes later.
611 *bUseGPU = hwinfo->bCanUseGPU || (getenv("GMX_EMULATE_GPU") != NULL);
613 if (ir->verletbuf_drift > 0)
615 /* Update the Verlet buffer size for the current run setup */
616 verletbuf_list_setup_t ls;
619 /* Here we assume CPU acceleration is on. But as currently
620 * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
621 * and 4x2 gives a larger buffer than 4x4, this is ok.
623 verletbuf_get_list_setup(*bUseGPU,&ls);
625 calc_verlet_buffer_size(mtop,det(box),ir,
626 ir->verletbuf_drift,&ls,
/* Only touch ir when the recomputed buffer actually differs */
628 if (rlist_new != ir->rlist)
632 fprintf(fplog,"\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
634 ls.cluster_size_i,ls.cluster_size_j);
636 ir->rlist     = rlist_new;
637 ir->rlistlong = rlist_new;
641 /* With GPU or emulation we should check nstlist for performance */
642 if ((EI_DYNAMICS(ir->eI) &&
644 ir->nstlist < NSTLIST_GPU_ENOUGH) ||
645 getenv(NSTLIST_ENVVAR) != NULL)
647 /* Choose a better nstlist */
648 increase_nstlist(fplog,cr,ir,mtop,box);
/* Convert a group-scheme inputrec to the Verlet cut-off scheme in place.
 * Rejects setups the Verlet scheme cannot express (user potentials,
 * unequal cut-offs, non-3D pbc, free energy, implicit solvent) and maps
 * switched/shifted interactions to plain cut-off or RF/PME equivalents. */
652 static void convert_to_verlet_scheme(FILE *fplog,
654 gmx_mtop_t *mtop,real box_vol)
656 char *conv_mesg="Converting input file with group cut-off scheme to the Verlet cut-off scheme";
658 md_print_warn(NULL,fplog,"%s\n",conv_mesg);
660 ir->cutoff_scheme   = ecutsVERLET;
661 ir->verletbuf_drift = 0.005;
663 if (ir->rcoulomb != ir->rvdw)
665 gmx_fatal(FARGS,"The VdW and Coulomb cut-offs are different, whereas the Verlet scheme only supports equal cut-offs");
668 if (ir->vdwtype == evdwUSER || EEL_USER(ir->coulombtype))
670 gmx_fatal(FARGS,"User non-bonded potentials are not (yet) supported with the Verlet scheme");
672 else if (EVDW_SWITCHED(ir->vdwtype) || EEL_SWITCHED(ir->coulombtype))
674 md_print_warn(NULL,fplog,"Converting switched or shifted interactions to a shifted potential (without force shift), this will lead to slightly different interaction potentials");
676 if (EVDW_SWITCHED(ir->vdwtype))
678 ir->vdwtype = evdwCUT;
680 if (EEL_SWITCHED(ir->coulombtype))
682 if (EEL_FULL(ir->coulombtype))
684 /* With full electrostatic only PME can be switched */
685 ir->coulombtype = eelPME;
/* Otherwise fall back to reaction-field with epsilon-rf = infinity */
689 md_print_warn(NULL,fplog,"NOTE: Replacing %s electrostatics with reaction-field with epsilon-rf=inf\n",eel_names[ir->coulombtype]);
690 ir->coulombtype = eelRF;
691 ir->epsilon_rf  = 0.0;
695 /* We set the target energy drift to a small number.
696 * Note that this is only for testing. For production the user
697 * should think about this and set the mdp options.
699 ir->verletbuf_drift = 1e-4;
702 if (inputrec2nboundeddim(ir) != 3)
704 gmx_fatal(FARGS,"Can only convert old tpr files to the Verlet cut-off scheme with 3D pbc");
707 if (ir->efep != efepNO || ir->implicit_solvent != eisNO)
709 gmx_fatal(FARGS,"Will not convert old tpr files to the Verlet cut-off scheme with free-energy calculations or implicit solvent");
/* For plain MD without temperature coupling we can size the buffer from
 * the drift; otherwise use a fixed 5% margin over the cut-off. */
712 if (EI_DYNAMICS(ir->eI) && !(EI_MD(ir->eI) && ir->etc == etcNO))
714 verletbuf_list_setup_t ls;
716 verletbuf_get_list_setup(FALSE,&ls);
717 calc_verlet_buffer_size(mtop,box_vol,ir,ir->verletbuf_drift,&ls,
722 ir->verletbuf_drift = -1;
723 ir->rlist           = 1.05*max(ir->rvdw,ir->rcoulomb);
/* Charge groups are a group-scheme concept; drop them from the topology */
726 gmx_mtop_remove_chargegroups(mtop);
730 /* Set CPU affinity. Can be important for performance.
731 On some systems (e.g. Cray) CPU Affinity is set by default.
732 But default assigning doesn't work (well) with only some ranks
733 having threads. This causes very low performance.
734 External tools have cumbersome syntax for setting affinity
735 in the case that only some ranks have threads.
736 Thus it is important that GROMACS sets the affinity internally
737 if only PME is using threads.
/* Linux/OpenMP only; each OpenMP thread pins itself via sched_setaffinity.
 * NOTE(review): on error sched_setaffinity's return value is not checked
 * in this excerpt -- confirm against the full source. */
739 static void set_cpu_affinity(FILE *fplog,
741 gmx_hw_opt_t *hw_opt,
743 const gmx_hw_info_t *hwinfo,
744 const t_inputrec *inputrec)
746 #if defined GMX_THREAD_MPI
747 /* With the number of TMPI threads equal to the number of cores
748 * we already pinned in thread-MPI, so don't pin again here.
750 if (hw_opt->nthreads_tmpi == tMPI_Thread_get_hw_number())
756 #ifdef GMX_OPENMP /* TODO: actually we could do this even without OpenMP?! */
757 #ifdef __linux /* TODO: only linux? why not everywhere if sched_setaffinity is available */
758 if (hw_opt->bThreadPinning)
760 int thread, nthread_local, nthread_node, nthread_hw_max, nphyscore;
764 /* threads on this MPI process or TMPI thread */
765 if (cr->duty & DUTY_PP)
767 nthread_local = gmx_omp_nthreads_get(emntNonbonded);
/* PME-only rank: pin the PME thread count instead */
771 nthread_local = gmx_omp_nthreads_get(emntPME);
774 /* map the current process to cores */
776 nthread_node = nthread_local;
778 if (PAR(cr) || MULTISIM(cr))
780 /* We need to determine a scan of the thread counts in this
/* Split off a per-physical-node communicator so the scan only covers
 * ranks sharing this node's cores */
785 MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),cr->nodeid_intra,
787 MPI_Scan(&nthread_local,&thread,1,MPI_INT,MPI_SUM,comm_intra);
788 /* MPI_Scan is inclusive, but here we need exclusive */
789 thread -= nthread_local;
790 /* Get the total number of threads on this physical node */
791 MPI_Allreduce(&nthread_local,&nthread_node,1,MPI_INT,MPI_SUM,comm_intra);
792 MPI_Comm_free(&comm_intra);
/* Optional user-requested core offset (-pinoffset) */
797 if (hw_opt->core_pinning_offset > 0)
799 offset = hw_opt->core_pinning_offset;
802 fprintf(stderr, "Applying core pinning offset %d\n", offset);
806 fprintf(fplog, "Applying core pinning offset %d\n", offset);
810 /* With Intel Hyper-Threading enabled, we want to pin consecutive
811 * threads to physical cores when using more threads than physical
812 * cores or when the user requests so.
814 nthread_hw_max = hwinfo->nthreads_hw_avail;
816 if (hw_opt->bPinHyperthreading ||
817 (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
818 nthread_node > nthread_hw_max/2 && getenv("GMX_DISABLE_PINHT") == NULL))
820 if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) != GMX_CPUID_X86_SMT_ENABLED)
822 /* We print to stderr on all processes, as we might have
823 * different settings on different physical nodes.
825 if (gmx_cpuid_vendor(hwinfo->cpuid_info) != GMX_CPUID_VENDOR_INTEL)
827 md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
828 "but non-Intel CPU detected (vendor: %s)\n",
829 gmx_cpuid_vendor_string[gmx_cpuid_vendor(hwinfo->cpuid_info)]);
833 md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
834 "but the CPU detected does not have Intel Hyper-Threading support "
835 "(or it is turned off)\n");
838 nphyscore = nthread_hw_max/2;
842 fprintf(stderr, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
847 fprintf(fplog, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
852 /* set the per-thread affinity */
853 #pragma omp parallel firstprivate(thread) num_threads(nthread_local)
/* Each thread computes its own global index, then its target core */
859 thread += gmx_omp_get_thread_num();
862 core = offset + thread;
866 /* Lock pairs of threads to the same hyperthreaded core */
867 core = offset + thread/2 + (thread % 2)*nphyscore;
869 CPU_SET(core, &mask);
870 sched_setaffinity((pid_t) syscall (SYS_gettid), sizeof(cpu_set_t), &mask);
874 #endif /* GMX_OPENMP */
/* Validate and normalize the user's thread-count options (-nt, -ntmpi,
 * -ntomp, -ntomp_pme) against the build configuration and the cut-off
 * scheme, filling in derived values where possible.  Fatal on any
 * inconsistent combination. */
878 static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt,
/* OMP_NUM_THREADS etc. may override -ntomp */
881 gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp);
883 #ifndef GMX_THREAD_MPI
884 if (hw_opt->nthreads_tot > 0)
886 gmx_fatal(FARGS,"Setting the total number of threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
888 if (hw_opt->nthreads_tmpi > 0)
890 gmx_fatal(FARGS,"Setting the number of thread-MPI threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
/* With equal PP/PME OpenMP counts the three numbers must be consistent */
894 if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
896 /* We have the same number of OpenMP threads for PP and PME processes,
897 * thus we can perform several consistency checks.
899 if (hw_opt->nthreads_tmpi > 0 &&
900 hw_opt->nthreads_omp > 0 &&
901 hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
903 gmx_fatal(FARGS,"The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
904 hw_opt->nthreads_tot,hw_opt->nthreads_tmpi,hw_opt->nthreads_omp);
907 if (hw_opt->nthreads_tmpi > 0 &&
908 hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
910 gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
911 hw_opt->nthreads_tot,hw_opt->nthreads_tmpi);
914 if (hw_opt->nthreads_omp > 0 &&
915 hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
917 gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
918 hw_opt->nthreads_tot,hw_opt->nthreads_omp);
/* Derive -ntomp from -nt and -ntmpi when only those were given */
921 if (hw_opt->nthreads_tmpi > 0 &&
922 hw_opt->nthreads_omp <= 0)
924 hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
/* Non-OpenMP build: more than one OpenMP thread is impossible */
929 if (hw_opt->nthreads_omp > 1)
931 gmx_fatal(FARGS,"OpenMP threads are requested, but Gromacs was compiled without OpenMP support");
935 if (cutoff_scheme == ecutsGROUP)
937 /* We only have OpenMP support for PME only nodes */
938 if (hw_opt->nthreads_omp > 1)
940 gmx_fatal(FARGS,"OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
941 ecutscheme_names[cutoff_scheme],
942 ecutscheme_names[ecutsVERLET]);
944 hw_opt->nthreads_omp = 1;
947 if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
949 gmx_fatal(FARGS,"You need to specify -ntomp in addition to -ntomp_pme");
/* -nt 1 forces a single thread-MPI rank with one OpenMP thread */
952 if (hw_opt->nthreads_tot == 1)
954 hw_opt->nthreads_tmpi = 1;
956 if (hw_opt->nthreads_omp > 1)
958 gmx_fatal(FARGS,"You requested %d OpenMP threads with %d total threads",
959 hw_opt->nthreads_tmpi,hw_opt->nthreads_tot);
961 hw_opt->nthreads_omp = 1;
/* Default PME OpenMP count to the PP OpenMP count */
964 if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
966 hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
971 fprintf(debug,"hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
972 hw_opt->nthreads_tot,
973 hw_opt->nthreads_tmpi,
974 hw_opt->nthreads_omp,
975 hw_opt->nthreads_omp_pme,
976 hw_opt->gpu_id!=NULL ? hw_opt->gpu_id : "");
982 /* Override the value in inputrec with value passed on the command line (if any) */
/* nsteps_cmdline defaults to -2 ("not set"); any value > -2, including -1
 * (run forever), replaces ir->nsteps and a note is printed. */
983 static void override_nsteps_cmdline(FILE *fplog,
991 /* override with anything else than the default -2 */
992 if (nsteps_cmdline > -2)
996 ir->nsteps = nsteps_cmdline;
997 if (EI_DYNAMICS(ir->eI))
/* For dynamics also report the corresponding simulated time span */
999 sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps, %.3f ps",
1000 nsteps_cmdline, nsteps_cmdline*ir->delta_t);
1004 sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps",
1008 md_print_warn(cr, fplog, "%s\n", stmp);
1012 /* Data structure set by SIMMASTER which needs to be passed to all nodes
1013 * before the other nodes have read the tpx file and called gmx_detect_hardware.
/* (Struct opening line elided in this excerpt; see the full source.) */
1016 int cutoff_scheme; /* The cutoff-scheme from inputrec_t */
1017 gmx_bool bUseGPU; /* Use GPU or GPU emulation */
1020 int mdrunner(gmx_hw_opt_t *hw_opt,
1021 FILE *fplog,t_commrec *cr,int nfile,
1022 const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
1023 gmx_bool bCompact, int nstglobalcomm,
1024 ivec ddxyz,int dd_node_order,real rdd,real rconstr,
1025 const char *dddlb_opt,real dlb_scale,
1026 const char *ddcsx,const char *ddcsy,const char *ddcsz,
1027 const char *nbpu_opt,
1028 int nsteps_cmdline, int nstepout,int resetstep,
1029 int nmultisim,int repl_ex_nst,int repl_ex_nex,
1030 int repl_ex_seed, real pforce,real cpt_period,real max_hours,
1031 const char *deviceOptions, unsigned long Flags)
1033 gmx_bool bForceUseGPU,bTryUseGPU;
1034 double nodetime=0,realtime;
1035 t_inputrec *inputrec;
1036 t_state *state=NULL;
1038 gmx_ddbox_t ddbox={0};
1039 int npme_major,npme_minor;
1042 gmx_mtop_t *mtop=NULL;
1043 t_mdatoms *mdatoms=NULL;
1044 t_forcerec *fr=NULL;
1047 gmx_pme_t *pmedata=NULL;
1048 gmx_vsite_t *vsite=NULL;
1049 gmx_constr_t constr;
1050 int i,m,nChargePerturbed=-1,status,nalloc;
1052 gmx_wallcycle_t wcycle;
1053 gmx_bool bReadRNG,bReadEkin;
1055 gmx_runtime_t runtime;
1057 gmx_large_int_t reset_counters;
1058 gmx_edsam_t ed=NULL;
1059 t_commrec *cr_old=cr;
1062 gmx_membed_t membed=NULL;
1063 gmx_hw_info_t *hwinfo=NULL;
1064 master_inf_t minf={-1,FALSE};
1066 /* CAUTION: threads may be started later on in this function, so
1067 cr doesn't reflect the final parallel state right now */
1071 if (Flags & MD_APPENDFILES)
1076 bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
1077 bTryUseGPU = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
1082 /* Read (nearly) all data required for the simulation */
1083 read_tpx_state(ftp2fn(efTPX,nfile,fnm),inputrec,state,NULL,mtop);
1085 if (inputrec->cutoff_scheme != ecutsVERLET &&
1086 ((Flags & MD_TESTVERLET) || getenv("GMX_VERLET_SCHEME") != NULL))
1088 convert_to_verlet_scheme(fplog,inputrec,mtop,det(state->box));
1091 /* Detect hardware, gather information. With tMPI only thread 0 does it
1092 * and after threads are started broadcasts hwinfo around. */
1094 gmx_detect_hardware(fplog, hwinfo, cr,
1095 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
1097 minf.cutoff_scheme = inputrec->cutoff_scheme;
1098 minf.bUseGPU = FALSE;
1100 if (inputrec->cutoff_scheme == ecutsVERLET)
1102 prepare_verlet_scheme(fplog,hwinfo,cr,hw_opt,nbpu_opt,
1103 inputrec,mtop,state->box,
1106 else if (hwinfo->bCanUseGPU)
1108 md_print_warn(cr,fplog,
1109 "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
1110 " To use a GPU, set the mdp option: cutoff-scheme = Verlet\n"
1111 " (for quick performance testing you can use the -testverlet option)\n");
1115 gmx_fatal(FARGS,"GPU requested, but can't be used without cutoff-scheme=Verlet");
1119 #ifndef GMX_THREAD_MPI
1122 gmx_bcast_sim(sizeof(minf),&minf,cr);
1125 if (minf.bUseGPU && cr->npmenodes == -1)
1127 /* Don't automatically use PME-only nodes with GPUs */
1131 #ifdef GMX_THREAD_MPI
1132 /* With thread-MPI inputrec is only set here on the master thread */
1136 check_and_update_hw_opt(hw_opt,minf.cutoff_scheme);
1138 #ifdef GMX_THREAD_MPI
1139 if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
1141 gmx_fatal(FARGS,"You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME nodes");
1145 if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
1148 gmx_fatal(FARGS,"You need to explicitly specify the number of PME nodes (-npme) when using different number of OpenMP threads for PP and PME nodes");
1152 /* Check for externally set OpenMP affinity and turn off internal
1153 * pinning if any is found. We need to do this check early to tell
1154 * thread-MPI whether it should do pinning when spawning threads.
1156 gmx_omp_check_thread_affinity(fplog, cr, hw_opt);
1158 #ifdef GMX_THREAD_MPI
1161 /* NOW the threads will be started: */
1162 hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
1166 if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
1168 hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
1171 if (hw_opt->nthreads_tmpi > 1)
1173 /* now start the threads. */
1174 cr=mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm,
1175 oenv, bVerbose, bCompact, nstglobalcomm,
1176 ddxyz, dd_node_order, rdd, rconstr,
1177 dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
1179 nsteps_cmdline, nstepout, resetstep, nmultisim,
1180 repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
1181 cpt_period, max_hours, deviceOptions,
1183 /* the main thread continues here with a new cr. We don't deallocate
1184 the old cr because other threads may still be reading it. */
1187 gmx_comm("Failed to spawn threads");
1192 /* END OF CAUTION: cr is now reliable */
1194 /* g_membed initialisation *
1195 * Because we change the mtop, init_membed is called before the init_parallel *
1196 * (in case we ever want to make it run in parallel) */
1197 if (opt2bSet("-membed",nfile,fnm))
1201 fprintf(stderr,"Initializing membed");
/* NOTE(review): this is an excerpt from the middle of mdrunner(); the function
 * header and many intermediate lines (fatal-call heads, #else/#endif branches,
 * closing braces) are outside this view, and each line carries a leaked
 * original line-number prefix. Comments below describe only what the visible
 * calls establish; anything inferred is marked as an assumption to confirm. */
/* Membrane embedding setup (active when -membed is used — presumably guarded
 * by a condition above this excerpt; confirm against the full file). */
1203 membed = init_membed(fplog,nfile,fnm,mtop,inputrec,state,cr,&cpt_period);
1208 /* now broadcast everything to the non-master nodes/threads: */
1209 init_parallel(fplog, cr, inputrec, mtop);
1211 /* This check needs to happen after get_nthreads_mpi() */
/* The Verlet cut-off scheme and particle decomposition are mutually
 * exclusive; abort collectively so all ranks exit together. */
1212 if (inputrec->cutoff_scheme == ecutsVERLET && (Flags & MD_PARTDEC))
1214 gmx_fatal_collective(FARGS,cr,NULL,
1215 "The Verlet cut-off scheme is not supported with particle decomposition.\n"
1216 "You can achieve the same effect as particle decomposition by running in parallel using only OpenMP threads.");
1221 pr_inputrec(fplog,0,"Input Parameters",inputrec,FALSE);
1224 #if defined GMX_THREAD_MPI
1225 /* With tMPI we detected on thread 0 and we'll just pass the hwinfo pointer
1226 * to the other threads -- slightly uncool, but works fine, just need to
1227 * make sure that the data doesn't get freed twice. */
/* NOTE(review): sizeof(&hwinfo) broadcasts the POINTER, not the struct —
 * valid only because tMPI threads share one address space (see comment
 * above). Would be wrong under real MPI; this branch is tMPI-only. */
1234 gmx_bcast(sizeof(&hwinfo), &hwinfo, cr);
/* Non-tMPI path (presumably the #else of the block above — confirm):
 * each non-master rank runs hardware detection itself once inputrec
 * is available everywhere. */
1237 if (PAR(cr) && !SIMMASTER(cr))
1239 /* now we have inputrec on all nodes, can run the detection */
1240 /* TODO: perhaps it's better to propagate within a node instead? */
1242 gmx_detect_hardware(fplog, hwinfo, cr,
1243 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
1247 /* now make sure the state is initialized and propagated */
1248 set_state_entries(state,inputrec,cr->nnodes);
1250 /* remove when vv and rerun works correctly! */
/* Velocity-Verlet + rerun in parallel is unsupported; the gmx_fatal call
 * head for this message is missing from this excerpt. */
1251 if (PAR(cr) && EI_VV(inputrec->eI) && ((Flags & MD_RERUN) || (Flags & MD_RERUN_VSITE)))
1254 "Currently can't do velocity verlet with rerun in parallel.");
1257 /* A parallel command line option consistency check that we can
1258 only do after any threads have started. */
/* Reject -dd/-npme when only a single rank exists; the message text is
 * chosen per build configuration (threads/MPI vs tMPI vs process MPI). */
1260 (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
1263 "The -dd or -npme option request a parallel simulation, "
1265 "but %s was compiled without threads or MPI enabled"
1267 #ifdef GMX_THREAD_MPI
1268 "but the number of threads (option -nt) is 1"
1270 "but %s was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec"
/* Rerun is incompatible with minimization / normal-mode integrators. */
1277 if ((Flags & MD_RERUN) &&
1278 (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
1280 gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun")
/* NOTE(review): the closing ");" of the gmx_fatal above appears on the
 * original line; preserved verbatim from the excerpt below if split. */
;
1283 if (can_use_allvsall(inputrec,mtop,TRUE,cr,fplog) && PAR(cr))
1285 /* All-vs-all loops do not work with domain decomposition */
/* Force particle decomposition when the all-vs-all kernels are usable. */
1286 Flags |= MD_PARTDEC;
/* Separate PME ranks make sense only with PME electrostatics and without
 * particle decomposition; both misuses are fatal below. */
1289 if (!EEL_PME(inputrec->coulombtype) || (Flags & MD_PARTDEC))
1291 if (cr->npmenodes > 0)
1293 if (!EEL_PME(inputrec->coulombtype))
1295 gmx_fatal_collective(FARGS,cr,NULL,
1296 "PME nodes are requested, but the system does not use PME electrostatics");
1298 if (Flags & MD_PARTDEC)
1300 gmx_fatal_collective(FARGS,cr,NULL,
1301 "PME nodes are requested, but particle decomposition does not support separate PME nodes");
/* Folding@home-style step registration — presumably only compiled/active
 * in GMX_FAHCORE builds; confirm against the full file. */
1309 fcRegisterSteps(inputrec->nsteps,inputrec->init_step);
1312 /* NMR restraints must be initialized before load_checkpoint,
1313 * since with time averaging the history is added to t_state.
1314 * For proper consistency check we therefore need to extend
1316 * So the PME-only nodes (if present) will also initialize
1317 * the distance restraints.
1321 /* This needs to be called before read_checkpoint to extend the state */
1322 init_disres(fplog,mtop,inputrec,cr,Flags & MD_PARTDEC,fcd,state);
/* Orientation restraints require particle decomposition when parallel. */
1324 if (gmx_mtop_ftype_count(mtop,F_ORIRES) > 0)
1326 if (PAR(cr) && !(Flags & MD_PARTDEC))
1328 gmx_fatal(FARGS,"Orientation restraints do not work (yet) with domain decomposition, use particle decomposition (mdrun option -pd)");
1330 /* Orientation restraints */
1333 init_orires(fplog,mtop,state->x,inputrec,cr->ms,&(fcd->orires),
/* Continuous box deformation: capture the reference box BEFORE any
 * checkpoint read can overwrite state->box. */
1338 if (DEFORM(*inputrec))
1340 /* Store the deform reference box before reading the checkpoint */
1343 copy_mat(state->box,box);
1347 gmx_bcast(sizeof(box),box,cr);
1349 /* Because we do not have the update struct available yet
1350 * in which the reference values should be stored,
1351 * we store them temporarily in static variables.
1352 * This should be thread safe, since they are only written once
1353 * and with identical values.
/* The mutex guards the static deform_init_* variables against concurrent
 * writes by tMPI threads running this function simultaneously. */
1355 #ifdef GMX_THREAD_MPI
1356 tMPI_Thread_mutex_lock(&deform_init_box_mutex);
1358 deform_init_init_step_tpx = inputrec->init_step;
1359 copy_mat(box,deform_init_box_tpx);
1360 #ifdef GMX_THREAD_MPI
1361 tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
/* Checkpoint continuation: -cpi is honored only if the file exists, so
 * first and restart invocations can use identical command lines. */
1365 if (opt2bSet("-cpi",nfile,fnm))
1367 /* Check if checkpoint file exists before doing continuation.
1368 * This way we can use identical input options for the first and subsequent runs...
1370 if( gmx_fexist_master(opt2fn_master("-cpi",nfile,fnm,cr),cr) )
1372 load_checkpoint(opt2fn_master("-cpi",nfile,fnm,cr),&fplog,
1373 cr,Flags & MD_PARTDEC,ddxyz,
1374 inputrec,state,&bReadRNG,&bReadEkin,
1375 (Flags & MD_APPENDFILES),
1376 (Flags & MD_APPENDFILESSET));
/* Flags updated from what the checkpoint actually contained (bReadRNG /
 * bReadEkin set by load_checkpoint; the guarding ifs are elided here). */
1380 Flags |= MD_READ_RNG;
1384 Flags |= MD_READ_EKIN;
/* Open per-rank ("seppot") or appended log files; with tMPI the
 * non-master threads must open their seppot logs here. */
1389 if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES))
1390 #ifdef GMX_THREAD_MPI
1391 /* With thread MPI only the master node/thread exists in mdrun.c,
1392 * therefore non-master nodes need to open the "seppot" log file here.
1394 || (!MASTER(cr) && (Flags & MD_SEPPOT))
1398 gmx_log_open(ftp2fn(efLOG,nfile,fnm),cr,!(Flags & MD_SEPPOT),
1402 /* override nsteps with value from cmdline */
1403 override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);
/* Re-copy and broadcast the (possibly checkpoint-updated) box so all
 * ranks agree before decomposition is set up. */
1407 copy_mat(state->box,box);
1412 gmx_bcast(sizeof(box),box,cr);
1415 /* Essential dynamics */
1416 if (opt2bSet("-ei",nfile,fnm))
1418 /* Open input and output files, allocate space for ED data structure */
1419 ed = ed_open(nfile,fnm,Flags,cr);
/* Domain decomposition is the default parallel scheme; skipped for
 * particle decomposition, TPI and normal-mode runs. */
1422 if (PAR(cr) && !((Flags & MD_PARTDEC) ||
1423 EI_TPI(inputrec->eI) ||
1424 inputrec->eI == eiNM))
1426 cr->dd = init_domain_decomposition(fplog,cr,Flags,ddxyz,rdd,rconstr,
1427 dddlb_opt,dlb_scale,
1431 &ddbox,&npme_major,&npme_minor);
1433 make_dd_communicators(fplog,cr,dd_node_order);
1435 /* Set overallocation to avoid frequent reallocation of arrays */
1436 set_over_alloc_dd(TRUE);
/* Non-DD path: every rank does both PP and PME work. */
1440 /* PME, if used, is done on all nodes with 1D decomposition */
1442 cr->duty = (DUTY_PP | DUTY_PME);
1445 if (!EI_TPI(inputrec->eI))
1447 npme_major = cr->nnodes;
/* Screw PBC needs DD; the gmx_fatal call head is missing from this
 * excerpt. */
1450 if (inputrec->ePBC == epbcSCREW)
1453 "pbc=%s is only implemented with domain decomposition",
1454 epbc_names[inputrec->ePBC]);
1460 /* After possible communicator splitting in make_dd_communicators.
1461 * we can set up the intra/inter node communication.
1463 gmx_setup_nodecomm(fplog,cr);
1466 /* Initialize per-node process ID and counters. */
1467 gmx_init_intra_counters(cr);
/* Report rank count, wording adjusted for tMPI threads vs MPI processes. */
1470 md_print_info(cr,fplog,"Using %d MPI %s\n",
1472 #ifdef GMX_THREAD_MPI
1473 cr->nnodes==1 ? "thread" : "threads"
1475 cr->nnodes==1 ? "process" : "processes"
/* Decide OpenMP thread counts for PP and PME work on this rank. */
1480 gmx_omp_nthreads_init(fplog, cr,
1481 hwinfo->nthreads_hw_avail,
1482 hw_opt->nthreads_omp,
1483 hw_opt->nthreads_omp_pme,
1484 (cr->duty & DUTY_PP) == 0,
1485 inputrec->cutoff_scheme == ecutsVERLET);
1487 gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi, minf.bUseGPU);
1489 /* getting number of PP/PME threads
1490 PME: env variable should be read only on one node to make sure it is
1491 identical everywhere;
1493 /* TODO nthreads_pp is only used for pinning threads.
1494 * This is a temporary solution until we have a hw topology library.
1496 nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
1497 nthreads_pme = gmx_omp_nthreads_get(emntPME);
1499 wcycle = wallcycle_init(fplog,resetstep,cr,nthreads_pp,nthreads_pme);
1503 /* Master synchronizes its value of reset_counters with all nodes
1504 * including PME only nodes */
1505 reset_counters = wcycle_get_reset_counters(wcycle);
1506 gmx_bcast_sim(sizeof(reset_counters),&reset_counters,cr);
1507 wcycle_set_reset_counters(wcycle, reset_counters);
/* ---- PP-rank setup: state broadcast, force record, QM/MM, mdatoms,
 * vsites, initial PBC fix-up, PME parameters ---- */
1511 if (cr->duty & DUTY_PP)
1513 /* For domain decomposition we allocate dynamically
1514 * in dd_partition_system.
/* With DD only the setup metadata is broadcast; full state follows the
 * first partitioning. Without DD the whole state is broadcast now. */
1516 if (DOMAINDECOMP(cr))
1518 bcast_state_setup(cr,state);
1524 bcast_state(cr,state,TRUE);
1528 /* Initiate forcerecord */
1530 fr->hwinfo = hwinfo;
1531 init_forcerec(fplog,oenv,fr,fcd,inputrec,mtop,cr,box,FALSE,
1532 opt2fn("-table",nfile,fnm),
1533 opt2fn("-tabletf",nfile,fnm),
1534 opt2fn("-tablep",nfile,fnm),
1535 opt2fn("-tableb",nfile,fnm),
1539 /* version for PCA_NOT_READ_NODE (see md.c) */
1540 /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
1541 "nofile","nofile","nofile","nofile",FALSE,pforce);
/* Separate dV/dlambda output only when MD_SEPPOT was requested. */
1543 fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);
1545 /* Initialize QM-MM */
1548 init_QMMMrec(cr,box,mtop,inputrec,fr);
1551 /* Initialize the mdatoms structure.
1552 * mdatoms is not filled with atom data,
1553 * as this can not be done now with domain decomposition.
1555 mdatoms = init_mdatoms(fplog,mtop,inputrec->efep!=efepNO);
1557 /* Initialize the virtual site communication */
1558 vsite = init_vsite(mtop,cr,FALSE);
1560 calc_shifts(box,fr->shift_vec);
1562 /* With periodic molecules the charge groups should be whole at start up
1563 * and the virtual sites should not be far from their proper positions.
/* Only on the master, for fresh (non-continuation) runs, and not for
 * periodic-molecule systems: make molecules whole and place vsites. */
1565 if (!inputrec->bContinuation && MASTER(cr) &&
1566 !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
1568 /* Make molecules whole at start of run */
1569 if (fr->ePBC != epbcNONE)
1571 do_pbc_first_mtop(fplog,inputrec->ePBC,box,mtop,state->x);
1575 /* Correct initial vsite positions are required
1576 * for the initial distribution in the domain decomposition
1577 * and for the initial shell prediction.
1579 construct_vsites_mtop(fplog,vsite,mtop,state->x);
/* PP ranks reuse the Ewald coefficient / pmedata from the forcerec. */
1583 if (EEL_PME(fr->eeltype))
1585 ewaldcoeff = fr->ewaldcoeff;
1586 pmedata = &fr->pmedata;
/* ---- PME-only rank path (presumably the else-branch of the DUTY_PP
 * block above — confirm): no state needed, compute ewaldcoeff afresh. */
1595 /* This is a PME only node */
1597 /* We don't need the state */
1600 ewaldcoeff = calc_ewaldcoeff(inputrec->rcoulomb, inputrec->ewald_rtol);
1604 /* Set the CPU affinity */
1605 set_cpu_affinity(fplog,cr,hw_opt,nthreads_pme,hwinfo,inputrec);
1607 /* Initiate PME if necessary,
1608 * either on all nodes or on dedicated PME nodes only. */
1609 if (EEL_PME(inputrec->coulombtype))
/* mdatoms is only set on PP ranks; PME-only ranks receive the perturbed
 * charge count via the broadcast below. */
1613 nChargePerturbed = mdatoms->nChargePerturbed;
1615 if (cr->npmenodes > 0)
1617 /* The PME only nodes need to know nChargePerturbed */
1618 gmx_bcast_sim(sizeof(nChargePerturbed),&nChargePerturbed,cr);
1621 if (cr->duty & DUTY_PME)
1623 status = gmx_pme_init(pmedata,cr,npme_major,npme_minor,inputrec,
1624 mtop ? mtop->natoms : 0,nChargePerturbed,
1625 (Flags & MD_REPRODUCIBLE),nthreads_pme);
1628 gmx_fatal(FARGS,"Error %d initializing PME",status);
/* Install signal handlers only for the integrators that honor them
 * (condition lines between these two comparisons are elided here). */
1634 if (integrator[inputrec->eI].func == do_md
1637 integrator[inputrec->eI].func == do_md_openmm
1641 /* Turn on signal handling on all nodes */
1643 * (A user signal from the PME nodes (if any)
1644 * is communicated to the PP nodes.
1646 signal_handler_install();
/* ---- Run phase: PP ranks set up pull/rotation/constraints/DD bondeds
 * and then call the selected integrator. ---- */
1649 if (cr->duty & DUTY_PP)
1651 if (inputrec->ePull != epullNO)
1653 /* Initialize pull code */
1654 init_pull(fplog,inputrec,nfile,fnm,mtop,cr,oenv, inputrec->fepvals->init_lambda,
1655 EI_DYNAMICS(inputrec->eI) && MASTER(cr),Flags);
1660 /* Initialize enforced rotation code */
1661 init_rot(fplog,inputrec,nfile,fnm,cr,state->x,box,mtop,oenv,
1665 constr = init_constraints(fplog,mtop,inputrec,ed,state,cr);
1667 if (DOMAINDECOMP(cr))
1669 dd_init_bondeds(fplog,cr->dd,mtop,vsite,constr,inputrec,
1670 Flags & MD_DDBONDCHECK,fr->cginfo_mb);
1672 set_dd_parameters(fplog,cr->dd,dlb_scale,inputrec,fr,&ddbox);
1674 setup_dd_grid(fplog,cr->dd);
1677 /* Now do whatever the user wants us to do (how flexible...) */
/* Dispatch through the integrator table indexed by inputrec->eI; several
 * argument lines of this call are elided in this excerpt. */
1678 integrator[inputrec->eI].func(fplog,cr,nfile,fnm,
1679 oenv,bVerbose,bCompact,
1682 nstepout,inputrec,mtop,
1684 mdatoms,nrnb,wcycle,ed,fr,
1685 repl_ex_nst,repl_ex_nex,repl_ex_seed,
1687 cpt_period,max_hours,
1692 if (inputrec->ePull != epullNO)
1694 finish_pull(fplog,inputrec->pull);
1699 finish_rot(fplog,inputrec->rot);
/* PME-only ranks loop in gmx_pmeonly() for the whole run instead. */
1706 gmx_pmeonly(*pmedata,cr,nrnb,wcycle,ewaldcoeff,FALSE,inputrec);
/* ---- Teardown: timing stats, final output, GPU cleanup, logs ---- */
1709 if (EI_DYNAMICS(inputrec->eI) || EI_TPI(inputrec->eI))
1711 /* Some timing stats */
/* Fall back to wall-clock time when no process CPU time was recorded. */
1714 if (runtime.proc == 0)
1716 runtime.proc = runtime.real;
1725 wallcycle_stop(wcycle,ewcRUN);
1727 /* Finish up, write some stuff
1728 * if rerunMD, don't write last frame again
/* Pass GPU nonbonded timings only when a CUDA nonbonded module was used. */
1730 finish_run(fplog,cr,ftp2fn(efSTO,nfile,fnm),
1731 inputrec,nrnb,wcycle,&runtime,
1732 fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ?
1733 nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL,
1735 EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
/* Release GPU resources; failure to free is only a warning, not fatal. */
1737 if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU)
1739 char gpu_err_str[STRLEN];
1741 /* free GPU memory and uninitialize GPU (by destroying the context) */
1742 nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
1744 if (!free_gpu(gpu_err_str))
1746 gmx_warning("On node %d failed to free GPU #%d: %s",
1747 cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
1751 if (opt2bSet("-membed",nfile,fnm))
/* With tMPI, hwinfo is shared by all threads (see the pointer broadcast
 * above), so free it once — the guard differs per build; the non-tMPI
 * branch is elided from this excerpt. */
1756 #ifdef GMX_THREAD_MPI
1757 if (PAR(cr) && SIMMASTER(cr))
1760 gmx_hardware_info_free(hwinfo);
1763 /* Does what it says */
1764 print_date_and_time(fplog,cr->nodeid,"Finished mdrun",&runtime);
1766 /* Close logfile already here if we were appending to it */
1767 if (MASTER(cr) && (Flags & MD_APPENDFILES))
1769 gmx_log_close(fplog);
/* Return code reflects any stop condition raised during the run
 * (e.g. signals caught by the handlers installed above). */
1772 rc=(int)gmx_get_stop_condition();
1774 #ifdef GMX_THREAD_MPI
1775 /* we need to join all threads. The sub-threads join when they
1776 exit this function, but the master thread needs to be told to
1778 if (PAR(cr) && MASTER(cr))