1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
4 * This source code is part of
8 * GROningen MAchine for Chemical Simulations
11 * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
12 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
13 * Copyright (c) 2001-2004, The GROMACS development team,
14 * check out http://www.gromacs.org for more information.
16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version 2
19 * of the License, or (at your option) any later version.
21 * If you want to redistribute modifications, please consider that
22 * scientific software is very special. Version control is crucial -
23 * bugs must be traceable. We will be happy to consider code for
24 * inclusion in the official distribution, but derived work must not
25 * be called official GROMACS. Details are found in the README & COPYING
26 * files - if they are missing, get the official version at www.gromacs.org.
28 * To help us fund GROMACS development, we humbly ask that you cite
29 * the papers on the package - you can find them in the top README file.
31 * For more info, check our website at http://www.gromacs.org
34 * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
42 #include <sys/syscall.h>
54 #include "md_logging.h"
55 #include "md_support.h"
65 #include "mpelogging.h"
71 #include "checkpoint.h"
72 #include "mtop_util.h"
73 #include "sighandler.h"
76 #include "gmx_detect_hardware.h"
77 #include "gmx_omp_nthreads.h"
78 #include "pull_rotation.h"
79 #include "calc_verletbuf.h"
80 #include "../mdlib/nbnxn_search.h"
81 #include "../mdlib/nbnxn_consts.h"
82 #include "gmx_fatal_collective.h"
84 #include "md_openmm.h"
99 #include "md_openmm.h"
102 #include "gpu_utils.h"
103 #include "nbnxn_cuda_data_mgmt.h"
106 gmx_integrator_t *func;
109 /* The array should match the eI array in include/types/enums.h */
110 #ifdef GMX_OPENMM /* FIXME do_md_openmm needs fixing */
111 const gmx_intp_t integrator[eiNR] = { {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm}, {do_md_openmm},{do_md_openmm}};
113 const gmx_intp_t integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md},{do_md}};
116 gmx_large_int_t deform_init_init_step_tpx;
117 matrix deform_init_box_tpx;
118 #ifdef GMX_THREAD_MPI
119 tMPI_Thread_mutex_t deform_init_box_mutex=TMPI_THREAD_MUTEX_INITIALIZER;
123 #ifdef GMX_THREAD_MPI
124 struct mdrunner_arglist
126 gmx_hw_opt_t *hw_opt;
139 const char *dddlb_opt;
144 const char *nbpu_opt;
155 const char *deviceOptions;
157 int ret; /* return value */
161 /* The function used for spawning threads. Extracts the mdrunner()
162 arguments from its one argument and calls mdrunner(), after making
164 static void mdrunner_start_fn(void *arg)
166 struct mdrunner_arglist *mda=(struct mdrunner_arglist*)arg;
167 struct mdrunner_arglist mc=*mda; /* copy the arg list to make sure
168 that it's thread-local. This doesn't
169 copy pointed-to items, of course,
170 but those are all const. */
171 t_commrec *cr; /* we need a local version of this */
175 fnm = dup_tfn(mc.nfile, mc.fnm);
177 cr = init_par_threads(mc.cr);
184 mda->ret=mdrunner(mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv,
185 mc.bVerbose, mc.bCompact, mc.nstglobalcomm,
186 mc.ddxyz, mc.dd_node_order, mc.rdd,
187 mc.rconstr, mc.dddlb_opt, mc.dlb_scale,
188 mc.ddcsx, mc.ddcsy, mc.ddcsz,
190 mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
191 mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce,
192 mc.cpt_period, mc.max_hours, mc.deviceOptions, mc.Flags);
195 /* called by mdrunner() to start a specific number of threads (including
196 the main thread) for thread-parallel runs. This in turn calls mdrunner()
198 All options besides nthreads are the same as for mdrunner(). */
199 static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
200 FILE *fplog,t_commrec *cr,int nfile,
201 const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
202 gmx_bool bCompact, int nstglobalcomm,
203 ivec ddxyz,int dd_node_order,real rdd,real rconstr,
204 const char *dddlb_opt,real dlb_scale,
205 const char *ddcsx,const char *ddcsy,const char *ddcsz,
206 const char *nbpu_opt,
207 int nsteps_cmdline, int nstepout,int resetstep,
208 int nmultisim,int repl_ex_nst,int repl_ex_nex, int repl_ex_seed,
209 real pforce,real cpt_period, real max_hours,
210 const char *deviceOptions, unsigned long Flags)
213 struct mdrunner_arglist *mda;
214 t_commrec *crn; /* the new commrec */
217 /* first check whether we even need to start tMPI */
218 if (hw_opt->nthreads_tmpi < 2)
223 /* a few small, one-time, almost unavoidable memory leaks: */
225 fnmn=dup_tfn(nfile, fnm);
227 /* fill the data structure to pass as void pointer to thread start fn */
234 mda->bVerbose=bVerbose;
235 mda->bCompact=bCompact;
236 mda->nstglobalcomm=nstglobalcomm;
237 mda->ddxyz[XX]=ddxyz[XX];
238 mda->ddxyz[YY]=ddxyz[YY];
239 mda->ddxyz[ZZ]=ddxyz[ZZ];
240 mda->dd_node_order=dd_node_order;
242 mda->rconstr=rconstr;
243 mda->dddlb_opt=dddlb_opt;
244 mda->dlb_scale=dlb_scale;
248 mda->nbpu_opt=nbpu_opt;
249 mda->nsteps_cmdline=nsteps_cmdline;
250 mda->nstepout=nstepout;
251 mda->resetstep=resetstep;
252 mda->nmultisim=nmultisim;
253 mda->repl_ex_nst=repl_ex_nst;
254 mda->repl_ex_nex=repl_ex_nex;
255 mda->repl_ex_seed=repl_ex_seed;
257 mda->cpt_period=cpt_period;
258 mda->max_hours=max_hours;
259 mda->deviceOptions=deviceOptions;
262 fprintf(stderr, "Starting %d tMPI threads\n",hw_opt->nthreads_tmpi);
264 /* now spawn new threads that start mdrunner_start_fn(), while
265 the main thread returns */
266 ret=tMPI_Init_fn(TRUE, hw_opt->nthreads_tmpi,
267 (hw_opt->bThreadPinning ? TMPI_AFFINITY_ALL_CORES : TMPI_AFFINITY_NONE),
268 mdrunner_start_fn, (void*)(mda) );
269 if (ret!=TMPI_SUCCESS)
272 /* make a new comm_rec to reflect the new situation */
273 crn=init_par_threads(cr);
278 static int get_tmpi_omp_thread_distribution(const gmx_hw_opt_t *hw_opt,
284 /* There are no separate PME nodes here, as we ensured in
285 * check_and_update_hw_opt that nthreads_tmpi>0 with PME nodes
286 * and a conditional ensures we would not have ended up here.
287 * Note that separate PME nodes might be switched on later.
291 nthreads_tmpi = ngpu;
292 if (nthreads_tot > 0 && nthreads_tot < nthreads_tmpi)
294 nthreads_tmpi = nthreads_tot;
297 else if (hw_opt->nthreads_omp > 0)
299 if (hw_opt->nthreads_omp > nthreads_tot)
301 gmx_fatal(FARGS,"More OpenMP threads requested (%d) than the total number of threads requested (%d)",hw_opt->nthreads_omp,nthreads_tot);
303 nthreads_tmpi = nthreads_tot/hw_opt->nthreads_omp;
307 /* TODO choose nthreads_omp based on hardware topology
308 when we have a hardware topology detection library */
309 /* Don't use OpenMP parallelization */
310 nthreads_tmpi = nthreads_tot;
313 return nthreads_tmpi;
317 /* Get the number of threads to use for thread-MPI based on how many
318 * were requested, which algorithms we're using,
319 * and how many particles there are.
320 * At the point we have already called check_and_update_hw_opt.
321 * Thus all options should be internally consistent and consistent
322 * with the hardware, except that ntmpi could be larger than #GPU.
324 static int get_nthreads_mpi(gmx_hw_info_t *hwinfo,
325 gmx_hw_opt_t *hw_opt,
326 t_inputrec *inputrec, gmx_mtop_t *mtop,
330 int nthreads_tot_max,nthreads_tmpi,nthreads_new,ngpu;
331 int min_atoms_per_mpi_thread;
336 if (hw_opt->nthreads_tmpi > 0)
338 /* Trivial, return right away */
339 return hw_opt->nthreads_tmpi;
342 /* How many total (#tMPI*#OpenMP) threads can we start? */
343 if (hw_opt->nthreads_tot > 0)
345 nthreads_tot_max = hw_opt->nthreads_tot;
349 nthreads_tot_max = tMPI_Thread_get_hw_number();
352 bCanUseGPU = (inputrec->cutoff_scheme == ecutsVERLET && hwinfo->bCanUseGPU);
355 ngpu = hwinfo->gpu_info.ncuda_dev_use;
363 get_tmpi_omp_thread_distribution(hw_opt,nthreads_tot_max,ngpu);
365 if (inputrec->eI == eiNM || EI_TPI(inputrec->eI))
367 /* Steps are divided over the nodes iso splitting the atoms */
368 min_atoms_per_mpi_thread = 0;
374 min_atoms_per_mpi_thread = MIN_ATOMS_PER_GPU;
378 min_atoms_per_mpi_thread = MIN_ATOMS_PER_MPI_THREAD;
382 /* Check if an algorithm does not support parallel simulation. */
383 if (nthreads_tmpi != 1 &&
384 ( inputrec->eI == eiLBFGS ||
385 inputrec->coulombtype == eelEWALD ) )
389 md_print_warn(cr,fplog,"The integration or electrostatics algorithm doesn't support parallel runs. Using a single thread-MPI thread.\n");
390 if (hw_opt->nthreads_tmpi > nthreads_tmpi)
392 gmx_fatal(FARGS,"You asked for more than 1 thread-MPI thread, but an algorithm doesn't support that");
395 else if (mtop->natoms/nthreads_tmpi < min_atoms_per_mpi_thread)
397 /* the thread number was chosen automatically, but there are too many
398 threads (too few atoms per thread) */
399 nthreads_new = max(1,mtop->natoms/min_atoms_per_mpi_thread);
401 if (nthreads_new > 8 || (nthreads_tmpi == 8 && nthreads_new > 4))
403 /* TODO replace this once we have proper HT detection
404 * Use only multiples of 4 above 8 threads
405 * or with an 8-core processor
406 * (to avoid 6 threads on 8 core processors with 4 real cores).
408 nthreads_new = (nthreads_new/4)*4;
410 else if (nthreads_new > 4)
412 /* Avoid 5 or 7 threads */
413 nthreads_new = (nthreads_new/2)*2;
416 nthreads_tmpi = nthreads_new;
418 fprintf(stderr,"\n");
419 fprintf(stderr,"NOTE: Parallelization is limited by the small number of atoms,\n");
420 fprintf(stderr," only starting %d thread-MPI threads.\n",nthreads_tmpi);
421 fprintf(stderr," You can use the -nt and/or -ntmpi option to optimize the number of threads.\n\n");
424 return nthreads_tmpi;
426 #endif /* GMX_THREAD_MPI */
429 /* Environment variable for setting nstlist */
430 static const char* NSTLIST_ENVVAR = "GMX_NSTLIST";
431 /* Try to increase nstlist when using a GPU with nstlist less than this */
432 static const int NSTLIST_GPU_ENOUGH = 20;
433 /* Increase nstlist until the non-bonded cost increases more than this factor */
434 static const float NBNXN_GPU_LIST_OK_FAC = 1.25;
435 /* Don't increase nstlist beyond a non-bonded cost increases of this factor */
436 static const float NBNXN_GPU_LIST_MAX_FAC = 1.40;
438 /* Try to increase nstlist when running on a GPU */
439 static void increase_nstlist(FILE *fp,t_commrec *cr,
440 t_inputrec *ir,const gmx_mtop_t *mtop,matrix box)
443 int nstlist_orig,nstlist_prev;
444 verletbuf_list_setup_t ls;
445 real rlist_inc,rlist_ok,rlist_max,rlist_new,rlist_prev;
448 gmx_bool bBox,bDD,bCont;
449 const char *nstl_fmt="\nFor optimal performance with a GPU nstlist (now %d) should be larger.\nThe optimum depends on your CPU and GPU resources.\nYou might want to try several nstlist values.\n";
450 const char *vbd_err="Can not increase nstlist for GPU run because verlet-buffer-drift is not set or used";
451 const char *box_err="Can not increase nstlist for GPU run because the box is too small";
452 const char *dd_err ="Can not increase nstlist for GPU run because of domain decomposition limitations";
455 /* Number of + nstlist alternative values to try when switching */
456 const int nstl[]={ 20, 25, 40, 50 };
457 #define NNSTL sizeof(nstl)/sizeof(nstl[0])
459 env = getenv(NSTLIST_ENVVAR);
464 fprintf(fp,nstl_fmt,ir->nstlist);
468 if (ir->verletbuf_drift == 0)
470 gmx_fatal(FARGS,"You are using an old tpr file with a GPU, please generate a new tpr file with an up to date version of grompp");
473 if (ir->verletbuf_drift < 0)
477 fprintf(stderr,"%s\n",vbd_err);
481 fprintf(fp,"%s\n",vbd_err);
487 nstlist_orig = ir->nstlist;
490 sprintf(buf,"Getting nstlist from environment variable GMX_NSTLIST=%s",env);
493 fprintf(stderr,"%s\n",buf);
497 fprintf(fp,"%s\n",buf);
499 sscanf(env,"%d",&ir->nstlist);
502 verletbuf_get_list_setup(TRUE,&ls);
504 /* Allow rlist to make the list double the size of the cut-off sphere */
505 rlist_inc = nbnxn_get_rlist_effective_inc(NBNXN_GPU_CLUSTER_SIZE,mtop->natoms/det(box));
506 rlist_ok = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_OK_FAC,1.0/3.0) - rlist_inc;
507 rlist_max = (max(ir->rvdw,ir->rcoulomb) + rlist_inc)*pow(NBNXN_GPU_LIST_MAX_FAC,1.0/3.0) - rlist_inc;
510 fprintf(debug,"GPU nstlist tuning: rlist_inc %.3f rlist_max %.3f\n",
511 rlist_inc,rlist_max);
515 nstlist_prev = nstlist_orig;
516 rlist_prev = ir->rlist;
521 ir->nstlist = nstl[i];
524 /* Set the pair-list buffer size in ir */
525 calc_verlet_buffer_size(mtop,det(box),ir,ir->verletbuf_drift,&ls,
528 /* Does rlist fit in the box? */
529 bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC,box));
531 if (bBox && DOMAINDECOMP(cr))
533 /* Check if rlist fits in the domain decomposition */
534 if (inputrec2nboundeddim(ir) < DIM)
536 gmx_incons("Changing nstlist with domain decomposition and unbounded dimensions is not implemented yet");
538 copy_mat(box,state_tmp.box);
539 bDD = change_dd_cutoff(cr,&state_tmp,ir,rlist_new);
546 if (bBox && bDD && rlist_new <= rlist_max)
548 /* Increase nstlist */
549 nstlist_prev = ir->nstlist;
550 rlist_prev = rlist_new;
551 bCont = (i+1 < NNSTL && rlist_new < rlist_ok);
555 /* Stick with the previous nstlist */
556 ir->nstlist = nstlist_prev;
557 rlist_new = rlist_prev;
569 gmx_warning(!bBox ? box_err : dd_err);
572 fprintf(fp,"\n%s\n",bBox ? box_err : dd_err);
574 ir->nstlist = nstlist_orig;
576 else if (ir->nstlist != nstlist_orig || rlist_new != ir->rlist)
578 sprintf(buf,"Changing nstlist from %d to %d, rlist from %g to %g",
579 nstlist_orig,ir->nstlist,
580 ir->rlist,rlist_new);
583 fprintf(stderr,"%s\n\n",buf);
587 fprintf(fp,"%s\n\n",buf);
589 ir->rlist = rlist_new;
590 ir->rlistlong = rlist_new;
594 static void prepare_verlet_scheme(FILE *fplog,
595 gmx_hw_info_t *hwinfo,
597 gmx_hw_opt_t *hw_opt,
598 const char *nbpu_opt,
600 const gmx_mtop_t *mtop,
604 /* Here we only check for GPU usage on the MPI master process,
605 * as here we don't know how many GPUs we will use yet.
606 * We check for a GPU on all processes later.
608 *bUseGPU = hwinfo->bCanUseGPU || (getenv("GMX_EMULATE_GPU") != NULL);
610 if (ir->verletbuf_drift > 0)
612 /* Update the Verlet buffer size for the current run setup */
613 verletbuf_list_setup_t ls;
616 /* Here we assume CPU acceleration is on. But as currently
617 * calc_verlet_buffer_size gives the same results for 4x8 and 4x4
618 * and 4x2 gives a larger buffer than 4x4, this is ok.
620 verletbuf_get_list_setup(*bUseGPU,&ls);
622 calc_verlet_buffer_size(mtop,det(box),ir,
623 ir->verletbuf_drift,&ls,
625 if (rlist_new != ir->rlist)
629 fprintf(fplog,"\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
631 ls.cluster_size_i,ls.cluster_size_j);
633 ir->rlist = rlist_new;
634 ir->rlistlong = rlist_new;
638 /* With GPU or emulation we should check nstlist for performance */
639 if ((EI_DYNAMICS(ir->eI) &&
641 ir->nstlist < NSTLIST_GPU_ENOUGH) ||
642 getenv(NSTLIST_ENVVAR) != NULL)
644 /* Choose a better nstlist */
645 increase_nstlist(fplog,cr,ir,mtop,box);
649 static void convert_to_verlet_scheme(FILE *fplog,
651 gmx_mtop_t *mtop,real box_vol)
653 char *conv_mesg="Converting input file with group cut-off scheme to the Verlet cut-off scheme";
655 md_print_warn(NULL,fplog,"%s\n",conv_mesg);
657 ir->cutoff_scheme = ecutsVERLET;
658 ir->verletbuf_drift = 0.005;
660 if (ir->rcoulomb != ir->rvdw)
662 gmx_fatal(FARGS,"The VdW and Coulomb cut-offs are different, whereas the Verlet scheme only supports equal cut-offs");
665 if (ir->vdwtype == evdwUSER || EEL_USER(ir->coulombtype))
667 gmx_fatal(FARGS,"User non-bonded potentials are not (yet) supported with the Verlet scheme");
669 else if (EVDW_SWITCHED(ir->vdwtype) || EEL_SWITCHED(ir->coulombtype))
671 md_print_warn(NULL,fplog,"Converting switched or shifted interactions to a shifted potential (without force shift), this will lead to slightly different interaction potentials");
673 if (EVDW_SWITCHED(ir->vdwtype))
675 ir->vdwtype = evdwCUT;
677 if (EEL_SWITCHED(ir->coulombtype))
679 if (EEL_FULL(ir->coulombtype))
681 /* With full electrostatic only PME can be switched */
682 ir->coulombtype = eelPME;
686 md_print_warn(NULL,fplog,"NOTE: Replacing %s electrostatics with reaction-field with epsilon-rf=inf\n",eel_names[ir->coulombtype]);
687 ir->coulombtype = eelRF;
688 ir->epsilon_rf = 0.0;
692 /* We set the target energy drift to a small number.
693 * Note that this is only for testing. For production the user
694 * should think about this and set the mdp options.
696 ir->verletbuf_drift = 1e-4;
699 if (inputrec2nboundeddim(ir) != 3)
701 gmx_fatal(FARGS,"Can only convert old tpr files to the Verlet cut-off scheme with 3D pbc");
704 if (ir->efep != efepNO || ir->implicit_solvent != eisNO)
706 gmx_fatal(FARGS,"Will not convert old tpr files to the Verlet cut-off scheme with free-energy calculations or implicit solvent");
709 if (EI_DYNAMICS(ir->eI) && !(EI_MD(ir->eI) && ir->etc == etcNO))
711 verletbuf_list_setup_t ls;
713 verletbuf_get_list_setup(FALSE,&ls);
714 calc_verlet_buffer_size(mtop,box_vol,ir,ir->verletbuf_drift,&ls,
719 ir->verletbuf_drift = -1;
720 ir->rlist = 1.05*max(ir->rvdw,ir->rcoulomb);
723 gmx_mtop_remove_chargegroups(mtop);
727 /* Set CPU affinity. Can be important for performance.
728 On some systems (e.g. Cray) CPU Affinity is set by default.
729 But default assigning doesn't work (well) with only some ranks
730 having threads. This causes very low performance.
731 External tools have cumbersome syntax for setting affinity
732 in the case that only some ranks have threads.
733 Thus it is important that GROMACS sets the affinity internally
734 if only PME is using threads.
736 static void set_cpu_affinity(FILE *fplog,
738 gmx_hw_opt_t *hw_opt,
740 const gmx_hw_info_t *hwinfo,
741 const t_inputrec *inputrec)
743 #if defined GMX_THREAD_MPI
744 /* With the number of TMPI threads equal to the number of cores
745 * we already pinned in thread-MPI, so don't pin again here.
747 if (hw_opt->nthreads_tmpi == tMPI_Thread_get_hw_number())
753 #ifdef GMX_OPENMP /* TODO: actually we could do this even without OpenMP?! */
754 #ifdef __linux /* TODO: only linux? why not everywhere if sched_setaffinity is available */
755 if (hw_opt->bThreadPinning)
757 int thread, nthread_local, nthread_node, nthread_hw_max, nphyscore;
761 /* threads on this MPI process or TMPI thread */
762 if (cr->duty & DUTY_PP)
764 nthread_local = gmx_omp_nthreads_get(emntNonbonded);
768 nthread_local = gmx_omp_nthreads_get(emntPME);
771 /* map the current process to cores */
773 nthread_node = nthread_local;
775 if (PAR(cr) || MULTISIM(cr))
777 /* We need to determine a scan of the thread counts in this
782 MPI_Comm_split(MPI_COMM_WORLD,gmx_hostname_num(),cr->nodeid_intra,
784 MPI_Scan(&nthread_local,&thread,1,MPI_INT,MPI_SUM,comm_intra);
785 /* MPI_Scan is inclusive, but here we need exclusive */
786 thread -= nthread_local;
787 /* Get the total number of threads on this physical node */
788 MPI_Allreduce(&nthread_local,&nthread_node,1,MPI_INT,MPI_SUM,comm_intra);
789 MPI_Comm_free(&comm_intra);
794 if (hw_opt->core_pinning_offset > 0)
796 offset = hw_opt->core_pinning_offset;
799 fprintf(stderr, "Applying core pinning offset %d\n", offset);
803 fprintf(fplog, "Applying core pinning offset %d\n", offset);
807 /* With Intel Hyper-Threading enabled, we want to pin consecutive
808 * threads to physical cores when using more threads than physical
809 * cores or when the user requests so.
811 nthread_hw_max = hwinfo->nthreads_hw_avail;
813 if (hw_opt->bPinHyperthreading ||
814 (gmx_cpuid_x86_smt(hwinfo->cpuid_info) == GMX_CPUID_X86_SMT_ENABLED &&
815 nthread_node > nthread_hw_max/2 && getenv("GMX_DISABLE_PINHT") == NULL))
817 if (gmx_cpuid_x86_smt(hwinfo->cpuid_info) != GMX_CPUID_X86_SMT_ENABLED)
819 /* We print to stderr on all processes, as we might have
820 * different settings on different physical nodes.
822 if (gmx_cpuid_vendor(hwinfo->cpuid_info) != GMX_CPUID_VENDOR_INTEL)
824 md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
825 "but non-Intel CPU detected (vendor: %s)\n",
826 gmx_cpuid_vendor_string[gmx_cpuid_vendor(hwinfo->cpuid_info)]);
830 md_print_warn(NULL, fplog, "Pinning for Hyper-Threading layout requested, "
831 "but the CPU detected does not have Intel Hyper-Threading support "
832 "(or it is turned off)\n");
835 nphyscore = nthread_hw_max/2;
839 fprintf(stderr, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
844 fprintf(fplog, "Pinning to Hyper-Threading cores with %d physical cores in a compute node\n",
849 /* set the per-thread affinity */
850 #pragma omp parallel firstprivate(thread) num_threads(nthread_local)
856 thread += gmx_omp_get_thread_num();
859 core = offset + thread;
863 /* Lock pairs of threads to the same hyperthreaded core */
864 core = offset + thread/2 + (thread % 2)*nphyscore;
866 CPU_SET(core, &mask);
867 sched_setaffinity((pid_t) syscall (SYS_gettid), sizeof(cpu_set_t), &mask);
871 #endif /* GMX_OPENMP */
875 static void check_and_update_hw_opt(gmx_hw_opt_t *hw_opt,
878 gmx_omp_nthreads_read_env(&hw_opt->nthreads_omp);
880 #ifndef GMX_THREAD_MPI
881 if (hw_opt->nthreads_tot > 0)
883 gmx_fatal(FARGS,"Setting the total number of threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
885 if (hw_opt->nthreads_tmpi > 0)
887 gmx_fatal(FARGS,"Setting the number of thread-MPI threads is only supported with thread-MPI and Gromacs was compiled without thread-MPI");
891 if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp_pme <= 0)
893 /* We have the same number of OpenMP threads for PP and PME processes,
894 * thus we can perform several consistency checks.
896 if (hw_opt->nthreads_tmpi > 0 &&
897 hw_opt->nthreads_omp > 0 &&
898 hw_opt->nthreads_tot != hw_opt->nthreads_tmpi*hw_opt->nthreads_omp)
900 gmx_fatal(FARGS,"The total number of threads requested (%d) does not match the thread-MPI threads (%d) times the OpenMP threads (%d) requested",
901 hw_opt->nthreads_tot,hw_opt->nthreads_tmpi,hw_opt->nthreads_omp);
904 if (hw_opt->nthreads_tmpi > 0 &&
905 hw_opt->nthreads_tot % hw_opt->nthreads_tmpi != 0)
907 gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of thread-MPI threads requested (%d)",
908 hw_opt->nthreads_tot,hw_opt->nthreads_tmpi);
911 if (hw_opt->nthreads_omp > 0 &&
912 hw_opt->nthreads_tot % hw_opt->nthreads_omp != 0)
914 gmx_fatal(FARGS,"The total number of threads requested (%d) is not divisible by the number of OpenMP threads requested (%d)",
915 hw_opt->nthreads_tot,hw_opt->nthreads_omp);
918 if (hw_opt->nthreads_tmpi > 0 &&
919 hw_opt->nthreads_omp <= 0)
921 hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
926 if (hw_opt->nthreads_omp > 1)
928 gmx_fatal(FARGS,"OpenMP threads are requested, but Gromacs was compiled without OpenMP support");
932 if (cutoff_scheme == ecutsGROUP)
934 /* We only have OpenMP support for PME only nodes */
935 if (hw_opt->nthreads_omp > 1)
937 gmx_fatal(FARGS,"OpenMP threads have been requested with cut-off scheme %s, but these are only supported with cut-off scheme %s",
938 ecutscheme_names[cutoff_scheme],
939 ecutscheme_names[ecutsVERLET]);
941 hw_opt->nthreads_omp = 1;
944 if (hw_opt->nthreads_omp_pme > 0 && hw_opt->nthreads_omp <= 0)
946 gmx_fatal(FARGS,"You need to specify -ntomp in addition to -ntomp_pme");
949 if (hw_opt->nthreads_tot == 1)
951 hw_opt->nthreads_tmpi = 1;
953 if (hw_opt->nthreads_omp > 1)
955 gmx_fatal(FARGS,"You requested %d OpenMP threads with %d total threads",
956 hw_opt->nthreads_tmpi,hw_opt->nthreads_tot);
958 hw_opt->nthreads_omp = 1;
961 if (hw_opt->nthreads_omp_pme <= 0 && hw_opt->nthreads_omp > 0)
963 hw_opt->nthreads_omp_pme = hw_opt->nthreads_omp;
968 fprintf(debug,"hw_opt: nt %d ntmpi %d ntomp %d ntomp_pme %d gpu_id '%s'\n",
969 hw_opt->nthreads_tot,
970 hw_opt->nthreads_tmpi,
971 hw_opt->nthreads_omp,
972 hw_opt->nthreads_omp_pme,
973 hw_opt->gpu_id!=NULL ? hw_opt->gpu_id : "");
979 /* Override the value in inputrec with value passed on the command line (if any) */
980 static void override_nsteps_cmdline(FILE *fplog,
988 /* override with anything else than the default -2 */
989 if (nsteps_cmdline > -2)
993 ir->nsteps = nsteps_cmdline;
994 if (EI_DYNAMICS(ir->eI))
996 sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps, %.3f ps",
997 nsteps_cmdline, nsteps_cmdline*ir->delta_t);
1001 sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps",
1005 md_print_warn(cr, fplog, "%s\n", stmp);
1009 /* Data structure set by SIMMASTER which needs to be passed to all nodes
1010 * before the other nodes have read the tpx file and called gmx_detect_hardware.
1013 int cutoff_scheme; /* The cutoff-scheme from inputrec_t */
1014 gmx_bool bUseGPU; /* Use GPU or GPU emulation */
1017 int mdrunner(gmx_hw_opt_t *hw_opt,
1018 FILE *fplog,t_commrec *cr,int nfile,
1019 const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
1020 gmx_bool bCompact, int nstglobalcomm,
1021 ivec ddxyz,int dd_node_order,real rdd,real rconstr,
1022 const char *dddlb_opt,real dlb_scale,
1023 const char *ddcsx,const char *ddcsy,const char *ddcsz,
1024 const char *nbpu_opt,
1025 int nsteps_cmdline, int nstepout,int resetstep,
1026 int nmultisim,int repl_ex_nst,int repl_ex_nex,
1027 int repl_ex_seed, real pforce,real cpt_period,real max_hours,
1028 const char *deviceOptions, unsigned long Flags)
1030 gmx_bool bForceUseGPU,bTryUseGPU;
1031 double nodetime=0,realtime;
1032 t_inputrec *inputrec;
1033 t_state *state=NULL;
1035 gmx_ddbox_t ddbox={0};
1036 int npme_major,npme_minor;
1039 gmx_mtop_t *mtop=NULL;
1040 t_mdatoms *mdatoms=NULL;
1041 t_forcerec *fr=NULL;
1044 gmx_pme_t *pmedata=NULL;
1045 gmx_vsite_t *vsite=NULL;
1046 gmx_constr_t constr;
1047 int i,m,nChargePerturbed=-1,status,nalloc;
1049 gmx_wallcycle_t wcycle;
1050 gmx_bool bReadRNG,bReadEkin;
1052 gmx_runtime_t runtime;
1054 gmx_large_int_t reset_counters;
1055 gmx_edsam_t ed=NULL;
1056 t_commrec *cr_old=cr;
1059 gmx_membed_t membed=NULL;
1060 gmx_hw_info_t *hwinfo=NULL;
1061 master_inf_t minf={-1,FALSE};
1063 /* CAUTION: threads may be started later on in this function, so
1064 cr doesn't reflect the final parallel state right now */
1068 if (Flags & MD_APPENDFILES)
1073 bForceUseGPU = (strncmp(nbpu_opt, "gpu", 3) == 0);
1074 bTryUseGPU = (strncmp(nbpu_opt, "auto", 4) == 0) || bForceUseGPU;
1079 /* Read (nearly) all data required for the simulation */
1080 read_tpx_state(ftp2fn(efTPX,nfile,fnm),inputrec,state,NULL,mtop);
1082 if (inputrec->cutoff_scheme != ecutsVERLET &&
1083 ((Flags & MD_TESTVERLET) || getenv("GMX_VERLET_SCHEME") != NULL))
1085 convert_to_verlet_scheme(fplog,inputrec,mtop,det(state->box));
1088 /* Detect hardware, gather information. With tMPI only thread 0 does it
1089 * and after threads are started broadcasts hwinfo around. */
1091 gmx_detect_hardware(fplog, hwinfo, cr,
1092 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
1094 minf.cutoff_scheme = inputrec->cutoff_scheme;
1095 minf.bUseGPU = FALSE;
1097 if (inputrec->cutoff_scheme == ecutsVERLET)
1099 prepare_verlet_scheme(fplog,hwinfo,cr,hw_opt,nbpu_opt,
1100 inputrec,mtop,state->box,
1103 else if (hwinfo->bCanUseGPU)
1105 md_print_warn(cr,fplog,
1106 "NOTE: GPU(s) found, but the current simulation can not use GPUs\n"
1107 " To use a GPU, set the mdp option: cutoff-scheme = Verlet\n"
1108 " (for quick performance testing you can use the -testverlet option)\n");
1112 gmx_fatal(FARGS,"GPU requested, but can't be used without cutoff-scheme=Verlet");
1116 #ifndef GMX_THREAD_MPI
1119 gmx_bcast_sim(sizeof(minf),&minf,cr);
1122 if (minf.bUseGPU && cr->npmenodes == -1)
1124 /* Don't automatically use PME-only nodes with GPUs */
1128 #ifdef GMX_THREAD_MPI
1129 /* With thread-MPI inputrec is only set here on the master thread */
1133 check_and_update_hw_opt(hw_opt,minf.cutoff_scheme);
1135 #ifdef GMX_THREAD_MPI
1136 if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
1138 gmx_fatal(FARGS,"You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME nodes");
1142 if (hw_opt->nthreads_omp_pme != hw_opt->nthreads_omp &&
1145 gmx_fatal(FARGS,"You need to explicitly specify the number of PME nodes (-npme) when using different number of OpenMP threads for PP and PME nodes");
1149 /* Check for externally set OpenMP affinity and turn off internal
1150 * pinning if any is found. We need to do this check early to tell
1151 * thread-MPI whether it should do pinning when spawning threads.
1153 gmx_omp_check_thread_affinity(fplog, cr, hw_opt);
1155 #ifdef GMX_THREAD_MPI
1158 /* NOW the threads will be started: */
1159 hw_opt->nthreads_tmpi = get_nthreads_mpi(hwinfo,
1163 if (hw_opt->nthreads_tot > 0 && hw_opt->nthreads_omp <= 0)
1165 hw_opt->nthreads_omp = hw_opt->nthreads_tot/hw_opt->nthreads_tmpi;
1168 if (hw_opt->nthreads_tmpi > 1)
1170 /* now start the threads. */
1171 cr=mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm,
1172 oenv, bVerbose, bCompact, nstglobalcomm,
1173 ddxyz, dd_node_order, rdd, rconstr,
1174 dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
1176 nsteps_cmdline, nstepout, resetstep, nmultisim,
1177 repl_ex_nst, repl_ex_nex, repl_ex_seed, pforce,
1178 cpt_period, max_hours, deviceOptions,
1180 /* the main thread continues here with a new cr. We don't deallocate
1181 the old cr because other threads may still be reading it. */
1184 gmx_comm("Failed to spawn threads");
1189 /* END OF CAUTION: cr is now reliable */
1191 /* g_membed initialisation *
1192 * Because we change the mtop, init_membed is called before the init_parallel *
1193 * (in case we ever want to make it run in parallel) */
1194 if (opt2bSet("-membed",nfile,fnm))
1198 fprintf(stderr,"Initializing membed");
1200 membed = init_membed(fplog,nfile,fnm,mtop,inputrec,state,cr,&cpt_period);
1205 /* now broadcast everything to the non-master nodes/threads: */
1206 init_parallel(fplog, cr, inputrec, mtop);
1208 /* This check needs to happen after get_nthreads_mpi() */
1209 if (inputrec->cutoff_scheme == ecutsVERLET && (Flags & MD_PARTDEC))
1211 gmx_fatal_collective(FARGS,cr,NULL,
1212 "The Verlet cut-off scheme is not supported with particle decomposition.\n"
1213 "You can achieve the same effect as particle decomposition by running in parallel using only OpenMP threads.");
1218 pr_inputrec(fplog,0,"Input Parameters",inputrec,FALSE);
1221 #if defined GMX_THREAD_MPI
1222 /* With tMPI we detected on thread 0 and we'll just pass the hwinfo pointer
1223 * to the other threads -- slightly uncool, but works fine, just need to
1224 * make sure that the data doesn't get freed twice. */
1231 gmx_bcast(sizeof(&hwinfo), &hwinfo, cr);
1234 if (PAR(cr) && !SIMMASTER(cr))
1236 /* now we have inputrec on all nodes, can run the detection */
1237 /* TODO: perhaps it's better to propagate within a node instead? */
1239 gmx_detect_hardware(fplog, hwinfo, cr,
1240 bForceUseGPU, bTryUseGPU, hw_opt->gpu_id);
1244 /* now make sure the state is initialized and propagated */
1245 set_state_entries(state,inputrec,cr->nnodes);
1247 /* remove when vv and rerun works correctly! */
1248 if (PAR(cr) && EI_VV(inputrec->eI) && ((Flags & MD_RERUN) || (Flags & MD_RERUN_VSITE)))
1251 "Currently can't do velocity verlet with rerun in parallel.");
1254 /* A parallel command line option consistency check that we can
1255 only do after any threads have started. */
1257 (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
1260 "The -dd or -npme option request a parallel simulation, "
1262 "but %s was compiled without threads or MPI enabled"
1264 #ifdef GMX_THREAD_MPI
1265 "but the number of threads (option -nt) is 1"
1267 "but %s was not started through mpirun/mpiexec or only one process was requested through mpirun/mpiexec"
1274 if ((Flags & MD_RERUN) &&
1275 (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
1277 gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun");
1280 if (can_use_allvsall(inputrec,mtop,TRUE,cr,fplog) && PAR(cr))
1282 /* All-vs-all loops do not work with domain decomposition */
1283 Flags |= MD_PARTDEC;
1286 if (!EEL_PME(inputrec->coulombtype) || (Flags & MD_PARTDEC))
1288 if (cr->npmenodes > 0)
1290 if (!EEL_PME(inputrec->coulombtype))
1292 gmx_fatal_collective(FARGS,cr,NULL,
1293 "PME nodes are requested, but the system does not use PME electrostatics");
1295 if (Flags & MD_PARTDEC)
1297 gmx_fatal_collective(FARGS,cr,NULL,
1298 "PME nodes are requested, but particle decomposition does not support separate PME nodes");
1306 fcRegisterSteps(inputrec->nsteps,inputrec->init_step);
1309 /* NMR restraints must be initialized before load_checkpoint,
1310 * since with time averaging the history is added to t_state.
1311 * For proper consistency check we therefore need to extend
1313 * So the PME-only nodes (if present) will also initialize
1314 * the distance restraints.
1318 /* This needs to be called before read_checkpoint to extend the state */
1319 init_disres(fplog,mtop,inputrec,cr,Flags & MD_PARTDEC,fcd,state);
1321 if (gmx_mtop_ftype_count(mtop,F_ORIRES) > 0)
1323 if (PAR(cr) && !(Flags & MD_PARTDEC))
1325 gmx_fatal(FARGS,"Orientation restraints do not work (yet) with domain decomposition, use particle decomposition (mdrun option -pd)");
1327 /* Orientation restraints */
1330 init_orires(fplog,mtop,state->x,inputrec,cr->ms,&(fcd->orires),
1335 if (DEFORM(*inputrec))
1337 /* Store the deform reference box before reading the checkpoint */
1340 copy_mat(state->box,box);
1344 gmx_bcast(sizeof(box),box,cr);
1346 /* Because we do not have the update struct available yet
1347 * in which the reference values should be stored,
1348 * we store them temporarily in static variables.
1349 * This should be thread safe, since they are only written once
1350 * and with identical values.
1352 #ifdef GMX_THREAD_MPI
1353 tMPI_Thread_mutex_lock(&deform_init_box_mutex);
1355 deform_init_init_step_tpx = inputrec->init_step;
1356 copy_mat(box,deform_init_box_tpx);
1357 #ifdef GMX_THREAD_MPI
1358 tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
1362 if (opt2bSet("-cpi",nfile,fnm))
1364 /* Check if checkpoint file exists before doing continuation.
1365 * This way we can use identical input options for the first and subsequent runs...
1367 if( gmx_fexist_master(opt2fn_master("-cpi",nfile,fnm,cr),cr) )
1369 load_checkpoint(opt2fn_master("-cpi",nfile,fnm,cr),&fplog,
1370 cr,Flags & MD_PARTDEC,ddxyz,
1371 inputrec,state,&bReadRNG,&bReadEkin,
1372 (Flags & MD_APPENDFILES),
1373 (Flags & MD_APPENDFILESSET));
1377 Flags |= MD_READ_RNG;
1381 Flags |= MD_READ_EKIN;
1386 if (((MASTER(cr) || (Flags & MD_SEPPOT)) && (Flags & MD_APPENDFILES))
1387 #ifdef GMX_THREAD_MPI
1388 /* With thread MPI only the master node/thread exists in mdrun.c,
1389 * therefore non-master nodes need to open the "seppot" log file here.
1391 || (!MASTER(cr) && (Flags & MD_SEPPOT))
1395 gmx_log_open(ftp2fn(efLOG,nfile,fnm),cr,!(Flags & MD_SEPPOT),
1399 /* override nsteps with value from cmdline */
1400 override_nsteps_cmdline(fplog, nsteps_cmdline, inputrec, cr);
1404 copy_mat(state->box,box);
1409 gmx_bcast(sizeof(box),box,cr);
1412 /* Essential dynamics */
1413 if (opt2bSet("-ei",nfile,fnm))
1415 /* Open input and output files, allocate space for ED data structure */
1416 ed = ed_open(nfile,fnm,Flags,cr);
1419 if (PAR(cr) && !((Flags & MD_PARTDEC) ||
1420 EI_TPI(inputrec->eI) ||
1421 inputrec->eI == eiNM))
1423 cr->dd = init_domain_decomposition(fplog,cr,Flags,ddxyz,rdd,rconstr,
1424 dddlb_opt,dlb_scale,
1428 &ddbox,&npme_major,&npme_minor);
1430 make_dd_communicators(fplog,cr,dd_node_order);
1432 /* Set overallocation to avoid frequent reallocation of arrays */
1433 set_over_alloc_dd(TRUE);
1437 /* PME, if used, is done on all nodes with 1D decomposition */
1439 cr->duty = (DUTY_PP | DUTY_PME);
1442 if (!EI_TPI(inputrec->eI))
1444 npme_major = cr->nnodes;
1447 if (inputrec->ePBC == epbcSCREW)
1450 "pbc=%s is only implemented with domain decomposition",
1451 epbc_names[inputrec->ePBC]);
1457 /* After possible communicator splitting in make_dd_communicators.
1458 * we can set up the intra/inter node communication.
1460 gmx_setup_nodecomm(fplog,cr);
1463 /* Initialize per-node process ID and counters. */
1464 gmx_init_intra_counters(cr);
1467 md_print_info(cr,fplog,"Using %d MPI %s\n",
1469 #ifdef GMX_THREAD_MPI
1470 cr->nnodes==1 ? "thread" : "threads"
1472 cr->nnodes==1 ? "process" : "processes"
1477 gmx_omp_nthreads_init(fplog, cr,
1478 hwinfo->nthreads_hw_avail,
1479 hw_opt->nthreads_omp,
1480 hw_opt->nthreads_omp_pme,
1481 (cr->duty & DUTY_PP) == 0,
1482 inputrec->cutoff_scheme == ecutsVERLET);
1484 gmx_check_hw_runconf_consistency(fplog, hwinfo, cr, hw_opt->nthreads_tmpi, minf.bUseGPU);
1486 /* getting number of PP/PME threads
1487 PME: env variable should be read only on one node to make sure it is
1488 identical everywhere;
1490 /* TODO nthreads_pp is only used for pinning threads.
1491 * This is a temporary solution until we have a hw topology library.
1493 nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
1494 nthreads_pme = gmx_omp_nthreads_get(emntPME);
1496 wcycle = wallcycle_init(fplog,resetstep,cr,nthreads_pp,nthreads_pme);
1500 /* Master synchronizes its value of reset_counters with all nodes
1501 * including PME only nodes */
1502 reset_counters = wcycle_get_reset_counters(wcycle);
1503 gmx_bcast_sim(sizeof(reset_counters),&reset_counters,cr);
1504 wcycle_set_reset_counters(wcycle, reset_counters);
1508 if (cr->duty & DUTY_PP)
1510 /* For domain decomposition we allocate dynamically
1511 * in dd_partition_system.
1513 if (DOMAINDECOMP(cr))
1515 bcast_state_setup(cr,state);
1521 bcast_state(cr,state,TRUE);
1525 /* Initiate forcerecord */
1527 fr->hwinfo = hwinfo;
1528 init_forcerec(fplog,oenv,fr,fcd,inputrec,mtop,cr,box,FALSE,
1529 opt2fn("-table",nfile,fnm),
1530 opt2fn("-tabletf",nfile,fnm),
1531 opt2fn("-tablep",nfile,fnm),
1532 opt2fn("-tableb",nfile,fnm),
1536 /* version for PCA_NOT_READ_NODE (see md.c) */
1537 /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
1538 "nofile","nofile","nofile","nofile",FALSE,pforce);
1540 fr->bSepDVDL = ((Flags & MD_SEPPOT) == MD_SEPPOT);
1542 /* Initialize QM-MM */
1545 init_QMMMrec(cr,box,mtop,inputrec,fr);
1548 /* Initialize the mdatoms structure.
1549 * mdatoms is not filled with atom data,
1550 * as this can not be done now with domain decomposition.
1552 mdatoms = init_mdatoms(fplog,mtop,inputrec->efep!=efepNO);
1554 /* Initialize the virtual site communication */
1555 vsite = init_vsite(mtop,cr,FALSE);
1557 calc_shifts(box,fr->shift_vec);
1559 /* With periodic molecules the charge groups should be whole at start up
1560 * and the virtual sites should not be far from their proper positions.
1562 if (!inputrec->bContinuation && MASTER(cr) &&
1563 !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
1565 /* Make molecules whole at start of run */
1566 if (fr->ePBC != epbcNONE)
1568 do_pbc_first_mtop(fplog,inputrec->ePBC,box,mtop,state->x);
1572 /* Correct initial vsite positions are required
1573 * for the initial distribution in the domain decomposition
1574 * and for the initial shell prediction.
1576 construct_vsites_mtop(fplog,vsite,mtop,state->x);
1580 if (EEL_PME(fr->eeltype))
1582 ewaldcoeff = fr->ewaldcoeff;
1583 pmedata = &fr->pmedata;
1592 /* This is a PME only node */
1594 /* We don't need the state */
1597 ewaldcoeff = calc_ewaldcoeff(inputrec->rcoulomb, inputrec->ewald_rtol);
1601 /* Set the CPU affinity */
1602 set_cpu_affinity(fplog,cr,hw_opt,nthreads_pme,hwinfo,inputrec);
1604 /* Initiate PME if necessary,
1605 * either on all nodes or on dedicated PME nodes only. */
1606 if (EEL_PME(inputrec->coulombtype))
1610 nChargePerturbed = mdatoms->nChargePerturbed;
1612 if (cr->npmenodes > 0)
1614 /* The PME only nodes need to know nChargePerturbed */
1615 gmx_bcast_sim(sizeof(nChargePerturbed),&nChargePerturbed,cr);
1618 if (cr->duty & DUTY_PME)
1620 status = gmx_pme_init(pmedata,cr,npme_major,npme_minor,inputrec,
1621 mtop ? mtop->natoms : 0,nChargePerturbed,
1622 (Flags & MD_REPRODUCIBLE),nthreads_pme);
1625 gmx_fatal(FARGS,"Error %d initializing PME",status);
1631 if (integrator[inputrec->eI].func == do_md
1634 integrator[inputrec->eI].func == do_md_openmm
1638 /* Turn on signal handling on all nodes */
1640 * (A user signal from the PME nodes (if any)
1641 * is communicated to the PP nodes.
1643 signal_handler_install();
1646 if (cr->duty & DUTY_PP)
1648 if (inputrec->ePull != epullNO)
1650 /* Initialize pull code */
1651 init_pull(fplog,inputrec,nfile,fnm,mtop,cr,oenv, inputrec->fepvals->init_lambda,
1652 EI_DYNAMICS(inputrec->eI) && MASTER(cr),Flags);
1657 /* Initialize enforced rotation code */
1658 init_rot(fplog,inputrec,nfile,fnm,cr,state->x,box,mtop,oenv,
1662 constr = init_constraints(fplog,mtop,inputrec,ed,state,cr);
1664 if (DOMAINDECOMP(cr))
1666 dd_init_bondeds(fplog,cr->dd,mtop,vsite,constr,inputrec,
1667 Flags & MD_DDBONDCHECK,fr->cginfo_mb);
1669 set_dd_parameters(fplog,cr->dd,dlb_scale,inputrec,fr,&ddbox);
1671 setup_dd_grid(fplog,cr->dd);
1674 /* Now do whatever the user wants us to do (how flexible...) */
1675 integrator[inputrec->eI].func(fplog,cr,nfile,fnm,
1676 oenv,bVerbose,bCompact,
1679 nstepout,inputrec,mtop,
1681 mdatoms,nrnb,wcycle,ed,fr,
1682 repl_ex_nst,repl_ex_nex,repl_ex_seed,
1684 cpt_period,max_hours,
1689 if (inputrec->ePull != epullNO)
1691 finish_pull(fplog,inputrec->pull);
1696 finish_rot(fplog,inputrec->rot);
1703 gmx_pmeonly(*pmedata,cr,nrnb,wcycle,ewaldcoeff,FALSE,inputrec);
1706 if (EI_DYNAMICS(inputrec->eI) || EI_TPI(inputrec->eI))
1708 /* Some timing stats */
1711 if (runtime.proc == 0)
1713 runtime.proc = runtime.real;
1722 wallcycle_stop(wcycle,ewcRUN);
1724 /* Finish up, write some stuff
1725 * if rerunMD, don't write last frame again
1727 finish_run(fplog,cr,ftp2fn(efSTO,nfile,fnm),
1728 inputrec,nrnb,wcycle,&runtime,
1729 fr != NULL && fr->nbv != NULL && fr->nbv->bUseGPU ?
1730 nbnxn_cuda_get_timings(fr->nbv->cu_nbv) : NULL,
1732 EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
1734 if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU)
1736 char gpu_err_str[STRLEN];
1738 /* free GPU memory and uninitialize GPU (by destroying the context) */
1739 nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
1741 if (!free_gpu(gpu_err_str))
1743 gmx_warning("On node %d failed to free GPU #%d: %s",
1744 cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
1748 if (opt2bSet("-membed",nfile,fnm))
1753 #ifdef GMX_THREAD_MPI
1754 if (PAR(cr) && SIMMASTER(cr))
1757 gmx_hardware_info_free(hwinfo);
1760 /* Does what it says */
1761 print_date_and_time(fplog,cr->nodeid,"Finished mdrun",&runtime);
1763 /* Close logfile already here if we were appending to it */
1764 if (MASTER(cr) && (Flags & MD_APPENDFILES))
1766 gmx_log_close(fplog);
1769 rc=(int)gmx_get_stop_condition();
1771 #ifdef GMX_THREAD_MPI
1772 /* we need to join all threads. The sub-threads join when they
1773 exit this function, but the master thread needs to be told to
1775 if (PAR(cr) && MASTER(cr))