From 263a6dcf5c35f55356fc11b9978f9fd6eb9bf1a4 Mon Sep 17 00:00:00 2001 From: Artem Zhmurov Date: Tue, 29 Sep 2020 14:24:21 +0000 Subject: [PATCH] Don't use fah_fsync There is no need for custom fsync functionality in the F@H core. Signed-off-by: Dmitry Moskalchuk --- .../thread_mpi/include/thread_mpi/wait.h | 12 ++++- src/gromacs/fileio/checkpoint.cpp | 27 ++++++----- src/gromacs/mdlib/mdoutf.cpp | 9 ++++ src/gromacs/mdlib/trajectory_writing.cpp | 30 +++--------- src/gromacs/mdrun/md.cpp | 20 +++----- src/gromacs/mdrun/runner.cpp | 35 +++++++++----- src/gromacs/utility/fatalerror.cpp | 15 +++--- src/gromacs/utility/futil.cpp | 46 +++++++------------ src/gromacs/utility/init.cpp | 10 ++-- 9 files changed, 100 insertions(+), 104 deletions(-) diff --git a/src/external/thread_mpi/include/thread_mpi/wait.h b/src/external/thread_mpi/include/thread_mpi/wait.h index 47e2244a9b..ae29422919 100644 --- a/src/external/thread_mpi/include/thread_mpi/wait.h +++ b/src/external/thread_mpi/include/thread_mpi/wait.h @@ -41,7 +41,17 @@ #if TMPI_WAIT_FOR_NO_ONE -#if !(defined( _WIN32 ) || defined( _WIN64 ) ) +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#if GMX_FAHCORE +// This lets F@H throttle CPU usage +#define TMPI_YIELD_WAIT_DATA +#define TMPI_YIELD_WAIT_DATA_INIT(data) +#define TMPI_YIELD_WAIT(data) fcYieldWait() + +#elif !(defined( _WIN32 ) || defined( _WIN64 ) ) #ifdef HAVE_UNISTD_H #include <unistd.h> #endif diff --git a/src/gromacs/fileio/checkpoint.cpp b/src/gromacs/fileio/checkpoint.cpp index 6aaedafbc5..9c6cfe4213 100644 --- a/src/gromacs/fileio/checkpoint.cpp +++ b/src/gromacs/fileio/checkpoint.cpp @@ -90,10 +90,6 @@ #include "gromacs/utility/sysinfo.h" #include "gromacs/utility/txtdump.h" -#if GMX_FAHCORE -# include "corewrap.h" -#endif - #define CPT_MAGIC1 171817 #define CPT_MAGIC2 171819 @@ -2470,14 +2466,21 @@ void write_checkpoint(const char* fn, sfree(fntemp); #if GMX_FAHCORE - /*code for alternate checkpointing scheme. 
moved from top of loop over - steps */ - fcRequestCheckPoint(); - if (fcCheckPointParallel(cr->nodeid, NULL, 0) == 0) - { - gmx_fatal(3, __FILE__, __LINE__, "Checkpoint error on step %d\n", step); - } -#endif /* end GMX_FAHCORE block */ + /* Always FAH checkpoint immediately after a Gromacs checkpoint. + * + * Note that it is critical that we save a FAH checkpoint directly + * after writing a Gromacs checkpoint. If the program dies, either + * by the machine powering off suddenly or the process being + * killed, FAH can recover files that have only appended data by + * truncating them to the last recorded length. The Gromacs + * checkpoint does not just append data, it is fully rewritten each + * time so a crash between moving the new Gromacs checkpoint file into + * place and writing a FAH checkpoint is not recoverable. Thus + * the time between these operations must be kept as short as + * possible. + */ + fcCheckpoint(); +#endif } static void check_int(FILE* fplog, const char* type, int p, int f, gmx_bool* mm) diff --git a/src/gromacs/mdlib/mdoutf.cpp b/src/gromacs/mdlib/mdoutf.cpp index 950587952d..0825e0ab7e 100644 --- a/src/gromacs/mdlib/mdoutf.cpp +++ b/src/gromacs/mdlib/mdoutf.cpp @@ -432,6 +432,15 @@ void mdoutf_write_to_trajectory_files(FILE* fplog, nullptr, nullptr); } } + +#if GMX_FAHCORE + /* Write a FAH checkpoint after writing any other data. We may end up + checkpointing twice but it's fast so it's ok. */ + if ((mdof_flags & ~MDOF_CPT)) + { + fcCheckpoint(); + } +#endif } } diff --git a/src/gromacs/mdlib/trajectory_writing.cpp b/src/gromacs/mdlib/trajectory_writing.cpp index e06b416a0a..3d99d2353d 100644 --- a/src/gromacs/mdlib/trajectory_writing.cpp +++ b/src/gromacs/mdlib/trajectory_writing.cpp @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. 
* - * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by + * Copyright (c) 2013,2014,2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -117,28 +117,6 @@ void do_md_trajectory_writing(FILE* fplog, mdof_flags |= MDOF_LAMBDA_COMPRESSED; } -#if GMX_FAHCORE - if (bLastStep) - { - /* Enforce writing positions and velocities at end of run */ - mdof_flags |= (MDOF_X | MDOF_V); - } - if (MASTER(cr)) - { - fcReportProgress(ir->nsteps, step); - } - -# if defined(__native_client__) - fcCheckin(MASTER(cr)); -# endif - - /* sync bCPT and fc record-keeping */ - if (bCPT && MASTER(cr)) - { - fcRequestCheckPoint(); - } -#endif - if (mdof_flags != 0) { wallcycle_start(mdoutf_get_wcycle(outf), ewcTRAJ); @@ -204,4 +182,10 @@ void do_md_trajectory_writing(FILE* fplog, } wallcycle_stop(mdoutf_get_wcycle(outf), ewcTRAJ); } +#if GMX_FAHCORE + if (MASTER(cr)) + { + fcWriteVisFrame(ir->ePBC, state_global->box, top_global, state_global->x.rvec_array()); + } +#endif } diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp index efd18b09e3..5ca5064229 100644 --- a/src/gromacs/mdrun/md.cpp +++ b/src/gromacs/mdrun/md.cpp @@ -145,10 +145,6 @@ #include "replicaexchange.h" #include "shellfc.h" -#if GMX_FAHCORE -# include "corewrap.h" -#endif - using gmx::SimulationSignaller; void gmx::LegacySimulator::do_md() @@ -668,15 +664,6 @@ void gmx::LegacySimulator::do_md() wallcycle_start(wcycle, ewcRUN); print_start(fplog, cr, walltime_accounting, "mdrun"); -#if GMX_FAHCORE - /* safest point to do file checkpointing is here. 
More general point would be immediately before integrator call */ - int chkpt_ret = fcCheckPointParallel(cr->nodeid, NULL, 0); - if (chkpt_ret == 0) - { - gmx_fatal(3, __FILE__, __LINE__, "Checkpoint error on step %d\n", 0); - } -#endif - /*********************************************************** * * Loop over MD steps @@ -1646,6 +1633,13 @@ void gmx::LegacySimulator::do_md() step++; step_rel++; +#if GMX_FAHCORE + if (MASTER(cr)) + { + fcReportProgress(ir->nsteps + ir->init_step, step); + } +#endif + resetHandler->resetCounters(step, step_rel, mdlog, fplog, cr, fr->nbv.get(), nrnb, fr->pmedata, pme_loadbal, wcycle, walltime_accounting); diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp index 93c934a996..3c9def3eaa 100644 --- a/src/gromacs/mdrun/runner.cpp +++ b/src/gromacs/mdrun/runner.cpp @@ -160,10 +160,6 @@ #include "replicaexchange.h" #include "simulatorbuilder.h" -#if GMX_FAHCORE -# include "corewrap.h" -#endif - namespace gmx { @@ -196,7 +192,7 @@ static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& md #pragma GCC diagnostic ignored "-Wunused-result" devFlags.enableGpuBufferOps = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr) && (GMX_GPU == GMX_GPU_CUDA) && useGpuForNonbonded; - devFlags.forceGpuUpdateDefault = (getenv("GMX_FORCE_UPDATE_DEFAULT_GPU") != nullptr); + devFlags.forceGpuUpdateDefault = (getenv("GMX_FORCE_UPDATE_DEFAULT_GPU") != nullptr) || GMX_FAHCORE; devFlags.enableGpuHaloExchange = (getenv("GMX_GPU_DD_COMMS") != nullptr && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA)); devFlags.enableGpuPmePPComm = @@ -1004,13 +1000,6 @@ int Mdrunner::mdrunner() } } -#if GMX_FAHCORE - if (MASTER(cr)) - { - fcRegisterSteps(inputrec->nsteps, inputrec->init_step); - } -#endif - /* NMR restraints must be initialized before load_checkpoint, * since with time averaging the history is added to t_state. 
* For proper consistency check we therefore need to extend @@ -1027,6 +1016,21 @@ int Mdrunner::mdrunner() auto deform = prepareBoxDeformation(globalState->box, cr, *inputrec); +#if GMX_FAHCORE + /* We have to remember the generation's first step before reading checkpoint. + This way, we can report to the F@H core both the generation's first step + and the restored first step, thus making it able to distinguish between + an interruption/resume and start of the n-th generation simulation. + Having this information, the F@H core can correctly calculate and report + the progress. + */ + int gen_first_step = 0; + if (MASTER(cr)) + { + gen_first_step = inputrec->init_step; + } +#endif + ObservablesHistory observablesHistory = {}; if (startingBehavior != StartingBehavior::NewSimulation) @@ -1057,6 +1061,13 @@ int Mdrunner::mdrunner() } } +#if GMX_FAHCORE + if (MASTER(cr)) + { + fcRegisterSteps(inputrec->nsteps + inputrec->init_step, gen_first_step); + } +#endif + if (mdrunOptions.numStepsCommandline > -2) { GMX_LOG(mdlog.info) diff --git a/src/gromacs/utility/fatalerror.cpp b/src/gromacs/utility/fatalerror.cpp index c8534f791a..e27f64b692 100644 --- a/src/gromacs/utility/fatalerror.cpp +++ b/src/gromacs/utility/fatalerror.cpp @@ -3,7 +3,7 @@ * * Copyright (c) 1991-2000, University of Groningen, The Netherlands. * Copyright (c) 2001-2004, The GROMACS development team. - * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by + * Copyright (c) 2013,2014,2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
@@ -192,13 +192,16 @@ void gmx_exit_on_fatal_error(ExitType exitType, int returnValue) } #endif - if (exitType == ExitType_CleanExit) + if (!GMX_FAHCORE) { - std::exit(returnValue); + if (exitType == ExitType_CleanExit) + { + std::exit(returnValue); + } + // We cannot use std::exit() if other threads may still be executing, since that would cause + // destructors to be called for global objects that may still be in use elsewhere. + std::_Exit(returnValue); } - // We cannot use std::exit() if other threads may still be executing, since that would cause - // destructors to be called for global objects that may still be in use elsewhere. - std::_Exit(returnValue); } void gmx_fatal_mpi_va(int /*f_errno*/, diff --git a/src/gromacs/utility/futil.cpp b/src/gromacs/utility/futil.cpp index 3505365e5f..c16605e90a 100644 --- a/src/gromacs/utility/futil.cpp +++ b/src/gromacs/utility/futil.cpp @@ -3,7 +3,7 @@ * * Copyright (c) 1991-2000, University of Groningen, The Netherlands. * Copyright (c) 2001-2004, The GROMACS development team. - * Copyright (c) 2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by + * Copyright (c) 2013,2014,2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
@@ -160,15 +160,11 @@ static void push_ps(FILE* fp) } #if GMX_FAHCORE -/* don't use pipes!*/ -# define popen fah_fopen -# define pclose fah_fclose -# define SKIP_FFOPS 1 -#else # ifdef gmx_ffclose # undef gmx_ffclose # endif -# if (!HAVE_PIPES && !defined(__native_client__)) +#endif +#if (!HAVE_PIPES && !defined(__native_client__)) static FILE* popen(const char* nm, const char* mode) { gmx_impl("Sorry no pipes..."); @@ -182,14 +178,10 @@ static int pclose(FILE* fp) return 0; } -# endif /* !HAVE_PIPES && !defined(__native_client__) */ -#endif /* GMX_FAHCORE */ +#endif /* !HAVE_PIPES && !defined(__native_client__) */ int gmx_ffclose(FILE* fp) { -#ifdef SKIP_FFOPS - return fclose(fp); -#else t_pstack *ps, *tmp; int ret = 0; @@ -238,7 +230,6 @@ int gmx_ffclose(FILE* fp) } return ret; -#endif } @@ -291,7 +282,7 @@ gmx_off_t gmx_ftell(FILE* stream) int gmx_truncate(const std::string& filename, gmx_off_t length) { -#if GMX_NATIVE_WINDOWS +#if GMX_NATIVE_WINDOWS && !GMX_FAHCORE FILE* fp = fopen(filename.c_str(), "rb+"); if (fp == NULL) { @@ -417,9 +408,6 @@ void make_backup(const std::string& name) FILE* gmx_ffopen(const std::string& file, const char* mode) { -#ifdef SKIP_FFOPS - return fopen(file, mode); -#else FILE* ff = nullptr; gmx_bool bRead; int bs; @@ -494,7 +482,6 @@ FILE* gmx_ffopen(const std::string& file, const char* mode) } } return ff; -#endif } namespace gmx @@ -613,6 +600,10 @@ int gmx_file_rename(const char* oldname, const char* newname) #else if (MoveFileEx(oldname, newname, MOVEFILE_REPLACE_EXISTING | MOVEFILE_WRITE_THROUGH)) { +# if GMX_FAHCORE + /* This just lets the F@H checksumming system know about the rename */ + fcRename(oldname, newname); +# endif return 0; } else @@ -683,33 +674,28 @@ int gmx_fsync(FILE* fp) { int rc = 0; -#if GMX_FAHCORE - /* the fahcore defines its own os-independent fsync */ - rc = fah_fsync(fp); -#else /* GMX_FAHCORE */ { int fn; /* get the file number */ -# if HAVE_FILENO +#if HAVE_FILENO fn = fileno(fp); -# elif 
HAVE__FILENO +#elif HAVE__FILENO fn = _fileno(fp); -# else +#else fn = -1; -# endif +#endif /* do the actual fsync */ if (fn >= 0) { -# if HAVE_FSYNC +#if HAVE_FSYNC rc = fsync(fn); -# elif HAVE__COMMIT +#elif HAVE__COMMIT rc = _commit(fn); -# endif +#endif } } -#endif /* GMX_FAHCORE */ /* We check for these error codes this way because POSIX requires them to be defined, and using anything other than macros is unlikely: */ diff --git a/src/gromacs/utility/init.cpp b/src/gromacs/utility/init.cpp index 07d87b8f2f..b9ee238517 100644 --- a/src/gromacs/utility/init.cpp +++ b/src/gromacs/utility/init.cpp @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2013,2014,2015,2016,2018,2019, by the GROMACS development team, led by + * Copyright (c) 2013,2014,2015,2016,2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -81,10 +81,7 @@ void init(int* argc, char*** argv) // NOLINT(readability-non-const-parameter) } else { -# if GMX_FAHCORE - fah_MPI_Init(argc, argv); -# else -# if GMX_OPENMP +# if GMX_OPENMP /* Formally we need to use MPI_Init_thread and ask for MPI_THREAD_FUNNELED * level of thread support when using OpenMP. However, in practice we * have never seen any problems with just using MPI_Init(), and some MPI @@ -113,9 +110,8 @@ void init(int* argc, char*** argv) // NOLINT(readability-non-const-parameter) "the MPI library. Keep your fingers crossed."); MPI_Init(argc, argv); } -# else +# else MPI_Init(argc, argv); -# endif # endif } // Bump the counter to record this initialization event -- 2.22.0