From: Roland Schulz Date: Wed, 4 Jul 2018 19:11:14 +0000 (-0700) Subject: Merge release-2018 into master X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=c9480da0eb72c2ed2ddc33c93095d138e391f483;p=alexxy%2Fgromacs.git Merge release-2018 into master Changes to fix errors: gmx_editconf.cpp: clang-tidy fix-it mdlib/constr.cpp: -> to . Trivial conflicts: cmake/gmxVersionInfo.cmake docs/CMakeLists.txt src/external/build-fftw/CMakeLists.txt src/gromacs/awh/read-params.cpp src/gromacs/commandline/cmdlineprogramcontext.cpp src/gromacs/gpu_utils/ocl_compiler.cpp Change-Id: I1776d8c9a766e46f7212cb0ca3bb3ecb0f312fa5 --- c9480da0eb72c2ed2ddc33c93095d138e391f483 diff --cc docs/CMakeLists.txt index 49f188703d,c1c1c89d70..cdf259d6af --- a/docs/CMakeLists.txt +++ b/docs/CMakeLists.txt @@@ -311,14 -125,7 +311,15 @@@ if (SPHINX_FOUND fragments/doxygen-links.rst install-guide/index.rst release-notes/index.rst + release-notes/highlights.rst + release-notes/features.rst + release-notes/performance.rst + release-notes/tools.rst + release-notes/bugs-fixed.rst + release-notes/removed-features.rst + release-notes/portability.rst + release-notes/miscellaneous.rst + release-notes/2018/2018.3.rst release-notes/2018/2018.2.rst release-notes/2018/2018.1.rst release-notes/2018/major/highlights.rst diff --cc src/gromacs/awh/read-params.cpp index 3b07f6cb17,2aaa153a01..414fbe8723 --- a/src/gromacs/awh/read-params.cpp +++ b/src/gromacs/awh/read-params.cpp @@@ -511,21 -531,27 +511,27 @@@ AwhParams *readAndCheckAwhParams(std::v warning_error(wi, buf); } - CTYPE("Coordinate sampling interval in number of steps"); + printStringNoNewline(inp, "Coordinate sampling interval in number of steps"); sprintf(opt, "%s-nstsample", prefix); - ITYPE(opt, awhParams->nstSampleCoord, 10); + awhParams->nstSampleCoord = get_eint(inp, opt, 10, wi); - CTYPE("Free energy and bias update interval in number of samples"); + printStringNoNewline(inp, "Free energy and bias update interval in number of samples"); sprintf(opt, "%s-nsamples-update", prefix); - ITYPE(opt, awhParams->numSamplesUpdateFreeEnergy, 10); + awhParams->numSamplesUpdateFreeEnergy = get_eint(inp, opt, 10, wi); + if (awhParams->numSamplesUpdateFreeEnergy <= 0) + { + char buf[STRLEN]; + sprintf(buf, "%s needs to be an integer > 0", opt); + warning_error(wi, buf); + } - CTYPE("When true, biases with share-group>0 are shared between multiple simulations"); + printStringNoNewline(inp, "When true, biases with share-group>0 are shared between multiple simulations"); sprintf(opt, "%s-share-multisim", prefix); - EETYPE(opt, awhParams->shareBiasMultisim, yesno_names); + awhParams->shareBiasMultisim = get_eeenum(inp, opt, yesno_names, wi); - CTYPE("The number of independent AWH biases"); + printStringNoNewline(inp, "The number of independent AWH biases"); sprintf(opt, "%s-nbias", prefix); - ITYPE(opt, awhParams->numBias, 1); + awhParams->numBias = get_eint(inp, opt, 1, wi); if (awhParams->numBias <= 0) { gmx_fatal(FARGS, "%s needs to be an integer > 0", opt); diff --cc src/gromacs/commandline/cmdlineprogramcontext.cpp index d6954c1bb9,70692bbcb0..559a315b5e --- a/src/gromacs/commandline/cmdlineprogramcontext.cpp +++ b/src/gromacs/commandline/cmdlineprogramcontext.cpp @@@ -186,8 -186,12 +186,8 @@@ bool isAcceptableLibraryPath(const std: */ bool isAcceptableLibraryPathPrefix(const std::string &path) { - std::string testPath = Path::join(path, DATA_INSTALL_DIR, "top"); + std::string testPath = Path::join(path, GMX_INSTALL_GMXDATADIR, "top"); - if 
(isAcceptableLibraryPath(testPath)) - { - return true; - } - return false; + return isAcceptableLibraryPath(testPath); } /*! \brief diff --cc src/gromacs/gmxana/gmx_editconf.cpp index 176a334a13,f26bd749cb..d03f922cb1 --- a/src/gromacs/gmxana/gmx_editconf.cpp +++ b/src/gromacs/gmxana/gmx_editconf.cpp @@@ -1311,7 -1311,18 +1311,18 @@@ int gmx_editconf(int argc, char *argv[] atoms.resinfo[atoms.atom[i].resind].chainid = label[0]; } } - write_pdbfile(out, *top_tmp->name, &atoms, x, ePBC, box, ' ', -1, conect, TRUE); + /* Need to bypass the regular write_pdbfile because I don't want to change + * all instances to include the boolean flag for writing out PQR files. + */ + int *index; + snew(index, atoms.nr); + for (int i = 0; i < atoms.nr; i++) + { + index[i] = i; + } + write_pdbfile_indexed(out, *top_tmp->name, &atoms, x, ePBC, box, ' ', -1, atoms.nr, index, conect, - TRUE, outftp == efPQR ? true : false); ++ TRUE, outftp == efPQR); + sfree(index); if (bLegend) { pdb_legend(out, atoms.nr, atoms.nres, &atoms, x); diff --cc src/gromacs/gpu_utils/ocl_compiler.cpp index 3384e20772,957c6bc453..fce784e1de --- a/src/gromacs/gpu_utils/ocl_compiler.cpp +++ b/src/gromacs/gpu_utils/ocl_compiler.cpp @@@ -230,9 -228,9 +230,9 @@@ getSourceRootPath(const std::string &so root path from the path to the binary that is running. */ InstallationPrefixInfo info = getProgramContext().installationPrefix(); std::string dataPathSuffix = (info.bSourceLayout ? - "src/gromacs/mdlib/nbnxn_ocl" : + sourceRelativePath : - OCL_INSTALL_DIR); + GMX_INSTALL_OCLDIR); - kernelRootPath = Path::join(info.path, dataPathSuffix); + sourceRootPath = Path::join(info.path, dataPathSuffix); } else { diff --cc src/gromacs/mdlib/constr.cpp index 8a15aad14c,19bb1d21ed..71bb29eb5e --- a/src/gromacs/mdlib/constr.cpp +++ b/src/gromacs/mdlib/constr.cpp @@@ -669,19 -622,40 +669,41 @@@ Constraints::Impl::apply(boo } else { - t = ir->init_t; + t = ir.init_t; } - set_pbc(&pbc, ir->ePBC, box); - pull_constraint(ir->pull_work, md, &pbc, cr, ir->delta_t, t, x, xprime, v, *vir); + set_pbc(&pbc, ir.ePBC, box); + pull_constraint(ir.pull_work, &md, &pbc, cr, ir.delta_t, t, x, xprime, v, *vir); } - if (constr->ed && delta_step > 0) + if (ed && delta_step > 0) { /* apply the essential dynamics constraints here */ - do_edsam(ir, step, cr, xprime, v, box, constr->ed); + do_edsam(&ir, step, cr, xprime, v, box, ed); } } + wallcycle_stop(wcycle, ewcCONSTR); - if (v != nullptr && md->cFREEZE) ++ if (v != nullptr && md.cFREEZE) + { + /* Set the velocities of frozen dimensions to zero */ + + // cppcheck-suppress unreadVariable + int gmx_unused numThreads = gmx_omp_nthreads_get(emntUpdate); + + #pragma omp parallel for num_threads(numThreads) schedule(static) - for (int i = 0; i < md->homenr; i++) ++ for (int i = 0; i < md.homenr; i++) + { - int freezeGroup = md->cFREEZE[i]; ++ int freezeGroup = md.cFREEZE[i]; + + for (int d = 0; d < DIM; d++) + { - if (ir->opts.nFreeze[freezeGroup][d]) ++ if (ir.opts.nFreeze[freezeGroup][d]) + { + v[i][d] = 0; + } + } + } + } + return bOK; } diff --cc src/gromacs/mdrun/minimize.cpp index b109ac7f18,0000000000..35c7945e7e mode 100644,000000..100644 --- a/src/gromacs/mdrun/minimize.cpp +++ b/src/gromacs/mdrun/minimize.cpp @@@ -1,2925 -1,0 +1,2936 @@@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2004, The GROMACS development team. 
+ * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief This file defines integrators for energy minimization + * + * \author Berk Hess + * \author Erik Lindahl + * \ingroup module_mdrun + */ +#include "gmxpre.h" + +#include "config.h" + +#include +#include +#include + +#include +#include + +#include "gromacs/commandline/filenm.h" +#include "gromacs/domdec/domdec.h" +#include "gromacs/domdec/domdec_struct.h" +#include "gromacs/ewald/pme.h" +#include "gromacs/fileio/confio.h" +#include "gromacs/fileio/mtxio.h" +#include "gromacs/gmxlib/network.h" +#include "gromacs/gmxlib/nrnb.h" +#include "gromacs/imd/imd.h" +#include "gromacs/linearalgebra/sparsematrix.h" +#include "gromacs/listed-forces/manage-threading.h" +#include "gromacs/math/functions.h" +#include "gromacs/math/vec.h" +#include "gromacs/mdlib/constr.h" +#include "gromacs/mdlib/force.h" +#include "gromacs/mdlib/forcerec.h" +#include "gromacs/mdlib/gmx_omp_nthreads.h" +#include "gromacs/mdlib/md_support.h" +#include "gromacs/mdlib/mdatoms.h" +#include "gromacs/mdlib/mdebin.h" +#include "gromacs/mdlib/mdrun.h" +#include "gromacs/mdlib/mdsetup.h" +#include "gromacs/mdlib/ns.h" +#include "gromacs/mdlib/shellfc.h" +#include "gromacs/mdlib/sim_util.h" +#include "gromacs/mdlib/tgroup.h" +#include "gromacs/mdlib/trajectory_writing.h" +#include "gromacs/mdlib/update.h" +#include "gromacs/mdlib/vsite.h" +#include "gromacs/mdtypes/commrec.h" +#include "gromacs/mdtypes/inputrec.h" +#include "gromacs/mdtypes/md_enums.h" +#include "gromacs/mdtypes/state.h" +#include "gromacs/pbcutil/mshift.h" +#include "gromacs/pbcutil/pbc.h" +#include "gromacs/timing/wallcycle.h" +#include "gromacs/timing/walltime_accounting.h" +#include "gromacs/topology/mtop_util.h" +#include "gromacs/topology/topology.h" +#include "gromacs/utility/cstringutil.h" +#include "gromacs/utility/exceptions.h" +#include "gromacs/utility/fatalerror.h" +#include 
"gromacs/utility/logger.h" +#include "gromacs/utility/smalloc.h" + +#include "integrator.h" + +//! Utility structure for manipulating states during EM +typedef struct { + //! Copy of the global state + t_state s; + //! Force array + PaddedRVecVector f; + //! Potential energy + real epot; + //! Norm of the force + real fnorm; + //! Maximum force + real fmax; + //! Direction + int a_fmax; +} em_state_t; + +//! Print the EM starting conditions +static void print_em_start(FILE *fplog, + const t_commrec *cr, + gmx_walltime_accounting_t walltime_accounting, + gmx_wallcycle_t wcycle, + const char *name) +{ + walltime_accounting_start(walltime_accounting); + wallcycle_start(wcycle, ewcRUN); + print_start(fplog, cr, walltime_accounting, name); +} + +//! Stop counting time for EM +static void em_time_end(gmx_walltime_accounting_t walltime_accounting, + gmx_wallcycle_t wcycle) +{ + wallcycle_stop(wcycle, ewcRUN); + + walltime_accounting_end(walltime_accounting); +} + +//! Printing a log file and console header +static void sp_header(FILE *out, const char *minimizer, real ftol, int nsteps) +{ + fprintf(out, "\n"); + fprintf(out, "%s:\n", minimizer); + fprintf(out, " Tolerance (Fmax) = %12.5e\n", ftol); + fprintf(out, " Number of steps = %12d\n", nsteps); +} + +//! Print warning message +static void warn_step(FILE *fp, + real ftol, + real fmax, + gmx_bool bLastStep, + gmx_bool bConstrain) +{ + constexpr bool realIsDouble = GMX_DOUBLE; + char buffer[2048]; + + if (!std::isfinite(fmax)) + { + sprintf(buffer, + "\nEnergy minimization has stopped because the force " + "on at least one atom is not finite. This usually means " + "atoms are overlapping. Modify the input coordinates to " + "remove atom overlap or use soft-core potentials with " + "the free energy code to avoid infinite forces.\n%s", + !realIsDouble ? + "You could also be lucky that switching to double precision " + "is sufficient to obtain finite forces.\n" : + ""); + } + else if (bLastStep) + { + sprintf(buffer, + "\nEnergy minimization reached the maximum number " + "of steps before the forces reached the requested " + "precision Fmax < %g.\n", ftol); + } + else + { + sprintf(buffer, + "\nEnergy minimization has stopped, but the forces have " + "not converged to the requested precision Fmax < %g (which " + "may not be possible for your system). It stopped " + "because the algorithm tried to make a new step whose size " + "was too small, or there was no change in the energy since " + "last step. Either way, we regard the minimization as " + "converged to within the available machine precision, " + "given your starting configuration and EM parameters.\n%s%s", + ftol, + !realIsDouble ? + "\nDouble precision normally gives you higher accuracy, but " + "this is often not needed for preparing to run molecular " + "dynamics.\n" : + "", + bConstrain ? + "You might need to increase your constraint accuracy, or turn\n" + "off constraints altogether (set constraints = none in mdp file)\n" : + ""); + } + + fputs(wrap_lines(buffer, 78, 0, FALSE), stderr); + fputs(wrap_lines(buffer, 78, 0, FALSE), fp); +} + +//! 
Print message about convergence of the EM +static void print_converged(FILE *fp, const char *alg, real ftol, + gmx_int64_t count, gmx_bool bDone, gmx_int64_t nsteps, + const em_state_t *ems, double sqrtNumAtoms) +{ + char buf[STEPSTRSIZE]; + + if (bDone) + { + fprintf(fp, "\n%s converged to Fmax < %g in %s steps\n", + alg, ftol, gmx_step_str(count, buf)); + } + else if (count < nsteps) + { + fprintf(fp, "\n%s converged to machine precision in %s steps,\n" + "but did not reach the requested Fmax < %g.\n", + alg, gmx_step_str(count, buf), ftol); + } + else + { + fprintf(fp, "\n%s did not converge to Fmax < %g in %s steps.\n", + alg, ftol, gmx_step_str(count, buf)); + } + +#if GMX_DOUBLE + fprintf(fp, "Potential Energy = %21.14e\n", ems->epot); + fprintf(fp, "Maximum force = %21.14e on atom %d\n", ems->fmax, ems->a_fmax + 1); + fprintf(fp, "Norm of force = %21.14e\n", ems->fnorm/sqrtNumAtoms); +#else + fprintf(fp, "Potential Energy = %14.7e\n", ems->epot); + fprintf(fp, "Maximum force = %14.7e on atom %d\n", ems->fmax, ems->a_fmax + 1); + fprintf(fp, "Norm of force = %14.7e\n", ems->fnorm/sqrtNumAtoms); +#endif +} + +//! Compute the norm and max of the force array in parallel +static void get_f_norm_max(const t_commrec *cr, + t_grpopts *opts, t_mdatoms *mdatoms, const rvec *f, + real *fnorm, real *fmax, int *a_fmax) +{ + double fnorm2, *sum; + real fmax2, fam; + int la_max, a_max, start, end, i, m, gf; + + /* This routine finds the largest force and returns it. + * On parallel machines the global max is taken. + */ + fnorm2 = 0; + fmax2 = 0; + la_max = -1; + start = 0; + end = mdatoms->homenr; + if (mdatoms->cFREEZE) + { + for (i = start; i < end; i++) + { + gf = mdatoms->cFREEZE[i]; + fam = 0; + for (m = 0; m < DIM; m++) + { + if (!opts->nFreeze[gf][m]) + { + fam += gmx::square(f[i][m]); + } + } + fnorm2 += fam; + if (fam > fmax2) + { + fmax2 = fam; + la_max = i; + } + } + } + else + { + for (i = start; i < end; i++) + { + fam = norm2(f[i]); + fnorm2 += fam; + if (fam > fmax2) + { + fmax2 = fam; + la_max = i; + } + } + } + + if (la_max >= 0 && DOMAINDECOMP(cr)) + { + a_max = cr->dd->globalAtomIndices[la_max]; + } + else + { + a_max = la_max; + } + if (PAR(cr)) + { + snew(sum, 2*cr->nnodes+1); + sum[2*cr->nodeid] = fmax2; + sum[2*cr->nodeid+1] = a_max; + sum[2*cr->nnodes] = fnorm2; + gmx_sumd(2*cr->nnodes+1, sum, cr); + fnorm2 = sum[2*cr->nnodes]; + /* Determine the global maximum */ + for (i = 0; i < cr->nnodes; i++) + { + if (sum[2*i] > fmax2) + { + fmax2 = sum[2*i]; + a_max = (int)(sum[2*i+1] + 0.5); + } + } + sfree(sum); + } + + if (fnorm) + { + *fnorm = sqrt(fnorm2); + } + if (fmax) + { + *fmax = sqrt(fmax2); + } + if (a_fmax) + { + *a_fmax = a_max; + } +} + +//! Compute the norm of the force +static void get_state_f_norm_max(const t_commrec *cr, + t_grpopts *opts, t_mdatoms *mdatoms, + em_state_t *ems) +{ + get_f_norm_max(cr, opts, mdatoms, as_rvec_array(ems->f.data()), + &ems->fnorm, &ems->fmax, &ems->a_fmax); +} + +//! 
Initialize the energy minimization +static void init_em(FILE *fplog, const char *title, + const t_commrec *cr, + const gmx_multisim_t *ms, + gmx::IMDOutputProvider *outputProvider, + t_inputrec *ir, + const MdrunOptions &mdrunOptions, + t_state *state_global, gmx_mtop_t *top_global, + em_state_t *ems, gmx_localtop_t **top, + t_nrnb *nrnb, rvec mu_tot, + t_forcerec *fr, gmx_enerdata_t **enerd, + t_graph **graph, gmx::MDAtoms *mdAtoms, gmx_global_stat_t *gstat, + gmx_vsite_t *vsite, gmx::Constraints *constr, gmx_shellfc_t **shellfc, + int nfile, const t_filenm fnm[], + gmx_mdoutf_t *outf, t_mdebin **mdebin, + gmx_wallcycle_t wcycle) +{ + real dvdl_constr; + + if (fplog) + { + fprintf(fplog, "Initiating %s\n", title); + } + + if (MASTER(cr)) + { + state_global->ngtc = 0; + + /* Initialize lambda variables */ + initialize_lambdas(fplog, ir, &(state_global->fep_state), state_global->lambda, nullptr); + } + + init_nrnb(nrnb); + + /* Interactive molecular dynamics */ + init_IMD(ir, cr, ms, top_global, fplog, 1, + MASTER(cr) ? as_rvec_array(state_global->x.data()) : nullptr, + nfile, fnm, nullptr, mdrunOptions); + + if (ir->eI == eiNM) + { + GMX_ASSERT(shellfc != nullptr, "With NM we always support shells"); + + *shellfc = init_shell_flexcon(stdout, + top_global, + constr ? constr->numFlexibleConstraints() : 0, + ir->nstcalcenergy, + DOMAINDECOMP(cr)); + } + else + { + GMX_ASSERT(EI_ENERGY_MINIMIZATION(ir->eI), "This else currently only handles energy minimizers, consider if your algorithm needs shell/flexible-constraint support"); + + /* With energy minimization, shells and flexible constraints are + * automatically minimized when treated like normal DOFS. + */ + if (shellfc != nullptr) + { + *shellfc = nullptr; + } + } + + auto mdatoms = mdAtoms->mdatoms(); + if (DOMAINDECOMP(cr)) + { + *top = dd_init_local_top(top_global); + + dd_init_local_state(cr->dd, state_global, &ems->s); + + /* Distribute the charge groups over the nodes from the master node */ + dd_partition_system(fplog, ir->init_step, cr, TRUE, 1, + state_global, top_global, ir, + &ems->s, &ems->f, mdAtoms, *top, + fr, vsite, constr, + nrnb, nullptr, FALSE); + dd_store_state(cr->dd, &ems->s); + + *graph = nullptr; + } + else + { + state_change_natoms(state_global, state_global->natoms); + /* Just copy the state */ + ems->s = *state_global; + state_change_natoms(&ems->s, ems->s.natoms); + /* We need to allocate one element extra, since we might use + * (unaligned) 4-wide SIMD loads to access rvec entries. + */ + ems->f.resize(gmx::paddedRVecVectorSize(ems->s.natoms)); + + snew(*top, 1); + mdAlgorithmsSetupAtomData(cr, ir, top_global, *top, fr, + graph, mdAtoms, + constr, vsite, shellfc ? *shellfc : nullptr); + + if (vsite) + { + set_vsite_top(vsite, *top, mdatoms); + } + } + + update_mdatoms(mdAtoms->mdatoms(), ems->s.lambda[efptMASS]); + + if (constr) + { + // TODO how should this cross-module support dependency be managed? 
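+ // The energy minimizers handle regular constraints with LINCS only;
+ // the check below rejects SHAKE with a fatal error when F_CONSTR
+ // interactions are present.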
+ if (ir->eConstrAlg == econtSHAKE && + gmx_mtop_ftype_count(top_global, F_CONSTR) > 0) + { + gmx_fatal(FARGS, "Can not do energy minimization with %s, use %s\n", + econstr_names[econtSHAKE], econstr_names[econtLINCS]); + } + + if (!ir->bContinuation) + { + /* Constrain the starting coordinates */ + dvdl_constr = 0; + constr->apply(TRUE, TRUE, + -1, 0, 1.0, + as_rvec_array(ems->s.x.data()), + as_rvec_array(ems->s.x.data()), + nullptr, + ems->s.box, + ems->s.lambda[efptFEP], &dvdl_constr, + nullptr, nullptr, gmx::ConstraintVariable::Positions); + } + } + + if (PAR(cr)) + { + *gstat = global_stat_init(ir); + } + else + { + *gstat = nullptr; + } + + *outf = init_mdoutf(fplog, nfile, fnm, mdrunOptions, cr, outputProvider, ir, top_global, nullptr, wcycle); + + snew(*enerd, 1); + init_enerdata(top_global->groups.grps[egcENER].nr, ir->fepvals->n_lambda, + *enerd); + + if (mdebin != nullptr) + { + /* Init bin for energy stuff */ + *mdebin = init_mdebin(mdoutf_get_fp_ene(*outf), top_global, ir, nullptr); + } + + clear_rvec(mu_tot); + calc_shifts(ems->s.box, fr->shift_vec); +} + +//! Finalize the minimization +static void finish_em(const t_commrec *cr, gmx_mdoutf_t outf, + gmx_walltime_accounting_t walltime_accounting, + gmx_wallcycle_t wcycle) +{ + if (!thisRankHasDuty(cr, DUTY_PME)) + { + /* Tell the PME only node to finish */ + gmx_pme_send_finish(cr); + } + + done_mdoutf(outf); + + em_time_end(walltime_accounting, wcycle); +} + +//! Swap two different EM states during minimization +static void swap_em_state(em_state_t **ems1, em_state_t **ems2) +{ + em_state_t *tmp; + + tmp = *ems1; + *ems1 = *ems2; + *ems2 = tmp; +} + +//! Save the EM trajectory +static void write_em_traj(FILE *fplog, const t_commrec *cr, + gmx_mdoutf_t outf, + gmx_bool bX, gmx_bool bF, const char *confout, + gmx_mtop_t *top_global, + t_inputrec *ir, gmx_int64_t step, + em_state_t *state, + t_state *state_global, + ObservablesHistory *observablesHistory) +{ + int mdof_flags = 0; + + if (bX) + { + mdof_flags |= MDOF_X; + } + if (bF) + { + mdof_flags |= MDOF_F; + } + + /* If we want IMD output, set appropriate MDOF flag */ + if (ir->bIMD) + { + mdof_flags |= MDOF_IMD; + } + + mdoutf_write_to_trajectory_files(fplog, cr, outf, mdof_flags, + top_global, step, (double)step, + &state->s, state_global, observablesHistory, + state->f); + + if (confout != nullptr && MASTER(cr)) + { + GMX_RELEASE_ASSERT(bX, "The code below assumes that (with domain decomposition), x is collected to state_global in the call above."); + /* With domain decomposition the call above collected the state->s.x + * into state_global->x. Without DD we copy the local state pointer. + */ + if (!DOMAINDECOMP(cr)) + { + state_global = &state->s; + } + + if (ir->ePBC != epbcNONE && !ir->bPeriodicMols && DOMAINDECOMP(cr)) + { + /* Make molecules whole only for confout writing */ + do_pbc_mtop(fplog, ir->ePBC, state->s.box, top_global, + as_rvec_array(state_global->x.data())); + } + + write_sto_conf_mtop(confout, + *top_global->name, top_global, + as_rvec_array(state_global->x.data()), nullptr, ir->ePBC, state->s.box); + } +} + +//! 
\brief Do one minimization step +// +// \returns true when the step succeeded, false when a constraint error occurred +static bool do_em_step(const t_commrec *cr, + t_inputrec *ir, t_mdatoms *md, + em_state_t *ems1, real a, const PaddedRVecVector *force, + em_state_t *ems2, + gmx::Constraints *constr, + gmx_int64_t count) + +{ + t_state *s1, *s2; + int start, end; + real dvdl_constr; + int nthreads gmx_unused; + + bool validStep = true; + + s1 = &ems1->s; + s2 = &ems2->s; + + if (DOMAINDECOMP(cr) && s1->ddp_count != cr->dd->ddp_count) + { + gmx_incons("state mismatch in do_em_step"); + } + + s2->flags = s1->flags; + + if (s2->natoms != s1->natoms) + { + state_change_natoms(s2, s1->natoms); + /* We need to allocate one element extra, since we might use + * (unaligned) 4-wide SIMD loads to access rvec entries. + */ + ems2->f.resize(gmx::paddedRVecVectorSize(s2->natoms)); + } + if (DOMAINDECOMP(cr) && s2->cg_gl.size() != s1->cg_gl.size()) + { + s2->cg_gl.resize(s1->cg_gl.size()); + } + + copy_mat(s1->box, s2->box); + /* Copy free energy state */ + s2->lambda = s1->lambda; + copy_mat(s1->box, s2->box); + + start = 0; + end = md->homenr; + + // cppcheck-suppress unreadVariable + nthreads = gmx_omp_nthreads_get(emntUpdate); +#pragma omp parallel num_threads(nthreads) + { + const rvec *x1 = as_rvec_array(s1->x.data()); + rvec *x2 = as_rvec_array(s2->x.data()); + const rvec *f = as_rvec_array(force->data()); + + int gf = 0; +#pragma omp for schedule(static) nowait + for (int i = start; i < end; i++) + { + try + { + if (md->cFREEZE) + { + gf = md->cFREEZE[i]; + } + for (int m = 0; m < DIM; m++) + { + if (ir->opts.nFreeze[gf][m]) + { + x2[i][m] = x1[i][m]; + } + else + { + x2[i][m] = x1[i][m] + a*f[i][m]; + } + } + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + } + + if (s2->flags & (1<cg_p.data()); + rvec *p2 = as_rvec_array(s2->cg_p.data()); +#pragma omp for schedule(static) nowait + for (int i = start; i < end; i++) + { + // Trivial OpenMP block that does not throw + copy_rvec(p1[i], p2[i]); + } + } + + if (DOMAINDECOMP(cr)) + { + s2->ddp_count = s1->ddp_count; + + /* OpenMP does not supported unsigned loop variables */ +#pragma omp for schedule(static) nowait + for (int i = 0; i < static_cast(s2->cg_gl.size()); i++) + { + s2->cg_gl[i] = s1->cg_gl[i]; + } + s2->ddp_count_cg_gl = s1->ddp_count_cg_gl; + } + } + + if (constr) + { + dvdl_constr = 0; + validStep = + constr->apply(TRUE, TRUE, + count, 0, 1.0, + as_rvec_array(s1->x.data()), as_rvec_array(s2->x.data()), + nullptr, s2->box, + s2->lambda[efptBONDED], &dvdl_constr, + nullptr, nullptr, gmx::ConstraintVariable::Positions); + ++ if (cr->nnodes > 1) ++ { ++ /* This global reduction will affect performance at high ++ * parallelization, but we can not really avoid it. ++ * But usually EM is not run at high parallelization. ++ */ ++ int reductionBuffer = !validStep; ++ gmx_sumi(1, &reductionBuffer, cr); ++ validStep = (reductionBuffer == 0); ++ } ++ + // We should move this check to the different minimizers + if (!validStep && ir->eI != eiSteep) + { + gmx_fatal(FARGS, "The coordinates could not be constrained. Minimizer '%s' can not handle constraint failures, use minimizer '%s' before using '%s'.", + EI(ir->eI), EI(eiSteep), EI(ir->eI)); + } + } + + return validStep; +} + +//! 
Prepare EM for using domain decomposition parallellization +static void em_dd_partition_system(FILE *fplog, int step, const t_commrec *cr, + gmx_mtop_t *top_global, t_inputrec *ir, + em_state_t *ems, gmx_localtop_t *top, + gmx::MDAtoms *mdAtoms, t_forcerec *fr, + gmx_vsite_t *vsite, gmx::Constraints *constr, + t_nrnb *nrnb, gmx_wallcycle_t wcycle) +{ + /* Repartition the domain decomposition */ + dd_partition_system(fplog, step, cr, FALSE, 1, + nullptr, top_global, ir, + &ems->s, &ems->f, + mdAtoms, top, fr, vsite, constr, + nrnb, wcycle, FALSE); + dd_store_state(cr->dd, &ems->s); +} + +namespace +{ + +/*! \brief Class to handle the work of setting and doing an energy evaluation. + * + * This class is a mere aggregate of parameters to pass to evaluate an + * energy, so that future changes to names and types of them consume + * less time when refactoring other code. + * + * Aggregate initialization is used, for which the chief risk is that + * if a member is added at the end and not all initializer lists are + * updated, then the member will be value initialized, which will + * typically mean initialization to zero. + * + * We only want to construct one of these with an initializer list, so + * we explicitly delete the default constructor. */ +class EnergyEvaluator +{ + public: + //! We only intend to construct such objects with an initializer list. +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9) + // Aspects of the C++11 spec changed after GCC 4.8.5, and + // compilation of the initializer list construction in + // runner.cpp fails in GCC 4.8.5. + EnergyEvaluator() = delete; +#endif + /*! \brief Evaluates an energy on the state in \c ems. + * + * \todo In practice, the same objects mu_tot, vir, and pres + * are always passed to this function, so we would rather have + * them as data members. However, their C-array types are + * unsuited for aggregate initialization. When the types + * improve, the call signature of this method can be reduced. + */ + void run(em_state_t *ems, rvec mu_tot, + tensor vir, tensor pres, + gmx_int64_t count, gmx_bool bFirst); + //! Handles logging. + FILE *fplog; + //! Handles communication. + const t_commrec *cr; + //! Coordinates multi-simulations. + const gmx_multisim_t *ms; + //! Holds the simulation topology. + gmx_mtop_t *top_global; + //! Holds the domain topology. + gmx_localtop_t *top; + //! User input options. + t_inputrec *inputrec; + //! Manages flop accounting. + t_nrnb *nrnb; + //! Manages wall cycle accounting. + gmx_wallcycle_t wcycle; + //! Coordinates global reduction. + gmx_global_stat_t gstat; + //! Handles virtual sites. + gmx_vsite_t *vsite; + //! Handles constraints. + gmx::Constraints *constr; + //! Handles strange things. + t_fcdata *fcd; + //! Molecular graph for SHAKE. + t_graph *graph; + //! Per-atom data for this domain. + gmx::MDAtoms *mdAtoms; + //! Handles how to calculate the forces. + t_forcerec *fr; + //! Stores the computed energies. 
+ gmx_enerdata_t *enerd; +}; + +void +EnergyEvaluator::run(em_state_t *ems, rvec mu_tot, + tensor vir, tensor pres, + gmx_int64_t count, gmx_bool bFirst) +{ + real t; + gmx_bool bNS; + tensor force_vir, shake_vir, ekin; + real dvdl_constr, prescorr, enercorr, dvdlcorr; + real terminate = 0; + + /* Set the time to the initial time, the time does not change during EM */ + t = inputrec->init_t; + + if (bFirst || + (DOMAINDECOMP(cr) && ems->s.ddp_count < cr->dd->ddp_count)) + { + /* This is the first state or an old state used before the last ns */ + bNS = TRUE; + } + else + { + bNS = FALSE; + if (inputrec->nstlist > 0) + { + bNS = TRUE; + } + } + + if (vsite) + { + construct_vsites(vsite, as_rvec_array(ems->s.x.data()), 1, nullptr, + top->idef.iparams, top->idef.il, + fr->ePBC, fr->bMolPBC, cr, ems->s.box); + } + + if (DOMAINDECOMP(cr) && bNS) + { + /* Repartition the domain decomposition */ + em_dd_partition_system(fplog, count, cr, top_global, inputrec, + ems, top, mdAtoms, fr, vsite, constr, + nrnb, wcycle); + } + + /* Calc force & energy on new trial position */ + /* do_force always puts the charge groups in the box and shifts again + * We do not unshift, so molecules are always whole in congrad.c + */ + do_force(fplog, cr, ms, inputrec, nullptr, + count, nrnb, wcycle, top, &top_global->groups, + ems->s.box, ems->s.x, &ems->s.hist, + ems->f, force_vir, mdAtoms->mdatoms(), enerd, fcd, + ems->s.lambda, graph, fr, vsite, mu_tot, t, nullptr, + GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES | + GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY | + (bNS ? GMX_FORCE_NS : 0), + DOMAINDECOMP(cr) ? + DdOpenBalanceRegionBeforeForceComputation::yes : + DdOpenBalanceRegionBeforeForceComputation::no, + DOMAINDECOMP(cr) ? + DdCloseBalanceRegionAfterForceComputation::yes : + DdCloseBalanceRegionAfterForceComputation::no); + + /* Clear the unused shake virial and pressure */ + clear_mat(shake_vir); + clear_mat(pres); + + /* Communicate stuff when parallel */ + if (PAR(cr) && inputrec->eI != eiNM) + { + wallcycle_start(wcycle, ewcMoveE); + + global_stat(gstat, cr, enerd, force_vir, shake_vir, mu_tot, + inputrec, nullptr, nullptr, nullptr, 1, &terminate, + nullptr, FALSE, + CGLO_ENERGY | + CGLO_PRESSURE | + CGLO_CONSTRAINT); + + wallcycle_stop(wcycle, ewcMoveE); + } + + /* Calculate long range corrections to pressure and energy */ + calc_dispcorr(inputrec, fr, ems->s.box, ems->s.lambda[efptVDW], + pres, force_vir, &prescorr, &enercorr, &dvdlcorr); + enerd->term[F_DISPCORR] = enercorr; + enerd->term[F_EPOT] += enercorr; + enerd->term[F_PRES] += prescorr; + enerd->term[F_DVDL] += dvdlcorr; + + ems->epot = enerd->term[F_EPOT]; + + if (constr) + { + /* Project out the constraint components of the force */ + dvdl_constr = 0; + rvec *f_rvec = as_rvec_array(ems->f.data()); + constr->apply(FALSE, FALSE, + count, 0, 1.0, + as_rvec_array(ems->s.x.data()), f_rvec, f_rvec, + ems->s.box, + ems->s.lambda[efptBONDED], &dvdl_constr, + nullptr, &shake_vir, gmx::ConstraintVariable::ForceDispl); + enerd->term[F_DVDL_CONSTR] += dvdl_constr; + m_add(force_vir, shake_vir, vir); + } + else + { + copy_mat(force_vir, vir); + } + + clear_mat(ekin); + enerd->term[F_PRES] = + calc_pres(fr->ePBC, inputrec->nwall, ems->s.box, ekin, vir, pres); + + sum_dhdl(enerd, ems->s.lambda, inputrec->fepvals); + + if (EI_ENERGY_MINIMIZATION(inputrec->eI)) + { + get_state_f_norm_max(cr, &(inputrec->opts), mdAtoms->mdatoms(), ems); + } +} + +} // namespace + +//! 
Parallel utility summing energies and forces +static double reorder_partsum(const t_commrec *cr, t_grpopts *opts, t_mdatoms *mdatoms, + gmx_mtop_t *top_global, + em_state_t *s_min, em_state_t *s_b) +{ + t_block *cgs_gl; + int ncg, *cg_gl, *index, c, cg, i, a0, a1, a, gf, m; + double partsum; + unsigned char *grpnrFREEZE; + + if (debug) + { + fprintf(debug, "Doing reorder_partsum\n"); + } + + const rvec *fm = as_rvec_array(s_min->f.data()); + const rvec *fb = as_rvec_array(s_b->f.data()); + + cgs_gl = dd_charge_groups_global(cr->dd); + index = cgs_gl->index; + + /* Collect fm in a global vector fmg. + * This conflicts with the spirit of domain decomposition, + * but to fully optimize this a much more complicated algorithm is required. + */ + rvec *fmg; + snew(fmg, top_global->natoms); + + ncg = s_min->s.cg_gl.size(); + cg_gl = s_min->s.cg_gl.data(); + i = 0; + for (c = 0; c < ncg; c++) + { + cg = cg_gl[c]; + a0 = index[cg]; + a1 = index[cg+1]; + for (a = a0; a < a1; a++) + { + copy_rvec(fm[i], fmg[a]); + i++; + } + } + gmx_sum(top_global->natoms*3, fmg[0], cr); + + /* Now we will determine the part of the sum for the cgs in state s_b */ + ncg = s_b->s.cg_gl.size(); + cg_gl = s_b->s.cg_gl.data(); + partsum = 0; + i = 0; + gf = 0; + grpnrFREEZE = top_global->groups.grpnr[egcFREEZE]; + for (c = 0; c < ncg; c++) + { + cg = cg_gl[c]; + a0 = index[cg]; + a1 = index[cg+1]; + for (a = a0; a < a1; a++) + { + if (mdatoms->cFREEZE && grpnrFREEZE) + { + gf = grpnrFREEZE[i]; + } + for (m = 0; m < DIM; m++) + { + if (!opts->nFreeze[gf][m]) + { + partsum += (fb[i][m] - fmg[a][m])*fb[i][m]; + } + } + i++; + } + } + + sfree(fmg); + + return partsum; +} + +//! Print some stuff, like beta, whatever that means. +static real pr_beta(const t_commrec *cr, t_grpopts *opts, t_mdatoms *mdatoms, + gmx_mtop_t *top_global, + em_state_t *s_min, em_state_t *s_b) +{ + double sum; + + /* This is just the classical Polak-Ribiere calculation of beta; + * it looks a bit complicated since we take freeze groups into account, + * and might have to sum it in parallel runs. + */ + + if (!DOMAINDECOMP(cr) || + (s_min->s.ddp_count == cr->dd->ddp_count && + s_b->s.ddp_count == cr->dd->ddp_count)) + { + const rvec *fm = as_rvec_array(s_min->f.data()); + const rvec *fb = as_rvec_array(s_b->f.data()); + sum = 0; + int gf = 0; + /* This part of code can be incorrect with DD, + * since the atom ordering in s_b and s_min might differ. 
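+ * The sum accumulated below is the Polak-Ribiere numerator,
+ * sum_i (f_b[i] - f_min[i]) . f_b[i] over non-frozen dimensions;
+ * it is divided by |f_min|^2 when this function returns.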
+ */ + for (int i = 0; i < mdatoms->homenr; i++) + { + if (mdatoms->cFREEZE) + { + gf = mdatoms->cFREEZE[i]; + } + for (int m = 0; m < DIM; m++) + { + if (!opts->nFreeze[gf][m]) + { + sum += (fb[i][m] - fm[i][m])*fb[i][m]; + } + } + } + } + else + { + /* We need to reorder cgs while summing */ + sum = reorder_partsum(cr, opts, mdatoms, top_global, s_min, s_b); + } + if (PAR(cr)) + { + gmx_sumd(1, &sum, cr); + } + + return sum/gmx::square(s_min->fnorm); +} + +namespace gmx +{ + +void +Integrator::do_cg() +{ + const char *CG = "Polak-Ribiere Conjugate Gradients"; + + gmx_localtop_t *top; + gmx_enerdata_t *enerd; + gmx_global_stat_t gstat; + t_graph *graph; + double tmp, minstep; + real stepsize; + real a, b, c, beta = 0.0; + real epot_repl = 0; + real pnorm; + t_mdebin *mdebin; + gmx_bool converged, foundlower; + rvec mu_tot; + gmx_bool do_log = FALSE, do_ene = FALSE, do_x, do_f; + tensor vir, pres; + int number_steps, neval = 0, nstcg = inputrec->nstcgsteep; + gmx_mdoutf_t outf; + int m, step, nminstep; + auto mdatoms = mdAtoms->mdatoms(); + + step = 0; + + // Ensure the extra per-atom state array gets allocated + state_global->flags |= (1<nsteps; + + if (MASTER(cr)) + { + sp_header(stderr, CG, inputrec->em_tol, number_steps); + } + if (fplog) + { + sp_header(fplog, CG, inputrec->em_tol, number_steps); + } + + EnergyEvaluator energyEvaluator { + fplog, cr, ms, + top_global, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, + mdAtoms, fr, enerd + }; + /* Call the force routine and some auxiliary (neighboursearching etc.) */ + /* do_force always puts the charge groups in the box and shifts again + * We do not unshift, so molecules are always whole in congrad.c + */ + energyEvaluator.run(s_min, mu_tot, vir, pres, -1, TRUE); + + if (MASTER(cr)) + { + /* Copy stuff to the energy bin for easy printing etc. */ + upd_mdebin(mdebin, FALSE, FALSE, (double)step, + mdatoms->tmass, enerd, &s_min->s, inputrec->fepvals, inputrec->expandedvals, s_min->s.box, + nullptr, nullptr, vir, pres, nullptr, mu_tot, constr); + + print_ebin_header(fplog, step, step); + print_ebin(mdoutf_get_fp_ene(outf), TRUE, FALSE, FALSE, fplog, step, step, eprNORMAL, + mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr); + } + + /* Estimate/guess the initial stepsize */ + stepsize = inputrec->em_stepsize/s_min->fnorm; + + if (MASTER(cr)) + { + double sqrtNumAtoms = sqrt(static_cast(state_global->natoms)); + fprintf(stderr, " F-max = %12.5e on atom %d\n", + s_min->fmax, s_min->a_fmax+1); + fprintf(stderr, " F-Norm = %12.5e\n", + s_min->fnorm/sqrtNumAtoms); + fprintf(stderr, "\n"); + /* and copy to the log file too... */ + fprintf(fplog, " F-max = %12.5e on atom %d\n", + s_min->fmax, s_min->a_fmax+1); + fprintf(fplog, " F-Norm = %12.5e\n", + s_min->fnorm/sqrtNumAtoms); + fprintf(fplog, "\n"); + } + /* Start the loop over CG steps. + * Each successful step is counted, and we continue until + * we either converge or reach the max number of steps. + */ + converged = FALSE; + for (step = 0; (number_steps < 0 || step <= number_steps) && !converged; step++) + { + + /* start taking steps in a new direction + * First time we enter the routine, beta=0, and the direction is + * simply the negative gradient. 
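+ * In the loop below the new direction is p = f + beta*p_old
+ * (f is the force, i.e. the negative gradient), and gpa
+ * accumulates -p.f, the derivative of the energy along p.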
+ */ + + /* Calculate the new direction in p, and the gradient in this direction, gpa */ + rvec *pm = as_rvec_array(s_min->s.cg_p.data()); + const rvec *sfm = as_rvec_array(s_min->f.data()); + double gpa = 0; + int gf = 0; + for (int i = 0; i < mdatoms->homenr; i++) + { + if (mdatoms->cFREEZE) + { + gf = mdatoms->cFREEZE[i]; + } + for (m = 0; m < DIM; m++) + { + if (!inputrec->opts.nFreeze[gf][m]) + { + pm[i][m] = sfm[i][m] + beta*pm[i][m]; + gpa -= pm[i][m]*sfm[i][m]; + /* f is negative gradient, thus the sign */ + } + else + { + pm[i][m] = 0; + } + } + } + + /* Sum the gradient along the line across CPUs */ + if (PAR(cr)) + { + gmx_sumd(1, &gpa, cr); + } + + /* Calculate the norm of the search vector */ + get_f_norm_max(cr, &(inputrec->opts), mdatoms, pm, &pnorm, nullptr, nullptr); + + /* Just in case stepsize reaches zero due to numerical precision... */ + if (stepsize <= 0) + { + stepsize = inputrec->em_stepsize/pnorm; + } + + /* + * Double check the value of the derivative in the search direction. + * If it is positive it must be due to the old information in the + * CG formula, so just remove that and start over with beta=0. + * This corresponds to a steepest descent step. + */ + if (gpa > 0) + { + beta = 0; + step--; /* Don't count this step since we are restarting */ + continue; /* Go back to the beginning of the big for-loop */ + } + + /* Calculate minimum allowed stepsize, before the average (norm) + * relative change in coordinate is smaller than precision + */ + minstep = 0; + for (int i = 0; i < mdatoms->homenr; i++) + { + for (m = 0; m < DIM; m++) + { + tmp = fabs(s_min->s.x[i][m]); + if (tmp < 1.0) + { + tmp = 1.0; + } + tmp = pm[i][m]/tmp; + minstep += tmp*tmp; + } + } + /* Add up from all CPUs */ + if (PAR(cr)) + { + gmx_sumd(1, &minstep, cr); + } + + minstep = GMX_REAL_EPS/sqrt(minstep/(3*state_global->natoms)); + + if (stepsize < minstep) + { + converged = TRUE; + break; + } + + /* Write coordinates if necessary */ + do_x = do_per_step(step, inputrec->nstxout); + do_f = do_per_step(step, inputrec->nstfout); + + write_em_traj(fplog, cr, outf, do_x, do_f, nullptr, + top_global, inputrec, step, + s_min, state_global, observablesHistory); + + /* Take a step downhill. + * In theory, we should minimize the function along this direction. + * That is quite possible, but it turns out to take 5-10 function evaluations + * for each line. However, we dont really need to find the exact minimum - + * it is much better to start a new CG step in a modified direction as soon + * as we are close to it. This will save a lot of energy evaluations. + * + * In practice, we just try to take a single step. + * If it worked (i.e. lowered the energy), we increase the stepsize but + * the continue straight to the next CG step without trying to find any minimum. + * If it didn't work (higher energy), there must be a minimum somewhere between + * the old position and the new one. + * + * Due to the finite numerical accuracy, it turns out that it is a good idea + * to even accept a SMALL increase in energy, if the derivative is still downhill. + * This leads to lower final energies in the tests I've done. 
/ Erik + */ + s_a->epot = s_min->epot; + a = 0.0; + c = a + stepsize; /* reference position along line is zero */ + + if (DOMAINDECOMP(cr) && s_min->s.ddp_count < cr->dd->ddp_count) + { + em_dd_partition_system(fplog, step, cr, top_global, inputrec, + s_min, top, mdAtoms, fr, vsite, constr, + nrnb, wcycle); + } + + /* Take a trial step (new coords in s_c) */ + do_em_step(cr, inputrec, mdatoms, s_min, c, &s_min->s.cg_p, s_c, + constr, -1); + + neval++; + /* Calculate energy for the trial step */ + energyEvaluator.run(s_c, mu_tot, vir, pres, -1, FALSE); + + /* Calc derivative along line */ + const rvec *pc = as_rvec_array(s_c->s.cg_p.data()); + const rvec *sfc = as_rvec_array(s_c->f.data()); + double gpc = 0; + for (int i = 0; i < mdatoms->homenr; i++) + { + for (m = 0; m < DIM; m++) + { + gpc -= pc[i][m]*sfc[i][m]; /* f is negative gradient, thus the sign */ + } + } + /* Sum the gradient along the line across CPUs */ + if (PAR(cr)) + { + gmx_sumd(1, &gpc, cr); + } + + /* This is the max amount of increase in energy we tolerate */ + tmp = std::sqrt(GMX_REAL_EPS)*fabs(s_a->epot); + + /* Accept the step if the energy is lower, or if it is not significantly higher + * and the line derivative is still negative. + */ + if (s_c->epot < s_a->epot || (gpc < 0 && s_c->epot < (s_a->epot + tmp))) + { + foundlower = TRUE; + /* Great, we found a better energy. Increase step for next iteration + * if we are still going down, decrease it otherwise + */ + if (gpc < 0) + { + stepsize *= 1.618034; /* The golden section */ + } + else + { + stepsize *= 0.618034; /* 1/golden section */ + } + } + else + { + /* New energy is the same or higher. We will have to do some work + * to find a smaller value in the interval. Take smaller step next time! + */ + foundlower = FALSE; + stepsize *= 0.618034; + } + + + + + /* OK, if we didn't find a lower value we will have to locate one now - there must + * be one in the interval [a=0,c]. + * The same thing is valid here, though: Don't spend dozens of iterations to find + * the line minimum. We try to interpolate based on the derivative at the endpoints, + * and only continue until we find a lower value. In most cases this means 1-2 iterations. + * + * I also have a safeguard for potentially really pathological functions so we never + * take more than 20 steps before we give up ... + * + * If we already found a lower value we just skip this step and continue to the update. + */ + double gpb; + if (!foundlower) + { + nminstep = 0; + + do + { + /* Select a new trial point. + * If the derivatives at points a & c have different sign we interpolate to zero, + * otherwise just do a bisection. + */ + if (gpa < 0 && gpc > 0) + { + b = a + gpa*(a-c)/(gpc-gpa); + } + else + { + b = 0.5*(a+c); + } + + /* safeguard if interpolation close to machine accuracy causes errors: + * never go outside the interval + */ + if (b <= a || b >= c) + { + b = 0.5*(a+c); + } + + if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count) + { + /* Reload the old state */ + em_dd_partition_system(fplog, -1, cr, top_global, inputrec, + s_min, top, mdAtoms, fr, vsite, constr, + nrnb, wcycle); + } + + /* Take a trial step to this new point - new coords in s_b */ + do_em_step(cr, inputrec, mdatoms, s_min, b, &s_min->s.cg_p, s_b, + constr, -1); + + neval++; + /* Calculate energy for the trial step */ + energyEvaluator.run(s_b, mu_tot, vir, pres, -1, FALSE); + + /* p does not change within a step, but since the domain decomposition + * might change, we have to use cg_p of s_b here. 
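+ * The line derivative gpb computed below is evaluated at the trial
+ * point B; its sign decides whether the interval [A,B] or [B,C]
+ * is kept for the next interpolation/bisection step.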
+ */ + const rvec *pb = as_rvec_array(s_b->s.cg_p.data()); + const rvec *sfb = as_rvec_array(s_b->f.data()); + gpb = 0; + for (int i = 0; i < mdatoms->homenr; i++) + { + for (m = 0; m < DIM; m++) + { + gpb -= pb[i][m]*sfb[i][m]; /* f is negative gradient, thus the sign */ + } + } + /* Sum the gradient along the line across CPUs */ + if (PAR(cr)) + { + gmx_sumd(1, &gpb, cr); + } + + if (debug) + { + fprintf(debug, "CGE: EpotA %f EpotB %f EpotC %f gpb %f\n", + s_a->epot, s_b->epot, s_c->epot, gpb); + } + + epot_repl = s_b->epot; + + /* Keep one of the intervals based on the value of the derivative at the new point */ + if (gpb > 0) + { + /* Replace c endpoint with b */ + swap_em_state(&s_b, &s_c); + c = b; + gpc = gpb; + } + else + { + /* Replace a endpoint with b */ + swap_em_state(&s_b, &s_a); + a = b; + gpa = gpb; + } + + /* + * Stop search as soon as we find a value smaller than the endpoints. + * Never run more than 20 steps, no matter what. + */ + nminstep++; + } + while ((epot_repl > s_a->epot || epot_repl > s_c->epot) && + (nminstep < 20)); + + if (std::fabs(epot_repl - s_min->epot) < fabs(s_min->epot)*GMX_REAL_EPS || + nminstep >= 20) + { + /* OK. We couldn't find a significantly lower energy. + * If beta==0 this was steepest descent, and then we give up. + * If not, set beta=0 and restart with steepest descent before quitting. + */ + if (beta == 0.0) + { + /* Converged */ + converged = TRUE; + break; + } + else + { + /* Reset memory before giving up */ + beta = 0.0; + continue; + } + } + + /* Select min energy state of A & C, put the best in B. + */ + if (s_c->epot < s_a->epot) + { + if (debug) + { + fprintf(debug, "CGE: C (%f) is lower than A (%f), moving C to B\n", + s_c->epot, s_a->epot); + } + swap_em_state(&s_b, &s_c); + gpb = gpc; + } + else + { + if (debug) + { + fprintf(debug, "CGE: A (%f) is lower than C (%f), moving A to B\n", + s_a->epot, s_c->epot); + } + swap_em_state(&s_b, &s_a); + gpb = gpa; + } + + } + else + { + if (debug) + { + fprintf(debug, "CGE: Found a lower energy %f, moving C to B\n", + s_c->epot); + } + swap_em_state(&s_b, &s_c); + gpb = gpc; + } + + /* new search direction */ + /* beta = 0 means forget all memory and restart with steepest descents. */ + if (nstcg && ((step % nstcg) == 0)) + { + beta = 0.0; + } + else + { + /* s_min->fnorm cannot be zero, because then we would have converged + * and broken out. + */ + + /* Polak-Ribiere update. + * Change to fnorm2/fnorm2_old for Fletcher-Reeves + */ + beta = pr_beta(cr, &inputrec->opts, mdatoms, top_global, s_min, s_b); + } + /* Limit beta to prevent oscillations */ + if (fabs(beta) > 5.0) + { + beta = 0.0; + } + + + /* update positions */ + swap_em_state(&s_min, &s_b); + gpa = gpb; + + /* Print it if necessary */ + if (MASTER(cr)) + { + if (mdrunOptions.verbose) + { + double sqrtNumAtoms = sqrt(static_cast(state_global->natoms)); + fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n", + step, s_min->epot, s_min->fnorm/sqrtNumAtoms, + s_min->fmax, s_min->a_fmax+1); + fflush(stderr); + } + /* Store the new (lower) energies */ + upd_mdebin(mdebin, FALSE, FALSE, (double)step, + mdatoms->tmass, enerd, &s_min->s, inputrec->fepvals, inputrec->expandedvals, s_min->s.box, + nullptr, nullptr, vir, pres, nullptr, mu_tot, constr); + + do_log = do_per_step(step, inputrec->nstlog); + do_ene = do_per_step(step, inputrec->nstenergy); + + /* Prepare IMD energy record, if bIMD is TRUE. 
*/ + IMD_fill_energy_record(inputrec->bIMD, inputrec->imd, enerd, step, TRUE); + + if (do_log) + { + print_ebin_header(fplog, step, step); + } + print_ebin(mdoutf_get_fp_ene(outf), do_ene, FALSE, FALSE, + do_log ? fplog : nullptr, step, step, eprNORMAL, + mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr); + } + + /* Send energies and positions to the IMD client if bIMD is TRUE. */ + if (do_IMD(inputrec->bIMD, step, cr, TRUE, state_global->box, as_rvec_array(state_global->x.data()), inputrec, 0, wcycle) && MASTER(cr)) + { + IMD_send_positions(inputrec->imd); + } + + /* Stop when the maximum force lies below tolerance. + * If we have reached machine precision, converged is already set to true. + */ + converged = converged || (s_min->fmax < inputrec->em_tol); + + } /* End of the loop */ + + /* IMD cleanup, if bIMD is TRUE. */ + IMD_finalize(inputrec->bIMD, inputrec->imd); + + if (converged) + { + step--; /* we never took that last step in this case */ + + } + if (s_min->fmax > inputrec->em_tol) + { + if (MASTER(cr)) + { + warn_step(fplog, inputrec->em_tol, s_min->fmax, + step-1 == number_steps, FALSE); + } + converged = FALSE; + } + + if (MASTER(cr)) + { + /* If we printed energy and/or logfile last step (which was the last step) + * we don't have to do it again, but otherwise print the final values. + */ + if (!do_log) + { + /* Write final value to log since we didn't do anything the last step */ + print_ebin_header(fplog, step, step); + } + if (!do_ene || !do_log) + { + /* Write final energy file entries */ + print_ebin(mdoutf_get_fp_ene(outf), !do_ene, FALSE, FALSE, + !do_log ? fplog : nullptr, step, step, eprNORMAL, + mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr); + } + } + + /* Print some stuff... */ + if (MASTER(cr)) + { + fprintf(stderr, "\nwriting lowest energy coordinates.\n"); + } + + /* IMPORTANT! + * For accurate normal mode calculation it is imperative that we + * store the last conformation into the full precision binary trajectory. + * + * However, we should only do it if we did NOT already write this step + * above (which we did if do_x or do_f was true). 
+ */ + do_x = !do_per_step(step, inputrec->nstxout); + do_f = (inputrec->nstfout > 0 && !do_per_step(step, inputrec->nstfout)); + + write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm), + top_global, inputrec, step, + s_min, state_global, observablesHistory); + + + if (MASTER(cr)) + { + double sqrtNumAtoms = sqrt(static_cast(state_global->natoms)); + print_converged(stderr, CG, inputrec->em_tol, step, converged, number_steps, + s_min, sqrtNumAtoms); + print_converged(fplog, CG, inputrec->em_tol, step, converged, number_steps, + s_min, sqrtNumAtoms); + + fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval); + } + + finish_em(cr, outf, walltime_accounting, wcycle); + + /* To print the actual number of steps we needed somewhere */ + walltime_accounting_set_nsteps_done(walltime_accounting, step); +} + + +void +Integrator::do_lbfgs() +{ + static const char *LBFGS = "Low-Memory BFGS Minimizer"; + em_state_t ems; + gmx_localtop_t *top; + gmx_enerdata_t *enerd; + gmx_global_stat_t gstat; + t_graph *graph; + int ncorr, nmaxcorr, point, cp, neval, nminstep; + double stepsize, step_taken, gpa, gpb, gpc, tmp, minstep; + real *rho, *alpha, *p, *s, **dx, **dg; + real a, b, c, maxdelta, delta; + real diag, Epot0; + real dgdx, dgdg, sq, yr, beta; + t_mdebin *mdebin; + gmx_bool converged; + rvec mu_tot; + gmx_bool do_log, do_ene, do_x, do_f, foundlower, *frozen; + tensor vir, pres; + int start, end, number_steps; + gmx_mdoutf_t outf; + int i, k, m, n, gf, step; + int mdof_flags; + auto mdatoms = mdAtoms->mdatoms(); + + if (PAR(cr)) + { + gmx_fatal(FARGS, "Cannot do parallel L-BFGS Minimization - yet.\n"); + } + + if (nullptr != constr) + { + gmx_fatal(FARGS, "The combination of constraints and L-BFGS minimization is not implemented. Either do not use constraints, or use another minimizer (e.g. 
steepest descent)."); + } + + n = 3*state_global->natoms; + nmaxcorr = inputrec->nbfgscorr; + + snew(frozen, n); + + snew(p, n); + snew(rho, nmaxcorr); + snew(alpha, nmaxcorr); + + snew(dx, nmaxcorr); + for (i = 0; i < nmaxcorr; i++) + { + snew(dx[i], n); + } + + snew(dg, nmaxcorr); + for (i = 0; i < nmaxcorr; i++) + { + snew(dg[i], n); + } + + step = 0; + neval = 0; + + /* Init em */ + init_em(fplog, LBFGS, cr, ms, outputProvider, inputrec, mdrunOptions, + state_global, top_global, &ems, &top, + nrnb, mu_tot, fr, &enerd, &graph, mdAtoms, &gstat, + vsite, constr, nullptr, + nfile, fnm, &outf, &mdebin, wcycle); + + start = 0; + end = mdatoms->homenr; + + /* We need 4 working states */ + em_state_t s0 {}, s1 {}, s2 {}, s3 {}; + em_state_t *sa = &s0; + em_state_t *sb = &s1; + em_state_t *sc = &s2; + em_state_t *last = &s3; + /* Initialize by copying the state from ems (we could skip x and f here) */ + *sa = ems; + *sb = ems; + *sc = ems; + + /* Print to log file */ + print_em_start(fplog, cr, walltime_accounting, wcycle, LBFGS); + + do_log = do_ene = do_x = do_f = TRUE; + + /* Max number of steps */ + number_steps = inputrec->nsteps; + + /* Create a 3*natoms index to tell whether each degree of freedom is frozen */ + gf = 0; + for (i = start; i < end; i++) + { + if (mdatoms->cFREEZE) + { + gf = mdatoms->cFREEZE[i]; + } + for (m = 0; m < DIM; m++) + { + frozen[3*i+m] = inputrec->opts.nFreeze[gf][m]; + } + } + if (MASTER(cr)) + { + sp_header(stderr, LBFGS, inputrec->em_tol, number_steps); + } + if (fplog) + { + sp_header(fplog, LBFGS, inputrec->em_tol, number_steps); + } + + if (vsite) + { + construct_vsites(vsite, as_rvec_array(state_global->x.data()), 1, nullptr, + top->idef.iparams, top->idef.il, + fr->ePBC, fr->bMolPBC, cr, state_global->box); + } + + /* Call the force routine and some auxiliary (neighboursearching etc.) */ + /* do_force always puts the charge groups in the box and shifts again + * We do not unshift, so molecules are always whole + */ + neval++; + EnergyEvaluator energyEvaluator { + fplog, cr, ms, + top_global, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, + mdAtoms, fr, enerd + }; + energyEvaluator.run(&ems, mu_tot, vir, pres, -1, TRUE); + + if (MASTER(cr)) + { + /* Copy stuff to the energy bin for easy printing etc. */ + upd_mdebin(mdebin, FALSE, FALSE, (double)step, + mdatoms->tmass, enerd, state_global, inputrec->fepvals, inputrec->expandedvals, state_global->box, + nullptr, nullptr, vir, pres, nullptr, mu_tot, constr); + + print_ebin_header(fplog, step, step); + print_ebin(mdoutf_get_fp_ene(outf), TRUE, FALSE, FALSE, fplog, step, step, eprNORMAL, + mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr); + } + + /* Set the initial step. + * since it will be multiplied by the non-normalized search direction + * vector (force vector the first time), we scale it by the + * norm of the force. + */ + + if (MASTER(cr)) + { + double sqrtNumAtoms = sqrt(static_cast(state_global->natoms)); + fprintf(stderr, "Using %d BFGS correction steps.\n\n", nmaxcorr); + fprintf(stderr, " F-max = %12.5e on atom %d\n", ems.fmax, ems.a_fmax + 1); + fprintf(stderr, " F-Norm = %12.5e\n", ems.fnorm/sqrtNumAtoms); + fprintf(stderr, "\n"); + /* and copy to the log file too... 
*/ + fprintf(fplog, "Using %d BFGS correction steps.\n\n", nmaxcorr); + fprintf(fplog, " F-max = %12.5e on atom %d\n", ems.fmax, ems.a_fmax + 1); + fprintf(fplog, " F-Norm = %12.5e\n", ems.fnorm/sqrtNumAtoms); + fprintf(fplog, "\n"); + } + + // Point is an index to the memory of search directions, where 0 is the first one. + point = 0; + + // Set initial search direction to the force (-gradient), or 0 for frozen particles. + real *fInit = static_cast(as_rvec_array(ems.f.data())[0]); + for (i = 0; i < n; i++) + { + if (!frozen[i]) + { + dx[point][i] = fInit[i]; /* Initial search direction */ + } + else + { + dx[point][i] = 0; + } + } + + // Stepsize will be modified during the search, and actually it is not critical + // (the main efficiency in the algorithm comes from changing directions), but + // we still need an initial value, so estimate it as the inverse of the norm + // so we take small steps where the potential fluctuates a lot. + stepsize = 1.0/ems.fnorm; + + /* Start the loop over BFGS steps. + * Each successful step is counted, and we continue until + * we either converge or reach the max number of steps. + */ + + ncorr = 0; + + /* Set the gradient from the force */ + converged = FALSE; + for (step = 0; (number_steps < 0 || step <= number_steps) && !converged; step++) + { + + /* Write coordinates if necessary */ + do_x = do_per_step(step, inputrec->nstxout); + do_f = do_per_step(step, inputrec->nstfout); + + mdof_flags = 0; + if (do_x) + { + mdof_flags |= MDOF_X; + } + + if (do_f) + { + mdof_flags |= MDOF_F; + } + + if (inputrec->bIMD) + { + mdof_flags |= MDOF_IMD; + } + + mdoutf_write_to_trajectory_files(fplog, cr, outf, mdof_flags, + top_global, step, (real)step, &ems.s, state_global, observablesHistory, ems.f); + + /* Do the linesearching in the direction dx[point][0..(n-1)] */ + + /* make s a pointer to current search direction - point=0 first time we get here */ + s = dx[point]; + + real *xx = static_cast(as_rvec_array(ems.s.x.data())[0]); + real *ff = static_cast(as_rvec_array(ems.f.data())[0]); + + // calculate line gradient in position A + for (gpa = 0, i = 0; i < n; i++) + { + gpa -= s[i]*ff[i]; + } + + /* Calculate minimum allowed stepsize along the line, before the average (norm) + * relative change in coordinate is smaller than precision + */ + for (minstep = 0, i = 0; i < n; i++) + { + tmp = fabs(xx[i]); + if (tmp < 1.0) + { + tmp = 1.0; + } + tmp = s[i]/tmp; + minstep += tmp*tmp; + } + minstep = GMX_REAL_EPS/sqrt(minstep/n); + + if (stepsize < minstep) + { + converged = TRUE; + break; + } + + // Before taking any steps along the line, store the old position + *last = ems; + real *lastx = static_cast(as_rvec_array(last->s.x.data())[0]); + real *lastf = static_cast(as_rvec_array(last->f.data())[0]); + Epot0 = ems.epot; + + *sa = ems; + + /* Take a step downhill. + * In theory, we should find the actual minimum of the function in this + * direction, somewhere along the line. + * That is quite possible, but it turns out to take 5-10 function evaluations + * for each line. However, we dont really need to find the exact minimum - + * it is much better to start a new BFGS step in a modified direction as soon + * as we are close to it. This will save a lot of energy evaluations. + * + * In practice, we just try to take a single step. + * If it worked (i.e. lowered the energy), we increase the stepsize but + * continue straight to the next BFGS step without trying to find any minimum, + * i.e. we change the search direction too. 
If the line was smooth, it is + * likely we are in a smooth region, and then it makes sense to take longer + * steps in the modified search direction too. + * + * If it didn't work (higher energy), there must be a minimum somewhere between + * the old position and the new one. Then we need to start by finding a lower + * value before we change search direction. Since the energy was apparently + * quite rough, we need to decrease the step size. + * + * Due to the finite numerical accuracy, it turns out that it is a good idea + * to accept a SMALL increase in energy, if the derivative is still downhill. + * This leads to lower final energies in the tests I've done. / Erik + */ + + // State "A" is the first position along the line. + // reference position along line is initially zero + a = 0.0; + + // Check stepsize first. We do not allow displacements + // larger than emstep. + // + do + { + // Pick a new position C by adding stepsize to A. + c = a + stepsize; + + // Calculate what the largest change in any individual coordinate + // would be (translation along line * gradient along line) + maxdelta = 0; + for (i = 0; i < n; i++) + { + delta = c*s[i]; + if (delta > maxdelta) + { + maxdelta = delta; + } + } + // If any displacement is larger than the stepsize limit, reduce the step + if (maxdelta > inputrec->em_stepsize) + { + stepsize *= 0.1; + } + } + while (maxdelta > inputrec->em_stepsize); + + // Take a trial step and move the coordinate array xc[] to position C + real *xc = static_cast(as_rvec_array(sc->s.x.data())[0]); + for (i = 0; i < n; i++) + { + xc[i] = lastx[i] + c*s[i]; + } + + neval++; + // Calculate energy for the trial step in position C + energyEvaluator.run(sc, mu_tot, vir, pres, step, FALSE); + + // Calc line gradient in position C + real *fc = static_cast(as_rvec_array(sc->f.data())[0]); + for (gpc = 0, i = 0; i < n; i++) + { + gpc -= s[i]*fc[i]; /* f is negative gradient, thus the sign */ + } + /* Sum the gradient along the line across CPUs */ + if (PAR(cr)) + { + gmx_sumd(1, &gpc, cr); + } + + // This is the max amount of increase in energy we tolerate. + // By allowing VERY small changes (close to numerical precision) we + // frequently find even better (lower) final energies. + tmp = std::sqrt(GMX_REAL_EPS)*fabs(sa->epot); + + // Accept the step if the energy is lower in the new position C (compared to A), + // or if it is not significantly higher and the line derivative is still negative. + if (sc->epot < sa->epot || (gpc < 0 && sc->epot < (sa->epot + tmp))) + { + // Great, we found a better energy. We no longer try to alter the + // stepsize, but simply accept this new better position. The we select a new + // search direction instead, which will be much more efficient than continuing + // to take smaller steps along a line. Set fnorm based on the new C position, + // which will be used to update the stepsize to 1/fnorm further down. + foundlower = TRUE; + } + else + { + // If we got here, the energy is NOT lower in point C, i.e. it will be the same + // or higher than in point A. In this case it is pointless to move to point C, + // so we will have to do more iterations along the same line to find a smaller + // value in the interval [A=0.0,C]. + // Here, A is still 0.0, but that will change when we do a search in the interval + // [0.0,C] below. That search we will do by interpolation or bisection rather + // than with the stepsize, so no need to modify it. For the next search direction + // it will be reset to 1/fnorm anyway. 
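+            // (Viewed as a one-dimensional problem in the step length t along s, with
+            //  phi(t) = E(x_last + t*s), gpa and gpc are the derivatives of phi at the
+            //  two endpoints. When they bracket a sign change, the search below picks
+            //  the zero of the straight line through those two derivative values,
+            //  b = a + gpa*(a-c)/(gpc-gpa), and falls back to bisection otherwise.)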
+ foundlower = FALSE; + } + + if (!foundlower) + { + // OK, if we didn't find a lower value we will have to locate one now - there must + // be one in the interval [a,c]. + // The same thing is valid here, though: Don't spend dozens of iterations to find + // the line minimum. We try to interpolate based on the derivative at the endpoints, + // and only continue until we find a lower value. In most cases this means 1-2 iterations. + // I also have a safeguard for potentially really pathological functions so we never + // take more than 20 steps before we give up. + // If we already found a lower value we just skip this step and continue to the update. + real fnorm = 0; + nminstep = 0; + do + { + // Select a new trial point B in the interval [A,C]. + // If the derivatives at points a & c have different sign we interpolate to zero, + // otherwise just do a bisection since there might be multiple minima/maxima + // inside the interval. + if (gpa < 0 && gpc > 0) + { + b = a + gpa*(a-c)/(gpc-gpa); + } + else + { + b = 0.5*(a+c); + } + + /* safeguard if interpolation close to machine accuracy causes errors: + * never go outside the interval + */ + if (b <= a || b >= c) + { + b = 0.5*(a+c); + } + + // Take a trial step to point B + real *xb = static_cast(as_rvec_array(sb->s.x.data())[0]); + for (i = 0; i < n; i++) + { + xb[i] = lastx[i] + b*s[i]; + } + + neval++; + // Calculate energy for the trial step in point B + energyEvaluator.run(sb, mu_tot, vir, pres, step, FALSE); + fnorm = sb->fnorm; + + // Calculate gradient in point B + real *fb = static_cast(as_rvec_array(sb->f.data())[0]); + for (gpb = 0, i = 0; i < n; i++) + { + gpb -= s[i]*fb[i]; /* f is negative gradient, thus the sign */ + + } + /* Sum the gradient along the line across CPUs */ + if (PAR(cr)) + { + gmx_sumd(1, &gpb, cr); + } + + // Keep one of the intervals [A,B] or [B,C] based on the value of the derivative + // at the new point B, and rename the endpoints of this new interval A and C. + if (gpb > 0) + { + /* Replace c endpoint with b */ + c = b; + /* swap states b and c */ + swap_em_state(&sb, &sc); + } + else + { + /* Replace a endpoint with b */ + a = b; + /* swap states a and b */ + swap_em_state(&sa, &sb); + } + + /* + * Stop search as soon as we find a value smaller than the endpoints, + * or if the tolerance is below machine precision. + * Never run more than 20 steps, no matter what. + */ + nminstep++; + } + while ((sb->epot > sa->epot || sb->epot > sc->epot) && (nminstep < 20)); + + if (std::fabs(sb->epot - Epot0) < GMX_REAL_EPS || nminstep >= 20) + { + /* OK. We couldn't find a significantly lower energy. + * If ncorr==0 this was steepest descent, and then we give up. + * If not, reset memory to restart as steepest descent before quitting. 
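+             * (Resetting simply means ncorr = 0 and taking the next search direction
+             * straight from the force, i.e. one plain steepest-descent step.)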
+ */ + if (ncorr == 0) + { + /* Converged */ + converged = TRUE; + break; + } + else + { + /* Reset memory */ + ncorr = 0; + /* Search in gradient direction */ + for (i = 0; i < n; i++) + { + dx[point][i] = ff[i]; + } + /* Reset stepsize */ + stepsize = 1.0/fnorm; + continue; + } + } + + /* Select min energy state of A & C, put the best in xx/ff/Epot + */ + if (sc->epot < sa->epot) + { + /* Use state C */ + ems = *sc; + step_taken = c; + } + else + { + /* Use state A */ + ems = *sa; + step_taken = a; + } + + } + else + { + /* found lower */ + /* Use state C */ + ems = *sc; + step_taken = c; + } + + /* Update the memory information, and calculate a new + * approximation of the inverse hessian + */ + + /* Have new data in Epot, xx, ff */ + if (ncorr < nmaxcorr) + { + ncorr++; + } + + for (i = 0; i < n; i++) + { + dg[point][i] = lastf[i]-ff[i]; + dx[point][i] *= step_taken; + } + + dgdg = 0; + dgdx = 0; + for (i = 0; i < n; i++) + { + dgdg += dg[point][i]*dg[point][i]; + dgdx += dg[point][i]*dx[point][i]; + } + + diag = dgdx/dgdg; + + rho[point] = 1.0/dgdx; + point++; + + if (point >= nmaxcorr) + { + point = 0; + } + + /* Update */ + for (i = 0; i < n; i++) + { + p[i] = ff[i]; + } + + cp = point; + + /* Recursive update. First go back over the memory points */ + for (k = 0; k < ncorr; k++) + { + cp--; + if (cp < 0) + { + cp = ncorr-1; + } + + sq = 0; + for (i = 0; i < n; i++) + { + sq += dx[cp][i]*p[i]; + } + + alpha[cp] = rho[cp]*sq; + + for (i = 0; i < n; i++) + { + p[i] -= alpha[cp]*dg[cp][i]; + } + } + + for (i = 0; i < n; i++) + { + p[i] *= diag; + } + + /* And then go forward again */ + for (k = 0; k < ncorr; k++) + { + yr = 0; + for (i = 0; i < n; i++) + { + yr += p[i]*dg[cp][i]; + } + + beta = rho[cp]*yr; + beta = alpha[cp]-beta; + + for (i = 0; i < n; i++) + { + p[i] += beta*dx[cp][i]; + } + + cp++; + if (cp >= ncorr) + { + cp = 0; + } + } + + for (i = 0; i < n; i++) + { + if (!frozen[i]) + { + dx[point][i] = p[i]; + } + else + { + dx[point][i] = 0; + } + } + + /* Print it if necessary */ + if (MASTER(cr)) + { + if (mdrunOptions.verbose) + { + double sqrtNumAtoms = sqrt(static_cast(state_global->natoms)); + fprintf(stderr, "\rStep %d, Epot=%12.6e, Fnorm=%9.3e, Fmax=%9.3e (atom %d)\n", + step, ems.epot, ems.fnorm/sqrtNumAtoms, ems.fmax, ems.a_fmax + 1); + fflush(stderr); + } + /* Store the new (lower) energies */ + upd_mdebin(mdebin, FALSE, FALSE, (double)step, + mdatoms->tmass, enerd, state_global, inputrec->fepvals, inputrec->expandedvals, state_global->box, + nullptr, nullptr, vir, pres, nullptr, mu_tot, constr); + do_log = do_per_step(step, inputrec->nstlog); + do_ene = do_per_step(step, inputrec->nstenergy); + if (do_log) + { + print_ebin_header(fplog, step, step); + } + print_ebin(mdoutf_get_fp_ene(outf), do_ene, FALSE, FALSE, + do_log ? fplog : nullptr, step, step, eprNORMAL, + mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr); + } + + /* Send x and E to IMD client, if bIMD is TRUE. */ + if (do_IMD(inputrec->bIMD, step, cr, TRUE, state_global->box, as_rvec_array(state_global->x.data()), inputrec, 0, wcycle) && MASTER(cr)) + { + IMD_send_positions(inputrec->imd); + } + + // Reset stepsize in we are doing more iterations + stepsize = 1.0/ems.fnorm; + + /* Stop when the maximum force lies below tolerance. + * If we have reached machine precision, converged is already set to true. + */ + converged = converged || (ems.fmax < inputrec->em_tol); + + } /* End of the loop */ + + /* IMD cleanup, if bIMD is TRUE. 
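The "recursive update" in the loop above is the standard L-BFGS two-loop recursion: it applies an approximate inverse Hessian built only from the stored position differences dx[] and gradient differences dg[], without ever forming a matrix. A stand-alone sketch of the same computation (plain std::vector storage and placeholder names, illustrative only and not part of the patch; mdrun feeds in the force, i.e. the negative gradient, so the result is already the downhill search direction):

#include <cstddef>
#include <vector>

// q is the vector the approximate inverse Hessian is applied to (here: the force),
// s[k] are stored position differences, y[k] stored gradient differences,
// rho[k] = 1/(y[k] . s[k]), and diag scales the initial inverse-Hessian guess.
std::vector<double> lbfgsApplyInverseHessian(std::vector<double>                     q,
                                             const std::vector<std::vector<double>> &s,
                                             const std::vector<std::vector<double>> &y,
                                             const std::vector<double>              &rho,
                                             double                                  diag)
{
    const std::size_t   m = s.size();
    const std::size_t   n = q.size();
    std::vector<double> alpha(m);

    // First pass: newest to oldest correction pair
    for (std::size_t k = m; k-- > 0; )
    {
        double sq = 0;
        for (std::size_t i = 0; i < n; i++) { sq += s[k][i]*q[i]; }
        alpha[k] = rho[k]*sq;
        for (std::size_t i = 0; i < n; i++) { q[i] -= alpha[k]*y[k][i]; }
    }
    // Scale with the diagonal estimate of the initial inverse Hessian
    for (std::size_t i = 0; i < n; i++) { q[i] *= diag; }
    // Second pass: oldest to newest correction pair
    for (std::size_t k = 0; k < m; k++)
    {
        double yr = 0;
        for (std::size_t i = 0; i < n; i++) { yr += y[k][i]*q[i]; }
        const double beta = alpha[k] - rho[k]*yr;
        for (std::size_t i = 0; i < n; i++) { q[i] += beta*s[k][i]; }
    }
    return q;
}

In the patch the same two passes run over a circular buffer of at most inputrec->nbfgscorr stored pairs, indexed by cp.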
*/ + IMD_finalize(inputrec->bIMD, inputrec->imd); + + if (converged) + { + step--; /* we never took that last step in this case */ + + } + if (ems.fmax > inputrec->em_tol) + { + if (MASTER(cr)) + { + warn_step(fplog, inputrec->em_tol, ems.fmax, + step-1 == number_steps, FALSE); + } + converged = FALSE; + } + + /* If we printed energy and/or logfile last step (which was the last step) + * we don't have to do it again, but otherwise print the final values. + */ + if (!do_log) /* Write final value to log since we didn't do anythin last step */ + { + print_ebin_header(fplog, step, step); + } + if (!do_ene || !do_log) /* Write final energy file entries */ + { + print_ebin(mdoutf_get_fp_ene(outf), !do_ene, FALSE, FALSE, + !do_log ? fplog : nullptr, step, step, eprNORMAL, + mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr); + } + + /* Print some stuff... */ + if (MASTER(cr)) + { + fprintf(stderr, "\nwriting lowest energy coordinates.\n"); + } + + /* IMPORTANT! + * For accurate normal mode calculation it is imperative that we + * store the last conformation into the full precision binary trajectory. + * + * However, we should only do it if we did NOT already write this step + * above (which we did if do_x or do_f was true). + */ + do_x = !do_per_step(step, inputrec->nstxout); + do_f = !do_per_step(step, inputrec->nstfout); + write_em_traj(fplog, cr, outf, do_x, do_f, ftp2fn(efSTO, nfile, fnm), + top_global, inputrec, step, + &ems, state_global, observablesHistory); + + if (MASTER(cr)) + { + double sqrtNumAtoms = sqrt(static_cast(state_global->natoms)); + print_converged(stderr, LBFGS, inputrec->em_tol, step, converged, + number_steps, &ems, sqrtNumAtoms); + print_converged(fplog, LBFGS, inputrec->em_tol, step, converged, + number_steps, &ems, sqrtNumAtoms); + + fprintf(fplog, "\nPerformed %d energy evaluations in total.\n", neval); + } + + finish_em(cr, outf, walltime_accounting, wcycle); + + /* To print the actual number of steps we needed somewhere */ + walltime_accounting_set_nsteps_done(walltime_accounting, step); +} + +void +Integrator::do_steep() +{ + const char *SD = "Steepest Descents"; + gmx_localtop_t *top; + gmx_enerdata_t *enerd; + gmx_global_stat_t gstat; + t_graph *graph; + real stepsize; + real ustep; + gmx_mdoutf_t outf; + t_mdebin *mdebin; + gmx_bool bDone, bAbort, do_x, do_f; + tensor vir, pres; + rvec mu_tot; + int nsteps; + int count = 0; + int steps_accepted = 0; + auto mdatoms = mdAtoms->mdatoms(); + + /* Create 2 states on the stack and extract pointers that we will swap */ + em_state_t s0 {}, s1 {}; + em_state_t *s_min = &s0; + em_state_t *s_try = &s1; + + /* Init em and store the local state in s_try */ + init_em(fplog, SD, cr, ms, outputProvider, inputrec, mdrunOptions, + state_global, top_global, s_try, &top, + nrnb, mu_tot, fr, &enerd, &graph, mdAtoms, &gstat, + vsite, constr, nullptr, + nfile, fnm, &outf, &mdebin, wcycle); + + /* Print to log file */ + print_em_start(fplog, cr, walltime_accounting, wcycle, SD); + + /* Set variables for stepsize (in nm). This is the largest + * step that we are going to make in any direction. 
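As an aside, the accept/reject policy of the loop below is easy to state on its own: every trial step moves each atom by at most ustep along the force, an energy-lowering trial is accepted and ustep grows by a factor 1.2, a rejected trial is discarded and ustep is halved. A toy one-dimensional version of just that policy (illustrative only, not part of the patch; E(x) = (x-1)^2 stands in for the real potential and 1e-6 for em_tol):

#include <cmath>
#include <cstdio>

int main()
{
    auto         energy = [](double x) { return (x - 1.0)*(x - 1.0); };
    auto         force  = [](double x) { return -2.0*(x - 1.0); };   // f = -dE/dx
    double       x      = 5.0;                                       // best point so far
    double       ustep  = 0.01;                                      // maximum displacement
    const double emTol  = 1e-6;

    for (int count = 0; count < 10000 && std::fabs(force(x)) > emTol; count++)
    {
        const double f    = force(x);
        const double xTry = x + ustep*(f > 0 ? 1.0 : -1.0);          // move by at most ustep
        if (energy(xTry) < energy(x))
        {
            x      = xTry;      // accept and keep the lower-energy state
            ustep *= 1.2;       // be a little bolder after a success
        }
        else
        {
            ustep *= 0.5;       // retry from the same point with a smaller step
        }
    }
    std::printf("minimum near x = %g\n", x);
    return 0;
}

The real loop additionally restores the previously stored state's domain decomposition after a rejected step when DD is in use, and aborts once ustep shrinks below the machine-precision limit checked further down.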
+ */ + ustep = inputrec->em_stepsize; + stepsize = 0; + + /* Max number of steps */ + nsteps = inputrec->nsteps; + + if (MASTER(cr)) + { + /* Print to the screen */ + sp_header(stderr, SD, inputrec->em_tol, nsteps); + } + if (fplog) + { + sp_header(fplog, SD, inputrec->em_tol, nsteps); + } + EnergyEvaluator energyEvaluator { + fplog, cr, ms, + top_global, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, + mdAtoms, fr, enerd + }; + + /**** HERE STARTS THE LOOP **** + * count is the counter for the number of steps + * bDone will be TRUE when the minimization has converged + * bAbort will be TRUE when nsteps steps have been performed or when + * the stepsize becomes smaller than is reasonable for machine precision + */ + count = 0; + bDone = FALSE; + bAbort = FALSE; + while (!bDone && !bAbort) + { + bAbort = (nsteps >= 0) && (count == nsteps); + + /* set new coordinates, except for first step */ + bool validStep = true; + if (count > 0) + { + validStep = + do_em_step(cr, inputrec, mdatoms, + s_min, stepsize, &s_min->f, s_try, + constr, count); + } + + if (validStep) + { + energyEvaluator.run(s_try, mu_tot, vir, pres, count, count == 0); + } + else + { + // Signal constraint error during stepping with energy=inf + s_try->epot = std::numeric_limits::infinity(); + } + + if (MASTER(cr)) + { + print_ebin_header(fplog, count, count); + } + + if (count == 0) + { + s_min->epot = s_try->epot; + } + + /* Print it if necessary */ + if (MASTER(cr)) + { + if (mdrunOptions.verbose) + { + fprintf(stderr, "Step=%5d, Dmax= %6.1e nm, Epot= %12.5e Fmax= %11.5e, atom= %d%c", + count, ustep, s_try->epot, s_try->fmax, s_try->a_fmax+1, + ( (count == 0) || (s_try->epot < s_min->epot) ) ? '\n' : '\r'); + fflush(stderr); + } + + if ( (count == 0) || (s_try->epot < s_min->epot) ) + { + /* Store the new (lower) energies */ + upd_mdebin(mdebin, FALSE, FALSE, (double)count, + mdatoms->tmass, enerd, &s_try->s, inputrec->fepvals, inputrec->expandedvals, + s_try->s.box, nullptr, nullptr, vir, pres, nullptr, mu_tot, constr); + + /* Prepare IMD energy record, if bIMD is TRUE. */ + IMD_fill_energy_record(inputrec->bIMD, inputrec->imd, enerd, count, TRUE); + + print_ebin(mdoutf_get_fp_ene(outf), TRUE, + do_per_step(steps_accepted, inputrec->nstdisreout), + do_per_step(steps_accepted, inputrec->nstorireout), + fplog, count, count, eprNORMAL, + mdebin, fcd, &(top_global->groups), &(inputrec->opts), nullptr); + fflush(fplog); + } + } + + /* Now if the new energy is smaller than the previous... + * or if this is the first step! + * or if we did random steps! + */ + + if ( (count == 0) || (s_try->epot < s_min->epot) ) + { + steps_accepted++; + + /* Test whether the convergence criterion is met... */ + bDone = (s_try->fmax < inputrec->em_tol); + + /* Copy the arrays for force, positions and energy */ + /* The 'Min' array always holds the coords and forces of the minimal + sampled energy */ + swap_em_state(&s_min, &s_try); + if (count > 0) + { + ustep *= 1.2; + } + + /* Write to trn, if necessary */ + do_x = do_per_step(steps_accepted, inputrec->nstxout); + do_f = do_per_step(steps_accepted, inputrec->nstfout); + write_em_traj(fplog, cr, outf, do_x, do_f, nullptr, + top_global, inputrec, count, + s_min, state_global, observablesHistory); + } + else + { + /* If energy is not smaller make the step smaller... 
*/ + ustep *= 0.5; + + if (DOMAINDECOMP(cr) && s_min->s.ddp_count != cr->dd->ddp_count) + { + /* Reload the old state */ + em_dd_partition_system(fplog, count, cr, top_global, inputrec, + s_min, top, mdAtoms, fr, vsite, constr, + nrnb, wcycle); + } + } + + /* Determine new step */ + stepsize = ustep/s_min->fmax; + + /* Check if stepsize is too small, with 1 nm as a characteristic length */ +#if GMX_DOUBLE + if (count == nsteps || ustep < 1e-12) +#else + if (count == nsteps || ustep < 1e-6) +#endif + { + if (MASTER(cr)) + { + warn_step(fplog, inputrec->em_tol, s_min->fmax, + count == nsteps, constr != nullptr); + } + bAbort = TRUE; + } + + /* Send IMD energies and positions, if bIMD is TRUE. */ + if (do_IMD(inputrec->bIMD, count, cr, TRUE, state_global->box, + MASTER(cr) ? as_rvec_array(state_global->x.data()) : nullptr, + inputrec, 0, wcycle) && + MASTER(cr)) + { + IMD_send_positions(inputrec->imd); + } + + count++; + } /* End of the loop */ + + /* IMD cleanup, if bIMD is TRUE. */ + IMD_finalize(inputrec->bIMD, inputrec->imd); + + /* Print some data... */ + if (MASTER(cr)) + { + fprintf(stderr, "\nwriting lowest energy coordinates.\n"); + } + write_em_traj(fplog, cr, outf, TRUE, inputrec->nstfout, ftp2fn(efSTO, nfile, fnm), + top_global, inputrec, count, + s_min, state_global, observablesHistory); + + if (MASTER(cr)) + { + double sqrtNumAtoms = sqrt(static_cast(state_global->natoms)); + + print_converged(stderr, SD, inputrec->em_tol, count, bDone, nsteps, + s_min, sqrtNumAtoms); + print_converged(fplog, SD, inputrec->em_tol, count, bDone, nsteps, + s_min, sqrtNumAtoms); + } + + finish_em(cr, outf, walltime_accounting, wcycle); + + /* To print the actual number of steps we needed somewhere */ + inputrec->nsteps = count; + + walltime_accounting_set_nsteps_done(walltime_accounting, count); +} + +void +Integrator::do_nm() +{ + const char *NM = "Normal Mode Analysis"; + gmx_mdoutf_t outf; + int nnodes, node; + gmx_localtop_t *top; + gmx_enerdata_t *enerd; + gmx_global_stat_t gstat; + t_graph *graph; + tensor vir, pres; + rvec mu_tot; + rvec *fneg, *dfdx; + gmx_bool bSparse; /* use sparse matrix storage format */ + size_t sz; + gmx_sparsematrix_t * sparse_matrix = nullptr; + real * full_matrix = nullptr; + + /* added with respect to mdrun */ + int row, col; + real der_range = 10.0*std::sqrt(GMX_REAL_EPS); + real x_min; + bool bIsMaster = MASTER(cr); + auto mdatoms = mdAtoms->mdatoms(); + + if (constr != nullptr) + { + gmx_fatal(FARGS, "Constraints present with Normal Mode Analysis, this combination is not supported"); + } + + gmx_shellfc_t *shellfc; + + em_state_t state_work {}; + + /* Init em and store the local state in state_minimum */ + init_em(fplog, NM, cr, ms, outputProvider, inputrec, mdrunOptions, + state_global, top_global, &state_work, &top, + nrnb, mu_tot, fr, &enerd, &graph, mdAtoms, &gstat, + vsite, constr, &shellfc, + nfile, fnm, &outf, nullptr, wcycle); + + std::vector atom_index = get_atom_index(top_global); + snew(fneg, atom_index.size()); + snew(dfdx, atom_index.size()); + +#if !GMX_DOUBLE + if (bIsMaster) + { + fprintf(stderr, + "NOTE: This version of GROMACS has been compiled in single precision,\n" + " which MIGHT not be accurate enough for normal mode analysis.\n" + " GROMACS now uses sparse matrix storage, so the memory requirements\n" + " are fairly modest even if you recompile in double precision.\n\n"); + } +#endif + + /* Check if we can/should use sparse storage format. 
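+     * The full-matrix branch below allocates sz*sz = (3*N)^2 reals, so for
+     * large N only the sparse format is affordable in memory.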
+ * + * Sparse format is only useful when the Hessian itself is sparse, which it + * will be when we use a cutoff. + * For small systems (n<1000) it is easier to always use full matrix format, though. + */ + if (EEL_FULL(fr->ic->eeltype) || fr->rlist == 0.0) + { + GMX_LOG(mdlog.warning).appendText("Non-cutoff electrostatics used, forcing full Hessian format."); + bSparse = FALSE; + } + else if (atom_index.size() < 1000) + { + GMX_LOG(mdlog.warning).appendTextFormatted("Small system size (N=%d), using full Hessian format.", + atom_index.size()); + bSparse = FALSE; + } + else + { + GMX_LOG(mdlog.warning).appendText("Using compressed symmetric sparse Hessian format."); + bSparse = TRUE; + } + + /* Number of dimensions, based on real atoms, that is not vsites or shell */ + sz = DIM*atom_index.size(); + + fprintf(stderr, "Allocating Hessian memory...\n\n"); + + if (bSparse) + { + sparse_matrix = gmx_sparsematrix_init(sz); + sparse_matrix->compressed_symmetric = TRUE; + } + else + { + snew(full_matrix, sz*sz); + } + + init_nrnb(nrnb); + + + /* Write start time and temperature */ + print_em_start(fplog, cr, walltime_accounting, wcycle, NM); + + /* fudge nr of steps to nr of atoms */ + inputrec->nsteps = atom_index.size()*2; + + if (bIsMaster) + { + fprintf(stderr, "starting normal mode calculation '%s'\n%d steps.\n\n", + *(top_global->name), (int)inputrec->nsteps); + } + + nnodes = cr->nnodes; + + /* Make evaluate_energy do a single node force calculation */ + cr->nnodes = 1; + EnergyEvaluator energyEvaluator { + fplog, cr, ms, + top_global, top, + inputrec, nrnb, wcycle, gstat, + vsite, constr, fcd, graph, + mdAtoms, fr, enerd + }; + energyEvaluator.run(&state_work, mu_tot, vir, pres, -1, TRUE); + cr->nnodes = nnodes; + + /* if forces are not small, warn user */ + get_state_f_norm_max(cr, &(inputrec->opts), mdatoms, &state_work); + + GMX_LOG(mdlog.warning).appendTextFormatted("Maximum force:%12.5e", state_work.fmax); + if (state_work.fmax > 1.0e-3) + { + GMX_LOG(mdlog.warning).appendText( + "The force is probably not small enough to " + "ensure that you are at a minimum.\n" + "Be aware that negative eigenvalues may occur\n" + "when the resulting matrix is diagonalized."); + } + + /*********************************************************** + * + * Loop over all pairs in matrix + * + * do_force called twice. 
Once with positive and + * once with negative displacement + * + ************************************************************/ + + /* Steps are divided one by one over the nodes */ + bool bNS = true; + for (unsigned int aid = cr->nodeid; aid < atom_index.size(); aid += nnodes) + { + size_t atom = atom_index[aid]; + for (size_t d = 0; d < DIM; d++) + { + gmx_int64_t step = 0; + int force_flags = GMX_FORCE_STATECHANGED | GMX_FORCE_ALLFORCES; + double t = 0; + + x_min = state_work.s.x[atom][d]; + + for (unsigned int dx = 0; (dx < 2); dx++) + { + if (dx == 0) + { + state_work.s.x[atom][d] = x_min - der_range; + } + else + { + state_work.s.x[atom][d] = x_min + der_range; + } + + /* Make evaluate_energy do a single node force calculation */ + cr->nnodes = 1; + if (shellfc) + { + /* Now is the time to relax the shells */ + relax_shell_flexcon(fplog, + cr, + ms, + mdrunOptions.verbose, + step, + inputrec, + bNS, + force_flags, + top, + constr, + enerd, + fcd, + &state_work.s, + state_work.f, + vir, + mdatoms, + nrnb, + wcycle, + graph, + &top_global->groups, + shellfc, + fr, + t, + mu_tot, + vsite, + DdOpenBalanceRegionBeforeForceComputation::no, + DdCloseBalanceRegionAfterForceComputation::no); + bNS = false; + step++; + } + else + { + energyEvaluator.run(&state_work, mu_tot, vir, pres, atom*2+dx, FALSE); + } + + cr->nnodes = nnodes; + + if (dx == 0) + { + for (size_t i = 0; i < atom_index.size(); i++) + { + copy_rvec(state_work.f[atom_index[i]], fneg[i]); + } + } + } + + /* x is restored to original */ + state_work.s.x[atom][d] = x_min; + + for (size_t j = 0; j < atom_index.size(); j++) + { + for (size_t k = 0; (k < DIM); k++) + { + dfdx[j][k] = + -(state_work.f[atom_index[j]][k] - fneg[j][k])/(2*der_range); + } + } + + if (!bIsMaster) + { +#if GMX_MPI +#define mpi_type GMX_MPI_REAL + MPI_Send(dfdx[0], atom_index.size()*DIM, mpi_type, MASTER(cr), + cr->nodeid, cr->mpi_comm_mygroup); +#endif + } + else + { + for (node = 0; (node < nnodes && atom+node < atom_index.size()); node++) + { + if (node > 0) + { +#if GMX_MPI + MPI_Status stat; + MPI_Recv(dfdx[0], atom_index.size()*DIM, mpi_type, node, node, + cr->mpi_comm_mygroup, &stat); +#undef mpi_type +#endif + } + + row = (atom + node)*DIM + d; + + for (size_t j = 0; j < atom_index.size(); j++) + { + for (size_t k = 0; k < DIM; k++) + { + col = j*DIM + k; + + if (bSparse) + { + if (col >= row && dfdx[j][k] != 0.0) + { + gmx_sparsematrix_increment_value(sparse_matrix, + row, col, dfdx[j][k]); + } + } + else + { + full_matrix[row*sz+col] = dfdx[j][k]; + } + } + } + } + } + + if (mdrunOptions.verbose && fplog) + { + fflush(fplog); + } + } + /* write progress */ + if (bIsMaster && mdrunOptions.verbose) + { + fprintf(stderr, "\rFinished step %d out of %d", + static_cast(std::min(atom+nnodes, atom_index.size())), + static_cast(atom_index.size())); + fflush(stderr); + } + } + + if (bIsMaster) + { + fprintf(stderr, "\n\nWriting Hessian...\n"); + gmx_mtxio_write(ftp2fn(efMTX, nfile, fnm), sz, sz, full_matrix, sparse_matrix); + } + + finish_em(cr, outf, walltime_accounting, wcycle); + + walltime_accounting_set_nsteps_done(walltime_accounting, atom_index.size()*2); +} + +} // namespace gmx diff --cc src/gromacs/mdrun/runner.cpp index 95f3632b28,0000000000..15d4f14df6 mode 100644,000000..100644 --- a/src/gromacs/mdrun/runner.cpp +++ b/src/gromacs/mdrun/runner.cpp @@@ -1,1390 -1,0 +1,1390 @@@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. 
+ * Copyright (c) 2001-2004, The GROMACS development team. + * Copyright (c) 2011,2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief Implements the MD runner routine calling all integrators. 
+ * + * \author David van der Spoel + * \ingroup module_mdrun + */ +#include "gmxpre.h" + +#include "runner.h" + +#include "config.h" + +#include +#include +#include +#include + +#include + +#include "gromacs/commandline/filenm.h" +#include "gromacs/domdec/domdec.h" +#include "gromacs/domdec/domdec_struct.h" +#include "gromacs/ewald/ewald-utils.h" +#include "gromacs/ewald/pme.h" +#include "gromacs/ewald/pme-gpu-program.h" +#include "gromacs/fileio/checkpoint.h" +#include "gromacs/fileio/oenv.h" +#include "gromacs/fileio/tpxio.h" +#include "gromacs/gmxlib/network.h" +#include "gromacs/gmxlib/nrnb.h" +#include "gromacs/gpu_utils/clfftinitializer.h" +#include "gromacs/gpu_utils/gpu_utils.h" +#include "gromacs/hardware/cpuinfo.h" +#include "gromacs/hardware/detecthardware.h" +#include "gromacs/hardware/printhardware.h" +#include "gromacs/listed-forces/disre.h" +#include "gromacs/listed-forces/orires.h" +#include "gromacs/math/functions.h" +#include "gromacs/math/utilities.h" +#include "gromacs/math/vec.h" +#include "gromacs/mdlib/boxdeformation.h" +#include "gromacs/mdlib/calc_verletbuf.h" +#include "gromacs/mdlib/forcerec.h" +#include "gromacs/mdlib/gmx_omp_nthreads.h" +#include "gromacs/mdlib/main.h" +#include "gromacs/mdlib/makeconstraints.h" +#include "gromacs/mdlib/md_support.h" +#include "gromacs/mdlib/mdatoms.h" +#include "gromacs/mdlib/mdrun.h" +#include "gromacs/mdlib/membed.h" +#include "gromacs/mdlib/nb_verlet.h" +#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h" +#include "gromacs/mdlib/nbnxn_search.h" +#include "gromacs/mdlib/nbnxn_tuning.h" +#include "gromacs/mdlib/qmmm.h" +#include "gromacs/mdlib/repl_ex.h" +#include "gromacs/mdlib/sighandler.h" +#include "gromacs/mdlib/sim_util.h" +#include "gromacs/mdrunutility/mdmodules.h" +#include "gromacs/mdrunutility/threadaffinity.h" +#include "gromacs/mdtypes/commrec.h" +#include "gromacs/mdtypes/fcdata.h" +#include "gromacs/mdtypes/inputrec.h" +#include "gromacs/mdtypes/md_enums.h" +#include "gromacs/mdtypes/observableshistory.h" +#include "gromacs/mdtypes/state.h" +#include "gromacs/pbcutil/pbc.h" +#include "gromacs/pulling/pull.h" +#include "gromacs/pulling/pull_rotation.h" +#include "gromacs/taskassignment/decidegpuusage.h" +#include "gromacs/taskassignment/resourcedivision.h" +#include "gromacs/taskassignment/taskassignment.h" +#include "gromacs/taskassignment/usergpuids.h" +#include "gromacs/timing/wallcycle.h" +#include "gromacs/topology/mtop_util.h" +#include "gromacs/trajectory/trajectoryframe.h" +#include "gromacs/utility/basenetwork.h" +#include "gromacs/utility/cstringutil.h" +#include "gromacs/utility/exceptions.h" +#include "gromacs/utility/fatalerror.h" +#include "gromacs/utility/filestream.h" +#include "gromacs/utility/gmxassert.h" +#include "gromacs/utility/gmxmpi.h" +#include "gromacs/utility/logger.h" +#include "gromacs/utility/loggerbuilder.h" +#include "gromacs/utility/physicalnodecommunicator.h" +#include "gromacs/utility/pleasecite.h" +#include "gromacs/utility/programcontext.h" +#include "gromacs/utility/smalloc.h" +#include "gromacs/utility/stringutil.h" + +#include "integrator.h" + +#ifdef GMX_FAHCORE +#include "corewrap.h" +#endif + +namespace gmx +{ + +/*! \brief Barrier for safe simultaneous thread access to mdrunner data + * + * Used to ensure that the master thread does not modify mdrunner during copy + * on the spawned threads. 
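+ *
+ * With real MPI instead of thread-MPI there is no in-process copy to protect,
+ * and the barrier below compiles to a no-op.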
*/ +static void threadMpiMdrunnerAccessBarrier() +{ +#if GMX_THREAD_MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif +} + +void Mdrunner::reinitializeOnSpawnedThread() +{ + threadMpiMdrunnerAccessBarrier(); + + cr = reinitialize_commrec_for_this_thread(cr); + + GMX_RELEASE_ASSERT(!MASTER(cr), "reinitializeOnSpawnedThread should only be called on spawned threads"); + + // Only the master rank writes to the log file + fplog = nullptr; +} + +/*! \brief The callback used for running on spawned threads. + * + * Obtains the pointer to the master mdrunner object from the one + * argument permitted to the thread-launch API call, copies it to make + * a new runner for this thread, reinitializes necessary data, and + * proceeds to the simulation. */ +static void mdrunner_start_fn(const void *arg) +{ + try + { + auto masterMdrunner = reinterpret_cast(arg); + /* copy the arg list to make sure that it's thread-local. This + doesn't copy pointed-to items, of course; fnm, cr and fplog + are reset in the call below, all others should be const. */ + gmx::Mdrunner mdrunner = *masterMdrunner; + mdrunner.reinitializeOnSpawnedThread(); + mdrunner.mdrunner(); + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; +} + + +/*! \brief Start thread-MPI threads. + * + * Called by mdrunner() to start a specific number of threads + * (including the main thread) for thread-parallel runs. This in turn + * calls mdrunner() for each thread. All options are the same as for + * mdrunner(). */ +t_commrec *Mdrunner::spawnThreads(int numThreadsToLaunch) const +{ + + /* first check whether we even need to start tMPI */ + if (numThreadsToLaunch < 2) + { + return cr; + } + +#if GMX_THREAD_MPI + /* now spawn new threads that start mdrunner_start_fn(), while + the main thread returns, we set thread affinity later */ + if (tMPI_Init_fn(TRUE, numThreadsToLaunch, TMPI_AFFINITY_NONE, + mdrunner_start_fn, static_cast(this)) != TMPI_SUCCESS) + { + GMX_THROW(gmx::InternalError("Failed to spawn thread-MPI threads")); + } + + threadMpiMdrunnerAccessBarrier(); +#else + GMX_UNUSED_VALUE(mdrunner_start_fn); +#endif + + return reinitialize_commrec_for_this_thread(cr); +} + +} // namespace + +/*! \brief Initialize variables for Verlet scheme simulation */ +static void prepare_verlet_scheme(FILE *fplog, + t_commrec *cr, + t_inputrec *ir, + int nstlist_cmdline, + const gmx_mtop_t *mtop, + const matrix box, + bool makeGpuPairList, + const gmx::CpuInfo &cpuinfo) +{ + /* For NVE simulations, we will retain the initial list buffer */ + if (EI_DYNAMICS(ir->eI) && + ir->verletbuf_tol > 0 && + !(EI_MD(ir->eI) && ir->etc == etcNO)) + { + /* Update the Verlet buffer size for the current run setup */ + + /* Here we assume SIMD-enabled kernels are being used. But as currently + * calc_verlet_buffer_size gives the same results for 4x8 and 4x4 + * and 4x2 gives a larger buffer than 4x4, this is ok. + */ + ListSetupType listType = (makeGpuPairList ? 
ListSetupType::Gpu : ListSetupType::CpuSimdWhenSupported); + VerletbufListSetup listSetup = verletbufGetSafeListSetup(listType); + + real rlist_new; + calc_verlet_buffer_size(mtop, det(box), ir, ir->nstlist, ir->nstlist - 1, -1, &listSetup, nullptr, &rlist_new); + + if (rlist_new != ir->rlist) + { + if (fplog != nullptr) + { + fprintf(fplog, "\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n", + ir->rlist, rlist_new, + listSetup.cluster_size_i, listSetup.cluster_size_j); + } + ir->rlist = rlist_new; + } + } + + if (nstlist_cmdline > 0 && (!EI_DYNAMICS(ir->eI) || ir->verletbuf_tol <= 0)) + { + gmx_fatal(FARGS, "Can not set nstlist without %s", + !EI_DYNAMICS(ir->eI) ? "dynamics" : "verlet-buffer-tolerance"); + } + + if (EI_DYNAMICS(ir->eI)) + { + /* Set or try nstlist values */ + increaseNstlist(fplog, cr, ir, nstlist_cmdline, mtop, box, makeGpuPairList, cpuinfo); + } +} + +/*! \brief Override the nslist value in inputrec + * + * with value passed on the command line (if any) + */ +static void override_nsteps_cmdline(const gmx::MDLogger &mdlog, + gmx_int64_t nsteps_cmdline, + t_inputrec *ir) +{ + assert(ir); + + /* override with anything else than the default -2 */ + if (nsteps_cmdline > -2) + { + char sbuf_steps[STEPSTRSIZE]; + char sbuf_msg[STRLEN]; + + ir->nsteps = nsteps_cmdline; + if (EI_DYNAMICS(ir->eI) && nsteps_cmdline != -1) + { + sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps, %.3g ps", + gmx_step_str(nsteps_cmdline, sbuf_steps), + fabs(nsteps_cmdline*ir->delta_t)); + } + else + { + sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps", + gmx_step_str(nsteps_cmdline, sbuf_steps)); + } + + GMX_LOG(mdlog.warning).asParagraph().appendText(sbuf_msg); + } + else if (nsteps_cmdline < -2) + { + gmx_fatal(FARGS, "Invalid nsteps value passed on the command line: %d", + nsteps_cmdline); + } + /* Do nothing if nsteps_cmdline == -2 */ +} + +namespace gmx +{ + +/*! \brief Return whether GPU acceleration of nonbondeds is supported with the given settings. + * + * If not, and if a warning may be issued, logs a warning about + * falling back to CPU code. With thread-MPI, only the first + * call to this function should have \c issueWarning true. */ +static bool gpuAccelerationOfNonbondedIsUseful(const MDLogger &mdlog, + const t_inputrec *ir, + bool issueWarning) +{ + if (ir->opts.ngener - ir->nwall > 1) + { + /* The GPU code does not support more than one energy group. + * If the user requested GPUs explicitly, a fatal error is given later. + */ + if (issueWarning) + { + GMX_LOG(mdlog.warning).asParagraph() + .appendText("Multiple energy groups is not implemented for GPUs, falling back to the CPU. " + "For better performance, run on the GPU without energy groups and then do " + "gmx mdrun -rerun option on the trajectory with an energy group .tpr file."); + } + return false; + } + return true; +} + +//! Initializes the logger for mdrun. +static gmx::LoggerOwner buildLogger(FILE *fplog, const t_commrec *cr) +{ + gmx::LoggerBuilder builder; + if (fplog != nullptr) + { + builder.addTargetFile(gmx::MDLogger::LogLevel::Info, fplog); + } + if (cr == nullptr || SIMMASTER(cr)) + { + builder.addTargetStream(gmx::MDLogger::LogLevel::Warning, + &gmx::TextOutputFile::standardError()); + } + return builder.build(); +} + +//! Make a TaskTarget from an mdrun argument string. 
+static TaskTarget findTaskTarget(const char *optionString) +{ + TaskTarget returnValue = TaskTarget::Auto; + + if (strncmp(optionString, "auto", 3) == 0) + { + returnValue = TaskTarget::Auto; + } + else if (strncmp(optionString, "cpu", 3) == 0) + { + returnValue = TaskTarget::Cpu; + } + else if (strncmp(optionString, "gpu", 3) == 0) + { + returnValue = TaskTarget::Gpu; + } + else + { + GMX_ASSERT(false, "Option string should have been checked for sanity already"); + } + + return returnValue; +} + +int Mdrunner::mdrunner() +{ + matrix box; + t_nrnb *nrnb; + t_forcerec *fr = nullptr; + t_fcdata *fcd = nullptr; + real ewaldcoeff_q = 0; + real ewaldcoeff_lj = 0; + gmx_vsite_t *vsite = nullptr; + int nChargePerturbed = -1, nTypePerturbed = 0; + gmx_wallcycle_t wcycle; + gmx_walltime_accounting_t walltime_accounting = nullptr; + int rc; + gmx_int64_t reset_counters; + int nthreads_pme = 1; + gmx_membed_t * membed = nullptr; + gmx_hw_info_t *hwinfo = nullptr; + + /* CAUTION: threads may be started later on in this function, so + cr doesn't reflect the final parallel state right now */ + std::unique_ptr mdModules(new gmx::MDModules); + t_inputrec inputrecInstance; + t_inputrec *inputrec = &inputrecInstance; + gmx_mtop_t mtop; + + if (mdrunOptions.continuationOptions.appendFiles) + { + fplog = nullptr; + } + + bool doMembed = opt2bSet("-membed", nfile, fnm); + bool doRerun = mdrunOptions.rerun; + + // Handle task-assignment related user options. + EmulateGpuNonbonded emulateGpuNonbonded = (getenv("GMX_EMULATE_GPU") != nullptr ? + EmulateGpuNonbonded::Yes : EmulateGpuNonbonded::No); + std::vector gpuIdsAvailable; + try + { + gpuIdsAvailable = parseUserGpuIds(hw_opt.gpuIdsAvailable); + // TODO We could put the GPU IDs into a std::map to find + // duplicates, but for the small numbers of IDs involved, this + // code is simple and fast. + for (size_t i = 0; i != gpuIdsAvailable.size(); ++i) + { + for (size_t j = i+1; j != gpuIdsAvailable.size(); ++j) + { + if (gpuIdsAvailable[i] == gpuIdsAvailable[j]) + { + GMX_THROW(InvalidInputError(formatString("The string of available GPU device IDs '%s' may not contain duplicate device IDs", hw_opt.gpuIdsAvailable.c_str()))); + } + } + } + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + + std::vector userGpuTaskAssignment; + try + { + userGpuTaskAssignment = parseUserGpuIds(hw_opt.userGpuTaskAssignment); + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + auto nonbondedTarget = findTaskTarget(nbpu_opt); + auto pmeTarget = findTaskTarget(pme_opt); + auto pmeFftTarget = findTaskTarget(pme_fft_opt); + PmeRunMode pmeRunMode = PmeRunMode::None; + + // Here we assume that SIMMASTER(cr) does not change even after the + // threads are started. + gmx::LoggerOwner logOwner(buildLogger(fplog, cr)); + gmx::MDLogger mdlog(logOwner.logger()); + + // TODO The thread-MPI master rank makes a working + // PhysicalNodeCommunicator here, but it gets rebuilt by all ranks + // after the threads have been launched. This works because no use + // is made of that communicator until after the execution paths + // have rejoined. But it is likely that we can improve the way + // this is expressed, e.g. by expressly running detection only the + // master rank for thread-MPI, rather than relying on the mutex + // and reference count. 
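As an aside to the duplicate check on gpuIdsAvailable above: the TODO there mentions using a container instead of the quadratic pairwise scan. The container form needs one insertion per element (illustrative sketch with a placeholder name, not part of the patch; a std::set is sufficient, no map is needed):

#include <set>
#include <vector>

// Returns true if any device ID occurs more than once.
bool hasDuplicateDeviceIds(const std::vector<int> &ids)
{
    std::set<int> seen;
    for (const int id : ids)
    {
        if (!seen.insert(id).second)    // insert() reports whether the ID was new
        {
            return true;
        }
    }
    return false;
}

For the handful of device IDs involved the pairwise scan is just as good, which is the trade-off the TODO already notes.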
+ PhysicalNodeCommunicator physicalNodeComm(MPI_COMM_WORLD, gmx_physicalnode_id_hash()); + hwinfo = gmx_detect_hardware(mdlog, physicalNodeComm); + + gmx_print_detected_hardware(fplog, cr, ms, mdlog, hwinfo); + + std::vector gpuIdsToUse; + auto compatibleGpus = getCompatibleGpus(hwinfo->gpu_info); + if (gpuIdsAvailable.empty()) + { + gpuIdsToUse = compatibleGpus; + } + else + { + for (const auto &availableGpuId : gpuIdsAvailable) + { + bool availableGpuIsCompatible = false; + for (const auto &compatibleGpuId : compatibleGpus) + { + if (availableGpuId == compatibleGpuId) + { + availableGpuIsCompatible = true; + break; + } + } + if (!availableGpuIsCompatible) + { + gmx_fatal(FARGS, "You limited the set of compatible GPUs to a set that included ID #%d, but that ID is not for a compatible GPU. List only compatible GPUs.", availableGpuId); + } + gpuIdsToUse.push_back(availableGpuId); + } + } + + if (fplog != nullptr) + { + /* Print references after all software/hardware printing */ + please_cite(fplog, "Abraham2015"); + please_cite(fplog, "Pall2015"); + please_cite(fplog, "Pronk2013"); + please_cite(fplog, "Hess2008b"); + please_cite(fplog, "Spoel2005a"); + please_cite(fplog, "Lindahl2001a"); + please_cite(fplog, "Berendsen95a"); + } + + std::unique_ptr globalState; + + if (SIMMASTER(cr)) + { + /* Only the master rank has the global state */ + globalState = std::unique_ptr(new t_state); + + /* Read (nearly) all data required for the simulation */ + read_tpx_state(ftp2fn(efTPR, nfile, fnm), inputrec, globalState.get(), &mtop); + + if (inputrec->cutoff_scheme != ecutsVERLET) + { + if (nstlist_cmdline > 0) + { + gmx_fatal(FARGS, "Can not set nstlist with the group cut-off scheme"); + } + + if (!compatibleGpus.empty()) + { + GMX_LOG(mdlog.warning).asParagraph().appendText( + "NOTE: GPU(s) found, but the current simulation can not use GPUs\n" + " To use a GPU, set the mdp option: cutoff-scheme = Verlet"); + } + } + } + + /* Check and update the hardware options for internal consistency */ - check_and_update_hw_opt_1(&hw_opt, cr, domdecOptions.numPmeRanks); ++ check_and_update_hw_opt_1(mdlog, &hw_opt, cr, domdecOptions.numPmeRanks); + + /* Early check for externally set process affinity. */ + gmx_check_thread_affinity_set(mdlog, cr, + &hw_opt, hwinfo->nthreads_hw_avail, FALSE); + + if (GMX_THREAD_MPI && SIMMASTER(cr)) + { + if (domdecOptions.numPmeRanks > 0 && hw_opt.nthreads_tmpi <= 0) + { + gmx_fatal(FARGS, "You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME ranks"); + } + + /* Since the master knows the cut-off scheme, update hw_opt for this. + * This is done later for normal MPI and also once more with tMPI + * for all tMPI ranks. + */ + check_and_update_hw_opt_2(&hw_opt, inputrec->cutoff_scheme); + + bool useGpuForNonbonded = false; + bool useGpuForPme = false; + try + { + // If the user specified the number of ranks, then we must + // respect that, but in default mode, we need to allow for + // the number of GPUs to choose the number of ranks. 
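+            // Note that these are provisional decisions, needed only to choose
+            // how many thread-MPI ranks to start; after the threads are spawned,
+            // every rank runs the task-deciding functions again (further below)
+            // and they all agree on the result without communicating.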
+ + useGpuForNonbonded = decideWhetherToUseGpusForNonbondedWithThreadMpi + (nonbondedTarget, gpuIdsToUse, userGpuTaskAssignment, emulateGpuNonbonded, + inputrec->cutoff_scheme == ecutsVERLET, + gpuAccelerationOfNonbondedIsUseful(mdlog, inputrec, GMX_THREAD_MPI), + hw_opt.nthreads_tmpi); + auto canUseGpuForPme = pme_gpu_supports_build(nullptr) && pme_gpu_supports_input(inputrec, nullptr); + useGpuForPme = decideWhetherToUseGpusForPmeWithThreadMpi + (useGpuForNonbonded, pmeTarget, gpuIdsToUse, userGpuTaskAssignment, + canUseGpuForPme, hw_opt.nthreads_tmpi, domdecOptions.numPmeRanks); + + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + + /* Determine how many thread-MPI ranks to start. + * + * TODO Over-writing the user-supplied value here does + * prevent any possible subsequent checks from working + * correctly. */ + hw_opt.nthreads_tmpi = get_nthreads_mpi(hwinfo, + &hw_opt, + gpuIdsToUse, + useGpuForNonbonded, + useGpuForPme, + inputrec, &mtop, + mdlog, + doMembed); + + // Now start the threads for thread MPI. + cr = spawnThreads(hw_opt.nthreads_tmpi); + /* The main thread continues here with a new cr. We don't deallocate + the old cr because other threads may still be reading it. */ + // TODO Both master and spawned threads call dup_tfn and + // reinitialize_commrec_for_this_thread. Find a way to express + // this better. + physicalNodeComm = PhysicalNodeCommunicator(MPI_COMM_WORLD, gmx_physicalnode_id_hash()); + } + // END OF CAUTION: cr and physicalNodeComm are now reliable + + if (PAR(cr)) + { + /* now broadcast everything to the non-master nodes/threads: */ + init_parallel(cr, inputrec, &mtop); + } + + // Now each rank knows the inputrec that SIMMASTER read and used, + // and (if applicable) cr->nnodes has been assigned the number of + // thread-MPI ranks that have been chosen. The ranks can now all + // run the task-deciding functions and will agree on the result + // without needing to communicate. + // + // TODO Should we do the communication in debug mode to support + // having an assertion? + // + // Note that these variables describe only their own node. + bool useGpuForNonbonded = false; + bool useGpuForPme = false; + try + { + // It's possible that there are different numbers of GPUs on + // different nodes, which is the user's responsibilty to + // handle. If unsuitable, we will notice that during task + // assignment. + bool gpusWereDetected = hwinfo->ngpu_compatible_tot > 0; + useGpuForNonbonded = decideWhetherToUseGpusForNonbonded(nonbondedTarget, userGpuTaskAssignment, + emulateGpuNonbonded, inputrec->cutoff_scheme == ecutsVERLET, + gpuAccelerationOfNonbondedIsUseful(mdlog, inputrec, !GMX_THREAD_MPI), + gpusWereDetected); + auto canUseGpuForPme = pme_gpu_supports_build(nullptr) && pme_gpu_supports_input(inputrec, nullptr); + useGpuForPme = decideWhetherToUseGpusForPme(useGpuForNonbonded, pmeTarget, userGpuTaskAssignment, + canUseGpuForPme, cr->nnodes, domdecOptions.numPmeRanks, + gpusWereDetected); + + pmeRunMode = (useGpuForPme ? PmeRunMode::GPU : PmeRunMode::CPU); + if (pmeRunMode == PmeRunMode::GPU) + { + if (pmeFftTarget == TaskTarget::Cpu) + { + pmeRunMode = PmeRunMode::Mixed; + } + } + else if (pmeFftTarget == TaskTarget::Gpu) + { + gmx_fatal(FARGS, "Assigning FFTs to GPU requires PME to be assigned to GPU as well. 
With PME on CPU you should not be using -pmefft."); + } + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + + // TODO: Error handling + mdModules->assignOptionsToModules(*inputrec->params, nullptr); + + if (fplog != nullptr) + { + pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE); + fprintf(fplog, "\n"); + } + + if (SIMMASTER(cr)) + { + /* now make sure the state is initialized and propagated */ + set_state_entries(globalState.get(), inputrec); + } + + /* NM and TPI parallelize over force/energy calculations, not atoms, + * so we need to initialize and broadcast the global state. + */ + if (inputrec->eI == eiNM || inputrec->eI == eiTPI) + { + if (!MASTER(cr)) + { + globalState = std::unique_ptr(new t_state); + } + broadcastStateWithoutDynamics(cr, globalState.get()); + } + + /* A parallel command line option consistency check that we can + only do after any threads have started. */ + if (!PAR(cr) && (domdecOptions.numCells[XX] > 1 || + domdecOptions.numCells[YY] > 1 || + domdecOptions.numCells[ZZ] > 1 || + domdecOptions.numPmeRanks > 0)) + { + gmx_fatal(FARGS, + "The -dd or -npme option request a parallel simulation, " +#if !GMX_MPI + "but %s was compiled without threads or MPI enabled" +#else +#if GMX_THREAD_MPI + "but the number of MPI-threads (option -ntmpi) is not set or is 1" +#else + "but %s was not started through mpirun/mpiexec or only one rank was requested through mpirun/mpiexec" +#endif +#endif + , output_env_get_program_display_name(oenv) + ); + } + + if (doRerun && + (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI)) + { + gmx_fatal(FARGS, "The .mdp file specified an energy mininization or normal mode algorithm, and these are not compatible with mdrun -rerun"); + } + + if (can_use_allvsall(inputrec, TRUE, cr, fplog) && DOMAINDECOMP(cr)) + { + gmx_fatal(FARGS, "All-vs-all loops do not work with domain decomposition, use a single MPI rank"); + } + + if (!(EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype))) + { + if (domdecOptions.numPmeRanks > 0) + { + gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr), + "PME-only ranks are requested, but the system does not use PME for electrostatics or LJ"); + } + + domdecOptions.numPmeRanks = 0; + } + + if (useGpuForNonbonded && domdecOptions.numPmeRanks < 0) + { + /* With NB GPUs we don't automatically use PME-only CPU ranks. PME ranks can + * improve performance with many threads per GPU, since our OpenMP + * scaling is bad, but it's difficult to automate the setup. + */ + domdecOptions.numPmeRanks = 0; + } + if (useGpuForPme) + { + if (domdecOptions.numPmeRanks < 0) + { + domdecOptions.numPmeRanks = 0; + // TODO possibly print a note that one can opt-in for a separate PME GPU rank? + } + else + { + GMX_RELEASE_ASSERT(domdecOptions.numPmeRanks <= 1, "PME GPU decomposition is not supported"); + } + } + +#ifdef GMX_FAHCORE + if (MASTER(cr)) + { + fcRegisterSteps(inputrec->nsteps, inputrec->init_step); + } +#endif + + /* NMR restraints must be initialized before load_checkpoint, + * since with time averaging the history is added to t_state. + * For proper consistency check we therefore need to extend + * t_state here. + * So the PME-only nodes (if present) will also initialize + * the distance restraints. 
+ */ + snew(fcd, 1); + + /* This needs to be called before read_checkpoint to extend the state */ + init_disres(fplog, &mtop, inputrec, cr, ms, fcd, globalState.get(), replExParams.exchangeInterval > 0); + + init_orires(fplog, &mtop, inputrec, cr, ms, globalState.get(), &(fcd->orires)); + + auto deform = prepareBoxDeformation(globalState->box, cr, *inputrec); + + ObservablesHistory observablesHistory = {}; + + ContinuationOptions &continuationOptions = mdrunOptions.continuationOptions; + + if (continuationOptions.startedFromCheckpoint) + { + /* Check if checkpoint file exists before doing continuation. + * This way we can use identical input options for the first and subsequent runs... + */ + gmx_bool bReadEkin; + + load_checkpoint(opt2fn_master("-cpi", nfile, fnm, cr), &fplog, + cr, domdecOptions.numCells, + inputrec, globalState.get(), + &bReadEkin, &observablesHistory, + continuationOptions.appendFiles, + continuationOptions.appendFilesOptionSet, + mdrunOptions.reproducible); + + if (bReadEkin) + { + continuationOptions.haveReadEkin = true; + } + } + + if (SIMMASTER(cr) && continuationOptions.appendFiles) + { + gmx_log_open(ftp2fn(efLOG, nfile, fnm), cr, + continuationOptions.appendFiles, &fplog); + logOwner = buildLogger(fplog, nullptr); + mdlog = logOwner.logger(); + } + + if (mdrunOptions.numStepsCommandline > -2) + { + GMX_LOG(mdlog.info).asParagraph(). + appendText("The -nsteps functionality is deprecated, and may be removed in a future version. " + "Consider using gmx convert-tpr -nsteps or changing the appropriate .mdp file field."); + } + /* override nsteps with value set on the commamdline */ + override_nsteps_cmdline(mdlog, mdrunOptions.numStepsCommandline, inputrec); + + if (SIMMASTER(cr)) + { + copy_mat(globalState->box, box); + } + + if (PAR(cr)) + { + gmx_bcast(sizeof(box), box, cr); + } + + /* Update rlist and nstlist. */ + if (inputrec->cutoff_scheme == ecutsVERLET) + { + prepare_verlet_scheme(fplog, cr, inputrec, nstlist_cmdline, &mtop, box, + useGpuForNonbonded || (emulateGpuNonbonded == EmulateGpuNonbonded::Yes), *hwinfo->cpuInfo); + } + + /* Initalize the domain decomposition */ + if (PAR(cr) && !(EI_TPI(inputrec->eI) || + inputrec->eI == eiNM)) + { + cr->dd = init_domain_decomposition(fplog, cr, domdecOptions, mdrunOptions, + &mtop, inputrec, + box, positionsFromStatePointer(globalState.get())); + // Note that local state still does not exist yet. + } + else + { + /* PME, if used, is done on all nodes with 1D decomposition */ + cr->npmenodes = 0; + cr->duty = (DUTY_PP | DUTY_PME); + + if (inputrec->ePBC == epbcSCREW) + { + gmx_fatal(FARGS, + "pbc=%s is only implemented with domain decomposition", + epbc_names[inputrec->ePBC]); + } + } + + if (PAR(cr)) + { + /* After possible communicator splitting in make_dd_communicators. + * we can set up the intra/inter node communication. + */ + gmx_setup_nodecomm(fplog, cr); + } + +#if GMX_MPI + if (isMultiSim(ms)) + { + GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted( + "This is simulation %d out of %d running as a composite GROMACS\n" + "multi-simulation job. Setup for this simulation:\n", + ms->sim, ms->nsim); + } + GMX_LOG(mdlog.warning).appendTextFormatted( + "Using %d MPI %s\n", + cr->nnodes, +#if GMX_THREAD_MPI + cr->nnodes == 1 ? "thread" : "threads" +#else + cr->nnodes == 1 ? 
"process" : "processes" +#endif + ); + fflush(stderr); +#endif + + /* Check and update hw_opt for the cut-off scheme */ + check_and_update_hw_opt_2(&hw_opt, inputrec->cutoff_scheme); + + /* Check and update the number of OpenMP threads requested */ + checkAndUpdateRequestedNumOpenmpThreads(&hw_opt, *hwinfo, cr, ms, physicalNodeComm.size_, + pmeRunMode, mtop); + + gmx_omp_nthreads_init(mdlog, cr, + hwinfo->nthreads_hw_avail, + physicalNodeComm.size_, + hw_opt.nthreads_omp, + hw_opt.nthreads_omp_pme, + !thisRankHasDuty(cr, DUTY_PP), + inputrec->cutoff_scheme == ecutsVERLET); + +#ifndef NDEBUG + if (EI_TPI(inputrec->eI) && + inputrec->cutoff_scheme == ecutsVERLET) + { + gmx_feenableexcept(); + } +#endif + + // Build a data structure that expresses which kinds of non-bonded + // task are handled by this rank. + // + // TODO Later, this might become a loop over all registered modules + // relevant to the mdp inputs, to find those that have such tasks. + // + // TODO This could move before init_domain_decomposition() as part + // of refactoring that separates the responsibility for duty + // assignment from setup for communication between tasks, and + // setup for tasks handled with a domain (ie including short-ranged + // tasks, bonded tasks, etc.). + // + // Note that in general useGpuForNonbonded, etc. can have a value + // that is inconsistent with the presence of actual GPUs on any + // rank, and that is not known to be a problem until the + // duty of the ranks on a node become node. + // + // TODO Later we might need the concept of computeTasksOnThisRank, + // from which we construct gpuTasksOnThisRank. + // + // Currently the DD code assigns duty to ranks that can + // include PP work that currently can be executed on a single + // GPU, if present and compatible. This has to be coordinated + // across PP ranks on a node, with possible multiple devices + // or sharing devices on a node, either from the user + // selection, or automatically. + auto haveGpus = !gpuIdsToUse.empty(); + std::vector gpuTasksOnThisRank; + if (thisRankHasDuty(cr, DUTY_PP)) + { + if (useGpuForNonbonded) + { + if (haveGpus) + { + gpuTasksOnThisRank.push_back(GpuTask::Nonbonded); + } + else if (nonbondedTarget == TaskTarget::Gpu) + { + gmx_fatal(FARGS, "Cannot run short-ranged nonbonded interactions on a GPU because there is none detected."); + } + } + } + // TODO cr->duty & DUTY_PME should imply that a PME algorithm is active, but currently does not. + if (EEL_PME(inputrec->coulombtype) && (thisRankHasDuty(cr, DUTY_PME))) + { + if (useGpuForPme) + { + if (haveGpus) + { + gpuTasksOnThisRank.push_back(GpuTask::Pme); + } + else if (pmeTarget == TaskTarget::Gpu) + { + gmx_fatal(FARGS, "Cannot run PME on a GPU because there is none detected."); + } + } + } + + GpuTaskAssignment gpuTaskAssignment; + try + { + // Produce the task assignment for this rank. + gpuTaskAssignment = runTaskAssignment(gpuIdsToUse, userGpuTaskAssignment, *hwinfo, + mdlog, cr, ms, physicalNodeComm, gpuTasksOnThisRank); + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; + + /* Prevent other ranks from continuing after an issue was found + * and reported as a fatal error. + * + * TODO This function implements a barrier so that MPI runtimes + * can organize an orderly shutdown if one of the ranks has had to + * issue a fatal error in various code already run. When we have + * MPI-aware error handling and reporting, this should be + * improved. 
*/ +#if GMX_MPI + if (PAR(cr)) + { + MPI_Barrier(cr->mpi_comm_mysim); + } + if (isMultiSim(ms)) + { + if (SIMMASTER(cr)) + { + MPI_Barrier(ms->mpi_comm_masters); + } + /* We need another barrier to prevent non-master ranks from contiuing + * when an error occured in a different simulation. + */ + MPI_Barrier(cr->mpi_comm_mysim); + } +#endif + + /* Now that we know the setup is consistent, check for efficiency */ + check_resource_division_efficiency(hwinfo, !gpuTaskAssignment.empty(), mdrunOptions.ntompOptionIsSet, + cr, mdlog); + + gmx_device_info_t *nonbondedDeviceInfo = nullptr; + + if (thisRankHasDuty(cr, DUTY_PP)) + { + // This works because only one task of each type is currently permitted. + auto nbGpuTaskMapping = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(), + hasTaskType); + if (nbGpuTaskMapping != gpuTaskAssignment.end()) + { + int nonbondedDeviceId = nbGpuTaskMapping->deviceId_; + nonbondedDeviceInfo = getDeviceInfo(hwinfo->gpu_info, nonbondedDeviceId); + init_gpu(mdlog, nonbondedDeviceInfo); + + if (DOMAINDECOMP(cr)) + { + /* When we share GPUs over ranks, we need to know this for the DLB */ + dd_setup_dlb_resource_sharing(cr, nonbondedDeviceId); + } + + } + } + + std::unique_ptr initializedClfftLibrary; + + gmx_device_info_t *pmeDeviceInfo = nullptr; + // Later, this program could contain kernels that might be later + // re-used as auto-tuning progresses, or subsequent simulations + // are invoked. + PmeGpuProgramStorage pmeGpuProgram; + // This works because only one task of each type is currently permitted. + auto pmeGpuTaskMapping = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(), hasTaskType); + const bool thisRankHasPmeGpuTask = (pmeGpuTaskMapping != gpuTaskAssignment.end()); + if (thisRankHasPmeGpuTask) + { + pmeDeviceInfo = getDeviceInfo(hwinfo->gpu_info, pmeGpuTaskMapping->deviceId_); + init_gpu(mdlog, pmeDeviceInfo); + pmeGpuProgram = buildPmeGpuProgram(pmeDeviceInfo); + // TODO It would be nice to move this logic into the factory + // function. See Redmine #2535. + bool isMasterThread = !GMX_THREAD_MPI || MASTER(cr); + if (pmeRunMode == PmeRunMode::GPU && !initializedClfftLibrary && isMasterThread) + { + initializedClfftLibrary = initializeClfftLibrary(); + } + } + + /* getting number of PP/PME threads + PME: env variable should be read only on one node to make sure it is + identical everywhere; + */ + nthreads_pme = gmx_omp_nthreads_get(emntPME); + + int numThreadsOnThisRank; + /* threads on this MPI process or TMPI thread */ + if (thisRankHasDuty(cr, DUTY_PP)) + { + numThreadsOnThisRank = gmx_omp_nthreads_get(emntNonbonded); + } + else + { + numThreadsOnThisRank = nthreads_pme; + } + + checkHardwareOversubscription(numThreadsOnThisRank, cr->nodeid, + *hwinfo->hardwareTopology, + physicalNodeComm, mdlog); + + if (hw_opt.thread_affinity != threadaffOFF) + { + /* Before setting affinity, check whether the affinity has changed + * - which indicates that probably the OpenMP library has changed it + * since we first checked). 
+         */
+        gmx_check_thread_affinity_set(mdlog, cr,
+                                      &hw_opt, hwinfo->nthreads_hw_avail, TRUE);
+
+        int numThreadsOnThisNode, intraNodeThreadOffset;
+        analyzeThreadsOnThisNode(physicalNodeComm, numThreadsOnThisRank, &numThreadsOnThisNode,
+                                 &intraNodeThreadOffset);
+
+        /* Set the CPU affinity */
+        gmx_set_thread_affinity(mdlog, cr, &hw_opt, *hwinfo->hardwareTopology,
+                                numThreadsOnThisRank, numThreadsOnThisNode,
+                                intraNodeThreadOffset, nullptr);
+    }
+
+    if (mdrunOptions.timingOptions.resetStep > -1)
+    {
+        GMX_LOG(mdlog.info).asParagraph().
+            appendText("The -resetstep functionality is deprecated, and may be removed in a future version.");
+    }
+    wcycle = wallcycle_init(fplog, mdrunOptions.timingOptions.resetStep, cr);
+
+    if (PAR(cr))
+    {
+        /* Master synchronizes its value of reset_counters with all nodes
+         * including PME only nodes */
+        reset_counters = wcycle_get_reset_counters(wcycle);
+        gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr);
+        wcycle_set_reset_counters(wcycle, reset_counters);
+    }
+
+    // Membrane embedding must be initialized before we call init_forcerec()
+    if (doMembed)
+    {
+        if (MASTER(cr))
+        {
+            fprintf(stderr, "Initializing membed");
+        }
+        /* Note that membed cannot work in parallel because mtop is
+         * changed here. Fix this if we ever want to make it run with
+         * multiple ranks. */
+        membed = init_membed(fplog, nfile, fnm, &mtop, inputrec, globalState.get(), cr, &mdrunOptions.checkpointOptions.period);
+    }
+
+    std::unique_ptr<MDAtoms> mdAtoms;
+
+    snew(nrnb, 1);
+    if (thisRankHasDuty(cr, DUTY_PP))
+    {
+        /* Initialize the force record */
+        fr = mk_forcerec();
+        fr->forceProviders = mdModules->initForceProviders();
+        init_forcerec(fplog, mdlog, fr, fcd,
+                      inputrec, &mtop, cr, box,
+                      opt2fn("-table", nfile, fnm),
+                      opt2fn("-tablep", nfile, fnm),
+                      opt2fns("-tableb", nfile, fnm),
+                      *hwinfo, nonbondedDeviceInfo,
+                      FALSE,
+                      pforce);
+
+        /* Initialize QM-MM */
+        if (fr->bQMMM)
+        {
+            GMX_LOG(mdlog.info).asParagraph().
+                appendText("Large parts of the QM/MM support are deprecated, and may be removed in a future "
+                           "version. Please get in touch with the developers if you find the support useful, "
+                           "as help is needed if the functionality is to continue to be available.");
+            init_QMMMrec(cr, &mtop, inputrec, fr);
+        }
+
+        /* Initialize the mdAtoms structure.
+         * mdAtoms is not filled with atom data,
+         * as this cannot be done now with domain decomposition.
+         */
+        mdAtoms = makeMDAtoms(fplog, mtop, *inputrec, thisRankHasPmeGpuTask);
+        if (globalState && thisRankHasPmeGpuTask)
+        {
+            // The pinning of coordinates in the global state object works, because we only use
+            // PME on GPU without DD or on a separate PME rank, and because the local state pointer
+            // points to the global state object without DD.
+            // FIXME: MD and EM separately set up the local state - this should happen in the same function,
+            // which should also perform the pinning.
+            changePinningPolicy(&globalState->x, pme_get_pinning_policy());
+        }
+
+        /* Initialize the virtual site communication */
+        vsite = initVsite(mtop, cr);
+
+        calc_shifts(box, fr->shift_vec);
+
+        /* With periodic molecules the charge groups should be whole at start up
+         * and the virtual sites should not be far from their proper positions.
+         */
+        if (!inputrec->bContinuation && MASTER(cr) &&
+            !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
+        {
+            /* Make molecules whole at start of run */
+            if (fr->ePBC != epbcNONE)
+            {
+                rvec *xGlobal = as_rvec_array(globalState->x.data());
+                do_pbc_first_mtop(fplog, inputrec->ePBC, box, &mtop, xGlobal);
+            }
+            if (vsite)
+            {
+                /* Correct initial vsite positions are required
+                 * for the initial distribution in the domain decomposition
+                 * and for the initial shell prediction.
+                 */
+                constructVsitesGlobal(mtop, globalState->x);
+            }
+        }
+
+        if (EEL_PME(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype))
+        {
+            ewaldcoeff_q  = fr->ic->ewaldcoeff_q;
+            ewaldcoeff_lj = fr->ic->ewaldcoeff_lj;
+        }
+    }
+    else
+    {
+        /* This is a PME only node */
+
+        GMX_ASSERT(globalState == nullptr, "We don't need the state on a PME only rank and expect it to be uninitialized");
+
+        ewaldcoeff_q  = calc_ewaldcoeff_q(inputrec->rcoulomb, inputrec->ewald_rtol);
+        ewaldcoeff_lj = calc_ewaldcoeff_lj(inputrec->rvdw, inputrec->ewald_rtol_lj);
+    }
+
+    gmx_pme_t *sepPmeData = nullptr;
+    // This reference hides the fact that PME data is owned by runner on PME-only ranks and by forcerec on other ranks
+    GMX_ASSERT(thisRankHasDuty(cr, DUTY_PP) == (fr != nullptr), "Double-checking that only PME-only ranks have no forcerec");
+    gmx_pme_t * &pmedata = fr ? fr->pmedata : sepPmeData;
+
+    /* Initiate PME if necessary,
+     * either on all nodes or on dedicated PME nodes only. */
+    if (EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype))
+    {
+        if (mdAtoms && mdAtoms->mdatoms())
+        {
+            nChargePerturbed = mdAtoms->mdatoms()->nChargePerturbed;
+            if (EVDW_PME(inputrec->vdwtype))
+            {
+                nTypePerturbed = mdAtoms->mdatoms()->nTypePerturbed;
+            }
+        }
+        if (cr->npmenodes > 0)
+        {
+            /* The PME only nodes need to know nChargePerturbed (FEP on Q) and nTypePerturbed (FEP on LJ) */
+            gmx_bcast_sim(sizeof(nChargePerturbed), &nChargePerturbed, cr);
+            gmx_bcast_sim(sizeof(nTypePerturbed), &nTypePerturbed, cr);
+        }
+
+        if (thisRankHasDuty(cr, DUTY_PME))
+        {
+            try
+            {
+                pmedata = gmx_pme_init(cr,
+                                       getNumPmeDomains(cr->dd),
+                                       inputrec,
+                                       mtop.natoms, nChargePerturbed, nTypePerturbed,
+                                       mdrunOptions.reproducible,
+                                       ewaldcoeff_q, ewaldcoeff_lj,
+                                       nthreads_pme,
+                                       pmeRunMode, nullptr,
+                                       pmeDeviceInfo, pmeGpuProgram.get(), mdlog);
+            }
+            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+        }
+    }
+
+
+    if (EI_DYNAMICS(inputrec->eI))
+    {
+        /* Turn on signal handling on all nodes */
+        /*
+         * A user signal from the PME nodes (if any)
+         * is communicated to the PP nodes.
+         */
+        signal_handler_install();
+    }
+
+    if (thisRankHasDuty(cr, DUTY_PP))
+    {
+        /* Assumes uniform use of the number of OpenMP threads */
+        walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntDefault));
+
+        if (inputrec->bPull)
+        {
+            /* Initialize pull code */
+            inputrec->pull_work =
+                init_pull(fplog, inputrec->pull, inputrec,
+                          &mtop, cr, inputrec->fepvals->init_lambda);
+            if (EI_DYNAMICS(inputrec->eI) && MASTER(cr))
+            {
+                init_pull_output_files(inputrec->pull_work,
+                                       nfile, fnm, oenv,
+                                       continuationOptions);
+            }
+        }
+
+        if (inputrec->bRot)
+        {
+            /* Initialize enforced rotation code */
+            init_rot(fplog, inputrec, nfile, fnm, cr, globalState.get(), &mtop, oenv, mdrunOptions);
+        }
+
+        /* Let makeConstraints know whether we have essential dynamics constraints.
+ * TODO: inputrec should tell us whether we use an algorithm, not a file option or the checkpoint + */ + bool doEssentialDynamics = (opt2fn_null("-ei", nfile, fnm) != nullptr || observablesHistory.edsamHistory); + auto constr = makeConstraints(mtop, *inputrec, doEssentialDynamics, + fplog, *mdAtoms->mdatoms(), + cr, *ms, nrnb, wcycle, fr->bMolPBC); + + if (DOMAINDECOMP(cr)) + { + GMX_RELEASE_ASSERT(fr, "fr was NULL while cr->duty was DUTY_PP"); + /* This call is not included in init_domain_decomposition mainly + * because fr->cginfo_mb is set later. + */ + dd_init_bondeds(fplog, cr->dd, &mtop, vsite, inputrec, + domdecOptions.checkBondedInteractions, + fr->cginfo_mb); + } + + /* Now do whatever the user wants us to do (how flexible...) */ + Integrator integrator { + fplog, cr, ms, mdlog, nfile, fnm, + oenv, + mdrunOptions, + vsite, constr.get(), + deform.get(), + mdModules->outputProvider(), + inputrec, &mtop, + fcd, + globalState.get(), + &observablesHistory, + mdAtoms.get(), nrnb, wcycle, fr, + replExParams, + membed, + walltime_accounting + }; + integrator.run(inputrec->eI); + if (inputrec->bRot) + { + finish_rot(inputrec->rot); + } + + if (inputrec->bPull) + { + finish_pull(inputrec->pull_work); + } + + } + else + { + GMX_RELEASE_ASSERT(pmedata, "pmedata was NULL while cr->duty was not DUTY_PP"); + /* do PME only */ + walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME)); + gmx_pmeonly(pmedata, cr, nrnb, wcycle, walltime_accounting, inputrec, pmeRunMode); + } + + wallcycle_stop(wcycle, ewcRUN); + + /* Finish up, write some stuff + * if rerunMD, don't write last frame again + */ + finish_run(fplog, mdlog, cr, + inputrec, nrnb, wcycle, walltime_accounting, + fr ? fr->nbv : nullptr, + pmedata, + EI_DYNAMICS(inputrec->eI) && !isMultiSim(ms)); + + // Free PME data + if (pmedata) + { + gmx_pme_destroy(pmedata); + pmedata = nullptr; + } + + // FIXME: this is only here to manually unpin mdAtoms->chargeA_ and state->x, + // before we destroy the GPU context(s) in free_gpu_resources(). + // Pinned buffers are associated with contexts in CUDA. + // As soon as we destroy GPU contexts after mdrunner() exits, these lines should go. + mdAtoms.reset(nullptr); + globalState.reset(nullptr); + mdModules.reset(nullptr); // destruct force providers here as they might also use the GPU + + /* Free GPU memory and set a physical node tMPI barrier (which should eventually go away) */ + free_gpu_resources(fr, physicalNodeComm); + free_gpu(nonbondedDeviceInfo); + free_gpu(pmeDeviceInfo); + done_forcerec(fr, mtop.molblock.size(), mtop.groups.grps[egcENER].nr); + sfree(fcd); + + if (doMembed) + { + free_membed(membed); + } + + gmx_hardware_info_free(); + + /* Does what it says */ + print_date_and_time(fplog, cr->nodeid, "Finished mdrun", gmx_gettime()); + walltime_accounting_destroy(walltime_accounting); + sfree(nrnb); + + /* Close logfile already here if we were appending to it */ + if (MASTER(cr) && continuationOptions.appendFiles) + { + gmx_log_close(fplog); + fplog = nullptr; + } + + rc = (int)gmx_get_stop_condition(); + +#if GMX_THREAD_MPI + /* we need to join all threads. The sub-threads join when they + exit this function, but the master thread needs to be told to + wait for that. */ + if (PAR(cr) && MASTER(cr)) + { + done_commrec(cr); + tMPI_Finalize(); + } +#endif + + return rc; +} + +} // namespace gmx
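Editor's note on the GPU task lookup in the runner hunks above: the code maps this rank's GPU tasks to device ids with std::find_if and a hasTaskType predicate, relying on the invariant that at most one task of each type is permitted per rank, so find_if either returns the unique mapping or end(). The following is a minimal, self-contained sketch of that pattern only; the GpuTaskMapping struct, its member names, and the main() driver are simplified assumptions for illustration, not the actual GROMACS taskassignment headers.

// Sketch: unique task-per-type lookup with std::find_if (assumption: simplified types).
#include <algorithm>
#include <vector>

enum class GpuTask { Nonbonded, Pme };

struct GpuTaskMapping
{
    GpuTask task_;      // which kind of work was assigned
    int     deviceId_;  // which GPU it runs on
};

// Predicate matching one task type, as used with std::find_if above.
template <GpuTask TaskType>
bool hasTaskType(const GpuTaskMapping &mapping)
{
    return mapping.task_ == TaskType;
}

int main()
{
    // One entry per GPU task assigned to this rank; at most one per type.
    std::vector<GpuTaskMapping> gpuTaskAssignment = { { GpuTask::Nonbonded, 0 } };

    auto nbGpuTaskMapping = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(),
                                         hasTaskType<GpuTask::Nonbonded>);
    int  nonbondedDeviceId = (nbGpuTaskMapping != gpuTaskAssignment.end()) ? nbGpuTaskMapping->deviceId_ : -1;
    return nonbondedDeviceId >= 0 ? 0 : 1;
}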
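Editor's note on the #if GMX_MPI barrier block above: its purpose is to stop ranks from running past a point where another rank may already have reported a fatal setup error, so that the job can shut down in an orderly way. Below is a hedged sketch of the same idea in plain MPI, assuming MPI_COMM_WORLD in place of cr->mpi_comm_mysim and ms->mpi_comm_masters, and a hypothetical setupFailedOnThisRank flag standing in for the earlier consistency checks.

// Sketch: synchronize all ranks before acting on a possible earlier fatal error.
#include <mpi.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    bool setupFailedOnThisRank = false;  // would be set by earlier checks (hypothetical)

    MPI_Barrier(MPI_COMM_WORLD);         // let every rank reach this point first
    if (setupFailedOnThisRank)
    {
        MPI_Abort(MPI_COMM_WORLD, 1);    // bring the whole job down cleanly
    }

    MPI_Finalize();
    return 0;
}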
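Editor's note on the explicit mdAtoms.reset(nullptr) and globalState.reset(nullptr) calls near the end of the runner: as the FIXME comment says, buffers pinned against a GPU context must be unpinned before free_gpu_resources()/free_gpu() destroy that context, so the unique_ptr owners are reset by hand instead of waiting for scope exit. The sketch below shows only that ordering idea with hypothetical ResourcePool and PinnedBuffer types; it is not GROMACS code.

// Sketch: release a dependent resource before its provider is torn down.
#include <memory>

struct ResourcePool              // stands in for the GPU context
{
    ~ResourcePool() { /* destroying the context would invalidate pinned buffers */ }
};

struct PinnedBuffer              // stands in for the pinned mdAtoms/globalState buffers
{
    explicit PinnedBuffer(ResourcePool *pool) : pool_(pool) {}
    ~PinnedBuffer() { /* must unpin while *pool_ is still alive */ }
    ResourcePool *pool_;
};

int main()
{
    ResourcePool                  context;
    std::unique_ptr<PinnedBuffer> buffer = std::make_unique<PinnedBuffer>(&context);

    buffer.reset(nullptr);  // release (unpin) first, mirroring mdAtoms.reset(nullptr)
    // Only now would it be safe to tear the context down, as free_gpu() does above.
    return 0;
}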