#include "config.h"
#include <assert.h>
-#include <math.h>
#include <stdlib.h>
+#include <cmath>
+
#include <algorithm>
#include "gromacs/domdec/domdec.h"
-#include "gromacs/fileio/gmxfio.h"
-#include "gromacs/legacyheaders/constr.h"
-#include "gromacs/legacyheaders/copyrite.h"
-#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
-#include "gromacs/legacyheaders/mdrun.h"
+#include "gromacs/fileio/copyrite.h"
+#include "gromacs/gmxlib/gmx_omp_nthreads.h"
#include "gromacs/legacyheaders/nrnb.h"
#include "gromacs/legacyheaders/types/commrec.h"
#include "gromacs/math/units.h"
#include "gromacs/math/vec.h"
-#include "gromacs/pbcutil/pbc-simd.h"
+#include "gromacs/mdlib/constr.h"
+#include "gromacs/mdlib/mdrun.h"
#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/pbcutil/pbc-simd.h"
#include "gromacs/simd/simd.h"
#include "gromacs/simd/simd_math.h"
#include "gromacs/simd/vector_operations.h"
#include "gromacs/topology/block.h"
#include "gromacs/topology/mtop_util.h"
#include "gromacs/utility/bitmask.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxomp.h"
#include "gromacs/utility/smalloc.h"
/* MSVC 2010 produces buggy SIMD PBC code, disable SIMD for MSVC <= 2010 */
-#if defined GMX_SIMD_HAVE_REAL && !(defined _MSC_VER && _MSC_VER < 1700) && !defined(__ICL)
-#define LINCS_SIMD
+#if GMX_SIMD_HAVE_REAL && !(defined _MSC_VER && _MSC_VER < 1700) && !defined(__ICL)
+# define LINCS_SIMD
#endif
-#if defined(GMX_SIMD_X86_AVX_256) || defined(GMX_SIMD_X86_AVX2_256)
+#if GMX_SIMD_X86_AVX_256 || GMX_SIMD_X86_AVX2_256
// This was originally work-in-progress for augmenting the SIMD module with
// masked load/store operations. Instead, that turned into and extended SIMD
}
-# ifdef GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
+# if GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
# define gmx_hack_simd4_load3_r(mem) _mm256_maskload_pd((mem), _mm_castsi128_ps(_mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1)))
# define gmx_hack_simd4_store3_r(mem, x) _mm256_maskstore_pd((mem), _mm_castsi128_ps(_mm256_set_epi32(0, 0, -1, -1, -1, -1, -1, -1)), (x))
# else
a[6] = _mm256_extractf128_ps(row2, 1);
a[7] = _mm256_extractf128_ps(row3, 1);
}
-#ifdef GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
+#if GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
# define gmx_hack_simd4_load3_r(mem) _mm_maskload_ps((mem), _mm_castsi256_pd(_mm_set_epi32(0, -1, -1, -1)))
# define gmx_hack_simd4_store3_r(mem, x) _mm_maskstore_ps((mem), _mm_castsi256_pd(_mm_set_epi32(0, -1, -1, -1)), (x))
#else
#endif /* AVX */
-#ifdef GMX_SIMD_HAVE_REAL
+#if GMX_SIMD_HAVE_REAL
/*! \brief Store differences between indexed rvecs in SIMD registers.
*
* Returns SIMD register with the difference vectors:
gmx_simd_real_t *dy,
gmx_simd_real_t *dz)
{
-#if defined(GMX_SIMD_X86_AVX_256) || defined(GMX_SIMD_X86_AVX2_256)
+#if GMX_SIMD_X86_AVX_256 || GMX_SIMD_X86_AVX2_256
int i;
gmx_simd4_real_t d[GMX_SIMD_REAL_WIDTH];
gmx_simd_real_t tmp;
real gmx_unused *buf,
rvec *v)
{
-#if defined(GMX_SIMD_X86_AVX_256) || defined(GMX_SIMD_X86_AVX2_256)
+#if GMX_SIMD_X86_AVX_256 || GMX_SIMD_X86_AVX2_256
int i;
gmx_simd4_real_t s4[GMX_SIMD_REAL_WIDTH];
gmx_simd_real_t zero = gmx_simd_setzero_r();
#else
static const int simd_width = 1;
#endif
-/* We can't use small memory alignments on many systems, so use min 64 bytes*/
-static const int align_bytes = std::max<int>(64, simd_width*sizeof(real));
+/* Align to 128 bytes, consistent with the current implementation of
+ AlignedAllocator, which currently forces 128 byte alignment. */
+static const int align_bytes = 128;
real *lincs_rmsd_data(struct gmx_lincsdata *lincsd)
{
#pragma omp parallel for reduction(+: ntriangle, ncc_triangle) num_threads(li->ntask) schedule(static)
for (th = 0; th < li->ntask; th++)
{
- set_lincs_matrix_task(li, &li->task[th], invmass, &ncc_triangle);
- ntriangle = li->task[th].ntriangle;
+ try
+ {
+ set_lincs_matrix_task(li, &li->task[th], invmass, &ncc_triangle);
+ ntriangle = li->task[th].ntriangle;
+ }
+ GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
}
li->ntriangle = ntriangle;
li->ncc_triangle = ncc_triangle;
li->matlam = lambda;
}
-static int count_triangle_constraints(t_ilist *ilist, t_blocka *at2con)
+static int count_triangle_constraints(const t_ilist *ilist,
+ const t_blocka *at2con)
{
int ncon1, ncon_tot;
int c0, a00, a01, n1, c1, a10, a11, ac1, n2, c2, a20, a21;
return (*(int *)a) - (*(int *)b);
}
-gmx_lincsdata_t init_lincs(FILE *fplog, gmx_mtop_t *mtop,
- int nflexcon_global, t_blocka *at2con,
+gmx_lincsdata_t init_lincs(FILE *fplog, const gmx_mtop_t *mtop,
+ int nflexcon_global, const t_blocka *at2con,
gmx_bool bPLINCS, int nIter, int nProjOrder)
{
struct gmx_lincsdata *li;
#pragma omp parallel for num_threads(li->ntask)
for (th = 0; th < li->ntask; th++)
{
- /* Per thread SIMD load buffer for loading 2 simd_width rvecs */
- snew_aligned(li->task[th].simd_buf, 2*simd_width*DIM,
- align_bytes);
+ try
+ {
+ /* Per thread SIMD load buffer for loading 2 simd_width rvecs */
+ snew_aligned(li->task[th].simd_buf, 2*simd_width*DIM,
+ align_bytes);
+ }
+ GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
}
if (bPLINCS || li->ncg_triangle > 0)
#pragma omp parallel for num_threads(li->ntask) schedule(static)
for (th = 0; th < li->ntask; th++)
{
- lincs_task_t *li_task;
- gmx_bitmask_t mask;
- int b;
-
- li_task = &li->task[th];
-
- if (li_task->b1 - li_task->b0 > li_task->ind_nalloc)
+ try
{
- li_task->ind_nalloc = over_alloc_large(li_task->b1-li_task->b0);
- srenew(li_task->ind, li_task->ind_nalloc);
- srenew(li_task->ind_r, li_task->ind_nalloc);
- }
+ lincs_task_t *li_task;
+ gmx_bitmask_t mask;
+ int b;
- bitmask_init_low_bits(&mask, th);
+ li_task = &li->task[th];
- li_task->nind = 0;
- li_task->nind_r = 0;
- for (b = li_task->b0; b < li_task->b1; b++)
- {
- /* We let the constraint with the lowest thread index
- * operate on atoms with constraints from multiple threads.
- */
- if (bitmask_is_disjoint(atf[li->bla[b*2]], mask) &&
- bitmask_is_disjoint(atf[li->bla[b*2+1]], mask))
+ if (li_task->b1 - li_task->b0 > li_task->ind_nalloc)
{
- /* Add the constraint to the local atom update index */
- li_task->ind[li_task->nind++] = b;
+ li_task->ind_nalloc = over_alloc_large(li_task->b1-li_task->b0);
+ srenew(li_task->ind, li_task->ind_nalloc);
+ srenew(li_task->ind_r, li_task->ind_nalloc);
}
- else
+
+ bitmask_init_low_bits(&mask, th);
+
+ li_task->nind = 0;
+ li_task->nind_r = 0;
+ for (b = li_task->b0; b < li_task->b1; b++)
{
- /* Add the constraint to the rest block */
- li_task->ind_r[li_task->nind_r++] = b;
+ /* We let the constraint with the lowest thread index
+ * operate on atoms with constraints from multiple threads.
+ */
+ if (bitmask_is_disjoint(atf[li->bla[b*2]], mask) &&
+ bitmask_is_disjoint(atf[li->bla[b*2+1]], mask))
+ {
+ /* Add the constraint to the local atom update index */
+ li_task->ind[li_task->nind++] = b;
+ }
+ else
+ {
+ /* Add the constraint to the rest block */
+ li_task->ind_r[li_task->nind_r++] = b;
+ }
}
}
+ GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
}
/* We need to copy all constraints which have not be assigned
}
}
-void set_lincs(t_idef *idef, t_mdatoms *md,
- gmx_bool bDynamics, t_commrec *cr,
+void set_lincs(const t_idef *idef,
+ const t_mdatoms *md,
+ gmx_bool bDynamics,
+ t_commrec *cr,
struct gmx_lincsdata *li)
{
int natoms, nflexcon;
#pragma omp parallel for num_threads(li->ntask) schedule(static)
for (th = 0; th < li->ntask; th++)
{
- lincs_task_t *li_task;
+ try
+ {
+ lincs_task_t *li_task;
- li_task = &li->task[th];
+ li_task = &li->task[th];
- if (li->ncg_triangle > 0 &&
- li_task->b1 - li_task->b0 > li_task->tri_alloc)
- {
- /* This is allocating too much, but it is difficult to improve */
- li_task->tri_alloc = over_alloc_dd(li_task->b1 - li_task->b0);
- srenew(li_task->triangle, li_task->tri_alloc);
- srenew(li_task->tri_bits, li_task->tri_alloc);
- }
+ if (li->ncg_triangle > 0 &&
+ li_task->b1 - li_task->b0 > li_task->tri_alloc)
+ {
+ /* This is allocating too much, but it is difficult to improve */
+ li_task->tri_alloc = over_alloc_dd(li_task->b1 - li_task->b0);
+ srenew(li_task->triangle, li_task->tri_alloc);
+ srenew(li_task->tri_bits, li_task->tri_alloc);
+ }
- set_matrix_indices(li, li_task, &at2con, bSortMatrix);
+ set_matrix_indices(li, li_task, &at2con, bSortMatrix);
+ }
+ GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
}
done_blocka(&at2con);
{
fprintf(fplog, "%s", buf);
}
- if (!gmx_isfinite(d1))
+ if (!std::isfinite(d1))
{
gmx_fatal(FARGS, "Bond length not finite.");
}
/* The OpenMP parallel region of constrain_lincs for coords */
#pragma omp parallel num_threads(lincsd->ntask)
{
- int th = gmx_omp_get_thread_num();
+ try
+ {
+ int th = gmx_omp_get_thread_num();
- clear_mat(lincsd->task[th].vir_r_m_dr);
+ clear_mat(lincsd->task[th].vir_r_m_dr);
- do_lincs(x, xprime, box, pbc, lincsd, th,
- md->invmass, cr,
- bCalcDHDL,
- ir->LincsWarnAngle, &bWarn,
- invdt, v, bCalcVir,
- th == 0 ? vir_r_m_dr : lincsd->task[th].vir_r_m_dr);
+ do_lincs(x, xprime, box, pbc, lincsd, th,
+ md->invmass, cr,
+ bCalcDHDL,
+ ir->LincsWarnAngle, &bWarn,
+ invdt, v, bCalcVir,
+ th == 0 ? vir_r_m_dr : lincsd->task[th].vir_r_m_dr);
+ }
+ GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
}
if (bLog && fplog && lincsd->nc > 0)
/* The OpenMP parallel region of constrain_lincs for derivatives */
#pragma omp parallel num_threads(lincsd->ntask)
{
- int th = gmx_omp_get_thread_num();
+ try
+ {
+ int th = gmx_omp_get_thread_num();
- do_lincsp(x, xprime, min_proj, pbc, lincsd, th,
- md->invmass, econq, bCalcDHDL,
- bCalcVir, th == 0 ? vir_r_m_dr : lincsd->task[th].vir_r_m_dr);
+ do_lincsp(x, xprime, min_proj, pbc, lincsd, th,
+ md->invmass, econq, bCalcDHDL,
+ bCalcVir, th == 0 ? vir_r_m_dr : lincsd->task[th].vir_r_m_dr);
+ }
+ GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
}
}
{
dhdlambda += lincsd->task[th].dhdlambda;
}
- if (econqCoord)
+ if (econq == econqCoord)
{
/* dhdlambda contains dH/dlambda*dt^2, correct for this */
/* TODO This should probably use invdt, so that sd integrator scaling works properly */
--- /dev/null
- #if defined HAVE_CLOCK_GETTIME && _POSIX_TIMERS >= 0
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2013, The GROMACS development team.
+ * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#include "gmxpre.h"
+
+#include "walltime_accounting.h"
+
+#include "config.h"
+
+#include <ctime>
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#ifdef HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif
+
+#include "gromacs/utility/basedefinitions.h"
+#include "gromacs/utility/smalloc.h"
+
+/* TODO in future: convert gmx_walltime_accounting to a class,
+ * resolve who should have responsibility for recording the number of
+ * steps done, consider whether parts of finish_time, print_perf,
+ * wallcycle_print belong in this module.
+ *
+ * If/when any kind of task parallelism is implemented (even OpenMP
+ * regions simultaneously assigned to different tasks), consider
+ * whether this data structure (and/or cycle counters) should be
+ * maintained on a per-OpenMP-thread basis. */
+
+/*! \brief Manages caching wall-clock time measurements for
+ * simulations */
+typedef struct gmx_walltime_accounting {
+ //! Seconds since the epoch recorded at the start of the simulation
+ double start_time_stamp;
+ //! Seconds since the epoch recorded at the start of the simulation for this thread
+ double start_time_stamp_per_thread;
+ //! Total seconds elapsed over the simulation
+ double elapsed_time;
+ //! Total seconds elapsed over the simulation running this thread
+ double elapsed_time_over_all_threads;
+ /*! \brief Number of OpenMP threads that will be launched by this
+ * MPI rank.
+ *
+ * This is used to scale elapsed_time_over_all_threads so
+ * that any combination of real MPI, thread MPI and OpenMP (even
+ * mdrun -ntomp_pme) processes/threads would (when run at maximum
+ * efficiency) return values such that the sum of
+ * elapsed_time_over_all_threads over all threads was constant
+ * with respect to parallelism implementation. */
+ int numOpenMPThreads;
+ //! Set by integrators to report the amount of work they did
+ gmx_int64_t nsteps_done;
+} t_gmx_walltime_accounting;
+
+/*! \brief Calls system timing routines (e.g. clock_gettime) to get
+ * the (fractional) number of seconds elapsed since the epoch when
+ * this thread was executing.
+ *
+ * This can be used to measure system load. This can be unreliable if
+ * threads migrate between sockets. If thread-specific timers are not
+ * supported by the OS (e.g. if the OS is not POSIX-compliant), this
+ * function is implemented by gmx_gettime. */
+static double gmx_gettime_per_thread();
+
+// TODO In principle, all this should get protected by checks that
+// walltime_accounting is not null. In practice, that NULL condition
+// does not happen, and future refactoring will likely enforce it by
+// having the gmx_walltime_accounting_t object be owned by the runner
+// object. When these become member functions, existence will be
+// guaranteed.
+
+gmx_walltime_accounting_t
+walltime_accounting_init(int numOpenMPThreads)
+{
+ gmx_walltime_accounting_t walltime_accounting;
+
+ snew(walltime_accounting, 1);
+ walltime_accounting->start_time_stamp = 0;
+ walltime_accounting->start_time_stamp_per_thread = 0;
+ walltime_accounting->elapsed_time = 0;
+ walltime_accounting->nsteps_done = 0;
+ walltime_accounting->numOpenMPThreads = numOpenMPThreads;
+
+ return walltime_accounting;
+}
+
+void
+walltime_accounting_destroy(gmx_walltime_accounting_t walltime_accounting)
+{
+ sfree(walltime_accounting);
+}
+
+void
+walltime_accounting_start(gmx_walltime_accounting_t walltime_accounting)
+{
+ walltime_accounting->start_time_stamp = gmx_gettime();
+ walltime_accounting->start_time_stamp_per_thread = gmx_gettime_per_thread();
+ walltime_accounting->elapsed_time = 0;
+ walltime_accounting->nsteps_done = 0;
+}
+
+void
+walltime_accounting_end(gmx_walltime_accounting_t walltime_accounting)
+{
+ double now, now_per_thread;
+
+ now = gmx_gettime();
+ now_per_thread = gmx_gettime_per_thread();
+
+ walltime_accounting->elapsed_time = now - walltime_accounting->start_time_stamp;
+ walltime_accounting->elapsed_time_over_all_threads = now_per_thread - walltime_accounting->start_time_stamp_per_thread;
+ /* For thread-MPI, the per-thread CPU timer makes this just
+ * work. For OpenMP threads, the per-thread CPU timer measurement
+ * needs to be multiplied by the number of OpenMP threads used,
+ * under the current assumption that all regions ever opened
+ * within a process are of the same size, and each thread should
+ * keep one core busy.
+ */
+ walltime_accounting->elapsed_time_over_all_threads *= walltime_accounting->numOpenMPThreads;
+}
+
+double
+walltime_accounting_get_current_elapsed_time(gmx_walltime_accounting_t walltime_accounting)
+{
+ return gmx_gettime() - walltime_accounting->start_time_stamp;
+}
+
+double
+walltime_accounting_get_elapsed_time(gmx_walltime_accounting_t walltime_accounting)
+{
+ return walltime_accounting->elapsed_time;
+}
+
+double
+walltime_accounting_get_elapsed_time_over_all_threads(gmx_walltime_accounting_t walltime_accounting)
+{
+ return walltime_accounting->elapsed_time_over_all_threads;
+}
+
+double
+walltime_accounting_get_start_time_stamp(gmx_walltime_accounting_t walltime_accounting)
+{
+ return walltime_accounting->start_time_stamp;
+}
+
+gmx_int64_t
+walltime_accounting_get_nsteps_done(gmx_walltime_accounting_t walltime_accounting)
+{
+ return walltime_accounting->nsteps_done;
+}
+
+void
+walltime_accounting_set_nsteps_done(gmx_walltime_accounting_t walltime_accounting,
+ gmx_int64_t nsteps_done)
+{
+ walltime_accounting->nsteps_done = nsteps_done;
+}
+
+double
+gmx_gettime()
+{
- * implementation. */
++#if defined HAVE_CLOCK_GETTIME && _POSIX_TIMERS >= 0 && !(defined __bgq__ && defined __clang__)
+ /* Mac and Windows do not support this. For added fun, Windows
+ * defines _POSIX_TIMERS without actually providing the
++ * implementation. The BlueGene/Q CNK only supports gettimeofday,
++ * and bgclang doesn't provide a fully functional implementation
++ * for clock_gettime (unlike xlc). */
+ struct timespec t;
+ double seconds;
+
+ clock_gettime(CLOCK_REALTIME, &t);
+ seconds = static_cast<double>(t.tv_sec) + 1e-9*t.tv_nsec;
+
+ return seconds;
+#elif defined HAVE_GETTIMEOFDAY
+ // Note that gettimeofday() is deprecated by POSIX, but since Mac
+ // and Windows do not yet support POSIX, we are still stuck.
++ // Also, this is the only supported API call on Bluegene/Q.
+ struct timeval t;
+ double seconds;
+
+ gettimeofday(&t, NULL);
+ seconds = static_cast<double>(t.tv_sec) + 1e-6*t.tv_usec;
+
+ return seconds;
+#else
+ double seconds;
+
+ seconds = time(NULL);
+
+ return seconds;
+#endif
+}
+
+static double
+gmx_gettime_per_thread()
+{
+#if defined HAVE_CLOCK_GETTIME && _POSIX_THREAD_CPUTIME >= 0
+ struct timespec t;
+ double seconds;
+
+ clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t);
+ seconds = static_cast<double>(t.tv_sec) + 1e-9*t.tv_nsec;
+
+ return seconds;
+#else
+ return gmx_gettime();
+#endif
+}