# GROMACS 2018 3
# GROMACS 2019 4
# GROMACS 2020 5
+# GROMACS 2021 6
# LIBRARY_SOVERSION_MINOR   the shared-object (so) minor version for the built libraries.
# Should be increased for each release that changes only the implementation.
# In GROMACS, the typical policy is to increase it for each patch version
# The GROMACS convention is that these are the version number of the next
# release that is going to be made from this branch.
-set(GMX_VERSION_MAJOR 2020)
+set(GMX_VERSION_MAJOR 2021)
set(GMX_VERSION_PATCH 0)
# The suffix, on the other hand, is used mainly for betas and release
# candidates, where it signifies the most recent such release from
# this branch; it will be empty before the first such release, as well
# as after the final release is out.
-set(GMX_VERSION_SUFFIX "-beta3")
+set(GMX_VERSION_SUFFIX "")
# Conventionally with libtool, any ABI change must change the major
# version number, the minor version number should change if it's just
# the implementation that has changed. The important thing is to
# minimize the chance of third-party code being able to dynamically
# link with a version of libgromacs that might not work.
-set(LIBRARY_SOVERSION_MAJOR 5)
+set(LIBRARY_SOVERSION_MAJOR 6)
set(LIBRARY_SOVERSION_MINOR 0)
set(LIBRARY_VERSION ${LIBRARY_SOVERSION_MAJOR}.${LIBRARY_SOVERSION_MINOR}.0)
endif()
set(REGRESSIONTEST_VERSION "${GMX_VERSION_STRING}")
-set(REGRESSIONTEST_BRANCH "refs/heads/release-2020")
+set(REGRESSIONTEST_BRANCH "refs/heads/master")
# Run the regressiontests packaging job with the correct package
# version string, and the release box checked, in order to have it
# build the regressiontests tarball with all the right naming. The
# naming affects the md5sum that has to go here, and if it isn't right
# the release workflow will report a failure.
- set(REGRESSIONTEST_MD5SUM "162edf7d9d520672f5fa5888aae60989" CACHE INTERNAL "MD5 sum of the regressiontests tarball for this GROMACS version")
+ set(REGRESSIONTEST_MD5SUM "639ae20f00e0311dcec8e5af7876abdb" CACHE INTERNAL "MD5 sum of the regressiontests tarball for this GROMACS version")
math(EXPR GMX_VERSION_NUMERIC
"${GMX_VERSION_MAJOR}*10000 + ${GMX_VERSION_PATCH}")
* using CUDA, including class initialization, data-structures management
 * and the GPU kernel.
*
- * \note Management of periodic boundary should be unified with SETTLE and
- * removed from here.
 * \todo Reconsider naming, i.e. "cuda" suffix should be changed to "gpu".
*
* \author Artem Zhmurov <zhmurov@gmail.com>
#include "gromacs/pbcutil/pbc.h"
#include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
#include "gromacs/topology/ifunc.h"
+ #include "gromacs/topology/topology.h"
namespace gmx
{
float3* d_v,
const real invdt,
const bool computeVirial,
- tensor virialScaled)
+ tensor virialScaled,
+ const PbcAiuc pbcAiuc)
{
ensureNoPendingCudaError("In CUDA version of LINCS");
}
config.stream = commandStream_;
+ kernelParams_.pbcAiuc = pbcAiuc;
+
const auto kernelArgs =
prepareGpuKernelArguments(kernelPtr, config, &kernelParams_, &d_x, &d_xp, &d_v, &invdt);
}
}
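
Because kernelParams_.pbcAiuc is now refreshed from the pbcAiuc argument on every apply() call, the kernel always launches with PBC data that arrives together with the coordinates, which removes the ordering hazard of having to call a separate setPbc() before apply().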
+ /*! \brief Constructs and returns an atom constraint adjacency list
+ *
+ * Each constraint will be represented as a tuple, containing the index of the second
+ * constrained atom, the index of the constraint, and a sign that indicates the order
+ * in which the atoms are listed. The sign is needed to compute the mass factors.
+ */
+ static std::vector<std::vector<std::tuple<int, int, int>>>
+ constructAtomsAdjacencyList(const int numAtoms, ArrayRef<const int> iatoms)
+ {
+ const int stride = 1 + NRAL(F_CONSTR);
+ const int numConstraints = iatoms.ssize() / stride;
+ std::vector<std::vector<std::tuple<int, int, int>>> atomsAdjacencyList(numAtoms);
+ for (int c = 0; c < numConstraints; c++)
+ {
+ int a1 = iatoms[stride * c + 1];
+ int a2 = iatoms[stride * c + 2];
+
+        // Each constraint is represented as a tuple: the index of the second constrained
+        // atom, the index of the constraint, and a sign indicating the order in which the
+        // atoms are listed. The sign is needed to compute the mass factors.
+ atomsAdjacencyList[a1].push_back(std::make_tuple(a2, c, +1));
+ atomsAdjacencyList[a2].push_back(std::make_tuple(a1, c, -1));
+ }
+
+ return atomsAdjacencyList;
+ }
+
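
To make the produced layout concrete, here is a minimal standalone sketch of the same adjacency-list construction; this is an illustration only, in which the literal stride of 3 stands in for 1 + NRAL(F_CONSTR), plain std::vector stands in for ArrayRef, and the toy iatoms contents are invented:

#include <cassert>
#include <tuple>
#include <vector>

int main()
{
    // Two constraints in t_ilist layout: { parameter index, atom a1, atom a2 }
    const int              stride = 3; // 1 + NRAL(F_CONSTR) for two-atom constraints
    const std::vector<int> iatoms = { 0, 0, 1,   // constraint 0 couples atoms 0 and 1
                                      0, 1, 2 }; // constraint 1 couples atoms 1 and 2
    const int numAtoms       = 3;
    const int numConstraints = static_cast<int>(iatoms.size()) / stride;

    std::vector<std::vector<std::tuple<int, int, int>>> atomsAdjacencyList(numAtoms);
    for (int c = 0; c < numConstraints; c++)
    {
        const int a1 = iatoms[stride * c + 1];
        const int a2 = iatoms[stride * c + 2];
        atomsAdjacencyList[a1].push_back(std::make_tuple(a2, c, +1));
        atomsAdjacencyList[a2].push_back(std::make_tuple(a1, c, -1));
    }

    // Atom 1 takes part in both constraints: as the second atom of constraint 0
    // (sign -1) and as the first atom of constraint 1 (sign +1).
    assert(atomsAdjacencyList[1].size() == 2);
    assert(std::get<2>(atomsAdjacencyList[1][0]) == -1);
    assert(std::get<2>(atomsAdjacencyList[1][1]) == +1);
    return 0;
}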
/*! \brief Helper function to go through constraints recursively.
*
- * For each constraint, counts the number of coupled constraints and stores the value in numCoupledConstraints array.
+ * For each constraint, counts the number of coupled constraints and stores the value in the \p numCoupledConstraints array.
* This information is used to split the array of constraints between thread blocks on a GPU so there is no
- * coupling between constraints from different thread blocks. After the 'numCoupledConstraints' array is filled, the
- * value numCoupledConstraints[c] should be equal to the number of constraints that are coupled to 'c' and located
+ * coupling between constraints from different thread blocks. After the \p numCoupledConstraints array is filled, the
+ * value \p numCoupledConstraints[c] should be equal to the number of constraints that are coupled to \p c and located
* after it in the constraints array.
*
* \param[in] a Atom index.
* \param[in] c Sequential index for constraint to consider adding.
* \param[in,out] currentMapIndex The rolling index for the constraints mapping.
*/
- inline void addWithCoupled(const t_iatom* iatoms,
+ inline void addWithCoupled(ArrayRef<const int> iatoms,
const int stride,
ArrayRef<const std::vector<std::tuple<int, int, int>>> atomsAdjacencyList,
ArrayRef<int> splitMap,
}
}
+ /*! \brief Computes and returns how many constraints are coupled to each constraint
+ *
+ * Needed to introduce splits in data so that all coupled constraints will be computed in a
+ * single GPU block. The position \p c of the vector \p numCoupledConstraints should have the number
+ * of constraints that are coupled to a constraint \p c and are after \p c in the vector. Only
+ * the first index of each connected group of constraints is needed later in the code, hence the
+ * \p numCoupledConstraints vector is also used to keep track of whether a constraint was already counted.
+ */
+ static std::vector<int> countNumCoupledConstraints(ArrayRef<const int> iatoms,
+ ArrayRef<const std::vector<std::tuple<int, int, int>>> atomsAdjacencyList)
+ {
+ const int stride = 1 + NRAL(F_CONSTR);
+ const int numConstraints = iatoms.ssize() / stride;
+ std::vector<int> numCoupledConstraints(numConstraints, -1);
+ for (int c = 0; c < numConstraints; c++)
+ {
+ const int a1 = iatoms[stride * c + 1];
+ const int a2 = iatoms[stride * c + 2];
+ if (numCoupledConstraints[c] == -1)
+ {
+ numCoupledConstraints[c] = countCoupled(a1, numCoupledConstraints, atomsAdjacencyList)
+ + countCoupled(a2, numCoupledConstraints, atomsAdjacencyList);
+ }
+ }
+
+ return numCoupledConstraints;
+ }
+
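
As a worked example of the documented semantics (assuming coupling propagates transitively through shared atoms, which is what the recursive traversal through the adjacency list suggests): if constraints 0, 1, and 2 form a single connected chain, then numCoupledConstraints[0] becomes 2, since both remaining constraints of the group follow it in the array, while entries 1 and 2 serve only to mark those constraints as already counted.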
+ bool LincsCuda::isNumCoupledConstraintsSupported(const gmx_mtop_t& mtop)
+ {
+ for (const gmx_moltype_t& molType : mtop.moltype)
+ {
+ ArrayRef<const int> iatoms = molType.ilist[F_CONSTR].iatoms;
+ const auto atomsAdjacencyList = constructAtomsAdjacencyList(molType.atoms.nr, iatoms);
+        // Compute how many constraints are coupled to each constraint
+ const auto numCoupledConstraints = countNumCoupledConstraints(iatoms, atomsAdjacencyList);
+ for (const int numCoupled : numCoupledConstraints)
+ {
+ if (numCoupled > c_threadsPerBlock)
+ {
+ return false;
+ }
+ }
+ }
+
+ return true;
+ }
+
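
A sketch of how a caller might consult this check; the surrounding variable names here are assumptions, and the public UpdateConstrainCuda wrapper shown further below simply forwards to this method:

// Hypothetical decision-time use: fall back to the CPU update path when any
// coupled-constraints group is too large to be handled within one CUDA block.
if (useGpuForUpdate && !gmx::LincsCuda::isNumCoupledConstraintsSupported(mtop))
{
    useGpuForUpdate = false; // such groups cannot be split between thread blocks
}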
void LincsCuda::set(const t_idef& idef, const t_mdatoms& md)
{
int numAtoms = md.nr;
std::vector<float> massFactorsHost;
// List of constrained atoms in local topology
- t_iatom* iatoms = idef.il[F_CONSTR].iatoms;
+ gmx::ArrayRef<const int> iatoms =
+ constArrayRefFromArray(idef.il[F_CONSTR].iatoms, idef.il[F_CONSTR].nr);
const int stride = NRAL(F_CONSTR) + 1;
const int numConstraints = idef.il[F_CONSTR].nr / stride;
return;
}
- // Constructing adjacency list --- usefull intermediate structure
- std::vector<std::vector<std::tuple<int, int, int>>> atomsAdjacencyList(numAtoms);
- for (int c = 0; c < numConstraints; c++)
- {
- int a1 = iatoms[stride * c + 1];
- int a2 = iatoms[stride * c + 2];
-
- // Each constraint will be represented as a tuple, containing index of the second
- // constrained atom, index of the constraint and a sign that indicates the order of atoms in
- // which they are listed. Sign is needed to compute the mass factors.
- atomsAdjacencyList.at(a1).push_back(std::make_tuple(a2, c, +1));
- atomsAdjacencyList.at(a2).push_back(std::make_tuple(a1, c, -1));
- }
+ // Construct the adjacency list, a useful intermediate structure
+ const auto atomsAdjacencyList = constructAtomsAdjacencyList(numAtoms, iatoms);
- // Compute, how many constraints are coupled to each constraint.
- // Needed to introduce splits in data so that all coupled constraints will be computed in a
- // single GPU block. The position 'c' of the vector numCoupledConstraints should have the number
- // of constraints that are coupled to a constraint 'c' and are after 'c' in the vector. Only
- // first index of the connected group of the constraints is needed later in the code, hence the
- // numCoupledConstraints vector is also used to keep track if the constrain was already counted.
- std::vector<int> numCoupledConstraints(numConstraints, -1);
- for (int c = 0; c < numConstraints; c++)
- {
- int a1 = iatoms[stride * c + 1];
- int a2 = iatoms[stride * c + 2];
- if (numCoupledConstraints.at(c) == -1)
- {
- numCoupledConstraints.at(c) = countCoupled(a1, numCoupledConstraints, atomsAdjacencyList)
- + countCoupled(a2, numCoupledConstraints, atomsAdjacencyList);
- }
- }
+    // Compute how many constraints are coupled to each constraint
+ const auto numCoupledConstraints = countNumCoupledConstraints(iatoms, atomsAdjacencyList);
    // Map of splits in the constraints data. For each 'old' constraint index, gives the 'new' one,
    // which takes into account the empty spaces that might be needed at the end of each thread block.
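
For example, if a group of coupled constraints would otherwise straddle a thread-block boundary, the whole group is presumably shifted to the start of the next block, and the map records the resulting 'new' indices, including the padding left behind in the previous block.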
GpuApiCallBehavior::Sync, nullptr);
}
-void LincsCuda::setPbc(const t_pbc* pbc)
-{
- setPbcAiuc(pbc->ndim_ePBC, pbc->box, &kernelParams_.pbcAiuc);
-}
-
} // namespace gmx
* Applies LINCS to coordinates and velocities, stored on GPU.
* The results are not automatically copied back to the CPU memory.
 * The method uses data structures of this class, which should be updated
- * when needed using set() and setPbc() method.
+ * when needed using the set() method.
*
* \param[in] d_x Coordinates before timestep (in GPU memory)
* \param[in,out] d_xp Coordinates after timestep (in GPU memory). The
* multipliers when velocities are updated)
* \param[in] computeVirial If virial should be updated.
* \param[in,out] virialScaled Scaled virial tensor to be updated.
+ * \param[in] pbcAiuc PBC data.
*/
void apply(const float3* d_x,
float3* d_xp,
float3* d_v,
const real invdt,
const bool computeVirial,
- tensor virialScaled);
+ tensor virialScaled,
+ const PbcAiuc pbcAiuc);
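
With setPbc() removed from this class, the caller builds the PbcAiuc itself and passes it on every invocation. A minimal sketch of the new calling convention, mirroring the call sites elsewhere in this change (variable names assumed):

PbcAiuc pbcAiuc;
setPbcAiuc(pbc->ndim_ePBC, pbc->box, &pbcAiuc); // as in UpdateConstrainCuda::Impl::setPbc()
lincsCuda->apply(d_x, d_xp, updateVelocities, d_v, 1.0 / dt, computeVirial, virial, pbcAiuc);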
/*! \brief
* Update data-structures (e.g. after NB search step).
*/
void set(const t_idef& idef, const t_mdatoms& md);
- * Update PBC data.
- *
- * Converts pbc data from t_pbc into the PbcAiuc format and stores the latter.
- *
- * \todo Remove this method. LINCS should not manage PBC.
- *
- * \param[in] pbc The PBC data in t_pbc format.
- */
- void setPbc(const t_pbc* pbc);
-
- /*! \brief
+ /*! \brief
+ * Returns whether the maximum number of coupled constraints is supported
+ * by the CUDA LINCS code.
+ *
+ * \param[in] mtop The molecular topology
+ */
+ static bool isNumCoupledConstraintsSupported(const gmx_mtop_t& mtop);
+
private:
//! CUDA stream
CommandStream commandStream_;
// Constraints need both coordinates before (d_x_) and after (d_xp_) update. However, after constraints
// are applied, the d_x_ can be discarded. So we intentionally swap the d_x_ and d_xp_ here to avoid the
// d_xp_ -> d_x_ copy after constraints. Note that the integrator saves them in the swapped order as well.
- lincsCuda_->apply(d_xp_, d_x_, updateVelocities, d_v_, 1.0 / dt, computeVirial, virial);
- settleCuda_->apply(d_xp_, d_x_, updateVelocities, d_v_, 1.0 / dt, computeVirial, virial);
+ lincsCuda_->apply(d_xp_, d_x_, updateVelocities, d_v_, 1.0 / dt, computeVirial, virial, pbcAiuc_);
+ settleCuda_->apply(d_xp_, d_x_, updateVelocities, d_v_, 1.0 / dt, computeVirial, virial, pbcAiuc_);
// scaledVirial -> virial (the methods above return scaled values)
float scaleFactor = 0.5f / (dt * dt);
void UpdateConstrainCuda::Impl::setPbc(const t_pbc* pbc)
{
setPbcAiuc(pbc->ndim_ePBC, pbc->box, &pbcAiuc_);
- integrator_->setPbc(pbc);
- lincsCuda_->setPbc(pbc);
- settleCuda_->setPbc(pbc);
}
GpuEventSynchronizer* UpdateConstrainCuda::Impl::getCoordinatesReadySync()
return impl_->getCoordinatesReadySync();
}
+ bool UpdateConstrainCuda::isNumCoupledConstraintsSupported(const gmx_mtop_t& mtop)
+ {
+ return LincsCuda::isNumCoupledConstraintsSupported(mtop);
+ }
+
} // namespace gmx
//! True if the Buffer ops development feature is enabled
// TODO: when the trigger of the buffer ops offload is fully automated this should go away
bool enableGpuBufferOps = false;
- //! If true, forces 'mdrun -update auto' default to 'gpu'
- bool forceGpuUpdateDefaultOn = false;
//! True if the GPU halo exchange development feature is enabled
bool enableGpuHaloExchange = false;
//! True if the PME PP direct communication GPU development feature is enabled
#pragma GCC diagnostic ignored "-Wunused-result"
devFlags.enableGpuBufferOps = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr)
&& (GMX_GPU == GMX_GPU_CUDA) && useGpuForNonbonded;
- devFlags.forceGpuUpdateDefaultOn = (getenv("GMX_FORCE_UPDATE_DEFAULT_GPU") != nullptr);
devFlags.enableGpuHaloExchange =
(getenv("GMX_GPU_DD_COMMS") != nullptr && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA));
devFlags.enableGpuPmePPComm =
}
}
- if (devFlags.forceGpuUpdateDefaultOn)
- {
- GMX_LOG(mdlog.warning)
- .asParagraph()
- .appendTextFormatted(
- "NOTE: This run will default to '-update gpu' as requested by the "
- "GMX_FORCE_UPDATE_DEFAULT_GPU environment variable.");
- }
-
if (devFlags.enableGpuPmePPComm)
{
if (pmeRunMode == PmeRunMode::GPU)
try
{
useGpuForUpdate = decideWhetherToUseGpuForUpdate(
- devFlags.forceGpuUpdateDefaultOn, useDomainDecomposition, useGpuForPme,
- useGpuForNonbonded, updateTarget, gpusWereDetected, *inputrec,
- gmx_mtop_interaction_count(mtop, IF_VSITE) > 0, doEssentialDynamics,
+ useDomainDecomposition, useGpuForPme, useGpuForNonbonded, updateTarget,
+ gpusWereDetected, *inputrec, mtop, doEssentialDynamics,
gmx_mtop_ftype_count(mtop, F_ORIRES) > 0, replExParams.exchangeInterval > 0);
}
GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
/* This call is not included in init_domain_decomposition mainly
* because fr->cginfo_mb is set later.
*/
- dd_init_bondeds(fplog, cr->dd, &mtop, vsite.get(), inputrec,
+ dd_init_bondeds(fplog, cr->dd, mtop, vsite.get(), inputrec,
domdecOptions.checkBondedInteractions, fr->cginfo_mb);
}
}
// FIXME: this is only here to manually unpin mdAtoms->chargeA_ and state->x,
- // before we destroy the GPU context(s) in free_gpu_resources().
+ // before we destroy the GPU context(s) in free_gpu().
// Pinned buffers are associated with contexts in CUDA.
// As soon as we destroy GPU contexts after mdrunner() exits, these lines should go.
mdAtoms.reset(nullptr);
globalState.reset(nullptr);
mdModules_.reset(nullptr); // destruct force providers here as they might also use the GPU
+ /* Free pinned buffers in *fr */
+ delete fr;
+ fr = nullptr;
+
+ if (hwinfo->gpu_info.n_dev > 0)
+ {
+ /* stop the GPU profiler (only CUDA) */
+ stopGpuProfiler();
+ }
+
+    /* With tMPI we need to wait for all ranks to finish deallocation before
+     * destroying the CUDA context in free_gpu(), as some tMPI ranks may be sharing
+     * a GPU and its context.
+ *
+ * This is not a concern in OpenCL where we use one context per rank which
+ * is freed in nbnxn_gpu_free().
+ *
+     * Note: it is safe to not call the barrier on the ranks which do not use a GPU,
+     * but it is easier and more future-proof to call it on the whole node.
+ *
+ * Note that this function needs to be called even if GPUs are not used
+ * in this run because the PME ranks have no knowledge of whether GPUs
+ * are used or not, but all ranks need to enter the barrier below.
+ * \todo Remove this physical node barrier after making sure
+ * that it's not needed anymore (with a shared GPU run).
+ */
+ if (GMX_THREAD_MPI)
+ {
+ physicalNodeComm.barrier();
+ }
- /* Free GPU memory and set a physical node tMPI barrier (which should eventually go away) */
- free_gpu_resources(fr, physicalNodeComm, hwinfo->gpu_info);
free_gpu(nonbondedDeviceInfo);
free_gpu(pmeDeviceInfo);
- done_forcerec(fr, mtop.molblock.size());
sfree(fcd);
if (doMembed)