* \todo Check if the force should be set to zero here.
* \todo This kernel can also accumulate incidental temperatures for each atom.
*
- * \tparam numTempScaleValues The number of different T-couple values.
- * \tparam velocityScaling Type of the Parrinello-Rahman velocity rescaling.
- * \param[in] numAtoms Total number of atoms.
- * \param[in,out] gm_x Coordinates to update upon integration.
- * \param[out] gm_xp A copy of the coordinates before the integration (for constraints).
- * \param[in,out] gm_v Velocities to update.
- * \param[in] gm_f Atomic forces.
- * \param[in] gm_inverseMasses Reciprocal masses.
- * \param[in] dt Timestep.
- * \param[in] gm_lambdas Temperature scaling factors (one per group)
- * \param[in] gm_tempScaleGroups Mapping of atoms into groups.
- * \param[in] dtPressureCouple Time step for pressure coupling
- * \param[in] velocityScalingMatrixDiagonal Diagonal elements of Parrinello-Rahman velocity scaling matrix
+ * \tparam numTempScaleValues The number of different T-couple values.
+ * \tparam velocityScaling Type of the Parrinello-Rahman velocity rescaling.
+ * \param[in] numAtoms Total number of atoms.
+ * \param[in,out] gm_x Coordinates to update upon integration.
+ * \param[out] gm_xp A copy of the coordinates before the integration (for constraints).
+ * \param[in,out] gm_v Velocities to update.
+ * \param[in] gm_f Atomic forces.
+ * \param[in] gm_inverseMasses Reciprocal masses.
+ * \param[in] dt Timestep.
+ * \param[in] gm_lambdas Temperature scaling factors (one per group)
+ * \param[in] gm_tempScaleGroups Mapping of atoms into groups.
+ * \param[in] dtPressureCouple Time step for pressure coupling
+ * \param[in] prVelocityScalingMatrixDiagonal Diagonal elements of Parrinello-Rahman velocity scaling matrix
*/
template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
__launch_bounds__(c_maxThreadsPerBlock) __global__
const float dt,
const float* __restrict__ gm_lambdas,
const unsigned short* __restrict__ gm_tempScaleGroups,
- const float3 velocityScalingMatrixDiagonal);
+ const float3 prVelocityScalingMatrixDiagonal);
template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
__launch_bounds__(c_maxThreadsPerBlock) __global__
const float dt,
const float* __restrict__ gm_lambdas,
const unsigned short* __restrict__ gm_tempScaleGroups,
- const float3 velocityScalingMatrixDiagonal)
+ const float3 prVelocityScalingMatrixDiagonal)
{
int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
if (threadIndex < numAtoms)
if (velocityScaling == VelocityScalingType::Diagonal)
{
- vp.x -= velocityScalingMatrixDiagonal.x * v.x;
- vp.y -= velocityScalingMatrixDiagonal.y * v.y;
- vp.z -= velocityScalingMatrixDiagonal.z * v.z;
+ vp.x -= prVelocityScalingMatrixDiagonal.x * v.x;
+ vp.y -= prVelocityScalingMatrixDiagonal.y * v.y;
+ vp.z -= prVelocityScalingMatrixDiagonal.z * v.z;
}
v = vp;
/*! \brief Select templated kernel.
*
- * Returns pointer to a CUDA kernel based on the number of temperature coupling groups.
- * If zero is passed as an argument, it is assumed that no temperature coupling groups are used.
+ * Returns a pointer to the CUDA kernel based on the number of temperature coupling groups and
+ * on whether temperature and/or pressure coupling is enabled.
*
- * \param[in] numTempScaleValues Numer of temperature coupling groups in the system
- * \param[in] velocityScaling Type of the Parrinello-Rahman velocity scaling
+ * \param[in] doTemperatureScaling If the kernel with temperature coupling velocity scaling
+ * should be selected.
+ * \param[in] numTempScaleValues Number of temperature coupling groups in the system.
+ * \param[in] prVelocityScalingType Type of the Parrinello-Rahman velocity scaling.
*
* \return Pointer to CUDA kernel
*/
- inline auto selectLeapFrogKernelPtr(int numTempScaleValues, VelocityScalingType velocityScaling)
+ inline auto selectLeapFrogKernelPtr(bool doTemperatureScaling,
+ int numTempScaleValues,
+ VelocityScalingType prVelocityScalingType)
{
+ // Check input for consistency: if there is temperature coupling, at least one coupling group should be defined.
+ GMX_ASSERT(!doTemperatureScaling || (numTempScaleValues > 0),
+ "Temperature coupling was requested with no temperature coupling groups.");
auto kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;
- if (velocityScaling == VelocityScalingType::None)
+ if (prVelocityScalingType == VelocityScalingType::None)
{
- if (numTempScaleValues == 0)
+ if (!doTemperatureScaling)
{
kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;
}
{
kernelPtr = leapfrog_kernel<NumTempScaleValues::Multiple, VelocityScalingType::None>;
}
- else
- {
- GMX_RELEASE_ASSERT(false,
- "Number of temperature coupling groups should be greater than zero "
- "(zero for no coupling).");
- }
}
- else if (velocityScaling == VelocityScalingType::Diagonal)
+ else if (prVelocityScalingType == VelocityScalingType::Diagonal)
{
- if (numTempScaleValues == 0)
+ if (!doTemperatureScaling)
{
kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::Diagonal>;
}
{
kernelPtr = leapfrog_kernel<NumTempScaleValues::Multiple, VelocityScalingType::Diagonal>;
}
- else
- {
- GMX_RELEASE_ASSERT(false,
- "Number of temperature coupling groups should be greater than zero "
- "(zero for no coupling).");
- }
}
else
{
float3* d_v,
const float3* d_f,
const real dt,
- const bool doTempCouple,
+ const bool doTemperatureScaling,
gmx::ArrayRef<const t_grp_tcstat> tcstat,
- const bool doPressureCouple,
+ const bool doParrinelloRahman,
const float dtPressureCouple,
- const matrix velocityScalingMatrix)
+ const matrix prVelocityScalingMatrix)
{
ensureNoPendingCudaError("In CUDA version of Leap-Frog integrator");
auto kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;
- if (doTempCouple || doPressureCouple)
+ if (doTemperatureScaling || doParrinelloRahman)
{
- if (doTempCouple)
+ if (doTemperatureScaling)
{
GMX_ASSERT(numTempScaleValues_ == ssize(h_lambdas_),
"Number of temperature scaling factors changed since it was set for the "
copyToDeviceBuffer(&d_lambdas_, h_lambdas_.data(), 0, numTempScaleValues_,
commandStream_, GpuApiCallBehavior::Async, nullptr);
}
- VelocityScalingType velocityScaling = VelocityScalingType::None;
- if (doPressureCouple)
+ VelocityScalingType prVelocityScalingType = VelocityScalingType::None;
+ if (doParrinelloRahman)
{
- velocityScaling = VelocityScalingType::Diagonal;
- GMX_ASSERT(velocityScalingMatrix[YY][XX] == 0 && velocityScalingMatrix[ZZ][XX] == 0
- && velocityScalingMatrix[ZZ][YY] == 0 && velocityScalingMatrix[XX][YY] == 0
- && velocityScalingMatrix[XX][ZZ] == 0 && velocityScalingMatrix[YY][ZZ] == 0,
+ prVelocityScalingType = VelocityScalingType::Diagonal;
+ GMX_ASSERT(prVelocityScalingMatrix[YY][XX] == 0 && prVelocityScalingMatrix[ZZ][XX] == 0
+ && prVelocityScalingMatrix[ZZ][YY] == 0
+ && prVelocityScalingMatrix[XX][YY] == 0
+ && prVelocityScalingMatrix[XX][ZZ] == 0
+ && prVelocityScalingMatrix[YY][ZZ] == 0,
"Fully anisotropic Parrinello-Rahman pressure coupling is not yet supported "
"in GPU version of Leap-Frog integrator.");
- velocityScalingMatrixDiagonal_ =
- make_float3(dtPressureCouple * velocityScalingMatrix[XX][XX],
- dtPressureCouple * velocityScalingMatrix[YY][YY],
- dtPressureCouple * velocityScalingMatrix[ZZ][ZZ]);
+ prVelocityScalingMatrixDiagonal_ =
+ make_float3(dtPressureCouple * prVelocityScalingMatrix[XX][XX],
+ dtPressureCouple * prVelocityScalingMatrix[YY][YY],
+ dtPressureCouple * prVelocityScalingMatrix[ZZ][ZZ]);
}
- kernelPtr = selectLeapFrogKernelPtr(numTempScaleValues_, velocityScaling);
+ kernelPtr = selectLeapFrogKernelPtr(doTemperatureScaling, numTempScaleValues_, prVelocityScalingType);
}
const auto kernelArgs = prepareGpuKernelArguments(
kernelPtr, kernelLaunchConfig_, &numAtoms_, &d_x, &d_xp, &d_v, &d_f, &d_inverseMasses_,
- &dt, &d_lambdas_, &d_tempScaleGroups_, &velocityScalingMatrixDiagonal_);
+ &dt, &d_lambdas_, &d_tempScaleGroups_, &prVelocityScalingMatrixDiagonal_);
launchGpuKernel(kernelPtr, kernelLaunchConfig_, nullptr, "leapfrog_kernel", kernelArgs);
return;
freeDeviceBuffer(&d_inverseMasses_);
}
-void LeapFrogCuda::setPbc(const t_pbc* pbc)
-{
- setPbcAiuc(pbc->ndim_ePBC, pbc->box, &pbcAiuc_);
-}
-
void LeapFrogCuda::set(const t_mdatoms& md, const int numTempScaleValues, const unsigned short* tempScaleGroups)
{
numAtoms_ = md.nr;
LeapFrogCuda(CommandStream commandStream);
~LeapFrogCuda();
- /*! \brief
- * Update PBC data.
- *
- * Converts PBC data from t_pbc into the PbcAiuc format and stores the latter.
- *
- * \param[in] pbc The PBC data in t_pbc format.
- */
- void setPbc(const t_pbc* pbc);
-
/*! \brief Integrate
*
* Integrates the equation of motion using Leap-Frog algorithm.
* Updates coordinates and velocities on the GPU. The current coordinates are saved for constraints.
*
- * \param[in,out] d_x Coordinates to update
- * \param[out] d_xp Place to save the values of initial coordinates coordinates to.
- * \param[in,out] d_v Velocities (will be updated).
- * \param[in] d_f Forces.
- * \param[in] dt Timestep.
- * \param[in] doTempCouple If the temperature coupling should be applied.
- * \param[in] tcstat Temperature coupling data.
- * \param[in] doPressureCouple If the temperature coupling should be applied.
- * \param[in] dtPressureCouple Period between pressure coupling steps
- * \param[in] velocityScalingMatrix Parrinello-Rahman velocity scaling matrix
+ * \param[in,out] d_x Coordinates to update
+ * \param[out]    d_xp    Place to save the values of the initial coordinates to.
+ * \param[in,out] d_v Velocities (will be updated).
+ * \param[in] d_f Forces.
+ * \param[in] dt Timestep.
+ * \param[in] doTemperatureScaling If velocities should be scaled for temperature coupling.
+ * \param[in] tcstat Temperature coupling data.
+ * \param[in]     doParrinelloRahman     If the current step is a Parrinello-Rahman pressure coupling step.
+ * \param[in] dtPressureCouple Period between pressure coupling steps
+ * \param[in] prVelocityScalingMatrix Parrinello-Rahman velocity scaling matrix
*/
void integrate(const float3* d_x,
float3* d_xp,
float3* d_v,
const float3* d_f,
const real dt,
- const bool doTempCouple,
+ const bool doTemperatureScaling,
gmx::ArrayRef<const t_grp_tcstat> tcstat,
- const bool doPressureCouple,
+ const bool doParrinelloRahman,
const float dtPressureCouple,
- const matrix velocityScalingMatrix);
+ const matrix prVelocityScalingMatrix);
/*! \brief Set the integrator
*
CommandStream commandStream_;
//! CUDA kernel launch config
KernelLaunchConfig kernelLaunchConfig_;
- //! Periodic boundary data
- PbcAiuc pbcAiuc_;
//! Number of atoms
int numAtoms_;
//! Maximum size of the temperature coupling groups array
int numTempScaleGroupsAlloc_ = -1;
- //! Vector with diagonal elements of the pressure coupling velocity rescale factors
- float3 velocityScalingMatrixDiagonal_;
+ //! Vector with diagonal elements of the Parrinello-Rahman pressure coupling velocity rescale factors
+ float3 prVelocityScalingMatrixDiagonal_;
};
} // namespace gmx
* using CUDA, including class initialization, data-structures management
* and GPU kernel.
*
- * \note Management of periodic boundary should be unified with SETTLE and
- * removed from here.
* \todo Reconsider naming, i.e. "cuda" suffix should be changed to "gpu".
*
* \author Artem Zhmurov <zhmurov@gmail.com>
float3* d_v,
const real invdt,
const bool computeVirial,
- tensor virialScaled)
+ tensor virialScaled,
+ const PbcAiuc pbcAiuc)
{
ensureNoPendingCudaError("In CUDA version of LINCS");
}
config.stream = commandStream_;
+ kernelParams_.pbcAiuc = pbcAiuc;
+
const auto kernelArgs =
prepareGpuKernelArguments(kernelPtr, config, &kernelParams_, &d_x, &d_xp, &d_v, &invdt);
/*! \brief Helper function to go through constraints recursively.
*
- * For each constraint, counts the number of coupled constraints and stores the value in spaceNeeded array.
+ * For each constraint, counts the number of coupled constraints and stores the value in the numCoupledConstraints array.
* This information is used to split the array of constraints between thread blocks on a GPU so there is no
- * coupling between constraints from different thread blocks. After the 'spaceNeeded' array is filled, the
- * value spaceNeeded[c] should be equal to the number of constraints that are coupled to 'c' and located
+ * coupling between constraints from different thread blocks. After the 'numCoupledConstraints' array is filled, the
+ * value numCoupledConstraints[c] should be equal to the number of constraints that are coupled to 'c' and located
* after it in the constraints array.
*
- * \param[in] a Atom index.
- * \param[in,out] spaceNeeded Indicates if the constraint was already counted and stores
- * the number of constraints (i) connected to it and (ii) located
- * after it in memory. This array is filled by this recursive function.
- * For a set of coupled constraints, only for the first one in this list
- * the number of consecutive coupled constraints is needed: if there is
- * not enough space for this set of constraints in the thread block,
- * the group has to be moved to the next one.
- * \param[in] atomAdjacencyList Stores information about connections between atoms.
+ * \param[in] a Atom index.
+ * \param[in,out] numCoupledConstraints Indicates if the constraint was already counted and stores
+ * the number of constraints (i) connected to it and (ii) located
+ * after it in memory. This array is filled by this recursive function.
+ *                                       For a set of coupled constraints, the number of consecutive
+ *                                       coupled constraints is only needed for the first one in the list: if there is
+ * not enough space for this set of constraints in the thread block,
+ * the group has to be moved to the next one.
+ * \param[in] atomsAdjacencyList Stores information about connections between atoms.
*/
- inline int countCoupled(int a,
- std::vector<int>* spaceNeeded,
- std::vector<std::vector<std::tuple<int, int, int>>>* atomsAdjacencyList)
+ inline int countCoupled(int a,
+ ArrayRef<int> numCoupledConstraints,
+ ArrayRef<const std::vector<std::tuple<int, int, int>>> atomsAdjacencyList)
{
- int c2, a2, sign;
int counted = 0;
- for (unsigned i = 0; i < atomsAdjacencyList->at(a).size(); i++)
+ for (const auto& adjacentAtom : atomsAdjacencyList[a])
{
- std::tie(a2, c2, sign) = atomsAdjacencyList->at(a).at(i);
- if (spaceNeeded->at(c2) == -1)
+ const int c2 = std::get<1>(adjacentAtom);
+ if (numCoupledConstraints[c2] == -1)
{
- spaceNeeded->at(c2) = 0; // To indicate we've been here
- counted += 1 + countCoupled(a2, spaceNeeded, atomsAdjacencyList);
+ numCoupledConstraints[c2] = 0; // To indicate we've been here
+ counted += 1 + countCoupled(std::get<0>(adjacentAtom), numCoupledConstraints, atomsAdjacencyList);
}
}
return counted;
}
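// Illustrative walk-through (not part of the implementation): consider four atoms constrained in
// a chain by c0 = (0,1), c1 = (1,2) and c2 = (2,3). When set() below processes c0 with
// numCoupledConstraints initialized to {-1, -1, -1}, countCoupled(0, ...) visits c0, c1 and c2,
// marks each of them with 0 and returns 3 (the recursion also counts the starting constraint
// itself); the subsequent countCoupled(1, ...) finds everything already marked and returns 0.
// The result is numCoupledConstraints = {3, 0, 0}: only the first constraint of a coupled cluster
// carries a positive count, which is all the block-splitting logic below needs.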
+ /*! \brief Add constraint to \p splitMap with all constraints coupled to it.
+ *
+ * Adds the constraint \p c from the constraint list \p iatoms to the map \p splitMap
+ * if it was not yet added. Then goes through all the constraints coupled to \p c
+ * and calls itself recursively. This ensures that all the coupled constraints will
+ * be added to neighboring locations in the final data structures on the device,
+ * hence mapping all coupled constraints to the same thread block. A value of -1 in
+ * the \p splitMap is used to flag that the constraint was not yet added to the \p splitMap.
+ *
+ * \param[in] iatoms The list of constraints.
+ * \param[in] stride Number of elements per constraint in \p iatoms.
+ * \param[in] atomsAdjacencyList Information about connections between atoms.
+ * \param[out]    splitMap            Map from sequential constraint indexes to the indexes used on the device.
+ * \param[in] c Sequential index for constraint to consider adding.
+ * \param[in,out] currentMapIndex The rolling index for the constraints mapping.
+ */
+ inline void addWithCoupled(const t_iatom* iatoms,
+ const int stride,
+ ArrayRef<const std::vector<std::tuple<int, int, int>>> atomsAdjacencyList,
+ ArrayRef<int> splitMap,
+ const int c,
+ int* currentMapIndex)
+ {
+ if (splitMap[c] == -1)
+ {
+ splitMap[c] = *currentMapIndex;
+ (*currentMapIndex)++;
+
+ // Constraints, coupled through both atoms.
+ for (int atomIndexInConstraint = 0; atomIndexInConstraint < 2; atomIndexInConstraint++)
+ {
+ const int a1 = iatoms[stride * c + 1 + atomIndexInConstraint];
+ for (const auto& adjacentAtom : atomsAdjacencyList[a1])
+ {
+ const int c2 = std::get<1>(adjacentAtom);
+ if (c2 != c)
+ {
+ addWithCoupled(iatoms, stride, atomsAdjacencyList, splitMap, c2, currentMapIndex);
+ }
+ }
+ }
+ }
+ }
+
void LincsCuda::set(const t_idef& idef, const t_mdatoms& md)
{
int numAtoms = md.nr;
atomsAdjacencyList.at(a2).push_back(std::make_tuple(a1, c, -1));
}
- // Compute, how many coupled constraints are in front of each constraint.
- // Needed to introduce splits in data so that all coupled constraints will be computed in a single GPU block.
- // The position 'c' of the vector spaceNeeded should have the number of constraints that are coupled to a constraint
- // 'c' and are after 'c' in the vector. Only first index of the connected group of the constraints is needed later in the
- // code, hence the spaceNeeded vector is also used to keep track if the constrain was already counted.
- std::vector<int> spaceNeeded;
- spaceNeeded.resize(numConstraints, -1);
- std::fill(spaceNeeded.begin(), spaceNeeded.end(), -1);
+ // Compute how many constraints are coupled to each constraint.
+ // Needed to introduce splits in data so that all coupled constraints will be computed in a
+ // single GPU block. The position 'c' of the vector numCoupledConstraints should have the number
+ // of constraints that are coupled to a constraint 'c' and are after 'c' in the vector. Only the
+ // first index of the connected group of constraints is needed later in the code, hence the
+ // numCoupledConstraints vector is also used to keep track of whether the constraint was already counted.
+ std::vector<int> numCoupledConstraints(numConstraints, -1);
for (int c = 0; c < numConstraints; c++)
{
int a1 = iatoms[stride * c + 1];
int a2 = iatoms[stride * c + 2];
- if (spaceNeeded.at(c) == -1)
+ if (numCoupledConstraints.at(c) == -1)
{
- spaceNeeded.at(c) = countCoupled(a1, &spaceNeeded, &atomsAdjacencyList)
- + countCoupled(a2, &spaceNeeded, &atomsAdjacencyList);
+ numCoupledConstraints.at(c) = countCoupled(a1, numCoupledConstraints, atomsAdjacencyList)
+ + countCoupled(a2, numCoupledConstraints, atomsAdjacencyList);
}
}
// Map of splits in the constraints data. For each 'old' constraint index gives 'new' which
// takes into account the empty spaces which might be needed in the end of each thread block.
- std::vector<int> splitMap;
- splitMap.resize(numConstraints, -1);
- int currentMapIndex = 0;
+ std::vector<int> splitMap(numConstraints, -1);
+ int currentMapIndex = 0;
for (int c = 0; c < numConstraints; c++)
{
// Check if coupled constraints all fit in one block
- GMX_RELEASE_ASSERT(
- spaceNeeded.at(c) < c_threadsPerBlock,
- "Maximum number of coupled constraints exceedes the size of the CUDA thread block. "
- "Most likely, you are trying to use GPU version of LINCS with constraints on "
- "all-bonds, "
- "which is not supported. Try using H-bonds constraints only.");
- if (currentMapIndex / c_threadsPerBlock != (currentMapIndex + spaceNeeded.at(c)) / c_threadsPerBlock)
+ if (numCoupledConstraints.at(c) > c_threadsPerBlock)
+ {
+ gmx_fatal(FARGS,
+ "Maximum number of coupled constraints (%d) exceeds the size of the CUDA "
+ "thread block (%d). Most likely, you are trying to use the GPU version of "
+ "LINCS with constraints on all-bonds, which is not supported for large "
+ "molecules. When compatible with the force field and integration settings, "
+ "using constraints on H-bonds only.",
+ numCoupledConstraints.at(c), c_threadsPerBlock);
+ }
+ if (currentMapIndex / c_threadsPerBlock
+ != (currentMapIndex + numCoupledConstraints.at(c)) / c_threadsPerBlock)
{
currentMapIndex = ((currentMapIndex / c_threadsPerBlock) + 1) * c_threadsPerBlock;
}
- splitMap.at(c) = currentMapIndex;
- currentMapIndex++;
+ addWithCoupled(iatoms, stride, atomsAdjacencyList, splitMap, c, &currentMapIndex);
}
+
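// Illustrative example of the padding below (numbers assumed): with c_threadsPerBlock = 256 and
// currentMapIndex = 300 after the loop above, numConstraintsThreads becomes
// 300 + 256 - 300 % 256 = 512, i.e. the constraint data is padded out to two full thread blocks.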
kernelParams_.numConstraintsThreads =
currentMapIndex + c_threadsPerBlock - currentMapIndex % c_threadsPerBlock;
GMX_RELEASE_ASSERT(kernelParams_.numConstraintsThreads % c_threadsPerBlock == 0,
maxCoupledConstraints * kernelParams_.numConstraintsThreads, commandStream_,
GpuApiCallBehavior::Sync, nullptr);
- GMX_RELEASE_ASSERT(md.invmass != nullptr, "Masses of attoms should be specified.\n");
+ GMX_RELEASE_ASSERT(md.invmass != nullptr, "Masses of atoms should be specified.\n");
copyToDeviceBuffer(&kernelParams_.d_inverseMasses, md.invmass, 0, numAtoms, commandStream_,
GpuApiCallBehavior::Sync, nullptr);
}
-void LincsCuda::setPbc(const t_pbc* pbc)
-{
- setPbcAiuc(pbc->ndim_ePBC, pbc->box, &kernelParams_.pbcAiuc);
-}
-
} // namespace gmx
/* \brief Launch end-of-step GPU tasks: buffer clearing and rolling pruning.
*
- * TODO: eliminate the \p useGpuNonbonded and \p useGpuNonbonded when these are
+ * TODO: eliminate \p useGpuPmeOnThisRank when this is
* incorporated in DomainLifetimeWorkload.
*/
static void launchGpuEndOfStepTasks(nonbonded_verlet_t* nbv,
gmx_pme_t* pmedata,
gmx_enerdata_t* enerd,
const gmx::MdrunScheduleWorkload& runScheduleWork,
- bool useGpuNonbonded,
- bool useGpuPme,
+ bool useGpuPmeOnThisRank,
int64_t step,
gmx_wallcycle_t wcycle)
{
- if (useGpuNonbonded)
+ if (runScheduleWork.simulationWork.useGpuNonbonded)
{
/* Launch pruning before buffer clearing because the API overhead of the
* clear kernel launches can leave the GPU idle while it could be running
wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
- if (useGpuPme)
+ if (useGpuPmeOnThisRank)
{
pme_gpu_reinit_computation(pmedata, wcycle);
}
}
}
- // Copy coordinate from the GPU if update is on the GPU and there are forces to be computed on the CPU. At search steps the
- // current coordinates are already on the host, hence copy is not needed.
+ // Copy coordinate from the GPU if update is on the GPU and there are forces to be computed on
+ // the CPU, or for the computation of virial. At search steps the current coordinates are
+ // already on the host, hence copy is not needed.
if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
- && runScheduleWork->domainWork.haveCpuLocalForceWork)
+ && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial))
{
stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
}
wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
- }
- if (stepWork.doNeighborSearch)
- {
// Need to run after the GPU-offload bonded interaction lists
// are set up to be able to determine whether there is bonded work.
runScheduleWork->domainWork = setupDomainLifetimeWorkload(
if (ddUsesGpuDirectCommunication && (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork))
{
/* Wait for non-local coordinate data to be copied from device */
- nbv->wait_nonlocal_x_copy_D2H_done();
+ stateGpu->waitCoordinatesReadyOnHost(AtomLocality::NonLocal);
}
/* Compute the bonded and non-bonded energies and optionally forces */
do_force_lowlevel(fr, inputrec, &(top->idef), cr, ms, nrnb, wcycle, mdatoms, x, hist, &forceOut, enerd,
fr->pmePpCommGpu->getForcesReadySynchronizer())) // buffer received from other GPU
: nullptr; // PME reduction not active on GPU
- gmx::FixedCapacityVector<GpuEventSynchronizer*, 2> dependencyList;
+ gmx::FixedCapacityVector<GpuEventSynchronizer*, 3> dependencyList;
if (stepWork.useGpuPmeFReduction)
{
}
if (useGpuForcesHaloExchange)
{
- // Add a stream synchronization to satisfy a dependency
- // for the local buffer ops on the result of GPU halo
- // exchange, which operates in the non-local stream and
- // writes to to local parf og the force buffer.
- //
- // TODO improve this through use of an event - see Redmine #3093
- // push the event into the dependencyList
- nbv->stream_local_wait_for_nonlocal();
+ dependencyList.push_back(gpuHaloExchange->getForcesReadyOnDeviceEvent());
}
nbv->atomdata_add_nbat_f_to_f_gpu(AtomLocality::Local, stateGpu->getForces(), pmeForcePtr,
dependencyList, stepWork.useGpuPmeFReduction,
}
launchGpuEndOfStepTasks(nbv, fr->gpuBonded, fr->pmedata, enerd, *runScheduleWork,
- simulationWork.useGpuNonbonded, useGpuPmeOnThisRank, step, wcycle);
+ useGpuPmeOnThisRank, step, wcycle);
if (DOMAINDECOMP(cr))
{
namespace gmx
{
+ /*! \brief Number of CUDA threads in a block.
+ *
+ * \todo Check if using a smaller block size will lead to better performance.
+ */
+ constexpr static int c_threadsPerBlock = 256;
+ //! Maximum number of threads in a block (for __launch_bounds__)
+ constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock;
+
+ /*! \brief Scaling matrix struct.
+ *
+ * \todo Should be generalized.
+ */
+ struct ScalingMatrix
+ {
+ float xx, yy, zz, yx, zx, zy;
+ };
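// Note: only the six elements that can be non-zero for a GROMACS triangular (triclinic) scaling
// matrix are stored (xx, yy, zz, yx, zx, zy); Impl::scaleCoordinates() below shows how they are
// filled from the full 3x3 matrix.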
+
+ __launch_bounds__(c_maxThreadsPerBlock) __global__
+ static void scaleCoordinates_kernel(const int numAtoms,
+ float3* __restrict__ gm_x,
+ const ScalingMatrix scalingMatrix)
+ {
+ int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
+ if (threadIndex < numAtoms)
+ {
+ float3 x = gm_x[threadIndex];
+
+ x.x = scalingMatrix.xx * x.x + scalingMatrix.yx * x.y + scalingMatrix.zx * x.z;
+ x.y = scalingMatrix.yy * x.y + scalingMatrix.zy * x.z;
+ x.z = scalingMatrix.zz * x.z;
+
+ gm_x[threadIndex] = x;
+ }
+ }
void UpdateConstrainCuda::Impl::integrate(GpuEventSynchronizer* fReadyOnDevice,
const real dt,
const bool updateVelocities,
const bool computeVirial,
tensor virial,
- const bool doTempCouple,
+ const bool doTemperatureScaling,
gmx::ArrayRef<const t_grp_tcstat> tcstat,
- const bool doPressureCouple,
+ const bool doParrinelloRahman,
const float dtPressureCouple,
- const matrix velocityScalingMatrix)
+ const matrix prVelocityScalingMatrix)
{
// Clearing virial matrix
// TODO There is no point in having separate virial matrix for constraints
// The integrate should save a copy of the current coordinates in d_xp_ and write the updated ones
// into d_x_. The d_xp_ is only needed by constraints.
- integrator_->integrate(d_x_, d_xp_, d_v_, d_f_, dt, doTempCouple, tcstat, doPressureCouple,
- dtPressureCouple, velocityScalingMatrix);
+ integrator_->integrate(d_x_, d_xp_, d_v_, d_f_, dt, doTemperatureScaling, tcstat,
+ doParrinelloRahman, dtPressureCouple, prVelocityScalingMatrix);
// Constraints need both coordinates before (d_x_) and after (d_xp_) update. However, after constraints
// are applied, the d_x_ can be discarded. So we intentionally swap the d_x_ and d_xp_ here to avoid the
// d_xp_ -> d_x_ copy after constraints. Note that the integrate saves them in the wrong order as well.
- lincsCuda_->apply(d_xp_, d_x_, updateVelocities, d_v_, 1.0 / dt, computeVirial, virial);
- settleCuda_->apply(d_xp_, d_x_, updateVelocities, d_v_, 1.0 / dt, computeVirial, virial);
+ lincsCuda_->apply(d_xp_, d_x_, updateVelocities, d_v_, 1.0 / dt, computeVirial, virial, pbcAiuc_);
+ settleCuda_->apply(d_xp_, d_x_, updateVelocities, d_v_, 1.0 / dt, computeVirial, virial, pbcAiuc_);
// scaledVirial -> virial (methods above returns scaled values)
float scaleFactor = 0.5f / (dt * dt);
return;
}
+ void UpdateConstrainCuda::Impl::scaleCoordinates(const matrix scalingMatrix)
+ {
+ ScalingMatrix mu;
+ mu.xx = scalingMatrix[XX][XX];
+ mu.yy = scalingMatrix[YY][YY];
+ mu.zz = scalingMatrix[ZZ][ZZ];
+ mu.yx = scalingMatrix[YY][XX];
+ mu.zx = scalingMatrix[ZZ][XX];
+ mu.zy = scalingMatrix[ZZ][YY];
+
+ const auto kernelArgs = prepareGpuKernelArguments(
+ scaleCoordinates_kernel, coordinateScalingKernelLaunchConfig_, &numAtoms_, &d_x_, &mu);
+ launchGpuKernel(scaleCoordinates_kernel, coordinateScalingKernelLaunchConfig_, nullptr,
+ "scaleCoordinates_kernel", kernelArgs);
+ // TODO: Although this only happens on the pressure coupling steps, this synchronization
+ // can affect the performance if nstpcouple is small.
+ gpuStreamSynchronize(commandStream_);
+ }
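// Hypothetical usage sketch (object and variable names are assumptions; the call site is not
// shown here): on a pressure coupling step, after the barostat has produced a coordinate scaling
// matrix 'mu' on the host,
//     updateConstrain->scaleCoordinates(mu);
// rescales the coordinates that reside on the device in place.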
+
UpdateConstrainCuda::Impl::Impl(const t_inputrec& ir,
const gmx_mtop_t& mtop,
const void* commandStream,
integrator_ = std::make_unique<LeapFrogCuda>(commandStream_);
lincsCuda_ = std::make_unique<LincsCuda>(ir.nLincsIter, ir.nProjOrder, commandStream_);
settleCuda_ = std::make_unique<SettleCuda>(mtop, commandStream_);
+
+ coordinateScalingKernelLaunchConfig_.blockSize[0] = c_threadsPerBlock;
+ coordinateScalingKernelLaunchConfig_.blockSize[1] = 1;
+ coordinateScalingKernelLaunchConfig_.blockSize[2] = 1;
+ coordinateScalingKernelLaunchConfig_.sharedMemorySize = 0;
+ coordinateScalingKernelLaunchConfig_.stream = commandStream_;
}
UpdateConstrainCuda::Impl::~Impl() {}
integrator_->set(md, numTempScaleValues, md.cTC);
lincsCuda_->set(idef, md);
settleCuda_->set(idef, md);
+
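// Illustrative example (numbers assumed): with 1000 atoms and c_threadsPerBlock = 256, the grid
// size below becomes (1000 + 255) / 256 = 4 blocks, enough to cover every atom with one thread.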
+ coordinateScalingKernelLaunchConfig_.gridSize[0] =
+ (numAtoms_ + c_threadsPerBlock - 1) / c_threadsPerBlock;
}
void UpdateConstrainCuda::Impl::setPbc(const t_pbc* pbc)
{
setPbcAiuc(pbc->ndim_ePBC, pbc->box, &pbcAiuc_);
- integrator_->setPbc(pbc);
- lincsCuda_->setPbc(pbc);
- settleCuda_->setPbc(pbc);
}
GpuEventSynchronizer* UpdateConstrainCuda::Impl::getCoordinatesReadySync()
const bool updateVelocities,
const bool computeVirial,
tensor virialScaled,
- const bool doTempCouple,
+ const bool doTemperatureScaling,
gmx::ArrayRef<const t_grp_tcstat> tcstat,
- const bool doPressureCouple,
+ const bool doParrinelloRahman,
const float dtPressureCouple,
- const matrix velocityScalingMatrix)
+ const matrix prVelocityScalingMatrix)
+ {
+ impl_->integrate(fReadyOnDevice, dt, updateVelocities, computeVirial, virialScaled, doTemperatureScaling,
+ tcstat, doParrinelloRahman, dtPressureCouple, prVelocityScalingMatrix);
+ }
+
+ void UpdateConstrainCuda::scaleCoordinates(const matrix scalingMatrix)
{
- impl_->integrate(fReadyOnDevice, dt, updateVelocities, computeVirial, virialScaled, doTempCouple,
- tcstat, doPressureCouple, dtPressureCouple, velocityScalingMatrix);
+ impl_->scaleCoordinates(scalingMatrix);
}
void UpdateConstrainCuda::set(DeviceBuffer<float> d_x,
*
* \param[in] mdlog Logger object.
* \param[in] useGpuForNonbonded True if the nonbonded task is offloaded in this run.
- * \param[in] useGpuForPme True if the PME task is offloaded in this run.
+ * \param[in] pmeRunMode The PME run mode for this run
* \returns The object populated with development feature flags.
*/
static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& mdlog,
const bool useGpuForNonbonded,
- const bool useGpuForPme)
+ const PmeRunMode pmeRunMode)
{
DevelopmentFeatureFlags devFlags;
if (devFlags.enableGpuPmePPComm)
{
- if (useGpuForPme)
+ if (pmeRunMode == PmeRunMode::GPU)
{
GMX_LOG(mdlog.warning)
.asParagraph()
}
else
{
+ std::string clarification;
+ if (pmeRunMode == PmeRunMode::Mixed)
+ {
+ clarification =
+ "PME FFT and gather are not offloaded to the GPU (PME is running in mixed "
+ "mode).";
+ }
+ else
+ {
+ clarification = "PME is not offloaded to the GPU.";
+ }
GMX_LOG(mdlog.warning)
.asParagraph()
- .appendTextFormatted(
+ .appendText(
"NOTE: GMX_GPU_PME_PP_COMMS environment variable detected, but the "
- "'GPU PME-PP communications' feature was not enabled as PME is not "
- "offloaded to the GPU.");
+ "'GPU PME-PP communications' feature was not enabled as "
+ + clarification);
devFlags.enableGpuPmePPComm = false;
}
}
// Initialize development feature flags that are enabled by environment variables
// and report those features that are enabled.
const DevelopmentFeatureFlags devFlags =
- manageDevelopmentFeatures(mdlog, useGpuForNonbonded, useGpuForPme);
+ manageDevelopmentFeatures(mdlog, useGpuForNonbonded, pmeRunMode);
// NOTE: The devFlags need decideWhetherToUseGpusForNonbonded(...) and decideWhetherToUseGpusForPme(...) for overrides,
// decideWhetherToUseGpuForUpdate() needs devFlags for the '-update auto' override, hence the interleaving.
/* This call is not included in init_domain_decomposition mainly
* because fr->cginfo_mb is set later.
*/
- dd_init_bondeds(fplog, cr->dd, &mtop, vsite.get(), inputrec,
+ dd_init_bondeds(fplog, cr->dd, mtop, vsite.get(), inputrec,
domdecOptions.checkBondedInteractions, fr->cginfo_mb);
}
}
// FIXME: this is only here to manually unpin mdAtoms->chargeA_ and state->x,
- // before we destroy the GPU context(s) in free_gpu_resources().
+ // before we destroy the GPU context(s) in free_gpu().
// Pinned buffers are associated with contexts in CUDA.
// As soon as we destroy GPU contexts after mdrunner() exits, these lines should go.
mdAtoms.reset(nullptr);
globalState.reset(nullptr);
mdModules_.reset(nullptr); // destruct force providers here as they might also use the GPU
+ /* Free pinned buffers in *fr */
+ delete fr;
+ fr = nullptr;
+
+ if (hwinfo->gpu_info.n_dev > 0)
+ {
+ /* stop the GPU profiler (only CUDA) */
+ stopGpuProfiler();
+ }
+
+ /* With tMPI we need to wait for all ranks to finish deallocation before
+ * destroying the CUDA context in free_gpu() as some tMPI ranks may be sharing
+ * GPU and context.
+ *
+ * This is not a concern in OpenCL where we use one context per rank which
+ * is freed in nbnxn_gpu_free().
+ *
+ * Note: it is safe to not call the barrier on the ranks which do not use GPU,
+ * but it is easier and more futureproof to call it on the whole node.
+ *
+ * Note that this function needs to be called even if GPUs are not used
+ * in this run because the PME ranks have no knowledge of whether GPUs
+ * are used or not, but all ranks need to enter the barrier below.
+ * \todo Remove this physical node barrier after making sure
+ * that it's not needed anymore (with a shared GPU run).
+ */
+ if (GMX_THREAD_MPI)
+ {
+ physicalNodeComm.barrier();
+ }
- /* Free GPU memory and set a physical node tMPI barrier (which should eventually go away) */
- free_gpu_resources(fr, physicalNodeComm, hwinfo->gpu_info);
free_gpu(nonbondedDeviceInfo);
free_gpu(pmeDeviceInfo);
- done_forcerec(fr, mtop.molblock.size());
sfree(fcd);
if (doMembed)
}
reallocateDeviceBuffer(&d_v_, DIM * numAtomsAll_, &d_vSize_, &d_vCapacity_, deviceContext_);
+ const int d_fOldCapacity = d_fCapacity_;
reallocateDeviceBuffer(&d_f_, DIM * numAtomsAll_, &d_fSize_, &d_fCapacity_, deviceContext_);
+ // Clearing of the forces can be done in local stream since the nonlocal stream cannot reach
+ // the force accumulation stage before syncing with the local stream. Only done in CUDA,
+ // since the force buffer ops are not implemented in OpenCL.
+ if (GMX_GPU == GMX_GPU_CUDA && d_fCapacity_ != d_fOldCapacity)
+ {
+ clearDeviceBufferAsync(&d_f_, 0, d_fCapacity_, localStream_);
+ }
}
std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality atomLocality)
GMX_UNUSED_VALUE(dataSize);
+ GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
+
GMX_ASSERT(dataSize >= 0, "Trying to copy to device buffer before it was allocated.");
+ GMX_ASSERT(commandStream != nullptr,
+ "No stream is valid for copying with given atom locality.");
+
int atomsStartAt, numAtomsToCopy;
std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);
GMX_UNUSED_VALUE(dataSize);
+ GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
+
GMX_ASSERT(dataSize >= 0, "Trying to copy from device buffer before it was allocated.");
+ GMX_ASSERT(commandStream != nullptr,
+ "No stream is valid for copying with given atom locality.");
+
int atomsStartAt, numAtomsToCopy;
std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);
void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef<const gmx::RVec> h_x,
AtomLocality atomLocality)
{
- GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
- CommandStream commandStream = xCopyStreams_[atomLocality];
- GMX_ASSERT(commandStream != nullptr,
- "No stream is valid for copying positions with given atom locality.");
-
- copyToDevice(d_x_, h_x, d_xSize_, atomLocality, commandStream);
+ copyToDevice(d_x_, h_x, d_xSize_, atomLocality, xCopyStreams_[atomLocality]);
// markEvent is skipped in OpenCL as:
// - it's not needed, copy is done in the same stream as the only consumer task (PME)
// TODO: remove this by adding an event-mark free flavor of this function
if (GMX_GPU == GMX_GPU_CUDA)
{
- xReadyOnDevice_[atomLocality].markEvent(commandStream);
+ xReadyOnDevice_[atomLocality].markEvent(xCopyStreams_[atomLocality]);
}
}
void StatePropagatorDataGpu::Impl::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x, AtomLocality atomLocality)
{
- GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
- CommandStream commandStream = xCopyStreams_[atomLocality];
- GMX_ASSERT(commandStream != nullptr,
- "No stream is valid for copying positions with given atom locality.");
-
- copyFromDevice(h_x, d_x_, d_xSize_, atomLocality, commandStream);
+ copyFromDevice(h_x, d_x_, d_xSize_, atomLocality, xCopyStreams_[atomLocality]);
// Note: unlike copyCoordinatesToGpu this is not used in OpenCL, and the conditional is not needed.
- xReadyOnHost_[atomLocality].markEvent(commandStream);
+ xReadyOnHost_[atomLocality].markEvent(xCopyStreams_[atomLocality]);
}
void StatePropagatorDataGpu::Impl::waitCoordinatesReadyOnHost(AtomLocality atomLocality)
void StatePropagatorDataGpu::Impl::copyVelocitiesToGpu(const gmx::ArrayRef<const gmx::RVec> h_v,
AtomLocality atomLocality)
{
- GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
- CommandStream commandStream = vCopyStreams_[atomLocality];
- GMX_ASSERT(commandStream != nullptr,
- "No stream is valid for copying velocities with given atom locality.");
-
- copyToDevice(d_v_, h_v, d_vSize_, atomLocality, commandStream);
- vReadyOnDevice_[atomLocality].markEvent(commandStream);
+ copyToDevice(d_v_, h_v, d_vSize_, atomLocality, vCopyStreams_[atomLocality]);
+ vReadyOnDevice_[atomLocality].markEvent(vCopyStreams_[atomLocality]);
}
GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getVelocitiesReadyOnDeviceEvent(AtomLocality atomLocality)
void StatePropagatorDataGpu::Impl::copyVelocitiesFromGpu(gmx::ArrayRef<gmx::RVec> h_v, AtomLocality atomLocality)
{
- GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
- CommandStream commandStream = vCopyStreams_[atomLocality];
- GMX_ASSERT(commandStream != nullptr,
- "No stream is valid for copying velocities with given atom locality.");
-
- copyFromDevice(h_v, d_v_, d_vSize_, atomLocality, commandStream);
- vReadyOnHost_[atomLocality].markEvent(commandStream);
+ copyFromDevice(h_v, d_v_, d_vSize_, atomLocality, vCopyStreams_[atomLocality]);
+ vReadyOnHost_[atomLocality].markEvent(vCopyStreams_[atomLocality]);
}
void StatePropagatorDataGpu::Impl::waitVelocitiesReadyOnHost(AtomLocality atomLocality)
void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec> h_f,
AtomLocality atomLocality)
{
- GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
- CommandStream commandStream = fCopyStreams_[atomLocality];
- GMX_ASSERT(commandStream != nullptr,
- "No stream is valid for copying forces with given atom locality.");
-
- copyToDevice(d_f_, h_f, d_fSize_, atomLocality, commandStream);
- fReadyOnDevice_[atomLocality].markEvent(commandStream);
+ copyToDevice(d_f_, h_f, d_fSize_, atomLocality, fCopyStreams_[atomLocality]);
+ fReadyOnDevice_[atomLocality].markEvent(fCopyStreams_[atomLocality]);
}
GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getForcesReadyOnDeviceEvent(AtomLocality atomLocality,
void StatePropagatorDataGpu::Impl::copyForcesFromGpu(gmx::ArrayRef<gmx::RVec> h_f, AtomLocality atomLocality)
{
- GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
- CommandStream commandStream = fCopyStreams_[atomLocality];
- GMX_ASSERT(commandStream != nullptr,
- "No stream is valid for copying forces with given atom locality.");
-
- copyFromDevice(h_f, d_f_, d_fSize_, atomLocality, commandStream);
- fReadyOnHost_[atomLocality].markEvent(commandStream);
+ copyFromDevice(h_f, d_f_, d_fSize_, atomLocality, fCopyStreams_[atomLocality]);
+ fReadyOnHost_[atomLocality].markEvent(fCopyStreams_[atomLocality]);
}
void StatePropagatorDataGpu::Impl::waitForcesReadyOnHost(AtomLocality atomLocality)