namespace gmx
{
-//! Number of CUDA threads in a block
+/*!\brief Number of CUDA threads in a block
+ *
+ * \todo Check if using smaller block size will lead to better prformance.
+ */
constexpr static int c_threadsPerBlock = 256;
//! Maximum number of threads in a block (for __launch_bounds__)
constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock;
};
/*! \brief Main kernel for Leap-Frog integrator.
+ *
+ * The coordinates and velocities are updated on the GPU. Also saves the intermediate values of the coordinates for
+ * further use in constraints.
*
* Each GPU thread works with a single particle. Empty declaration is needed to
* avoid "no previous prototype for function" clang warning.
* \tparam numTempScaleValues The number of different T-couple values.
* \tparam velocityScaling Type of the Parrinello-Rahman velocity rescaling.
* \param[in] numAtoms Total number of atoms.
- * \param[in] gm_x Coordinates before the timestep
- * \param[out] gm_xp Coordinates after the timestep.
+ * \param[in,out] gm_x Coordinates to update upon integration.
+ * \param[out] gm_xp A copy of the coordinates before the integration (for constraints).
* \param[in,out] gm_v Velocities to update.
* \param[in] gm_f Atomic forces.
* \param[in] gm_inverseMasses Reciprocal masses.
template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
__launch_bounds__(c_maxThreadsPerBlock)
__global__ void leapfrog_kernel(const int numAtoms,
- const float3* __restrict__ gm_x,
+ float3* __restrict__ gm_x,
float3* __restrict__ gm_xp,
float3* __restrict__ gm_v,
const float3* __restrict__ gm_f,
template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
__launch_bounds__(c_maxThreadsPerBlock)
__global__ void leapfrog_kernel(const int numAtoms,
- const float3* __restrict__ gm_x,
+ float3* __restrict__ gm_x,
float3* __restrict__ gm_xp,
float3* __restrict__ gm_v,
const float3* __restrict__ gm_f,
float im = gm_inverseMasses[threadIndex];
float imdt = im*dt;
+ // Swapping places for xp and x so that the x will contain the updated coordinates and xp - the
+ // coordinates before update. This should be taken into account when (if) constraints are applied
+ // after the update: x and xp have to be passed to constraints in the 'wrong' order.
+ gm_xp[threadIndex] = x;
+
if (numTempScaleValues != NumTempScaleValues::None || velocityScaling != VelocityScalingType::None)
{
float3 vp = v;
x += v*dt;
gm_v[threadIndex] = v;
- gm_xp[threadIndex] = x;
+ gm_x[threadIndex] = x;
+
}
return;
}
/*! \brief Integrate
*
* Integrates the equation of motion using Leap-Frog algorithm.
- * Updates coordinates and velocities on the GPU.
+ * Updates coordinates and velocities on the GPU. The current coordinates are saved for constraints.
*
- * \param[in] d_x Initial coordinates.
- * \param[out] d_xp Place to save the resulting coordinates to.
+ * \param[in,out] d_x Coordinates to update
+ * \param[out] d_xp Place to save the values of initial coordinates coordinates to.
* \param[in,out] d_v Velocities (will be updated).
* \param[in] d_f Forces.
* \param[in] dt Timestep.
integrator->integrate(d_x, d_xp, d_v, d_f, testData->timestep_,
doTempCouple, testData->kineticEnergyData_.tcstat,
doPressureCouple, testData->dtPressureCouple_, testData->velocityScalingMatrix_);
-
- copyFromDeviceBuffer(h_xp, &d_xp, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
- copyToDeviceBuffer(&d_x, h_xp, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
}
- copyFromDeviceBuffer(h_xp, &d_xp, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
+ copyFromDeviceBuffer(h_xp, &d_x, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
copyFromDeviceBuffer(h_v, &d_v, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
freeDeviceBuffer(&d_x);
// TODO There is no point in having separate virial matrix for constraints
clear_mat(virial);
+ // The integrate should save a copy of the current coordinates in d_xp_ and write updated once into d_x_.
+ // The d_xp_ is only needed by constraints.
integrator_->integrate(d_x_, d_xp_, d_v_, d_f_, dt,
doTempCouple, tcstat,
doPressureCouple, dtPressureCouple, velocityScalingMatrix);
- lincsCuda_->apply(d_x_, d_xp_,
+ // Constraints need both coordinates before (d_x_) and after (d_xp_) update. However, after constraints
+ // are applied, the d_x_ can be discarded. So we intentionally swap the d_x_ and d_xp_ here to avoid the
+ // d_xp_ -> d_x_ copy after constraints. Note that the integrate saves them in the wrong order as well.
+ lincsCuda_->apply(d_xp_, d_x_,
updateVelocities, d_v_, 1.0/dt,
computeVirial, virial);
- settleCuda_->apply(d_x_, d_xp_,
+ settleCuda_->apply(d_xp_, d_x_,
updateVelocities, d_v_, 1.0/dt,
computeVirial, virial);
}
}
- // TODO: This should be eliminated
- cudaMemcpy(d_x_, d_xp_, numAtoms_*sizeof(float3), cudaMemcpyDeviceToDevice);
-
return;
}