namespace gmx
{
+/*! \brief Number of CUDA threads in a block.
+ *
+ * \todo Check if using a smaller block size will lead to better performance.
+ */
+constexpr static int c_threadsPerBlock = 256;
+//! Maximum number of threads in a block (for __launch_bounds__)
+constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock;
+
+/*! \brief Scaling matrix struct.
+ *
+ * Holds the six entries of a lower-triangular 3x3 scaling matrix
+ * (diagonal xx, yy, zz and off-diagonal yx, zx, zy) in a flat layout
+ * that can be passed by value to the scaling kernel.
+ *
+ * \todo Should be generalized.
+ */
+struct ScalingMatrix
+{
+ float xx, yy, zz, yx, zx, zy;
+};
+
+/*! \brief CUDA kernel that applies a lower-triangular scaling matrix to coordinates.
+ *
+ * Expects a 1D launch with one thread per atom; threads past the end of the
+ * buffer (the grid tail) return immediately. No shared memory is used.
+ *
+ * \param[in]     numAtoms       Total number of atoms to scale.
+ * \param[in,out] gm_x           Device buffer with the coordinates to scale.
+ * \param[in]     scalingMatrix  Lower-triangular scaling matrix (by value).
+ */
+__launch_bounds__(c_maxThreadsPerBlock) __global__
+        void scaleCoordinates_kernel(const int numAtoms, float3* __restrict__ gm_x, const ScalingMatrix scalingMatrix)
+{
+    const int globalIndex = blockIdx.x * blockDim.x + threadIdx.x;
+    if (globalIndex >= numAtoms)
+    {
+        return;
+    }
+    float3 r = gm_x[globalIndex];
+
+    r.x = scalingMatrix.xx * r.x + scalingMatrix.yx * r.y + scalingMatrix.zx * r.z;
+    r.y = scalingMatrix.yy * r.y + scalingMatrix.zy * r.z;
+    r.z = scalingMatrix.zz * r.z;
+
+    gm_x[globalIndex] = r;
+}
void UpdateConstrainCuda::Impl::integrate(GpuEventSynchronizer* fReadyOnDevice,
const real dt,
return;
}
+/*! \brief Scale coordinates on the GPU after a pressure coupling step.
+ *
+ * Packs the six used entries of the (lower-triangular) scaling matrix into
+ * the flat struct the kernel expects, launches the scaling kernel, and then
+ * blocks until the stream has finished.
+ *
+ * \param[in] scalingMatrix Coordinate scaling matrix.
+ */
+void UpdateConstrainCuda::Impl::scaleCoordinates(const matrix scalingMatrix)
+{
+    // Member order of ScalingMatrix: xx, yy, zz, yx, zx, zy.
+    ScalingMatrix mu = { scalingMatrix[XX][XX], scalingMatrix[YY][YY], scalingMatrix[ZZ][ZZ],
+                         scalingMatrix[YY][XX], scalingMatrix[ZZ][XX], scalingMatrix[ZZ][YY] };
+
+    const auto kernelArgs = prepareGpuKernelArguments(
+            scaleCoordinates_kernel, coordinateScalingKernelLaunchConfig_, &numAtoms_, &d_x_, &mu);
+    launchGpuKernel(scaleCoordinates_kernel, coordinateScalingKernelLaunchConfig_, nullptr,
+                    "scaleCoordinates_kernel", kernelArgs);
+    // TODO: Although this only happens on the pressure coupling steps, this synchronization
+    //       can affect the performance if nstpcouple is small.
+    gpuStreamSynchronize(commandStream_);
+}
+
UpdateConstrainCuda::Impl::Impl(const t_inputrec& ir,
const gmx_mtop_t& mtop,
const void* commandStream,
integrator_ = std::make_unique<LeapFrogCuda>(commandStream_);
lincsCuda_ = std::make_unique<LincsCuda>(ir.nLincsIter, ir.nProjOrder, commandStream_);
settleCuda_ = std::make_unique<SettleCuda>(mtop, commandStream_);
+
+ coordinateScalingKernelLaunchConfig_.blockSize[0] = c_threadsPerBlock;
+ coordinateScalingKernelLaunchConfig_.blockSize[1] = 1;
+ coordinateScalingKernelLaunchConfig_.blockSize[2] = 1;
+ coordinateScalingKernelLaunchConfig_.sharedMemorySize = 0;
+ coordinateScalingKernelLaunchConfig_.stream = commandStream_;
}
UpdateConstrainCuda::Impl::~Impl() {}
integrator_->set(md, numTempScaleValues, md.cTC);
lincsCuda_->set(idef, md);
settleCuda_->set(idef, md);
+
+ coordinateScalingKernelLaunchConfig_.gridSize[0] =
+ (numAtoms_ + c_threadsPerBlock - 1) / c_threadsPerBlock;
}
void UpdateConstrainCuda::Impl::setPbc(const t_pbc* pbc)
tcstat, doPressureCouple, dtPressureCouple, velocityScalingMatrix);
}
+// Public entry point: forwards coordinate scaling to the PIMPL implementation.
+void UpdateConstrainCuda::scaleCoordinates(const matrix scalingMatrix)
+{
+ impl_->scaleCoordinates(scalingMatrix);
+}
+
void UpdateConstrainCuda::set(DeviceBuffer<float> d_x,
DeviceBuffer<float> d_v,
const DeviceBuffer<float> d_f,
float dtPressureCouple,
const matrix velocityScalingMatrix);
+ /*! \brief Scale coordinates on the GPU for the pressure coupling.
+ *
+ * After a pressure coupling step, the box size may change. Hence, the coordinates should be
+ * scaled so that all the particles fit into the new box.
+ *
+ * \param[in] scalingMatrix Coordinate scaling matrix.
+ */
+ void scaleCoordinates(const matrix scalingMatrix);
+
/*! \brief Set the pointers and update data-structures (e.g. after NB search step).
*
* \param[in,out] d_x Device buffer with coordinates.
private:
//! CUDA stream
CommandStream commandStream_ = nullptr;
+ //! CUDA kernel launch config
+ KernelLaunchConfig coordinateScalingKernelLaunchConfig_;
//! Periodic boundary data
PbcAiuc pbcAiuc_;