message(STATUS "Error executing hwloc-info: ${HWLOC_INFO_ERR}")
endif()
string(REGEX MATCH "[0-9]+.*[0-9]+" HWLOC_INFO_OUT "${HWLOC_INFO_OUT}")
- set(HWLOC_VERSION ${HWLOC_INFO_OUT} CACHE STRING "HWLOC library version")
+ set(HWLOC_LIBRARY_VERSION ${HWLOC_INFO_OUT})
endif()
endif()
endif()
# Parse header if cross-compiling, or if hwloc-info was not found
- if(NOT HWLOC_VERSION)
- # HWLOC is never installed as a framework on OS X, so this should always work.
- file(READ "${HWLOC_INCLUDE_DIRS}/hwloc.h"
- HEADER_CONTENTS LIMIT 16384)
- string(REGEX REPLACE ".*#define HWLOC_API_VERSION (0[xX][0-9a-fA-F]+).*" "\\1"
- HWLOC_API_VERSION "${HEADER_CONTENTS}")
- string(SUBSTRING "${HWLOC_API_VERSION}" 4 2 HEX_MAJOR)
- string(SUBSTRING "${HWLOC_API_VERSION}" 6 2 HEX_MINOR)
- string(SUBSTRING "${HWLOC_API_VERSION}" 8 2 HEX_PATCH)
- hex2dec(${HEX_MAJOR} DEC_MAJOR)
- hex2dec(${HEX_MINOR} DEC_MINOR)
- hex2dec(${HEX_PATCH} DEC_PATCH)
- set(HWLOC_VERSION "${DEC_MAJOR}.${DEC_MINOR}.${DEC_PATCH}" CACHE STRING "HWLOC library version")
+ # Also used to check that library and header versions match
+ # HWLOC is never installed as a framework on OS X, so this should always work.
+ file(READ "${HWLOC_INCLUDE_DIRS}/hwloc.h"
+ HEADER_CONTENTS LIMIT 16384)
+ string(REGEX REPLACE ".*#define HWLOC_API_VERSION (0[xX][0-9a-fA-F]+).*" "\\1"
+ HWLOC_API_VERSION "${HEADER_CONTENTS}")
+ string(SUBSTRING "${HWLOC_API_VERSION}" 4 2 HEX_MAJOR)
+ string(SUBSTRING "${HWLOC_API_VERSION}" 6 2 HEX_MINOR)
+ string(SUBSTRING "${HWLOC_API_VERSION}" 8 2 HEX_PATCH)
+ hex2dec(${HEX_MAJOR} DEC_MAJOR)
+ hex2dec(${HEX_MINOR} DEC_MINOR)
+ hex2dec(${HEX_PATCH} DEC_PATCH)
+ set(HWLOC_HEADER_VERSION "${DEC_MAJOR}.${DEC_MINOR}.${DEC_PATCH}")
+
+ if (HWLOC_LIBRARY_VERSION AND HWLOC_HEADER_VERSION)
+ string(SUBSTRING "${HWLOC_LIBRARY_VERSION}" 0 1 LIBRARY_MAJOR)
+ string(SUBSTRING "${HWLOC_HEADER_VERSION}" 0 1 HEADER_MAJOR)
+ string(COMPARE EQUAL "${LIBRARY_MAJOR}" "${HEADER_MAJOR}" HWLOC_VERSION_CHECK)
+ if(NOT HWLOC_VERSION_CHECK)
+ message(FATAL_ERROR "Detected version mismatch between HWLOC headers and library. "
+ "Library version is ${HWLOC_LIBRARY_VERSION}, but header version is ${HWLOC_HEADER_VERSION}. "
+ "Make sure that you have the correct include and library directory set for HWLOC")
+ endif()
+ endif()
+ if (HWLOC_LIBRARY_VERSION)
+ set(HWLOC_VERSION ${HWLOC_LIBRARY_VERSION} CACHE STRING "HWLOC library version")
+
+ else()
+ set(HWLOC_VERSION ${HWLOC_HEADER_VERSION} CACHE STRING "HWLOC library version")
endif()
+ set(GMX_HWLOC_API_VERSION ${HWLOC_API_VERSION} CACHE STRING "HWLOC API version at configuration time")
endif()
include(FindPackageHandleStandardArgs)
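For reference, hwloc encodes HWLOC_API_VERSION as 0x00MMmmpp (major, minor and patch bytes), which is what the SUBSTRING and hex2dec steps above decode. A standalone C++ sketch of the same decoding (illustrative only, not part of the build system):

#include <cstdio>

int main()
{
    const unsigned apiVersion = 0x00020000; // hwloc 2.0.0: 0x00MMmmpp layout
    const unsigned major      = (apiVersion >> 16) & 0xff;
    const unsigned minor      = (apiVersion >> 8) & 0xff;
    const unsigned patch      = apiVersion & 0xff;
    std::printf("%u.%u.%u\n", major, minor, patch); // prints "2.0.0"
    return 0;
}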
Be aware that different visualization software handles map formats differently.
During simulations, reference densities are interpreted as visualised by `VMD`.
-If the reference map shows unexpected behaviour, swapping endianess with a map
-conversion tool like `em2em` might help.
Output
^^^^^^
:issue:`3204`
+Fix issues with AWH with periodic pull-geometry 'direction'
+""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+
+Removed a fatal error with AWH and periodic pull-geometry 'direction'
+that was triggered when the distance was within 2% of half the box size.
+Changed an assertion failure when the AWH interval was larger than
+the box size to a fatal error.
+Clarified the documentation for pull geometry 'direction-periodic'.
+
+:issue:`2946`
+
+Remove assertion failure with AWH when not using the initial stage
+""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+
+:issue:`3217`
+
Fixes for ``gmx`` tools
^^^^^^^^^^^^^^^^^^^^^^^
and expect whitespace separation will continue to work in all cases.
:issue:`3176`
+
+Fix duplicate PDB CONECT record output
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+PDB "CONECT" record output was duplicated in some instances. Since |Gromacs| does
+not use these anywhere, analysis was not affected. The behavior is now fixed.
+
+:issue:`3206`
+
Update and constraints can run on a (single) GPU
""""""""""""""""""""""""""""""""""""""""""""""""
-For standard constant-NVT runs (see the user guide for more details),
+For standard simulations (see the user guide for more details),
update and constraints can be offloaded to a GPU with CUDA. Thus all compute
intensive parts of a simulation can be offloaded, which provides
better performance when using a fast GPU combined with a slow CPU.
.. mdp-value:: direction-periodic
- As :mdp-value:`pull-coord1-geometry=direction`, but allows the distance to be larger
- than half the box size. With this geometry the box should not be
+ As :mdp-value:`pull-coord1-geometry=direction`, but does not apply
+ periodic box vector corrections to keep the distance within half
+ the box length. This is (only) useful for pushing groups apart
+ by more than half the box length by continuously changing the reference
+ location using a pull rate. With this geometry the box should not be
dynamic (*e.g.* no pressure scaling) in the pull dimensions and
- the pull force is not added to virial.
+ the pull force is not added to the virial.
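   For example, a minimal pull-code excerpt (illustrative values, not part of
   this change) that continuously moves the reference position with a pull
   rate, allowing two groups to be pushed apart by more than half the box
   length::

      pull                 = yes
      pull-ngroups         = 2
      pull-ncoords         = 1
      pull-coord1-groups   = 1 2
      pull-coord1-type     = umbrella
      pull-coord1-geometry = direction-periodic
      pull-coord1-vec      = 0 0 1
      pull-coord1-rate     = 0.01      ; nm/ps
      pull-coord1-k        = 1000      ; kJ mol^-1 nm^-2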
.. mdp-value:: direction-relative
Defaults to "auto," which currently always uses the CPU.
Setting "gpu" requires that a compatible CUDA GPU is available.
Update and constraints on a GPU are currently not supported
- with pressure coupling, free-energy, domain decomposition, virtual sites,
+ with free-energy, domain decomposition, virtual sites,
Ewald surface correction, replica exchange, the pull code,
orientation restraints and computational electrophysiology.
# The current test fixtures require the `gmx` tool-wrapper executable.
add_dependencies(gmxapi_extension_pytest gmx)
+add_dependencies(tests gmxapi_extension_histogram-test gmxapi_extension_bounding-test)
+
add_dependencies(check gmxapi_extension_pytest)
/* Use the Portable Hardware Locality package (hwloc) */
#cmakedefine01 GMX_USE_HWLOC
+/* Library version found for hwloc at configuration time */
+#define GMX_HWLOC_API_VERSION @GMX_HWLOC_API_VERSION@
+
/* Can and should use nice(3) to set priority */
#cmakedefine01 GMX_USE_NICE
* \param[in] checkpointWriting enables writing to the Key-Value-Tree
* that is used for storing the checkpoint
* information
+ *
+ * \note The state provided for checkpointing has to change if checkpointing
+ * is moved before the force provider call in the MD loop.
*/
void writeCheckpointData(MdModulesWriteCheckpointData checkpointWriting)
{
if (densityFittingOptions_.active())
{
- const DensityFittingForceProviderState& state = forceProvider_->state();
+ const DensityFittingForceProviderState& state = forceProvider_->stateToCheckpoint();
checkpointWriting.builder_.addValue<std::int64_t>(
DensityFittingModuleInfo::name_ + "-stepsSinceLastCalculation",
state.stepsSinceLastCalculation_);
~Impl();
void calculateForces(const ForceProviderInput& forceProviderInput,
ForceProviderOutput* forceProviderOutput);
-
- DensityFittingForceProviderState state();
+ const DensityFittingForceProviderState& stateToCheckpoint();
private:
+ DensityFittingForceProviderState state();
const DensityFittingParameters& parameters_;
DensityFittingForceProviderState state_;
+ DensityFittingForceProviderState stateToCheckpoint_;
LocalAtomSet localAtomSet_;
GaussianSpreadKernelParameters::Shape spreadKernel_;
void DensityFittingForceProvider::Impl::calculateForces(const ForceProviderInput& forceProviderInput,
ForceProviderOutput* forceProviderOutput)
{
+ // TODO change if checkpointing moves to the start of the md loop
+ stateToCheckpoint_ = state();
// do nothing but count number of steps when not in density fitting step
if (state_.stepsSinceLastCalculation_ % parameters_.calculationIntervalInSteps_ != 0)
{
return state_;
}
+const DensityFittingForceProviderState& DensityFittingForceProvider::Impl::stateToCheckpoint()
+{
+ return stateToCheckpoint_;
+}
/********************************************************************
* DensityFittingForceProvider
*/
impl_->calculateForces(forceProviderInput, forceProviderOutput);
}
-DensityFittingForceProviderState DensityFittingForceProvider::state()
+const DensityFittingForceProviderState& DensityFittingForceProvider::stateToCheckpoint()
{
- return impl_->state();
+ return impl_->stateToCheckpoint();
}
} // namespace gmx
void calculateForces(const ForceProviderInput& forceProviderInput,
ForceProviderOutput* forceProviderOutput) override;
- //! Return the state of the forceprovider.
- DensityFittingForceProviderState state();
+ /*! \brief Return the state of the forceprovider to be checkpointed
+ * TODO update this routine if checkpointing is moved to the beginning of
+ * the md loop
+ */
+ const DensityFittingForceProviderState& stateToCheckpoint();
private:
class Impl;
* \returns true at steps where checks should be performed.
* \note Only returns true at free energy update steps.
*/
- bool isCheckCoveringStep(int64_t step) const { return step % numStepsCheckCovering_ == 0; }
+ bool isCheckCoveringStep(int64_t step) const
+ {
+ return step > 0 && (step % numStepsCheckCovering_ == 0);
+ }
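A standalone illustration of the added step > 0 guard (a minimal sketch; the free function and parameter stand in for the member used above):

#include <cassert>
#include <cstdint>

// Free-function version of the check above, for illustration only.
static bool isCheckCoveringStep(std::int64_t step, std::int64_t numStepsCheckCovering)
{
    return step > 0 && (step % numStepsCheckCovering == 0);
}

int main()
{
    assert(!isCheckCoveringStep(0, 100));  // step 0 is no longer a covering-check step
    assert(isCheckCoveringStep(100, 100)); // later multiples still are
    assert(!isCheckCoveringStep(150, 100));
    return 0;
}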
/*! \brief
* Returns if to perform checks for anomalies in the histogram.
const real innerProduct = iprod(pullCoordParams.vec, pbc.box[dim]);
if (innerProduct >= (1 - margin) * boxLength && innerProduct <= (1 + margin) * boxLength)
{
- GMX_RELEASE_ASSERT(intervalLength < (1 + margin) * boxLength,
- "We have checked before that interval <= period");
+ if (intervalLength > (1 + margin) * boxLength)
+ {
+ gmx_fatal(FARGS,
+ "The AWH interval (%f nm) for a pull coordinate is larger than the "
+ "box size (%f nm)",
+ intervalLength, boxLength);
+ }
+
if (intervalLength > periodicFraction * boxLength)
{
period = boxLength;
*/
void communicateHaloForces(bool accumulateForces);
+ /*! \brief Get the event synchronizer for the forces ready on device.
+ * \returns The event to synchronize the stream that consumes forces on device.
+ */
+ GpuEventSynchronizer* getForcesReadyOnDeviceEvent();
private:
class Impl;
"A CPU stub for GPU Halo Exchange was called insted of the correct implementation.");
}
+/*!\brief Stub returning the forces-ready-on-device event. */
+GpuEventSynchronizer* GpuHaloExchange::getForcesReadyOnDeviceEvent()
+{
+ GMX_ASSERT(false,
+ "A CPU stub for GPU Halo Exchange was called insted of the correct implementation.");
+ return nullptr;
+}
+
} // namespace gmx
#endif /* GMX_GPU != GMX_GPU_CUDA */
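A minimal sketch of the intended consumer-side use of the new accessor (the caller and stream names are assumptions, not part of this change): the stream that consumes the exchanged forces waits on the event instead of a whole-stream synchronization.

void scheduleForceConsumer(gmx::GpuHaloExchange* haloExchange, cudaStream_t consumerStream)
{
    // Make the consumer stream wait until the exchanged forces are ready on the device.
    GpuEventSynchronizer* fReadyOnDevice = haloExchange->getForcesReadyOnDeviceEvent();
    fReadyOnDevice->enqueueWaitEvent(consumerStream);
    // ... launch kernels that read the halo forces in consumerStream ...
}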
#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
#include "gromacs/gpu_utils/vectype_ops.cuh"
#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/utility/gmxmpi.h"
#include "domdec_internal.h"
launchGpuKernel(kernelFn, config, nullptr, "Domdec GPU Apply X Halo Exchange", kernelArgs);
}
- communicateHaloData(d_x_, HaloQuantity::HaloCoordinates);
+ communicateHaloData(d_x_, HaloQuantity::HaloCoordinates, coordinatesReadyOnDeviceEvent);
return;
}
{
// Communicate halo data (in non-local stream)
- communicateHaloData(d_f_, HaloQuantity::HaloForces);
+ communicateHaloData(d_f_, HaloQuantity::HaloForces, nullptr);
float3* d_f = d_f_;
launchGpuKernel(kernelFn, config, nullptr, "Domdec GPU Apply F Halo Exchange", kernelArgs);
}
+ fReadyOnDevice_.markEvent(nonLocalStream_);
}
-void GpuHaloExchange::Impl::communicateHaloData(float3* d_ptr, HaloQuantity haloQuantity)
+void GpuHaloExchange::Impl::communicateHaloData(float3* d_ptr,
+ HaloQuantity haloQuantity,
+ GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
{
void* sendPtr;
recvRank = recvRankX_;
#if GMX_MPI
- // Wait for signal from receiving task that it is ready, and similarly send signal to task that will push data to this task
- char thisTaskIsReady, remoteTaskIsReady;
- MPI_Sendrecv(&thisTaskIsReady, sizeof(char), MPI_BYTE, recvRank, 0, &remoteTaskIsReady,
- sizeof(char), MPI_BYTE, sendRank, 0, mpi_comm_mysim_, MPI_STATUS_IGNORE);
+ // Wait for event from receiving task that remote coordinates are ready, and enqueue that event to stream used
+ // for subsequent data push. This avoids a race condition with the remote data being written in the previous timestep.
+ // Similarly send event to task that will push data to this task.
+ GpuEventSynchronizer* remoteCoordinatesReadyOnDeviceEvent;
+ MPI_Sendrecv(&coordinatesReadyOnDeviceEvent, sizeof(GpuEventSynchronizer*), MPI_BYTE,
+ recvRank, 0, &remoteCoordinatesReadyOnDeviceEvent, sizeof(GpuEventSynchronizer*),
+ MPI_BYTE, sendRank, 0, mpi_comm_mysim_, MPI_STATUS_IGNORE);
+ remoteCoordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
+#else
+ GMX_UNUSED_VALUE(coordinatesReadyOnDeviceEvent);
#endif
}
else
#endif
}
+GpuEventSynchronizer* GpuHaloExchange::Impl::getForcesReadyOnDeviceEvent()
+{
+ return &fReadyOnDevice_;
+}
+
/*! \brief Create Domdec GPU object */
GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* localStream, void* nonLocalStream) :
dd_(dd),
impl_->communicateHaloForces(accumulateForces);
}
+GpuEventSynchronizer* GpuHaloExchange::getForcesReadyOnDeviceEvent()
+{
+ return impl_->getForcesReadyOnDeviceEvent();
+}
} // namespace gmx
*/
void communicateHaloForces(bool accumulateForces);
+ /*! \brief Get the event synchronizer for the forces ready on device.
+ * \returns The event to synchronize the stream that consumes forces on device.
+ */
+ GpuEventSynchronizer* getForcesReadyOnDeviceEvent();
+
private:
/*! \brief Data transfer wrapper for GPU halo exchange
* \param [inout] d_ptr pointer to coordinates or force buffer in GPU memory
* \param [in] haloQuantity switch on whether X or F halo exchange is being performed
+ * \param [in] coordinatesReadyOnDeviceEvent event recorded when coordinates have been copied to device
*/
- void communicateHaloData(float3* d_ptr, HaloQuantity haloQuantity);
+ void communicateHaloData(float3* d_ptr,
+ HaloQuantity haloQuantity,
+ GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
/*! \brief Data transfer for GPU halo exchange using CUDA memcopies
* \param [inout] sendPtr address to send data from
float3* d_x_ = nullptr;
//! full forces buffer in GPU memory
float3* d_f_ = nullptr;
+ //! An event recorded once the exchanged forces are ready on the GPU
+ GpuEventSynchronizer fReadyOnDevice_;
};
} // namespace gmx
#if GMX_MPI
MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, receiver.rankId, 0, comm_);
+#else
+ GMX_UNUSED_VALUE(sendBuf);
#endif
}
}
MPI_Irecv(&ppSync_[recvCount_], sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_,
&request_[recvCount_]);
recvCount_++;
+#else
+ GMX_UNUSED_VALUE(ppRank);
#endif
}
#if GMX_MPI
MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, receiver.rankId, 0, comm_);
+#else
+ GMX_UNUSED_VALUE(sendBuf);
#endif
}
}
// TODO Using MPI_Isend would be more efficient, particularly when
// sending to multiple PP ranks
MPI_Send(&pmeSyncPtr, sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_);
+#else
+ GMX_UNUSED_VALUE(pmeSyncPtr);
+ GMX_UNUSED_VALUE(ppRank);
#endif
}
messages = 0;
} while (status == -1);
#else
+ GMX_UNUSED_VALUE(pme);
GMX_UNUSED_VALUE(pme_pp);
GMX_UNUSED_VALUE(box);
GMX_UNUSED_VALUE(maxshift_x);
GMX_UNUSED_VALUE(grid_size);
GMX_UNUSED_VALUE(ewaldcoeff_q);
GMX_UNUSED_VALUE(ewaldcoeff_lj);
+ GMX_UNUSED_VALUE(useGpuForPme);
+ GMX_UNUSED_VALUE(stateGpu);
status = pmerecvqxX;
#endif
return status;
}
+#if GMX_MPI
/*! \brief Send force data to PP ranks */
static void sendFToPP(void* sendbuf, PpRanks receiver, gmx_pme_pp* pme_pp, int* messages)
{
}
else
{
-#if GMX_MPI
// Send using MPI
MPI_Isend(sendbuf, receiver.numAtoms * sizeof(rvec), MPI_BYTE, receiver.rankId, 0,
pme_pp->mpi_comm_mysim, &pme_pp->req[*messages]);
*messages = *messages + 1;
-#endif
}
}
+#endif
/*! \brief Send the PME mesh force, virial and energy to the PP-only ranks. */
static void gmx_pme_send_force_vir_ener(const gmx_pme_t& pme,
MPI_Waitall(messages, pme_pp->req.data(), pme_pp->stat.data());
#else
gmx_call("MPI not enabled");
+ GMX_UNUSED_VALUE(pme);
GMX_UNUSED_VALUE(pme_pp);
GMX_UNUSED_VALUE(output);
GMX_UNUSED_VALUE(dvdlambda_q);
}
}
}
-
+#else
+ GMX_UNUSED_VALUE(fr);
+ GMX_UNUSED_VALUE(reinitGpuPmePpComms);
+ GMX_UNUSED_VALUE(sendCoordinatesFromGpu);
+ GMX_UNUSED_VALUE(coordinatesReadyOnDeviceEvent);
#endif
if (!c_useDelayedWait)
{
#if GMX_MPI
MPI_Recv(recvptr, n * sizeof(rvec), MPI_BYTE, cr->dd->pme_nodeid, 0, cr->mpi_comm_mysim,
MPI_STATUS_IGNORE);
+#else
+ GMX_UNUSED_VALUE(cr);
#endif
}
}
void PmePpCommGpu::Impl::reinit(int size)
{
// This rank will access PME rank memory directly, so needs to receive the remote PME buffer addresses.
+#if GMX_MPI
MPI_Recv(&remotePmeXBuffer_, sizeof(void**), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
MPI_Recv(&remotePmeFBuffer_, sizeof(void**), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
// Reallocate buffer used for staging PME force on GPU
reallocateDeviceBuffer(&d_pmeForces_, size, &d_pmeForcesSize_, &d_pmeForcesSizeAlloc_, nullptr);
-
+#else
+ GMX_UNUSED_VALUE(size);
+#endif
return;
}
// launchRecvForceFromPmeCudaDirect() and sycnRecvForceFromPmeCudaDirect()
void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(void* recvPtr, int recvSize, bool receivePmeForceToGpu)
{
-
+#if GMX_MPI
// Receive event from PME task and add to stream, to ensure pull of data doesn't
// occur before PME force calc is completed
GpuEventSynchronizer* pmeSync;
// them with other forces on the CPU
cudaStreamSynchronize(pmePpCommStream_);
}
+#else
+ GMX_UNUSED_VALUE(recvPtr);
+ GMX_UNUSED_VALUE(recvSize);
+ GMX_UNUSED_VALUE(receivePmeForceToGpu);
+#endif
}
void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaDirect(void* sendPtr,
bool gmx_unused sendPmeCoordinatesFromGpu,
GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
{
-
+#if GMX_MPI
// ensure stream waits until coordinate data is available on device
coordinatesReadyOnDeviceEvent->enqueueWaitEvent(pmePpCommStream_);
pmeCoordinatesSynchronizer_.markEvent(pmePpCommStream_);
GpuEventSynchronizer* pmeSync = &pmeCoordinatesSynchronizer_;
MPI_Send(&pmeSync, sizeof(GpuEventSynchronizer*), MPI_BYTE, pmeRank_, 0, comm_);
+#else
+ GMX_UNUSED_VALUE(sendPtr);
+ GMX_UNUSED_VALUE(sendSize);
+ GMX_UNUSED_VALUE(sendPmeCoordinatesFromGpu);
+ GMX_UNUSED_VALUE(coordinatesReadyOnDeviceEvent);
+#endif
}
void* PmePpCommGpu::Impl::getGpuForceStagingPtr()
{
const MrcDensityMapOfFloatReader& reader() const;
private:
- std::vector<char> buffer_;
- InMemoryDeserializer serializer_;
- const MrcDensityMapOfFloatReader reader_;
+ const std::vector<char> buffer_;
+ std::unique_ptr<InMemoryDeserializer> serializer_;
+ std::unique_ptr<MrcDensityMapOfFloatReader> reader_;
};
MrcDensityMapOfFloatFromFileReader::Impl::Impl(const std::string& filename) :
buffer_(readCharBufferFromFile(filename)),
- serializer_(buffer_, false),
- reader_(&serializer_)
+ serializer_(std::make_unique<InMemoryDeserializer>(buffer_, false)),
+ reader_(std::make_unique<MrcDensityMapOfFloatReader>(serializer_.get()))
{
- layout_right::mapping<dynamicExtents3D> map(getDynamicExtents3D(reader_.header()));
- if (map.required_span_size() != reader_.constView().ssize())
+ if (!mrcHeaderIsSane(reader_->header()))
+ {
+ serializer_ = std::make_unique<InMemoryDeserializer>(buffer_, false, EndianSwapBehavior::DoSwap);
+ reader_ = std::make_unique<MrcDensityMapOfFloatReader>(serializer_.get());
+ if (!mrcHeaderIsSane(reader_->header()))
+ {
+ GMX_THROW(FileIOError(
+ "Header of '" + filename
+ + "' fails sanity check for little- as well as big-endian reading."));
+ }
+ }
+
+ layout_right::mapping<dynamicExtents3D> map(getDynamicExtents3D(reader_->header()));
+ if (map.required_span_size() != reader_->constView().ssize())
{
GMX_THROW(FileIOError("File header density extent information of " + filename
+ "' does not match density data size"));
const MrcDensityMapOfFloatReader& MrcDensityMapOfFloatFromFileReader::Impl::reader() const
{
- return reader_;
+ return *reader_;
}
/********************************************************************
* upon construction and returns coordinate transformation into the density
* lattice as well as the density data.
*
+ * Attempts reading with swapped endianness if the header is not sane.
+ *
+ * Performs basic sanity checks on header information and data size.
+ *
* \note File reading is completed during construction. When the constructor
 * completes successfully, transformation to density lattice and density
* data are valid, irrespective of the state of the read file.
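Minimal usage sketch of the behaviour described above (only the constructor and the thrown FileIOError are taken from this change; the file name is illustrative):

try
{
    gmx::MrcDensityMapOfFloatFromFileReader reader("reference.mrc");
    // ... obtain the lattice transformation and density data from the reader ...
}
catch (const gmx::FileIOError&)
{
    // The header failed the sanity check for both little- and big-endian reading.
}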
namespace gmx
{
+
+namespace
+{
+
+//! Returns true if any of the argument values is smaller than zero.
+template<typename Container>
+bool anySmallerZero(Container values)
+{
+ return std::any_of(std::begin(values), std::end(values), [](auto v) { return v < 0; });
+}
+
+//! Returns true if any of the argument values is larger than a given boundary value.
+template<typename Container>
+bool anyLargerThanValue(Container values, typename Container::value_type boundaryValue)
+{
+ return std::any_of(std::begin(values), std::end(values),
+ [boundaryValue](auto v) { return v > boundaryValue; });
+}
+
+} // namespace
+
size_t numberOfExpectedDataItems(const MrcDensityMapHeader& header)
{
- if (std::any_of(std::begin(header.numColumnRowSection_), std::end(header.numColumnRowSection_),
- [](auto i) { return i < 0; }))
+ if (anySmallerZero(header.numColumnRowSection_))
{
GMX_THROW(
InternalError("Cannot determine data size, because the mrc "
header.numColumnRowSection_[XX] };
};
+bool mrcHeaderIsSane(const MrcDensityMapHeader& header)
+{
+ // Make sure all numbers of columns, rows and sections, as well as extents
+ // and cell angles, are positive
+ if (anySmallerZero(header.numColumnRowSection_) || anySmallerZero(header.cellAngles_)
+ || anySmallerZero(header.extent_))
+ {
+ return false;
+ }
+
+ // The maximum integer number in an mrc header to be considered sane
+ constexpr std::int32_t c_maxIntegerNumber = 100'000;
+ if (anyLargerThanValue(header.numColumnRowSection_, c_maxIntegerNumber)
+ || anyLargerThanValue(header.extent_, c_maxIntegerNumber))
+ {
+ return false;
+ }
+
+ constexpr std::int32_t c_maxCellAngle = 360;
+ if (anyLargerThanValue(header.cellAngles_, c_maxCellAngle))
+ {
+ return false; //NOLINT(readability-simplify-boolean-expr)
+ }
+
+ return true;
+}
+
} // namespace gmx
* \returns density data extents in three dimensions.
*/
dynamicExtents3D getDynamicExtents3D(const MrcDensityMapHeader& header);
+
+/*! \brief Checks if the values in the header are sane.
+ *
+ * Checks extents and numbers of columns, rows and sections, as well as unit
+ * cell angles for positivity and to be within bounds.
+ *
+ * Bounds are set generously so as not to hamper future creative uses of mrc files.
+ *
+ * \returns true if all header values are within reasonable albeit generous bounds
+ */
+bool mrcHeaderIsSane(const MrcDensityMapHeader& header);
+
} // namespace gmx
#endif /* end of include guard: GMX_FILEIO_MRCDENSITYMAPHEADER_H */
n = sscanf(line, format.c_str(), &aj);
if (n == 1)
{
- srenew(con->conect, ++con->nconect);
- con->conect[con->nconect - 1].ai = ai - 1;
- con->conect[con->nconect - 1].aj = aj - 1;
+ gmx_conect_add(con, ai - 1, aj - 1); /* to prevent duplicated records */
}
} while (n == 1);
}
EXPECT_EQ(expectedExtents[ZZ], extents.extent(ZZ));
}
+TEST(MrcDensityMapHeaderTest, IsSane)
+{
+ MrcDensityMapHeader header;
+ EXPECT_TRUE(mrcHeaderIsSane(header));
+
+ header.numColumnRowSection_[YY] = -1;
+ EXPECT_FALSE(mrcHeaderIsSane(header));
+
+ header.numColumnRowSection_[YY] = 10'000'000;
+ EXPECT_FALSE(mrcHeaderIsSane(header));
+
+ header = {};
+ EXPECT_TRUE(mrcHeaderIsSane(header));
+
+ header.cellAngles_[XX] = -20;
+ EXPECT_FALSE(mrcHeaderIsSane(header));
+}
+
} // namespace gmx
#include "gromacs/gpu_utils/devicebuffer_datatype.h"
#include "gromacs/gpu_utils/gpu_utils.h" //only for GpuApiCallBehavior
#include "gromacs/gpu_utils/gputraits_ocl.h"
+#include "gromacs/gpu_utils/oclutils.h"
#include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/stringutil.h"
/*! \libinternal \brief
* Allocates a device-side buffer.
cl_int clError;
*buffer = clCreateBuffer(deviceContext, CL_MEM_READ_WRITE, numValues * sizeof(ValueType),
hostPtr, &clError);
- GMX_RELEASE_ASSERT(clError == CL_SUCCESS, "clCreateBuffer failure");
+ GMX_RELEASE_ASSERT(clError == CL_SUCCESS,
+ gmx::formatString("clCreateBuffer failure (OpenCL error %d: %s)", clError,
+ ocl_get_error_string(clError).c_str())
+ .c_str());
}
/*! \brief
GMX_ASSERT(buffer, "needs a buffer pointer");
if (*buffer)
{
- GMX_RELEASE_ASSERT(clReleaseMemObject(*buffer) == CL_SUCCESS, "clReleaseMemObject failed");
+ cl_int clError = clReleaseMemObject(*buffer);
+ GMX_RELEASE_ASSERT(clError == CL_SUCCESS,
+ gmx::formatString("clReleaseMemObject failed (OpenCL error %d: %s)",
+ clError, ocl_get_error_string(clError).c_str())
+ .c_str());
}
}
case GpuApiCallBehavior::Async:
clError = clEnqueueWriteBuffer(stream, *buffer, CL_FALSE, offset, bytes, hostBuffer, 0,
nullptr, timingEvent);
- GMX_RELEASE_ASSERT(clError == CL_SUCCESS, "Asynchronous H2D copy failed");
+ GMX_RELEASE_ASSERT(
+ clError == CL_SUCCESS,
+ gmx::formatString("Asynchronous H2D copy failed (OpenCL error %d: %s)", clError,
+ ocl_get_error_string(clError).c_str())
+ .c_str());
break;
case GpuApiCallBehavior::Sync:
clError = clEnqueueWriteBuffer(stream, *buffer, CL_TRUE, offset, bytes, hostBuffer, 0,
nullptr, timingEvent);
- GMX_RELEASE_ASSERT(clError == CL_SUCCESS, "Synchronous H2D copy failed");
+ GMX_RELEASE_ASSERT(
+ clError == CL_SUCCESS,
+ gmx::formatString("Synchronous H2D copy failed (OpenCL error %d: %s)", clError,
+ ocl_get_error_string(clError).c_str())
+ .c_str());
break;
default: throw;
case GpuApiCallBehavior::Async:
clError = clEnqueueReadBuffer(stream, *buffer, CL_FALSE, offset, bytes, hostBuffer, 0,
nullptr, timingEvent);
- GMX_RELEASE_ASSERT(clError == CL_SUCCESS, "Asynchronous D2H copy failed");
+ GMX_RELEASE_ASSERT(
+ clError == CL_SUCCESS,
+ gmx::formatString("Asynchronous D2H copy failed (OpenCL error %d: %s)", clError,
+ ocl_get_error_string(clError).c_str())
+ .c_str());
break;
case GpuApiCallBehavior::Sync:
clError = clEnqueueReadBuffer(stream, *buffer, CL_TRUE, offset, bytes, hostBuffer, 0,
nullptr, timingEvent);
- GMX_RELEASE_ASSERT(clError == CL_SUCCESS, "Synchronous D2H copy failed");
+ GMX_RELEASE_ASSERT(
+ clError == CL_SUCCESS,
+ gmx::formatString("Synchronous D2H copy failed (OpenCL error %d: %s)", clError,
+ ocl_get_error_string(clError).c_str())
+ .c_str());
break;
default: throw;
cl_event commandEvent;
cl_int clError = clEnqueueFillBuffer(stream, *buffer, &pattern, sizeof(pattern), offset, bytes,
numWaitEvents, waitEvents, &commandEvent);
- GMX_RELEASE_ASSERT(clError == CL_SUCCESS, "Couldn't clear the device buffer");
+ GMX_RELEASE_ASSERT(clError == CL_SUCCESS,
+ gmx::formatString("Couldn't clear the device buffer (OpenCL error %d: %s)",
+ clError, ocl_get_error_string(clError).c_str())
+ .c_str());
}
#endif
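The assertions above all follow the same pattern; a small helper along these lines (a sketch, not part of this change) could reduce the repetition:

static void assertClSuccess(cl_int clError, const char* what)
{
    // Formats the OpenCL error code and string into the assertion message.
    GMX_RELEASE_ASSERT(clError == CL_SUCCESS,
                       gmx::formatString("%s (OpenCL error %d: %s)", what, clError,
                                         ocl_get_error_string(clError).c_str())
                               .c_str());
}

// Usage: assertClSuccess(clEnqueueWriteBuffer(...), "Asynchronous H2D copy failed");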
// Preprocessor variable for whether the hwloc API version is 1.x.x or 2.x.x
# if HWLOC_API_VERSION >= 0x00020000
# define GMX_HWLOC_API_VERSION_IS_2XX 1
+# if GMX_HWLOC_API_VERSION < 0x00020000
+# error "HWLOC library major version set during configuration is 1, but currently using version 2 headers"
+# endif
# else
# define GMX_HWLOC_API_VERSION_IS_2XX 0
+# if GMX_HWLOC_API_VERSION >= 0x00020000
+# error "HWLOC library major version set during configuration is 2, but currently using version 1 headers"
+# endif
# endif
/*****************************************************************************
// Flags to look for io devices
# if GMX_HWLOC_API_VERSION_IS_2XX
+ GMX_RELEASE_ASSERT(
+ (hwloc_get_api_version() >= 0x20000),
+ "Mismatch between hwloc headers and library, using v2 headers with v1 library");
hwloc_topology_set_io_types_filter(topo, HWLOC_TYPE_FILTER_KEEP_IMPORTANT);
# else
+ GMX_RELEASE_ASSERT(
+ (hwloc_get_api_version() < 0x20000),
+ "Mismatch between hwloc headers and library, using v1 headers with v2 library");
hwloc_topology_set_flags(topo, HWLOC_TOPOLOGY_FLAG_IO_DEVICES);
# endif
int nr_atoms,
rvec x[],
const unsigned short cFREEZE[],
- t_nrnb* nrnb)
+ t_nrnb* nrnb,
+ const bool scaleCoordinates)
{
ivec* nFreeze = ir->opts.nFreeze;
int d;
#endif
/* Scale the positions */
-#pragma omp parallel for num_threads(nthreads) schedule(static)
- for (int n = start; n < start + nr_atoms; n++)
+ if (scaleCoordinates)
{
- // Trivial OpenMP region that does not throw
- int g;
-
- if (cFREEZE == nullptr)
- {
- g = 0;
- }
- else
+#pragma omp parallel for num_threads(nthreads) schedule(static)
+ for (int n = start; n < start + nr_atoms; n++)
{
- g = cFREEZE[n];
- }
+ // Trivial OpenMP region that does not throw
+ int g;
- if (!nFreeze[g][XX])
- {
- x[n][XX] = mu[XX][XX] * x[n][XX] + mu[YY][XX] * x[n][YY] + mu[ZZ][XX] * x[n][ZZ];
- }
- if (!nFreeze[g][YY])
- {
- x[n][YY] = mu[YY][YY] * x[n][YY] + mu[ZZ][YY] * x[n][ZZ];
- }
- if (!nFreeze[g][ZZ])
- {
- x[n][ZZ] = mu[ZZ][ZZ] * x[n][ZZ];
+ if (cFREEZE == nullptr)
+ {
+ g = 0;
+ }
+ else
+ {
+ g = cFREEZE[n];
+ }
+
+ if (!nFreeze[g][XX])
+ {
+ x[n][XX] = mu[XX][XX] * x[n][XX] + mu[YY][XX] * x[n][YY] + mu[ZZ][XX] * x[n][ZZ];
+ }
+ if (!nFreeze[g][YY])
+ {
+ x[n][YY] = mu[YY][YY] * x[n][YY] + mu[ZZ][YY] * x[n][ZZ];
+ }
+ if (!nFreeze[g][ZZ])
+ {
+ x[n][ZZ] = mu[ZZ][ZZ] * x[n][ZZ];
+ }
}
}
/* compute final boxlengths */
* \todo Check if the force should be set to zero here.
* \todo This kernel can also accumulate incidental temperatures for each atom.
*
- * \tparam numTempScaleValues The number of different T-couple values.
- * \tparam velocityScaling Type of the Parrinello-Rahman velocity rescaling.
- * \param[in] numAtoms Total number of atoms.
- * \param[in,out] gm_x Coordinates to update upon integration.
- * \param[out] gm_xp A copy of the coordinates before the integration (for constraints).
- * \param[in,out] gm_v Velocities to update.
- * \param[in] gm_f Atomic forces.
- * \param[in] gm_inverseMasses Reciprocal masses.
- * \param[in] dt Timestep.
- * \param[in] gm_lambdas Temperature scaling factors (one per group)
- * \param[in] gm_tempScaleGroups Mapping of atoms into groups.
- * \param[in] dtPressureCouple Time step for pressure coupling
- * \param[in] velocityScalingMatrixDiagonal Diagonal elements of Parrinello-Rahman velocity scaling matrix
+ * \tparam numTempScaleValues The number of different T-couple values.
+ * \tparam velocityScaling Type of the Parrinello-Rahman velocity rescaling.
+ * \param[in] numAtoms Total number of atoms.
+ * \param[in,out] gm_x Coordinates to update upon integration.
+ * \param[out] gm_xp A copy of the coordinates before the integration (for constraints).
+ * \param[in,out] gm_v Velocities to update.
+ * \param[in] gm_f Atomic forces.
+ * \param[in] gm_inverseMasses Reciprocal masses.
+ * \param[in] dt Timestep.
+ * \param[in] gm_lambdas Temperature scaling factors (one per group)
+ * \param[in] gm_tempScaleGroups Mapping of atoms into groups.
+ * \param[in] dtPressureCouple Time step for pressure coupling
+ * \param[in] prVelocityScalingMatrixDiagonal Diagonal elements of Parrinello-Rahman velocity scaling matrix
*/
template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
__launch_bounds__(c_maxThreadsPerBlock) __global__
const float dt,
const float* __restrict__ gm_lambdas,
const unsigned short* __restrict__ gm_tempScaleGroups,
- const float3 velocityScalingMatrixDiagonal);
+ const float3 prVelocityScalingMatrixDiagonal);
template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
__launch_bounds__(c_maxThreadsPerBlock) __global__
const float dt,
const float* __restrict__ gm_lambdas,
const unsigned short* __restrict__ gm_tempScaleGroups,
- const float3 velocityScalingMatrixDiagonal)
+ const float3 prVelocityScalingMatrixDiagonal)
{
int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
if (threadIndex < numAtoms)
if (velocityScaling == VelocityScalingType::Diagonal)
{
- vp.x -= velocityScalingMatrixDiagonal.x * v.x;
- vp.y -= velocityScalingMatrixDiagonal.y * v.y;
- vp.z -= velocityScalingMatrixDiagonal.z * v.z;
+ vp.x -= prVelocityScalingMatrixDiagonal.x * v.x;
+ vp.y -= prVelocityScalingMatrixDiagonal.y * v.y;
+ vp.z -= prVelocityScalingMatrixDiagonal.z * v.z;
}
v = vp;
/*! \brief Select templated kernel.
*
- * Returns pointer to a CUDA kernel based on the number of temperature coupling groups.
- * If zero is passed as an argument, it is assumed that no temperature coupling groups are used.
+ * Returns pointer to a CUDA kernel based on the number of temperature coupling groups and
+ * whether temperature and/or pressure coupling is enabled.
*
- * \param[in] numTempScaleValues Numer of temperature coupling groups in the system
- * \param[in] velocityScaling Type of the Parrinello-Rahman velocity scaling
+ * \param[in] doTemperatureScaling If the kernel with temperature coupling velocity scaling
+ * should be selected.
+ * \param[in] numTempScaleValues Number of temperature coupling groups in the system.
+ * \param[in] prVelocityScalingType Type of the Parrinello-Rahman velocity scaling.
*
 * \returns Pointer to CUDA kernel
*/
-inline auto selectLeapFrogKernelPtr(int numTempScaleValues, VelocityScalingType velocityScaling)
+inline auto selectLeapFrogKernelPtr(bool doTemperatureScaling,
+ int numTempScaleValues,
+ VelocityScalingType prVelocityScalingType)
{
+ // Check input for consistency: if there is temperature coupling, at least one coupling group should be defined.
+ GMX_ASSERT(!doTemperatureScaling || (numTempScaleValues > 0),
+ "Temperature coupling was requested with no temperature coupling groups.");
auto kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;
- if (velocityScaling == VelocityScalingType::None)
+ if (prVelocityScalingType == VelocityScalingType::None)
{
- if (numTempScaleValues == 0)
+ if (!doTemperatureScaling)
{
kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;
}
{
kernelPtr = leapfrog_kernel<NumTempScaleValues::Multiple, VelocityScalingType::None>;
}
- else
- {
- GMX_RELEASE_ASSERT(false,
- "Number of temperature coupling groups should be greater than zero "
- "(zero for no coupling).");
- }
}
- else if (velocityScaling == VelocityScalingType::Diagonal)
+ else if (prVelocityScalingType == VelocityScalingType::Diagonal)
{
- if (numTempScaleValues == 0)
+ if (!doTemperatureScaling)
{
kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::Diagonal>;
}
{
kernelPtr = leapfrog_kernel<NumTempScaleValues::Multiple, VelocityScalingType::Diagonal>;
}
- else
- {
- GMX_RELEASE_ASSERT(false,
- "Number of temperature coupling groups should be greater than zero "
- "(zero for no coupling).");
- }
}
else
{
float3* d_v,
const float3* d_f,
const real dt,
- const bool doTempCouple,
+ const bool doTemperatureScaling,
gmx::ArrayRef<const t_grp_tcstat> tcstat,
- const bool doPressureCouple,
+ const bool doParrinelloRahman,
const float dtPressureCouple,
- const matrix velocityScalingMatrix)
+ const matrix prVelocityScalingMatrix)
{
ensureNoPendingCudaError("In CUDA version of Leap-Frog integrator");
auto kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;
- if (doTempCouple || doPressureCouple)
+ if (doTemperatureScaling || doParrinelloRahman)
{
- if (doTempCouple)
+ if (doTemperatureScaling)
{
GMX_ASSERT(numTempScaleValues_ == ssize(h_lambdas_),
"Number of temperature scaling factors changed since it was set for the "
copyToDeviceBuffer(&d_lambdas_, h_lambdas_.data(), 0, numTempScaleValues_,
commandStream_, GpuApiCallBehavior::Async, nullptr);
}
- VelocityScalingType velocityScaling = VelocityScalingType::None;
- if (doPressureCouple)
+ VelocityScalingType prVelocityScalingType = VelocityScalingType::None;
+ if (doParrinelloRahman)
{
- velocityScaling = VelocityScalingType::Diagonal;
- GMX_ASSERT(velocityScalingMatrix[YY][XX] == 0 && velocityScalingMatrix[ZZ][XX] == 0
- && velocityScalingMatrix[ZZ][YY] == 0 && velocityScalingMatrix[XX][YY] == 0
- && velocityScalingMatrix[XX][ZZ] == 0 && velocityScalingMatrix[YY][ZZ] == 0,
+ prVelocityScalingType = VelocityScalingType::Diagonal;
+ GMX_ASSERT(prVelocityScalingMatrix[YY][XX] == 0 && prVelocityScalingMatrix[ZZ][XX] == 0
+ && prVelocityScalingMatrix[ZZ][YY] == 0
+ && prVelocityScalingMatrix[XX][YY] == 0
+ && prVelocityScalingMatrix[XX][ZZ] == 0
+ && prVelocityScalingMatrix[YY][ZZ] == 0,
"Fully anisotropic Parrinello-Rahman pressure coupling is not yet supported "
"in GPU version of Leap-Frog integrator.");
- velocityScalingMatrixDiagonal_ =
- make_float3(dtPressureCouple * velocityScalingMatrix[XX][XX],
- dtPressureCouple * velocityScalingMatrix[YY][YY],
- dtPressureCouple * velocityScalingMatrix[ZZ][ZZ]);
+ prVelocityScalingMatrixDiagonal_ =
+ make_float3(dtPressureCouple * prVelocityScalingMatrix[XX][XX],
+ dtPressureCouple * prVelocityScalingMatrix[YY][YY],
+ dtPressureCouple * prVelocityScalingMatrix[ZZ][ZZ]);
}
- kernelPtr = selectLeapFrogKernelPtr(numTempScaleValues_, velocityScaling);
+ kernelPtr = selectLeapFrogKernelPtr(doTemperatureScaling, numTempScaleValues_, prVelocityScalingType);
}
const auto kernelArgs = prepareGpuKernelArguments(
kernelPtr, kernelLaunchConfig_, &numAtoms_, &d_x, &d_xp, &d_v, &d_f, &d_inverseMasses_,
- &dt, &d_lambdas_, &d_tempScaleGroups_, &velocityScalingMatrixDiagonal_);
+ &dt, &d_lambdas_, &d_tempScaleGroups_, &prVelocityScalingMatrixDiagonal_);
launchGpuKernel(kernelPtr, kernelLaunchConfig_, nullptr, "leapfrog_kernel", kernelArgs);
return;
* Integrates the equation of motion using Leap-Frog algorithm.
* Updates coordinates and velocities on the GPU. The current coordinates are saved for constraints.
*
- * \param[in,out] d_x Coordinates to update
- * \param[out] d_xp Place to save the values of initial coordinates coordinates to.
- * \param[in,out] d_v Velocities (will be updated).
- * \param[in] d_f Forces.
- * \param[in] dt Timestep.
- * \param[in] doTempCouple If the temperature coupling should be applied.
- * \param[in] tcstat Temperature coupling data.
- * \param[in] doPressureCouple If the temperature coupling should be applied.
- * \param[in] dtPressureCouple Period between pressure coupling steps
- * \param[in] velocityScalingMatrix Parrinello-Rahman velocity scaling matrix
+ * \param[in,out] d_x Coordinates to update
+ * \param[out] d_xp Place to save the values of the initial coordinates to.
+ * \param[in,out] d_v Velocities (will be updated).
+ * \param[in] d_f Forces.
+ * \param[in] dt Timestep.
+ * \param[in] doTemperatureScaling If velocities should be scaled for temperature coupling.
+ * \param[in] tcstat Temperature coupling data.
+ * \param[in] doParrinelloRahman If current step is a Parrinello-Rahman pressure coupling step.
+ * \param[in] dtPressureCouple Period between pressure coupling steps
+ * \param[in] prVelocityScalingMatrix Parrinello-Rahman velocity scaling matrix
*/
void integrate(const float3* d_x,
float3* d_xp,
float3* d_v,
const float3* d_f,
const real dt,
- const bool doTempCouple,
+ const bool doTemperatureScaling,
gmx::ArrayRef<const t_grp_tcstat> tcstat,
- const bool doPressureCouple,
+ const bool doParrinelloRahman,
const float dtPressureCouple,
- const matrix velocityScalingMatrix);
+ const matrix prVelocityScalingMatrix);
/*! \brief Set the integrator
*
//! Maximum size of the temperature coupling groups array
int numTempScaleGroupsAlloc_ = -1;
- //! Vector with diagonal elements of the pressure coupling velocity rescale factors
- float3 velocityScalingMatrixDiagonal_;
+ //! Vector with diagonal elements of the Parrinello-Rahman pressure coupling velocity rescale factors
+ float3 prVelocityScalingMatrixDiagonal_;
};
} // namespace gmx
/*! \brief Helper function to go through constraints recursively.
*
- * For each constraint, counts the number of coupled constraints and stores the value in spaceNeeded array.
+ * For each constraint, counts the number of coupled constraints and stores the value in numCoupledConstraints array.
* This information is used to split the array of constraints between thread blocks on a GPU so there is no
- * coupling between constraints from different thread blocks. After the 'spaceNeeded' array is filled, the
- * value spaceNeeded[c] should be equal to the number of constraints that are coupled to 'c' and located
+ * coupling between constraints from different thread blocks. After the 'numCoupledConstraints' array is filled, the
+ * value numCoupledConstraints[c] should be equal to the number of constraints that are coupled to 'c' and located
* after it in the constraints array.
*
- * \param[in] a Atom index.
- * \param[in,out] spaceNeeded Indicates if the constraint was already counted and stores
- * the number of constraints (i) connected to it and (ii) located
- * after it in memory. This array is filled by this recursive function.
- * For a set of coupled constraints, only for the first one in this list
- * the number of consecutive coupled constraints is needed: if there is
- * not enough space for this set of constraints in the thread block,
- * the group has to be moved to the next one.
- * \param[in] atomAdjacencyList Stores information about connections between atoms.
+ * \param[in] a Atom index.
+ * \param[in,out] numCoupledConstraints Indicates if the constraint was already counted and stores
+ * the number of constraints (i) connected to it and (ii) located
+ * after it in memory. This array is filled by this recursive function.
+ * For a set of coupled constraints, only for the first one in this list
+ * the number of consecutive coupled constraints is needed: if there is
+ * not enough space for this set of constraints in the thread block,
+ * the group has to be moved to the next one.
+ * \param[in] atomsAdjacencyList Stores information about connections between atoms.
*/
-inline int countCoupled(int a,
- std::vector<int>* spaceNeeded,
- std::vector<std::vector<std::tuple<int, int, int>>>* atomsAdjacencyList)
+inline int countCoupled(int a,
+ ArrayRef<int> numCoupledConstraints,
+ ArrayRef<const std::vector<std::tuple<int, int, int>>> atomsAdjacencyList)
{
- int c2, a2, sign;
int counted = 0;
- for (unsigned i = 0; i < atomsAdjacencyList->at(a).size(); i++)
+ for (const auto& adjacentAtom : atomsAdjacencyList[a])
{
- std::tie(a2, c2, sign) = atomsAdjacencyList->at(a).at(i);
- if (spaceNeeded->at(c2) == -1)
+ const int c2 = std::get<1>(adjacentAtom);
+ if (numCoupledConstraints[c2] == -1)
{
- spaceNeeded->at(c2) = 0; // To indicate we've been here
- counted += 1 + countCoupled(a2, spaceNeeded, atomsAdjacencyList);
+ numCoupledConstraints[c2] = 0; // To indicate we've been here
+ counted += 1 + countCoupled(std::get<0>(adjacentAtom), numCoupledConstraints, atomsAdjacencyList);
}
}
return counted;
}
+/*! \brief Add constraint to \p splitMap with all constraints coupled to it.
+ *
+ * Adds the constraint \p c from the constrain list \p iatoms to the map \p splitMap
+ * if it was not yet added. Then goes through all the constraints coupled to \p c
+ * and calls itself recursively. This ensures that all the coupled constraints will
+ * be added to neighboring locations in the final data structures on the device,
+ * hence mapping all coupled constraints to the same thread block. A value of -1 in
+ * the \p splitMap is used to flag that constraint was not yet added to the \p splitMap.
+ *
+ * \param[in] iatoms The list of constraints.
+ * \param[in] stride Number of elements per constraint in \p iatoms.
+ * \param[in] atomsAdjacencyList Information about connections between atoms.
+ * \param[out] splitMap Map of sequential constraint indexes to the indexes used on the device.
+ * \param[in] c Sequential index for constraint to consider adding.
+ * \param[in,out] currentMapIndex The rolling index for the constraints mapping.
+ */
+inline void addWithCoupled(const t_iatom* iatoms,
+ const int stride,
+ ArrayRef<const std::vector<std::tuple<int, int, int>>> atomsAdjacencyList,
+ ArrayRef<int> splitMap,
+ const int c,
+ int* currentMapIndex)
+{
+ if (splitMap[c] == -1)
+ {
+ splitMap[c] = *currentMapIndex;
+ (*currentMapIndex)++;
+
+ // Constraints, coupled through both atoms.
+ for (int atomIndexInConstraint = 0; atomIndexInConstraint < 2; atomIndexInConstraint++)
+ {
+ const int a1 = iatoms[stride * c + 1 + atomIndexInConstraint];
+ for (const auto& adjacentAtom : atomsAdjacencyList[a1])
+ {
+ const int c2 = std::get<1>(adjacentAtom);
+ if (c2 != c)
+ {
+ addWithCoupled(iatoms, stride, atomsAdjacencyList, splitMap, c2, currentMapIndex);
+ }
+ }
+ }
+ }
+}
+
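A standalone, simplified model of the recursive mapping (illustrative only; real constraints couple through shared atoms and carry more data):

#include <cassert>
#include <vector>

// Simplified stand-in for addWithCoupled: coupled constraints must receive
// consecutive map indices so they end up in the same thread block.
static void addWithCoupledDemo(const std::vector<std::vector<int>>& coupledTo,
                               std::vector<int>*                    splitMap,
                               int                                  c,
                               int*                                 next)
{
    if ((*splitMap)[c] == -1)
    {
        (*splitMap)[c] = (*next)++;
        for (int c2 : coupledTo[c])
        {
            addWithCoupledDemo(coupledTo, splitMap, c2, next);
        }
    }
}

int main()
{
    // Constraints 0 and 1 are coupled; constraint 2 is independent.
    const std::vector<std::vector<int>> coupledTo = { { 1 }, { 0 }, {} };
    std::vector<int>                    splitMap(3, -1);
    int                                 next = 0;
    for (int c = 0; c < 3; c++)
    {
        addWithCoupledDemo(coupledTo, &splitMap, c, &next);
    }
    assert(splitMap[0] == 0 && splitMap[1] == 1 && splitMap[2] == 2); // coupled pair is adjacent
    return 0;
}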
void LincsCuda::set(const t_idef& idef, const t_mdatoms& md)
{
int numAtoms = md.nr;
atomsAdjacencyList.at(a2).push_back(std::make_tuple(a1, c, -1));
}
- // Compute, how many coupled constraints are in front of each constraint.
- // Needed to introduce splits in data so that all coupled constraints will be computed in a single GPU block.
- // The position 'c' of the vector spaceNeeded should have the number of constraints that are coupled to a constraint
- // 'c' and are after 'c' in the vector. Only first index of the connected group of the constraints is needed later in the
- // code, hence the spaceNeeded vector is also used to keep track if the constrain was already counted.
- std::vector<int> spaceNeeded;
- spaceNeeded.resize(numConstraints, -1);
- std::fill(spaceNeeded.begin(), spaceNeeded.end(), -1);
+ // Compute, how many constraints are coupled to each constraint.
+ // Needed to introduce splits in data so that all coupled constraints will be computed in a
+ // single GPU block. The position 'c' of the vector numCoupledConstraints should have the number
+ // of constraints that are coupled to a constraint 'c' and are after 'c' in the vector. Only
+ // first index of the connected group of the constraints is needed later in the code, hence the
+ // numCoupledConstraints vector is also used to keep track of whether the constraint was already counted.
+ std::vector<int> numCoupledConstraints(numConstraints, -1);
for (int c = 0; c < numConstraints; c++)
{
int a1 = iatoms[stride * c + 1];
int a2 = iatoms[stride * c + 2];
- if (spaceNeeded.at(c) == -1)
+ if (numCoupledConstraints.at(c) == -1)
{
- spaceNeeded.at(c) = countCoupled(a1, &spaceNeeded, &atomsAdjacencyList)
- + countCoupled(a2, &spaceNeeded, &atomsAdjacencyList);
+ numCoupledConstraints.at(c) = countCoupled(a1, numCoupledConstraints, atomsAdjacencyList)
+ + countCoupled(a2, numCoupledConstraints, atomsAdjacencyList);
}
}
// Map of splits in the constraints data. For each 'old' constraint index gives 'new' which
// takes into account the empty spaces which might be needed at the end of each thread block.
- std::vector<int> splitMap;
- splitMap.resize(numConstraints, -1);
- int currentMapIndex = 0;
+ std::vector<int> splitMap(numConstraints, -1);
+ int currentMapIndex = 0;
for (int c = 0; c < numConstraints; c++)
{
// Check if coupled constraints all fit in one block
- GMX_RELEASE_ASSERT(
- spaceNeeded.at(c) < c_threadsPerBlock,
- "Maximum number of coupled constraints exceedes the size of the CUDA thread block. "
- "Most likely, you are trying to use GPU version of LINCS with constraints on "
- "all-bonds, "
- "which is not supported. Try using H-bonds constraints only.");
- if (currentMapIndex / c_threadsPerBlock != (currentMapIndex + spaceNeeded.at(c)) / c_threadsPerBlock)
+ if (numCoupledConstraints.at(c) > c_threadsPerBlock)
+ {
+ gmx_fatal(FARGS,
+ "Maximum number of coupled constraints (%d) exceeds the size of the CUDA "
+ "thread block (%d). Most likely, you are trying to use the GPU version of "
+ "LINCS with constraints on all-bonds, which is not supported for large "
+ "molecules. When compatible with the force field and integration settings, "
+ "using constraints on H-bonds only.",
+ numCoupledConstraints.at(c), c_threadsPerBlock);
+ }
+ if (currentMapIndex / c_threadsPerBlock
+ != (currentMapIndex + numCoupledConstraints.at(c)) / c_threadsPerBlock)
{
currentMapIndex = ((currentMapIndex / c_threadsPerBlock) + 1) * c_threadsPerBlock;
}
- splitMap.at(c) = currentMapIndex;
- currentMapIndex++;
+ addWithCoupled(iatoms, stride, atomsAdjacencyList, splitMap, c, &currentMapIndex);
}
+
kernelParams_.numConstraintsThreads =
currentMapIndex + c_threadsPerBlock - currentMapIndex % c_threadsPerBlock;
GMX_RELEASE_ASSERT(kernelParams_.numConstraintsThreads % c_threadsPerBlock == 0,
maxCoupledConstraints * kernelParams_.numConstraintsThreads, commandStream_,
GpuApiCallBehavior::Sync, nullptr);
- GMX_RELEASE_ASSERT(md.invmass != nullptr, "Masses of attoms should be specified.\n");
+ GMX_RELEASE_ASSERT(md.invmass != nullptr, "Masses of atoms should be specified.\n");
copyToDeviceBuffer(&kernelParams_.d_inverseMasses, md.invmass, 0, numAtoms, commandStream_,
GpuApiCallBehavior::Sync, nullptr);
}
}
}
- // Copy coordinate from the GPU if update is on the GPU and there are forces to be computed on the CPU. At search steps the
- // current coordinates are already on the host, hence copy is not needed.
+ // Copy coordinates from the GPU if update is on the GPU and there are forces to be computed on
+ // the CPU, or for the computation of the virial. At search steps the current coordinates are
+ // already on the host, hence copy is not needed.
if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
- && runScheduleWork->domainWork.haveCpuLocalForceWork)
+ && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial))
{
stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
if (ddUsesGpuDirectCommunication && (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork))
{
/* Wait for non-local coordinate data to be copied from device */
- nbv->wait_nonlocal_x_copy_D2H_done();
+ stateGpu->waitCoordinatesReadyOnHost(AtomLocality::NonLocal);
}
/* Compute the bonded and non-bonded energies and optionally forces */
do_force_lowlevel(fr, inputrec, &(top->idef), cr, ms, nrnb, wcycle, mdatoms, x, hist, &forceOut, enerd,
fr->pmePpCommGpu->getForcesReadySynchronizer())) // buffer received from other GPU
: nullptr; // PME reduction not active on GPU
- gmx::FixedCapacityVector<GpuEventSynchronizer*, 2> dependencyList;
+ gmx::FixedCapacityVector<GpuEventSynchronizer*, 3> dependencyList;
if (stepWork.useGpuPmeFReduction)
{
}
if (useGpuForcesHaloExchange)
{
- // Add a stream synchronization to satisfy a dependency
- // for the local buffer ops on the result of GPU halo
- // exchange, which operates in the non-local stream and
- // writes to to local parf og the force buffer.
- //
- // TODO improve this through use of an event - see Redmine #3093
- // push the event into the dependencyList
- nbv->stream_local_wait_for_nonlocal();
+ dependencyList.push_back(gpuHaloExchange->getForcesReadyOnDeviceEvent());
}
nbv->atomdata_add_nbat_f_to_f_gpu(AtomLocality::Local, stateGpu->getForces(), pmeForcePtr,
dependencyList, stepWork.useGpuPmeFReduction,
matrix pressureCouplingMu,
t_state* state,
t_nrnb* nrnb,
- Update* upd)
+ Update* upd,
+ const bool scaleCoordinates)
{
int start = 0;
int homenr = md->homenr;
berendsen_pcoupl(fplog, step, inputrec, dtpc, pressure, state->box, forceVirial,
constraintVirial, pressureCouplingMu, &state->baros_integral);
berendsen_pscale(inputrec, pressureCouplingMu, state->box, state->box_rel, start,
- homenr, state->x.rvec_array(), md->cFREEZE, nrnb);
+ homenr, state->x.rvec_array(), md->cFREEZE, nrnb, scaleCoordinates);
}
break;
case (epcPARRINELLORAHMAN):
preserve_box_shape(inputrec, state->box_rel, state->box);
/* Scale the coordinates */
- auto x = state->x.rvec_array();
- for (int n = start; n < start + homenr; n++)
+ if (scaleCoordinates)
{
- tmvmul_ur0(pressureCouplingMu, x[n], x[n]);
+ auto x = state->x.rvec_array();
+ for (int n = start; n < start + homenr; n++)
+ {
+ tmvmul_ur0(pressureCouplingMu, x[n], x[n]);
+ }
}
}
break;
matrix pressureCouplingMu,
t_state* state,
t_nrnb* nrnb,
- gmx::Update* upd);
+ gmx::Update* upd,
+ bool scaleCoordinates);
void update_coords(int64_t step,
const t_inputrec* inputrec, /* input record and box stuff */
int nr_atoms,
rvec x[],
const unsigned short cFREEZE[],
- t_nrnb* nrnb);
+ t_nrnb* nrnb,
+ bool scaleCoordinates);
void pleaseCiteCouplingAlgorithms(FILE* fplog, const t_inputrec& ir);
* This will extract temperature scaling factors from tcstat, transform them into the plain
* array and call the normal integrate method.
*
- * \param[in] fReadyOnDevice Event synchronizer indicating that the forces are
- * ready in the device memory.
- * \param[in] dt Timestep.
- * \param[in] updateVelocities If the velocities should be constrained.
- * \param[in] computeVirial If virial should be updated.
- * \param[out] virial Place to save virial tensor.
- * \param[in] doTempCouple If the temperature coupling should be performed.
- * \param[in] tcstat Temperature coupling data.
- * \param[in] doPressureCouple If the temperature coupling should be applied.
- * \param[in] dtPressureCouple Period between pressure coupling steps
- * \param[in] velocityScalingMatrix Parrinello-Rahman velocity scaling matrix
+ * \param[in] fReadyOnDevice Event synchronizer indicating that the forces are
+ * ready in the device memory.
+ * \param[in] dt Timestep.
+ * \param[in] updateVelocities If the velocities should be constrained.
+ * \param[in] computeVirial If virial should be updated.
+ * \param[out] virial Place to save virial tensor.
+ * \param[in] doTemperatureScaling If velocities should be scaled for temperature coupling.
+ * \param[in] tcstat Temperature coupling data.
+ * \param[in] doParrinelloRahman If current step is a Parrinello-Rahman pressure coupling step.
+ * \param[in] dtPressureCouple Period between pressure coupling steps.
+ * \param[in] prVelocityScalingMatrix Parrinello-Rahman velocity scaling matrix.
*/
void integrate(GpuEventSynchronizer* fReadyOnDevice,
real dt,
bool updateVelocities,
bool computeVirial,
tensor virial,
- bool doTempCouple,
+ bool doTemperatureScaling,
gmx::ArrayRef<const t_grp_tcstat> tcstat,
- bool doPressureCouple,
+ bool doParrinelloRahman,
float dtPressureCouple,
- const matrix velocityScalingMatrix);
+ const matrix prVelocityScalingMatrix);
+
+ /*! \brief Scale coordinates on the GPU for the pressure coupling.
+ *
+ * After pressure coupling step, the box size may change. Hence, the coordinates should be
+ * scaled so that all the particles fit in the new box.
+ *
+ * \param[in] scalingMatrix Coordinates scaling matrix.
+ */
+ void scaleCoordinates(const matrix scalingMatrix);
/*! \brief Set the pointers and update data-structures (e.g. after NB search step).
*
{
};
-UpdateConstrainCuda::UpdateConstrainCuda(gmx_unused const t_inputrec& ir,
- gmx_unused const gmx_mtop_t& mtop,
- gmx_unused const void* commandStream,
- gmx_unused GpuEventSynchronizer* xUpdatedOnDevice) :
+UpdateConstrainCuda::UpdateConstrainCuda(const t_inputrec& /* ir */,
+ const gmx_mtop_t& /* mtop */,
+ const void* /* commandStream */,
+ GpuEventSynchronizer* /* xUpdatedOnDevice */) :
impl_(nullptr)
{
GMX_ASSERT(false,
UpdateConstrainCuda::~UpdateConstrainCuda() = default;
-void UpdateConstrainCuda::integrate(gmx_unused GpuEventSynchronizer* fReadyOnDevice,
- gmx_unused const real dt,
- gmx_unused const bool updateVelocities,
- gmx_unused const bool computeVirial,
- gmx_unused tensor virialScaled,
- gmx_unused const bool doTempCouple,
- gmx_unused gmx::ArrayRef<const t_grp_tcstat> tcstat,
- gmx_unused const bool doPressureCouple,
- gmx_unused const float dtPressureCouple,
- gmx_unused const matrix velocityScalingMatrix)
+void UpdateConstrainCuda::integrate(GpuEventSynchronizer* /* fReadyOnDevice */,
+ const real /* dt */,
+ const bool /* updateVelocities */,
+ const bool /* computeVirial */,
+ tensor /* virialScaled */,
+ const bool /* doTemperatureScaling */,
+ gmx::ArrayRef<const t_grp_tcstat> /* tcstat */,
+ const bool /* doParrinelloRahman */,
+ const float /* dtPressureCouple */,
+ const matrix /* prVelocityScalingMatrix */)
{
GMX_ASSERT(false,
"A CPU stub for UpdateConstrain was called instead of the correct implementation.");
}
-void UpdateConstrainCuda::set(gmx_unused DeviceBuffer<float> d_x,
- gmx_unused DeviceBuffer<float> d_v,
- gmx_unused const DeviceBuffer<float> d_f,
- gmx_unused const t_idef& idef,
- gmx_unused const t_mdatoms& md,
- gmx_unused const int numTempScaleValues)
+void UpdateConstrainCuda::scaleCoordinates(const matrix /* scalingMatrix */)
{
GMX_ASSERT(false,
"A CPU stub for UpdateConstrain was called instead of the correct implementation.");
}
-void UpdateConstrainCuda::setPbc(gmx_unused const t_pbc* pbc)
+void UpdateConstrainCuda::set(DeviceBuffer<float> /* d_x */,
+ DeviceBuffer<float> /* d_v */,
+ const DeviceBuffer<float> /* d_f */,
+ const t_idef& /* idef */,
+ const t_mdatoms& /* md */,
+ const int /* numTempScaleValues */)
+{
+ GMX_ASSERT(false,
+ "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
+}
+
+void UpdateConstrainCuda::setPbc(const t_pbc* /* pbc */)
{
GMX_ASSERT(false,
"A CPU stub for UpdateConstrain was called instead of the correct implementation.");
namespace gmx
{
+/*! \brief Number of CUDA threads in a block
+ *
+ * \todo Check if using a smaller block size will lead to better performance.
+ */
+constexpr static int c_threadsPerBlock = 256;
+//! Maximum number of threads in a block (for __launch_bounds__)
+constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock;
+
+/*! \brief Scaling matrix struct.
+ *
+ * \todo Should be generalized.
+ */
+struct ScalingMatrix
+{
+ float xx, yy, zz, yx, zx, zy;
+};
+
+__launch_bounds__(c_maxThreadsPerBlock) __global__
+ static void scaleCoordinates_kernel(const int numAtoms,
+ float3* __restrict__ gm_x,
+ const ScalingMatrix scalingMatrix)
+{
+ int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
+ if (threadIndex < numAtoms)
+ {
+ float3 x = gm_x[threadIndex];
+
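+ // GROMACS boxes are lower triangular, so only the six elements stored in
+ // ScalingMatrix can be non-zero; x picks up contributions from y and z,
+ // y from z, and z is scaled independently.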
+ x.x = scalingMatrix.xx * x.x + scalingMatrix.yx * x.y + scalingMatrix.zx * x.z;
+ x.y = scalingMatrix.yy * x.y + scalingMatrix.zy * x.z;
+ x.z = scalingMatrix.zz * x.z;
+
+ gm_x[threadIndex] = x;
+ }
+}
void UpdateConstrainCuda::Impl::integrate(GpuEventSynchronizer* fReadyOnDevice,
const real dt,
const bool updateVelocities,
const bool computeVirial,
tensor virial,
- const bool doTempCouple,
+ const bool doTemperatureScaling,
gmx::ArrayRef<const t_grp_tcstat> tcstat,
- const bool doPressureCouple,
+ const bool doParrinelloRahman,
const float dtPressureCouple,
- const matrix velocityScalingMatrix)
+ const matrix prVelocityScalingMatrix)
{
// Clearing virial matrix
// TODO There is no point in having separate virial matrix for constraints
// The integrate should save a copy of the current coordinates in d_xp_ and write the updated ones
// into d_x_. The d_xp_ is only needed by constraints.
- integrator_->integrate(d_x_, d_xp_, d_v_, d_f_, dt, doTempCouple, tcstat, doPressureCouple,
- dtPressureCouple, velocityScalingMatrix);
+ integrator_->integrate(d_x_, d_xp_, d_v_, d_f_, dt, doTemperatureScaling, tcstat,
+ doParrinelloRahman, dtPressureCouple, prVelocityScalingMatrix);
// Constraints need both coordinates before (d_x_) and after (d_xp_) update. However, after constraints
// are applied, the d_x_ can be discarded. So we intentionally swap the d_x_ and d_xp_ here to avoid the
// d_xp_ -> d_x_ copy after constraints. Note that the integrate saves them in the wrong order as well.
return;
}
+void UpdateConstrainCuda::Impl::scaleCoordinates(const matrix scalingMatrix)
+{
+ ScalingMatrix mu;
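+ // Pack the six possibly non-zero elements of the (real-valued) scaling
+ // matrix into the float struct expected by the kernel.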
+ mu.xx = scalingMatrix[XX][XX];
+ mu.yy = scalingMatrix[YY][YY];
+ mu.zz = scalingMatrix[ZZ][ZZ];
+ mu.yx = scalingMatrix[YY][XX];
+ mu.zx = scalingMatrix[ZZ][XX];
+ mu.zy = scalingMatrix[ZZ][YY];
+
+ const auto kernelArgs = prepareGpuKernelArguments(
+ scaleCoordinates_kernel, coordinateScalingKernelLaunchConfig_, &numAtoms_, &d_x_, &mu);
+ launchGpuKernel(scaleCoordinates_kernel, coordinateScalingKernelLaunchConfig_, nullptr,
+ "scaleCoordinates_kernel", kernelArgs);
+ // TODO: Although this only happens on the pressure coupling steps, this synchronization
+ // can affect the performance if nstpcouple is small.
+ gpuStreamSynchronize(commandStream_);
+}
+
UpdateConstrainCuda::Impl::Impl(const t_inputrec& ir,
const gmx_mtop_t& mtop,
const void* commandStream,
integrator_ = std::make_unique<LeapFrogCuda>(commandStream_);
lincsCuda_ = std::make_unique<LincsCuda>(ir.nLincsIter, ir.nProjOrder, commandStream_);
settleCuda_ = std::make_unique<SettleCuda>(mtop, commandStream_);
+
+ coordinateScalingKernelLaunchConfig_.blockSize[0] = c_threadsPerBlock;
+ coordinateScalingKernelLaunchConfig_.blockSize[1] = 1;
+ coordinateScalingKernelLaunchConfig_.blockSize[2] = 1;
+ coordinateScalingKernelLaunchConfig_.sharedMemorySize = 0;
+ coordinateScalingKernelLaunchConfig_.stream = commandStream_;
}
UpdateConstrainCuda::Impl::~Impl() {}
integrator_->set(md, numTempScaleValues, md.cTC);
lincsCuda_->set(idef, md);
settleCuda_->set(idef, md);
+
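+ // One thread per atom: round the grid size up to cover all atoms with whole blocks.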
+ coordinateScalingKernelLaunchConfig_.gridSize[0] =
+ (numAtoms_ + c_threadsPerBlock - 1) / c_threadsPerBlock;
}
void UpdateConstrainCuda::Impl::setPbc(const t_pbc* pbc)
const bool updateVelocities,
const bool computeVirial,
tensor virialScaled,
- const bool doTempCouple,
+ const bool doTemperatureScaling,
gmx::ArrayRef<const t_grp_tcstat> tcstat,
- const bool doPressureCouple,
+ const bool doParrinelloRahman,
const float dtPressureCouple,
- const matrix velocityScalingMatrix)
+ const matrix prVelocityScalingMatrix)
+{
+ impl_->integrate(fReadyOnDevice, dt, updateVelocities, computeVirial, virialScaled, doTemperatureScaling,
+ tcstat, doParrinelloRahman, dtPressureCouple, prVelocityScalingMatrix);
+}
+
+void UpdateConstrainCuda::scaleCoordinates(const matrix scalingMatrix)
{
- impl_->integrate(fReadyOnDevice, dt, updateVelocities, computeVirial, virialScaled, doTempCouple,
- tcstat, doPressureCouple, dtPressureCouple, velocityScalingMatrix);
+ impl_->scaleCoordinates(scalingMatrix);
}
void UpdateConstrainCuda::set(DeviceBuffer<float> d_x,
* 2. This is the temperature coupling step.
* Parameters virial/lambdas can be nullptr if computeVirial/doTemperatureScaling are false.
*
- * \param[in] fReadyOnDevice Event synchronizer indicating that the forces are ready in
- * the device memory.
- * \param[in] dt Timestep.
- * \param[in] updateVelocities If the velocities should be constrained.
- * \param[in] computeVirial If virial should be updated.
- * \param[out] virial Place to save virial tensor.
- * \param[in] doTempCouple If the temperature coupling should be performed.
- * \param[in] tcstat Temperature coupling data.
- * \param[in] doPressureCouple If the temperature coupling should be applied.
- * \param[in] dtPressureCouple Period between pressure coupling steps
- * \param[in] velocityScalingMatrix Parrinello-Rahman velocity scaling matrix
+ * \param[in] fReadyOnDevice Event synchronizer indicating that the forces are ready in
+ * the device memory.
+ * \param[in] dt Timestep.
+ * \param[in] updateVelocities If the velocities should be constrained.
+ * \param[in] computeVirial If virial should be updated.
+ * \param[out] virial Place to save virial tensor.
+ * \param[in] doTemperatureScaling If velocities should be scaled for temperature coupling.
+ * \param[in] tcstat Temperature coupling data.
+ * \param[in] doParrinelloRahman If current step is a Parrinello-Rahman pressure coupling step.
+ * \param[in] dtPressureCouple Period between pressure coupling steps.
+ * \param[in] prVelocityScalingMatrix Parrinello-Rahman velocity scaling matrix.
*/
void integrate(GpuEventSynchronizer* fReadyOnDevice,
real dt,
bool updateVelocities,
bool computeVirial,
tensor virial,
- bool doTempCouple,
+ bool doTemperatureScaling,
gmx::ArrayRef<const t_grp_tcstat> tcstat,
- bool doPressureCouple,
+ bool doParrinelloRahman,
float dtPressureCouple,
- const matrix velocityScalingMatrix);
+ const matrix prVelocityScalingMatrix);
+
+ /*! \brief Scale coordinates on the GPU for the pressure coupling.
+ *
+ * After a pressure coupling step, the box size may change. Hence, the coordinates should be
+ * scaled so that all the particles fit in the new box.
+ *
+ * \param[in] scalingMatrix Coordinates scaling matrix.
+ */
+ void scaleCoordinates(const matrix scalingMatrix);
/*! \brief Set the pointers and update data-structures (e.g. after NB search step).
*
private:
//! CUDA stream
CommandStream commandStream_ = nullptr;
+ //! CUDA kernel launch config
+ KernelLaunchConfig coordinateScalingKernelLaunchConfig_;
//! Periodic boundary data
PbcAiuc pbcAiuc_;
GMX_RELEASE_ASSERT(
ir->etc != etcNOSEHOOVER,
"Nose-Hoover temperature coupling is not supported with the GPU update.\n");
- GMX_RELEASE_ASSERT(ir->epc == epcNO,
- "Pressure coupling is not supported with the GPU update.\n");
+ GMX_RELEASE_ASSERT(ir->epc == epcNO || ir->epc == epcPARRINELLORAHMAN || ir->epc == epcBERENDSEN,
+ "Only Parrinello-Rahman and Berendsen pressure coupling are supported "
+ "with the GPU update.\n");
GMX_RELEASE_ASSERT(!mdatoms->haveVsites,
"Virtual sites are not supported with the GPU update.\n");
GMX_RELEASE_ASSERT(ed == nullptr,
}
integrator = std::make_unique<UpdateConstrainCuda>(
*ir, *top_global, stateGpu->getUpdateStream(), stateGpu->xUpdatedOnDevice());
+
+ t_pbc pbc;
+ set_pbc(&pbc, epbcXYZ, state->box);
+ integrator->setPbc(&pbc);
}
if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
if (correct_box(fplog, step, state->box, graph))
{
bMasterState = TRUE;
+ // If update is offloaded, it should be informed about the box size change
+ if (useGpuForUpdate)
+ {
+ t_pbc pbc;
+ set_pbc(&pbc, epbcXYZ, state->box);
+ integrator->setPbc(&pbc);
+ }
}
}
if (DOMAINDECOMP(cr) && bMasterState)
* energies of two subsequent steps. Therefore we need to compute the
* half step kinetic energy also if we need energies at the next step.
*/
- const bool needHalfStepKineticEnergy = (!EI_VV(ir->eI) && do_per_step(step + 1, nstglobalcomm));
+ const bool needHalfStepKineticEnergy =
+ (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
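+    // The step_rel + 1 == ir->nsteps term covers the last step, where energies are
+    // needed even when it does not coincide with a global communication step.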
+
+ // Parrinello-Rahman requires the pressure to be available before the update to compute
+ // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
+ const bool doParrinelloRahman = (ir->epc == epcPARRINELLORAHMAN
+ && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
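+    // E.g. with nstpcouple = 10 the velocity scaling is applied on steps 1, 11, 21, ...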
if (useGpuForUpdate)
{
{
integrator->set(stateGpu->getCoordinates(), stateGpu->getVelocities(),
stateGpu->getForces(), top.idef, *mdatoms, ekind->ngtc);
- t_pbc pbc;
- set_pbc(&pbc, epbcXYZ, state->box);
- integrator->setPbc(&pbc);
// Copy data to the GPU after buffers might have been reinitialized
stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
stateGpu->copyForcesToGpu(ArrayRef<RVec>(f), AtomLocality::Local);
}
- bool doTempCouple =
+ const bool doTemperatureScaling =
(ir->etc != etcNO && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
- bool doParrinelloRahman = (ir->epc == epcPARRINELLORAHMAN
- && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
// This applies Leap-Frog, LINCS and SETTLE in succession
integrator->integrate(stateGpu->getForcesReadyOnDeviceEvent(
AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
- ir->delta_t, true, bCalcVir, shake_vir, doTempCouple,
+ ir->delta_t, true, bCalcVir, shake_vir, doTemperatureScaling,
ekind->tcstat, doParrinelloRahman, ir->nstpcouple * ir->delta_t, M);
// Copy velocities D2H after update if:
// and when algorithms require it.
const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
- // With leap-frog we also need to compute the half-step
- // kinetic energy at the step before we need to write
- // the full-step kinetic energy
- const bool needEkinAtNextStep =
- (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
-
- if (bGStat || needEkinAtNextStep || doInterSimSignal)
+ if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
{
// Copy coordinates when needed to stop the CM motion.
if (useGpuForUpdate && !EI_VV(ir->eI) && bStopCM)
}
update_pcouple_after_coordinates(fplog, step, ir, mdatoms, pres, force_vir, shake_vir,
- pressureCouplingMu, state, nrnb, &upd);
+ pressureCouplingMu, state, nrnb, &upd, !useGpuForUpdate);
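+    // When the update runs on the GPU, the coordinate scaling is skipped here;
+    // the scaling matrix (pressureCouplingMu) is applied on the GPU below.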
+
+ const bool doBerendsenPressureCoupling =
+ (inputrec->epc == epcBERENDSEN && do_per_step(step, inputrec->nstpcouple));
+ if (useGpuForUpdate && (doBerendsenPressureCoupling || doParrinelloRahman))
+ {
+ integrator->scaleCoordinates(pressureCouplingMu);
+ t_pbc pbc;
+ set_pbc(&pbc, epbcXYZ, state->box);
+ integrator->setPbc(&pbc);
+ }
/* ################# END UPDATE STEP 2 ################# */
/* #### We now have r(t+dt) and v(t+dt/2) ############# */
*
* \param[in] mdlog Logger object.
* \param[in] useGpuForNonbonded True if the nonbonded task is offloaded in this run.
- * \param[in] useGpuForPme True if the PME task is offloaded in this run.
+ * \param[in] pmeRunMode The PME run mode for this run.
* \returns The object populated with development feature flags.
*/
static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& mdlog,
const bool useGpuForNonbonded,
- const bool useGpuForPme)
+ const PmeRunMode pmeRunMode)
{
DevelopmentFeatureFlags devFlags;
if (devFlags.enableGpuPmePPComm)
{
- if (useGpuForPme)
+ if (pmeRunMode == PmeRunMode::GPU)
{
GMX_LOG(mdlog.warning)
.asParagraph()
}
else
{
+ std::string clarification;
+ if (pmeRunMode == PmeRunMode::Mixed)
+ {
+ clarification =
+ "PME FFT and gather are not offloaded to the GPU (PME is running in mixed "
+ "mode).";
+ }
+ else
+ {
+ clarification = "PME is not offloaded to the GPU.";
+ }
GMX_LOG(mdlog.warning)
.asParagraph()
- .appendTextFormatted(
+ .appendText(
"NOTE: GMX_GPU_PME_PP_COMMS environment variable detected, but the "
- "'GPU PME-PP communications' feature was not enabled as PME is not "
- "offloaded to the GPU.");
+ "'GPU PME-PP communications' feature was not enabled as "
+ + clarification);
devFlags.enableGpuPmePPComm = false;
}
}
// Initialize development feature flags that are enabled by environment variables
// and report those features that are enabled.
const DevelopmentFeatureFlags devFlags =
- manageDevelopmentFeatures(mdlog, useGpuForNonbonded, useGpuForPme);
+ manageDevelopmentFeatures(mdlog, useGpuForNonbonded, pmeRunMode);
// NOTE: The devFlags need decideWhetherToUseGpusForNonbonded(...) and decideWhetherToUseGpusForPme(...) for overrides,
// decideWhetherToUseGpuForUpdate() needs devFlags for the '-update auto' override, hence the interleaving.
~StatePropagatorDataGpu();
/*! \brief Set the ranges for local and non-local atoms and reallocates buffers.
+ *
+ * Reallocates the coordinate, velocity and force buffers on the device.
*
* \note
* The coordinates buffer is (re)allocated, when required by PME, with a padding,
* is scheduled in the \p pmeStream_ (unlike the coordinates H2D) as only the PME
* task uses this padding area.
*
+ * \note
+ * The force buffer is cleared if its size increases, so that previously unused
+ * memory is cleared before forces are accumulated.
+ *
* \param[in] numAtomsLocal Number of atoms in local domain.
* \param[in] numAtomsAll Total number of atoms to handle.
*/
/*! \brief Set the ranges for local and non-local atoms and reallocates buffers.
+ *
+ * Reallocates the coordinate, velocity and force buffers on the device.
*
* \note
* The coordinates buffer is (re)allocated, when required by PME, with a padding,
* is scheduled in the \p pmeStream_ (unlike the coordinates H2D) as only the PME
* task uses this padding area.
*
+ * \note
+ * The force buffer is cleared if its size increases, so that previously unused
+ * memory is cleared before forces are accumulated.
+ *
* \param[in] numAtomsLocal Number of atoms in local domain.
* \param[in] numAtomsAll Total number of atoms to handle.
*/
}
reallocateDeviceBuffer(&d_v_, DIM * numAtomsAll_, &d_vSize_, &d_vCapacity_, deviceContext_);
+ const int d_fOldCapacity = d_fCapacity_;
reallocateDeviceBuffer(&d_f_, DIM * numAtomsAll_, &d_fSize_, &d_fCapacity_, deviceContext_);
+ // Clearing of the forces can be done in the local stream since the nonlocal stream cannot reach
+ // the force accumulation stage before syncing with the local stream. Only done in CUDA,
+ // since the force buffer ops are not implemented in OpenCL.
+ if (GMX_GPU == GMX_GPU_CUDA && d_fCapacity_ != d_fOldCapacity)
+ {
+ clearDeviceBufferAsync(&d_f_, 0, d_fCapacity_, localStream_);
+ }
}
std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality atomLocality)
nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
}
-/* F buffer operations on GPU: performs force summations and conversion from nb to rvec format. */
+/* F buffer operations on GPU: performs force summations and conversion from nb to rvec format.
+ *
+ * NOTE: When the total force device buffer is reallocated and its size increases, it is cleared in
+ * the Local stream. Hence, if accumulateForce is true, the NonLocal stream should start
+ * accumulating forces only after the Local stream has done so.
+ */
void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLocality,
DeviceBuffer<float> totalForcesDevice,
gmx_nbnxn_gpu_t* nb,
}
}
-void nbnxn_wait_nonlocal_x_copy_D2H_done(gmx_nbnxn_cuda_t* nb)
-{
- nb->xNonLocalCopyD2HDone->waitForEvent();
-}
-
-void nbnxn_stream_local_wait_for_nonlocal(gmx_nbnxn_cuda_t* nb)
-{
- cudaStream_t localStream = nb->stream[InteractionLocality::Local];
- cudaStream_t nonLocalStream = nb->stream[InteractionLocality::NonLocal];
-
- GpuEventSynchronizer event;
- event.markEvent(nonLocalStream);
- event.enqueueWaitEvent(localStream);
-}
-
} // namespace Nbnxm
Nbnxm::nbnxnInsertNonlocalGpuDependency(gpu_nbv, interactionLocality);
}
-void nonbonded_verlet_t::wait_nonlocal_x_copy_D2H_done()
-{
- Nbnxm::nbnxn_wait_nonlocal_x_copy_D2H_done(gpu_nbv);
-}
-
-void nonbonded_verlet_t::stream_local_wait_for_nonlocal()
-{
- Nbnxm::nbnxn_stream_local_wait_for_nonlocal(gpu_nbv);
-}
-
/*! \endcond */
*/
void atomdata_init_add_nbat_f_to_f_gpu(GpuEventSynchronizer* localReductionDone);
- /*! \brief Wait for non-local copy of coordinate buffer from device to host */
- void wait_nonlocal_x_copy_D2H_done();
-
/*! \brief return GPU pointer to f in rvec format */
void* get_gpu_frvec();
- /*! \brief Ensure local stream waits for non-local stream */
- void stream_local_wait_for_nonlocal();
-
//! Return the kernel setup
const Nbnxm::KernelSetup& kernelSetup() const { return kernelSetup_; }
CUDA_FUNC_QUALIFIER
void nbnxn_wait_x_on_device(gmx_nbnxn_gpu_t gmx_unused* nb) CUDA_FUNC_TERM;
-/*! \brief Wait for non-local copy of coordinate buffer from device to host
- * \param[in] nb The nonbonded data GPU structure
- */
-CUDA_FUNC_QUALIFIER
-void nbnxn_wait_nonlocal_x_copy_D2H_done(gmx_nbnxn_gpu_t gmx_unused* nb) CUDA_FUNC_TERM;
-
-/*! \brief Ensure local stream waits for non-local stream
- * \param[in] nb The nonbonded data GPU structure
- */
-CUDA_FUNC_QUALIFIER
-void nbnxn_stream_local_wait_for_nonlocal(gmx_nbnxn_gpu_t gmx_unused* nb) CUDA_FUNC_TERM;
-
} // namespace Nbnxm
#endif
*/
/* Stride of the packed x coordinate array */
-static constexpr int c_xStride2xNN = c_nbnxnCpuIClusterSize;
+static constexpr int c_xStride2xNN = (GMX_SIMD_REAL_WIDTH >= 2 * c_nbnxnCpuIClusterSize)
+ ? GMX_SIMD_REAL_WIDTH / 2
+ : c_nbnxnCpuIClusterSize;
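+/* The stride equals half the SIMD register width when the register spans at least
+ * two i-clusters, and the i-cluster size otherwise. */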
/* Copies PBC shifted i-cell packed atom coordinates to working array */
static inline void icell_set_x_simd_2xnn(int ci,
PullCoordSpatialData& spatialData = pcrd->spatialData;
double md2;
- if (pcrd->params.eGeom == epullgDIRPBC)
+ /* With AWH pulling we allow for periodic pulling with geometry=direction.
+ * TODO: Store a periodicity flag instead of checking for external pull provider.
+ */
+ if (pcrd->params.eGeom == epullgDIRPBC
+ || (pcrd->params.eGeom == epullgDIR && pcrd->params.eType == epullEXTERNAL))
{
md2 = -1;
}
{
errorMessage += "Nose-Hoover temperature coupling is not supported.\n";
}
- if (inputrec.epc != epcNO)
+ if (!(inputrec.epc == epcNO || inputrec.epc == epcPARRINELLORAHMAN || inputrec.epc == epcBERENDSEN))
{
- // Coordinate D2H and H2d are missing as well as PBC reinitialization
- errorMessage += "Pressure coupling is not supported.\n";
+ errorMessage += "Only Parrinello-Rahman and Berendsen pressure coupling are supported.\n";
}
if (EEL_PME_EWALD(inputrec.coulombtype) && inputrec.epsilon_surface != 0)
{
{
errorMessage += "Swapping the coordinates is not supported.\n";
}
+
+ // \todo A check for the coupled constraint block size restriction needs to be added
+ // when update auto chooses GPU in some cases. Currently exceeding the restriction
+ // triggers a fatal error during LINCS setup.
+
if (!errorMessage.empty())
{
if (updateTarget == TaskTarget::Gpu)
simulationWorkload.useGpuNonbonded = useGpuForNonbonded;
simulationWorkload.useCpuPme = (pmeRunMode == PmeRunMode::CPU);
simulationWorkload.useGpuPme = (pmeRunMode == PmeRunMode::GPU || pmeRunMode == PmeRunMode::Mixed);
- simulationWorkload.useGpuPmeFft = (pmeRunMode == PmeRunMode::Mixed);
- simulationWorkload.useGpuBonded = useGpuForBonded;
- simulationWorkload.useGpuUpdate = useGpuForUpdate;
- simulationWorkload.useGpuBufferOps = useGpuForBufferOps || useGpuForUpdate;
- simulationWorkload.useGpuHaloExchange = useGpuHaloExchange;
- simulationWorkload.useGpuPmePpCommunication = useGpuPmePpComm;
+ simulationWorkload.useGpuPmeFft = (pmeRunMode == PmeRunMode::Mixed);
+ simulationWorkload.useGpuBonded = useGpuForBonded;
+ simulationWorkload.useGpuUpdate = useGpuForUpdate;
+ simulationWorkload.useGpuBufferOps = useGpuForBufferOps || useGpuForUpdate;
+ simulationWorkload.useGpuHaloExchange = useGpuHaloExchange;
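+    // Direct PME-PP GPU communication is only available when the whole PME task runs on the GPU.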
+ simulationWorkload.useGpuPmePpCommunication = useGpuPmePpComm && (pmeRunMode == PmeRunMode::GPU);
simulationWorkload.useGpuDirectCommunication = useGpuHaloExchange || useGpuPmePpComm;
simulationWorkload.haveEwaldSurfaceContribution = haveEwaldSurfaceContribution;
#include "taskassignment.h"
#include <algorithm>
+#include <exception>
#include <string>
#include <vector>
return numGpuTasksOnThisNode;
}
+/*! \brief Return on each rank the total count over all ranks of all
+ * simulations. */
+int countOverAllRanks(const t_commrec* cr, const gmx_multisim_t* ms, const int countOnThisRank)
+{
+ int countOverAllRanksValue = countOnThisRank;
+ if (PAR(cr))
+ {
+ // Count over the ranks of this simulation.
+ gmx_sumi(1, &countOverAllRanksValue, cr);
+ }
+ if (isMultiSim(ms))
+ {
+ // Count over the ranks of all simulations.
+ gmx_sumi_sim(1, &countOverAllRanksValue, ms);
+ if (PAR(cr))
+ {
+ // Propagate the information from other simulations back
+ // to non-master ranks so they can all agree on future
+ // behavior.
+ gmx_bcast(sizeof(decltype(countOverAllRanksValue)), &countOverAllRanksValue, cr);
+ }
+ }
+ return countOverAllRanksValue;
+}
+
} // namespace
GpuTaskAssignmentsBuilder::GpuTaskAssignmentsBuilder() = default;
auto gpuTasksOnRanksOfThisNode = findAllGpuTasksOnThisNode(gpuTasksOnThisRank, physicalNodeComm);
size_t numGpuTasksOnThisNode = countGpuTasksOnThisNode(gpuTasksOnRanksOfThisNode);
+ std::exception_ptr exceptionPtr;
std::vector<GpuTaskAssignment> taskAssignmentOnRanksOfThisNode;
try
{
taskAssignmentOnRanksOfThisNode =
buildTaskAssignment(gpuTasksOnRanksOfThisNode, gpuIdsForTaskAssignment);
}
- catch (const std::exception& ex)
+ catch (...)
+ {
+ exceptionPtr = std::current_exception();
+ }
+ int countOfExceptionsOnThisRank = int(bool(exceptionPtr));
+ int countOfExceptionsOverAllRanks = countOverAllRanks(cr, ms, countOfExceptionsOnThisRank);
+
+ // Avoid all ranks spamming the error stream
+ //
+ // TODO improve this so that unique errors on different ranks
+ // are all reported.
+ if (countOfExceptionsOnThisRank > 0 && physicalNodeComm.rank_ == 0)
{
- // TODO This implementation is quite similar to that of
- // processExceptionAsFatalError (which implements
- // GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR), but it is unclear
- // how we should involve MPI in the implementation of error
- // handling.
- if (physicalNodeComm.rank_ == 0)
+ try
+ {
+ if (exceptionPtr)
+ {
+ std::rethrow_exception(exceptionPtr);
+ }
+ }
+ catch (const std::exception& ex)
{
printFatalErrorMessage(stderr, ex);
}
-
- gmx_exit_on_fatal_error(ExitType_Abort, 1);
}
- // TODO This implements a global barrier so that MPI runtimes can
- // organize an orderly shutdown if one of the ranks has had to
- // issue a fatal error after an exception detected only on one
- // rank. When we have MPI-aware error handling and reporting, this
- // should be improved.
- multiSimBarrier(ms);
- simulationBarrier(cr);
+ if (countOfExceptionsOverAllRanks > 0)
+ {
+ gmx_fatal(FARGS,
+ "Exiting because task assignment failed. If there is no descriptive "
+ "error message above this, please report this failure as a bug.");
+ }
// TODO There is no check that mdrun -nb gpu or -pme gpu or
// -gpu_id is actually being implemented such that nonbonded tasks
{ "To dissimulate is to feign not to have what one has. To simulate is to feign to have "
"what one hasn't.",
"Jean Baudrillard" },
+ { "Install our Free Energy Patents app! There is energy all around us; and it's free! "
+ "Free energy is everywhere, and all around you, just waiting to be extracted! Over "
+ "100+ free energy patents!"
+ "Mind and Miracle Productions on Twitter, spamming a FEP thread" },
+ { "\"A slow sort of country!\" said the Queen. \"Now, HERE, you see, it "
+ "takes all the running YOU can do, to keep in the same place. If you want "
+ "to get somewhere else, you must run at least twice as fast as that!\""
+ "Lewis Carroll" },
};
if (beCool())
#include <algorithm>
+#include <array>
#include <vector>
-#include "gromacs/utility/gmxassert.h"
-
namespace gmx
{
class CharBuffer
{
public:
- static const size_t ValueSize = sizeof(T);
+ static constexpr size_t ValueSize = sizeof(T);
explicit CharBuffer(T value) { u.v = value; }
explicit CharBuffer(const char buffer[]) { std::copy(buffer, buffer + ValueSize, u.c); }
} u;
};
+//! Return \c value with the byte order swapped.
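+//! For example, swapping a 32-bit 0x78ABCDEF yields 0xEFCDAB78.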
+template<typename T>
+T swapEndian(const T& value)
+{
+ union {
+ T value_;
+ std::array<char, sizeof(T)> valueAsCharArray_;
+ } endianessSwappedValue;
+
+ endianessSwappedValue.value_ = value;
+ int hiByte = sizeof(T) - 1;
+ for (int loByte = 0; hiByte > loByte; loByte++, hiByte--)
+ {
+ std::swap(endianessSwappedValue.valueAsCharArray_[loByte],
+ endianessSwappedValue.valueAsCharArray_[hiByte]);
+ }
+
+ return endianessSwappedValue.value_;
+}
+
} // namespace
/********************************************************************
class InMemorySerializer::Impl
{
public:
+ Impl(EndianSwapBehavior endianSwapBehavior) : endianSwapBehavior_(endianSwapBehavior) {}
template<typename T>
void doValue(T value)
{
- CharBuffer<T>(value).appendTo(&buffer_);
+ if (endianSwapBehavior_ == EndianSwapBehavior::DoSwap)
+ {
+ CharBuffer<T>(swapEndian(value)).appendTo(&buffer_);
+ }
+ else
+ {
+ CharBuffer<T>(value).appendTo(&buffer_);
+ }
}
void doString(const std::string& value)
{
buffer_.insert(buffer_.end(), value.begin(), value.end());
}
- std::vector<char> buffer_;
+ std::vector<char> buffer_;
+ EndianSwapBehavior endianSwapBehavior_;
};
-InMemorySerializer::InMemorySerializer() : impl_(new Impl) {}
+InMemorySerializer::InMemorySerializer(EndianSwapBehavior endianSwapBehavior) :
+ impl_(new Impl(endianSwapBehavior))
+{
+}
-InMemorySerializer::~InMemorySerializer() {}
+InMemorySerializer::~InMemorySerializer() = default;
std::vector<char> InMemorySerializer::finishAndGetBuffer()
{
class InMemoryDeserializer::Impl
{
public:
- explicit Impl(ArrayRef<const char> buffer, bool sourceIsDouble) :
+ explicit Impl(ArrayRef<const char> buffer, bool sourceIsDouble, EndianSwapBehavior endianSwapBehavior) :
buffer_(buffer),
sourceIsDouble_(sourceIsDouble),
- pos_(0)
+ pos_(0),
+ endianSwapBehavior_(endianSwapBehavior)
{
}
template<typename T>
void doValue(T* value)
{
- *value = CharBuffer<T>(&buffer_[pos_]).value();
+ if (endianSwapBehavior_ == EndianSwapBehavior::DoSwap)
+ {
+ *value = swapEndian(CharBuffer<T>(&buffer_[pos_]).value());
+ }
+ else
+ {
+ *value = CharBuffer<T>(&buffer_[pos_]).value();
+ }
pos_ += CharBuffer<T>::ValueSize;
}
void doString(std::string* value)
ArrayRef<const char> buffer_;
bool sourceIsDouble_;
size_t pos_;
+ EndianSwapBehavior endianSwapBehavior_;
};
-InMemoryDeserializer::InMemoryDeserializer(ArrayRef<const char> buffer, bool sourceIsDouble) :
- impl_(new Impl(buffer, sourceIsDouble))
+InMemoryDeserializer::InMemoryDeserializer(ArrayRef<const char> buffer,
+ bool sourceIsDouble,
+ EndianSwapBehavior endianSwapBehavior) :
+ impl_(new Impl(buffer, sourceIsDouble, endianSwapBehavior))
{
}
-InMemoryDeserializer::~InMemoryDeserializer() {}
+InMemoryDeserializer::~InMemoryDeserializer() = default;
bool InMemoryDeserializer::sourceIsDouble() const
{
namespace gmx
{
+//! Specify endian swapping behavior
+enum class EndianSwapBehavior : int
+{
+ DoSwap, //!< Swap the bytes
+ DoNotSwap, //!< Do not swap the bytes
+ Count //!< Number of possible behaviors
+};
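+
+// Typical use: construct the serializer or the deserializer (not both) with
+// EndianSwapBehavior::DoSwap to convert between little- and big-endian data.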
+
class InMemorySerializer : public ISerializer
{
public:
- InMemorySerializer();
+ explicit InMemorySerializer(EndianSwapBehavior endianSwapBehavior = EndianSwapBehavior::DoNotSwap);
~InMemorySerializer() override;
std::vector<char> finishAndGetBuffer();
class InMemoryDeserializer : public ISerializer
{
public:
- explicit InMemoryDeserializer(ArrayRef<const char> buffer, bool sourceIsDouble);
+ InMemoryDeserializer(ArrayRef<const char> buffer,
+ bool sourceIsDouble,
+ EndianSwapBehavior endianSwapBehavior = EndianSwapBehavior::DoNotSwap);
~InMemoryDeserializer() override;
//! Get if the source data was written in double precision
defaultinitializationallocator.cpp
enumerationhelpers.cpp
fixedcapacityvector.cpp
+ inmemoryserializer.cpp
keyvaluetreeserializer.cpp
keyvaluetreetransform.cpp
logger.cpp
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief
+ * Tests for gmx::InMemorySerializer and InMemoryDeserializer.
+ *
+ * \author Christian Blau <blau@kth.se>
+ * \ingroup module_utility
+ */
+
+#include "gmxpre.h"
+
+#include "gromacs/utility/inmemoryserializer.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace gmx
+{
+namespace test
+{
+namespace
+{
+union IntAndFloat32 {
+ std::int32_t int32Value_;
+ float floatValue_;
+};
+
+union IntAndFloat64 {
+ std::int64_t int64Value_;
+ double doubleValue_;
+};
+
+//! Constants used for testing endian swap operations
+//! \{
+constexpr std::int16_t c_int16Value = static_cast<std::int16_t>(0x7A2B);
+constexpr std::int16_t c_int16ValueSwapped = static_cast<std::int16_t>(0x2B7A);
+constexpr std::int32_t c_int32Value = static_cast<std::int32_t>(0x78ABCDEF);
+constexpr std::int32_t c_int32ValueSwapped = static_cast<std::int32_t>(0xEFCDAB78);
+constexpr std::int64_t c_int64Value = static_cast<std::int64_t>(0x78ABCDEF12345678);
+constexpr std::int64_t c_int64ValueSwapped = static_cast<std::int64_t>(0x78563412EFCDAB78);
+
+constexpr const IntAndFloat32 c_intAndFloat32{ c_int32Value };
+constexpr const IntAndFloat64 c_intAndFloat64{ c_int64Value };
+
+constexpr const IntAndFloat32 c_intAndFloat32Swapped{ c_int32ValueSwapped };
+constexpr const IntAndFloat64 c_intAndFloat64Swapped{ c_int64ValueSwapped };
+//! \}
+
+//! Return the integer used for testing, depending on the size of int.
+constexpr int integerSizeDependentTestingValue()
+{
+ return sizeof(int) == 4 ? c_int32Value : sizeof(int) == 8 ? c_int64Value : c_int16Value;
+}
+
+//! Return the endianess-swapped integer used for testing, depending on the size of int.
+constexpr int integerSizeDependentTestingValueEndianessSwapped()
+{
+ return sizeof(int) == 4 ? c_int32ValueSwapped
+ : sizeof(int) == 8 ? c_int64ValueSwapped : c_int16ValueSwapped;
+}
+
+class InMemorySerializerTest : public ::testing::Test
+{
+public:
+ struct SerializerValues
+ {
+ bool boolValue_;
+ unsigned char unsignedCharValue_;
+ char charValue_;
+ unsigned short unsignedShortValue_;
+ std::int32_t int32Value_;
+ float floatValue_;
+ std::int64_t int64Value_;
+ double doubleValue_;
+ int intValue_;
+ real realValue_;
+ };
+
+ void serialize(ISerializer* serializer, SerializerValues* values)
+ {
+ EXPECT_FALSE(serializer->reading());
+ doValues(serializer, values);
+ }
+
+ SerializerValues deserialize(ISerializer* serializer)
+ {
+ EXPECT_TRUE(serializer->reading());
+ SerializerValues result;
+ doValues(serializer, &result);
+ return result;
+ }
+
+ void checkSerializerValuesforEquality(const SerializerValues& lhs, const SerializerValues& rhs)
+ {
+ EXPECT_EQ(lhs.boolValue_, rhs.boolValue_);
+ EXPECT_EQ(lhs.unsignedCharValue_, rhs.unsignedCharValue_);
+ EXPECT_EQ(lhs.charValue_, rhs.charValue_);
+ EXPECT_EQ(lhs.intValue_, rhs.intValue_);
+ EXPECT_EQ(lhs.int32Value_, rhs.int32Value_);
+ EXPECT_EQ(lhs.int64Value_, rhs.int64Value_);
+ EXPECT_EQ(lhs.unsignedShortValue_, rhs.unsignedShortValue_);
+ EXPECT_EQ(lhs.realValue_, rhs.realValue_);
+ EXPECT_EQ(lhs.floatValue_, rhs.floatValue_);
+ EXPECT_EQ(lhs.doubleValue_, rhs.doubleValue_);
+ }
+
+private:
+ void doValues(ISerializer* serializer, SerializerValues* values)
+ {
+ serializer->doBool(&values->boolValue_);
+ serializer->doUChar(&values->unsignedCharValue_);
+ serializer->doChar(&values->charValue_);
+ serializer->doInt(&values->intValue_);
+ serializer->doInt32(&values->int32Value_);
+ serializer->doInt64(&values->int64Value_);
+ serializer->doUShort(&values->unsignedShortValue_);
+ serializer->doReal(&values->realValue_);
+ serializer->doFloat(&values->floatValue_);
+ serializer->doDouble(&values->doubleValue_);
+ }
+
+protected:
+ SerializerValues defaultValues_ = { true,
+ 0x78,
+ 0x78,
+ static_cast<unsigned short>(c_int16Value),
+ c_int32Value,
+ c_intAndFloat32.floatValue_,
+ c_int64Value,
+ c_intAndFloat64.doubleValue_,
+ integerSizeDependentTestingValue(),
+ std::is_same<real, double>::value
+ ? static_cast<real>(c_intAndFloat64.doubleValue_)
+ : static_cast<real>(c_intAndFloat32.floatValue_) };
+
+ SerializerValues endianessSwappedValues_ = {
+ true,
+ 0x78,
+ 0x78,
+ static_cast<unsigned short>(c_int16ValueSwapped),
+ c_int32ValueSwapped,
+ c_intAndFloat32Swapped.floatValue_,
+ c_int64ValueSwapped,
+ c_intAndFloat64Swapped.doubleValue_,
+ integerSizeDependentTestingValueEndianessSwapped(),
+ std::is_same<real, float>::value ? static_cast<real>(c_intAndFloat32Swapped.floatValue_)
+ : static_cast<real>(c_intAndFloat64Swapped.doubleValue_)
+ };
+};
+
+TEST_F(InMemorySerializerTest, Roundtrip)
+{
+ InMemorySerializer serializer;
+ SerializerValues values = defaultValues_;
+ serialize(&serializer, &values);
+
+ auto buffer = serializer.finishAndGetBuffer();
+
+ InMemoryDeserializer deserializer(buffer, std::is_same<real, double>::value);
+
+ SerializerValues deserialisedValues = deserialize(&deserializer);
+
+ checkSerializerValuesforEquality(values, deserialisedValues);
+}
+
+TEST_F(InMemorySerializerTest, RoundtripWithEndianessSwap)
+{
+ InMemorySerializer serializerWithSwap(EndianSwapBehavior::DoSwap);
+ SerializerValues values = defaultValues_;
+ serialize(&serializerWithSwap, &values);
+
+ auto buffer = serializerWithSwap.finishAndGetBuffer();
+
+ InMemoryDeserializer deserializerWithSwap(buffer, std::is_same<real, double>::value,
+ EndianSwapBehavior::DoSwap);
+
+ SerializerValues deserialisedValues = deserialize(&deserializerWithSwap);
+
+ checkSerializerValuesforEquality(values, deserialisedValues);
+}
+
+TEST_F(InMemorySerializerTest, SerializerExplicitEndianessSwap)
+{
+ InMemorySerializer serializerWithSwap(EndianSwapBehavior::DoSwap);
+ SerializerValues values = defaultValues_;
+ serialize(&serializerWithSwap, &values);
+
+ auto buffer = serializerWithSwap.finishAndGetBuffer();
+
+ InMemoryDeserializer deserializerWithOutSwap(buffer, std::is_same<real, double>::value);
+
+ SerializerValues deserialisedValues = deserialize(&deserializerWithOutSwap);
+ checkSerializerValuesforEquality(endianessSwappedValues_, deserialisedValues);
+}
+
+TEST_F(InMemorySerializerTest, DeserializerExplicitEndianessSwap)
+{
+ InMemorySerializer serializer;
+ SerializerValues values = defaultValues_;
+ serialize(&serializer, &values);
+
+ auto buffer = serializer.finishAndGetBuffer();
+
+ InMemoryDeserializer deserializerWithSwap(buffer, std::is_same<real, double>::value,
+ EndianSwapBehavior::DoSwap);
+
+ SerializerValues deserialisedValues = deserialize(&deserializerWithSwap);
+ checkSerializerValuesforEquality(endianessSwappedValues_, deserialisedValues);
+}
+
+} // namespace
+} // namespace test
+} // namespace gmx
const std::string mdpEnergyAndDensityfittingIntervalMismatch_ = formatString(
"nstcalcenergy = 7\n"
"density-guided-simulation-nst = 3\n");
+ //! Set mdp values so that we skip every other step
+ const std::string mdpSkipDensityfittingEveryOtherStep_ = formatString(
+ "nstenergy = 2\n"
+ "density-guided-simulation-nst = 2\n");
//! The command line to call mdrun
CommandLine commandLineForMdrun_;
};
".*is not a multiple of density-guided-simulation-nst.*");
}
+/* Fit a subset of three of twelve argon atoms into a reference density
+ * whose origin is offset from the simulation box origin. Stop the simulation,
+ * then restart.
+ *
+ * All density fitting mdp parameters are set to defaults.
+ */
+TEST_F(DensityFittingTest, CheckpointWorks)
+{
+ runner_.useStringAsMdpFile(mdpMdDensfitYesUnsetValues + mdpSkipDensityfittingEveryOtherStep_);
+ runner_.cptFileName_ = fileManager_.getTemporaryFilePath(".cpt");
+ commandLineForMdrun_.addOption("-cpo", runner_.cptFileName_);
+
+ ASSERT_EQ(0, runner_.callGrompp());
+ ASSERT_EQ(0, runner_.callMdrun(commandLineForMdrun_));
+
+ // checkMdrun(expectedEnergyTermMagnitude);
+
+ CommandLine commandLineForRestart;
+ commandLineForRestart.addOption("-cpi", runner_.cptFileName_);
+ commandLineForRestart.addOption("-noappend");
+ runner_.nsteps_ = 4;
+ ASSERT_EQ(0, runner_.callMdrun(commandLineForRestart));
+
+ const real expectedEnergyTermMagnitude = -3378.825928;
+ checkMdrun(expectedEnergyTermMagnitude);
+}
+
+
} // namespace test
} // namespace gmx
// with forces on CPUs, but there is no real risk of a bug with
// those propagators that would only be caught with a tighter
// tolerance in this particular test.
+ int ulpToleranceInMixed = 32;
+ int ulpToleranceInDouble = 64;
+ if (integrator == "bd")
+ {
+ // Somehow, the bd integrator has never been as reproducible
+ // as the others, either in continuations or reruns.
+ ulpToleranceInMixed = 128;
+ }
EnergyTermsToCompare energyTermsToCompare{ {
- { interaction_function[F_EPOT].longname, relativeToleranceAsPrecisionDependentUlp(10.0, 32, 64) },
+ { interaction_function[F_EPOT].longname,
+ relativeToleranceAsPrecisionDependentUlp(10.0, ulpToleranceInMixed, ulpToleranceInDouble) },
} };
int numWarningsToTolerate = 1;
--- /dev/null
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="referencedata.xsl"?>
+<ReferenceData>
+ <Energy Name="Potential">
+ <Real Name="Time 0.000000 Step 0 in frame 0">-3379.9202</Real>
+ <Real Name="Time 0.002000 Step 2 in frame 1">-3390.9541</Real>
+ </Energy>
+ <Energy Name="Density fitting">
+ <Real Name="Time 0.000000 Step 0 in frame 0">-3378.9026</Real>
+ <Real Name="Time 0.002000 Step 2 in frame 1">-3389.9324</Real>
+ </Energy>
+</ReferenceData>