Merge origin/release-2020 into master
authorPaul Bauer <paul.bauer.q@gmail.com>
Sat, 30 Nov 2019 10:58:42 +0000 (11:58 +0100)
committerPaul Bauer <paul.bauer.q@gmail.com>
Sat, 30 Nov 2019 10:58:42 +0000 (11:58 +0100)
Resolved Conflicts:
src/gromacs/mdlib/update.cpp
src/gromacs/mdrun/md.cpp

Change-Id: Ibf1c632cc3096121dccd6c1c1af9bff3d590d18f

63 files changed:
cmake/FindHWLOC.cmake
docs/reference-manual/special/density-guided-simulation.rst
docs/release-notes/2019/2019.5.rst
docs/release-notes/2020/major/performance.rst
docs/user-guide/mdp-options.rst
docs/user-guide/mdrun-performance.rst
python_packaging/sample_restraint/tests/CMakeGROMACS.txt
src/config.h.cmakein
src/gromacs/applied_forces/densityfitting.cpp
src/gromacs/applied_forces/densityfittingforceprovider.cpp
src/gromacs/applied_forces/densityfittingforceprovider.h
src/gromacs/awh/biasparams.h
src/gromacs/awh/read_params.cpp
src/gromacs/domdec/gpuhaloexchange.h
src/gromacs/domdec/gpuhaloexchange_impl.cpp
src/gromacs/domdec/gpuhaloexchange_impl.cu
src/gromacs/domdec/gpuhaloexchange_impl.cuh
src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu
src/gromacs/ewald/pme_force_sender_gpu_impl.cu
src/gromacs/ewald/pme_only.cpp
src/gromacs/ewald/pme_pp.cpp
src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
src/gromacs/fileio/mrcdensitymap.cpp
src/gromacs/fileio/mrcdensitymap.h
src/gromacs/fileio/mrcdensitymapheader.cpp
src/gromacs/fileio/mrcdensitymapheader.h
src/gromacs/fileio/pdbio.cpp
src/gromacs/fileio/tests/mrcdensitymapheader.cpp
src/gromacs/gpu_utils/devicebuffer_ocl.h
src/gromacs/hardware/hardwaretopology.cpp
src/gromacs/mdlib/coupling.cpp
src/gromacs/mdlib/leapfrog_cuda.cu
src/gromacs/mdlib/leapfrog_cuda.cuh
src/gromacs/mdlib/lincs_cuda.cu
src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdlib/update.cpp
src/gromacs/mdlib/update.h
src/gromacs/mdlib/update_constrain_cuda.h
src/gromacs/mdlib/update_constrain_cuda_impl.cpp
src/gromacs/mdlib/update_constrain_cuda_impl.cu
src/gromacs/mdlib/update_constrain_cuda_impl.h
src/gromacs/mdrun/md.cpp
src/gromacs/mdrun/runner.cpp
src/gromacs/mdtypes/state_propagator_data_gpu.h
src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
src/gromacs/nbnxm/nbnxm.cpp
src/gromacs/nbnxm/nbnxm.h
src/gromacs/nbnxm/nbnxm_gpu.h
src/gromacs/nbnxm/pairlist_simd_2xmm.h
src/gromacs/pulling/pull.cpp
src/gromacs/taskassignment/decidegpuusage.cpp
src/gromacs/taskassignment/decidesimulationworkload.cpp
src/gromacs/taskassignment/taskassignment.cpp
src/gromacs/utility/coolstuff.cpp
src/gromacs/utility/inmemoryserializer.cpp
src/gromacs/utility/inmemoryserializer.h
src/gromacs/utility/tests/CMakeLists.txt
src/gromacs/utility/tests/inmemoryserializer.cpp [new file with mode: 0644]
src/programs/mdrun/tests/densityfittingmodule.cpp
src/programs/mdrun/tests/exactcontinuation.cpp
src/programs/mdrun/tests/refdata/DensityFittingTest_CheckpointWorks.xml [new file with mode: 0644]

index 6f83a3ea3991733701ff81676af43fca4f380c2f..0fbc381f2e72d06da83cc06b2d770f9bc3cd1793 100644 (file)
@@ -355,7 +355,7 @@ if(HWLOC_INCLUDE_DIRS)
                message(STATUS "Error executing hwloc-info: ${HWLOC_INFO_ERR}")
             endif()
             string(REGEX MATCH "[0-9]+.*[0-9]+" HWLOC_INFO_OUT "${HWLOC_INFO_OUT}")
-            set(HWLOC_VERSION ${HWLOC_INFO_OUT} CACHE STRING "HWLOC library version")
+            set(HWLOC_LIBRARY_VERSION ${HWLOC_INFO_OUT})
         endif()
     endif()
 
@@ -364,20 +364,37 @@ if(HWLOC_INCLUDE_DIRS)
     endif()
 
     # Parse header if cross-compiling, or if hwloc-info was not found
-    if(NOT HWLOC_VERSION)
-        # HWLOC is never installed as a framework on OS X, so this should always work.
-        file(READ "${HWLOC_INCLUDE_DIRS}/hwloc.h"
-             HEADER_CONTENTS LIMIT 16384)
-        string(REGEX REPLACE ".*#define HWLOC_API_VERSION (0[xX][0-9a-fA-F]+).*" "\\1"
-               HWLOC_API_VERSION "${HEADER_CONTENTS}")
-        string(SUBSTRING "${HWLOC_API_VERSION}" 4 2 HEX_MAJOR)
-        string(SUBSTRING "${HWLOC_API_VERSION}" 6 2 HEX_MINOR)
-        string(SUBSTRING "${HWLOC_API_VERSION}" 8 2 HEX_PATCH)
-        hex2dec(${HEX_MAJOR} DEC_MAJOR)
-        hex2dec(${HEX_MINOR} DEC_MINOR)
-        hex2dec(${HEX_PATCH} DEC_PATCH)
-        set(HWLOC_VERSION "${DEC_MAJOR}.${DEC_MINOR}.${DEC_PATCH}" CACHE STRING "HWLOC library version")
+    # Also used to check that library and header versions match
+    # HWLOC is never installed as a framework on OS X, so this should always work.
+    file(READ "${HWLOC_INCLUDE_DIRS}/hwloc.h"
+         HEADER_CONTENTS LIMIT 16384)
+    string(REGEX REPLACE ".*#define HWLOC_API_VERSION (0[xX][0-9a-fA-F]+).*" "\\1"
+           HWLOC_API_VERSION "${HEADER_CONTENTS}")
+    string(SUBSTRING "${HWLOC_API_VERSION}" 4 2 HEX_MAJOR)
+    string(SUBSTRING "${HWLOC_API_VERSION}" 6 2 HEX_MINOR)
+    string(SUBSTRING "${HWLOC_API_VERSION}" 8 2 HEX_PATCH)
+    hex2dec(${HEX_MAJOR} DEC_MAJOR)
+    hex2dec(${HEX_MINOR} DEC_MINOR)
+    hex2dec(${HEX_PATCH} DEC_PATCH)
+    set(HWLOC_HEADER_VERSION "${DEC_MAJOR}.${DEC_MINOR}.${DEC_PATCH}")
+
+    if (HWLOC_LIBRARY_VERSION AND HWLOC_HEADER_VERSION)
+        string(SUBSTRING "${HWLOC_LIBRARY_VERSION}" 0 1 LIBRARY_MAJOR)
+        string(SUBSTRING "${HWLOC_HEADER_VERSION}" 0 1 HEADER_MAJOR)
+        string(COMPARE EQUAL "${LIBRARY_MAJOR}" "${HEADER_MAJOR}" HWLOC_VERSION_CHECK)
+        if(NOT HWLOC_VERSION_CHECK)
+            message(FATAL_ERROR "Detected version mismatch between HWLOC headers and library. "
+            "Library version is ${HWLOC_LIBRARY_VERSION}, but header version is ${HWLOC_HEADER_VERSION}. "
+            "Make sure that you have the correct include and library directory set for HWLOC")
+        endif()
+    endif()
+    if (HWLOC_LIBRARY_VERSION)
+        set(HWLOC_VERSION ${HWLOC_LIBRARY_VERSION} CACHE STRING "HWLOC library version")
+
+    else()
+        set(HWLOC_VERSION ${HWLOC_HEADER_VERSION} CACHE STRING "HWLOC library version")
     endif()
+    set(GMX_HWLOC_API_VERSION ${HWLOC_API_VERSION} CACHE STRING "HWLOC API version during configuration time")
 endif()
 
 include(FindPackageHandleStandardArgs)
index 8be48bbd82aee3953d463aceca5199fc799f9db5..36968f2dbe468a5889dfe193686ae2fe6feeadf3 100644 (file)
@@ -176,8 +176,6 @@ Closely related formats like `ccp4` and `map` might work.
 
 Be aware that different visualization software handles map formats differently.
 During simulations, reference densities are interpreted as visualised by `VMD`.
-If the reference map shows unexpected behaviour, swapping endianess with a map
-conversion tool like `em2em` might help.
 
 Output
 ^^^^^^
index 38398497955ea69294609cfb1518db351e36f557..7a757cc08eba888fd77e4b8f8ea464c89ab81e70 100644 (file)
@@ -61,6 +61,22 @@ about missing interactions.
 
 :issue:`3204`
 
+Fix issues with AWH with pull-geometry 'direction' to be periodic
+"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+
+Removed fatal error with AWH with periodic pull-geometry 'direction'
+when the distance was within 2% of half the box size.
+Changed an assertion failure when the AWH interval was larger than
+the box size to a fatal error.
+Clarified the documentation for pull geometry 'direction-periodic'.
+
+:issue:`2946`
+
+Remove assertion failure with AWH when not using the initial stage
+""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+
+:issue:`3217`
+
 Fixes for ``gmx`` tools
 ^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -83,3 +99,12 @@ is unchanged. Existing parsers that conform to the documentation
 and expect whitespace separation will continue to work in all cases.
 
 :issue:`3176`
+
+Fix duplicate PDB CONECT record output
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+PDB "CONECT" record output was duplicated in some instances. Since |Gromacs| does
+not use these anywhere, analysis was not affected. The behavior is now fixed.
+
+:issue:`3206`
+
index 9dd544f788f648a075ad784c5c77436f9e4cfab1..656c9be7f16943738c9a95a5c894fe9dbe4455ae 100644 (file)
@@ -32,7 +32,7 @@ the same SIMD flavor.
 Update and constraints can run on a (single) GPU
 """"""""""""""""""""""""""""""""""""""""""""""""
 
-For standard constant-NVT runs (see the user guide for more details),
+For standard simulations (see the user guide for more details),
 update and constraints can be offloaded to a GPU with CUDA. Thus all compute
 intensive parts of a simulation can be offloaded, which provides
 better performance when using a fast GPU combined with a slow CPU.
index 35d391bffa722c1376324991f845e29d4131706e..b9901f7b70b1eb1c670721119422b0759ce157bd 100644 (file)
@@ -1672,10 +1672,13 @@ pull-coord2-vec, pull-coord2-k, and so on.
 
    .. mdp-value:: direction-periodic
 
-      As :mdp-value:`pull-coord1-geometry=direction`, but allows the distance to be larger
-      than half the box size. With this geometry the box should not be
+      As :mdp-value:`pull-coord1-geometry=direction`, but does not apply
+      periodic box vector corrections to keep the distance within half
+      the box length. This is (only) useful for pushing groups apart
+      by more than half the box length by continuously changing the reference
+      location using a pull rate. With this geometry the box should not be
       dynamic (*e.g.* no pressure scaling) in the pull dimensions and
-      the pull force is not added to virial.
+      the pull force is not added to the virial.
 
    .. mdp-value:: direction-relative
 
index 962752417f6e783cc7a5caaa537985e94008d40d..a29a3a073807698aa164c5f2271ef93d23c91c39 100644 (file)
@@ -530,7 +530,7 @@ behavior.
     Defaults to "auto," which currently always uses the CPU.
     Setting "gpu" requires that a compatible CUDA GPU is available.
     Update and constraints on a GPU is currently not supported
-    with pressure coupling, free-energy, domain decomposition, virtual sites,
+    with free-energy, domain decomposition, virtual sites,
     Ewald surface correction, replica exchange, the pull code,
     orientation restraints and computational electrophysiology.
 
index a12dedbaef12e4ba6322bc9798c7a2da973877c7..acc960830bf3cad7c55d74fd041ba7fff924726e 100644 (file)
@@ -29,4 +29,6 @@ add_custom_target(gmxapi_extension_pytest
 # The current test fixtures require the `gmx` tool-wrapper executable.
 add_dependencies(gmxapi_extension_pytest gmx)
 
+add_dependencies(tests gmxapi_extension_histogram-test gmxapi_extension_bounding-test)
+
 add_dependencies(check gmxapi_extension_pytest)
index f8310fba9771ae00f83f9ae42aadd8f7a640f2ec..bed5e8f95c499395407e3f376fe71b23f291ca68 100644 (file)
 /* Use the Portable Hardware Locality package (hwloc) */
 #cmakedefine01 GMX_USE_HWLOC
 
+/* Library version found for hwloc during configuration time */
+#define GMX_HWLOC_API_VERSION @GMX_HWLOC_API_VERSION@
+
 /* Can and should use nice(3) to set priority */
 #cmakedefine01 GMX_USE_NICE
 
index 2f34d1281897680dfb270fc633a13b7e56655d43..28ead0fcfa1b5398876e0b3766a5f9f373683d18 100644 (file)
@@ -379,12 +379,15 @@ public:
      * \param[in] checkpointWriting enables writing to the Key-Value-Tree
      *                              that is used for storing the checkpoint
      *                              information
+     *
+     * \note The provided state to checkpoint has to change if checkpointing
+     *       is moved before the force provider call in the MD-loop.
      */
     void writeCheckpointData(MdModulesWriteCheckpointData checkpointWriting)
     {
         if (densityFittingOptions_.active())
         {
-            const DensityFittingForceProviderState& state = forceProvider_->state();
+            const DensityFittingForceProviderState& state = forceProvider_->stateToCheckpoint();
             checkpointWriting.builder_.addValue<std::int64_t>(
                     DensityFittingModuleInfo::name_ + "-stepsSinceLastCalculation",
                     state.stepsSinceLastCalculation_);
index 24e40526fb07dbb731f800622aa32150b7da694e..106831562de37c10b3d9e98faf16e22a6ffdee99 100644 (file)
@@ -102,12 +102,13 @@ public:
     ~Impl();
     void calculateForces(const ForceProviderInput& forceProviderInput,
                          ForceProviderOutput*      forceProviderOutput);
-
-    DensityFittingForceProviderState state();
+    const DensityFittingForceProviderState& stateToCheckpoint();
 
 private:
+    DensityFittingForceProviderState state();
     const DensityFittingParameters&  parameters_;
     DensityFittingForceProviderState state_;
+    DensityFittingForceProviderState stateToCheckpoint_;
     LocalAtomSet                     localAtomSet_;
 
     GaussianSpreadKernelParameters::Shape spreadKernel_;
@@ -176,6 +177,8 @@ DensityFittingForceProvider::Impl::Impl(const DensityFittingParameters&
 void DensityFittingForceProvider::Impl::calculateForces(const ForceProviderInput& forceProviderInput,
                                                         ForceProviderOutput* forceProviderOutput)
 {
+    // TODO change if checkpointing moves to the start of the md loop
+    stateToCheckpoint_ = state();
     // do nothing but count number of steps when not in density fitting step
     if (state_.stepsSinceLastCalculation_ % parameters_.calculationIntervalInSteps_ != 0)
     {
@@ -298,6 +301,10 @@ DensityFittingForceProviderState DensityFittingForceProvider::Impl::state()
     return state_;
 }
 
+const DensityFittingForceProviderState& DensityFittingForceProvider::Impl::stateToCheckpoint()
+{
+    return stateToCheckpoint_;
+}
 /********************************************************************
  * DensityFittingForceProvider
  */
@@ -321,9 +328,9 @@ void DensityFittingForceProvider::calculateForces(const ForceProviderInput& forc
     impl_->calculateForces(forceProviderInput, forceProviderOutput);
 }
 
-DensityFittingForceProviderState DensityFittingForceProvider::state()
+const DensityFittingForceProviderState& DensityFittingForceProvider::stateToCheckpoint()
 {
-    return impl_->state();
+    return impl_->stateToCheckpoint();
 }
 
 } // namespace gmx
index 3c979865c5bea522a2f36f0cbce0ab6abaca00cf..16a121ba3432d21ee32dc30e9393bac563b80678 100644 (file)
@@ -93,8 +93,11 @@ public:
     void calculateForces(const ForceProviderInput& forceProviderInput,
                          ForceProviderOutput*      forceProviderOutput) override;
 
-    //! Return the state of the forceprovider.
-    DensityFittingForceProviderState state();
+    /*! \brief Return the state of the forceprovider to be checkpointed
+     * TODO update this routine if checkpointing is moved to the beginning of
+     *      the md loop
+     */
+    const DensityFittingForceProviderState& stateToCheckpoint();
 
 private:
     class Impl;
index 938eefce1eb8a542432b220176a4e504007ece69..812963c27a508386a9cc4dd1713d223adcd4c0ac 100644 (file)
@@ -145,7 +145,10 @@ public:
      * \returns true at steps where checks should be performed.
      * \note  Only returns true at free energy update steps.
      */
-    bool isCheckCoveringStep(int64_t step) const { return step % numStepsCheckCovering_ == 0; }
+    bool isCheckCoveringStep(int64_t step) const
+    {
+        return step > 0 && (step % numStepsCheckCovering_ == 0);
+    }
 
     /*! \brief
      * Returns if to perform checks for anomalies in the histogram.
index 55837e1f88bc2b17960323059d355a19a56354e5..2b277177316ea97859389c93c38f3608f505a144 100644 (file)
@@ -620,8 +620,14 @@ static double get_pull_coord_period(const t_pull_coord& pullCoordParams, const t
             const real innerProduct = iprod(pullCoordParams.vec, pbc.box[dim]);
             if (innerProduct >= (1 - margin) * boxLength && innerProduct <= (1 + margin) * boxLength)
             {
-                GMX_RELEASE_ASSERT(intervalLength < (1 + margin) * boxLength,
-                                   "We have checked before that interval <= period");
+                if (intervalLength > (1 + margin) * boxLength)
+                {
+                    gmx_fatal(FARGS,
+                              "The AWH interval (%f nm) for a pull coordinate is larger than the "
+                              "box size (%f nm)",
+                              intervalLength, boxLength);
+                }
+
                 if (intervalLength > periodicFraction * boxLength)
                 {
                     period = boxLength;
index f577124adadae6fab7ac5a3c7823f87eba1cfbf5..b7e6ff54fa70c0aa273aa3300ac177e137384a26 100644 (file)
@@ -110,6 +110,10 @@ public:
      */
     void communicateHaloForces(bool accumulateForces);
 
+    /*! \brief Get the event synchronizer for the forces ready on device.
+     *  \returns  The event to synchronize the stream that consumes forces on device.
+     */
+    GpuEventSynchronizer* getForcesReadyOnDeviceEvent();
 
 private:
     class Impl;
index 7ea94ac6a4a068a781bc55a213e03d84493450f0..2511673218b3eac56b681e808c80da36989b17c6 100644 (file)
@@ -96,6 +96,14 @@ void GpuHaloExchange::communicateHaloForces(bool gmx_unused accumulateForces)
                "A CPU stub for GPU Halo Exchange was called insted of the correct implementation.");
 }
 
+/*!\brief get forces ready on device event stub. */
+GpuEventSynchronizer* GpuHaloExchange::getForcesReadyOnDeviceEvent()
+{
+    GMX_ASSERT(false,
+               "A CPU stub for GPU Halo Exchange was called insted of the correct implementation.");
+    return nullptr;
+}
+
 } // namespace gmx
 
 #endif /* GMX_GPU != GMX_GPU_CUDA */
index d917be3f3998e16c770f12b2d8faae85394bf8e7..660566a9dd9117343497c5d6308c8c6e255f840e 100644 (file)
@@ -58,6 +58,7 @@
 #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/gpu_utils/vectype_ops.cuh"
 #include "gromacs/pbcutil/ishift.h"
+#include "gromacs/utility/gmxmpi.h"
 
 #include "domdec_internal.h"
 
@@ -235,7 +236,7 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix          box
         launchGpuKernel(kernelFn, config, nullptr, "Domdec GPU Apply X Halo Exchange", kernelArgs);
     }
 
-    communicateHaloData(d_x_, HaloQuantity::HaloCoordinates);
+    communicateHaloData(d_x_, HaloQuantity::HaloCoordinates, coordinatesReadyOnDeviceEvent);
 
     return;
 }
@@ -246,7 +247,7 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
 {
 
     // Communicate halo data (in non-local stream)
-    communicateHaloData(d_f_, HaloQuantity::HaloForces);
+    communicateHaloData(d_f_, HaloQuantity::HaloForces, nullptr);
 
     float3* d_f = d_f_;
 
@@ -289,10 +290,13 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
 
         launchGpuKernel(kernelFn, config, nullptr, "Domdec GPU Apply F Halo Exchange", kernelArgs);
     }
+    fReadyOnDevice_.markEvent(nonLocalStream_);
 }
 
 
-void GpuHaloExchange::Impl::communicateHaloData(float3* d_ptr, HaloQuantity haloQuantity)
+void GpuHaloExchange::Impl::communicateHaloData(float3*               d_ptr,
+                                                HaloQuantity          haloQuantity,
+                                                GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
 {
 
     void* sendPtr;
@@ -310,10 +314,16 @@ void GpuHaloExchange::Impl::communicateHaloData(float3* d_ptr, HaloQuantity halo
         recvRank  = recvRankX_;
 
 #if GMX_MPI
-        // Wait for signal from receiving task that it is ready, and similarly send signal to task that will push data to this task
-        char thisTaskIsReady, remoteTaskIsReady;
-        MPI_Sendrecv(&thisTaskIsReady, sizeof(char), MPI_BYTE, recvRank, 0, &remoteTaskIsReady,
-                     sizeof(char), MPI_BYTE, sendRank, 0, mpi_comm_mysim_, MPI_STATUS_IGNORE);
+        // Wait for event from receiving task that remote coordinates are ready, and enqueue that event to stream used
+        // for subsequent data push. This avoids a race condition with the remote data being written in the previous timestep.
+        // Similarly send event to task that will push data to this task.
+        GpuEventSynchronizer* remoteCoordinatesReadyOnDeviceEvent;
+        MPI_Sendrecv(&coordinatesReadyOnDeviceEvent, sizeof(GpuEventSynchronizer*), MPI_BYTE,
+                     recvRank, 0, &remoteCoordinatesReadyOnDeviceEvent, sizeof(GpuEventSynchronizer*),
+                     MPI_BYTE, sendRank, 0, mpi_comm_mysim_, MPI_STATUS_IGNORE);
+        remoteCoordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
+#else
+        GMX_UNUSED_VALUE(coordinatesReadyOnDeviceEvent);
 #endif
     }
     else
@@ -372,6 +382,11 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(void* sendPtr,
 #endif
 }
 
+GpuEventSynchronizer* GpuHaloExchange::Impl::getForcesReadyOnDeviceEvent()
+{
+    return &fReadyOnDevice_;
+}
+
 /*! \brief Create Domdec GPU object */
 GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* localStream, void* nonLocalStream) :
     dd_(dd),
@@ -437,4 +452,8 @@ void GpuHaloExchange::communicateHaloForces(bool accumulateForces)
     impl_->communicateHaloForces(accumulateForces);
 }
 
+GpuEventSynchronizer* GpuHaloExchange::getForcesReadyOnDeviceEvent()
+{
+    return impl_->getForcesReadyOnDeviceEvent();
+}
 } // namespace gmx
index 626810b04ad4f6eabd8d4a7086436e85e651071a..017cb191869e40bc9d5b46cb1c9f164a0767e6a5 100644 (file)
@@ -96,12 +96,20 @@ public:
      */
     void communicateHaloForces(bool accumulateForces);
 
+    /*! \brief Get the event synchronizer for the forces ready on device.
+     *  \returns  The event to synchronize the stream that consumes forces on device.
+     */
+    GpuEventSynchronizer* getForcesReadyOnDeviceEvent();
+
 private:
     /*! \brief Data transfer wrapper for GPU halo exchange
      * \param [inout] d_ptr      pointer to coordinates or force buffer in GPU memory
      * \param [in] haloQuantity  switch on whether X or F halo exchange is being performed
+     * \param [in] coordinatesReadyOnDeviceEvent event recorded when coordinates have been copied to device
      */
-    void communicateHaloData(float3* d_ptr, HaloQuantity haloQuantity);
+    void communicateHaloData(float3*               d_ptr,
+                             HaloQuantity          haloQuantity,
+                             GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
 
     /*! \brief Data transfer for GPU halo exchange using CUDA memcopies
      * \param [inout] sendPtr    address to send data from
@@ -174,6 +182,8 @@ private:
     float3* d_x_ = nullptr;
     //! full forces buffer in GPU memory
     float3* d_f_ = nullptr;
+    //! An event recorded once the exchanged forces are ready on the GPU
+    GpuEventSynchronizer fReadyOnDevice_;
 };
 
 } // namespace gmx
index dbb0ace8b23ee4eec0cce5dec27cca6f1eb76f1b..3ca69e27a882225bba32590da8afb32f0d2f0501 100644 (file)
@@ -87,6 +87,8 @@ void PmeCoordinateReceiverGpu::Impl::sendCoordinateBufferAddressToPpRanks(rvec*
 
 #if GMX_MPI
         MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, receiver.rankId, 0, comm_);
+#else
+        GMX_UNUSED_VALUE(sendBuf);
 #endif
     }
 }
@@ -101,6 +103,8 @@ void PmeCoordinateReceiverGpu::Impl::launchReceiveCoordinatesFromPpCudaDirect(in
     MPI_Irecv(&ppSync_[recvCount_], sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_,
               &request_[recvCount_]);
     recvCount_++;
+#else
+    GMX_UNUSED_VALUE(ppRank);
 #endif
 }
 
index 73a7c19c91c203b2839bbaf1cce28c31529b30f1..1268bac7921307034e4164d82c2279b538a8f043 100644 (file)
@@ -82,6 +82,8 @@ void PmeForceSenderGpu::Impl::sendForceBufferAddressToPpRanks(rvec* d_f)
 
 #if GMX_MPI
         MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, receiver.rankId, 0, comm_);
+#else
+        GMX_UNUSED_VALUE(sendBuf);
 #endif
     }
 }
@@ -98,6 +100,9 @@ void PmeForceSenderGpu::Impl::sendFToPpCudaDirect(int ppRank)
     // TODO Using MPI_Isend would be more efficient, particularly when
     // sending to multiple PP ranks
     MPI_Send(&pmeSyncPtr, sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_);
+#else
+    GMX_UNUSED_VALUE(pmeSyncPtr);
+    GMX_UNUSED_VALUE(ppRank);
 #endif
 }
 
index 0610a2dc67724e61e86898b75fe0478d618567bb..be7a977daf1313fbf6f982727bd2de86dde75a8a 100644 (file)
@@ -483,6 +483,7 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t*            pme,
         messages = 0;
     } while (status == -1);
 #else
+    GMX_UNUSED_VALUE(pme);
     GMX_UNUSED_VALUE(pme_pp);
     GMX_UNUSED_VALUE(box);
     GMX_UNUSED_VALUE(maxshift_x);
@@ -494,6 +495,8 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t*            pme,
     GMX_UNUSED_VALUE(grid_size);
     GMX_UNUSED_VALUE(ewaldcoeff_q);
     GMX_UNUSED_VALUE(ewaldcoeff_lj);
+    GMX_UNUSED_VALUE(useGpuForPme);
+    GMX_UNUSED_VALUE(stateGpu);
 
     status = pmerecvqxX;
 #endif
@@ -506,6 +509,7 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t*            pme,
     return status;
 }
 
+#if GMX_MPI
 /*! \brief Send force data to PP ranks */
 static void sendFToPP(void* sendbuf, PpRanks receiver, gmx_pme_pp* pme_pp, int* messages)
 {
@@ -520,14 +524,13 @@ static void sendFToPP(void* sendbuf, PpRanks receiver, gmx_pme_pp* pme_pp, int*
     }
     else
     {
-#if GMX_MPI
         // Send using MPI
         MPI_Isend(sendbuf, receiver.numAtoms * sizeof(rvec), MPI_BYTE, receiver.rankId, 0,
                   pme_pp->mpi_comm_mysim, &pme_pp->req[*messages]);
         *messages = *messages + 1;
-#endif
     }
 }
+#endif
 
 /*! \brief Send the PME mesh force, virial and energy to the PP-only ranks. */
 static void gmx_pme_send_force_vir_ener(const gmx_pme_t& pme,
@@ -582,6 +585,7 @@ static void gmx_pme_send_force_vir_ener(const gmx_pme_t& pme,
     MPI_Waitall(messages, pme_pp->req.data(), pme_pp->stat.data());
 #else
     gmx_call("MPI not enabled");
+    GMX_UNUSED_VALUE(pme);
     GMX_UNUSED_VALUE(pme_pp);
     GMX_UNUSED_VALUE(output);
     GMX_UNUSED_VALUE(dvdlambda_q);
index fd9de7ed5fd74ab883ec7f8cdc01ac5191449d79..acd0a25ef74b5b7c75d4943c0248988a7965b1bd 100644 (file)
@@ -230,7 +230,11 @@ static void gmx_pme_send_coeffs_coords(t_forcerec*      fr,
             }
         }
     }
-
+#else
+    GMX_UNUSED_VALUE(fr);
+    GMX_UNUSED_VALUE(reinitGpuPmePpComms);
+    GMX_UNUSED_VALUE(sendCoordinatesFromGpu);
+    GMX_UNUSED_VALUE(coordinatesReadyOnDeviceEvent);
 #endif
     if (!c_useDelayedWait)
     {
@@ -420,6 +424,8 @@ static void recvFFromPme(gmx::PmePpCommGpu* pmePpCommGpu,
 #if GMX_MPI
         MPI_Recv(recvptr, n * sizeof(rvec), MPI_BYTE, cr->dd->pme_nodeid, 0, cr->mpi_comm_mysim,
                  MPI_STATUS_IGNORE);
+#else
+        GMX_UNUSED_VALUE(cr);
 #endif
     }
 }
index 9a87e951e56425b87257b87f9c0836199f692ada..827a1bab343f956694a8b09435bed71d99d2195c 100644 (file)
@@ -68,12 +68,15 @@ PmePpCommGpu::Impl::~Impl() = default;
 void PmePpCommGpu::Impl::reinit(int size)
 {
     // This rank will access PME rank memory directly, so needs to receive the remote PME buffer addresses.
+#if GMX_MPI
     MPI_Recv(&remotePmeXBuffer_, sizeof(void**), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
     MPI_Recv(&remotePmeFBuffer_, sizeof(void**), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
 
     // Reallocate buffer used for staging PME force on GPU
     reallocateDeviceBuffer(&d_pmeForces_, size, &d_pmeForcesSize_, &d_pmeForcesSizeAlloc_, nullptr);
-
+#else
+    GMX_UNUSED_VALUE(size);
+#endif
     return;
 }
 
@@ -81,7 +84,7 @@ void PmePpCommGpu::Impl::reinit(int size)
 // launchRecvForceFromPmeCudaDirect() and sycnRecvForceFromPmeCudaDirect()
 void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(void* recvPtr, int recvSize, bool receivePmeForceToGpu)
 {
-
+#if GMX_MPI
     // Receive event from PME task and add to stream, to ensure pull of data doesn't
     // occur before PME force calc is completed
     GpuEventSynchronizer* pmeSync;
@@ -107,6 +110,11 @@ void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(void* recvPtr, int recvSi
         // them with other forces on the CPU
         cudaStreamSynchronize(pmePpCommStream_);
     }
+#else
+    GMX_UNUSED_VALUE(recvPtr);
+    GMX_UNUSED_VALUE(recvSize);
+    GMX_UNUSED_VALUE(receivePmeForceToGpu);
+#endif
 }
 
 void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaDirect(void* sendPtr,
@@ -114,7 +122,7 @@ void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaDirect(void* sendPtr,
                                                         bool gmx_unused sendPmeCoordinatesFromGpu,
                                                         GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
 {
-
+#if GMX_MPI
     // ensure stream waits until coordinate data is available on device
     coordinatesReadyOnDeviceEvent->enqueueWaitEvent(pmePpCommStream_);
 
@@ -126,6 +134,12 @@ void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaDirect(void* sendPtr,
     pmeCoordinatesSynchronizer_.markEvent(pmePpCommStream_);
     GpuEventSynchronizer* pmeSync = &pmeCoordinatesSynchronizer_;
     MPI_Send(&pmeSync, sizeof(GpuEventSynchronizer*), MPI_BYTE, pmeRank_, 0, comm_);
+#else
+    GMX_UNUSED_VALUE(sendPtr);
+    GMX_UNUSED_VALUE(sendSize);
+    GMX_UNUSED_VALUE(sendPmeCoordinatesFromGpu);
+    GMX_UNUSED_VALUE(coordinatesReadyOnDeviceEvent);
+#endif
 }
 void* PmePpCommGpu::Impl::getGpuForceStagingPtr()
 {
index 3104463806782ab5ffed914f327636aa0be5c08f..ff866adfde4a91c514c13215ea874a43b8438cd7 100644 (file)
@@ -168,18 +168,30 @@ public:
     const MrcDensityMapOfFloatReader& reader() const;
 
 private:
-    std::vector<char>                buffer_;
-    InMemoryDeserializer             serializer_;
-    const MrcDensityMapOfFloatReader reader_;
+    const std::vector<char>                     buffer_;
+    std::unique_ptr<InMemoryDeserializer>       serializer_;
+    std::unique_ptr<MrcDensityMapOfFloatReader> reader_;
 };
 
 MrcDensityMapOfFloatFromFileReader::Impl::Impl(const std::string& filename) :
     buffer_(readCharBufferFromFile(filename)),
-    serializer_(buffer_, false),
-    reader_(&serializer_)
+    serializer_(std::make_unique<InMemoryDeserializer>(buffer_, false)),
+    reader_(std::make_unique<MrcDensityMapOfFloatReader>(serializer_.get()))
 {
-    layout_right::mapping<dynamicExtents3D> map(getDynamicExtents3D(reader_.header()));
-    if (map.required_span_size() != reader_.constView().ssize())
+    if (!mrcHeaderIsSane(reader_->header()))
+    {
+        serializer_ = std::make_unique<InMemoryDeserializer>(buffer_, false, EndianSwapBehavior::DoSwap);
+        reader_ = std::make_unique<MrcDensityMapOfFloatReader>(serializer_.get());
+        if (!mrcHeaderIsSane(reader_->header()))
+        {
+            GMX_THROW(FileIOError(
+                    "Header of '" + filename
+                    + "' fails sanity check for little- as well as big-endian reading."));
+        }
+    }
+
+    layout_right::mapping<dynamicExtents3D> map(getDynamicExtents3D(reader_->header()));
+    if (map.required_span_size() != reader_->constView().ssize())
     {
         GMX_THROW(FileIOError("File header density extent information of " + filename
                               + "' does not match density data size"));
@@ -188,7 +200,7 @@ MrcDensityMapOfFloatFromFileReader::Impl::Impl(const std::string& filename) :
 
 const MrcDensityMapOfFloatReader& MrcDensityMapOfFloatFromFileReader::Impl::reader() const
 {
-    return reader_;
+    return *reader_;
 }
 
 /********************************************************************
index 85ef1e55105c408ecbe9ac50cb7a017fc3f480d7..7f746567306c96948bce6d7509a65de00c8f9815 100644 (file)
@@ -87,6 +87,10 @@ private:
  * upon construction and returns coordinate transformation into the density
  * lattice as well as the density data.
  *
+ * Attempts reading with swapped endianess if header is not sane.
+ *
+ * Performs basic sanity checks on header information and data size.
+ *
  * \note File reading is completed during construction. When the constructor
  *       completes succesfully, transformation to density lattice and density
  *       data are valid, irrespective of the state of the read file.
index 28842078ba275962092c779c3786478979e9a7e9..faaefc5773a77b12c7289d99fb48c91513739acc 100644 (file)
 
 namespace gmx
 {
+
+namespace
+{
+
+//! Returns true if any of the argument values is smaller than zero.
+template<typename Container>
+bool anySmallerZero(Container values)
+{
+    return std::any_of(std::begin(values), std::end(values), [](auto v) { return v < 0; });
+}
+
+//! Returns true if any of the argument values is larger than a given boundary value.
+template<typename Container>
+bool anyLargerThanValue(Container values, typename Container::value_type boundaryValue)
+{
+    return std::any_of(std::begin(values), std::end(values),
+                       [boundaryValue](auto v) { return v > boundaryValue; });
+}
+
+} // namespace
+
 size_t numberOfExpectedDataItems(const MrcDensityMapHeader& header)
 {
-    if (std::any_of(std::begin(header.numColumnRowSection_), std::end(header.numColumnRowSection_),
-                    [](auto i) { return i < 0; }))
+    if (anySmallerZero(header.numColumnRowSection_))
     {
         GMX_THROW(
                 InternalError("Cannot determine data size, because the mrc "
@@ -95,4 +115,31 @@ dynamicExtents3D getDynamicExtents3D(const MrcDensityMapHeader& header)
              header.numColumnRowSection_[XX] };
 };
 
+bool mrcHeaderIsSane(const MrcDensityMapHeader& header)
+{
+    // Make sure all numbers of columns, row sections, extents and cell angles
+    // are positive
+    if (anySmallerZero(header.numColumnRowSection_) || anySmallerZero(header.cellAngles_)
+        || anySmallerZero(header.extent_))
+    {
+        return false;
+    }
+
+    // The maximum integer number in an mrc header to be considered sane
+    constexpr std::int32_t c_maxIntegerNumber = 100'000;
+    if (anyLargerThanValue(header.numColumnRowSection_, c_maxIntegerNumber)
+        || anyLargerThanValue(header.extent_, c_maxIntegerNumber))
+    {
+        return false;
+    }
+
+    constexpr std::int32_t c_maxCellAngle = 360;
+    if (anyLargerThanValue(header.cellAngles_, c_maxCellAngle))
+    {
+        return false; //NOLINT(readability-simplify-boolean-expr)
+    }
+
+    return true;
+}
+
 } // namespace gmx
index f358efbaa2e1ca56384987f485942dd7cf45a369..82160127fa62ccda348df20c5480730169f5974c 100644 (file)
@@ -190,5 +190,17 @@ TranslateAndScale getCoordinateTransformationToLattice(const MrcDensityMapHeader
  * \returns density data extents in three dimensions.
  */
 dynamicExtents3D getDynamicExtents3D(const MrcDensityMapHeader& header);
+
+/*! \brief Checks if the values in the header are sane.
+ *
+ * Checks extents and numbers of columns, rows and sections, as well as unit
+ * cell angles for positivity and to be within bounds.
+ *
+ * Bounds are set generously not to hamper future creative uses of mrc files.
+ *
+ * \returns true if all header values are within resonable albeit generous bounds
+ */
+bool mrcHeaderIsSane(const MrcDensityMapHeader& header);
+
 } // namespace gmx
 #endif /* end of include guard: GMX_FILEIO_MRCDENSITYMAPHEADER_H */
index fa4be6e34b02ec4b5c147b66021151052b5bd6fd..3466d8d4f1114644918dbe3c9fba228062ccc701 100644 (file)
@@ -765,9 +765,7 @@ static void gmx_conect_addline(gmx_conect_t* con, char* line)
             n      = sscanf(line, format.c_str(), &aj);
             if (n == 1)
             {
-                srenew(con->conect, ++con->nconect);
-                con->conect[con->nconect - 1].ai = ai - 1;
-                con->conect[con->nconect - 1].aj = aj - 1;
+                gmx_conect_add(con, ai - 1, aj - 1); /* to prevent duplicated records */
             }
         } while (n == 1);
     }
index ad303cad21672749cc815f76b2ab561e4bd7de9d..e9e17df359a87e4d0f267058456ed815e2964cd8 100644 (file)
@@ -140,4 +140,22 @@ TEST(MrcDensityMapHeaderTest, GetsCorrectExtents)
     EXPECT_EQ(expectedExtents[ZZ], extents.extent(ZZ));
 }
 
+TEST(MrcDensityMapHeaderTest, IsSane)
+{
+    MrcDensityMapHeader header;
+    EXPECT_TRUE(mrcHeaderIsSane(header));
+
+    header.numColumnRowSection_[YY] = -1;
+    EXPECT_FALSE(mrcHeaderIsSane(header));
+
+    header.numColumnRowSection_[YY] = 10'000'000;
+    EXPECT_FALSE(mrcHeaderIsSane(header));
+
+    header = {};
+    EXPECT_TRUE(mrcHeaderIsSane(header));
+
+    header.cellAngles_[XX] = -20;
+    EXPECT_FALSE(mrcHeaderIsSane(header));
+}
+
 } // namespace gmx
index ce1dcb74cbfdefbbf310d09cde582d74c6b5b095..43669df1cdb655210acf1cd5b68cd12821fdf126 100644 (file)
@@ -48,7 +48,9 @@
 #include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/gpu_utils/gpu_utils.h" //only for GpuApiCallBehavior
 #include "gromacs/gpu_utils/gputraits_ocl.h"
+#include "gromacs/gpu_utils/oclutils.h"
 #include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/stringutil.h"
 
 /*! \libinternal \brief
  * Allocates a device-side buffer.
@@ -67,7 +69,10 @@ void allocateDeviceBuffer(DeviceBuffer<ValueType>* buffer, size_t numValues, Dev
     cl_int clError;
     *buffer = clCreateBuffer(deviceContext, CL_MEM_READ_WRITE, numValues * sizeof(ValueType),
                              hostPtr, &clError);
-    GMX_RELEASE_ASSERT(clError == CL_SUCCESS, "clCreateBuffer failure");
+    GMX_RELEASE_ASSERT(clError == CL_SUCCESS,
+                       gmx::formatString("clCreateBuffer failure (OpenCL error %d: %s)", clError,
+                                         ocl_get_error_string(clError).c_str())
+                               .c_str());
 }
 
 /*! \brief
@@ -84,7 +89,11 @@ void freeDeviceBuffer(DeviceBuffer* buffer)
     GMX_ASSERT(buffer, "needs a buffer pointer");
     if (*buffer)
     {
-        GMX_RELEASE_ASSERT(clReleaseMemObject(*buffer) == CL_SUCCESS, "clReleaseMemObject failed");
+        cl_int clError = clReleaseMemObject(*buffer);
+        GMX_RELEASE_ASSERT(clError == CL_SUCCESS,
+                           gmx::formatString("clReleaseMemObject failed (OpenCL error %d: %s)",
+                                             clError, ocl_get_error_string(clError).c_str())
+                                   .c_str());
     }
 }
 
@@ -127,13 +136,21 @@ void copyToDeviceBuffer(DeviceBuffer<ValueType>* buffer,
         case GpuApiCallBehavior::Async:
             clError = clEnqueueWriteBuffer(stream, *buffer, CL_FALSE, offset, bytes, hostBuffer, 0,
                                            nullptr, timingEvent);
-            GMX_RELEASE_ASSERT(clError == CL_SUCCESS, "Asynchronous H2D copy failed");
+            GMX_RELEASE_ASSERT(
+                    clError == CL_SUCCESS,
+                    gmx::formatString("Asynchronous H2D copy failed (OpenCL error %d: %s)", clError,
+                                      ocl_get_error_string(clError).c_str())
+                            .c_str());
             break;
 
         case GpuApiCallBehavior::Sync:
             clError = clEnqueueWriteBuffer(stream, *buffer, CL_TRUE, offset, bytes, hostBuffer, 0,
                                            nullptr, timingEvent);
-            GMX_RELEASE_ASSERT(clError == CL_SUCCESS, "Synchronous H2D copy failed");
+            GMX_RELEASE_ASSERT(
+                    clError == CL_SUCCESS,
+                    gmx::formatString("Synchronous H2D copy failed (OpenCL error %d: %s)", clError,
+                                      ocl_get_error_string(clError).c_str())
+                            .c_str());
             break;
 
         default: throw;
@@ -175,13 +192,21 @@ void copyFromDeviceBuffer(ValueType*               hostBuffer,
         case GpuApiCallBehavior::Async:
             clError = clEnqueueReadBuffer(stream, *buffer, CL_FALSE, offset, bytes, hostBuffer, 0,
                                           nullptr, timingEvent);
-            GMX_RELEASE_ASSERT(clError == CL_SUCCESS, "Asynchronous D2H copy failed");
+            GMX_RELEASE_ASSERT(
+                    clError == CL_SUCCESS,
+                    gmx::formatString("Asynchronous D2H copy failed (OpenCL error %d: %s)", clError,
+                                      ocl_get_error_string(clError).c_str())
+                            .c_str());
             break;
 
         case GpuApiCallBehavior::Sync:
             clError = clEnqueueReadBuffer(stream, *buffer, CL_TRUE, offset, bytes, hostBuffer, 0,
                                           nullptr, timingEvent);
-            GMX_RELEASE_ASSERT(clError == CL_SUCCESS, "Synchronous D2H copy failed");
+            GMX_RELEASE_ASSERT(
+                    clError == CL_SUCCESS,
+                    gmx::formatString("Synchronous D2H copy failed (OpenCL error %d: %s)", clError,
+                                      ocl_get_error_string(clError).c_str())
+                            .c_str());
             break;
 
         default: throw;
@@ -208,7 +233,10 @@ void clearDeviceBufferAsync(DeviceBuffer<ValueType>* buffer, size_t startingOffs
     cl_event        commandEvent;
     cl_int clError = clEnqueueFillBuffer(stream, *buffer, &pattern, sizeof(pattern), offset, bytes,
                                          numWaitEvents, waitEvents, &commandEvent);
-    GMX_RELEASE_ASSERT(clError == CL_SUCCESS, "Couldn't clear the device buffer");
+    GMX_RELEASE_ASSERT(clError == CL_SUCCESS,
+                       gmx::formatString("Couldn't clear the device buffer (OpenCL error %d: %s)",
+                                         clError, ocl_get_error_string(clError).c_str())
+                               .c_str());
 }
 
 #endif
index c086002864c5def8076fc5e1d8ea62fc83326d54..ef310340d813f20743c5691c58200100427d9181 100644 (file)
@@ -165,8 +165,14 @@ void parseCpuInfo(HardwareTopology::Machine* machine, HardwareTopology::SupportL
 // Preprocessor variable for if hwloc api is version 1.x.x or 2.x.x
 #    if HWLOC_API_VERSION >= 0x00020000
 #        define GMX_HWLOC_API_VERSION_IS_2XX 1
+#        if GMX_HWLOC_API_VERSION < 0x00020000
+#            error "HWLOC library major version set during configuration is 1, but currently using version 2 headers"
+#        endif
 #    else
 #        define GMX_HWLOC_API_VERSION_IS_2XX 0
+#        if GMX_HWLOC_API_VERSION >= 0x00020000
+#            error "HWLOC library major version set during configuration is 2, but currently using version 1 headers"
+#        endif
 #    endif
 
 /*****************************************************************************
@@ -605,8 +611,14 @@ void parseHwLoc(HardwareTopology::Machine* machine, HardwareTopology::SupportLev
 
     // Flags to look for io devices
 #    if GMX_HWLOC_API_VERSION_IS_2XX
+    GMX_RELEASE_ASSERT(
+            (hwloc_get_api_version() >= 0x20000),
+            "Mismatch between hwloc headers and library, using v2 headers with v1 library");
     hwloc_topology_set_io_types_filter(topo, HWLOC_TYPE_FILTER_KEEP_IMPORTANT);
 #    else
+    GMX_RELEASE_ASSERT(
+            (hwloc_get_api_version() < 0x20000),
+            "Mismatch between hwloc headers and library, using v1 headers with v2 library");
     hwloc_topology_set_flags(topo, HWLOC_TOPOLOGY_FLAG_IO_DEVICES);
 #    endif
 
index f3a4a6ebe15d72025d9854b52902d112b9063390..909564f2af96958b0b15f0b413864324ba70aed9 100644 (file)
@@ -708,7 +708,8 @@ void berendsen_pscale(const t_inputrec*    ir,
                       int                  nr_atoms,
                       rvec                 x[],
                       const unsigned short cFREEZE[],
-                      t_nrnb*              nrnb)
+                      t_nrnb*              nrnb,
+                      const bool           scaleCoordinates)
 {
     ivec* nFreeze = ir->opts.nFreeze;
     int   d;
@@ -719,32 +720,35 @@ void berendsen_pscale(const t_inputrec*    ir,
 #endif
 
     /* Scale the positions */
-#pragma omp parallel for num_threads(nthreads) schedule(static)
-    for (int n = start; n < start + nr_atoms; n++)
+    if (scaleCoordinates)
     {
-        // Trivial OpenMP region that does not throw
-        int g;
-
-        if (cFREEZE == nullptr)
-        {
-            g = 0;
-        }
-        else
+#pragma omp parallel for num_threads(nthreads) schedule(static)
+        for (int n = start; n < start + nr_atoms; n++)
         {
-            g = cFREEZE[n];
-        }
+            // Trivial OpenMP region that does not throw
+            int g;
 
-        if (!nFreeze[g][XX])
-        {
-            x[n][XX] = mu[XX][XX] * x[n][XX] + mu[YY][XX] * x[n][YY] + mu[ZZ][XX] * x[n][ZZ];
-        }
-        if (!nFreeze[g][YY])
-        {
-            x[n][YY] = mu[YY][YY] * x[n][YY] + mu[ZZ][YY] * x[n][ZZ];
-        }
-        if (!nFreeze[g][ZZ])
-        {
-            x[n][ZZ] = mu[ZZ][ZZ] * x[n][ZZ];
+            if (cFREEZE == nullptr)
+            {
+                g = 0;
+            }
+            else
+            {
+                g = cFREEZE[n];
+            }
+
+            if (!nFreeze[g][XX])
+            {
+                x[n][XX] = mu[XX][XX] * x[n][XX] + mu[YY][XX] * x[n][YY] + mu[ZZ][XX] * x[n][ZZ];
+            }
+            if (!nFreeze[g][YY])
+            {
+                x[n][YY] = mu[YY][YY] * x[n][YY] + mu[ZZ][YY] * x[n][ZZ];
+            }
+            if (!nFreeze[g][ZZ])
+            {
+                x[n][ZZ] = mu[ZZ][ZZ] * x[n][ZZ];
+            }
         }
     }
     /* compute final boxlengths */
index 18768c28f29fd59e7647c9a19757ebe34833ea9e..ac4135960740aea0c45f4e8db8deb27856b2db45 100644 (file)
@@ -111,19 +111,19 @@ enum class VelocityScalingType
  *  \todo Check if the force should be set to zero here.
  *  \todo This kernel can also accumulate incidental temperatures for each atom.
  *
- * \tparam        numTempScaleValues             The number of different T-couple values.
- * \tparam        velocityScaling                Type of the Parrinello-Rahman velocity rescaling.
- * \param[in]     numAtoms                       Total number of atoms.
- * \param[in,out] gm_x                           Coordinates to update upon integration.
- * \param[out]    gm_xp                          A copy of the coordinates before the integration (for constraints).
- * \param[in,out] gm_v                           Velocities to update.
- * \param[in]     gm_f                           Atomic forces.
- * \param[in]     gm_inverseMasses               Reciprocal masses.
- * \param[in]     dt                             Timestep.
- * \param[in]     gm_lambdas                     Temperature scaling factors (one per group)
- * \param[in]     gm_tempScaleGroups             Mapping of atoms into groups.
- * \param[in]     dtPressureCouple               Time step for pressure coupling
- * \param[in]     velocityScalingMatrixDiagonal  Diagonal elements of Parrinello-Rahman velocity scaling matrix
+ * \tparam        numTempScaleValues               The number of different T-couple values.
+ * \tparam        velocityScaling                  Type of the Parrinello-Rahman velocity rescaling.
+ * \param[in]     numAtoms                         Total number of atoms.
+ * \param[in,out] gm_x                             Coordinates to update upon integration.
+ * \param[out]    gm_xp                            A copy of the coordinates before the integration (for constraints).
+ * \param[in,out] gm_v                             Velocities to update.
+ * \param[in]     gm_f                             Atomic forces.
+ * \param[in]     gm_inverseMasses                 Reciprocal masses.
+ * \param[in]     dt                               Timestep.
+ * \param[in]     gm_lambdas                       Temperature scaling factors (one per group)
+ * \param[in]     gm_tempScaleGroups               Mapping of atoms into groups.
+ * \param[in]     dtPressureCouple                 Time step for pressure coupling
+ * \param[in]     prVelocityScalingMatrixDiagonal  Diagonal elements of Parrinello-Rahman velocity scaling matrix
  */
 template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
 __launch_bounds__(c_maxThreadsPerBlock) __global__
@@ -136,7 +136,7 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
                              const float dt,
                              const float* __restrict__ gm_lambdas,
                              const unsigned short* __restrict__ gm_tempScaleGroups,
-                             const float3 velocityScalingMatrixDiagonal);
+                             const float3 prVelocityScalingMatrixDiagonal);
 
 template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
 __launch_bounds__(c_maxThreadsPerBlock) __global__
@@ -149,7 +149,7 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
                              const float dt,
                              const float* __restrict__ gm_lambdas,
                              const unsigned short* __restrict__ gm_tempScaleGroups,
-                             const float3 velocityScalingMatrixDiagonal)
+                             const float3 prVelocityScalingMatrixDiagonal)
 {
     int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
     if (threadIndex < numAtoms)
@@ -186,9 +186,9 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
 
             if (velocityScaling == VelocityScalingType::Diagonal)
             {
-                vp.x -= velocityScalingMatrixDiagonal.x * v.x;
-                vp.y -= velocityScalingMatrixDiagonal.y * v.y;
-                vp.z -= velocityScalingMatrixDiagonal.z * v.z;
+                vp.x -= prVelocityScalingMatrixDiagonal.x * v.x;
+                vp.y -= prVelocityScalingMatrixDiagonal.y * v.y;
+                vp.z -= prVelocityScalingMatrixDiagonal.z * v.z;
             }
 
             v = vp;
@@ -205,21 +205,28 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
 
 /*! \brief Select templated kernel.
  *
- * Returns pointer to a CUDA kernel based on the number of temperature coupling groups.
- * If zero is passed as an argument, it is assumed that no temperature coupling groups are used.
+ * Returns pointer to a CUDA kernel based on the number of temperature coupling groups and
+ * whether or not the temperature and(or) pressure coupling is enabled.
  *
- * \param[in]  numTempScaleValues  Numer of temperature coupling groups in the system
- * \param[in]  velocityScaling     Type of the Parrinello-Rahman velocity scaling
+ * \param[in]  doTemperatureScaling   If the kernel with temperature coupling velocity scaling
+ *                                    should be selected.
+ * \param[in]  numTempScaleValues     Number of temperature coupling groups in the system.
+ * \param[in]  prVelocityScalingType  Type of the Parrinello-Rahman velocity scaling.
  *
  * \retrun                         Pointer to CUDA kernel
  */
-inline auto selectLeapFrogKernelPtr(int numTempScaleValues, VelocityScalingType velocityScaling)
+inline auto selectLeapFrogKernelPtr(bool                doTemperatureScaling,
+                                    int                 numTempScaleValues,
+                                    VelocityScalingType prVelocityScalingType)
 {
+    // Check input for consistency: if there is temperature coupling, at least one coupling group should be defined.
+    GMX_ASSERT(!doTemperatureScaling || (numTempScaleValues > 0),
+               "Temperature coupling was requested with no temperature coupling groups.");
     auto kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;
 
-    if (velocityScaling == VelocityScalingType::None)
+    if (prVelocityScalingType == VelocityScalingType::None)
     {
-        if (numTempScaleValues == 0)
+        if (!doTemperatureScaling)
         {
             kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;
         }
@@ -231,16 +238,10 @@ inline auto selectLeapFrogKernelPtr(int numTempScaleValues, VelocityScalingType
         {
             kernelPtr = leapfrog_kernel<NumTempScaleValues::Multiple, VelocityScalingType::None>;
         }
-        else
-        {
-            GMX_RELEASE_ASSERT(false,
-                               "Number of temperature coupling groups should be greater than zero "
-                               "(zero for no coupling).");
-        }
     }
-    else if (velocityScaling == VelocityScalingType::Diagonal)
+    else if (prVelocityScalingType == VelocityScalingType::Diagonal)
     {
-        if (numTempScaleValues == 0)
+        if (!doTemperatureScaling)
         {
             kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::Diagonal>;
         }
@@ -252,12 +253,6 @@ inline auto selectLeapFrogKernelPtr(int numTempScaleValues, VelocityScalingType
         {
             kernelPtr = leapfrog_kernel<NumTempScaleValues::Multiple, VelocityScalingType::Diagonal>;
         }
-        else
-        {
-            GMX_RELEASE_ASSERT(false,
-                               "Number of temperature coupling groups should be greater than zero "
-                               "(zero for no coupling).");
-        }
     }
     else
     {
@@ -272,19 +267,19 @@ void LeapFrogCuda::integrate(const float3*                     d_x,
                              float3*                           d_v,
                              const float3*                     d_f,
                              const real                        dt,
-                             const bool                        doTempCouple,
+                             const bool                        doTemperatureScaling,
                              gmx::ArrayRef<const t_grp_tcstat> tcstat,
-                             const bool                        doPressureCouple,
+                             const bool                        doParrinelloRahman,
                              const float                       dtPressureCouple,
-                             const matrix                      velocityScalingMatrix)
+                             const matrix                      prVelocityScalingMatrix)
 {
 
     ensureNoPendingCudaError("In CUDA version of Leap-Frog integrator");
 
     auto kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;
-    if (doTempCouple || doPressureCouple)
+    if (doTemperatureScaling || doParrinelloRahman)
     {
-        if (doTempCouple)
+        if (doTemperatureScaling)
         {
             GMX_ASSERT(numTempScaleValues_ == ssize(h_lambdas_),
                        "Number of temperature scaling factors changed since it was set for the "
@@ -296,26 +291,28 @@ void LeapFrogCuda::integrate(const float3*                     d_x,
             copyToDeviceBuffer(&d_lambdas_, h_lambdas_.data(), 0, numTempScaleValues_,
                                commandStream_, GpuApiCallBehavior::Async, nullptr);
         }
-        VelocityScalingType velocityScaling = VelocityScalingType::None;
-        if (doPressureCouple)
+        VelocityScalingType prVelocityScalingType = VelocityScalingType::None;
+        if (doParrinelloRahman)
         {
-            velocityScaling = VelocityScalingType::Diagonal;
-            GMX_ASSERT(velocityScalingMatrix[YY][XX] == 0 && velocityScalingMatrix[ZZ][XX] == 0
-                               && velocityScalingMatrix[ZZ][YY] == 0 && velocityScalingMatrix[XX][YY] == 0
-                               && velocityScalingMatrix[XX][ZZ] == 0 && velocityScalingMatrix[YY][ZZ] == 0,
+            prVelocityScalingType = VelocityScalingType::Diagonal;
+            GMX_ASSERT(prVelocityScalingMatrix[YY][XX] == 0 && prVelocityScalingMatrix[ZZ][XX] == 0
+                               && prVelocityScalingMatrix[ZZ][YY] == 0
+                               && prVelocityScalingMatrix[XX][YY] == 0
+                               && prVelocityScalingMatrix[XX][ZZ] == 0
+                               && prVelocityScalingMatrix[YY][ZZ] == 0,
                        "Fully anisotropic Parrinello-Rahman pressure coupling is not yet supported "
                        "in GPU version of Leap-Frog integrator.");
-            velocityScalingMatrixDiagonal_ =
-                    make_float3(dtPressureCouple * velocityScalingMatrix[XX][XX],
-                                dtPressureCouple * velocityScalingMatrix[YY][YY],
-                                dtPressureCouple * velocityScalingMatrix[ZZ][ZZ]);
+            prVelocityScalingMatrixDiagonal_ =
+                    make_float3(dtPressureCouple * prVelocityScalingMatrix[XX][XX],
+                                dtPressureCouple * prVelocityScalingMatrix[YY][YY],
+                                dtPressureCouple * prVelocityScalingMatrix[ZZ][ZZ]);
         }
-        kernelPtr = selectLeapFrogKernelPtr(numTempScaleValues_, velocityScaling);
+        kernelPtr = selectLeapFrogKernelPtr(doTemperatureScaling, numTempScaleValues_, prVelocityScalingType);
     }
 
     const auto kernelArgs = prepareGpuKernelArguments(
             kernelPtr, kernelLaunchConfig_, &numAtoms_, &d_x, &d_xp, &d_v, &d_f, &d_inverseMasses_,
-            &dt, &d_lambdas_, &d_tempScaleGroups_, &velocityScalingMatrixDiagonal_);
+            &dt, &d_lambdas_, &d_tempScaleGroups_, &prVelocityScalingMatrixDiagonal_);
     launchGpuKernel(kernelPtr, kernelLaunchConfig_, nullptr, "leapfrog_kernel", kernelArgs);
 
     return;
index 60dffc983e3f64272ed3b96caded1f2bd58275ac..cbf3738a8f82e6270f4363b67b29acba725f4583 100644 (file)
@@ -74,27 +74,27 @@ public:
      * Integrates the equation of motion using Leap-Frog algorithm.
      * Updates coordinates and velocities on the GPU. The current coordinates are saved for constraints.
      *
-     * \param[in,out] d_x                    Coordinates to update
-     * \param[out]    d_xp                   Place to save the values of initial coordinates coordinates to.
-     * \param[in,out] d_v                    Velocities (will be updated).
-     * \param[in]     d_f                    Forces.
-     * \param[in]     dt                     Timestep.
-     * \param[in]     doTempCouple           If the temperature coupling should be applied.
-     * \param[in]     tcstat                 Temperature coupling data.
-     * \param[in]     doPressureCouple       If the temperature coupling should be applied.
-     * \param[in]     dtPressureCouple       Period between pressure coupling steps
-     * \param[in]     velocityScalingMatrix  Parrinello-Rahman velocity scaling matrix
+     * \param[in,out] d_x                      Coordinates to update
+     * \param[out]    d_xp                     Place to save the values of initial coordinates coordinates to.
+     * \param[in,out] d_v                      Velocities (will be updated).
+     * \param[in]     d_f                      Forces.
+     * \param[in]     dt                       Timestep.
+     * \param[in]     doTemperatureScaling     If velocities should be scaled for temperature coupling.
+     * \param[in]     tcstat                   Temperature coupling data.
+     * \param[in]     doParrinelloRahman       If current step is a Parrinello-Rahman pressure coupling step.
+     * \param[in]     dtPressureCouple         Period between pressure coupling steps
+     * \param[in]     prVelocityScalingMatrix  Parrinello-Rahman velocity scaling matrix
      */
     void integrate(const float3*                     d_x,
                    float3*                           d_xp,
                    float3*                           d_v,
                    const float3*                     d_f,
                    const real                        dt,
-                   const bool                        doTempCouple,
+                   const bool                        doTemperatureScaling,
                    gmx::ArrayRef<const t_grp_tcstat> tcstat,
-                   const bool                        doPressureCouple,
+                   const bool                        doParrinelloRahman,
                    const float                       dtPressureCouple,
-                   const matrix                      velocityScalingMatrix);
+                   const matrix                      prVelocityScalingMatrix);
 
     /*! \brief Set the integrator
      *
@@ -148,8 +148,8 @@ private:
     //! Maximum size of the temperature coupling groups array
     int numTempScaleGroupsAlloc_ = -1;
 
-    //! Vector with diagonal elements of the pressure coupling velocity rescale factors
-    float3 velocityScalingMatrixDiagonal_;
+    //! Vector with diagonal elements of the Parrinello-Rahman pressure coupling velocity rescale factors
+    float3 prVelocityScalingMatrixDiagonal_;
 };
 
 } // namespace gmx
index c0caa1278647d4d5eee65a18c47ada36447f5fcb..ce1b442f84902e61797ac90035188e9309839be1 100644 (file)
@@ -549,41 +549,84 @@ LincsCuda::~LincsCuda()
 
 /*! \brief Helper function to go through constraints recursively.
  *
- *  For each constraint, counts the number of coupled constraints and stores the value in spaceNeeded array.
+ *  For each constraint, counts the number of coupled constraints and stores the value in numCoupledConstraints array.
  *  This information is used to split the array of constraints between thread blocks on a GPU so there is no
- *  coupling between constraints from different thread blocks. After the 'spaceNeeded' array is filled, the
- *  value spaceNeeded[c] should be equal to the number of constraints that are coupled to 'c' and located
+ *  coupling between constraints from different thread blocks. After the 'numCoupledConstraints' array is filled, the
+ *  value numCoupledConstraints[c] should be equal to the number of constraints that are coupled to 'c' and located
  *  after it in the constraints array.
  *
- * \param[in]     a                  Atom index.
- * \param[in,out] spaceNeeded        Indicates if the constraint was already counted and stores
- *                                   the number of constraints (i) connected to it and (ii) located
- *                                   after it in memory. This array is filled by this recursive function.
- *                                   For a set of coupled constraints, only for the first one in this list
- *                                   the number of consecutive coupled constraints is needed: if there is
- *                                   not enough space for this set of constraints in the thread block,
- *                                   the group has to be moved to the next one.
- * \param[in]     atomAdjacencyList  Stores information about connections between atoms.
+ * \param[in]     a                   Atom index.
+ * \param[in,out] numCoupledConstraints  Indicates if the constraint was already counted and stores
+ *                                    the number of constraints (i) connected to it and (ii) located
+ *                                    after it in memory. This array is filled by this recursive function.
+ *                                    For a set of coupled constraints, only for the first one in this list
+ *                                    the number of consecutive coupled constraints is needed: if there is
+ *                                    not enough space for this set of constraints in the thread block,
+ *                                    the group has to be moved to the next one.
+ * \param[in]     atomsAdjacencyList  Stores information about connections between atoms.
  */
-inline int countCoupled(int                                                  a,
-                        std::vector<int>*                                    spaceNeeded,
-                        std::vector<std::vector<std::tuple<int, int, int>>>* atomsAdjacencyList)
+inline int countCoupled(int           a,
+                        ArrayRef<int> numCoupledConstraints,
+                        ArrayRef<const std::vector<std::tuple<int, int, int>>> atomsAdjacencyList)
 
 {
-    int c2, a2, sign;
     int counted = 0;
-    for (unsigned i = 0; i < atomsAdjacencyList->at(a).size(); i++)
+    for (const auto& adjacentAtom : atomsAdjacencyList[a])
     {
-        std::tie(a2, c2, sign) = atomsAdjacencyList->at(a).at(i);
-        if (spaceNeeded->at(c2) == -1)
+        const int c2 = std::get<1>(adjacentAtom);
+        if (numCoupledConstraints[c2] == -1)
         {
-            spaceNeeded->at(c2) = 0; // To indicate we've been here
-            counted += 1 + countCoupled(a2, spaceNeeded, atomsAdjacencyList);
+            numCoupledConstraints[c2] = 0; // To indicate we've been here
+            counted += 1 + countCoupled(std::get<0>(adjacentAtom), numCoupledConstraints, atomsAdjacencyList);
         }
     }
     return counted;
 }
 
+/*! \brief Add constraint to \p splitMap with all constraints coupled to it.
+ *
+ *  Adds the constraint \p c from the constrain list \p iatoms to the map \p splitMap
+ *  if it was not yet added. Then goes through all the constraints coupled to \p c
+ *  and calls itself recursively. This ensures that all the coupled constraints will
+ *  be added to neighboring locations in the final data structures on the device,
+ *  hence mapping all coupled constraints to the same thread block. A value of -1 in
+ *  the \p splitMap is used to flag that constraint was not yet added to the \p splitMap.
+ *
+ * \param[in]     iatoms              The list of constraints.
+ * \param[in]     stride              Number of elements per constraint in \p iatoms.
+ * \param[in]     atomsAdjacencyList  Information about connections between atoms.
+ * \param[our]    splitMap            Map of sequential constraint indexes to indexes to be on the device
+ * \param[in]     c                   Sequential index for constraint to consider adding.
+ * \param[in,out] currentMapIndex     The rolling index for the constraints mapping.
+ */
+inline void addWithCoupled(const t_iatom*                                         iatoms,
+                           const int                                              stride,
+                           ArrayRef<const std::vector<std::tuple<int, int, int>>> atomsAdjacencyList,
+                           ArrayRef<int>                                          splitMap,
+                           const int                                              c,
+                           int*                                                   currentMapIndex)
+{
+    if (splitMap[c] == -1)
+    {
+        splitMap[c] = *currentMapIndex;
+        (*currentMapIndex)++;
+
+        // Constraints, coupled through both atoms.
+        for (int atomIndexInConstraint = 0; atomIndexInConstraint < 2; atomIndexInConstraint++)
+        {
+            const int a1 = iatoms[stride * c + 1 + atomIndexInConstraint];
+            for (const auto& adjacentAtom : atomsAdjacencyList[a1])
+            {
+                const int c2 = std::get<1>(adjacentAtom);
+                if (c2 != c)
+                {
+                    addWithCoupled(iatoms, stride, atomsAdjacencyList, splitMap, c2, currentMapIndex);
+                }
+            }
+        }
+    }
+}
+
 void LincsCuda::set(const t_idef& idef, const t_mdatoms& md)
 {
     int numAtoms = md.nr;
@@ -624,46 +667,49 @@ void LincsCuda::set(const t_idef& idef, const t_mdatoms& md)
         atomsAdjacencyList.at(a2).push_back(std::make_tuple(a1, c, -1));
     }
 
-    // Compute, how many coupled constraints are in front of each constraint.
-    // Needed to introduce splits in data so that all coupled constraints will be computed in a single GPU block.
-    // The position 'c' of the vector spaceNeeded should have the number of constraints that are coupled to a constraint
-    // 'c' and are after 'c' in the vector. Only first index of the connected group of the constraints is needed later in the
-    // code, hence the spaceNeeded vector is also used to keep track if the constrain was already counted.
-    std::vector<int> spaceNeeded;
-    spaceNeeded.resize(numConstraints, -1);
-    std::fill(spaceNeeded.begin(), spaceNeeded.end(), -1);
+    // Compute, how many constraints are coupled to each constraint.
+    // Needed to introduce splits in data so that all coupled constraints will be computed in a
+    // single GPU block. The position 'c' of the vector numCoupledConstraints should have the number
+    // of constraints that are coupled to a constraint 'c' and are after 'c' in the vector. Only
+    // first index of the connected group of the constraints is needed later in the code, hence the
+    // numCoupledConstraints vector is also used to keep track if the constrain was already counted.
+    std::vector<int> numCoupledConstraints(numConstraints, -1);
     for (int c = 0; c < numConstraints; c++)
     {
         int a1 = iatoms[stride * c + 1];
         int a2 = iatoms[stride * c + 2];
-        if (spaceNeeded.at(c) == -1)
+        if (numCoupledConstraints.at(c) == -1)
         {
-            spaceNeeded.at(c) = countCoupled(a1, &spaceNeeded, &atomsAdjacencyList)
-                                + countCoupled(a2, &spaceNeeded, &atomsAdjacencyList);
+            numCoupledConstraints.at(c) = countCoupled(a1, numCoupledConstraints, atomsAdjacencyList)
+                                          + countCoupled(a2, numCoupledConstraints, atomsAdjacencyList);
         }
     }
 
     // Map of splits in the constraints data. For each 'old' constraint index gives 'new' which
     // takes into account the empty spaces which might be needed in the end of each thread block.
-    std::vector<int> splitMap;
-    splitMap.resize(numConstraints, -1);
-    int currentMapIndex = 0;
+    std::vector<int> splitMap(numConstraints, -1);
+    int              currentMapIndex = 0;
     for (int c = 0; c < numConstraints; c++)
     {
         // Check if coupled constraints all fit in one block
-        GMX_RELEASE_ASSERT(
-                spaceNeeded.at(c) < c_threadsPerBlock,
-                "Maximum number of coupled constraints exceedes the size of the CUDA thread block. "
-                "Most likely, you are trying to use GPU version of LINCS with constraints on "
-                "all-bonds, "
-                "which is not supported. Try using H-bonds constraints only.");
-        if (currentMapIndex / c_threadsPerBlock != (currentMapIndex + spaceNeeded.at(c)) / c_threadsPerBlock)
+        if (numCoupledConstraints.at(c) > c_threadsPerBlock)
+        {
+            gmx_fatal(FARGS,
+                      "Maximum number of coupled constraints (%d) exceeds the size of the CUDA "
+                      "thread block (%d). Most likely, you are trying to use the GPU version of "
+                      "LINCS with constraints on all-bonds, which is not supported for large "
+                      "molecules. When compatible with the force field and integration settings, "
+                      "using constraints on H-bonds only.",
+                      numCoupledConstraints.at(c), c_threadsPerBlock);
+        }
+        if (currentMapIndex / c_threadsPerBlock
+            != (currentMapIndex + numCoupledConstraints.at(c)) / c_threadsPerBlock)
         {
             currentMapIndex = ((currentMapIndex / c_threadsPerBlock) + 1) * c_threadsPerBlock;
         }
-        splitMap.at(c) = currentMapIndex;
-        currentMapIndex++;
+        addWithCoupled(iatoms, stride, atomsAdjacencyList, splitMap, c, &currentMapIndex);
     }
+
     kernelParams_.numConstraintsThreads =
             currentMapIndex + c_threadsPerBlock - currentMapIndex % c_threadsPerBlock;
     GMX_RELEASE_ASSERT(kernelParams_.numConstraintsThreads % c_threadsPerBlock == 0,
@@ -842,7 +888,7 @@ void LincsCuda::set(const t_idef& idef, const t_mdatoms& md)
                        maxCoupledConstraints * kernelParams_.numConstraintsThreads, commandStream_,
                        GpuApiCallBehavior::Sync, nullptr);
 
-    GMX_RELEASE_ASSERT(md.invmass != nullptr, "Masses of attoms should be specified.\n");
+    GMX_RELEASE_ASSERT(md.invmass != nullptr, "Masses of atoms should be specified.\n");
     copyToDeviceBuffer(&kernelParams_.d_inverseMasses, md.invmass, 0, numAtoms, commandStream_,
                        GpuApiCallBehavior::Sync, nullptr);
 }
index aa2c7dbadf658d315fe13a2ac4a33c9a5261dacd..692c2570b66f9dfa613e29dae78d4f183685027f 100644 (file)
@@ -1031,10 +1031,11 @@ void do_force(FILE*                               fplog,
         }
     }
 
-    // Copy coordinate from the GPU if update is on the GPU and there are forces to be computed on the CPU. At search steps the
-    // current coordinates are already on the host, hence copy is not needed.
+    // Copy coordinate from the GPU if update is on the GPU and there are forces to be computed on
+    // the CPU, or for the computation of virial. At search steps the current coordinates are
+    // already on the host, hence copy is not needed.
     if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
-        && runScheduleWork->domainWork.haveCpuLocalForceWork)
+        && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial))
     {
         stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
         stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
@@ -1470,7 +1471,7 @@ void do_force(FILE*                               fplog,
     if (ddUsesGpuDirectCommunication && (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork))
     {
         /* Wait for non-local coordinate data to be copied from device */
-        nbv->wait_nonlocal_x_copy_D2H_done();
+        stateGpu->waitCoordinatesReadyOnHost(AtomLocality::NonLocal);
     }
     /* Compute the bonded and non-bonded energies and optionally forces */
     do_force_lowlevel(fr, inputrec, &(top->idef), cr, ms, nrnb, wcycle, mdatoms, x, hist, &forceOut, enerd,
@@ -1666,7 +1667,7 @@ void do_force(FILE*                               fplog,
                                            fr->pmePpCommGpu->getForcesReadySynchronizer())) // buffer received from other GPU
                         : nullptr; // PME reduction not active on GPU
 
-        gmx::FixedCapacityVector<GpuEventSynchronizer*, 2> dependencyList;
+        gmx::FixedCapacityVector<GpuEventSynchronizer*, 3> dependencyList;
 
         if (stepWork.useGpuPmeFReduction)
         {
@@ -1699,14 +1700,7 @@ void do_force(FILE*                               fplog,
             }
             if (useGpuForcesHaloExchange)
             {
-                // Add a stream synchronization to satisfy a dependency
-                // for the local buffer ops on the result of GPU halo
-                // exchange, which operates in the non-local stream and
-                // writes to to local parf og the force buffer.
-                //
-                // TODO improve this through use of an event - see Redmine #3093
-                //      push the event into the dependencyList
-                nbv->stream_local_wait_for_nonlocal();
+                dependencyList.push_back(gpuHaloExchange->getForcesReadyOnDeviceEvent());
             }
             nbv->atomdata_add_nbat_f_to_f_gpu(AtomLocality::Local, stateGpu->getForces(), pmeForcePtr,
                                               dependencyList, stepWork.useGpuPmeFReduction,
index eeff24cf943a8bd21f58d2affa9b5cdafd9e6caa..2e6dd74950f13a37fb15f9c9e335e805dd04fac7 100644 (file)
@@ -1667,7 +1667,8 @@ void update_pcouple_after_coordinates(FILE*             fplog,
                                       matrix            pressureCouplingMu,
                                       t_state*          state,
                                       t_nrnb*           nrnb,
-                                      Update*           upd)
+                                      Update*           upd,
+                                      const bool        scaleCoordinates)
 {
     int start  = 0;
     int homenr = md->homenr;
@@ -1687,7 +1688,7 @@ void update_pcouple_after_coordinates(FILE*             fplog,
                 berendsen_pcoupl(fplog, step, inputrec, dtpc, pressure, state->box, forceVirial,
                                  constraintVirial, pressureCouplingMu, &state->baros_integral);
                 berendsen_pscale(inputrec, pressureCouplingMu, state->box, state->box_rel, start,
-                                 homenr, state->x.rvec_array(), md->cFREEZE, nrnb);
+                                 homenr, state->x.rvec_array(), md->cFREEZE, nrnb, scaleCoordinates);
             }
             break;
         case (epcPARRINELLORAHMAN):
@@ -1708,10 +1709,13 @@ void update_pcouple_after_coordinates(FILE*             fplog,
                 preserve_box_shape(inputrec, state->box_rel, state->box);
 
                 /* Scale the coordinates */
-                auto x = state->x.rvec_array();
-                for (int n = start; n < start + homenr; n++)
+                if (scaleCoordinates)
                 {
-                    tmvmul_ur0(pressureCouplingMu, x[n], x[n]);
+                    auto x = state->x.rvec_array();
+                    for (int n = start; n < start + homenr; n++)
+                    {
+                        tmvmul_ur0(pressureCouplingMu, x[n], x[n]);
+                    }
                 }
             }
             break;
index 8239a97b87b1484a7aab2f53178432b93c87380b..472d5e6bbf8a1c0ebcdb62c5585e061f3d0a3bdc 100644 (file)
@@ -134,7 +134,8 @@ void update_pcouple_after_coordinates(FILE*             fplog,
                                       matrix            pressureCouplingMu,
                                       t_state*          state,
                                       t_nrnb*           nrnb,
-                                      gmx::Update*      upd);
+                                      gmx::Update*      upd,
+                                      bool              scaleCoordinates);
 
 void update_coords(int64_t           step,
                    const t_inputrec* inputrec, /* input record and box stuff   */
@@ -328,7 +329,8 @@ void berendsen_pscale(const t_inputrec*    ir,
                       int                  nr_atoms,
                       rvec                 x[],
                       const unsigned short cFREEZE[],
-                      t_nrnb*              nrnb);
+                      t_nrnb*              nrnb,
+                      bool                 scaleCoordinates);
 
 void pleaseCiteCouplingAlgorithms(FILE* fplog, const t_inputrec& ir);
 
index 90fb3679b3e1e446d737e26ab16c3fae0ad41330..ee6ff74b07b8c586f8936306aa00336c63e793df 100644 (file)
@@ -93,28 +93,37 @@ public:
      * This will extract temperature scaling factors from tcstat, transform them into the plain
      * array and call the normal integrate method.
      *
-     * \param[in]  fReadyOnDevice         Event synchronizer indicating that the forces are
-     *                                    ready in the device memory.
-     * \param[in]  dt                     Timestep.
-     * \param[in]  updateVelocities       If the velocities should be constrained.
-     * \param[in]  computeVirial          If virial should be updated.
-     * \param[out] virial                 Place to save virial tensor.
-     * \param[in]  doTempCouple           If the temperature coupling should be performed.
-     * \param[in]  tcstat                 Temperature coupling data.
-     * \param[in]  doPressureCouple       If the temperature coupling should be applied.
-     * \param[in]  dtPressureCouple       Period between pressure coupling steps
-     * \param[in]  velocityScalingMatrix  Parrinello-Rahman velocity scaling matrix
+     * \param[in]  fReadyOnDevice           Event synchronizer indicating that the forces are
+     *                                      ready in the device memory.
+     * \param[in]  dt                       Timestep.
+     * \param[in]  updateVelocities         If the velocities should be constrained.
+     * \param[in]  computeVirial            If virial should be updated.
+     * \param[out] virial                   Place to save virial tensor.
+     * \param[in]  doTemperatureScaling     If velocities should be scaled for temperature coupling.
+     * \param[in]  tcstat                   Temperature coupling data.
+     * \param[in]  doParrinelloRahman       If current step is a Parrinello-Rahman pressure coupling step.
+     * \param[in]  dtPressureCouple         Period between pressure coupling steps.
+     * \param[in]  prVelocityScalingMatrix  Parrinello-Rahman velocity scaling matrix.
      */
     void integrate(GpuEventSynchronizer*             fReadyOnDevice,
                    real                              dt,
                    bool                              updateVelocities,
                    bool                              computeVirial,
                    tensor                            virial,
-                   bool                              doTempCouple,
+                   bool                              doTemperatureScaling,
                    gmx::ArrayRef<const t_grp_tcstat> tcstat,
-                   bool                              doPressureCouple,
+                   bool                              doParrinelloRahman,
                    float                             dtPressureCouple,
-                   const matrix                      velocityScalingMatrix);
+                   const matrix                      prVelocityScalingMatrix);
+
+    /*! \brief Scale coordinates on the GPU for the pressure coupling.
+     *
+     * After pressure coupling step, the box size may change. Hence, the coordinates should be
+     * scaled so that all the particles fit in the new box.
+     *
+     * \param[in] scalingMatrix Coordinates scaling matrix.
+     */
+    void scaleCoordinates(const matrix scalingMatrix);
 
     /*! \brief Set the pointers and update data-structures (e.g. after NB search step).
      *
index 3f04316f2e806c9683520938b08852c9ad85566f..82c19e735903feabd6e5b3673a2deacbf1e134b5 100644 (file)
@@ -55,10 +55,10 @@ class UpdateConstrainCuda::Impl
 {
 };
 
-UpdateConstrainCuda::UpdateConstrainCuda(gmx_unused const t_inputrec& ir,
-                                         gmx_unused const gmx_mtop_t& mtop,
-                                         gmx_unused const void*       commandStream,
-                                         gmx_unused GpuEventSynchronizer* xUpdatedOnDevice) :
+UpdateConstrainCuda::UpdateConstrainCuda(const t_inputrec& /* ir   */,
+                                         const gmx_mtop_t& /* mtop */,
+                                         const void* /* commandStream */,
+                                         GpuEventSynchronizer* /* xUpdatedOnDevice */) :
     impl_(nullptr)
 {
     GMX_ASSERT(false,
@@ -67,33 +67,39 @@ UpdateConstrainCuda::UpdateConstrainCuda(gmx_unused const t_inputrec& ir,
 
 UpdateConstrainCuda::~UpdateConstrainCuda() = default;
 
-void UpdateConstrainCuda::integrate(gmx_unused GpuEventSynchronizer* fReadyOnDevice,
-                                    gmx_unused const real dt,
-                                    gmx_unused const bool updateVelocities,
-                                    gmx_unused const bool computeVirial,
-                                    gmx_unused tensor     virialScaled,
-                                    gmx_unused const bool doTempCouple,
-                                    gmx_unused gmx::ArrayRef<const t_grp_tcstat> tcstat,
-                                    gmx_unused const bool                        doPressureCouple,
-                                    gmx_unused const float                       dtPressureCouple,
-                                    gmx_unused const matrix velocityScalingMatrix)
+void UpdateConstrainCuda::integrate(GpuEventSynchronizer* /* fReadyOnDevice */,
+                                    const real /* dt */,
+                                    const bool /* updateVelocities */,
+                                    const bool /* computeVirial */,
+                                    tensor /* virialScaled */,
+                                    const bool /* doTemperatureScaling */,
+                                    gmx::ArrayRef<const t_grp_tcstat> /* tcstat */,
+                                    const bool /* doParrinelloRahman */,
+                                    const float /* dtPressureCouple */,
+                                    const matrix /* prVelocityScalingMatrix*/)
 {
     GMX_ASSERT(false,
                "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
 }
 
-void UpdateConstrainCuda::set(gmx_unused DeviceBuffer<float> d_x,
-                              gmx_unused DeviceBuffer<float> d_v,
-                              gmx_unused const DeviceBuffer<float> d_f,
-                              gmx_unused const t_idef& idef,
-                              gmx_unused const t_mdatoms& md,
-                              gmx_unused const int        numTempScaleValues)
+void UpdateConstrainCuda::scaleCoordinates(const matrix /* scalingMatrix */)
 {
     GMX_ASSERT(false,
                "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
 }
 
-void UpdateConstrainCuda::setPbc(gmx_unused const t_pbc* pbc)
+void UpdateConstrainCuda::set(DeviceBuffer<float> /* d_x */,
+                              DeviceBuffer<float> /* d_v */,
+                              const DeviceBuffer<float> /* d_f */,
+                              const t_idef& /* idef */,
+                              const t_mdatoms& /* md */,
+                              const int /* numTempScaleValues */)
+{
+    GMX_ASSERT(false,
+               "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
+}
+
+void UpdateConstrainCuda::setPbc(const t_pbc* /* pbc */)
 {
     GMX_ASSERT(false,
                "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
index 3f4109db5401f7cb8e32ea599baf5e46c01be879..7f38b8d74cc74a1b9917dea8f23c2d17b13b9f33 100644 (file)
 
 namespace gmx
 {
+/*!\brief Number of CUDA threads in a block
+ *
+ * \todo Check if using smaller block size will lead to better prformance.
+ */
+constexpr static int c_threadsPerBlock = 256;
+//! Maximum number of threads in a block (for __launch_bounds__)
+constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock;
+
+/*! \brief Scaling matrix struct.
+ *
+ * \todo Should be generalized.
+ */
+struct ScalingMatrix
+{
+    float xx, yy, zz, yx, zx, zy;
+};
+
+__launch_bounds__(c_maxThreadsPerBlock) __global__
+        static void scaleCoordinates_kernel(const int numAtoms,
+                                            float3* __restrict__ gm_x,
+                                            const ScalingMatrix scalingMatrix)
+{
+    int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
+    if (threadIndex < numAtoms)
+    {
+        float3 x = gm_x[threadIndex];
+
+        x.x = scalingMatrix.xx * x.x + scalingMatrix.yx * x.y + scalingMatrix.zx * x.z;
+        x.y = scalingMatrix.yy * x.y + scalingMatrix.zy * x.z;
+        x.z = scalingMatrix.zz * x.z;
+
+        gm_x[threadIndex] = x;
+    }
+}
 
 void UpdateConstrainCuda::Impl::integrate(GpuEventSynchronizer*             fReadyOnDevice,
                                           const real                        dt,
                                           const bool                        updateVelocities,
                                           const bool                        computeVirial,
                                           tensor                            virial,
-                                          const bool                        doTempCouple,
+                                          const bool                        doTemperatureScaling,
                                           gmx::ArrayRef<const t_grp_tcstat> tcstat,
-                                          const bool                        doPressureCouple,
+                                          const bool                        doParrinelloRahman,
                                           const float                       dtPressureCouple,
-                                          const matrix                      velocityScalingMatrix)
+                                          const matrix                      prVelocityScalingMatrix)
 {
     // Clearing virial matrix
     // TODO There is no point in having separate virial matrix for constraints
@@ -88,8 +122,8 @@ void UpdateConstrainCuda::Impl::integrate(GpuEventSynchronizer*             fRea
 
     // The integrate should save a copy of the current coordinates in d_xp_ and write updated once
     // into d_x_. The d_xp_ is only needed by constraints.
-    integrator_->integrate(d_x_, d_xp_, d_v_, d_f_, dt, doTempCouple, tcstat, doPressureCouple,
-                           dtPressureCouple, velocityScalingMatrix);
+    integrator_->integrate(d_x_, d_xp_, d_v_, d_f_, dt, doTemperatureScaling, tcstat,
+                           doParrinelloRahman, dtPressureCouple, prVelocityScalingMatrix);
     // Constraints need both coordinates before (d_x_) and after (d_xp_) update. However, after constraints
     // are applied, the d_x_ can be discarded. So we intentionally swap the d_x_ and d_xp_ here to avoid the
     // d_xp_ -> d_x_ copy after constraints. Note that the integrate saves them in the wrong order as well.
@@ -111,6 +145,25 @@ void UpdateConstrainCuda::Impl::integrate(GpuEventSynchronizer*             fRea
     return;
 }
 
+void UpdateConstrainCuda::Impl::scaleCoordinates(const matrix scalingMatrix)
+{
+    ScalingMatrix mu;
+    mu.xx = scalingMatrix[XX][XX];
+    mu.yy = scalingMatrix[YY][YY];
+    mu.zz = scalingMatrix[ZZ][ZZ];
+    mu.yx = scalingMatrix[YY][XX];
+    mu.zx = scalingMatrix[ZZ][XX];
+    mu.zy = scalingMatrix[ZZ][YY];
+
+    const auto kernelArgs = prepareGpuKernelArguments(
+            scaleCoordinates_kernel, coordinateScalingKernelLaunchConfig_, &numAtoms_, &d_x_, &mu);
+    launchGpuKernel(scaleCoordinates_kernel, coordinateScalingKernelLaunchConfig_, nullptr,
+                    "scaleCoordinates_kernel", kernelArgs);
+    // TODO: Although this only happens on the pressure coupling steps, this synchronization
+    //       can affect the perfornamce if nstpcouple is small.
+    gpuStreamSynchronize(commandStream_);
+}
+
 UpdateConstrainCuda::Impl::Impl(const t_inputrec&     ir,
                                 const gmx_mtop_t&     mtop,
                                 const void*           commandStream,
@@ -125,6 +178,12 @@ UpdateConstrainCuda::Impl::Impl(const t_inputrec&     ir,
     integrator_ = std::make_unique<LeapFrogCuda>(commandStream_);
     lincsCuda_  = std::make_unique<LincsCuda>(ir.nLincsIter, ir.nProjOrder, commandStream_);
     settleCuda_ = std::make_unique<SettleCuda>(mtop, commandStream_);
+
+    coordinateScalingKernelLaunchConfig_.blockSize[0]     = c_threadsPerBlock;
+    coordinateScalingKernelLaunchConfig_.blockSize[1]     = 1;
+    coordinateScalingKernelLaunchConfig_.blockSize[2]     = 1;
+    coordinateScalingKernelLaunchConfig_.sharedMemorySize = 0;
+    coordinateScalingKernelLaunchConfig_.stream           = commandStream_;
 }
 
 UpdateConstrainCuda::Impl::~Impl() {}
@@ -155,6 +214,9 @@ void UpdateConstrainCuda::Impl::set(DeviceBuffer<float>       d_x,
     integrator_->set(md, numTempScaleValues, md.cTC);
     lincsCuda_->set(idef, md);
     settleCuda_->set(idef, md);
+
+    coordinateScalingKernelLaunchConfig_.gridSize[0] =
+            (numAtoms_ + c_threadsPerBlock - 1) / c_threadsPerBlock;
 }
 
 void UpdateConstrainCuda::Impl::setPbc(const t_pbc* pbc)
@@ -182,14 +244,19 @@ void UpdateConstrainCuda::integrate(GpuEventSynchronizer*             fReadyOnDe
                                     const bool                        updateVelocities,
                                     const bool                        computeVirial,
                                     tensor                            virialScaled,
-                                    const bool                        doTempCouple,
+                                    const bool                        doTemperatureScaling,
                                     gmx::ArrayRef<const t_grp_tcstat> tcstat,
-                                    const bool                        doPressureCouple,
+                                    const bool                        doParrinelloRahman,
                                     const float                       dtPressureCouple,
-                                    const matrix                      velocityScalingMatrix)
+                                    const matrix                      prVelocityScalingMatrix)
+{
+    impl_->integrate(fReadyOnDevice, dt, updateVelocities, computeVirial, virialScaled, doTemperatureScaling,
+                     tcstat, doParrinelloRahman, dtPressureCouple, prVelocityScalingMatrix);
+}
+
+void UpdateConstrainCuda::scaleCoordinates(const matrix scalingMatrix)
 {
-    impl_->integrate(fReadyOnDevice, dt, updateVelocities, computeVirial, virialScaled, doTempCouple,
-                     tcstat, doPressureCouple, dtPressureCouple, velocityScalingMatrix);
+    impl_->scaleCoordinates(scalingMatrix);
 }
 
 void UpdateConstrainCuda::set(DeviceBuffer<float>       d_x,
index 88dcfb69a92aa2931855fc9eb278a1e416bf9b1a..62ff01b19c6105fc752c4f7e62c741bd072185cd 100644 (file)
@@ -92,28 +92,37 @@ public:
      *   2. This is the temperature coupling step.
      * Parameters virial/lambdas can be nullptr if computeVirial/doTempCouple are false.
      *
-     * \param[in]  fReadyOnDevice         Event synchronizer indicating that the forces are ready in
-     *                                    the device memory.
-     * \param[in]  dt                     Timestep.
-     * \param[in]  updateVelocities       If the velocities should be constrained.
-     * \param[in]  computeVirial          If virial should be updated.
-     * \param[out] virial                 Place to save virial tensor.
-     * \param[in]  doTempCouple           If the temperature coupling should be performed.
-     * \param[in]  tcstat                 Temperature coupling data.
-     * \param[in]  doPressureCouple       If the temperature coupling should be applied.
-     * \param[in]  dtPressureCouple       Period between pressure coupling steps
-     * \param[in]  velocityScalingMatrix  Parrinello-Rahman velocity scaling matrix
+     * \param[in]  fReadyOnDevice           Event synchronizer indicating that the forces are ready in
+     *                                      the device memory.
+     * \param[in]  dt                       Timestep.
+     * \param[in]  updateVelocities         If the velocities should be constrained.
+     * \param[in]  computeVirial            If virial should be updated.
+     * \param[out] virial                   Place to save virial tensor.
+     * \param[in]  doTemperatureScaling     If velocities should be scaled for temperature coupling.
+     * \param[in]  tcstat                   Temperature coupling data.
+     * \param[in]  doParrinelloRahman       If current step is a Parrinello-Rahman pressure coupling step.
+     * \param[in]  dtPressureCouple         Period between pressure coupling steps.
+     * \param[in]  prVelocityScalingMatrix  Parrinello-Rahman velocity scaling matrix.
      */
     void integrate(GpuEventSynchronizer*             fReadyOnDevice,
                    real                              dt,
                    bool                              updateVelocities,
                    bool                              computeVirial,
                    tensor                            virial,
-                   bool                              doTempCouple,
+                   bool                              doTemperatureScaling,
                    gmx::ArrayRef<const t_grp_tcstat> tcstat,
-                   bool                              doPressureCouple,
+                   bool                              doParrinelloRahman,
                    float                             dtPressureCouple,
-                   const matrix                      velocityScalingMatrix);
+                   const matrix                      prVelocityScalingMatrix);
+
+    /*! \brief Scale coordinates on the GPU for the pressure coupling.
+     *
+     * After pressure coupling step, the box size may change. Hence, the coordinates should be
+     * scaled so that all the particles fit in the new box.
+     *
+     * \param[in] scalingMatrix Coordinates scaling matrix.
+     */
+    void scaleCoordinates(const matrix scalingMatrix);
 
     /*! \brief Set the pointers and update data-structures (e.g. after NB search step).
      *
@@ -147,6 +156,8 @@ public:
 private:
     //! CUDA stream
     CommandStream commandStream_ = nullptr;
+    //! CUDA kernel launch config
+    KernelLaunchConfig coordinateScalingKernelLaunchConfig_;
 
     //! Periodic boundary data
     PbcAiuc pbcAiuc_;
index da1278b19ed5f0e5ffe4f35ad618912ef14c0999..4f163faf865f7ef5c3ee9caadb0fee5ec2bf86e8 100644 (file)
@@ -344,8 +344,9 @@ void gmx::LegacySimulator::do_md()
         GMX_RELEASE_ASSERT(
                 ir->etc != etcNOSEHOOVER,
                 "Nose-Hoover temperature coupling is not supported with the GPU update.\n");
-        GMX_RELEASE_ASSERT(ir->epc == epcNO,
-                           "Pressure coupling is not supported with the GPU update.\n");
+        GMX_RELEASE_ASSERT(ir->epc == epcNO || ir->epc == epcPARRINELLORAHMAN || ir->epc == epcBERENDSEN,
+                           "Only Parrinello-Rahman and Berendsen pressure coupling are supported "
+                           "with the GPU update.\n");
         GMX_RELEASE_ASSERT(!mdatoms->haveVsites,
                            "Virtual sites are not supported with the GPU update.\n");
         GMX_RELEASE_ASSERT(ed == nullptr,
@@ -370,6 +371,10 @@ void gmx::LegacySimulator::do_md()
         }
         integrator = std::make_unique<UpdateConstrainCuda>(
                 *ir, *top_global, stateGpu->getUpdateStream(), stateGpu->xUpdatedOnDevice());
+
+        t_pbc pbc;
+        set_pbc(&pbc, epbcXYZ, state->box);
+        integrator->setPbc(&pbc);
     }
 
     if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
@@ -820,6 +825,13 @@ void gmx::LegacySimulator::do_md()
                 if (correct_box(fplog, step, state->box, graph))
                 {
                     bMasterState = TRUE;
+                    // If update is offloaded, it should be informed about the box size change
+                    if (useGpuForUpdate)
+                    {
+                        t_pbc pbc;
+                        set_pbc(&pbc, epbcXYZ, state->box);
+                        integrator->setPbc(&pbc);
+                    }
                 }
             }
             if (DOMAINDECOMP(cr) && bMasterState)
@@ -1235,7 +1247,13 @@ void gmx::LegacySimulator::do_md()
          * energies of two subsequent steps. Therefore we need to compute the
          * half step kinetic energy also if we need energies at the next step.
          */
-        const bool needHalfStepKineticEnergy = (!EI_VV(ir->eI) && do_per_step(step + 1, nstglobalcomm));
+        const bool needHalfStepKineticEnergy =
+                (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
+
+        // Parrinello-Rahman requires the pressure to be availible before the update to compute
+        // the velocity scaling matrix. Hence, it runs one step after the nstpcouple step.
+        const bool doParrinelloRahman = (ir->epc == epcPARRINELLORAHMAN
+                                         && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
 
         if (useGpuForUpdate)
         {
@@ -1243,9 +1261,6 @@ void gmx::LegacySimulator::do_md()
             {
                 integrator->set(stateGpu->getCoordinates(), stateGpu->getVelocities(),
                                 stateGpu->getForces(), top.idef, *mdatoms, ekind->ngtc);
-                t_pbc pbc;
-                set_pbc(&pbc, epbcXYZ, state->box);
-                integrator->setPbc(&pbc);
 
                 // Copy data to the GPU after buffers might have being reinitialized
                 stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
@@ -1258,15 +1273,13 @@ void gmx::LegacySimulator::do_md()
                 stateGpu->copyForcesToGpu(ArrayRef<RVec>(f), AtomLocality::Local);
             }
 
-            bool doTempCouple =
+            const bool doTemperatureScaling =
                     (ir->etc != etcNO && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
-            bool doParrinelloRahman = (ir->epc == epcPARRINELLORAHMAN
-                                       && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
 
             // This applies Leap-Frog, LINCS and SETTLE in succession
             integrator->integrate(stateGpu->getForcesReadyOnDeviceEvent(
                                           AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
-                                  ir->delta_t, true, bCalcVir, shake_vir, doTempCouple,
+                                  ir->delta_t, true, bCalcVir, shake_vir, doTemperatureScaling,
                                   ekind->tcstat, doParrinelloRahman, ir->nstpcouple * ir->delta_t, M);
 
             // Copy velocities D2H after update if:
@@ -1372,13 +1385,7 @@ void gmx::LegacySimulator::do_md()
             // and when algorithms require it.
             const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
 
-            // With leap-frog we also need to compute the half-step
-            // kinetic energy at the step before we need to write
-            // the full-step kinetic energy
-            const bool needEkinAtNextStep =
-                    (!EI_VV(ir->eI) && (do_per_step(step + 1, nstglobalcomm) || step_rel + 1 == ir->nsteps));
-
-            if (bGStat || needEkinAtNextStep || doInterSimSignal)
+            if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
             {
                 // Copy coordinates when needed to stop the CM motion.
                 if (useGpuForUpdate && !EI_VV(ir->eI) && bStopCM)
@@ -1441,7 +1448,17 @@ void gmx::LegacySimulator::do_md()
         }
 
         update_pcouple_after_coordinates(fplog, step, ir, mdatoms, pres, force_vir, shake_vir,
-                                         pressureCouplingMu, state, nrnb, &upd);
+                                         pressureCouplingMu, state, nrnb, &upd, !useGpuForUpdate);
+
+        const bool doBerendsenPressureCoupling =
+                (inputrec->epc == epcBERENDSEN && do_per_step(step, inputrec->nstpcouple));
+        if (useGpuForUpdate && (doBerendsenPressureCoupling || doParrinelloRahman))
+        {
+            integrator->scaleCoordinates(pressureCouplingMu);
+            t_pbc pbc;
+            set_pbc(&pbc, epbcXYZ, state->box);
+            integrator->setPbc(&pbc);
+        }
 
         /* ################# END UPDATE STEP 2 ################# */
         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
index 5fb17e91995d3ec9951f9443102341be6128be1e..75a6f6d14cb18b3607a68c2f2cec1135ea7fbe6d 100644 (file)
@@ -196,12 +196,12 @@ struct DevelopmentFeatureFlags
  *
  * \param[in]  mdlog                Logger object.
  * \param[in]  useGpuForNonbonded   True if the nonbonded task is offloaded in this run.
- * \param[in]  useGpuForPme         True if the PME task is offloaded in this run.
+ * \param[in]  pmeRunMode           The PME run mode for this run
  * \returns                         The object populated with development feature flags.
  */
 static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& mdlog,
                                                          const bool           useGpuForNonbonded,
-                                                         const bool           useGpuForPme)
+                                                         const PmeRunMode     pmeRunMode)
 {
     DevelopmentFeatureFlags devFlags;
 
@@ -266,7 +266,7 @@ static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& md
 
     if (devFlags.enableGpuPmePPComm)
     {
-        if (useGpuForPme)
+        if (pmeRunMode == PmeRunMode::GPU)
         {
             GMX_LOG(mdlog.warning)
                     .asParagraph()
@@ -276,12 +276,23 @@ static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& md
         }
         else
         {
+            std::string clarification;
+            if (pmeRunMode == PmeRunMode::Mixed)
+            {
+                clarification =
+                        "PME FFT and gather are not offloaded to the GPU (PME is running in mixed "
+                        "mode).";
+            }
+            else
+            {
+                clarification = "PME is not offloaded to the GPU.";
+            }
             GMX_LOG(mdlog.warning)
                     .asParagraph()
-                    .appendTextFormatted(
+                    .appendText(
                             "NOTE: GMX_GPU_PME_PP_COMMS environment variable detected, but the "
-                            "'GPU PME-PP communications' feature was not enabled as PME is not "
-                            "offloaded to the GPU.");
+                            "'GPU PME-PP communications' feature was not enabled as "
+                            + clarification);
             devFlags.enableGpuPmePPComm = false;
         }
     }
@@ -889,7 +900,7 @@ int Mdrunner::mdrunner()
     // Initialize development feature flags that enabled by environment variable
     // and report those features that are enabled.
     const DevelopmentFeatureFlags devFlags =
-            manageDevelopmentFeatures(mdlog, useGpuForNonbonded, useGpuForPme);
+            manageDevelopmentFeatures(mdlog, useGpuForNonbonded, pmeRunMode);
 
     // NOTE: The devFlags need decideWhetherToUseGpusForNonbonded(...) and decideWhetherToUseGpusForPme(...) for overrides,
     //       decideWhetherToUseGpuForUpdate() needs devFlags for the '-update auto' override, hence the interleaving.
index a0d9309f6714ec14ad949b613c54441e3a8e98e2..c6ae19c589c0101759ddd7b135f2161f282872e8 100644 (file)
@@ -144,6 +144,8 @@ public:
     ~StatePropagatorDataGpu();
 
     /*! \brief Set the ranges for local and non-local atoms and reallocates buffers.
+     *
+     * Reallocates coordinate, velocities and force buffers on the device.
      *
      * \note
      * The coordinates buffer is (re)allocated, when required by PME, with a padding,
@@ -151,6 +153,10 @@ public:
      * is scheduled in the \p pmeStream_ (unlike the coordinates H2D) as only the PME
      * task uses this padding area.
      *
+     * \note
+     * The force buffer is cleared if its size increases, so that previously unused
+     * memory is cleared before forces are accumulated.
+     *
      *  \param[in] numAtomsLocal  Number of atoms in local domain.
      *  \param[in] numAtomsAll    Total number of atoms to handle.
      */
index 70365ac62eba349060f56b5f89c8970cec324f5b..af073b6284dde35d568465192004b375be8b2dd0 100644 (file)
@@ -137,6 +137,8 @@ public:
 
 
     /*! \brief Set the ranges for local and non-local atoms and reallocates buffers.
+     *
+     * Reallocates coordinate, velocities and force buffers on the device.
      *
      * \note
      * The coordinates buffer is (re)allocated, when required by PME, with a padding,
@@ -144,6 +146,10 @@ public:
      * is scheduled in the \p pmeStream_ (unlike the coordinates H2D) as only the PME
      * task uses this padding area.
      *
+     * \note
+     * The force buffer is cleared if its size increases, so that previously unused
+     * memory is cleared before forces are accumulated.
+     *
      *  \param[in] numAtomsLocal  Number of atoms in local domain.
      *  \param[in] numAtomsAll    Total number of atoms to handle.
      */
index d68e3e77b933d15bfbe2000017a0892f1fbdcd54..9d674afd8f5d05c8abdbadeab75010efe7c032ff 100644 (file)
@@ -194,7 +194,15 @@ void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll)
     }
 
     reallocateDeviceBuffer(&d_v_, DIM * numAtomsAll_, &d_vSize_, &d_vCapacity_, deviceContext_);
+    const int d_fOldCapacity = d_fCapacity_;
     reallocateDeviceBuffer(&d_f_, DIM * numAtomsAll_, &d_fSize_, &d_fCapacity_, deviceContext_);
+    // Clearing of the forces can be done in local stream since the nonlocal stream cannot reach
+    // the force accumulation stage before syncing with the local stream. Only done in CUDA,
+    // since the force buffer ops are not implemented in OpenCL.
+    if (GMX_GPU == GMX_GPU_CUDA && d_fCapacity_ != d_fOldCapacity)
+    {
+        clearDeviceBufferAsync(&d_f_, 0, d_fCapacity_, localStream_);
+    }
 }
 
 std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality atomLocality)
index c59a4a828573c8ce324d265bbf7e1654817cd63e..04d0dfd3838fa29fd09171261ea4f42c5cd0b08d 100644 (file)
@@ -875,7 +875,12 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
     nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
 }
 
-/* F buffer operations on GPU: performs force summations and conversion from nb to rvec format. */
+/* F buffer operations on GPU: performs force summations and conversion from nb to rvec format.
+ *
+ * NOTE: When the total force device buffer is reallocated and its size increases, it is cleared in
+ *       Local stream. Hence, if accumulateForce is true, NonLocal stream should start accumulating
+ *       forces only after Local stream already done so.
+ */
 void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality                         atomLocality,
                                DeviceBuffer<float>                        totalForcesDevice,
                                gmx_nbnxn_gpu_t*                           nb,
@@ -945,19 +950,4 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality                         atomLo
     }
 }
 
-void nbnxn_wait_nonlocal_x_copy_D2H_done(gmx_nbnxn_cuda_t* nb)
-{
-    nb->xNonLocalCopyD2HDone->waitForEvent();
-}
-
-void nbnxn_stream_local_wait_for_nonlocal(gmx_nbnxn_cuda_t* nb)
-{
-    cudaStream_t localStream    = nb->stream[InteractionLocality::Local];
-    cudaStream_t nonLocalStream = nb->stream[InteractionLocality::NonLocal];
-
-    GpuEventSynchronizer event;
-    event.markEvent(nonLocalStream);
-    event.enqueueWaitEvent(localStream);
-}
-
 } // namespace Nbnxm
index 8cf4523d08bddf8726839b18715c08342c9d8c4c..1531f02816f73744b0cf3d687040079a296ab983 100644 (file)
@@ -245,14 +245,4 @@ void nonbonded_verlet_t::insertNonlocalGpuDependency(const gmx::InteractionLocal
     Nbnxm::nbnxnInsertNonlocalGpuDependency(gpu_nbv, interactionLocality);
 }
 
-void nonbonded_verlet_t::wait_nonlocal_x_copy_D2H_done()
-{
-    Nbnxm::nbnxn_wait_nonlocal_x_copy_D2H_done(gpu_nbv);
-}
-
-void nonbonded_verlet_t::stream_local_wait_for_nonlocal()
-{
-    Nbnxm::nbnxn_stream_local_wait_for_nonlocal(gpu_nbv);
-}
-
 /*! \endcond */
index 4574167c187a4d3fa234defd94933ba679aa3a9b..1548f3704ff9c74b83d01b82d29a29ce1e2a1cd4 100644 (file)
@@ -350,15 +350,9 @@ public:
      */
     void atomdata_init_add_nbat_f_to_f_gpu(GpuEventSynchronizer* localReductionDone);
 
-    /*! \brief Wait for non-local copy of coordinate buffer from device to host */
-    void wait_nonlocal_x_copy_D2H_done();
-
     /*! \brief return GPU pointer to f in rvec format */
     void* get_gpu_frvec();
 
-    /*! \brief Ensure local stream waits for non-local stream */
-    void stream_local_wait_for_nonlocal();
-
     //! Return the kernel setup
     const Nbnxm::KernelSetup& kernelSetup() const { return kernelSetup_; }
 
index ecae0d39d21c21dee5c7f18e717bed0d86aac512..7a4a94b6a52f5888932418bcce202b1391e97808 100644 (file)
@@ -328,17 +328,5 @@ void nbnxn_gpu_add_nbat_f_to_f(gmx::AtomLocality gmx_unused atomLocality,
 CUDA_FUNC_QUALIFIER
 void nbnxn_wait_x_on_device(gmx_nbnxn_gpu_t gmx_unused* nb) CUDA_FUNC_TERM;
 
-/*! \brief Wait for non-local copy of coordinate buffer from device to host
- * \param[in] nb                   The nonbonded data GPU structure
- */
-CUDA_FUNC_QUALIFIER
-void nbnxn_wait_nonlocal_x_copy_D2H_done(gmx_nbnxn_gpu_t gmx_unused* nb) CUDA_FUNC_TERM;
-
-/*! \brief Ensure local stream waits for non-local stream
- * \param[in] nb                   The nonbonded data GPU structure
- */
-CUDA_FUNC_QUALIFIER
-void nbnxn_stream_local_wait_for_nonlocal(gmx_nbnxn_gpu_t gmx_unused* nb) CUDA_FUNC_TERM;
-
 } // namespace Nbnxm
 #endif
index fb6302028ba3543df36fcb6b11942b794defc5b1..20037e050fe60a2d1c1b567a3cc9aa9063276504 100644 (file)
@@ -34,7 +34,9 @@
  */
 
 /* Stride of the packed x coordinate array */
-static constexpr int c_xStride2xNN = c_nbnxnCpuIClusterSize;
+static constexpr int c_xStride2xNN = (GMX_SIMD_REAL_WIDTH >= 2 * c_nbnxnCpuIClusterSize)
+                                             ? GMX_SIMD_REAL_WIDTH / 2
+                                             : c_nbnxnCpuIClusterSize;
 
 /* Copies PBC shifted i-cell packed atom coordinates to working array */
 static inline void icell_set_x_simd_2xnn(int  ci,
index 4e159aa7606a9e68e05966f421ee30a7586e9fe1..c20deec924102628ce1a33e4c3a6cfa4709c0e39 100644 (file)
@@ -516,7 +516,11 @@ static void get_pull_coord_dr(struct pull_t* pull, int coord_ind, const t_pbc* p
     PullCoordSpatialData& spatialData = pcrd->spatialData;
 
     double md2;
-    if (pcrd->params.eGeom == epullgDIRPBC)
+    /* With AWH pulling we allow for periodic pulling with geometry=direction.
+     * TODO: Store a periodicity flag instead of checking for external pull provider.
+     */
+    if (pcrd->params.eGeom == epullgDIRPBC
+        || (pcrd->params.eGeom == epullgDIR && pcrd->params.eType == epullEXTERNAL))
     {
         md2 = -1;
     }
index 66b1a3cc53bb83787aaf1a31071c42ac53e7e89a..b4ab67075d1383c3ee6ea69ad28e91b00570ca5d 100644 (file)
@@ -534,10 +534,9 @@ bool decideWhetherToUseGpuForUpdate(const bool        forceGpuUpdateDefaultOn,
     {
         errorMessage += "Nose-Hoover temperature coupling is not supported.\n";
     }
-    if (inputrec.epc != epcNO)
+    if (!(inputrec.epc == epcNO || inputrec.epc == epcPARRINELLORAHMAN || inputrec.epc == epcBERENDSEN))
     {
-        // Coordinate D2H and H2d are missing as well as PBC reinitialization
-        errorMessage += "Pressure coupling is not supported.\n";
+        errorMessage += "Only Parrinello-Rahman and Berendsen pressure coupling are supported.\n";
     }
     if (EEL_PME_EWALD(inputrec.coulombtype) && inputrec.epsilon_surface != 0)
     {
@@ -575,6 +574,11 @@ bool decideWhetherToUseGpuForUpdate(const bool        forceGpuUpdateDefaultOn,
     {
         errorMessage += "Swapping the coordinates is not supported.\n";
     }
+
+    // \todo Check for coupled constraint block size restriction needs to be added
+    //       when update auto chooses GPU in some cases. Currently exceeding the restriction
+    //       triggers a fatal error during LINCS setup.
+
     if (!errorMessage.empty())
     {
         if (updateTarget == TaskTarget::Gpu)
index afba4097f4edd13d85196874a38b546c0eda6c39..16849d4e5dd611856ca9e537b5b6a7663bcfd79f 100644 (file)
@@ -64,12 +64,12 @@ SimulationWorkload createSimulationWorkload(bool       useGpuForNonbonded,
     simulationWorkload.useGpuNonbonded = useGpuForNonbonded;
     simulationWorkload.useCpuPme       = (pmeRunMode == PmeRunMode::CPU);
     simulationWorkload.useGpuPme = (pmeRunMode == PmeRunMode::GPU || pmeRunMode == PmeRunMode::Mixed);
-    simulationWorkload.useGpuPmeFft                 = (pmeRunMode == PmeRunMode::Mixed);
-    simulationWorkload.useGpuBonded                 = useGpuForBonded;
-    simulationWorkload.useGpuUpdate                 = useGpuForUpdate;
-    simulationWorkload.useGpuBufferOps              = useGpuForBufferOps || useGpuForUpdate;
-    simulationWorkload.useGpuHaloExchange           = useGpuHaloExchange;
-    simulationWorkload.useGpuPmePpCommunication     = useGpuPmePpComm;
+    simulationWorkload.useGpuPmeFft             = (pmeRunMode == PmeRunMode::Mixed);
+    simulationWorkload.useGpuBonded             = useGpuForBonded;
+    simulationWorkload.useGpuUpdate             = useGpuForUpdate;
+    simulationWorkload.useGpuBufferOps          = useGpuForBufferOps || useGpuForUpdate;
+    simulationWorkload.useGpuHaloExchange       = useGpuHaloExchange;
+    simulationWorkload.useGpuPmePpCommunication = useGpuPmePpComm && (pmeRunMode == PmeRunMode::GPU);
     simulationWorkload.useGpuDirectCommunication    = useGpuHaloExchange || useGpuPmePpComm;
     simulationWorkload.haveEwaldSurfaceContribution = haveEwaldSurfaceContribution;
 
index e88a78b2401ea3aa54be297c37abe66ecad2566a..6a85949ff4c22b78970fd4cc223a82e36ef88dbe 100644 (file)
@@ -52,6 +52,7 @@
 #include "taskassignment.h"
 
 #include <algorithm>
+#include <exception>
 #include <string>
 #include <vector>
 
@@ -189,6 +190,31 @@ size_t countGpuTasksOnThisNode(const GpuTasksOnRanks& gpuTasksOnRanksOfThisNode)
     return numGpuTasksOnThisNode;
 }
 
+/*! \brief Return on each rank the total count over all ranks of all
+ * simulations. */
+int countOverAllRanks(const t_commrec* cr, const gmx_multisim_t* ms, const int countOnThisRank)
+{
+    int countOverAllRanksValue = countOnThisRank;
+    if (PAR(cr))
+    {
+        // Count over the ranks of this simulation.
+        gmx_sumi(1, &countOverAllRanksValue, cr);
+    }
+    if (isMultiSim(ms))
+    {
+        // Count over the ranks of all simulations.
+        gmx_sumi_sim(1, &countOverAllRanksValue, ms);
+        if (PAR(cr))
+        {
+            // Propagate the information from other simulations back
+            // to non-master ranks so they can all agree on future
+            // behavior.
+            gmx_bcast(sizeof(decltype(countOverAllRanksValue)), &countOverAllRanksValue, cr);
+        }
+    }
+    return countOverAllRanksValue;
+}
+
 } // namespace
 
 GpuTaskAssignmentsBuilder::GpuTaskAssignmentsBuilder() = default;
@@ -217,6 +243,7 @@ GpuTaskAssignments GpuTaskAssignmentsBuilder::build(const std::vector<int>& gpuI
     auto gpuTasksOnRanksOfThisNode = findAllGpuTasksOnThisNode(gpuTasksOnThisRank, physicalNodeComm);
     size_t numGpuTasksOnThisNode   = countGpuTasksOnThisNode(gpuTasksOnRanksOfThisNode);
 
+    std::exception_ptr             exceptionPtr;
     std::vector<GpuTaskAssignment> taskAssignmentOnRanksOfThisNode;
     try
     {
@@ -296,27 +323,37 @@ GpuTaskAssignments GpuTaskAssignmentsBuilder::build(const std::vector<int>& gpuI
         taskAssignmentOnRanksOfThisNode =
                 buildTaskAssignment(gpuTasksOnRanksOfThisNode, gpuIdsForTaskAssignment);
     }
-    catch (const std::exception& ex)
+    catch (...)
+    {
+        exceptionPtr = std::current_exception();
+    }
+    int countOfExceptionsOnThisRank   = int(bool(exceptionPtr));
+    int countOfExceptionsOverAllRanks = countOverAllRanks(cr, ms, countOfExceptionsOnThisRank);
+
+    // Avoid all ranks spamming the error stream
+    //
+    // TODO improve this so that unique errors on different ranks
+    // are all reported.
+    if (countOfExceptionsOnThisRank > 0 && physicalNodeComm.rank_ == 0)
     {
-        // TODO This implementation is quite similar to that of
-        // processExceptionAsFatalError (which implements
-        // GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR), but it is unclear
-        // how we should involve MPI in the implementation of error
-        // handling.
-        if (physicalNodeComm.rank_ == 0)
+        try
+        {
+            if (exceptionPtr)
+            {
+                std::rethrow_exception(exceptionPtr);
+            }
+        }
+        catch (const std::exception& ex)
         {
             printFatalErrorMessage(stderr, ex);
         }
-
-        gmx_exit_on_fatal_error(ExitType_Abort, 1);
     }
-    // TODO This implements a global barrier so that MPI runtimes can
-    // organize an orderly shutdown if one of the ranks has had to
-    // issue a fatal error after an exception detected only on one
-    // rank. When we have MPI-aware error handling and reporting, this
-    // should be improved.
-    multiSimBarrier(ms);
-    simulationBarrier(cr);
+    if (countOfExceptionsOverAllRanks > 0)
+    {
+        gmx_fatal(FARGS,
+                  "Exiting because task assignment failed. If there is no descriptive "
+                  "error message above this, please report this failure as a bug.");
+    }
 
     // TODO There is no check that mdrun -nb gpu or -pme gpu or
     // -gpu_id is actually being implemented such that nonbonded tasks
index fadd38184da94204129acce104edbc91d9323a9f..9174e7303df76cc802db42da32434aa4208021ce 100644 (file)
@@ -1502,6 +1502,14 @@ std::string getCoolQuote()
         { "To dissimulate is to feign not to have what one has. To simulate is to feign to have "
           "what one hasn't.",
           "Jean Baudrillard" },
+        { "Install our Free Energy Patents app! There is energy all around us; and it's free! "
+          "Free energy is everywhere, and all around you, just waiting to be extracted! Over "
+          "100+ free energy patents!"
+          "Mind and Miracle Productions on Twitter, spamming a FEP thread" },
+        { "\"A slow sort of country!\" said the Queen. \"Now, HERE, you see, it "
+          "takes all the running YOU can do, to keep in the same place. If you want "
+          "to get somewhere else, you must run at least twice as fast as that!\""
+          "Lewis Carroll" },
     };
 
     if (beCool())
index c1adcee5eefb37ad837ae572f4565fd3f2f2537d..eb012c36fccb4a9fe44d3fa47a848ed5510a72ab 100644 (file)
@@ -39,8 +39,6 @@
 #include <algorithm>
 #include <vector>
 
-#include "gromacs/utility/gmxassert.h"
-
 namespace gmx
 {
 
@@ -51,7 +49,7 @@ template<typename T>
 class CharBuffer
 {
 public:
-    static const size_t ValueSize = sizeof(T);
+    static constexpr size_t ValueSize = sizeof(T);
 
     explicit CharBuffer(T value) { u.v = value; }
     explicit CharBuffer(const char buffer[]) { std::copy(buffer, buffer + ValueSize, u.c); }
@@ -70,6 +68,26 @@ private:
     } u;
 };
 
+//! Return \c value with the byte order swapped.
+template<typename T>
+T swapEndian(const T& value)
+{
+    union {
+        T                           value_;
+        std::array<char, sizeof(T)> valueAsCharArray_;
+    } endianessSwappedValue;
+
+    endianessSwappedValue.value_ = value;
+    int hiByte                   = sizeof(T) - 1;
+    for (int loByte = 0; hiByte > loByte; loByte++, hiByte--)
+    {
+        std::swap(endianessSwappedValue.valueAsCharArray_[loByte],
+                  endianessSwappedValue.valueAsCharArray_[hiByte]);
+    }
+
+    return endianessSwappedValue.value_;
+}
+
 } // namespace
 
 /********************************************************************
@@ -79,10 +97,18 @@ private:
 class InMemorySerializer::Impl
 {
 public:
+    Impl(EndianSwapBehavior endianSwapBehavior) : endianSwapBehavior_(endianSwapBehavior) {}
     template<typename T>
     void doValue(T value)
     {
-        CharBuffer<T>(value).appendTo(&buffer_);
+        if (endianSwapBehavior_ == EndianSwapBehavior::DoSwap)
+        {
+            CharBuffer<T>(swapEndian(value)).appendTo(&buffer_);
+        }
+        else
+        {
+            CharBuffer<T>(value).appendTo(&buffer_);
+        }
     }
     void doString(const std::string& value)
     {
@@ -90,12 +116,16 @@ public:
         buffer_.insert(buffer_.end(), value.begin(), value.end());
     }
 
-    std::vector<char> buffer_;
+    std::vector<char>  buffer_;
+    EndianSwapBehavior endianSwapBehavior_;
 };
 
-InMemorySerializer::InMemorySerializer() : impl_(new Impl) {}
+InMemorySerializer::InMemorySerializer(EndianSwapBehavior endianSwapBehavior) :
+    impl_(new Impl(endianSwapBehavior))
+{
+}
 
-InMemorySerializer::~InMemorySerializer() {}
+InMemorySerializer::~InMemorySerializer() = default;
 
 std::vector<char> InMemorySerializer::finishAndGetBuffer()
 {
@@ -180,17 +210,25 @@ void InMemorySerializer::doString(std::string* value)
 class InMemoryDeserializer::Impl
 {
 public:
-    explicit Impl(ArrayRef<const char> buffer, bool sourceIsDouble) :
+    explicit Impl(ArrayRef<const char> buffer, bool sourceIsDouble, EndianSwapBehavior endianSwapBehavior) :
         buffer_(buffer),
         sourceIsDouble_(sourceIsDouble),
-        pos_(0)
+        pos_(0),
+        endianSwapBehavior_(endianSwapBehavior)
     {
     }
 
     template<typename T>
     void doValue(T* value)
     {
-        *value = CharBuffer<T>(&buffer_[pos_]).value();
+        if (endianSwapBehavior_ == EndianSwapBehavior::DoSwap)
+        {
+            *value = swapEndian(CharBuffer<T>(&buffer_[pos_]).value());
+        }
+        else
+        {
+            *value = CharBuffer<T>(&buffer_[pos_]).value();
+        }
         pos_ += CharBuffer<T>::ValueSize;
     }
     void doString(std::string* value)
@@ -204,14 +242,17 @@ public:
     ArrayRef<const char> buffer_;
     bool                 sourceIsDouble_;
     size_t               pos_;
+    EndianSwapBehavior   endianSwapBehavior_;
 };
 
-InMemoryDeserializer::InMemoryDeserializer(ArrayRef<const char> buffer, bool sourceIsDouble) :
-    impl_(new Impl(buffer, sourceIsDouble))
+InMemoryDeserializer::InMemoryDeserializer(ArrayRef<const char> buffer,
+                                           bool                 sourceIsDouble,
+                                           EndianSwapBehavior   endianSwapBehavior) :
+    impl_(new Impl(buffer, sourceIsDouble, endianSwapBehavior))
 {
 }
 
-InMemoryDeserializer::~InMemoryDeserializer() {}
+InMemoryDeserializer::~InMemoryDeserializer() = default;
 
 bool InMemoryDeserializer::sourceIsDouble() const
 {
index 60998c15aa75ac7b93e8dbbd8bb8eeb29e75a77d..8b6ac98a70f1ba35430753dc3f2fe3ada0cf4afa 100644 (file)
 namespace gmx
 {
 
+//! Specify endian swapping behvaior
+enum class EndianSwapBehavior : int
+{
+    DoSwap,    //!< Swap the bytes
+    DoNotSwap, //!< Do not swap the bytes
+    Count      //!< Number of possible behaviors
+};
+
 class InMemorySerializer : public ISerializer
 {
 public:
-    InMemorySerializer();
+    explicit InMemorySerializer(EndianSwapBehavior endianSwapBehavior = EndianSwapBehavior::DoNotSwap);
     ~InMemorySerializer() override;
 
     std::vector<char> finishAndGetBuffer();
@@ -85,7 +93,9 @@ private:
 class InMemoryDeserializer : public ISerializer
 {
 public:
-    explicit InMemoryDeserializer(ArrayRef<const char> buffer, bool sourceIsDouble);
+    InMemoryDeserializer(ArrayRef<const char> buffer,
+                         bool                 sourceIsDouble,
+                         EndianSwapBehavior   endianSwapBehavior = EndianSwapBehavior::DoNotSwap);
     ~InMemoryDeserializer() override;
 
     //! Get if the source data was written in double precsion
index ea9bfe398ba358dfaab1284f282c2bfc0739d28f..e8040f05f7a5909654efe9add4ab6402d3b91263 100644 (file)
@@ -40,6 +40,7 @@ gmx_add_unit_test(UtilityUnitTests utility-test
                   defaultinitializationallocator.cpp
                   enumerationhelpers.cpp
                   fixedcapacityvector.cpp
+                  inmemoryserializer.cpp
                   keyvaluetreeserializer.cpp
                   keyvaluetreetransform.cpp
                   logger.cpp
diff --git a/src/gromacs/utility/tests/inmemoryserializer.cpp b/src/gromacs/utility/tests/inmemoryserializer.cpp
new file mode 100644 (file)
index 0000000..a3612eb
--- /dev/null
@@ -0,0 +1,246 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief
+ * Tests for gmx::InMemorySerializer and InMemoryDeserializer.
+ *
+ * \author Christian Blau <blau@kth.se>
+ * \ingroup module_utility
+ */
+
+#include "gmxpre.h"
+
+#include "gromacs/utility/inmemoryserializer.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace gmx
+{
+namespace test
+{
+namespace
+{
+union IntAndFloat32 {
+    std::int32_t int32Value_;
+    float        floatValue_;
+};
+
+union IntAndFloat64 {
+    std::int64_t int64Value_;
+    double       doubleValue_;
+};
+
+//! Constants used for testing endian swap operations
+//! \{
+constexpr std::int16_t c_int16Value        = static_cast<std::int16_t>(0x7A2B);
+constexpr std::int16_t c_int16ValueSwapped = static_cast<std::int16_t>(0x2B7A);
+constexpr std::int32_t c_int32Value        = static_cast<std::int32_t>(0x78ABCDEF);
+constexpr std::int32_t c_int32ValueSwapped = static_cast<std::int32_t>(0xEFCDAB78);
+constexpr std::int64_t c_int64Value        = static_cast<std::int64_t>(0x78ABCDEF12345678);
+constexpr std::int64_t c_int64ValueSwapped = static_cast<std::int64_t>(0x78563412EFCDAB78);
+
+constexpr const IntAndFloat32 c_intAndFloat32{ c_int32Value };
+constexpr const IntAndFloat64 c_intAndFloat64{ c_int64Value };
+
+constexpr const IntAndFloat32 c_intAndFloat32Swapped{ c_int32ValueSwapped };
+constexpr const IntAndFloat64 c_intAndFloat64Swapped{ c_int64ValueSwapped };
+//! \}
+
+//! Return the integer used for testing, depending on the size of int.
+constexpr int integerSizeDependentTestingValue()
+{
+    return sizeof(int) == 4 ? c_int32Value : sizeof(int) == 8 ? c_int64Value : c_int16Value;
+}
+
+//! Return the endianess-swapped integer used for testing, depending on the size of int.
+constexpr int integerSizeDependentTestingValueEndianessSwapped()
+{
+    return sizeof(int) == 4 ? c_int32ValueSwapped
+                            : sizeof(int) == 8 ? c_int64ValueSwapped : c_int16ValueSwapped;
+}
+
+class InMemorySerializerTest : public ::testing::Test
+{
+public:
+    struct SerializerValues
+    {
+        bool           boolValue_;
+        unsigned char  unsignedCharValue_;
+        char           charValue_;
+        unsigned short unsignedShortValue_;
+        std::int32_t   int32Value_;
+        float          floatValue_;
+        std::int64_t   int64Value_;
+        double         doubleValue_;
+        int            intValue_;
+        real           realValue_;
+    };
+
+    void serialize(ISerializer* serializer, SerializerValues* values)
+    {
+        EXPECT_FALSE(serializer->reading());
+        doValues(serializer, values);
+    }
+
+    SerializerValues deserialize(ISerializer* serializer)
+    {
+        EXPECT_TRUE(serializer->reading());
+        SerializerValues result;
+        doValues(serializer, &result);
+        return result;
+    }
+
+    void checkSerializerValuesforEquality(const SerializerValues& lhs, const SerializerValues& rhs)
+    {
+        EXPECT_EQ(lhs.boolValue_, rhs.boolValue_);
+        EXPECT_EQ(lhs.unsignedCharValue_, rhs.unsignedCharValue_);
+        EXPECT_EQ(lhs.charValue_, rhs.charValue_);
+        EXPECT_EQ(lhs.intValue_, rhs.intValue_);
+        EXPECT_EQ(lhs.int32Value_, rhs.int32Value_);
+        EXPECT_EQ(lhs.int64Value_, rhs.int64Value_);
+        EXPECT_EQ(lhs.unsignedShortValue_, rhs.unsignedShortValue_);
+        EXPECT_EQ(lhs.realValue_, rhs.realValue_);
+        EXPECT_EQ(lhs.floatValue_, rhs.floatValue_);
+        EXPECT_EQ(lhs.doubleValue_, rhs.doubleValue_);
+    }
+
+private:
+    void doValues(ISerializer* serializer, SerializerValues* values)
+    {
+        serializer->doBool(&values->boolValue_);
+        serializer->doUChar(&values->unsignedCharValue_);
+        serializer->doChar(&values->charValue_);
+        serializer->doInt(&values->intValue_);
+        serializer->doInt32(&values->int32Value_);
+        serializer->doInt64(&values->int64Value_);
+        serializer->doUShort(&values->unsignedShortValue_);
+        serializer->doReal(&values->realValue_);
+        serializer->doFloat(&values->floatValue_);
+        serializer->doDouble(&values->doubleValue_);
+    }
+
+protected:
+    SerializerValues defaultValues_ = { true,
+                                        0x78,
+                                        0x78,
+                                        static_cast<unsigned short>(c_int16Value),
+                                        c_int32Value,
+                                        c_intAndFloat32.floatValue_,
+                                        c_int64Value,
+                                        c_intAndFloat64.doubleValue_,
+                                        integerSizeDependentTestingValue(),
+                                        std::is_same<real, double>::value
+                                                ? static_cast<real>(c_intAndFloat64.doubleValue_)
+                                                : static_cast<real>(c_intAndFloat32.floatValue_) };
+
+    SerializerValues endianessSwappedValues_ = {
+        true,
+        0x78,
+        0x78,
+        static_cast<unsigned short>(c_int16ValueSwapped),
+        c_int32ValueSwapped,
+        c_intAndFloat32Swapped.floatValue_,
+        c_int64ValueSwapped,
+        c_intAndFloat64Swapped.doubleValue_,
+        integerSizeDependentTestingValueEndianessSwapped(),
+        std::is_same<real, float>::value ? static_cast<real>(c_intAndFloat32Swapped.floatValue_)
+                                         : static_cast<real>(c_intAndFloat64Swapped.doubleValue_)
+    };
+};
+
+TEST_F(InMemorySerializerTest, Roundtrip)
+{
+    InMemorySerializer serializer;
+    SerializerValues   values = defaultValues_;
+    serialize(&serializer, &values);
+
+    auto buffer = serializer.finishAndGetBuffer();
+
+    InMemoryDeserializer deserializer(buffer, std::is_same<real, double>::value);
+
+    SerializerValues deserialisedValues = deserialize(&deserializer);
+
+    checkSerializerValuesforEquality(values, deserialisedValues);
+}
+
+TEST_F(InMemorySerializerTest, RoundtripWithEndianessSwap)
+{
+    InMemorySerializer serializerWithSwap(EndianSwapBehavior::DoSwap);
+    SerializerValues   values = defaultValues_;
+    serialize(&serializerWithSwap, &values);
+
+    auto buffer = serializerWithSwap.finishAndGetBuffer();
+
+    InMemoryDeserializer deserializerWithSwap(buffer, std::is_same<real, double>::value,
+                                              EndianSwapBehavior::DoSwap);
+
+    SerializerValues deserialisedValues = deserialize(&deserializerWithSwap);
+
+    checkSerializerValuesforEquality(values, deserialisedValues);
+}
+
+TEST_F(InMemorySerializerTest, SerializerExplicitEndianessSwap)
+{
+    InMemorySerializer serializerWithSwap(EndianSwapBehavior::DoSwap);
+    SerializerValues   values = defaultValues_;
+    serialize(&serializerWithSwap, &values);
+
+    auto buffer = serializerWithSwap.finishAndGetBuffer();
+
+    InMemoryDeserializer deserializerWithOutSwap(buffer, std::is_same<real, double>::value);
+
+    SerializerValues deserialisedValues = deserialize(&deserializerWithOutSwap);
+    checkSerializerValuesforEquality(endianessSwappedValues_, deserialisedValues);
+}
+
+TEST_F(InMemorySerializerTest, DeserializerExplicitEndianessSwap)
+{
+    InMemorySerializer serializer;
+    SerializerValues   values = defaultValues_;
+    serialize(&serializer, &values);
+
+    auto buffer = serializer.finishAndGetBuffer();
+
+    InMemoryDeserializer deserializerWithSwap(buffer, std::is_same<real, double>::value,
+                                              EndianSwapBehavior::DoSwap);
+
+    SerializerValues deserialisedValues = deserialize(&deserializerWithSwap);
+    checkSerializerValuesforEquality(endianessSwappedValues_, deserialisedValues);
+}
+
+} // namespace
+} // namespace test
+} // namespace gmx
index fa098d82c90409ae19b2babc4bf68017d3a2598c..366a5342e9ecd375a3a3908e06aa4851a9dc6ae5 100644 (file)
@@ -127,6 +127,10 @@ public:
     const std::string mdpEnergyAndDensityfittingIntervalMismatch_ = formatString(
             "nstcalcenergy = 7\n"
             "density-guided-simulation-nst = 3\n");
+    //! Set mdp values so that we skip every other step
+    const std::string mdpSkipDensityfittingEveryOtherStep_ = formatString(
+            "nstenergy = 2\n"
+            "density-guided-simulation-nst = 2\n");
     //! The command line to call mdrun
     CommandLine commandLineForMdrun_;
 };
@@ -171,5 +175,33 @@ TEST_F(DensityFittingTest, GromppErrorWhenEnergyEvaluationFrequencyMismatch)
                               ".*is not a multiple of density-guided-simulation-nst.*");
 }
 
+/* Fit a subset of three of twelve argon atoms into a reference density
+ * whose origin is offset from the simulation box origin. Stop the simulation,
+ * then restart.
+ *
+ * All density fitting mdp parameters are set to defaults
+ */
+TEST_F(DensityFittingTest, CheckpointWorks)
+{
+    runner_.useStringAsMdpFile(mdpMdDensfitYesUnsetValues + mdpSkipDensityfittingEveryOtherStep_);
+    runner_.cptFileName_ = fileManager_.getTemporaryFilePath(".cpt");
+    commandLineForMdrun_.addOption("-cpo", runner_.cptFileName_);
+
+    ASSERT_EQ(0, runner_.callGrompp());
+    ASSERT_EQ(0, runner_.callMdrun(commandLineForMdrun_));
+
+    // checkMdrun(expectedEnergyTermMagnitude);
+
+    CommandLine commandLineForRestart;
+    commandLineForRestart.addOption("-cpi", runner_.cptFileName_);
+    commandLineForRestart.addOption("-noappend");
+    runner_.nsteps_ = 4;
+    ASSERT_EQ(0, runner_.callMdrun(commandLineForRestart));
+
+    const real expectedEnergyTermMagnitude = -3378.825928;
+    checkMdrun(expectedEnergyTermMagnitude);
+}
+
+
 } // namespace test
 } // namespace gmx
index 148e5f72efb5a8e031fb82ce3e9a700607faa8f4..db688cdd1824c7db3b4495b2f45c9176a1e08e4d 100644 (file)
@@ -372,8 +372,17 @@ TEST_P(MdrunNoAppendContinuationIsExact, WithinTolerances)
     // with forces on CPUs, but there is no real risk of a bug with
     // those propagators that would only be caught with a tighter
     // tolerance in this particular test.
+    int ulpToleranceInMixed  = 32;
+    int ulpToleranceInDouble = 64;
+    if (integrator == "bd")
+    {
+        // Somehow, the bd integrator has never been as reproducible
+        // as the others, either in continuations or reruns.
+        ulpToleranceInMixed = 128;
+    }
     EnergyTermsToCompare energyTermsToCompare{ {
-            { interaction_function[F_EPOT].longname, relativeToleranceAsPrecisionDependentUlp(10.0, 32, 64) },
+            { interaction_function[F_EPOT].longname,
+              relativeToleranceAsPrecisionDependentUlp(10.0, ulpToleranceInMixed, ulpToleranceInDouble) },
     } };
 
     int numWarningsToTolerate = 1;
diff --git a/src/programs/mdrun/tests/refdata/DensityFittingTest_CheckpointWorks.xml b/src/programs/mdrun/tests/refdata/DensityFittingTest_CheckpointWorks.xml
new file mode 100644 (file)
index 0000000..b1b3ab3
--- /dev/null
@@ -0,0 +1,12 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="referencedata.xsl"?>
+<ReferenceData>
+  <Energy Name="Potential">
+    <Real Name="Time 0.000000 Step 0 in frame 0">-3379.9202</Real>
+    <Real Name="Time 0.002000 Step 2 in frame 1">-3390.9541</Real>
+  </Energy>
+  <Energy Name="Density fitting">
+    <Real Name="Time 0.000000 Step 0 in frame 0">-3378.9026</Real>
+    <Real Name="Time 0.002000 Step 2 in frame 1">-3389.9324</Real>
+  </Energy>
+</ReferenceData>