From: Teemu Murtola Date: Thu, 14 Aug 2014 02:59:28 +0000 (+0300) Subject: More precise analysisdata histogram accumulation X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?p=alexxy%2Fgromacs.git;a=commitdiff_plain;h=982c4f9cd49002caeaacf9142af0f25aed3c345b More precise analysisdata histogram accumulation Accumulate histograms within a frame as 64-bit integers (if possible) or as double-precision values to support cases like RDFs where there can be a very large number of points contributing to the histogram. Add a supporting class that could be used in other analysis modules as well to make them capable of parallel processing (at least the AnalysisDataFrameAverager could be adapted to use this approach). Also, make the definition of "parallelization factor" the same here and in datastorage.*, adapting a simpler definition. Change-Id: I63deed99d3c0d3c168d11ab0d85981f1ed6fee8d --- diff --git a/src/gromacs/analysisdata/datastorage.cpp b/src/gromacs/analysisdata/datastorage.cpp index 450e8f3b08..370c7b4ab7 100644 --- a/src/gromacs/analysisdata/datastorage.cpp +++ b/src/gromacs/analysisdata/datastorage.cpp @@ -870,7 +870,7 @@ AnalysisDataStorage::startParallelDataStorage( AnalysisDataModuleManager *modules, const AnalysisDataParallelOptions &options) { - const int pendingLimit = 2 * options.parallelizationFactor() - 1; + const int pendingLimit = options.parallelizationFactor(); impl_->pendingLimit_ = pendingLimit; modules->notifyParallelDataStart(data, options); // Data needs to be set before calling extendBuffer() diff --git a/src/gromacs/analysisdata/datastorage.h b/src/gromacs/analysisdata/datastorage.h index 8fd2b00ff0..4e9fd25341 100644 --- a/src/gromacs/analysisdata/datastorage.h +++ b/src/gromacs/analysisdata/datastorage.h @@ -368,7 +368,7 @@ class AnalysisDataStorage * far in the future: * If \c i is the index of the last frame such that all frames from * 0, ..., \c i have been finished, then \p header().index() should be - * at most \c 2*parallelizationFactor-1 larger than \c i, where + * at most \c parallelizationFactor larger than \c i, where * parallelizationFactor is the parallelization factor passed to * setParallelOptions(). * Throws APIError if this constraint is violated. diff --git a/src/gromacs/analysisdata/framelocaldata.h b/src/gromacs/analysisdata/framelocaldata.h new file mode 100644 index 0000000000..f6fa5ca3a7 --- /dev/null +++ b/src/gromacs/analysisdata/framelocaldata.h @@ -0,0 +1,308 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2014, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * \brief + * Defines gmx::AnalysisDataFrameLocalData and supporting types. + * + * \author Teemu Murtola + * \ingroup module_analysisdata + */ +#ifndef GMX_ANALYSISDATA_FRAMELOCALDATA_H +#define GMX_ANALYSISDATA_FRAMELOCALDATA_H + +#include +#include +#include + +#include "gromacs/analysisdata/paralleloptions.h" +#include "gromacs/utility/arrayref.h" +#include "gromacs/utility/gmxassert.h" + +namespace gmx +{ + +//! \addtogroup module_analysisdata +//! \{ + +/*! \internal + * \brief + * Handle to a single data set within frame-local data array. + * + * Methods in this class do not throw. + * + * \see AnalysisDataFrameLocalData + */ +template +class AnalysisDataFrameLocalDataSetHandle +{ + public: + //! Constructs a handle from an array of values. + explicit AnalysisDataFrameLocalDataSetHandle(ArrayRef values) + : values_(values) + { + } + + //! Clears all values in the data set. + void clear() + { + std::fill(values_.begin(), values_.end(), ValueType()); + } + + //! Accesses a single value in the data set. + ValueType &value(int column) + { + GMX_ASSERT(column >= 0 && column < static_cast(values_.size()), + "Invalid column index"); + return values_[column]; + } + + private: + ArrayRef values_; +}; + +/*! \internal + * \brief + * Handle to a single frame data within frame-local data array. + * + * Methods in this class do not throw. + * + * \see AnalysisDataFrameLocalData + */ +template +class AnalysisDataFrameLocalDataHandle +{ + public: + //! Shorthand for the internal array of values. + typedef std::vector ValueArray; + //! Shorthand for a handle to a single data set. + typedef AnalysisDataFrameLocalDataSetHandle DataSetHandle; + + //! Constructs a handle from specified frame data. + AnalysisDataFrameLocalDataHandle(const std::vector *dataSetIndices, + ValueArray *values) + : dataSetIndices_(dataSetIndices), values_(values) + { + } + + //! Returns the number of data sets in the array. + int dataSetCount() const + { + return dataSetIndices_->size() - 1; + } + //! Clears all values in the frame. + void clear() + { + std::fill(values_->begin(), values_->end(), ValueType()); + } + + //! Returns a handle for a single data set. + DataSetHandle dataSet(int dataSet) + { + GMX_ASSERT(dataSet >= 0 && dataSet < dataSetCount(), + "Invalid data set index"); + const int firstIndex = (*dataSetIndices_)[dataSet]; + const int lastIndex = (*dataSetIndices_)[dataSet + 1]; + typename ValueArray::iterator begin = values_->begin() + firstIndex; + typename ValueArray::iterator end = values_->begin() + lastIndex; + return DataSetHandle(arrayRefFromVector(begin, end)); + } + //! Accesses a single value in the frame. + ValueType &value(int dataSet, int column) + { + GMX_ASSERT(dataSet >= 0 && dataSet < dataSetCount(), + "Invalid data set index"); + const int firstIndex = (*dataSetIndices_)[dataSet]; + GMX_ASSERT(column >= 0 + && column < (*dataSetIndices_)[dataSet+1] - firstIndex, + "Invalid column index"); + return (*values_)[firstIndex + column]; + } + + private: + const std::vector *dataSetIndices_; + ValueArray *values_; +}; + +/*! \internal \brief + * Container for an array of frame-local values that supports parallel data + * processing. + * + * \tparam ValueType Type of values to store. + * + * This class provides a convenient interface to create an array of frame-local + * data for use in analysis data modules that support parallel processing. + * The object is initialized by setting the desired dimensionality with + * setDataSetCount() and setColumnCount(), followed by a call to init(), + * typically in AnalysisDataModuleInterface::parallelDataStarted(), + * + * After initialization, frameData() can be used to access the data for a given + * frame, independently from other frames. This works if the assumptions about + * parallelism hold: if `N` is the parallelization factor given for init() with + * AnalysisDataParallelOptions::parallelizationFactor(), then frame `i+N` must + * not be accessed before all processing for frame `i` is finished. + * Technically, the data for different frames is kept in a ring buffer of size + * `N`. + * + * The data for a frame is not cleared after it is reused for a new frame (but + * is initially cleared). This allows using the data for accumulating values + * over all frames in a lock-free manner. + * + * frameDataSet() is provided for convenience when only a single data set + * needs to be accessed (typically in AnalysisDataModuleInterface::pointsAdded()). + * + * Methods in this class do not throw except where indicated. + * + * \see AnalysisDataFrameLocalData + */ +template +class AnalysisDataFrameLocalData +{ + public: + //! Shorthand for the internal array of values for a frame. + typedef std::vector ValueArray; + //! Shorthand for a handle to a single frame. + typedef AnalysisDataFrameLocalDataHandle FrameHandle; + //! Shorthand for a handle to a single data set. + typedef AnalysisDataFrameLocalDataSetHandle DataSetHandle; + + //! Constructs an empty container with a single data set. + AnalysisDataFrameLocalData() + { + dataSetColumns_.resize(2); + } + + //! Whether init() has been called. + bool isInitialized() const { return !values_.empty(); } + /*! \brief + * Returns number of independent data frames in this object. + * + * This supports looping over all the frame arrays to, e.g., sum them + * up at the end in accumulation scenarios. + */ + int frameCount() const { return values_.size(); } + + /*! \brief + * Sets the number of data sets stored for each frame. + * + * \throws std::bad_alloc if out of memory. + * + * If not called, there is a single data set in the object. + * Cannot be called after init(). + */ + void setDataSetCount(int dataSetCount) + { + GMX_RELEASE_ASSERT(!isInitialized(), + "Cannot change value count after init()"); + GMX_RELEASE_ASSERT(dataSetCount >= 0, + "Invalid data set count"); + dataSetColumns_.resize(dataSetCount + 1); + } + /*! \brief + * Sets the number of columns stored for a data set. + * + * Must be called for each data set that needs to have values, + * otherwise there will be zero columns for that data set. + * Cannot be called after init(). + */ + void setColumnCount(int dataSet, int columnCount) + { + GMX_RELEASE_ASSERT(!isInitialized(), + "Cannot change value count after init()"); + GMX_RELEASE_ASSERT(dataSet >= 0 && dataSet < static_cast(dataSetColumns_.size()) - 1, + "Invalid data set index"); + GMX_RELEASE_ASSERT(columnCount >= 0, + "Invalid column count"); + dataSetColumns_[dataSet + 1] = columnCount; + } + + /*! \brief + * Initializes the storage to support specified parallelism. + * + * \throws std::bad_alloc if out of memory. + */ + void init(const AnalysisDataParallelOptions &opt) + { + GMX_RELEASE_ASSERT(!isInitialized(), "init() called multiple times"); + std::partial_sum(dataSetColumns_.begin(), dataSetColumns_.end(), + dataSetColumns_.begin()); + values_.resize(opt.parallelizationFactor()); + typename std::vector::iterator i; + for (i = values_.begin(); i != values_.end(); ++i) + { + i->resize(dataSetColumns_.back()); + } + } + + //! Returns a handle to access data for a frame. + FrameHandle frameData(int frameIndex) + { + GMX_ASSERT(frameIndex >= 0, "Invalid frame index"); + GMX_ASSERT(isInitialized(), "Cannot access data before init()"); + return FrameHandle(&dataSetColumns_, + &values_[frameIndex % values_.size()]); + } + //! Returns a handle to access a single data set within a frame. + DataSetHandle frameDataSet(int frameIndex, int dataSet) + { + return frameData(frameIndex).dataSet(dataSet); + } + + private: + /*! \brief + * Index to find data sets within a per-frame array in `values_`. + * + * The first entry is always zero, followed by one entry for each data + * set. Before init(), the data set entries hold the numbers set with + * setColumnCount(). After init(), the data set entries hold the + * indices of the first column for that data set in the per-frame + * arrays in `values_`. + */ + std::vector dataSetColumns_; + /*! \brief + * Data array for each frame. + * + * This is a ring buffer whose size is specified by the desired + * parallelism level. For each frame, there is a single array of + * values, where the individual data sets are indexed with + * `dataSetColumns_`. + */ + std::vector values_; +}; + +//! \} + +} // namespace gmx + +#endif diff --git a/src/gromacs/analysisdata/modules/histogram.cpp b/src/gromacs/analysisdata/modules/histogram.cpp index cc9c601e96..24f67024a7 100644 --- a/src/gromacs/analysisdata/modules/histogram.cpp +++ b/src/gromacs/analysisdata/modules/histogram.cpp @@ -50,6 +50,8 @@ #include "gromacs/analysisdata/dataframe.h" #include "gromacs/analysisdata/datastorage.h" +#include "gromacs/analysisdata/framelocaldata.h" +#include "gromacs/utility/basedefinitions.h" #include "gromacs/utility/exceptions.h" #include "gromacs/utility/gmxassert.h" @@ -340,7 +342,7 @@ AbstractAverageHistogram::normalizeProbability() { for (int c = 0; c < columnCount(); ++c) { - real sum = 0; + double sum = 0; for (int i = 0; i < rowCount(); ++i) { sum += value(i, c).value(); @@ -504,9 +506,14 @@ BasicAverageHistogramModule::dataFinished() * BasicHistogramImpl */ -/*! \internal \brief - * Private implementation class for AnalysisDataSimpleHistogramModule and - * AnalysisDataWeightedHistogramModule. +/*! \internal + * \brief + * Base class for private implementation classes for histogram modules. + * + * Actual implementation classes are derived from this and add an accumulation + * data member that is specific to the histogram type in question. + * This is done like this to keep implementation details out of the header, and + * to not unnecessarily duplicate code. * * \ingroup module_analysisdata */ @@ -520,16 +527,13 @@ class BasicHistogramImpl BasicHistogramImpl(); //! Creates an histogram impl with defined bin parameters. explicit BasicHistogramImpl(const AnalysisHistogramSettings &settings); - ~BasicHistogramImpl(); + // Virtual only for simplicity. + virtual ~BasicHistogramImpl(); /*! \brief * (Re)initializes the histogram from settings. */ void init(const AnalysisHistogramSettings &settings); - /*! \brief - * Initializes data storage frame when a new frame starts. - */ - void initFrame(int dataSetCount, AnalysisDataStorageFrame *frame); //! Storage implementation object. AnalysisDataStorage storage_; @@ -562,21 +566,6 @@ void BasicHistogramImpl::init(const AnalysisHistogramSettings &settings) averager_->init(settings); } - -void -BasicHistogramImpl::initFrame(int dataSetCount, AnalysisDataStorageFrame *frame) -{ - for (int s = 0; s < dataSetCount; ++s) - { - frame->selectDataSet(s); - for (int i = 0; i < frame->columnCount(); ++i) - { - frame->setValue(i, 0.0); - } - } - frame->selectDataSet(0); -} - } // namespace internal @@ -584,15 +573,37 @@ BasicHistogramImpl::initFrame(int dataSetCount, AnalysisDataStorageFrame *frame) * AnalysisDataSimpleHistogramModule */ +/*! \internal \brief + * Private implementation class for AnalysisDataSimpleHistogramModule. + * + * \ingroup module_analysisdata + */ +class AnalysisDataSimpleHistogramModule::Impl : public internal::BasicHistogramImpl +{ + public: + //! Shorthand for the per-frame accumulation data structure type. + typedef AnalysisDataFrameLocalData FrameLocalData; + + Impl() {} + //! Creates an histogram impl with defined bin parameters. + explicit Impl(const AnalysisHistogramSettings &settings) + : BasicHistogramImpl(settings) + { + } + + //! Accumulates the histogram within a frame. + FrameLocalData accumulator_; +}; + AnalysisDataSimpleHistogramModule::AnalysisDataSimpleHistogramModule() - : impl_(new internal::BasicHistogramImpl()) + : impl_(new Impl()) { } AnalysisDataSimpleHistogramModule::AnalysisDataSimpleHistogramModule( const AnalysisHistogramSettings &settings) - : impl_(new internal::BasicHistogramImpl(settings)) + : impl_(new Impl(settings)) { } @@ -643,11 +654,16 @@ AnalysisDataSimpleHistogramModule::parallelDataStarted( const AnalysisDataParallelOptions &options) { addModule(impl_->averager_); - setDataSetCount(data->dataSetCount()); - for (int i = 0; i < data->dataSetCount(); ++i) + const int dataSetCount = data->dataSetCount(); + const int columnCount = settings().binCount(); + setDataSetCount(dataSetCount); + impl_->accumulator_.setDataSetCount(dataSetCount); + for (int i = 0; i < dataSetCount; ++i) { - setColumnCount(i, settings().binCount()); + setColumnCount(i, columnCount); + impl_->accumulator_.setColumnCount(i, columnCount); } + impl_->accumulator_.init(options); impl_->storage_.startParallelDataStorage(this, &moduleManager(), options); return true; } @@ -656,17 +672,15 @@ AnalysisDataSimpleHistogramModule::parallelDataStarted( void AnalysisDataSimpleHistogramModule::frameStarted(const AnalysisDataFrameHeader &header) { - AnalysisDataStorageFrame &frame = impl_->storage_.startFrame(header); - impl_->initFrame(dataSetCount(), &frame); + impl_->accumulator_.frameData(header.index()).clear(); } void AnalysisDataSimpleHistogramModule::pointsAdded(const AnalysisDataPointSetRef &points) { - AnalysisDataStorageFrame &frame = - impl_->storage_.currentFrame(points.frameIndex()); - frame.selectDataSet(points.dataSetIndex()); + Impl::FrameLocalData::DataSetHandle handle + = impl_->accumulator_.frameDataSet(points.frameIndex(), points.dataSetIndex()); for (int i = 0; i < points.columnCount(); ++i) { if (points.present(i)) @@ -674,7 +688,7 @@ AnalysisDataSimpleHistogramModule::pointsAdded(const AnalysisDataPointSetRef &po const int bin = settings().findBin(points.y(i)); if (bin != -1) { - frame.value(bin) += 1; + handle.value(bin) += 1; } } } @@ -684,7 +698,20 @@ AnalysisDataSimpleHistogramModule::pointsAdded(const AnalysisDataPointSetRef &po void AnalysisDataSimpleHistogramModule::frameFinished(const AnalysisDataFrameHeader &header) { - impl_->storage_.finishFrame(header.index()); + Impl::FrameLocalData::FrameHandle handle + = impl_->accumulator_.frameData(header.index()); + AnalysisDataStorageFrame &frame = impl_->storage_.startFrame(header); + const int columnCount = settings().binCount(); + for (int s = 0; s < dataSetCount(); ++s) + { + Impl::FrameLocalData::DataSetHandle dataSet = handle.dataSet(s); + frame.selectDataSet(s); + for (int i = 0; i < columnCount; ++i) + { + frame.setValue(i, dataSet.value(i)); + } + } + frame.finishFrame(); } @@ -713,15 +740,37 @@ AnalysisDataSimpleHistogramModule::requestStorageInternal(int nframes) * AnalysisDataWeightedHistogramModule */ +/*! \internal \brief + * Private implementation class for AnalysisDataWeightedHistogramModule. + * + * \ingroup module_analysisdata + */ +class AnalysisDataWeightedHistogramModule::Impl : public internal::BasicHistogramImpl +{ + public: + //! Shorthand for the per-frame accumulation data structure type. + typedef AnalysisDataFrameLocalData FrameLocalData; + + Impl() {} + //! Creates an histogram impl with defined bin parameters. + explicit Impl(const AnalysisHistogramSettings &settings) + : BasicHistogramImpl(settings) + { + } + + //! Accumulates the histogram within a frame. + FrameLocalData accumulator_; +}; + AnalysisDataWeightedHistogramModule::AnalysisDataWeightedHistogramModule() - : impl_(new internal::BasicHistogramImpl()) + : impl_(new Impl()) { } AnalysisDataWeightedHistogramModule::AnalysisDataWeightedHistogramModule( const AnalysisHistogramSettings &settings) - : impl_(new internal::BasicHistogramImpl(settings)) + : impl_(new Impl(settings)) { } @@ -771,11 +820,16 @@ AnalysisDataWeightedHistogramModule::parallelDataStarted( const AnalysisDataParallelOptions &options) { addModule(impl_->averager_); - setDataSetCount(data->dataSetCount()); - for (int i = 0; i < data->dataSetCount(); ++i) + const int dataSetCount = data->dataSetCount(); + const int columnCount = settings().binCount(); + setDataSetCount(dataSetCount); + impl_->accumulator_.setDataSetCount(dataSetCount); + for (int i = 0; i < dataSetCount; ++i) { - setColumnCount(i, settings().binCount()); + setColumnCount(i, columnCount); + impl_->accumulator_.setColumnCount(i, columnCount); } + impl_->accumulator_.init(options); impl_->storage_.startParallelDataStorage(this, &moduleManager(), options); return true; } @@ -784,8 +838,7 @@ AnalysisDataWeightedHistogramModule::parallelDataStarted( void AnalysisDataWeightedHistogramModule::frameStarted(const AnalysisDataFrameHeader &header) { - AnalysisDataStorageFrame &frame = impl_->storage_.startFrame(header); - impl_->initFrame(dataSetCount(), &frame); + impl_->accumulator_.frameData(header.index()).clear(); } @@ -799,12 +852,11 @@ AnalysisDataWeightedHistogramModule::pointsAdded(const AnalysisDataPointSetRef & int bin = settings().findBin(points.y(0)); if (bin != -1) { - AnalysisDataStorageFrame &frame = - impl_->storage_.currentFrame(points.frameIndex()); - frame.selectDataSet(points.dataSetIndex()); + Impl::FrameLocalData::DataSetHandle handle + = impl_->accumulator_.frameDataSet(points.frameIndex(), points.dataSetIndex()); for (int i = 1; i < points.columnCount(); ++i) { - frame.value(bin) += points.y(i); + handle.value(bin) += points.y(i); } } } @@ -813,7 +865,20 @@ AnalysisDataWeightedHistogramModule::pointsAdded(const AnalysisDataPointSetRef & void AnalysisDataWeightedHistogramModule::frameFinished(const AnalysisDataFrameHeader &header) { - impl_->storage_.finishFrame(header.index()); + Impl::FrameLocalData::FrameHandle handle + = impl_->accumulator_.frameData(header.index()); + AnalysisDataStorageFrame &frame = impl_->storage_.startFrame(header); + const int columnCount = settings().binCount(); + for (int s = 0; s < dataSetCount(); ++s) + { + Impl::FrameLocalData::DataSetHandle dataSet = handle.dataSet(s); + frame.selectDataSet(s); + for (int i = 0; i < columnCount; ++i) + { + frame.setValue(i, dataSet.value(i)); + } + } + frame.finishFrame(); } diff --git a/src/gromacs/analysisdata/modules/histogram.h b/src/gromacs/analysisdata/modules/histogram.h index 280c4784f7..fe47fe0840 100644 --- a/src/gromacs/analysisdata/modules/histogram.h +++ b/src/gromacs/analysisdata/modules/histogram.h @@ -237,13 +237,6 @@ class AnalysisHistogramSettings }; -namespace internal -{ - -class BasicHistogramImpl; - -} // namespace internal - class AbstractAverageHistogram; //! Smart pointer to manage an AbstractAverageHistogram object. @@ -344,6 +337,10 @@ class AbstractAverageHistogram : public AbstractAnalysisArrayData * The number of columns for all data sets equals the number of bins in the * histogram. * + * The histograms are accumulated as 64-bit integers within a frame and summed + * in double precision across frames, even if the output data is in single + * precision. + * * \inpublicapi * \ingroup module_analysisdata */ @@ -396,7 +393,9 @@ class AnalysisDataSimpleHistogramModule : public AbstractAnalysisData, virtual AnalysisDataFrameRef tryGetDataFrameInternal(int index) const; virtual bool requestStorageInternal(int nframes); - PrivateImplPointer impl_; + class Impl; + + PrivateImplPointer impl_; // Copy and assign disallowed by base. }; @@ -417,6 +416,9 @@ class AnalysisDataSimpleHistogramModule : public AbstractAnalysisData, * The number of columns for all data sets equals the number of bins in the * histogram. * + * The histograms are accumulated in double precision, even if the output data + * is in single precision. + * * \inpublicapi * \ingroup module_analysisdata */ @@ -455,7 +457,9 @@ class AnalysisDataWeightedHistogramModule : public AbstractAnalysisData, virtual AnalysisDataFrameRef tryGetDataFrameInternal(int index) const; virtual bool requestStorageInternal(int nframes); - PrivateImplPointer impl_; + class Impl; + + PrivateImplPointer impl_; // Copy and assign disallowed by base. };